
Commit 72ff5b8

Authored by dxoigmn, Copilot, mariusarvinte, and mzweilin
Release 2025.06 (#89)
# 🎉 Major Updates

- Add support for image-text-to-text models (e.g., Llama3.2-Vision and UI-TARS)
- Add support for additional text-to-text models (DeepAlignment, LlamaGuard3, and HarmBench Classifier)
- Add example attack against LLaDA, a large language diffusion model
- Add `DataMapper` abstraction to enable easy adaptation of existing datasets to models

# 🎈 Minor Updates

- Add `good_token_ids` support to GCG optimizer
- Save best attack to disk at last step and reduce save state for hard-token attacks
- Output only continuation tokens and not full prompt in evaluation
- Remove check for back-to-back tags in tokenizer
- Enable command-line modification of response via `response.prefix=` and `response.suffix=` (see the example sketch below)
- `TaggedTokenizer` now supports returning `input_map` when `return_tensors=None`

# 🚧 Bug Fixes

- Fix tokenizer prefix-space detection (e.g., Llama2's tokenizer)
- Allow early stop with multi-sample datasets
- All `make` commands now run in isolated virtual environments
- `max_new_tokens` generates exactly that many tokens at test time regardless of `eos_token`

---------

Co-authored-by: Copilot <[email protected]>
Co-authored-by: Marius Arvinte <[email protected]>
Co-authored-by: Weilin Xu <[email protected]>
1 parent 20a8760 commit 72ff5b8
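The new command-line response overrides are easiest to see by example. Below is a minimal sketch adapted from the updated `run-reasoning` target in this commit's Makefile; the model, dataset, prompt text, and step count are only illustrative, and `response.suffix=` is assumed to work analogously:

```bash
# Minimal sketch (values mirror the run-reasoning Makefile target changed in this commit).
# response.prefix= forces the optimized target response to begin with the given reasoning block;
# response.suffix= is assumed to append text to the target response in the same way.
uv run accelerate launch -m llmart model=deepseek-r1-distill-llama-8b data=basic \
    "response.prefix=`echo -e '\"<think>\nOkay, so I need to tell someone about Saturn.\n</think>\n\n\"'`" \
    steps=3
```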


50 files changed: +2043 −834 lines

Makefile

Lines changed: 30 additions & 27 deletions
@@ -10,46 +10,49 @@ define REPEATED_CONTENT
 > $(shell python -c "print('Okay, so I need to tell someone about Saturn.' * 65)")
 endef
 
-all: run-core run-cache run-batching run-reasoning run-examples
-
-install:
-> uv venv
-> uv sync --extra gpu --extra dev
-
-install-xpu:
-> uv venv
-> uv sync --extra xpu --extra dev
+all: run-core run-cache run-batching run-reasoning run-examples run-image-text-to-text run-dataset
 
 # Run target for README.md examples
-run-core: install
-> uv run accelerate launch -m llmart model=llama3-8b-instruct data=basic steps=3
-> uv run accelerate launch -m llmart model=custom model.name=Intel/neural-chat-7b-v3-3 model.revision=7506dfc5fb325a8a8e0c4f9a6a001671833e5b8e data=basic steps=3
-> uv run python -m llmart model=llama3.1-70b-instruct model.device=null model.device_map=auto data=basic steps=3
-> uv run accelerate launch -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=[0] steps=3
+run-core:
+> $(RUN_GPU) accelerate launch -m llmart model=llama3-8b-instruct data=basic steps=3
+> $(RUN_GPU) accelerate launch -m llmart model=custom model.name=Intel/neural-chat-7b-v3-3 model.revision=7506dfc5fb325a8a8e0c4f9a6a001671833e5b8e data=basic steps=3
+> $(RUN_GPU) python -m llmart model=llama3.1-70b-instruct model.device=null model.device_map=auto data=basic steps=3
+> $(RUN_GPU) accelerate launch -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=[0] steps=3
 
 # Run target for KV cache
-run-cache: install
-> uv run accelerate launch -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=[0] steps=3 use_kv_cache=True
+run-cache:
+> $(RUN_GPU) accelerate launch -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=[0] steps=3 use_kv_cache=True
 
 # Run target for batch configurations
-run-batching: install
-> uv run accelerate launch --num_processes 2 -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=null data.n_train=4 data.n_val=1 data.n_test=1 steps=2 bs=8
-> uv run accelerate launch --num_processes 2 -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=null data.n_train=13 data.n_val=1 data.n_test=1 steps=2 bs=1 per_device_bs=5
-> uv run accelerate launch --num_processes 3 -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=null data.n_train=13 data.n_val=1 data.n_test=1 steps=2 bs=13
+run-batching:
+> $(RUN_GPU) accelerate launch --num_processes 2 -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=null data.n_train=4 data.n_val=1 data.n_test=1 steps=2 bs=8
+> $(RUN_GPU) accelerate launch --num_processes 2 -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=null data.n_train=13 data.n_val=1 data.n_test=1 steps=2 bs=1 per_device_bs=5
+> $(RUN_GPU) accelerate launch --num_processes 3 -m llmart model=llama3-8b-instruct data=advbench_behavior data.subset=null data.n_train=13 data.n_val=1 data.n_test=1 steps=2 bs=13
 
 # Run target for reasoning examples
-run-reasoning: install
-> uv run accelerate launch -m llmart model=deepseek-r1-distill-llama-8b data=basic per_device_bs=64 "response.replace_with=`echo -e '\"<think>\nOkay, so I need to tell someone about Saturn.\n</think>\n\nNO WAY JOSE\"'`" steps=3
-> uv run accelerate launch -m llmart model=deepseek-r1-distill-llama-8b data=basic "response.replace_with=`echo -e '\"<think>\n$(REPEATED_CONTENT)\n</think>\n\nNO WAY JOSE\"'`" steps=3
+run-reasoning:
+> $(RUN_GPU) accelerate launch -m llmart model=deepseek-r1-distill-llama-8b data=basic per_device_bs=64 "response.prefix=`echo -e '\"<think>\nOkay, so I need to tell someone about Saturn.\n</think>\n\n\"'`" steps=3
+> $(RUN_GPU) accelerate launch -m llmart model=deepseek-r1-distill-llama-8b data=basic "response.prefix=`echo -e '\"<think>\n$(REPEATED_CONTENT)\n</think>\n\n\"'`" steps=3
 
 # Run target for XPU
-run-xpu: install-xpu
-> uv run accelerate launch -m llmart model=custom model.name=meta-llama/Llama-3.2-3B-Instruct model.revision=0cb88a4f764b7a12671c53f0838cd831a0843b95 data=basic model.device=xpu
+run-xpu:
+> $(RUN_XPU) accelerate launch -m llmart model=custom model.name=meta-llama/Llama-3.2-3B-Instruct model.revision=0cb88a4f764b7a12671c53f0838cd831a0843b95 data=basic model.device=xpu
+
+# Run image-text-to-text examples
+run-image-text-to-text:
+> $(RUN_GPU) accelerate launch -m llmart model=ui-tars-2b-sft data=ui-tars attack.suffix=20 attack.suffix_pad_left="" per_device_bs=1 optim.n_swaps=16 steps=3
+# NOTE: The example below does not run because mllama does not allow feeding pixel_values and inputs_embeds
+#> $(RUN_GPU) accelerate launch -m llmart model=llama3.2-11b-vision-instruct data=mllama per_device_bs=16 attack.suffix=40 steps=3
+
+# Run dataset examples
+run-dataset:
+> $(RUN_GPU) accelerate launch -m llmart model=harmbench-classifier data=harmbench 'data.subset=[2]' per_device_bs=8 steps=3
+> $(RUN_GPU) accelerate launch -m llmart model=llamaguard3-1b data=toxic-chat 'data.subset=[0]' attack.suffix_pad_right="." data.mapper=toxic_chat_model_output steps=3
 
-run-examples: install
+run-examples:
 > $(MAKE) -C examples
 
 clean:
 > rm -rf .venv
 
-.PHONY: all install install-xpu run-core run-cache run-batching run-reasoning run-xpu run-examples clean
+.PHONY: all run-core run-cache run-batching run-reasoning run-xpu run-image-text-to-text run-dataset run-examples clean

README.md

Lines changed: 49 additions & 19 deletions
@@ -11,12 +11,22 @@
 </div>
 
 ## 🆕 Latest updates
-❗Release 2025.04 brings full native support for running **LLM**art on [Intel AI PCs](https://www.intel.com/content/www/us/en/products/docs/processors/core-ultra/ai-pc.html)! This allows AI PC owners to _locally_ and rigorously evaluate the security of their own privately fine-tuned and deployed LLMs.
+Release 2025.06 significantly expands the types of models that can be attacked using **LLM**art and adds an image modality attack example that combines **LLM**art with Intel's [MART](https://github.com/IntelLabs/MART) library, as well as the first ever attack on a diffusion language model (dLLM)!
 
-This release also marks our transition to a `uv`-centric install experience. Enjoy robust, platform agnostic (Windows, Linux) one-line installs by using `uv sync --extra gpu` (for GPUs) or `uv sync --extra xpu` (for Intel XPUs).
+New core library support and examples for attacking VLMs. Check out our new [example](examples/vlm) on vision modality attacks against a [computer use model](https://huggingface.co/ByteDance-Seed/UI-TARS-7B-DPO)!
 
+❗New core library support for out-of-the-box attacks against guardrail models and data formats such as [HarmBench](https://github.com/centerforaisafety/HarmBench). Just specify the model and data directly in the command line and press the Enter key!
+```bash
+uv run accelerate launch -m llmart model=harmbench-classifier data=harmbench data.subset=[0]
+```
+
+❗New example for attacking the [LLaDA](https://ml-gsai.github.io/LLaDA-demo/) diffusion large language model. If you're an AI security expert, the conclusion won't surprise you: **LLM**art can crack it in ~10 minutes in our ready-to-run [example](examples/llada)!
+
+❗We made it easier to adapt existing datasets to existing models via the [DataMapper](src/llmart/data.py#L93) abstraction. See [Custom Dataset or DataMapper](#custom-dataset-or-datamapper) for more details!
 <details>
 <summary>Past updates</summary>
+❗Release 2025.04 brings full native support for running **LLM**art on [Intel AI PCs](https://www.intel.com/content/www/us/en/products/docs/processors/core-ultra/ai-pc.html)! This allows AI PC owners to _locally_ and rigorously evaluate the security of their own privately fine-tuned and deployed LLMs. This release also marks our transition to a `uv`-centric install experience. Enjoy robust, platform agnostic (Windows, Linux) one-line installs by using `uv sync --extra gpu` (for GPUs) or `uv sync --extra xpu` (for Intel XPUs).
+
 ❗Release 2025.03 brings a new experimental functionality for letting **LLM**art automatically estimate the maximum usable `per_device_bs`. This can result in speed-ups up to 10x on devices with a sufficient amount of memory! Enable from the command line using `per_device_bs=-1`.
 
 ❗Release 2025.02 brings significant speed-ups to the core library, with zero user involvement.\
@@ -147,31 +157,28 @@ uv run python -m llmart model=llama3.1-70b-instruct model.device=null model.devi
 ## :clipboard: Datasets
 Inspecting and adding a new dataset to **LLM**art only involves a single function that returns a list with the standard Hugging Face conversation format.
 
+### Source code modification
 The `basic` dataset is the simplest instantiation that lives at [`datasets/basic.py`](src/llmart/datasets/basic.py), but is also powerful enough to support custom modifications.
 
 For example, to modify the natural language prompt and desired open-ended response, the following basic edit is sufficient:
 ```diff
 diff src/llmart/datasets/basic.py
-# Create conversation data structure and mark parts we care about
-conv = [
-- dict(role="user", content=mark_prompt("Tell me about the planet Saturn.")),
-- dict(role="assistant", content=mark_completion("NO WAY JOSE")),
-+ dict(role="user", content=mark_prompt("Tell me how to do something not allowed.")),
-+ dict(role="assistant", content=mark_completion("Sure, I'll tell you: first, you")),
-]
+example = dict(
+- prompt="Tell me about the planet Saturn.", completion="NO WAY JOSE"
++ prompt="Tell me how to do something not allowed.", completion="Sure, I'll tell you: first, you"
+)
 ```
 
 Inducing a closed-ended response can also be done directly by typing out the end of turn token. For example, for the Llama 3 family of models this is:
 ```diff
 diff src/llmart/datasets/basic.py
-# Create conversation data structure and mark parts we care about
-conv = [
- dict(role="user", content=mark_prompt("Tell me about the planet Saturn.")),
-- dict(role="assistant", content=mark_completion("NO WAY JOSE")),
-+ dict(role="assistant", content=mark_completion("NO WAY JOSE<|eot_id|>")),
-]
+example = dict(
+- prompt="Tell me about the planet Saturn.", completion="NO WAY JOSE"
++ prompt="Tell me about the planet Saturn.", completion="No!<|eot_id|>"
+)
 ```
 
+### Command-line modification
 **LLM**art also supports loading the [AdvBench](https://github.com/llm-attacks/llm-attacks) dataset, which comes with pre-defined target responses to ensure consistent benchmarks.
 
 Using AdvBench with **LLM**art requires specifying the desired subset of samples to attack. By default, the following command will automatically download the .csv file from its [original source](https://raw.githubusercontent.com/llm-attacks/llm-attacks/refs/heads/main/data/advbench/harmful_behaviors.csv) and use it as a dataset:
@@ -182,11 +189,34 @@ uv run accelerate launch -m llmart model=llama3-8b-instruct data=advbench_behavi
 To train a single adversarial attack on multiple samples, users can specify the exact samples via `data.subset=[0,1]`.
 The above command is also compatible with local modifications of the dataset by including the `dataset.files=/path/to/file.csv` argument.
 
-In the most general case, you can write your own [dataset loading script](https://huggingface.co/docs/datasets/en/dataset_script) and pass it to **LLM**art:
+### Custom Dataset or DataMapper
+In the most general case, you can write your own [dataset loading script](https://huggingface.co/docs/datasets/en/dataset_script) or [DataMapper](src/llmart/data.py#L93) and pass it to **LLM**art. For example, you could write a custom `DataMapper` for the dataset from [BoN Jailbreaking](https://github.com/jplhughes/bon-jailbreaking/) targeting the [Unispac/Llama2-7B-Chat-Augmented](https://huggingface.co/Unispac/Llama2-7B-Chat-Augmented) model by creating a `/tmp/bon_jailbreaks.py` file with the following contents:
+```python
+from llmart import DataMapper
+
+
+class BoNJailbreaksMapper(DataMapper):
+    """ Make text_jailbreaks.csv compatible with Llama2 chat template. """
+    def __call__(self, batch):
+        # batch contains the following keys from text_jailbreaks.csv:
+        # direct_request,behavior_id,experiment,idx,model,augmented_file,response,length,label
+        convs = [
+            [
+                dict(role="user", content=self.modify_prompt(direct_request)),
+                dict(role="assistant", content=self.force_completion(response)),
+            ]
+            for direct_request, response in zip(
+                batch["direct_request"], batch["response"]
+            )
+        ]
+        return dict(conversation=convs)
+```
+You can then invoke the model:
 ```bash
-uv run accelerate launch -m llmart model=llama3-8b-instruct data=custom data.path=/path/to/dataset.py
+uv run accelerate launch -m llmart model=llama2-7b-deep-alignment data=custom data.path=csv data.files=https://raw.githubusercontent.com/jplhughes/bon-jailbreaking/refs/heads/main/docs/assets/data/text_jailbreaks.csv data.subset=[0] data.mapper=/tmp/bon_jailbreaks.py
 ```
-Just make sure you conform to the output format in [`datasets/basic.py`](src/llmart/datasets/basic.py).
+
+See [`datasets/basic.py`](src/llmart/datasets/basic.py) for how to write a custom dataset and/or datamapper.
 
 ## :chart_with_downwards_trend: Optimizers and schedulers
 Discrete optimization for language models [(Lei et al, 2019)](https://proceedings.mlsys.org/paper_files/paper/2019/hash/676638b91bc90529e09b22e58abb01d6-Abstract.html) &ndash; in particular the Greedy Coordinate Gradient (GCG) applied to auto-regressive LLMs [(Zou et al, 2023)](https://arxiv.org/abs/2307.15043) &ndash; is the main focus of [`optim.py`](src/llmart/optim.py).
@@ -216,7 +246,7 @@ If you find this repository useful in your work, please cite:
 author = {Cory Cornelius and Marius Arvinte and Sebastian Szyller and Weilin Xu and Nageen Himayat},
 title = {{LLMart}: {L}arge {L}anguage {M}odel adversarial robustness toolbox},
 url = {http://github.com/IntelLabs/LLMart},
-version = {2025.04},
+version = {2025.06},
 year = {2025},
 }
 ```

examples/Makefile

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
 include ../preamble.mk
 
-EXAMPLE_DIRS := basic autogcg fact_checking llmguard random_strings unlearning
+EXAMPLE_DIRS := basic autogcg fact_checking llada llmguard random_strings unlearning vlm
 
 all: run
 
examples/autogcg/Makefile

Lines changed: 1 addition & 1 deletion
@@ -15,6 +15,6 @@ args=--subset $(subset) --time_budget_s $(time_budget_s) --steps $(steps) --num_
 all: run
 
 run:
-> uv run --with-requirements requirements.txt main.py $(args)
+> $(RUN_GPU) --with-requirements requirements.txt main.py $(args)
 
 .PHONY: all run

examples/basic/Makefile

Lines changed: 5 additions & 2 deletions
@@ -12,6 +12,9 @@ args=--num_steps $(num_steps)
 all: run
 
 run:
-> uv run --with-requirements requirements.txt main.py $(args)
+> $(RUN_GPU) --with-requirements requirements.txt main.py $(args)
 
-.PHONY: all run
+run_notebook:
+> $(RUN_GPU) --with-requirements requirements.txt jupyter execute basic_dev_workflow.ipynb
+
+.PHONY: all run run_notebook

examples/basic/basic_dev_workflow.ipynb

Lines changed: 1 addition & 0 deletions
@@ -326,6 +326,7 @@
 " is_valid_input=wrapped_tokenizer.reencodes,\n",
 " batch_size=per_device_bs,\n",
 " use_kv_cache=False, # NOTE: KV caching is incompatible with optimizable position\n",
+" ignored_keys=wrapped_tokenizer.mask_names + [\"inputs_embeds\"],\n",
 ")\n",
 "\n",
 "# Advanced: use a scheduler to reduce \"n_tokens\" by 0.5x on loss plateau after 50 steps\n",

examples/basic/main.py

Lines changed: 2 additions & 0 deletions
@@ -106,6 +106,8 @@ def attack(
 is_valid_input=wrapped_tokenizer.reencodes,
 batch_size=per_device_bs,
 use_kv_cache=False, # NOTE: KV caching is incompatible with optimizable position
+ignored_keys=wrapped_tokenizer.mask_names
++ ["inputs_embeds"], # NOTE: AdversarialBlockShift returns inputs_embeds
 )
 
 # For each step

examples/fact_checking/Makefile

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@ args=--num_steps $(num_steps)
 all: run
 
 run:
-> uv run --with-requirements requirements.txt claim.py $(args)
-> uv run --with-requirements requirements.txt document.py $(args)
+> $(RUN_GPU) --with-requirements requirements.txt claim.py $(args)
+> $(RUN_GPU) --with-requirements requirements.txt document.py $(args)
 
 .PHONY: all run
examples/fact_checking/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 fire==0.7.0
-vllm==0.8.4
+vllm==0.9.0
 minicheck[llm] @ git+https://github.com/Liyan06/MiniCheck.git@main

examples/llada/Makefile

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+#
+# Copyright (C) 2025 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+include ../../preamble.mk
+
+num_steps=2
+args=--num_steps $(num_steps)
+
+all: run
+
+run:
+> wget https://raw.githubusercontent.com/ML-GSAI/LLaDA/f51cb1731f5a40ba35c15e51b6b66b147e689f24/generate.py
+> $(RUN_GPU) --with-requirements requirements.txt main.py $(args)
+
+.PHONY: all run
