Skip to content

Commit 9c2bace

Browse files
committed
Support Qwen3 Two Inputs (openvinotoolkit#575)
* Add Qwen3-Reranker Two Inputs Support
* Update Tests
1 parent e469d9b commit 9c2bace

File tree

6 files changed

+288
-279
lines changed

6 files changed

+288
-279
lines changed

README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
470470
<tr>
471471
<td >BPE</td>
472472
<td >99.26</td>
473-
<td >6216</td>
473+
<td >6218</td>
474474
</tr>
475475
<tr>
476476
<td >SentencePiece</td>
@@ -524,6 +524,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
524524
<td >100.00</td>
525525
<td >253</td>
526526
</tr>
527+
<tr>
528+
<td >BPE</td>
529+
<td >Qwen/Qwen3-Reranker-0.6B</td>
530+
<td >100.00</td>
531+
<td >269</td>
532+
</tr>
527533
<tr>
528534
<td >BPE</td>
529535
<td >Salesforce/codegen-16B-multi</td>
@@ -584,12 +590,6 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
584590
<td >100.00</td>
585591
<td >251</td>
586592
</tr>
587-
<tr>
588-
<td >BPE</td>
589-
<td >gpt2</td>
590-
<td >100.00</td>
591-
<td >267</td>
592-
</tr>
593593
<tr>
594594
<td >BPE</td>
595595
<td >koalajun/Gemma-2-9b-it-Ko-Crypto-Translate</td>

python/openvino_tokenizers/hf_parser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,9 @@ def post_tokenization(self) -> None:
306306
or post_processor_json["type"] == "ByteLevel"
307307
):
308308
self.add_truncation()
309+
self.pipeline.add_steps(
310+
CombineSegmentsStep([Sequence() for _ in range(self.number_of_inputs)], add_special_tokens=False)
311+
)
309312
self.add_padding(use_max_padding=self.use_max_padding)
310313
return
311314

python/openvino_tokenizers/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,13 @@ def update_rt_info_with_processor_template(
378378
parsed_post_processor = parse_processor_template(post_processor_json, hf_tokenizer)
379379
if parsed_post_processor is not None:
380380
ov_tokenizer.set_rt_info(json.dumps(parsed_post_processor), PROCESSED_POST_PROCESSOR_NAME)
381+
else:
382+
ov_tokenizer.set_rt_info(
383+
json.dumps(
384+
{"single": {"ids": [-1], "type_ids": [0]}, "pair": {"ids": [-1, -2], "type_ids": [0, 0]}}
385+
),
386+
PROCESSED_POST_PROCESSOR_NAME
387+
)
381388

382389

383390
def update_rt_info_with_params(

tests/pass_rates.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
{
2-
"tests/tokenizers_test.py::test_": 0.9524735626595356
2+
"tests/tokenizers_test.py::test_": 0.9524793388429752
33
}

tests/stats.json

Lines changed: 269 additions & 269 deletions
Large diffs are not rendered by default.

tests/tokenizers_test.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@
9999
"koalajun/Gemma-2-9b-it-Ko-Crypto-Translate",
100100
"roberta-base",
101101
"deepseek-ai/DeepSeek-V3-0324",
102-
"gpt2",
102+
"Qwen/Qwen3-Reranker-0.6B",
103103
"ai-forever/rugpt3large_based_on_gpt2",
104104
"facebook/galactica-120b",
105105
"microsoft/deberta-base",
@@ -134,7 +134,6 @@
134134
]
135135
tiktiken_models = [
136136
"Qwen/Qwen-14B-Chat",
137-
# "Salesforce/xgen-7b-8k-base", # not compatible with transformers 4.44.0
138137
"THUDM/glm-4-9b-chat",
139138
]
140139
wordlevel_models = ["cisco-ai/mini-bart-g2p"]

0 commit comments

Comments (0)