Skip to content

Commit 9c2bace

Browse files
committed
Support Qwen3 Two Inputs (openvinotoolkit#575)
* Add Qwen3-Reranker Two Inputs Support
* Update Tests
1 parent e469d9b commit 9c2bace

File tree

6 files changed

+288
-279
lines changed

6 files changed

+288
-279
lines changed

README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
470470
<tr>
471471
<td >BPE</td>
472472
<td >99.26</td>
473-
<td >6216</td>
473+
<td >6218</td>
474474
</tr>
475475
<tr>
476476
<td >SentencePiece</td>
@@ -524,6 +524,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
524524
<td >100.00</td>
525525
<td >253</td>
526526
</tr>
527+
<tr>
528+
<td >BPE</td>
529+
<td >Qwen/Qwen3-Reranker-0.6B</td>
530+
<td >100.00</td>
531+
<td >269</td>
532+
</tr>
527533
<tr>
528534
<td >BPE</td>
529535
<td >Salesforce/codegen-16B-multi</td>
@@ -584,12 +590,6 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
584590
<td >100.00</td>
585591
<td >251</td>
586592
</tr>
587-
<tr>
588-
<td >BPE</td>
589-
<td >gpt2</td>
590-
<td >100.00</td>
591-
<td >267</td>
592-
</tr>
593593
<tr>
594594
<td >BPE</td>
595595
<td >koalajun/Gemma-2-9b-it-Ko-Crypto-Translate</td>

python/openvino_tokenizers/hf_parser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,9 @@ def post_tokenization(self) -> None:
306306
or post_processor_json["type"] == "ByteLevel"
307307
):
308308
self.add_truncation()
309+
self.pipeline.add_steps(
310+
CombineSegmentsStep([Sequence() for _ in range(self.number_of_inputs)], add_special_tokens=False)
311+
)
309312
self.add_padding(use_max_padding=self.use_max_padding)
310313
return
311314

python/openvino_tokenizers/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,13 @@ def update_rt_info_with_processor_template(
378378
parsed_post_processor = parse_processor_template(post_processor_json, hf_tokenizer)
379379
if parsed_post_processor is not None:
380380
ov_tokenizer.set_rt_info(json.dumps(parsed_post_processor), PROCESSED_POST_PROCESSOR_NAME)
381+
else:
382+
ov_tokenizer.set_rt_info(
383+
json.dumps(
384+
{"single": {"ids": [-1], "type_ids": [0]}, "pair": {"ids": [-1, -2], "type_ids": [0, 0]}}
385+
),
386+
PROCESSED_POST_PROCESSOR_NAME
387+
)
381388

382389

383390
def update_rt_info_with_params(

tests/pass_rates.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
{
2-
"tests/tokenizers_test.py::test_": 0.9524735626595356
2+
"tests/tokenizers_test.py::test_": 0.9524793388429752
33
}

tests/stats.json

Lines changed: 269 additions & 269 deletions
Large diffs are not rendered by default.

tests/tokenizers_test.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@
9999
"koalajun/Gemma-2-9b-it-Ko-Crypto-Translate",
100100
"roberta-base",
101101
"deepseek-ai/DeepSeek-V3-0324",
102-
"gpt2",
102+
"Qwen/Qwen3-Reranker-0.6B",
103103
"ai-forever/rugpt3large_based_on_gpt2",
104104
"facebook/galactica-120b",
105105
"microsoft/deberta-base",
@@ -134,7 +134,6 @@
134134
]
135135
tiktiken_models = [
136136
"Qwen/Qwen-14B-Chat",
137-
# "Salesforce/xgen-7b-8k-base", # not compatible with transformers 4.44.0
138137
"THUDM/glm-4-9b-chat",
139138
]
140139
wordlevel_models = ["cisco-ai/mini-bart-g2p"]

0 commit comments

Comments (0)