
Commit 6dc4c39

Update Metaspace Conversion (openvinotoolkit#578)
1 parent 0f72a13 commit 6dc4c39

File tree

6 files changed: +66 -57 lines


README.md

Lines changed: 9 additions & 9 deletions
@@ -469,12 +469,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
   <tbody>
     <tr>
       <td >BPE</td>
-      <td >99.26</td>
-      <td >6218</td>
+      <td >99.45</td>
+      <td >6216</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >89.50</td>
+      <td >89.76</td>
       <td >6036</td>
     </tr>
     <tr>
@@ -515,7 +515,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td >BPE</td>
       <td >NousResearch/Llama-2-13b-hf</td>
-      <td >97.61</td>
+      <td >100.00</td>
       <td >251</td>
     </tr>
     <tr>
@@ -605,7 +605,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td >BPE</td>
       <td >llava-hf/LLaVA-NeXT-Video-7B-hf</td>
-      <td >97.61</td>
+      <td >100.00</td>
       <td >251</td>
     </tr>
     <tr>
@@ -671,13 +671,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td >SentencePiece</td>
       <td >NousResearch/Llama-2-13b-hf</td>
-      <td >94.42</td>
+      <td >96.02</td>
       <td >251</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
       <td >NousResearch/Llama-2-13b-hf_legacy</td>
-      <td >97.61</td>
+      <td >99.20</td>
       <td >251</td>
     </tr>
     <tr>
@@ -737,13 +737,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td >SentencePiece</td>
       <td >llava-hf/LLaVA-NeXT-Video-7B-hf</td>
-      <td >93.63</td>
+      <td >95.22</td>
       <td >251</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
       <td >llava-hf/LLaVA-NeXT-Video-7B-hf_legacy</td>
-      <td >96.81</td>
+      <td >98.41</td>
       <td >251</td>
     </tr>
     <tr>

benchmark/benchmark.py

Lines changed: 3 additions & 0 deletions
@@ -212,6 +212,9 @@ def main(
 ) -> None:
     hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=trust)
 
+    if hf_tokenizer.pad_token is None:
+        hf_tokenizer.pad_token = hf_tokenizer.eos_token
+
     hint = properties.hint.PerformanceMode.THROUGHPUT if tput else properties.hint.PerformanceMode.LATENCY
     config = {properties.hint.performance_mode(): hint}
     if per_layer_stats:
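The pad-token fallback added above matters because Hugging Face tokenizers refuse to pad a batch when no pad token is defined, which is common for decoder-only checkpoints. A minimal sketch of the same workaround in isolation (the checkpoint name is taken from the README tables above; everything else mirrors the added lines):

    from transformers import AutoTokenizer

    # Many decoder-only tokenizers ship without a pad token, so encoding a batch
    # with padding=True would raise an error; reusing EOS as padding is the usual fix.
    hf_tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-13b-hf")
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.pad_token = hf_tokenizer.eos_token

    batch = hf_tokenizer(["short prompt", "a noticeably longer prompt"], padding=True)
    print(len(batch["input_ids"][0]) == len(batch["input_ids"][1]))  # True: rows padded to equal length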

python/openvino_tokenizers/hf_parser.py

Lines changed: 0 additions & 18 deletions
@@ -211,24 +211,6 @@ def parse_normalizer_step(self, step_dict: dict[str, Any]) -> None:
         except KeyError:
             raise OVTypeError(f"Normalizer type '{step_dict['type']}' is not supported")
 
-    @staticmethod
-    def check_metaspace_normalizer(normalizer_dict: dict[str, Any]) -> bool:
-        if normalizer_dict.get("type") == "Sequence":
-            normalizers = normalizer_dict["normalizers"]
-
-            if len(normalizers) != 2:
-                return False
-            first, second = normalizers
-            first_prerend = bool(first.get("type") == "Prepend" and first.get("prepend") == "▁")
-            second_replace = bool(
-                second.get("type") == "Replace"
-                and second.get("pattern", {}).get("String") == " "
-                and second.get("content") == "▁"
-            )
-            return first_prerend and second_replace
-
-        return False
-
     def normalization(self) -> None:
         if self.tokenizer_json["normalizer"] is None:
             return
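For reference, the deleted helper recognized a tokenizer.json normalizer of the shape below; with this commit the metaspace handling appears to move into the pipeline finalization in tokenizer_pipeline.py. A hypothetical minimal example reconstructed from the removed checks:

    # Hypothetical normalizer section that check_metaspace_normalizer used to match:
    # a Prepend of the metaspace character followed by a Replace of " " with "▁".
    metaspace_like_normalizer = {
        "type": "Sequence",
        "normalizers": [
            {"type": "Prepend", "prepend": "▁"},
            {"type": "Replace", "pattern": {"String": " "}, "content": "▁"},
        ],
    }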

python/openvino_tokenizers/tokenizer_pipeline.py

Lines changed: 25 additions & 1 deletion
@@ -1493,12 +1493,32 @@ def merge_regex_split_steps(self) -> None:
         steps_without_pre_tokenization[first_step_position:first_step_position] = new_regex_split_steps
         self.steps = steps_without_pre_tokenization
 
+    def update_metaspace_step_with_special_tokens(self) -> None:
+        """
+        No metaspace insertion when input starts with special token.
+        """
+        if not self.is_metaspace_prepend_first:
+            return
+        special_tokens_split = next(
+            (step for step in self.steps if isinstance(step, SpecialTokensSplit)),
+            None,
+        )
+        if not special_tokens_split:
+            return
+        metaspace_step, special_tokens_split = self.steps[:2]
+
+        metaspace_step.regex_search_pattern = r"(^)((?!{}| |$)|(?=[\r\n\t\f\v]))".format(
+            "|".join(quote_meta(token.text) for token in special_tokens_split.special_tokens)
+        )
+        metaspace_step.global_replace = False
+
     def finalize(self) -> None:
         if self.finalized:
             return
 
         self.merge_normalization_steps()
         self.del_duplicated_split_steps()
+        self.update_metaspace_step_with_special_tokens()
 
         for step in copy(self.steps):
             step.finalize()
@@ -1507,6 +1527,10 @@ def finalize(self) -> None:
         self.merge_regex_split_steps()
         self.finalized = True
 
+    @property
+    def is_metaspace_prepend_first(self) -> bool:
+        return isinstance(self.steps[0], RegexNormalizationStep)
+
     def get_tokenizer_ov_subgraph(self) -> Model:
         self.finalize()
 
@@ -1516,7 +1540,7 @@ def get_tokenizer_ov_subgraph(self) -> Model:
         for input_node in string_inputs:
             input_node = _get_opset_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
 
-            if isinstance(self.steps[0], RegexNormalizationStep):
+            if self.is_metaspace_prepend_first:
                 prepend_metaspace_step = self.steps.pop(0)
                 input_node = prepend_metaspace_step.get_ov_subgraph(input_node)
 
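The rewritten metaspace step now prepends the metaspace only at the start of a string that does not begin with a special token or a space and is not empty, while a leading \r, \n, \t, \f, or \v still gets the prepend. A small sketch of the pattern's behaviour with plain re, assuming "<s>" is the only special token and "▁" is the replace term (the actual step escapes token texts with quote_meta and sets global_replace = False, which corresponds to count=1 here):

    import re

    # Assumed special-token list; the pipeline builds this from SpecialTokensSplit.
    special_tokens = ["<s>"]
    pattern = r"(^)((?!{}| |$)|(?=[\r\n\t\f\v]))".format("|".join(re.escape(t) for t in special_tokens))

    print(re.sub(pattern, "▁", "Hello world", count=1))     # ▁Hello world
    print(re.sub(pattern, "▁", "<s>Hello world", count=1))  # <s>Hello world  (no metaspace after a special token)
    print(re.sub(pattern, "▁", " leading space", count=1))  # unchanged: the input already starts with a space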

tests/pass_rates.json

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-    "tests/tokenizers_test.py::test_": 0.9524793388429752
+    "tests/tokenizers_test.py::test_": 0.9541752765285038
 }
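The stored rate rises from 0.9524793388429752 to 0.9541752765285038, in line with the improved pass rates in the README tables above. Presumably this file records the baseline a test run is compared against; a hypothetical sketch of such a gate (the file path is real, the checking logic is illustrative only):

    import json

    # Hypothetical baseline check: fail if the pass rate for the recorded
    # test-name prefix drops below the value stored in tests/pass_rates.json.
    with open("tests/pass_rates.json") as f:
        baselines = json.load(f)

    current_pass_rate = 0.9541752765285038  # e.g. passed / collected in the latest run
    assert current_pass_rate >= baselines["tests/tokenizers_test.py::test_"]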
