@@ -18,9 +18,10 @@ namespace onmt
1818 const std::string Tokenizer::spacer_marker = " ▁" ;
1919 const std::string Tokenizer::ph_marker_open = " ⦅" ;
2020 const std::string Tokenizer::ph_marker_close = " ⦆" ;
21+ const std::string Tokenizer::escaped_character_prefix = " %" ;
22+ const size_t Tokenizer::escaped_character_width = 4 ;
2123 static const unicode::code_point_t ph_marker_open_cp = 0xFF5F ;
2224 static const unicode::code_point_t ph_marker_close_cp = 0xFF60 ;
23- static const std::string protected_character = " %" ;
2425 static const std::vector<std::pair<unicode::code_point_t , std::string>> substitutes = {
2526 {0x2581 /* ▁ */ , " _" },
2627 {0xFFED /* ■ */ , " ■" },
@@ -32,7 +33,6 @@ namespace onmt
3233
3334 static const int placeholder_alphabet = -2 ;
3435 static const int number_alphabet = -3 ;
35- static const int hex_value_width = 4 ;
3636
3737 Tokenizer::Mode Tokenizer::str_to_mode (const std::string& mode)
3838 {
@@ -290,21 +290,23 @@ namespace onmt
290290
291291 static inline void unescape_characters (std::string& str)
292292 {
293+ const auto & prefix = Tokenizer::escaped_character_prefix;
294+ const auto & width = Tokenizer::escaped_character_width;
295+
293296 for (size_t offset = 0 ;;)
294297 {
295- const size_t index = str.find (protected_character, offset);
296- if (index == std::string::npos
297- || index + protected_character.size () + hex_value_width > str.size ())
298+ const size_t index = str.find (prefix, offset);
299+ if (index == std::string::npos || index + prefix.size () + width > str.size ())
298300 break ;
299301
300- const std::string code = str.substr (index + protected_character .size (), hex_value_width );
302+ const std::string code = str.substr (index + prefix .size (), width );
301303 const int v = hex_to_int (code);
302304 const std::string c = unicode::cp_to_utf8 (v);
303- if (c.empty () || !c[0 ] || int_to_hex (v, hex_value_width ) != code)
304- offset = index + protected_character .size ();
305+ if (c.empty () || !c[0 ] || int_to_hex (v, width ) != code)
306+ offset = index + prefix .size ();
305307 else
306308 {
307- str.replace (index, protected_character .size () + hex_value_width , c);
309+ str.replace (index, prefix .size () + width , c);
308310 offset = index + 1 ;
309311 }
310312 }
@@ -635,7 +637,8 @@ namespace onmt
635637 if (_no_substitution)
636638 append (character);
637639 else
638- append (protected_character + int_to_hex (character.value , hex_value_width));
640+ append (Tokenizer::escaped_character_prefix
641+ + int_to_hex (character.value , Tokenizer::escaped_character_width));
639642 }
640643
641644 void flush_feature ()
0 commit comments