File tree Expand file tree Collapse file tree 2 files changed +33
-1
lines changed
main/java/org/wltea/analyzer/core
test/java/org/wltea/analyzer/lucene Expand file tree Collapse file tree 2 files changed +33
-1
lines changed Original file line number Diff line number Diff line change @@ -56,7 +56,7 @@ private void processSurrogatePairs(AnalyzeContext context) {
5656 outputSingleCharLexeme (context , this .start );
5757 this .highSurrogate = Optional .empty ();
5858 }
59- this .start = context . getCursor () ;
59+ this .start = - 1 ;
6060 this .end = -1 ;
6161 }
6262
Original file line number Diff line number Diff line change @@ -84,6 +84,38 @@ public void tokenizeCase5_correctly()
8484 assert values [3 ].equals ("凤" );
8585 }
8686
87+ /**
88+ * Tests surrogate pairs mixed with other text when the input extends past the buffer.
89+ */
90+ @ Test
91+ public void tokenizeCase6_correctly ()
92+ {
93+ Configuration cfg = TestUtils .createFakeConfigurationSub (false );
94+ // build a string with '菩' + 3995 spaces + 41 surrogate pairs
95+ StringBuilder sb = new StringBuilder (4006 );
96+ sb .append ("菩" );
97+ for (int i = 0 ; i < 3995 ; i ++) {
98+ sb .append (' ' );
99+ }
100+ // Append the surrogate pair 41 times
101+ for (int i = 0 ; i < 41 ; i ++) {
102+ sb .append ("\uDB84 \uDD2E " );
103+ }
104+ String [] values = tokenize (cfg , sb .toString ());
105+
106+ // First token should be '菩'
107+ assert values [0 ].equals ("菩" );
108+
109+ // There should be 42 tokens total ('菩' + 41 surrogate pairs)
110+ assert values .length == 42 ;
111+
112+ // Verify that each of the 41 tokens after '菩' is the surrogate pair
113+ for (int i = 1 ; i <= 41 ; i ++) {
114+ assert values [i ].equals ("\uDB84 \uDD2E " ) : "Token at index " + i + " is not the expected surrogate pair" ;
115+ }
116+ }
117+
118+
87119 /**
88120 * 用ik_max_word分词器分词
89121 */
You can’t perform that action at this time.
0 commit comments