fix: SurrogatePairSegmenter problem in ISSUES#1100 and add testcode tokenizeCase6_correctly (#1103)

kin122 · web-flow · commit 867361365127 · 2025-05-08T14:23:16.000+08:00
diff --git a/core/src/main/java/org/wltea/analyzer/core/SurrogatePairSegmenter.java b/core/src/main/java/org/wltea/analyzer/core/SurrogatePairSegmenter.java
@@ -56,7 +56,7 @@ private void processSurrogatePairs(AnalyzeContext context) {
                 outputSingleCharLexeme(context, this.start);
                 this.highSurrogate = Optional.empty();
             }
-            this.start = context.getCursor();
+            this.start = -1;
             this.end = -1;
         }
 
diff --git a/core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java b/core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java
@@ -84,6 +84,38 @@ public void tokenizeCase5_correctly()
         assert values[3].equals("凤");
     }
 
+    /**
+     * Surrogate Pair混合超出缓存区测试
+     */
+    @Test
+    public void tokenizeCase6_correctly()
+    {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        // build a string with '菩' + 3995 spaces + 41 surrogate pairs (each followed by a space)
+        StringBuilder sb = new StringBuilder(4006);
+        sb.append("菩");
+        for (int i = 0; i < 3995; i++) {
+            sb.append(' ');
+        }
+        // Append the surrogate pair 41 times
+        for (int i = 0; i < 41; i++) {
+            sb.append("\uDB84\uDD2E ");
+        }
+        String[] values = tokenize(cfg, sb.toString());
+        
+        // First token should be '菩'
+        assert values[0].equals("菩");
+        
+        // There should be 42 tokens total (菩 + 41 surrogate pairs)
+        assert values.length == 42;
+        
+        // Verify all surrogate pair tokens
+        for (int i = 1; i <= 41; i++) {
+            assert values[i].equals("\uDB84\uDD2E") : "Token at index " + i + " is not the expected surrogate pair";
+        }
+    }
+
+
     /**
      * 用ik_max_word分词器分词
      */

Original file line number	Diff line number	Diff line change
`@@ -56,7 +56,7 @@ private void processSurrogatePairs(AnalyzeContext context) {`
`56`	`56`	`outputSingleCharLexeme(context, this.start);`
`57`	`57`	`this.highSurrogate = Optional.empty();`
`58`	`58`	`}`
`59`		`- this.start = context.getCursor();`
	`59`	`+ this.start = -1;`
`60`	`60`	`this.end = -1;`
`61`	`61`	`}`
`62`	`62`