Skip to content

Commit 8673613

Browse files
authored
fix: SurrogatePairSegmenter problem in ISSUES#1100 and add testcode tokenizeCase6_correctly (#1103)
1 parent 22a1644 commit 8673613

File tree

2 files changed

+33
-1
lines changed

2 files changed

+33
-1
lines changed

core/src/main/java/org/wltea/analyzer/core/SurrogatePairSegmenter.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ private void processSurrogatePairs(AnalyzeContext context) {
5656
outputSingleCharLexeme(context, this.start);
5757
this.highSurrogate = Optional.empty();
5858
}
59-
this.start = context.getCursor();
59+
this.start = -1;
6060
this.end = -1;
6161
}
6262

core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,38 @@ public void tokenizeCase5_correctly()
8484
assert values[3].equals("凤");
8585
}
8686

87+
/**
88+
* Surrogate Pair混合超出缓存区测试
89+
*/
90+
@Test
91+
public void tokenizeCase6_correctly()
92+
{
93+
Configuration cfg = TestUtils.createFakeConfigurationSub(false);
94+
// Build a string with '菩' + 3995 spaces + 41 surrogate pairs (each followed by a space)
95+
StringBuilder sb = new StringBuilder(4006);
96+
sb.append("菩");
97+
for (int i = 0; i < 3995; i++) {
98+
sb.append(' ');
99+
}
100+
// Append the surrogate pair 41 times
101+
for (int i = 0; i < 41; i++) {
102+
sb.append("\uDB84\uDD2E ");
103+
}
104+
String[] values = tokenize(cfg, sb.toString());
105+
106+
// First token should be '菩'
107+
assert values[0].equals("菩");
108+
109+
// There should be 42 tokens total (菩 + 41 surrogate pairs)
110+
assert values.length == 42;
111+
112+
// Verify all surrogate pair tokens
113+
for (int i = 1; i <= 41; i++) {
114+
assert values[i].equals("\uDB84\uDD2E") : "Token at index " + i + " is not the expected surrogate pair";
115+
}
116+
}
117+
118+
87119
/**
88120
* 用ik_max_word分词器分词
89121
*/

0 commit comments

Comments
 (0)