From fb485cb7a349ac9c3f3da48f9e2bc5a6646c40e0 Mon Sep 17 00:00:00 2001
From: Joshua Horton <joshua_horton@sil.org>
Date: Thu, 21 May 2026 16:01:35 -0500
Subject: [PATCH] change(web): generalize determineSuggestionRange

This facilitates using the same suggestion-application-range logic for all model types, not just the first-class ones that implement LexiconTraversals.

Build-bot: skip build:web
Test-bot: skip
---
 .../src/main/correction/context-token.ts      |  47 ++++++-
 .../worker-thread/src/main/predict-helpers.ts | 121 +++++++++++++-----
 .../determine-suggestion-range.tests.ts       |  20 +--
 3 files changed, 142 insertions(+), 46 deletions(-)

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
index 0c492c76255..ce1d8f9830a 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
@@ -35,11 +35,42 @@ function textToCharTransforms(text: string, transformId?: number): Transform[] {
     [...text].map(insert => ({insert, deleteLeft: 0}));
 }
 
+
+/**
+ * A minimal, ContextToken-compatible token shape.  Useful for representing
+ * tokens in cases that should not be considered correctable.
+ */
+export interface ContextTokenLike {
+  /**
+   * Text corresponding to the net effects of the most likely inputs received
+   * that can correspond to the represented token.
+   */
+  exampleInput: string;
+
+  /**
+   * The length, in codepoints, of the corrected text represented by the
+   * current token.
+   */
+  codepointLength: number;
+
+  /**
+   * Whether or not the token is likely still being edited by the user (due to
+   * adjacency of the caret).  Optional.
+   */
+  isPartial?: boolean;
+
+  /**
+   * A compact string-based representation of `inputRange` that maps
+   * compatible token source ranges to each other.  Optional.
+   */
+  sourceRangeKey?: string;
+}
+
 /**
  * Represents cached data about one token (either a word or a unit of whitespace)
  * in the context and associated correction-search progress and results.
  */
-export class ContextToken {
+export class ContextToken implements ContextTokenLike {
   /**
    * Indicates whether or not the token is considered whitespace.
    */
@@ -54,6 +85,10 @@ export class ContextToken {
   }
   private _searchModule: SearchQuotientNode;
 
+  /**
+   * Whether or not the token is likely still being edited by the user (due to
+   * adjacency of the caret)
+   */
   isPartial: boolean;
 
   /**
@@ -118,6 +153,14 @@ export class ContextToken {
     return new ContextToken(searchModule, isPartial);
   }
 
+  /**
+   * Reports the length in codepoints of corrected text represented by the
+   * current token, as tracked by the token's underlying search module.
+   */
+  get codepointLength(): number {
+    return this._searchModule.codepointLength;
+  }
+
   get inputCount() {
     return this._searchModule.inputCount;
   }
@@ -155,7 +198,7 @@ export class ContextToken {
 
   /**
    * Generates text corresponding to the net effects of the most likely inputs
-   * received that can correspond to the current instance.
+   * received that can correspond to the represented token.
    */
   get exampleInput(): string {
     return this.searchModule.bestExample.text;
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
index 891dd923cc8..b35796e83a4 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
@@ -5,9 +5,9 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre
 
 import TransformUtils from './transformUtils.js';
 import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js';
+import { ContextTokenLike } from './correction/context-token.js';
 import { ContextTokenization } from './correction/context-tokenization.js';
 import { ContextTracker } from './correction/context-tracker.js';
-import { ContextToken } from './correction/context-token.js';
 import { ContextState, determineContextSlideTransform } from './correction/context-state.js';
 import { ContextTransition } from './correction/context-transition.js';
 import { ExecutionTimer } from './correction/execution-timer.js';
@@ -75,6 +75,43 @@ export const CORRECTION_SEARCH_THRESHOLDS = {
   REPLACEMENT_SEARCH_THRESHOLD: 4 as const // e^-4 = 0.0183156388.  Allows "80%" of an extra edit.
 }
 
+/**
+ * Represents the minimum replacement range and effects required for
+ * suggestions.
+ *
+ * These values are based on properties of the transition from their base
+ * context-tokenization to their target-tokenization (and its represented
+ * context variant).
+ */
+export interface SuggestionReplacement<T extends ContextTokenLike> {
+  /**
+   * Tokens lost from the base context-tokenization in the target
+   * context-tokenization due to the transition event.
+   *
+   * These are implicitly replaced when applying Suggestions.
+   */
+  tokensToRemove: T[],
+
+  /**
+   * Tokens added (after the removed tokens) to the base context-tokenization to
+   * produce the target context-tokenization.
+   *
+   * As these are "new" tokens generated by the transition, Suggestions should
+   * represent corrections and predictions rooted upon these tokens.
+   */
+  tokensToPredict: T[],
+
+  /**
+   * Total left-deletion (in codepoints) needed when applying suggestions.
+   */
+  deleteLeft: number,
+
+  /**
+   * The id of the underlying context transition, if known.
+   */
+  transitionId?: number
+}
+
 /**
  * Collates information related to suggestions during the suggestion generation
  * process.
@@ -397,53 +434,67 @@ export function determineSuggestionAlignment(
  * @param variantForSuggestions
+ * @param equalityChecker Reports whether two (defined) tokens should be treated as the same token.
  * @returns
  */
-export function determineSuggestionRange(
-  userContextTokenization: ContextTokenization,
-  variantForSuggestions: ContextTokenization
-): { tokensToRemove: ContextToken[], tokensToPredict: ContextToken[] } {
-  // Assumption:  spaceIds monotonically increase as new ones are generated.
-  // Given this, we backtrace on the token tails until finding a spot where the
-  // spaceIds match, dropping any that are newer than the last found in the
-  // other.
-  //
-  // We full-replace all tokens affected by an applied suggestion, so if there's
-  // a mismatch between the final form of a token, that implies that suggestions
-  // would replace the original form of the token anyway.
-  const tokenSetA = userContextTokenization.tokens.slice();
-  const tokenSetB = variantForSuggestions.tokens.slice();
-
-  const tokensToRemove: ContextToken[] = [];
-  const tokensToPredict: ContextToken[] = [];
-
-  const tailIdFor = (tokens: ContextToken[]) => tokens[tokens.length-1]?.spaceId ?? -1;
-  let tailOfA = tailIdFor(tokenSetA);
-  let tailOfB = tailIdFor(tokenSetB);
-  while(tailOfA != tailOfB) {
-    if(tailOfA < tailOfB) {
-      tokensToPredict.push(tokenSetB.pop());
-      tailOfB = tailIdFor(tokenSetB);
-    } else {
-      tokensToRemove.push(tokenSetA.pop());
-      tailOfA = tailIdFor(tokenSetA);
+export function determineSuggestionRange<T extends ContextTokenLike>(
+  userContextTokenization: T[],
+  variantForSuggestions: T[],
+  equalityChecker: (a: T, b: T) => boolean
+): SuggestionReplacement<T> {
+  // Out-of-range lookups below yield undefined; those never count as a match.
+  const tokensMatch = (a: T, b: T) => !!a && !!b && equalityChecker(a, b);
+
+  const deleteLeftCalc = (tokenSet: T[], predictCount: number) => {
+    // TODO:  once we start activating multi-tokenization for real, only the
+    // 'reduce' component should remain.
+    return (predictCount > 1)
+      ? (tokenSet[tokenSet.length - 1]?.codepointLength ?? 0)
+      : tokenSet.reduce((prev, curr) => prev + curr.codepointLength, 0);
+  }
+
+  const tokenSetA = userContextTokenization.slice();
+  const tokenSetB = variantForSuggestions.slice();
+
+  // Locate each tokenization's first token within the other; -1 if absent.
+  const aHeadIndexInB = tokenSetB.findIndex((t) => tokensMatch(t, tokenSetA[0]));
+  const bHeadIndexInA = tokenSetA.findIndex((t) => tokensMatch(t, tokenSetB[0]));
+
+  if(aHeadIndexInB == -1 && bHeadIndexInA == -1) {
+    // Both are full replacements.
+    return {
+      tokensToRemove: tokenSetA,
+      tokensToPredict: tokenSetB,
+      deleteLeft: deleteLeftCalc(tokenSetA, tokenSetB.length)
     }
+  } else if(aHeadIndexInB > 0 && bHeadIndexInA > 0) {
+    throw new Error("Leading edge of context should not differ in both tokenizations.");
+  }
+
+  // A head missing from the other tokenization (-1) means that side begins
+  // at its own first token, so clamp the sentinel to offset 0.
+  const startInA = Math.max(bHeadIndexInA, 0);
+  const startInB = Math.max(aHeadIndexInB, 0);
+
+  let tailOffset = 0;
+  while(tokensMatch(tokenSetA[startInA + tailOffset], tokenSetB[startInB + tailOffset])) {
+    tailOffset++;
   }
 
-  tokensToPredict.reverse();
+  const tokensToRemove: T[] = tokenSetA.slice(startInA + tailOffset);
+  const tokensToPredict: T[] = tokenSetB.slice(startInB + tailOffset);
 
   // Can occur when backspacing to the end of a previous word.
   if(tokensToPredict.length == 0) {
     if(tokenSetA.length == 0 || tokenSetB.length == 0) {
       throw new Error("Invalid state - a tokenization is missing expected tokens");
     }
-    tokensToRemove.push(tokenSetA.pop());
-    tokensToPredict.push(tokenSetB.pop());
+    tokensToRemove.unshift(tokenSetA[startInA + tailOffset - 1]);
+    tokensToPredict.unshift(tokenSetB[startInB + tailOffset - 1]);
   }
 
-  tokensToRemove.reverse();
-
   return {
     tokensToRemove,
-    tokensToPredict
+    tokensToPredict,
+    deleteLeft: deleteLeftCalc(tokensToRemove, tokensToPredict.length)
   }
 }
 
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts
index 3445abaadd9..e18174895ec 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts
@@ -170,12 +170,14 @@ function buildQuickBrownFixture() {
   };
 }
 
+const tokenEquality = (a: ContextToken, b: ContextToken) => a.spaceId == b.spaceId; // Tokens are considered equal when their spaceIds match.
+
 describe('determineSuggestionRange', () => {
   it('adjusts the final token if no tokenization changes occur', () => {
     const fixture = buildQuickBrownFixture();
     const noChange = fixture.variations.noChange;
 
-    const analysis = determineSuggestionRange(fixture.baseTokenization, noChange.tokenization);
+    const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, noChange.tokenization.tokens, tokenEquality);
 
     assert.sameOrderedMembers(analysis.tokensToRemove, noChange.range.tokensToRemove);
     assert.sameOrderedMembers(analysis.tokensToPredict, noChange.range.tokensToPredict);
@@ -185,7 +187,7 @@ describe('determineSuggestionRange', () => {
     const fixture = buildQuickBrownFixture();
     const plainInsert = fixture.variations.plainInsert;
 
-    const analysis = determineSuggestionRange(fixture.baseTokenization, plainInsert.tokenization);
+    const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, plainInsert.tokenization.tokens, tokenEquality);
 
     assert.sameOrderedMembers(analysis.tokensToRemove, plainInsert.range.tokensToRemove);
     assert.sameOrderedMembers(analysis.tokensToPredict, plainInsert.range.tokensToPredict);
@@ -195,7 +197,7 @@ describe('determineSuggestionRange', () => {
     const fixture = buildQuickBrownFixture();
     const newTokenInsert = fixture.variations.newTokenInsert;
 
-    const analysis = determineSuggestionRange(fixture.baseTokenization, newTokenInsert.tokenization);
+    const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, newTokenInsert.tokenization.tokens, tokenEquality);
 
     assert.sameOrderedMembers(analysis.tokensToRemove, newTokenInsert.range.tokensToRemove);
     assert.sameOrderedMembers(analysis.tokensToPredict, newTokenInsert.range.tokensToPredict);
@@ -205,7 +207,7 @@ describe('determineSuggestionRange', () => {
     const fixture = buildQuickBrownFixture();
     const charReplace = fixture.variations.charReplace;
 
-    const analysis = determineSuggestionRange(fixture.baseTokenization, charReplace.tokenization);
+    const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, charReplace.tokenization.tokens, tokenEquality);
 
     assert.sameOrderedMembers(analysis.tokensToRemove, charReplace.range.tokensToRemove);
     assert.sameOrderedMembers(analysis.tokensToPredict, charReplace.range.tokensToPredict);
@@ -215,7 +217,7 @@ describe('determineSuggestionRange', () => {
     const fixture = buildQuickBrownFixture();
     const del5Insert5 = fixture.variations.del5Insert5;
 
-    const analysis = determineSuggestionRange(fixture.baseTokenization, del5Insert5.tokenization);
+    const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, del5Insert5.tokenization.tokens, tokenEquality);
 
     assert.sameOrderedMembers(analysis.tokensToRemove, del5Insert5.range.tokensToRemove);
     assert.sameOrderedMembers(analysis.tokensToPredict, del5Insert5.range.tokensToPredict);
@@ -225,7 +227,7 @@ describe('determineSuggestionRange', () => {
     const fixture = buildQuickBrownFixture();
     const eraseToken = fixture.variations.eraseToken;
 
-    const analysis = determineSuggestionRange(fixture.baseTokenization, eraseToken.tokenization);
+    const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, eraseToken.tokenization.tokens, tokenEquality);
 
     assert.sameOrderedMembers(analysis.tokensToRemove, eraseToken.range.tokensToRemove);
     assert.sameOrderedMembers(analysis.tokensToPredict, eraseToken.range.tokensToPredict);
@@ -235,7 +237,7 @@ describe('determineSuggestionRange', () => {
     const fixture = buildQuickBrownFixture();
     const deleteToBound = fixture.variations.deleteToBound;
 
-    const analysis = determineSuggestionRange(fixture.baseTokenization, deleteToBound.tokenization);
+    const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, deleteToBound.tokenization.tokens, tokenEquality);
 
     assert.sameOrderedMembers(analysis.tokensToRemove, deleteToBound.range.tokensToRemove);
     assert.sameOrderedMembers(analysis.tokensToPredict, deleteToBound.range.tokensToPredict);
@@ -255,7 +257,7 @@ describe('determineSuggestionRange', () => {
       null
     )
 
-    const analysis = determineSuggestionRange(originalQuickBrownTokenization, foxVsAlligatorTokenization);
+    const analysis = determineSuggestionRange(originalQuickBrownTokenization.tokens, foxVsAlligatorTokenization.tokens, tokenEquality);
 
     assert.sameOrderedMembers(
       analysis.tokensToRemove,
@@ -279,7 +281,7 @@ describe('determineSuggestionRange', () => {
       null
     )
 
-    const analysis = determineSuggestionRange(originalQuickBrownTokenization, dogsAndCatTokenization);
+    const analysis = determineSuggestionRange(originalQuickBrownTokenization.tokens, dogsAndCatTokenization.tokens, tokenEquality);
 
     assert.sameOrderedMembers(
       analysis.tokensToRemove,