From 4538c2856c52ed04eac7a25d3fd0dbb091ecf976 Mon Sep 17 00:00:00 2001
From: Joshua Horton <joshua_horton@sil.org>
Date: Tue, 2 Jun 2026 09:17:34 -0500
Subject: [PATCH 1/4] fix(web): transitionId undefined check, extra unit test

---
 .../worker-thread/src/main/predict-helpers.ts |  2 +-
 .../predict-from-correction-sequence.tests.ts | 47 +++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
index f0b067f0c48..3437d7b4c66 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
@@ -750,7 +750,7 @@ export function predictFromCorrectionSequence(
       }
     }, {sample: '', p: 1})
 
-    if(transitionId) {
+    if(transitionId !== undefined) {
       fullPrediction.sample.transform.id = transitionId;
       fullPrediction.sample.transformId = transitionId;
     }
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts
index dc76a4c1db9..a5582f1b42b 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts
@@ -191,6 +191,53 @@ describe('predictFromCorrectionSequence', () => {
       assert.approximately(predictions[1].totalProb, 0.02 * 0.6, 0.00001);
       predictions.forEach((prediction) => assert.equal(prediction.prediction.sample.transformId, transitionID));
     });
+
+    it('constructs suggestions without input (as if after a context reset)', () => {
+      const context: Context = {
+        left: 'appl',
+        right: '',
+        startOfBuffer: true,
+        endOfBuffer: true
+      };
+
+      const correctionDistribution: Distribution<Transform> = [{
+          sample: {
+            insert: 'appl',
+            deleteLeft: 4
+          },
+          p: 1
+        }
+      ];
+
+      const dummied_suggestions: Outcome<Suggestion>[] = [
+        {
+          transform: {
+            insert: "apple",
+            deleteLeft: 4
+          },
+          displayAs: "apple",
+          p: 0.5
+        }
+      ];
+
+      const model = new DummyModel({
+        ...DUMMY_MODEL_CONFIG,
+        futureSuggestions: [ dummied_suggestions ]
+      });
+
+      const transitionID = 12345;
+      const predictions = predictFromCorrectionSequence(model, correctionDistribution, context, transitionID);
+      predictions.forEach((entry) => assert.equal(entry.correction.sample, 'appl'));
+      predictions.forEach((entry) => assert.equal(entry.correction.p, 1));
+      predictions.sort(tupleDisplayOrderSort);
+
+      assert.sameDeepOrderedMembers(predictions.map((entry) => entry.prediction.sample), dummied_suggestions.map((s) => {
+        delete s.p;
+        s.transformId = transitionID;
+        s.transform.id = transitionID;
+        return s;
+      }));
+    });
   });
 
   describe('on a sequence of corrections', () => {

From b576c939b002b9d34c33c087a7ca659971c1087c Mon Sep 17 00:00:00 2001
From: Joshua Horton <joshua_horton@sil.org>
Date: Tue, 19 May 2026 13:09:12 -0500
Subject: [PATCH 2/4] refactor(web): expose suggestion-root parameters for use
 in unit tests

Build-bot: skip build:web
Test-bot: skip
---
 .../worker-thread/src/main/predict-helpers.ts |  85 +++--
 ...ine-tokenized-correction-sequence.tests.ts | 361 ++++++++++++++++++
 2 files changed, 423 insertions(+), 23 deletions(-)
 create mode 100644 web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
index 3437d7b4c66..35f2df19db8 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
@@ -13,7 +13,6 @@ import { ContextTransition } from './correction/context-transition.js';
 import { ExecutionTimer } from './correction/execution-timer.js';
 import ModelCompositor from './model-compositor.js';
 import { EDIT_DISTANCE_COST_SCALE, getBestTokenMatches } from './correction/distance-modeler.js';
-import { TokenResultMapping } from './correction/token-result-mapping.js';
 
 const searchForProperty = defaultWordbreaker.searchForProperty;
 
@@ -28,6 +27,7 @@ import Reversion = LexicalModelTypes.Reversion;
 import Suggestion = LexicalModelTypes.Suggestion;
 import SuggestionTag = LexicalModelTypes.SuggestionTag;
 import Transform = LexicalModelTypes.Transform;
+import { TokenResult } from './correction/tokenization-corrector.js';
 
 /*
  * The functions in this file exist to provide unit-testable stateless components for the
@@ -442,24 +442,55 @@ export function determineSuggestionRange<T extends ContextTokenLike>(
   }
 }
 
+/**
+ * Specifies the core, preprocessed data necessary for generating predictions,
+ * regardless of model type.
+ */
+export interface PredictionParameters {
+  /**
+   * The portion of context that should remain unchanged by generated suggestions.
+   */
+  rootContext: Context,
+
+  /**
+   * A tokenization of the corrected part of the context, usable to generate
+   * suggestions.
+   *
+   * Note that each correction will be applied iteratively to the rootContext.
+   * That is, when suggesting based on the correction at index 1, the
+   * "unchanged" (root) context used for that suggestion will include the
+   * changes from the entry at index 0 (or possibly, a suggestion derived from it).
+   */
+  tokenizedCorrection: ProbabilityMass<Transform>[],
+
+  /**
+   * A closure to be applied to each generated prediction tuple's metadata.
+   * @param entry  The correction-prediction tuple to update; it is mutated in place.
+   * @returns      Nothing; the closure's result type is `void`.
+   */
+  applyInPost: (entry: CorrectionPredictionTuple) => void
+}
+
 /**
  * This function takes in metadata about generated corrections (for models that
- * implement Traversals) and uses that to construct predictions based upon those
- * corrections.
- * @param transition    Context-transition data underlying the tokenization that led to the correction
- * @param tokenization  The tokenization from which the correction was generated.
- * @param match         The generated correction itself - the correction string and its cost
- * @param costFactor    A multiplicative factor used to adjust the cost when building prediction probabilities.
+ * implement Traversals) and uses that to produce the corresponding parameters
+ * to use for generating suggestions.
+ * @param transition    Context-transition data underlying the tokenization that
+ * led to the correction
+ * @param tokenization  The tokenization from which the correction was
+ * generated.
+ * @param match         The generated correction itself - the correction string
+ * and its cost
+ * @param costFactor    A multiplicative factor used to adjust the cost when
+ * building prediction probabilities.
  * @returns
  */
-export function buildAndMapPredictions(
+export function determineTokenizedCorrectionSequence(
   transition: ContextTransition,
   tokenization: ContextTokenization,
-  match: Readonly<TokenResultMapping>,
+  match: Readonly<TokenResult>,
   costFactor: number
-): CorrectionPredictionTuple[] {
-  const model = transition.final.model;
-
+): PredictionParameters {
   const applicationTarget = transition.base.displayTokenization;
   const { deleteLeft } = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId);
 
@@ -469,7 +500,12 @@ export function buildAndMapPredictions(
   const correctionTransform: Transform = {
     insert: match.matchString,  // insert correction string
     deleteLeft: 0,
-    id: transition.transitionId // The correction should always be based on the most recent external transform/transcription ID.
+  }
+
+  // The correction should always be based on the most recent external
+  // transform/transcription ID.
+  if(transition.transitionId !== undefined) {
+    correctionTransform.id = transition.transitionId;
   }
 
   const rootCost = match.totalCost;
@@ -478,15 +514,16 @@ export function buildAndMapPredictions(
     p: Math.exp(-rootCost * costFactor)
   };
 
-  const predictions = predictFromCorrectionSequence(model, [predictionRoot], rootContext, transition.transitionId);
-  predictions.forEach((entry) => {
-    entry.preservationTransform = tokenization.taillessTrueKeystroke;
-    // // Will need an extra lookup layer if the suggestion is generated from within a cluster.
-    // entry.baseTokenization = transition.final.tokenizationSourceMap.get(tokenization);
-    entry.prediction.sample.transform.deleteLeft = deleteLeft;
-  });
-
-  return predictions;
+  return {
+    rootContext,
+    tokenizedCorrection: [predictionRoot],
+    applyInPost: (entry: CorrectionPredictionTuple) => {
+      entry.preservationTransform = tokenization.taillessTrueKeystroke;
+      // // Will need an extra lookup layer if the suggestion is generated from within a cluster.
+      // entry.baseTokenization = transition.final.tokenizationSourceMap.get(tokenization);
+      entry.prediction.sample.transform.deleteLeft = deleteLeft;
+    }
+  };
 }
 
 /**
@@ -600,7 +637,9 @@ export async function correctAndEnumerate(
      */
     const costFactor = (tokenization.tail.inputCount <= 1) ? ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT : 1;
 
-    const predictions = buildAndMapPredictions(transition, tokenization, match, costFactor);
+    const predictionPrep = determineTokenizedCorrectionSequence(transition, tokenization, match, costFactor);
+    const predictions = predictFromCorrectionSequence(lexicalModel, predictionPrep.tokenizedCorrection, predictionPrep.rootContext, transition.transitionId);
+    predictions.forEach((p) => predictionPrep.applyInPost(p));
 
     // Only set 'best correction' cost when a correction ACTUALLY YIELDS predictions.
     if(predictions.length > 0 && bestCorrectionCost === undefined) {
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts
new file mode 100644
index 00000000000..16df9814971
--- /dev/null
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts
@@ -0,0 +1,361 @@
+/*
+ * Keyman is copyright (C) SIL Global. MIT License.
+ *
+ * Created by jahorton on 2026-05-19
+ *
+ * This file tests the prediction helper-method responsible for preparing
+ * corrections for multi-token prediction for our standard models, all of which
+ * utilize LexiconTraversals and the context-tokenization-caching subsystem.
+ */
+
+import { assert } from 'chai';
+
+import { LexicalModelTypes } from "@keymanapp/common-types";
+import * as wordBreakers from '@keymanapp/models-wordbreakers';
+import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs';
+import { KMWString } from '@keymanapp/web-utils';
+
+import { determineTokenizedCorrectionSequence, models, ContextState, ContextToken, ContextTokenization, CorrectionPredictionTuple } from "@keymanapp/lm-worker/test-index";
+
+import Context = LexicalModelTypes.Context;
+import ProbabilityMass = LexicalModelTypes.ProbabilityMass;
+import Transform = LexicalModelTypes.Transform;
+import TrieModel = models.TrieModel;
+
+const testModel = new TrieModel(
+  jsonFixture('models/tries/english-1000'), {
+    wordBreaker: wordBreakers.default,
+  }
+);
+
+describe('determineTokenizedCorrectionSequence', () => {
+  it(`properly analyzes common-case token-extension - adding a letter to an existing word`, () => {
+    const context: Context = {
+      left: 'the quick brown f',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    };
+
+    const trueInput: ProbabilityMass<Transform> = {
+      sample: {
+        insert: 'o',
+        deleteLeft: 0
+      },
+      p: .5
+    };
+
+    const state = new ContextState(context, testModel);
+    const transition = state.analyzeTransition(context, [trueInput]);
+
+
+    const results = determineTokenizedCorrectionSequence(
+      transition,
+      transition.final.displayTokenization, {
+        matchString: 'fo',
+        inputSamplingCost: -Math.log(trueInput.p),
+        knownCost: 0,
+        totalCost: -Math.log(trueInput.p)
+      },
+      1
+    );
+
+    assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
+      casingForm: undefined,
+      left: 'the quick brown ',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    });
+
+    assert.deepEqual(results.tokenizedCorrection, [
+      {
+        sample: {
+          insert: 'fo',
+          deleteLeft: 0
+        },
+        p: trueInput.p
+      }
+    ]);
+  });
+
+  it(`properly analyzes common-case whitespace - ending a token and adding a new one`, () => {
+    const context: Context = {
+      left: 'the quick brown',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    };
+
+    const trueInput: ProbabilityMass<Transform> = {
+      sample: {
+        insert: ' ',
+        deleteLeft: 0
+      },
+      p: .5
+    };
+
+    const state = new ContextState(context, testModel);
+    const transition = state.analyzeTransition(context, [trueInput]);
+
+
+    const results = determineTokenizedCorrectionSequence(
+      transition,
+      transition.final.displayTokenization, {
+        matchString: ' ',
+        inputSamplingCost: -Math.log(trueInput.p),
+        knownCost: 0,
+        totalCost: -Math.log(trueInput.p)
+      },
+      1
+    );
+
+    assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
+      casingForm: undefined,
+      left: 'the quick brown',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    });
+
+    assert.deepEqual(results.tokenizedCorrection, [
+      {
+        sample: {
+          insert: ' ',
+          deleteLeft: 0
+        },
+        p: trueInput.p
+      }
+    ]);
+  });
+
+  it(`properly analyzes common-case word-start - beginning a new token`, () => {
+    const context: Context = {
+      left: 'the quick brown ',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    };
+
+    const trueInput: ProbabilityMass<Transform> = {
+      sample: {
+        insert: 'f',
+        deleteLeft: 0
+      },
+      p: .5
+    };
+
+    const state = new ContextState(context, testModel);
+    const transition = state.analyzeTransition(context, [trueInput]);
+
+
+    const results = determineTokenizedCorrectionSequence(
+      transition,
+      transition.final.displayTokenization, {
+        matchString: 'f',
+        inputSamplingCost: -Math.log(trueInput.p),
+        knownCost: 0,
+        totalCost: -Math.log(trueInput.p)
+      },
+      1
+    );
+
+    assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
+      casingForm: undefined,
+      left: 'the quick brown ',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    });
+
+    assert.deepEqual(results.tokenizedCorrection, [
+      {
+        sample: {
+          insert: 'f',
+          deleteLeft: 0
+        },
+        p: trueInput.p
+      }
+    ]);
+  });
+
+  it(`properly analyzes post-merge case`, () => {
+    let context: Context = {
+      left: 'the quick brown fox ',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    };
+
+    const trueInput: ProbabilityMass<Transform> = {
+      sample: {
+        insert: 't',
+        deleteLeft: 0
+      },
+      p: .5
+    };
+
+    const constructingState = new ContextState(context, testModel);
+    const tokens = constructingState.displayTokenization.tokens;
+    tokens.push(ContextToken.fromRawText(testModel, 'can'));
+    tokens.push(ContextToken.fromRawText(testModel, '\''));
+
+    context = models.applyTransform({insert: 'can\'', deleteLeft: 0}, context);
+
+    const state = new ContextState(context, testModel, new ContextTokenization(tokens));
+    const transition = state.analyzeTransition(context, [trueInput]);
+
+    const results = determineTokenizedCorrectionSequence(
+      transition,
+      transition.final.displayTokenization, {
+        matchString: 'can\'t',
+        inputSamplingCost: -Math.log(trueInput.p),
+        knownCost: 0,
+        totalCost: -Math.log(trueInput.p)
+      },
+      1
+    );
+
+    assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
+      casingForm: undefined,
+      left: 'the quick brown fox ',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    });
+
+    assert.deepEqual(results.tokenizedCorrection, [
+      {
+        sample: {
+          insert: 'can\'t',
+          deleteLeft: 0
+        },
+        p: trueInput.p
+      }
+    ]);
+  });
+
+  // Will be handled far better after resolving multi-tokenization handling.
+  it.skip(`properly analyzes post-split case`, () => {
+    const context: Context = {
+      left: 'the quick brown fox can\'',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    };
+
+    const trueInput: ProbabilityMass<Transform> = {
+      sample: {
+        insert: ' ',
+        deleteLeft: 0
+      },
+      p: .5
+    };
+
+    const state = new ContextState(context, testModel);
+    assert.equal(state.displayTokenization.tail.exampleInput, 'can\'');
+    const transition = state.analyzeTransition(context, [trueInput]);
+
+    const results = determineTokenizedCorrectionSequence(
+      transition,
+      transition.final.displayTokenization, {
+        matchString: ' ',
+        inputSamplingCost: -Math.log(trueInput.p),
+        knownCost: 0,
+        totalCost: -Math.log(trueInput.p)
+      },
+      1
+    );
+
+    assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
+      casingForm: undefined,
+      left: 'the quick brown fox ',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    });
+
+    assert.deepEqual(results.tokenizedCorrection, [
+      {
+        sample: {
+          insert: ' ',
+          deleteLeft: 0
+        },
+        p: trueInput.p
+      }
+    ]);
+  });
+
+  it(`properly analyzes complex transition - multi-token replacement`, () => {
+    const context: Context = {
+      left: 'the quick brown f',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    };
+
+    const trueInput: ProbabilityMass<Transform> = {
+      sample: {
+        insert: 'fast red d',
+        deleteLeft: 'quick brown f'.length
+      },
+      p: .5
+    };
+
+    const state = new ContextState(context, testModel);
+    const transition = state.analyzeTransition(context, [trueInput]);
+
+    const results = determineTokenizedCorrectionSequence(
+      transition,
+      transition.final.displayTokenization, {
+        matchString: 'd',
+        inputSamplingCost: -Math.log(trueInput.p),
+        knownCost: 0,
+        totalCost: -Math.log(trueInput.p)
+      },
+      1
+    );
+
+    // Large-scale deletions will receive enhanced handling soon.  But, for now, the
+    // deletion is carried by the `preservationTransform`, not applied here.
+    assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
+      casingForm: undefined,
+      left: 'the quick brown ',
+      right: '',
+      startOfBuffer: true,
+      endOfBuffer: true
+    });
+
+    assert.deepEqual(results.tokenizedCorrection, [
+      {
+        sample: {
+          insert: 'd',
+          deleteLeft: 0
+        },
+        p: trueInput.p
+      }
+    ]);
+
+    const dummiedTuple: CorrectionPredictionTuple = {
+      prediction: {
+        sample: {
+          transform: { insert: 'dog', deleteLeft: 0 },
+          displayAs: 'dog'
+        },
+        p: .25
+      },
+      correction: {
+        sample: 'd',
+        p: trueInput.p
+      },
+      totalProb: .25 * trueInput.p
+    };
+
+    results.applyInPost(dummiedTuple);
+
+    assert.deepEqual(dummiedTuple.preservationTransform, {
+      insert: trueInput.sample.insert.substring(0, KMWString.length(trueInput.sample.insert) - 1), // remove the 'd'.
+      deleteLeft: trueInput.sample.deleteLeft - 1
+    });
+  });
+});
\ No newline at end of file

From 2c1f43a25f96e5bd7b45d98dbdfaceeb9478a7da Mon Sep 17 00:00:00 2001
From: Joshua Horton <joshua_horton@sil.org>
Date: Wed, 6 May 2026 13:21:08 -0500
Subject: [PATCH 3/4] change(web): simplify mapWhitespacedTokenization
 requirements

To better handle inputs that shift the word-boundary in some custom models and models released before Keyman 14.0, this PR provides generalized re-use of the whitespace-based token-transition algorithm used for our most prominently-supported models.

Build-bot: skip build:web
Test-bot: skip
---
 .../main/correction/context-tokenization.ts   | 337 ++++++++++--------
 1 file changed, 182 insertions(+), 155 deletions(-)

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
index fc2f81615c1..830ff90b966 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
@@ -10,7 +10,7 @@
 import { LexicalModelTypes } from '@keymanapp/common-types';
 import { KMWString } from '@keymanapp/web-utils';
 
-import { ContextToken } from './context-token.js';
+import { ContextToken, ContextTokenLike } from './context-token.js';
 import TransformUtils from '../transformUtils.js';
 import { computeDistance, EditOperation, EditTuple } from './classical-calculation.js';
 import { determineModelTokenizer } from '../model-helpers.js';
@@ -334,7 +334,7 @@ export class ContextTokenization {
   }
 
   /**
-   * Given the existing tokenization and an incoming input `Transform`, this
+   * Given this existing tokenization and an incoming input `Transform`, this
    * method precomputes how both the current, pre-application tokenization will
    * be altered and how the incoming Transform will be tokenized.
    *
@@ -351,158 +351,7 @@ export class ContextTokenization {
     transform: Transform,
     edgeOptions?: EdgeWindowOptions
   ): TokenizationTransitionEdits {
-    // Step 4:  now that our window's been properly updated, determine what the
-    // input's effects on the context is.
-    //
-    // Context does not slide within this function.
-    //
-    // Assumption:  this alignment cannot fail; we KNOW there's a solid
-    // before-and-after relationship here, and we can base it on the results of
-    // a prior syncToSourceWindow call.
-    //
-    // We don't wish to do the full tokenization here - we only want to check
-    // over the last few tokens that might reasonably shift.  We also want to
-    // batch effects.
-
-    // Do not mutate the original transform; it can cause unexpected assertion
-    // effects in unit tests.
-    const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0};
-    const edgeWindow = buildEdgeWindow(this.tokens, edgeTransform, false, edgeOptions);
-    const {
-      retokenizationText,
-      editBoundary,
-      sliceIndex: edgeSliceIndex
-    } = edgeWindow;
-    // Prevent mutation of the original return property.
-    const stackedDeletes = edgeWindow.deleteLengths.slice();
-
-    const tokenize = determineModelTokenizer(lexicalModel);
-    const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text);
-    if(postTokenization.length == 0) {
-      postTokenization.push('');
-    }
-    const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform);
-
-    // What does the edge's retokenization look like when we remove the inserted portions?
-    const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex);
-    const insertBoundaryToken = postTokenization[firstInsertPostIndex];
-
-    // Note:  requires that helpers have not mutated `stackedInserts`.
-    const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0]));
-
-    // Do not preserve empty tokens here, even if tokenization normally would produce one.
-    // It's redundant and replaceable for tokenization batching efforts.
-    if(uninsertedBoundaryToken != '') {
-      retokenizedEdge.push(uninsertedBoundaryToken);
-    }
-
-    // We've found the root token within the root context state to which deletes (and inserts)
-    // may be applied.
-    // We've also found the last post-application token to which transform changes contributed.
-    // How do these indices line up - we need to properly construct and index our transforms,
-    // but 'merge' and 'split' edits can mess up that indexing.
-
-    const currentTokens = this.tokens;
-    const preTokenization = currentTokens
-      .slice(edgeSliceIndex, editBoundary.tokenIndex+1)
-      .map(t => t.exampleInput);
-
-    // Determine the effects of splits & merges as applied to the original
-    // cached context state.
-    const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits(
-      preTokenization,
-      postTokenization.slice(0, firstInsertPostIndex+1)
-    );
-
-    /*
-     * Final steps:  We can now safely index the transforms.  Let's do it!
-     * 1. Determine the first index a Transform may align to
-     * 2. Build the transforms
-     *
-     * Notes:
-     * - text applied to the end of a 'merged' token at the tail:  should have
-     *   index 0, not -1.
-     *   - pretokenization index will mismatch by -1: -SUM(merge size - 1)
-     *   - Ex: can + ' + t => can't
-     *          -1   0          0
-     * - text applied to the end of a 'split' token at the tail:  should also
-     *   have index 0, not 1.
-     *   - posttokenization index will mismatch by +1: SUM(split size - 1)
-     *   - new token after 'split':  index 1
-     *   - Ex: can' + ? => can + ' + ?
-     *          0          -1    0   1
-     *
-     * The first transform applies at the end of the retokenized zone and its
-     * associated index.  The question:  were there deletes that occurred?
-     */
-
-    const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex;
-    let shiftDeletes = false;
-    // first popped entry == 0 - a delete no-op.
-    if(stackedDeletes[stackedDeletes.length - 1] == 0) {
-      // the boundary indices found by both methods above differ
-      if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) {
-        shiftDeletes = true;
-      }
-
-      // there are no inserts, so we don't affect the boundary token we landed on.
-      if(stackedDeletes.length > 1 && transform.insert == '') {
-        shiftDeletes = true;
-      }
-    }
-
-    if(shiftDeletes) {
-      // Do not add a zero-length delete if we're not actually altering the
-      // corresponding token at all.
-      stackedDeletes.pop();
-    }
-
-    // The first delete always applies to index 0. If the built edge window
-    // omits a context-final empty-string, adjust the tokenization indices
-    // accordingly.
-    const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0);
-    // Mutates stackedInserts, stackedDeletes.
-    const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length);
-    const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex);
-
-    // If there's an empty transform in the 0 position and we already know we're
-    // dropping tokens - and only deleting - we're dropping an
-    // otherwise-untracked empty token - make sure it's included!
-    const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0));
-    // Past that, if we have more delete entries than insert entries for our transforms, we
-    // dropped some tokens outright.
-    const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0);
-
-    // Final step:  check for any unexpected boundary shifts not mappable to 'merge' / 'split'
-    // and not caused by transforms.  All transforms always apply in sequence at the end.
-    const unmappedEdits: EditTuple<EditOperation>[] = [];
-    for(let i = 0; i < editPath.length - transformMap.size; i++) {
-      const op = editPath[i].op;
-      switch(op) {
-        case 'merge':
-        case 'split':
-          // already calculated
-          // can fall through to the `continue;` line.
-        case 'match':
-          continue;
-        default:
-          // Should only be substitutions here.
-          // We may wish to add extra analysis in the future when supporting
-          // prediction from multiple competing tokenizations.
-          unmappedEdits.push(editPath[i] as EditTuple<EditOperation>);
-      }
-    }
-
-    return {
-      alignment: {
-        edgeWindow: {...edgeWindow, retokenization: retokenizedEdge},
-        merges,
-        splits,
-        unmappedEdits,
-        removedTokenCount
-      },
-      tokenizedTransform: transformMap,
-    };
+    return mapWhitespacedTokenization(this.tokens, lexicalModel, transform, edgeOptions);
   }
 
   /**
@@ -763,6 +612,184 @@ interface RetokenizedEdgeWindow extends EdgeWindow {
   retokenization: string[];
 }
 
+/**
+ * Given an existing tokenization and an incoming input `Transform`, this
+ * function precomputes both how the current, pre-application tokenization
+ * will be altered and how the incoming Transform will be tokenized.
+ *
+ * This function is able to operate with a reduced interface, not requiring
+ * the full ContextToken/ContextState/etc subsystem and its related
+ * SearchQuotientNode requirements.
+ *
+ * Note that this method is designed for use with languages that employ
+ * classical space-based wordbreaking.  Do not use it for languages that need
+ * dictionary-based wordbreaking support!
+ * @param tokens  The existing tokenization; each edge token's `exampleInput` is read.
+ * @param lexicalModel  The model used to select the tokenizer for retokenization.
+ * @param transform  The incoming input; it is copied here rather than mutated.
+ * @param edgeOptions  Optional settings forwarded as-is to `buildEdgeWindow`.
+ * @returns `alignment` (edge window, merges, splits, unmapped edits, removed-token count) and `tokenizedTransform`.
+ */
+export function mapWhitespacedTokenization(
+  tokens: ContextTokenLike[],
+  lexicalModel: LexicalModel,
+  transform: Transform,
+  edgeOptions?: EdgeWindowOptions
+): TokenizationTransitionEdits {
+  // Step 4:  now that our window's been properly updated, determine what the
+  // input's effects on the context are.
+  //
+  // Context does not slide within this function.
+  //
+  // Assumption:  this alignment cannot fail; we KNOW there's a solid
+  // before-and-after relationship here, and we can base it on the results of
+  // a prior syncToSourceWindow call.
+  //
+  // We don't wish to do the full tokenization here - we only want to check
+  // over the last few tokens that might reasonably shift.  We also want to
+  // batch effects.
+
+  // Do not mutate the original transform; it can cause unexpected assertion
+  // effects in unit tests.
+  const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0};
+  const edgeWindow = buildEdgeWindow(tokens, edgeTransform, false, edgeOptions);
+  const {
+    retokenizationText,
+    editBoundary,
+    sliceIndex: edgeSliceIndex
+  } = edgeWindow;
+  // Prevent mutation of the original return property.
+  const stackedDeletes = edgeWindow.deleteLengths.slice();
+
+  const tokenize = determineModelTokenizer(lexicalModel);
+  const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text);
+  if(postTokenization.length == 0) {
+    postTokenization.push('');
+  }
+  const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform);
+
+  // What does the edge's retokenization look like when we remove the inserted portions?
+  const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex);
+  const insertBoundaryToken = postTokenization[firstInsertPostIndex];
+
+  // Note:  requires that helpers have not mutated `stackedInserts`.
+  const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0]));
+
+  // Do not preserve empty tokens here, even if tokenization normally would produce one.
+  // It's redundant and replaceable for tokenization batching efforts.
+  if(uninsertedBoundaryToken != '') {
+    retokenizedEdge.push(uninsertedBoundaryToken);
+  }
+
+  // We've found the root token within the root context state to which deletes (and inserts)
+  // may be applied.
+  // We've also found the last post-application token to which transform changes contributed.
+  // How do these indices line up - we need to properly construct and index our transforms,
+  // but 'merge' and 'split' edits can mess up that indexing.
+
+  const currentTokens = tokens;
+  const preTokenization = currentTokens
+    .slice(edgeSliceIndex, editBoundary.tokenIndex+1)
+    .map(t => t.exampleInput);
+
+  // Determine the effects of splits & merges as applied to the original
+  // cached context state.
+  const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits(
+    preTokenization,
+    postTokenization.slice(0, firstInsertPostIndex+1)
+  );
+
+  /*
+    * Final steps:  We can now safely index the transforms.  Let's do it!
+    * 1. Determine the first index a Transform may align to
+    * 2. Build the transforms
+    *
+    * Notes:
+    * - text applied to the end of a 'merged' token at the tail:  should have
+    *   index 0, not -1.
+    *   - pretokenization index will mismatch by -1: -SUM(merge size - 1)
+    *   - Ex: can + ' + t => can't
+    *          -1   0          0
+    * - text applied to the end of a 'split' token at the tail:  should also
+    *   have index 0, not 1.
+    *   - posttokenization index will mismatch by +1: SUM(split size - 1)
+    *   - new token after 'split':  index 1
+    *   - Ex: can' + ? => can + ' + ?
+    *          0          -1    0   1
+    *
+    * The first transform applies at the end of the retokenized zone and its
+    * associated index.  The question:  were there deletes that occurred?
+    */
+
+  const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex;
+  let shiftDeletes = false;
+  // first popped entry == 0 - a delete no-op.
+  if(stackedDeletes[stackedDeletes.length - 1] == 0) {
+    // the boundary indices found by both methods above differ
+    if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) {
+      shiftDeletes = true;
+    }
+
+    // there are no inserts, so we don't affect the boundary token we landed on.
+    if(stackedDeletes.length > 1 && transform.insert == '') {
+      shiftDeletes = true;
+    }
+  }
+
+  if(shiftDeletes) {
+    // Do not add a zero-length delete if we're not actually altering the
+    // corresponding token at all.
+    stackedDeletes.pop();
+  }
+
+  // The first delete always applies to index 0. If the built edge window
+  // omits a context-final empty-string, adjust the tokenization indices
+  // accordingly.
+  const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0);
+  // Take this count before assembleTransforms, which mutates stackedInserts, stackedDeletes.
+  const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length);
+  const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex);
+
+  // If there's an empty transform in the 0 position and we already know we're
+  // dropping tokens - and only deleting - we're dropping an
+  // otherwise-untracked empty token - make sure it's included!
+  const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0));
+  // Past that, if we have more delete entries than insert entries for our transforms, we
+  // dropped some tokens outright.
+  const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0);
+
+  // Final step:  check for any unexpected boundary shifts not mappable to 'merge' / 'split'
+  // and not caused by transforms.  All transforms always apply in sequence at the end.
+  const unmappedEdits: EditTuple<EditOperation>[] = [];
+  for(let i = 0; i < editPath.length - transformMap.size; i++) {
+    const op = editPath[i].op;
+    switch(op) {
+      case 'merge':
+      case 'split':
+        // already calculated
+        // can fall through to the `continue;` line.
+      case 'match':
+        continue;
+      default:
+        // Should only be substitutions here.
+        // We may wish to add extra analysis in the future when supporting
+        // prediction from multiple competing tokenizations.
+        unmappedEdits.push(editPath[i] as EditTuple<EditOperation>);
+    }
+  }
+
+  return {
+    alignment: {
+      edgeWindow: {...edgeWindow, retokenization: retokenizedEdge},
+      merges,
+      splits,
+      unmappedEdits,
+      removedTokenCount
+    },
+    tokenizedTransform: transformMap,
+  };
+}
+
 /**
  * Constructs a window on one side of the represented context that is aligned to
  * existing tokenization.
@@ -777,7 +804,7 @@ interface RetokenizedEdgeWindow extends EdgeWindow {
  * @returns
  */
 export function buildEdgeWindow(
-  currentTokens: ContextToken[],
+  currentTokens: ContextTokenLike[],
   // Requires deleteRight be explicitly set.
   transform: Transform & { deleteRight: number },
   applyAtFront: boolean,

From b6ba677a55cb6997b5d9f1e020cefab949577395 Mon Sep 17 00:00:00 2001
From: Joshua Horton <joshua_horton@sil.org>
Date: Thu, 28 May 2026 16:16:32 -0500
Subject: [PATCH 4/4] refactor(web): define common buildCorrectionSequence
 method used for all model types

Build-bot: skip build:web
Test-bot: skip
---
 .../worker-thread/src/main/predict-helpers.ts | 51 ++++++++++++-------
 ...ine-tokenized-correction-sequence.tests.ts |  2 +-
 2 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
index 35f2df19db8..f89143356b6 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
@@ -13,6 +13,7 @@ import { ContextTransition } from './correction/context-transition.js';
 import { ExecutionTimer } from './correction/execution-timer.js';
 import ModelCompositor from './model-compositor.js';
 import { EDIT_DISTANCE_COST_SCALE, getBestTokenMatches } from './correction/distance-modeler.js';
+import { TokenResult } from './correction/tokenization-corrector.js';
 
 const searchForProperty = defaultWordbreaker.searchForProperty;
 
@@ -27,7 +28,6 @@ import Reversion = LexicalModelTypes.Reversion;
 import Suggestion = LexicalModelTypes.Suggestion;
 import SuggestionTag = LexicalModelTypes.SuggestionTag;
 import Transform = LexicalModelTypes.Transform;
-import { TokenResult } from './correction/tokenization-corrector.js';
 
 /*
  * The functions in this file exist to provide unit-testable stateless components for the
@@ -471,6 +471,34 @@ export interface PredictionParameters {
   applyInPost: (entry: CorrectionPredictionTuple) => void
 }
 
+/**
+ * Builds the prediction root shared by all model types:  the base context with
+ * the suggestion range's left-deletions applied, plus the matched correction
+ * expressed as a single weighted, insert-only transform.
+ * @param transitionEffects  Suggestion-range data; only `deleteLeft` is read.
+ * @param context  The context that `deleteLeft` is applied to.
+ * @param match  Supplies the correction text and its total cost.
+ * @param costFactor  Scales the cost inside `exp(-cost * costFactor)`.
+ * @returns The adjusted root context and a one-entry correction sequence.
+ */
+export function buildCorrectionSequence(
+  transitionEffects: ReturnType<typeof determineSuggestionRange>,
+  context: Context,
+  match: Readonly<TokenResult>,
+  costFactor: number
+) {
+  // Delete-only transform:  applies the range's left-deletions, inserts nothing.
+  const trim = {insert: '', deleteLeft: transitionEffects.deleteLeft};
+  const rootContext = models.applyTransform(trim, context);
+
+  // The correction itself only inserts; the deletions are already in rootContext.
+  const sample: Transform = {insert: match.matchString, deleteLeft: 0};
+  // Map the match's total cost onto the probability for this root.
+  const p = Math.exp(-match.totalCost * costFactor);
+
+  return {rootContext, tokenizedCorrection: [{sample, p}]};
+}
+
 /**
  * This function takes in metadata about generated corrections (for models that
  * implement Traversals) and uses that to produce the corresponding parameters
@@ -492,31 +520,20 @@ export function determineTokenizedCorrectionSequence(
   costFactor: number
 ): PredictionParameters {
   const applicationTarget = transition.base.displayTokenization;
-  const { deleteLeft } = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId);
-
-  const rootContext = models.applyTransform({insert: '', deleteLeft}, transition.base.context);
+  const transitionParams = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId);
 
-  // Replace the existing context with the correction.
-  const correctionTransform: Transform = {
-    insert: match.matchString,  // insert correction string
-    deleteLeft: 0,
-  }
+  const suggestionParams = buildCorrectionSequence(transitionParams, transition.base.context, match, costFactor);
 
   // The correction should always be based on the most recent external
   // transform/transcription ID.
   if(transition.transitionId !== undefined) {
-    correctionTransform.id = transition.transitionId;
+    suggestionParams.tokenizedCorrection.map((t) => t.sample.id = transition.transitionId);
   }
 
-  const rootCost = match.totalCost;
-  const predictionRoot = {
-    sample: correctionTransform,
-    p: Math.exp(-rootCost * costFactor)
-  };
+  const { deleteLeft } = transitionParams;
 
   return {
-    rootContext,
-    tokenizedCorrection: [predictionRoot],
+    ...suggestionParams,
     applyInPost: (entry: CorrectionPredictionTuple) => {
       entry.preservationTransform = tokenization.taillessTrueKeystroke;
       // // Will need an extra lookup layer if the suggestion is generated from within a cluster.
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts
index 16df9814971..f79c5aeb18b 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts
@@ -286,7 +286,7 @@ describe('determineTokenizedCorrectionSequence', () => {
     ]);
   });
 
-  it(`properly analyzes conplex transition - multi-token replacement`, () => {
+  it(`properly analyzes complex transition - multi-token replacement`, () => {
     const context: Context = {
       left: 'the quick brown f',
       right: '',