keymanapp · jahorton · May 6, 2026 · May 8, 2026 · May 21, 2026 · May 22, 2026
diff --git a/web/src/engine/predictive-text/templates/src/tokenization.ts b/web/src/engine/predictive-text/templates/src/tokenization.ts
@@ -95,6 +95,10 @@ export function tokenize(
     currentIndex = nextIndex;
   }
 
+  if(tokenization.left.length == 0) {
+    tokenization.left.push({text: '', isWhitespace: false});
+  }
+
   // New step 2: handle any rejoins needed.
 
   // Handle any desired special handling for directly-pre-caret scenarios - where for this

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts
@@ -71,7 +71,8 @@ export function determineModelTokenizer(model: LexicalModel) {
     if(model.wordbreaker) {
       return models.tokenize(model.wordbreaker, context);
     } else {
-      return null;
+      // Not ideal for pre-14.0 models, but it'll do for now.
+      return models.tokenize(wordBreakers.default, context);
     }
   }
 }

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
@@ -6,7 +6,7 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre
 import TransformUtils from './transformUtils.js';
 import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js';
 import { ContextTokenLike } from './correction/context-token.js';
-import { ContextTokenization } from './correction/context-tokenization.js';
+import { ContextTokenization, mapWhitespacedTokenization } from './correction/context-tokenization.js';
 import { ContextTracker } from './correction/context-tracker.js';
 import { ContextState, determineContextSlideTransform } from './correction/context-state.js';
 import { ContextTransition } from './correction/context-transition.js';
@@ -191,88 +191,64 @@ export function tupleDisplayOrderSort(a: CorrectionPredictionTuple, b: Correctio
   return b.totalProb - a.totalProb;
 }
 
-export async function correctAndEnumerateWithoutTraversals(
+export function determineTraversallessCorrectionSequences(
   lexicalModel: LexicalModel,
-  transformDistribution: Distribution<Transform>,
+  corrections: Distribution<Transform>,
   context: Context
-): Promise<{
-  /**
-   * For models that support correction-search caching, this provides the
-   * cached object corresponding to this method's operation.
-   *
-   * Otherwise, is `null`.
-   */
-  postContextState?: ContextState;
+): PredictionParameters[] {
+  let returnedPredictionData: PredictionParameters[] = [];
 
-  /**
-   * The suggestions generated based on the user's input state.
-   */
-  rawPredictions: CorrectionPredictionTuple[];
+  const tokenizer = determineModelTokenizer(lexicalModel);
+  const wordbreak = determineModelWordbreaker(lexicalModel);
 
-  /**
-   * The id of a prior ContextTransition event that triggered a Suggestion found
-   * at the end of the Context.  Will be undefined if no edits have occurred
-   * since the Suggestion was applied.
-   */
-  revertableTransitionId?: number
-}> {
-  const inputTransform = transformDistribution[0].sample;
-  let rawPredictions: CorrectionPredictionTuple[] = [];
+  const tokenization = tokenizer(context); // issue at present if no tokens exist!
+  const tokenMapper = (t: models.Token) => {
+    return {
+      exampleInput: t.text,
+      codepointLength: KMWString.length(t.text)
+    } as ContextTokenLike;
+  }
 
-  let predictionRoots: ProbabilityMass<Transform>[];
+  for(let correction of corrections) {
+    // Step 1:  determine tokenization effects.  We can't use the
+    // ContextTokenization pattern due to the model's lack of LexiconTraversal
+    // support, though.
+    const transformId = correction.sample.id;
+    const postContext = models.applyTransform(correction.sample, context);
+    const postTokenization = tokenizer(postContext);
+
+    const transitionEffects = determineSuggestionRange(tokenization.left.map(tokenMapper), postTokenization.left.map(tokenMapper), (a, b) => a.exampleInput == b.exampleInput);
+    const match: TokenResult = {
+      matchString: wordbreak(postContext),
+      inputSamplingCost: -Math.log(correction.p),
+      knownCost: 0,
+      totalCost: -Math.log(correction.p)
+    };
 
-  // Only allow new-word suggestions if space was the most likely keypress.
-  const allowSpace = TransformUtils.isWhitespace(inputTransform);
-  const allowBksp = TransformUtils.isBackspace(inputTransform);
+    const suggestionParams = buildCorrectionSequence(transitionEffects, context, match, 1);
 
-  // Generates raw prediction distributions for each valid input.  Can only 'correct'
-  // against the final input.
-  //
-  // This is the old, 12.0-13.0 'correction' style.
-  if(allowSpace) {
-    // Detect start of new word; prevent whitespace loss here.
-    predictionRoots = [{sample: inputTransform, p: 1.0}];
-  } else {
-    predictionRoots = transformDistribution.map((alt) => {
-      let transform = alt.sample;
-
-      // Filter out special keys unless they're expected.
-      if(TransformUtils.isWhitespace(transform) && !allowSpace) {
-        return null;
-      } else if(TransformUtils.isBackspace(transform) && !allowBksp) {
-        return null;
-      }
+    const tokenizationMapping = mapWhitespacedTokenization(tokenization.left.map((t) => { return {exampleInput: t.text, codepointLength: KMWString.length(t.text)} }), lexicalModel, correction.sample);
+    const tokenizedCorrection = tokenizationMapping.tokenizedTransform;
+    const tokenizedCorrectionEntries = [...tokenizedCorrection.values()];
 
-      return alt;
-    });
-  }
+    // IF:  array has multiple entries, then build the preservation-transform as below, including the deleteLeft.
+    // If not, don't make one!
+    const preservationTransform = tokenizedCorrectionEntries.slice(0, -1).reduce((accum, curr) => {
+      return { insert: accum.insert + curr.insert, deleteLeft: accum.deleteLeft + curr.deleteLeft };
+    }, { insert: '', deleteLeft: 0, id: correction.sample.id});
 
-  const wordbreak = determineModelWordbreaker(lexicalModel);
-  // Remove `null` entries, then determine suggestions.
-  predictionRoots.forEach((pr) => {
-    const postContext = models.applyTransform(pr.sample, context);
-    const tailTokenText = wordbreak(postContext);
-    const rootContext = models.applyTransform({insert: '', deleteLeft: KMWString.length(tailTokenText)}, postContext);
-
-    const results = predictFromCorrectionSequence(lexicalModel, [{
-      sample: {
-        insert: tailTokenText,
-        deleteLeft: 0,
-        id: pr.sample.id
-      },
-      p: pr.p
-    }], rootContext, pr.sample.id);
-    results.forEach((r) => rawPredictions.push(r));
-  })
-
-  if(allowSpace) {
-    rawPredictions.forEach((entry) => entry.preservationTransform = inputTransform);
+    returnedPredictionData.push({
+      ...suggestionParams,
+      applyInPost: (p) => {
+        p.preservationTransform = preservationTransform;
+        if(transformId) {
+          p.prediction.sample.transformId = transformId;
+        }
+      }
+    })
   }
 
-  return {
-    postContextState: null,
-    rawPredictions: rawPredictions
-  };
+  return returnedPredictionData;
 }
 
 /**
@@ -586,7 +562,14 @@ export async function correctAndEnumerate(
   // It's mostly here to support models compiled before Keyman 14.0, which was
   // when the `LexiconTraversal` pattern was established.
   if(!contextTracker) {
-    return correctAndEnumerateWithoutTraversals(lexicalModel, transformDistribution, context);
+    const predictionData = determineTraversallessCorrectionSequences(lexicalModel, transformDistribution, context);
+    return {
+      rawPredictions: predictionData.flatMap((entry) => {
+        const predictions = predictFromCorrectionSequence(lexicalModel, entry.tokenizedCorrection, entry.rootContext, transformDistribution[0]?.sample.id);
+        predictions.forEach((p) => entry.applyInPost(p));
+        return predictions;
+      })
+    };
   }
 
   // 'else':  the current, 14.0+ pattern, which is able to leverage

diff --git a/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts
@@ -175,7 +175,7 @@ describe('Tokenization functions', function() {
     });
 
     it('properly handles empty-context cases', function() {
-      // Wordbreaking on a empty space => no word.
+      // Wordbreaking on a empty space => no word, but empty initial token.
       let context = {
         left: '', startOfBuffer: true,
         right: '', endOfBuffer: true
@@ -184,7 +184,7 @@ describe('Tokenization functions', function() {
       let tokenization = models.tokenize(wordBreakers.default, context);
 
       let expectedResult: models.Tokenization = {
-        left: [],
+        left: [{text: '', isWhitespace: false}],
         right: [],
         caretSplitsToken: false
       };
@@ -193,11 +193,11 @@ describe('Tokenization functions', function() {
     });
 
     it('properly handles null context cases', function() {
-      // Wordbreaking on a empty space => no word.
+      // Wordbreaking on a empty space => no word, but empty initial token.
       let tokenization = models.tokenize(wordBreakers.default, null);
 
       let expectedResult: models.Tokenization = {
-        left: [],
+        left: [{text: '', isWhitespace: false}],
         right: [],
         caretSplitsToken: false
       };

diff --git a/...ve-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts b/...ve-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts
@@ -236,7 +236,7 @@ describe('determineTokenizedCorrectionSequence', () => {
   });
 
   // Will be handled far better after resolving multi-tokenization handling.
-  it.skip(`properly analyzes post-split case`, () => {
+  it.skip(`properly analyzes post-split new-wordbreak case`, () => {
     const context: Context = {
       left: 'the quick brown fox can\'',
       right: '',