From 4538c2856c52ed04eac7a25d3fd0dbb091ecf976 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Tue, 2 Jun 2026 09:17:34 -0500 Subject: [PATCH 1/4] fix(web): transitionId undefined check, extra unit test --- .../worker-thread/src/main/predict-helpers.ts | 2 +- .../predict-from-correction-sequence.tests.ts | 47 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index f0b067f0c48..3437d7b4c66 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -750,7 +750,7 @@ export function predictFromCorrectionSequence( } }, {sample: '', p: 1}) - if(transitionId) { + if(transitionId !== undefined) { fullPrediction.sample.transform.id = transitionId; fullPrediction.sample.transformId = transitionId; } diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts index dc76a4c1db9..a5582f1b42b 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts @@ -191,6 +191,53 @@ describe('predictFromCorrectionSequence', () => { assert.approximately(predictions[1].totalProb, 0.02 * 0.6, 0.00001); predictions.forEach((prediction) => assert.equal(prediction.prediction.sample.transformId, transitionID)); }); + + it('constructs suggestions without input (as if after a context reset)', () => { + const context: Context = { + left: 'appl', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + 
+ const correctionDistribution: Distribution = [{ + sample: { + insert: 'appl', + deleteLeft: 4 + }, + p: 1 + } + ]; + + const dummied_suggestions: Outcome[] = [ + { + transform: { + insert: "apple", + deleteLeft: 4 + }, + displayAs: "apple", + p: 0.5 + } + ]; + + const model = new DummyModel({ + ...DUMMY_MODEL_CONFIG, + futureSuggestions: [ dummied_suggestions ] + }); + + const transitionID = 12345; + const predictions = predictFromCorrectionSequence(model, correctionDistribution, context, transitionID); + predictions.forEach((entry) => assert.equal(entry.correction.sample, 'appl')); + predictions.forEach((entry) => assert.equal(entry.correction.p, 1)); + predictions.sort(tupleDisplayOrderSort); + + assert.sameDeepOrderedMembers(predictions.map((entry) => entry.prediction.sample), dummied_suggestions.map((s) => { + delete s.p; + s.transformId = transitionID; + s.transform.id = transitionID; + return s; + })); + }); }); describe('on a sequence of corrections', () => { From b576c939b002b9d34c33c087a7ca659971c1087c Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Tue, 19 May 2026 13:09:12 -0500 Subject: [PATCH 2/4] refactor(web): expose suggestion-root parameters for use in unit tests Build-bot: skip build:web Test-bot: skip --- .../worker-thread/src/main/predict-helpers.ts | 85 +++-- ...ine-tokenized-correction-sequence.tests.ts | 361 ++++++++++++++++++ 2 files changed, 423 insertions(+), 23 deletions(-) create mode 100644 web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 3437d7b4c66..35f2df19db8 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -13,7 +13,6 @@ import { ContextTransition } from 
'./correction/context-transition.js'; import { ExecutionTimer } from './correction/execution-timer.js'; import ModelCompositor from './model-compositor.js'; import { EDIT_DISTANCE_COST_SCALE, getBestTokenMatches } from './correction/distance-modeler.js'; -import { TokenResultMapping } from './correction/token-result-mapping.js'; const searchForProperty = defaultWordbreaker.searchForProperty; @@ -28,6 +27,7 @@ import Reversion = LexicalModelTypes.Reversion; import Suggestion = LexicalModelTypes.Suggestion; import SuggestionTag = LexicalModelTypes.SuggestionTag; import Transform = LexicalModelTypes.Transform; +import { TokenResult } from './correction/tokenization-corrector.js'; /* * The functions in this file exist to provide unit-testable stateless components for the @@ -442,24 +442,55 @@ export function determineSuggestionRange( } } +/** + * Specifies the core, preprocessed data necessary for generating predictions, + * regardless of model type. + */ +export interface PredictionParameters { + /** + * The portion of context that should remain unchanged by generated suggestions + */ + rootContext: Context, + + /** + * A tokenization of the corrected part of the context, usable to generate + * suggestions. + * + * Note that each correction will be applied iteratively to the rootContext. + * That is, when suggesting based on the correction at index 1, the + * "unchanged" (root) context used for that suggestion will include the + * changes from the entry at index 0 (or possibly, a suggestion derived from it). + */ + tokenizedCorrection: ProbabilityMass[], + + /** + * A closure to be applied to the generated suggestion's metadata. + * @param entry + * @returns + */ + applyInPost: (entry: CorrectionPredictionTuple) => void +} + /** * This function takes in metadata about generated corrections (for models that - * implement Traversals) and uses that to construct predictions based upon those - * corrections. 
- * @param transition Context-transition data underlying the tokenization that led to the correction - * @param tokenization The tokenization from which the correction was generated. - * @param match The generated correction itself - the correction string and its cost - * @param costFactor A multiplicative factor used to adjust the cost when building prediction probabilities. + * implement Traversals) and uses that to produce the corresponding parameters + * to use for generating suggestions. + * @param transition Context-transition data underlying the tokenization that + * led to the correction + * @param tokenization The tokenization from which the correction was + * generated. + * @param match The generated correction itself - the correction string + * and its cost + * @param costFactor A multiplicative factor used to adjust the cost when + * building prediction probabilities. * @returns */ -export function buildAndMapPredictions( +export function determineTokenizedCorrectionSequence( transition: ContextTransition, tokenization: ContextTokenization, - match: Readonly, + match: Readonly, costFactor: number -): CorrectionPredictionTuple[] { - const model = transition.final.model; - +): PredictionParameters { const applicationTarget = transition.base.displayTokenization; const { deleteLeft } = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId); @@ -469,7 +500,12 @@ export function buildAndMapPredictions( const correctionTransform: Transform = { insert: match.matchString, // insert correction string deleteLeft: 0, - id: transition.transitionId // The correction should always be based on the most recent external transform/transcription ID. + } + + // The correction should always be based on the most recent external + // transform/transcription ID. 
+ if(transition.transitionId !== undefined) { + correctionTransform.id = transition.transitionId; } const rootCost = match.totalCost; @@ -478,15 +514,16 @@ export function buildAndMapPredictions( p: Math.exp(-rootCost * costFactor) }; - const predictions = predictFromCorrectionSequence(model, [predictionRoot], rootContext, transition.transitionId); - predictions.forEach((entry) => { - entry.preservationTransform = tokenization.taillessTrueKeystroke; - // // Will need an extra lookup layer if the suggestion is generated from within a cluster. - // entry.baseTokenization = transition.final.tokenizationSourceMap.get(tokenization); - entry.prediction.sample.transform.deleteLeft = deleteLeft; - }); - - return predictions; + return { + rootContext, + tokenizedCorrection: [predictionRoot], + applyInPost: (entry: CorrectionPredictionTuple) => { + entry.preservationTransform = tokenization.taillessTrueKeystroke; + // // Will need an extra lookup layer if the suggestion is generated from within a cluster. + // entry.baseTokenization = transition.final.tokenizationSourceMap.get(tokenization); + entry.prediction.sample.transform.deleteLeft = deleteLeft; + } + }; } /** @@ -600,7 +637,9 @@ export async function correctAndEnumerate( */ const costFactor = (tokenization.tail.inputCount <= 1) ? ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT : 1; - const predictions = buildAndMapPredictions(transition, tokenization, match, costFactor); + const predictionPrep = determineTokenizedCorrectionSequence(transition, tokenization, match, costFactor); + const predictions = predictFromCorrectionSequence(lexicalModel, predictionPrep.tokenizedCorrection, predictionPrep.rootContext, transition.transitionId); + predictions.forEach((p) => predictionPrep.applyInPost(p)); // Only set 'best correction' cost when a correction ACTUALLY YIELDS predictions. 
if(predictions.length > 0 && bestCorrectionCost === undefined) { diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts new file mode 100644 index 00000000000..16df9814971 --- /dev/null +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts @@ -0,0 +1,361 @@ +/* + * Keyman is copyright (C) SIL Global. MIT License. + * + * Created by jahorton on 2026-05-19 + * + * This file tests the prediction helper-method responsible for preparing + * corrections for multi-token prediction for our standard models, all of which + * utilize LexiconTraversals and the context-tokenization-caching subsystem. + */ + +import { assert } from 'chai'; + +import { LexicalModelTypes } from "@keymanapp/common-types"; +import * as wordBreakers from '@keymanapp/models-wordbreakers'; +import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; +import { KMWString } from '@keymanapp/web-utils'; + +import { determineTokenizedCorrectionSequence, models, ContextState, ContextToken, ContextTokenization, CorrectionPredictionTuple } from "@keymanapp/lm-worker/test-index"; + +import Context = LexicalModelTypes.Context; +import ProbabilityMass = LexicalModelTypes.ProbabilityMass; +import Transform = LexicalModelTypes.Transform; +import TrieModel = models.TrieModel; + +const testModel = new TrieModel( + jsonFixture('models/tries/english-1000'), { + wordBreaker: wordBreakers.default, + } +); + +describe('determineTokenizedCorrectionSequence', () => { + it(`properly analyzes common-case token-extension - adding a letter to an existing word`, () => { + const context: Context = { + left: 'the quick brown f', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const 
trueInput: ProbabilityMass = { + sample: { + insert: 'o', + deleteLeft: 0 + }, + p: .5 + }; + + const state = new ContextState(context, testModel); + const transition = state.analyzeTransition(context, [trueInput]); + + + const results = determineTokenizedCorrectionSequence( + transition, + transition.final.displayTokenization, { + matchString: 'fo', + inputSamplingCost: -Math.log(trueInput.p), + knownCost: 0, + totalCost: -Math.log(trueInput.p) + }, + 1 + ); + + assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { + casingForm: undefined, + left: 'the quick brown ', + right: '', + startOfBuffer: true, + endOfBuffer: true + }); + + assert.deepEqual(results.tokenizedCorrection, [ + { + sample: { + insert: 'fo', + deleteLeft: 0 + }, + p: trueInput.p + } + ]); + }); + + it(`properly analyzes common-case whitespace - ending a token and adding a new one`, () => { + const context: Context = { + left: 'the quick brown', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: ' ', + deleteLeft: 0 + }, + p: .5 + }; + + const state = new ContextState(context, testModel); + const transition = state.analyzeTransition(context, [trueInput]); + + + const results = determineTokenizedCorrectionSequence( + transition, + transition.final.displayTokenization, { + matchString: ' ', + inputSamplingCost: -Math.log(trueInput.p), + knownCost: 0, + totalCost: -Math.log(trueInput.p) + }, + 1 + ); + + assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { + casingForm: undefined, + left: 'the quick brown', + right: '', + startOfBuffer: true, + endOfBuffer: true + }); + + assert.deepEqual(results.tokenizedCorrection, [ + { + sample: { + insert: ' ', + deleteLeft: 0 + }, + p: trueInput.p + } + ]); + }); + + it(`properly analyzes common-case word-start - beginning a new token`, () => { + const context: Context = { + left: 'the quick brown ', + right: '', + 
startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 'f', + deleteLeft: 0 + }, + p: .5 + }; + + const state = new ContextState(context, testModel); + const transition = state.analyzeTransition(context, [trueInput]); + + + const results = determineTokenizedCorrectionSequence( + transition, + transition.final.displayTokenization, { + matchString: 'f', + inputSamplingCost: -Math.log(trueInput.p), + knownCost: 0, + totalCost: -Math.log(trueInput.p) + }, + 1 + ); + + assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { + casingForm: undefined, + left: 'the quick brown ', + right: '', + startOfBuffer: true, + endOfBuffer: true + }); + + assert.deepEqual(results.tokenizedCorrection, [ + { + sample: { + insert: 'f', + deleteLeft: 0 + }, + p: trueInput.p + } + ]); + }); + + it(`properly analyzes post-merge case`, () => { + let context: Context = { + left: 'the quick brown fox ', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 't', + deleteLeft: 0 + }, + p: .5 + }; + + const constructingState = new ContextState(context, testModel); + const tokens = constructingState.displayTokenization.tokens; + tokens.push(ContextToken.fromRawText(testModel, 'can')); + tokens.push(ContextToken.fromRawText(testModel, '\'')); + + context = models.applyTransform({insert: 'can\'', deleteLeft: 0}, context); + + const state = new ContextState(context, testModel, new ContextTokenization(tokens)); + const transition = state.analyzeTransition(context, [trueInput]); + + const results = determineTokenizedCorrectionSequence( + transition, + transition.final.displayTokenization, { + matchString: 'can\'t', + inputSamplingCost: -Math.log(trueInput.p), + knownCost: 0, + totalCost: -Math.log(trueInput.p) + }, + 1 + ); + + assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { + casingForm: undefined, + left: 
'the quick brown fox ', + right: '', + startOfBuffer: true, + endOfBuffer: true + }); + + assert.deepEqual(results.tokenizedCorrection, [ + { + sample: { + insert: 'can\'t', + deleteLeft: 0 + }, + p: trueInput.p + } + ]); + }); + + // Will be handled far better after resolving multi-tokenization handling. + it.skip(`properly analyzes post-split case`, () => { + const context: Context = { + left: 'the quick brown fox can\'', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: ' ', + deleteLeft: 0 + }, + p: .5 + }; + + const state = new ContextState(context, testModel); + assert.equal(state.displayTokenization.tail.exampleInput, 'can\''); + const transition = state.analyzeTransition(context, [trueInput]); + + const results = determineTokenizedCorrectionSequence( + transition, + transition.final.displayTokenization, { + matchString: ' ', + inputSamplingCost: -Math.log(trueInput.p), + knownCost: 0, + totalCost: -Math.log(trueInput.p) + }, + 1 + ); + + assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { + casingForm: undefined, + left: 'the quick brown fox ', + right: '', + startOfBuffer: true, + endOfBuffer: true + }); + + assert.deepEqual(results.tokenizedCorrection, [ + { + sample: { + insert: ' ', + deleteLeft: 0 + }, + p: trueInput.p + } + ]); + }); + + it(`properly analyzes conplex transition - multi-token replacement`, () => { + const context: Context = { + left: 'the quick brown f', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 'fast red d', + deleteLeft: 'quick brown f'.length + }, + p: .5 + }; + + const state = new ContextState(context, testModel); + const transition = state.analyzeTransition(context, [trueInput]); + + const results = determineTokenizedCorrectionSequence( + transition, + transition.final.displayTokenization, { + matchString: 'd', + inputSamplingCost: 
-Math.log(trueInput.p), + knownCost: 0, + totalCost: -Math.log(trueInput.p) + }, + 1 + ); + + // Large-scale deletions will receive enhanced handling soon. But, for now, it's + // deleted by the `preservationTransform`, not here. + assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { + casingForm: undefined, + left: 'the quick brown ', + right: '', + startOfBuffer: true, + endOfBuffer: true + }); + + assert.deepEqual(results.tokenizedCorrection, [ + { + sample: { + insert: 'd', + deleteLeft: 0 + }, + p: trueInput.p + } + ]); + + const dummiedTuple: CorrectionPredictionTuple = { + prediction: { + sample: { + transform: { insert: 'dog', deleteLeft: 0 }, + displayAs: 'dog' + }, + p: .25 + }, + correction: { + sample: 'd', + p: trueInput.p + }, + totalProb: .25 * trueInput.p + }; + + results.applyInPost(dummiedTuple); + + assert.deepEqual(dummiedTuple.preservationTransform, { + insert: trueInput.sample.insert.substring(0, KMWString.length(trueInput.sample.insert) - 1), // remove the 'd'. + deleteLeft: trueInput.sample.deleteLeft - 1 + }); + }); +}); \ No newline at end of file From 2c1f43a25f96e5bd7b45d98dbdfaceeb9478a7da Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 6 May 2026 13:21:08 -0500 Subject: [PATCH 3/4] change(web): simplify mapWhitespacedTokenization requirements To better handle inputs that shift the word-boundary in some custom models and models released before Keyman 14.0, this PR provides generalized re-use of the whitespace-based token-transition algorithm used for our most prominently-supported models. 
Build-bot: skip build:web Test-bot: skip --- .../main/correction/context-tokenization.ts | 337 ++++++++++-------- 1 file changed, 182 insertions(+), 155 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index fc2f81615c1..830ff90b966 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -10,7 +10,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types'; import { KMWString } from '@keymanapp/web-utils'; -import { ContextToken } from './context-token.js'; +import { ContextToken, ContextTokenLike } from './context-token.js'; import TransformUtils from '../transformUtils.js'; import { computeDistance, EditOperation, EditTuple } from './classical-calculation.js'; import { determineModelTokenizer } from '../model-helpers.js'; @@ -334,7 +334,7 @@ export class ContextTokenization { } /** - * Given the existing tokenization and an incoming input `Transform`, this + * Given this existing tokenization and an incoming input `Transform`, this * method precomputes how both the current, pre-application tokenization will * be altered and how the incoming Transform will be tokenized. * @@ -351,158 +351,7 @@ export class ContextTokenization { transform: Transform, edgeOptions?: EdgeWindowOptions ): TokenizationTransitionEdits { - // Step 4: now that our window's been properly updated, determine what the - // input's effects on the context is. - // - // Context does not slide within this function. - // - // Assumption: this alignment cannot fail; we KNOW there's a solid - // before-and-after relationship here, and we can base it on the results of - // a prior syncToSourceWindow call. 
- // - // We don't wish to do the full tokenization here - we only want to check - // over the last few tokens that might reasonably shift. We also want to - // batch effects. - - // Do not mutate the original transform; it can cause unexpected assertion - // effects in unit tests. - const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0}; - const edgeWindow = buildEdgeWindow(this.tokens, edgeTransform, false, edgeOptions); - const { - retokenizationText, - editBoundary, - sliceIndex: edgeSliceIndex - } = edgeWindow; - // Prevent mutation of the original return property. - const stackedDeletes = edgeWindow.deleteLengths.slice(); - - const tokenize = determineModelTokenizer(lexicalModel); - const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text); - if(postTokenization.length == 0) { - postTokenization.push(''); - } - const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform); - - // What does the edge's retokenization look like when we remove the inserted portions? - const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex); - const insertBoundaryToken = postTokenization[firstInsertPostIndex]; - - // Note: requires that helpers have not mutated `stackedInserts`. - const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0])); - - // Do not preserve empty tokens here, even if tokenization normally would produce one. - // It's redundant and replaceable for tokenization batching efforts. - if(uninsertedBoundaryToken != '') { - retokenizedEdge.push(uninsertedBoundaryToken); - } - - // We've found the root token within the root context state to which deletes (and inserts) - // may be applied. - // We've also found the last post-application token to which transform changes contributed. 
- // How do these indices line up - we need to properly construct and index our transforms, - // but 'merge' and 'split' edits can mess up that indexing. - - const currentTokens = this.tokens; - const preTokenization = currentTokens - .slice(edgeSliceIndex, editBoundary.tokenIndex+1) - .map(t => t.exampleInput); - - // Determine the effects of splits & merges as applied to the original - // cached context state. - const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits( - preTokenization, - postTokenization.slice(0, firstInsertPostIndex+1) - ); - - /* - * Final steps: We can now safely index the transforms. Let's do it! - * 1. Determine the first index a Transform may align to - * 2. Build the transforms - * - * Notes: - * - text applied to the end of a 'merged' token at the tail: should have - * index 0, not -1. - * - pretokenization index will mismatch by -1: -SUM(merge size - 1) - * - Ex: can + ' + t => can't - * -1 0 0 - * - text applied to the end of a 'split' token at the tail: should also - * have index 0, not 1. - * - posttokenization index will mismatch by +1: SUM(split size - 1) - * - new token after 'split': index 1 - * - Ex: can' + ? => can + ' + ? - * 0 -1 0 1 - * - * The first transform applies at the end of the retokenized zone and its - * associated index. The question: were there deletes that occurred? - */ - - const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex; - let shiftDeletes = false; - // first popped entry == 0 - a delete no-op. - if(stackedDeletes[stackedDeletes.length - 1] == 0) { - // the boundary indices found by both methods above differ - if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) { - shiftDeletes = true; - } - - // there are no inserts, so we don't affect the boundary token we landed on. 
- if(stackedDeletes.length > 1 && transform.insert == '') { - shiftDeletes = true; - } - } - - if(shiftDeletes) { - // Do not add a zero-length delete if we're not actually altering the - // corresponding token at all. - stackedDeletes.pop(); - } - - // The first delete always applies to index 0. If the built edge window - // omits a context-final empty-string, adjust the tokenization indices - // accordingly. - const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0); - // Mutates stackedInserts, stackedDeletes. - const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length); - const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex); - - // If there's an empty transform in the 0 position and we already know we're - // dropping tokens - and only deleting - we're dropping an - // otherwise-untracked empty token - make sure it's included! - const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0)); - // Past that, if we have more delete entries than insert entries for our transforms, we - // dropped some tokens outright. - const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0); - - // Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split' - // and not caused by transforms. All transforms always apply in sequence at the end. - const unmappedEdits: EditTuple[] = []; - for(let i = 0; i < editPath.length - transformMap.size; i++) { - const op = editPath[i].op; - switch(op) { - case 'merge': - case 'split': - // already calculated - // can fall through to the `continue;` line. - case 'match': - continue; - default: - // Should only be substitutions here. - // We may wish to add extra analysis in the future when supporting - // prediction from multiple competing tokenizations. 
- unmappedEdits.push(editPath[i] as EditTuple); - } - } - - return { - alignment: { - edgeWindow: {...edgeWindow, retokenization: retokenizedEdge}, - merges, - splits, - unmappedEdits, - removedTokenCount - }, - tokenizedTransform: transformMap, - }; + return mapWhitespacedTokenization(this.tokens, lexicalModel, transform, edgeOptions); } /** @@ -763,6 +612,184 @@ interface RetokenizedEdgeWindow extends EdgeWindow { retokenization: string[]; } +/** + * Given an existing tokenization and an incoming input `Transform`, this + * method precomputes how both the current, pre-application tokenization will + * be altered and how the incoming Transform will be tokenized. + * + * This function is able to operate with a reduced interface, not requiring + * the full ContextToken/ContextState/etc subsystem and its related + * SearchQuotientNode requirements. + * + * Note that this method is designed for use with languages that employ + * classical space-based wordbreaking. Do not use it for languages that need + * dictionary-based wordbreaking support! + * @param tokens + * @param lexicalModel + * @param transform + * @param edgeOptions + * @returns + */ +export function mapWhitespacedTokenization( + tokens: ContextTokenLike[], + lexicalModel: LexicalModel, + transform: Transform, + edgeOptions?: EdgeWindowOptions +): TokenizationTransitionEdits { + // Step 4: now that our window's been properly updated, determine what the + // input's effects on the context is. + // + // Context does not slide within this function. + // + // Assumption: this alignment cannot fail; we KNOW there's a solid + // before-and-after relationship here, and we can base it on the results of + // a prior syncToSourceWindow call. + // + // We don't wish to do the full tokenization here - we only want to check + // over the last few tokens that might reasonably shift. We also want to + // batch effects. 
+ + // Do not mutate the original transform; it can cause unexpected assertion + // effects in unit tests. + const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0}; + const edgeWindow = buildEdgeWindow(tokens, edgeTransform, false, edgeOptions); + const { + retokenizationText, + editBoundary, + sliceIndex: edgeSliceIndex + } = edgeWindow; + // Prevent mutation of the original return property. + const stackedDeletes = edgeWindow.deleteLengths.slice(); + + const tokenize = determineModelTokenizer(lexicalModel); + const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text); + if(postTokenization.length == 0) { + postTokenization.push(''); + } + const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform); + + // What does the edge's retokenization look like when we remove the inserted portions? + const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex); + const insertBoundaryToken = postTokenization[firstInsertPostIndex]; + + // Note: requires that helpers have not mutated `stackedInserts`. + const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0])); + + // Do not preserve empty tokens here, even if tokenization normally would produce one. + // It's redundant and replaceable for tokenization batching efforts. + if(uninsertedBoundaryToken != '') { + retokenizedEdge.push(uninsertedBoundaryToken); + } + + // We've found the root token within the root context state to which deletes (and inserts) + // may be applied. + // We've also found the last post-application token to which transform changes contributed. + // How do these indices line up - we need to properly construct and index our transforms, + // but 'merge' and 'split' edits can mess up that indexing. 
+ + const currentTokens = tokens; + const preTokenization = currentTokens + .slice(edgeSliceIndex, editBoundary.tokenIndex+1) + .map(t => t.exampleInput); + + // Determine the effects of splits & merges as applied to the original + // cached context state. + const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits( + preTokenization, + postTokenization.slice(0, firstInsertPostIndex+1) + ); + + /* + * Final steps: We can now safely index the transforms. Let's do it! + * 1. Determine the first index a Transform may align to + * 2. Build the transforms + * + * Notes: + * - text applied to the end of a 'merged' token at the tail: should have + * index 0, not -1. + * - pretokenization index will mismatch by -1: -SUM(merge size - 1) + * - Ex: can + ' + t => can't + * -1 0 0 + * - text applied to the end of a 'split' token at the tail: should also + * have index 0, not 1. + * - posttokenization index will mismatch by +1: SUM(split size - 1) + * - new token after 'split': index 1 + * - Ex: can' + ? => can + ' + ? + * 0 -1 0 1 + * + * The first transform applies at the end of the retokenized zone and its + * associated index. The question: were there deletes that occurred? + */ + + const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex; + let shiftDeletes = false; + // first popped entry == 0 - a delete no-op. + if(stackedDeletes[stackedDeletes.length - 1] == 0) { + // the boundary indices found by both methods above differ + if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) { + shiftDeletes = true; + } + + // there are no inserts, so we don't affect the boundary token we landed on. + if(stackedDeletes.length > 1 && transform.insert == '') { + shiftDeletes = true; + } + } + + if(shiftDeletes) { + // Do not add a zero-length delete if we're not actually altering the + // corresponding token at all. + stackedDeletes.pop(); + } + + // The first delete always applies to index 0. 
If the built edge window + // omits a context-final empty-string, adjust the tokenization indices + // accordingly. + const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0); + // Mutates stackedInserts, stackedDeletes. + const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length); + const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex); + + // If there's an empty transform in the 0 position and we already know we're + // dropping tokens - and only deleting - we're dropping an + // otherwise-untracked empty token - make sure it's included! + const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0)); + // Past that, if we have more delete entries than insert entries for our transforms, we + // dropped some tokens outright. + const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0); + + // Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split' + // and not caused by transforms. All transforms always apply in sequence at the end. + const unmappedEdits: EditTuple[] = []; + for(let i = 0; i < editPath.length - transformMap.size; i++) { + const op = editPath[i].op; + switch(op) { + case 'merge': + case 'split': + // already calculated + // can fall through to the `continue;` line. + case 'match': + continue; + default: + // Should only be substitutions here. + // We may wish to add extra analysis in the future when supporting + // prediction from multiple competing tokenizations. + unmappedEdits.push(editPath[i] as EditTuple); + } + } + + return { + alignment: { + edgeWindow: {...edgeWindow, retokenization: retokenizedEdge}, + merges, + splits, + unmappedEdits, + removedTokenCount + }, + tokenizedTransform: transformMap, + }; +} + /** * Constructs a window on one side of the represented context that is aligned to * existing tokenization. 
@@ -777,7 +804,7 @@ interface RetokenizedEdgeWindow extends EdgeWindow { * @returns */ export function buildEdgeWindow( - currentTokens: ContextToken[], + currentTokens: ContextTokenLike[], // Requires deleteRight be explicitly set. transform: Transform & { deleteRight: number }, applyAtFront: boolean, From b6ba677a55cb6997b5d9f1e020cefab949577395 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Thu, 28 May 2026 16:16:32 -0500 Subject: [PATCH 4/4] refactor(web): define common buildCorrectionSequence method used for all model types Build-bot: skip build:web Test-bot: skip --- .../worker-thread/src/main/predict-helpers.ts | 51 ++++++++++++------- ...ine-tokenized-correction-sequence.tests.ts | 2 +- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 35f2df19db8..f89143356b6 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -13,6 +13,7 @@ import { ContextTransition } from './correction/context-transition.js'; import { ExecutionTimer } from './correction/execution-timer.js'; import ModelCompositor from './model-compositor.js'; import { EDIT_DISTANCE_COST_SCALE, getBestTokenMatches } from './correction/distance-modeler.js'; +import { TokenResult } from './correction/tokenization-corrector.js'; const searchForProperty = defaultWordbreaker.searchForProperty; @@ -27,7 +28,6 @@ import Reversion = LexicalModelTypes.Reversion; import Suggestion = LexicalModelTypes.Suggestion; import SuggestionTag = LexicalModelTypes.SuggestionTag; import Transform = LexicalModelTypes.Transform; -import { TokenResult } from './correction/tokenization-corrector.js'; /* * The functions in this file exist to provide unit-testable stateless components for the @@ -471,6 +471,34 @@ export interface PredictionParameters 
{ applyInPost: (entry: CorrectionPredictionTuple) => void } +export function buildCorrectionSequence( + transitionEffects: ReturnType<typeof determineSuggestionRange>, + context: Context, + match: Readonly<TokenResult>, + costFactor: number +) { + const { deleteLeft } = transitionEffects; + + const rootContext = models.applyTransform({insert: '', deleteLeft}, context); + + // Replace the existing context with the correction. + const correctionTransform: Transform = { + insert: match.matchString, // insert correction string + deleteLeft: 0, + } + + const rootCost = match.totalCost; + const predictionRoot = { + sample: correctionTransform, + p: Math.exp(-rootCost * costFactor) + }; + + return { + rootContext, + tokenizedCorrection: [predictionRoot] + }; +} + /** * This function takes in metadata about generated corrections (for models that * implement Traversals) and uses that to produce the corresponding parameters @@ -492,31 +520,20 @@ export function determineTokenizedCorrectionSequence( costFactor: number ): PredictionParameters { const applicationTarget = transition.base.displayTokenization; - const { deleteLeft } = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId); - - const rootContext = models.applyTransform({insert: '', deleteLeft}, transition.base.context); + const transitionParams = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId); - // Replace the existing context with the correction. - const correctionTransform: Transform = { - insert: match.matchString, // insert correction string - deleteLeft: 0, - } + const suggestionParams = buildCorrectionSequence(transitionParams, transition.base.context, match, costFactor); // The correction should always be based on the most recent external // transform/transcription ID.
if(transition.transitionId !== undefined) { - correctionTransform.id = transition.transitionId; + suggestionParams.tokenizedCorrection.map((t) => t.sample.id = transition.transitionId); } - const rootCost = match.totalCost; - const predictionRoot = { - sample: correctionTransform, - p: Math.exp(-rootCost * costFactor) - }; + const { deleteLeft } = transitionParams; return { - rootContext, - tokenizedCorrection: [predictionRoot], + ...suggestionParams, applyInPost: (entry: CorrectionPredictionTuple) => { entry.preservationTransform = tokenization.taillessTrueKeystroke; // // Will need an extra lookup layer if the suggestion is generated from within a cluster. diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts index 16df9814971..f79c5aeb18b 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts @@ -286,7 +286,7 @@ describe('determineTokenizedCorrectionSequence', () => { ]); }); - it(`properly analyzes conplex transition - multi-token replacement`, () => { + it(`properly analyzes complex transition - multi-token replacement`, () => { const context: Context = { left: 'the quick brown f', right: '',