diff --git a/web/src/engine/predictive-text/templates/src/tokenization.ts b/web/src/engine/predictive-text/templates/src/tokenization.ts index fd8ed28d5ca..47ef927fa5b 100644 --- a/web/src/engine/predictive-text/templates/src/tokenization.ts +++ b/web/src/engine/predictive-text/templates/src/tokenization.ts @@ -95,6 +95,10 @@ export function tokenize( currentIndex = nextIndex; } + if(tokenization.left.length == 0) { + tokenization.left.push({text: '', isWhitespace: false}); + } + // New step 2: handle any rejoins needed. // Handle any desired special handling for directly-pre-caret scenarios - where for this diff --git a/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts index 071cad588c5..7b5115e308f 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts @@ -71,7 +71,8 @@ export function determineModelTokenizer(model: LexicalModel) { if(model.wordbreaker) { return models.tokenize(model.wordbreaker, context); } else { - return null; + // Not ideal for pre-14.0 models, but it'll do for now. 
+ return models.tokenize(wordBreakers.default, context); } } } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index f89143356b6..99cc9c82812 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -6,7 +6,7 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre import TransformUtils from './transformUtils.js'; import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; import { ContextTokenLike } from './correction/context-token.js'; -import { ContextTokenization } from './correction/context-tokenization.js'; +import { ContextTokenization, mapWhitespacedTokenization } from './correction/context-tokenization.js'; import { ContextTracker } from './correction/context-tracker.js'; import { ContextState, determineContextSlideTransform } from './correction/context-state.js'; import { ContextTransition } from './correction/context-transition.js'; @@ -191,88 +191,64 @@ export function tupleDisplayOrderSort(a: CorrectionPredictionTuple, b: Correctio return b.totalProb - a.totalProb; } -export async function correctAndEnumerateWithoutTraversals( +export function determineTraversallessCorrectionSequences( lexicalModel: LexicalModel, - transformDistribution: Distribution, + corrections: Distribution, context: Context -): Promise<{ - /** - * For models that support correction-search caching, this provides the - * cached object corresponding to this method's operation. - * - * Otherwise, is `null`. - */ - postContextState?: ContextState; +): PredictionParameters[] { + let returnedPredictionData: PredictionParameters[] = []; - /** - * The suggestions generated based on the user's input state. 
- */ - rawPredictions: CorrectionPredictionTuple[]; + const tokenizer = determineModelTokenizer(lexicalModel); + const wordbreak = determineModelWordbreaker(lexicalModel); - /** - * The id of a prior ContextTransition event that triggered a Suggestion found - * at the end of the Context. Will be undefined if no edits have occurred - * since the Suggestion was applied. - */ - revertableTransitionId?: number -}> { - const inputTransform = transformDistribution[0].sample; - let rawPredictions: CorrectionPredictionTuple[] = []; + const tokenization = tokenizer(context); // issue at present if no tokens exist! + const tokenMapper = (t: models.Token) => { + return { + exampleInput: t.text, + codepointLength: KMWString.length(t.text) + } as ContextTokenLike; + } - let predictionRoots: ProbabilityMass[]; + for(let correction of corrections) { + // Step 1: determine tokenization effects. We can't use the + // ContextTokenization pattern due to the model's lack of LexiconTraversal + // support, though. + const transformId = correction.sample.id; + const postContext = models.applyTransform(correction.sample, context); + const postTokenization = tokenizer(postContext); + + const transitionEffects = determineSuggestionRange(tokenization.left.map(tokenMapper), postTokenization.left.map(tokenMapper), (a, b) => a.exampleInput == b.exampleInput); + const match: TokenResult = { + matchString: wordbreak(postContext), + inputSamplingCost: -Math.log(correction.p), + knownCost: 0, + totalCost: -Math.log(correction.p) + }; - // Only allow new-word suggestions if space was the most likely keypress. - const allowSpace = TransformUtils.isWhitespace(inputTransform); - const allowBksp = TransformUtils.isBackspace(inputTransform); + const suggestionParams = buildCorrectionSequence(transitionEffects, context, match, 1); - // Generates raw prediction distributions for each valid input. Can only 'correct' - // against the final input. - // - // This is the old, 12.0-13.0 'correction' style. 
- if(allowSpace) { - // Detect start of new word; prevent whitespace loss here. - predictionRoots = [{sample: inputTransform, p: 1.0}]; - } else { - predictionRoots = transformDistribution.map((alt) => { - let transform = alt.sample; - - // Filter out special keys unless they're expected. - if(TransformUtils.isWhitespace(transform) && !allowSpace) { - return null; - } else if(TransformUtils.isBackspace(transform) && !allowBksp) { - return null; - } + const tokenizationMapping = mapWhitespacedTokenization(tokenization.left.map((t) => { return {exampleInput: t.text, codepointLength: KMWString.length(t.text)} }), lexicalModel, correction.sample); + const tokenizedCorrection = tokenizationMapping.tokenizedTransform; + const tokenizedCorrectionEntries = [...tokenizedCorrection.values()]; - return alt; - }); - } + // TODO: only build the preservation-transform below (including its deleteLeft) when the array has multiple entries. + // When it has just one entry, no preservation-transform should be made; at present, one is always built. + const preservationTransform = tokenizedCorrectionEntries.slice(0, -1).reduce((accum, curr) => { + return { insert: accum.insert + curr.insert, deleteLeft: accum.deleteLeft + curr.deleteLeft }; + }, { insert: '', deleteLeft: 0, id: correction.sample.id}); - const wordbreak = determineModelWordbreaker(lexicalModel); - // Remove `null` entries, then determine suggestions. 
- predictionRoots.forEach((pr) => { - const postContext = models.applyTransform(pr.sample, context); - const tailTokenText = wordbreak(postContext); - const rootContext = models.applyTransform({insert: '', deleteLeft: KMWString.length(tailTokenText)}, postContext); - - const results = predictFromCorrectionSequence(lexicalModel, [{ - sample: { - insert: tailTokenText, - deleteLeft: 0, - id: pr.sample.id - }, - p: pr.p - }], rootContext, pr.sample.id); - results.forEach((r) => rawPredictions.push(r)); - }) - - if(allowSpace) { - rawPredictions.forEach((entry) => entry.preservationTransform = inputTransform); + returnedPredictionData.push({ + ...suggestionParams, + applyInPost: (p) => { + p.preservationTransform = preservationTransform; + if(transformId) { + p.prediction.sample.transformId = transformId; + } + } + }) } - return { - postContextState: null, - rawPredictions: rawPredictions - }; + return returnedPredictionData; } /** @@ -586,7 +562,14 @@ export async function correctAndEnumerate( // It's mostly here to support models compiled before Keyman 14.0, which was // when the `LexiconTraversal` pattern was established. 
 if(!contextTracker) { - return correctAndEnumerateWithoutTraversals(lexicalModel, transformDistribution, context); + const predictionData = determineTraversallessCorrectionSequences(lexicalModel, transformDistribution, context); + return { + rawPredictions: predictionData.flatMap((entry) => { + const predictions = predictFromCorrectionSequence(lexicalModel, entry.tokenizedCorrection, entry.rootContext, transformDistribution[0]?.sample.id); + predictions.forEach((p) => entry.applyInPost(p)); + return predictions; + }) + }; } // 'else': the current, 14.0+ pattern, which is able to leverage diff --git a/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts index 3bc636c4128..0aa4f9551ed 100644 --- a/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts @@ -175,7 +175,7 @@ describe('Tokenization functions', function() { }); it('properly handles empty-context cases', function() { - // Wordbreaking on a empty space => no word. + // Wordbreaking on an empty space => no word, but empty initial token. let context = { left: '', startOfBuffer: true, right: '', endOfBuffer: true @@ -184,7 +184,7 @@ describe('Tokenization functions', function() { let tokenization = models.tokenize(wordBreakers.default, context); let expectedResult: models.Tokenization = { - left: [], + left: [{text: '', isWhitespace: false}], right: [], caretSplitsToken: false }; @@ -193,11 +193,11 @@ describe('Tokenization functions', function() { }); it('properly handles null context cases', function() { - // Wordbreaking on a empty space => no word. + // Wordbreaking on an empty space => no word, but empty initial token. 
let tokenization = models.tokenize(wordBreakers.default, null); let expectedResult: models.Tokenization = { - left: [], + left: [{text: '', isWhitespace: false}], right: [], caretSplitsToken: false }; diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts index f79c5aeb18b..71697b2b7ef 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts @@ -236,7 +236,7 @@ describe('determineTokenizedCorrectionSequence', () => { }); // Will be handled far better after resolving multi-tokenization handling. - it.skip(`properly analyzes post-split case`, () => { + it.skip(`properly analyzes post-split new-wordbreak case`, () => { const context: Context = { left: 'the quick brown fox can\'', right: '', diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-traversalless-correction-sequences.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-traversalless-correction-sequences.tests.ts new file mode 100644 index 00000000000..42efbdf5888 --- /dev/null +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-traversalless-correction-sequences.tests.ts @@ -0,0 +1,406 @@ +/* + * Keyman is copyright (C) SIL Global. MIT License. + * + * Created by jahorton on 2026-05-18 + * + * This file tests the prediction helper-method responsible for preparing + * corrections for multi-token prediction for some custom and all legacy models. 
+ */ + +import { assert } from 'chai'; + +import { LexicalModelTypes } from "@keymanapp/common-types"; +import * as wordBreakers from '@keymanapp/models-wordbreakers'; +import { KMWString } from '@keymanapp/web-utils'; + +import { CorrectionPredictionTuple, determineTraversallessCorrectionSequences, models } from "@keymanapp/lm-worker/test-index"; + +import Context = LexicalModelTypes.Context; +import DummyModel = models.DummyModel; +import DummyOptions = models.DummyOptions; +import ProbabilityMass = LexicalModelTypes.ProbabilityMass; +import Transform = LexicalModelTypes.Transform; + + +/* + * This file's tests use these parts of a lexical model: + * - model.wordbreaker + * - model.toKey + * - model.applyCasing + * - model.punctuation + */ + +const DUMMY_MODEL_CONFIG: DummyOptions = { + punctuation: { + quotesForKeepSuggestion: { + open: '<', + close: '>' + }, + insertAfterWord: '\u00a0' // non-breaking space + }, + wordbreaker: wordBreakers.default +}; + +const testModel = new DummyModel({ + ...DUMMY_MODEL_CONFIG, + // No suggestions needed here, so we don't define any. +}); + +describe('determineTraversallessCorrectionSequences', () => { + it(`processes common-case corrections correctly - on context reset with existing text`, () => { + const context = { + left: 'appl', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: '', + deleteLeft: 0 + }, + p: 1 + }; + + const predictionRootEntries = determineTraversallessCorrectionSequences(testModel, [trueInput], context); + + assert.equal(predictionRootEntries.length, 1); + const entry = predictionRootEntries[0]; + + assert.deepEqual( + { + ...entry.rootContext, casingForm: entry.rootContext.casingForm ?? 
undefined + }, { + casingForm: undefined, + left: '', + right: '', + startOfBuffer: true, + endOfBuffer: true + } + ); + + assert.deepEqual(entry.tokenizedCorrection, [{ + sample: { + insert: 'appl', + deleteLeft: 0 + }, + p: trueInput.p + }]); + }); + + it(`processes standard-case corrections correctly - text appended to existing token`, () => { + const context: Context = { + left: 'I want an iPhon', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 'e', + deleteLeft: 0 + }, + p: 1 + }; + + const predictionRootEntries = determineTraversallessCorrectionSequences(testModel, [trueInput], context); + + assert.equal(predictionRootEntries.length, 1); + const entry = predictionRootEntries[0]; + + assert.deepEqual( + { + ...entry.rootContext, casingForm: entry.rootContext.casingForm ?? undefined + }, { + casingForm: undefined, + left: 'I want an ', + right: '', + startOfBuffer: true, + endOfBuffer: true + } + ); + + assert.deepEqual(entry.tokenizedCorrection, [{ + sample: { + insert: 'iPhone', + deleteLeft: 0 + }, + p: trueInput.p + }]); + }); + + it(`properly analyzes common-case token-extension - adding a letter to an existing word`, () => { + const context: Context = { + left: 'the quick brown f', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 'o', + deleteLeft: 0 + }, + p: .5 + }; + + const predictionRootEntries = determineTraversallessCorrectionSequences(testModel, [trueInput], context); + assert.equal(predictionRootEntries.length, 1); + const entry = predictionRootEntries[0]; + + assert.deepEqual( + { + ...entry.rootContext, casingForm: entry.rootContext.casingForm ?? 
undefined + }, { + casingForm: undefined, + left: 'the quick brown ', + right: '', + startOfBuffer: true, + endOfBuffer: true + } + ); + + assert.deepEqual(entry.tokenizedCorrection, [{ + sample: { + insert: 'fo', + deleteLeft: 0 + }, + p: trueInput.p + }]); + }); + + it(`properly analyzes common-case whitespace - ending a token and adding a new one`, () => { + const context: Context = { + left: 'the quick brown', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: ' ', + deleteLeft: 0 + }, + p: .5 + }; + + const predictionRootEntries = determineTraversallessCorrectionSequences(testModel, [trueInput], context); + assert.equal(predictionRootEntries.length, 1); + const entry = predictionRootEntries[0]; + + assert.deepEqual( + { + ...entry.rootContext, casingForm: entry.rootContext.casingForm ?? undefined + }, { + casingForm: undefined, + left: 'the quick brown', + right: '', + startOfBuffer: true, + endOfBuffer: true + } + ); + + assert.deepEqual(entry.tokenizedCorrection, [{ + sample: { + insert: '', // empty token after a whitespace. + deleteLeft: 0 + }, + p: trueInput.p + }]); + }); + + + it(`properly analyzes common-case word-start - beginning a new token`, () => { + const context: Context = { + left: 'the quick brown ', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 'f', + deleteLeft: 0 + }, + p: .5 + }; + + const predictionRootEntries = determineTraversallessCorrectionSequences(testModel, [trueInput], context); + assert.equal(predictionRootEntries.length, 1); + const entry = predictionRootEntries[0]; + + assert.deepEqual( + { + ...entry.rootContext, casingForm: entry.rootContext.casingForm ?? 
undefined + }, { + casingForm: undefined, + left: 'the quick brown ', + right: '', + startOfBuffer: true, + endOfBuffer: true + } + ); + + assert.deepEqual(entry.tokenizedCorrection, [{ + sample: { + insert: 'f', + deleteLeft: 0 + }, + p: trueInput.p + }]); + }); + + it(`properly analyzes post-merge case`, () => { + let context: Context = { + left: 'the quick brown fox can\'', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 't', + deleteLeft: 0 + }, + p: .5 + }; + + const predictionRootEntries = determineTraversallessCorrectionSequences(testModel, [trueInput], context); + assert.equal(predictionRootEntries.length, 1); + const entry = predictionRootEntries[0]; + + assert.deepEqual( + { + ...entry.rootContext, casingForm: entry.rootContext.casingForm ?? undefined + }, { + casingForm: undefined, + left: 'the quick brown fox ', + right: '', + startOfBuffer: true, + endOfBuffer: true + } + ); + + assert.deepEqual(entry.tokenizedCorrection, [{ + sample: { + insert: 'can\'t', + deleteLeft: 0 + }, + p: trueInput.p + }]); + }); + + it(`properly analyzes post-split new-wordbreak case`, () => { + const context: Context = { + left: 'the quick brown fox can\'', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: ' ', + deleteLeft: 0 + }, + p: .5 + }; + + const predictionRootEntries = determineTraversallessCorrectionSequences(testModel, [trueInput], context); + assert.equal(predictionRootEntries.length, 1); + const entry = predictionRootEntries[0]; + + // assert.deepEqual( + // { + // ...entry.rootContext, casingForm: entry.rootContext.casingForm ?? undefined + // }, { + // casingForm: undefined, + // // Proper logic requires full multi-token awareness; predictions are currently + // // based on just the last token. 
+ // left: 'the quick brown fox can\'', + // right: '', + // startOfBuffer: true, + // endOfBuffer: true + // } + // ); + + assert.deepEqual(entry.tokenizedCorrection, [{ + sample: { + insert: '', // empty token after a whitespace. + deleteLeft: 0 + }, + p: trueInput.p + }]); + }); + + it(`properly analyzes complex transition - multi-token replacement`, () => { + const context: Context = { + left: 'the quick brown f', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 'fast red d', + deleteLeft: 'quick brown f'.length + }, + p: .5 + }; + + const predictionRootEntries = determineTraversallessCorrectionSequences(testModel, [trueInput], context); + assert.equal(predictionRootEntries.length, 1); + const entry = predictionRootEntries[0]; + + assert.deepEqual( + { + ...entry.rootContext, casingForm: entry.rootContext.casingForm ?? undefined + }, { + casingForm: undefined, + // Large-scale deletions will receive enhanced handling soon. But, for now, it's + // deleted by the `preservationTransform`, not here. + left: 'the quick brown ', + right: '', + startOfBuffer: true, + endOfBuffer: true + } + ); + + assert.deepEqual(entry.tokenizedCorrection, [{ + sample: { + insert: 'd', + deleteLeft: 0 + }, + p: trueInput.p + }]); + + const dummiedTuple: CorrectionPredictionTuple = { + prediction: { + sample: { + transform: { insert: 'dog', deleteLeft: 0 }, + displayAs: 'dog' + }, + p: .25 + }, + correction: { + sample: 'd', + p: trueInput.p + }, + totalProb: .25 * trueInput.p + }; + + entry.applyInPost(dummiedTuple); + + assert.deepEqual(dummiedTuple.preservationTransform, { + insert: trueInput.sample.insert.substring(0, KMWString.length(trueInput.sample.insert) - 1), // remove the 'd'. 
+ deleteLeft: trueInput.sample.deleteLeft - 1 + }); + }); +}); \ No newline at end of file diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-custom-punctuation.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-custom-punctuation.tests.ts index cd4dbd106e0..01f1a6ac96a 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-custom-punctuation.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-custom-punctuation.tests.ts @@ -87,17 +87,30 @@ describe('Custom Punctuation', function () { // the tests run smoothly. wordbreaker: (text) => { const textLen = text.length; - if(text.charAt(textLen - 1) == " ") { - return [ - {text: text.substring(0, 1), start: 0, end: 1, length: 1}, - {text: text.substring(1, textLen-2), start: 1, end: textLen-1, length: textLen-2}, - {text: text.substring(textLen-1), start: textLen-1, end: textLen, length: 1} - ]; + if(text.charAt(0) == "᚛") { // ensure the prior token component (the '᚛') wordbreaks. + if(text.charAt(textLen - 1) == " ") { // ensure the insert-after component word-breaks. 
+ return [ + {text: text.substring(0, 1), start: 0, end: 1, length: 1}, + {text: text.substring(1, textLen-2), start: 1, end: textLen-1, length: textLen-2}, + {text: text.substring(textLen-1), start: textLen-1, end: textLen, length: 1} + ]; + } else { + return [ + {text: text.substring(0, 1), start: 0, end: 1, length: 1}, + {text: text.substring(1), start: 1, end: textLen, length: textLen-1} + ]; + } } else { - return [ - {text: text.substring(0, 1), start: 0, end: 1, length: 1}, - {text: text.substring(1), start: 1, end: textLen, length: textLen-1} - ]; + if(text.charAt(textLen - 1) == " ") { + return [ + {text: text.substring(0, textLen-2), start: 0, end: textLen-1, length: textLen-1}, + {text: text.substring(textLen-1), start: textLen-1, end: textLen, length: 1} + ]; + } else { + return [ + {text: text, start: 0, end: textLen, length: textLen} + ]; + } } } });