diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/token-result-mapping.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/token-result-mapping.ts index c85e9e9b6d4..862ac1f740f 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/token-result-mapping.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/token-result-mapping.ts @@ -75,6 +75,10 @@ export class TokenResultMapping implements CorrectionResultMapping, return this.node; } + get inputCount(): number { + return this.matchingSpace.inputCount; + } + get inputSequence(): ProbabilityMass[] { return this.node.priorInput; } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts index e47c6b04fad..a63a7179608 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts @@ -34,6 +34,7 @@ import { MAX_EDIT_THRESHOLD_FACTOR } from "./search-quotient-spur.js"; export type TokenResult = { matchString: string, inputSamplingCost: number, + inputCount: number, knownCost: number, totalCost: number } @@ -196,6 +197,7 @@ export class TokenizationCorrector implements CorrectionSearchable { return {exampleInput: t.text, codepointLength: KMWString.length(t.text)} }), lexicalModel, correction.sample); const tokenizedCorrection = tokenizationMapping.tokenizedTransform; const tokenizedCorrectionEntries = [...tokenizedCorrection.values()]; + if(tokenizedCorrection.size > 1 || correction.sample.insert.length >= match.matchString.length) { + match.inputCount = 1; + } + + const suggestionParams = buildCorrectionSequence(transitionEffects, context, match); // IF: array has multiple entries, then build the preservation-transform as below, including the deleteLeft. // If not, don't make one! @@ -451,7 +455,6 @@ export function buildCorrectionSequence( transitionEffects: ReturnType, context: Context, match: Readonly, - costFactor: number ) { const { deleteLeft } = transitionEffects; @@ -463,6 +466,21 @@ export function buildCorrectionSequence( deleteLeft: 0, } + /* If we're dealing with the FIRST keystroke of a new sequence, we'll **dramatically** boost + * the exponent to ensure only VERY nearby corrections have a chance of winning, and only if + * there are significantly more likely words. We only need this to allow very minor fat-finger + * adjustments for 100% keystroke-sequence corrections in order to prevent finickiness on + * key borders. + * + * Technically, the probabilities this produces won't be normalized as-is... but there's no + * true NEED to do so for it, even if it'd be 'nice to have'. Consistently tracking when + * to apply it could become tricky, so it's simpler to leave out. + * + * Worst-case, it's possible to temporarily add normalization if a code deep-dive + * is needed in the future. + */ + const costFactor = (match.inputCount <= 1) ? ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT : 1; + const rootCost = match.totalCost; const predictionRoot = { sample: correctionTransform, @@ -492,13 +510,12 @@ export function buildCorrectionSequence( export function determineTokenizedCorrectionSequence( transition: ContextTransition, tokenization: ContextTokenization, - match: Readonly, - costFactor: number + match: Readonly ): PredictionParameters { const applicationTarget = transition.base.displayTokenization; const transitionParams = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId); - const suggestionParams = buildCorrectionSequence(transitionParams, transition.base.context, match, costFactor); + const suggestionParams = buildCorrectionSequence(transitionParams, transition.base.context, match); // The correction should always be based on the most recent external // transform/transcription ID. @@ -622,28 +639,13 @@ export async function correctAndEnumerate( continue; } - /* If we're dealing with the FIRST keystroke of a new sequence, we'll **dramatically** boost - * the exponent to ensure only VERY nearby corrections have a chance of winning, and only if - * there are significantly more likely words. We only need this to allow very minor fat-finger - * adjustments for 100% keystroke-sequence corrections in order to prevent finickiness on - * key borders. - * - * Technically, the probabilities this produces won't be normalized as-is... but there's no - * true NEED to do so for it, even if it'd be 'nice to have'. Consistently tracking when - * to apply it could become tricky, so it's simpler to leave out. - * - * Worst-case, it's possible to temporarily add normalization if a code deep-dive - * is needed in the future. - */ - const costFactor = (tokenization.tail.inputCount <= 1) ? ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT : 1; - - const predictionPrep = determineTokenizedCorrectionSequence(transition, tokenization, match, costFactor); + const predictionPrep = determineTokenizedCorrectionSequence(transition, tokenization, match); const predictions = predictFromCorrectionSequence(lexicalModel, predictionPrep.tokenizedCorrection, predictionPrep.rootContext, transition.transitionId); predictions.forEach((p) => predictionPrep.applyInPost(p)); // Only set 'best correction' cost when a correction ACTUALLY YIELDS predictions. - if(predictions.length > 0 && bestCorrectionCost === undefined) { - bestCorrectionCost = match.totalCost * costFactor; + if(predictions.length > 0 && (bestCorrectionCost === undefined || bestCorrectionCost > match.totalCost)) { + bestCorrectionCost = match.totalCost; } // If we're getting the same prediction again, it's lower-cost. Update! diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts index 71697b2b7ef..537cd7a714d 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-tokenized-correction-sequence.tests.ts @@ -15,7 +15,7 @@ import * as wordBreakers from '@keymanapp/models-wordbreakers'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; import { KMWString } from '@keymanapp/web-utils'; -import { determineTokenizedCorrectionSequence, models, ContextState, ContextToken, ContextTokenization, CorrectionPredictionTuple } from "@keymanapp/lm-worker/test-index"; +import { determineTokenizedCorrectionSequence, models, ContextState, ContextToken, ContextTokenization, CorrectionPredictionTuple, ModelCompositor } from "@keymanapp/lm-worker/test-index"; import Context = LexicalModelTypes.Context; import ProbabilityMass = LexicalModelTypes.ProbabilityMass; @@ -54,10 +54,10 @@ describe('determineTokenizedCorrectionSequence', () => { transition.final.displayTokenization, { matchString: 'fo', inputSamplingCost: -Math.log(trueInput.p), + inputCount: 2, knownCost: 0, totalCost: -Math.log(trueInput.p) - }, - 1 + } ); assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { @@ -104,10 +104,10 @@ describe('determineTokenizedCorrectionSequence', () => { transition.final.displayTokenization, { matchString: ' ', inputSamplingCost: -Math.log(trueInput.p), + inputCount: 1, knownCost: 0, totalCost: -Math.log(trueInput.p) - }, - 1 + } ); assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { @@ -118,15 +118,12 @@ describe('determineTokenizedCorrectionSequence', () => { endOfBuffer: true }); - assert.deepEqual(results.tokenizedCorrection, [ - { - sample: { - insert: ' ', - deleteLeft: 0 - }, - p: trueInput.p - } - ]); + assert.equal(results.tokenizedCorrection.length, 1); + assert.deepEqual(results.tokenizedCorrection[0].sample, { + insert: ' ', + deleteLeft: 0 + }); + assert.approximately(results.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000); }); it(`properly analyzes common-case word-start - beginning a new token`, () => { @@ -154,10 +151,10 @@ describe('determineTokenizedCorrectionSequence', () => { transition.final.displayTokenization, { matchString: 'f', inputSamplingCost: -Math.log(trueInput.p), + inputCount: 1, knownCost: 0, totalCost: -Math.log(trueInput.p) - }, - 1 + } ); assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { @@ -168,15 +165,13 @@ describe('determineTokenizedCorrectionSequence', () => { endOfBuffer: true }); - assert.deepEqual(results.tokenizedCorrection, [ - { - sample: { - insert: 'f', - deleteLeft: 0 - }, - p: trueInput.p - } - ]); + + assert.equal(results.tokenizedCorrection.length, 1); + assert.deepEqual(results.tokenizedCorrection[0].sample, { + insert: 'f', + deleteLeft: 0 + }); + assert.approximately(results.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000); }); it(`properly analyzes post-merge case`, () => { @@ -210,10 +205,10 @@ describe('determineTokenizedCorrectionSequence', () => { transition.final.displayTokenization, { matchString: 'can\'t', inputSamplingCost: -Math.log(trueInput.p), + inputCount: 5, knownCost: 0, totalCost: -Math.log(trueInput.p) - }, - 1 + } ); assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { @@ -261,10 +256,10 @@ describe('determineTokenizedCorrectionSequence', () => { transition.final.displayTokenization, { matchString: ' ', inputSamplingCost: -Math.log(trueInput.p), + inputCount: 1, knownCost: 0, totalCost: -Math.log(trueInput.p) - }, - 1 + } ); assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, { @@ -275,15 +270,13 @@ describe('determineTokenizedCorrectionSequence', () => { endOfBuffer: true }); - assert.deepEqual(results.tokenizedCorrection, [ - { - sample: { - insert: ' ', - deleteLeft: 0 - }, - p: trueInput.p - } - ]); + + assert.equal(results.tokenizedCorrection.length, 1); + assert.deepEqual(results.tokenizedCorrection[0].sample, { + insert: ' ', + deleteLeft: 0 + }); + assert.approximately(results.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000); }); it(`properly analyzes complex transition - multi-token replacement`, () => { @@ -310,10 +303,10 @@ describe('determineTokenizedCorrectionSequence', () => { transition.final.displayTokenization, { matchString: 'd', inputSamplingCost: -Math.log(trueInput.p), + inputCount: 1, knownCost: 0, totalCost: -Math.log(trueInput.p) - }, - 1 + } ); // Large-scale deletions will receive enhanced handling soon. But, for now, it's @@ -326,15 +319,13 @@ describe('determineTokenizedCorrectionSequence', () => { endOfBuffer: true }); - assert.deepEqual(results.tokenizedCorrection, [ - { - sample: { - insert: 'd', - deleteLeft: 0 - }, - p: trueInput.p - } - ]); + + assert.equal(results.tokenizedCorrection.length, 1); + assert.deepEqual(results.tokenizedCorrection[0].sample, { + insert: 'd', + deleteLeft: 0 + }); + assert.approximately(results.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000); const dummiedTuple: CorrectionPredictionTuple = { prediction: { diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-traversalless-correction-sequences.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-traversalless-correction-sequences.tests.ts index 42efbdf5888..edb7f956993 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-traversalless-correction-sequences.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-traversalless-correction-sequences.tests.ts @@ -13,7 +13,7 @@ import { LexicalModelTypes } from "@keymanapp/common-types"; import * as wordBreakers from '@keymanapp/models-wordbreakers'; import { KMWString } from '@keymanapp/web-utils'; -import { CorrectionPredictionTuple, determineTraversallessCorrectionSequences, models } from "@keymanapp/lm-worker/test-index"; +import { CorrectionPredictionTuple, ModelCompositor, determineTraversallessCorrectionSequences, models } from "@keymanapp/lm-worker/test-index"; import Context = LexicalModelTypes.Context; import DummyModel = models.DummyModel; @@ -204,13 +204,12 @@ describe('determineTraversallessCorrectionSequences', () => { } ); - assert.deepEqual(entry.tokenizedCorrection, [{ - sample: { - insert: '', // empty token after a whitespace. - deleteLeft: 0 - }, - p: trueInput.p - }]); + assert.equal(entry.tokenizedCorrection.length, 1); + assert.deepEqual(entry.tokenizedCorrection[0].sample, { + insert: '', + deleteLeft: 0 + }); + assert.approximately(entry.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000); }); @@ -246,13 +245,12 @@ describe('determineTraversallessCorrectionSequences', () => { } ); - assert.deepEqual(entry.tokenizedCorrection, [{ - sample: { - insert: 'f', - deleteLeft: 0 - }, - p: trueInput.p - }]); + assert.equal(entry.tokenizedCorrection.length, 1); + assert.deepEqual(entry.tokenizedCorrection[0].sample, { + insert: 'f', + deleteLeft: 0 + }); + assert.approximately(entry.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000); }); it(`properly analyzes post-merge case`, () => { @@ -330,13 +328,12 @@ describe('determineTraversallessCorrectionSequences', () => { // } // ); - assert.deepEqual(entry.tokenizedCorrection, [{ - sample: { - insert: '', // empty token after a whitespace. - deleteLeft: 0 - }, - p: trueInput.p - }]); + assert.equal(entry.tokenizedCorrection.length, 1); + assert.deepEqual(entry.tokenizedCorrection[0].sample, { + insert: '', + deleteLeft: 0 + }); + assert.approximately(entry.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000); }); it(`properly analyzes complex transition - multi-token replacement`, () => { @@ -373,13 +370,12 @@ describe('determineTraversallessCorrectionSequences', () => { } ); - assert.deepEqual(entry.tokenizedCorrection, [{ - sample: { - insert: 'd', - deleteLeft: 0 - }, - p: trueInput.p - }]); + assert.equal(entry.tokenizedCorrection.length, 1); + assert.deepEqual(entry.tokenizedCorrection[0].sample, { + insert: 'd', + deleteLeft: 0 + }); + assert.approximately(entry.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000); const dummiedTuple: CorrectionPredictionTuple = { prediction: {