From fb485cb7a349ac9c3f3da48f9e2bc5a6646c40e0 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Thu, 21 May 2026 16:01:35 -0500 Subject: [PATCH] change(web): generalize determineSuggestionRange To facilitate using the same suggestion-application-range logic for all model types, not just the first-class ones that implement LexiconTraversals. Build-bot: skip build:web Test-bot: skip --- .../src/main/correction/context-token.ts | 47 ++++++- .../worker-thread/src/main/predict-helpers.ts | 121 +++++++++++++----- .../determine-suggestion-range.tests.ts | 20 +-- 3 files changed, 142 insertions(+), 46 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts index 0c492c76255..ce1d8f9830a 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts @@ -35,11 +35,42 @@ function textToCharTransforms(text: string, transformId?: number): Transform[] { [...text].map(insert => ({insert, deleteLeft: 0})); } + +/** + * Implements an interface similar to ContextToken that is useful for handling + * cases that should not be considered correctable. + */ +export interface ContextTokenLike { + /** + * Generates text corresponding to the net effects of the most likely inputs + * received that can correspond to the represented token. + */ + exampleInput: string; + + /** + * Reports the length in codepoints of corrected text represented by the + * current token. + */ + codepointLength: number; + + /** + * Whether or not the token is likely still being edited by the user (due to + * adjacency of the caret) + */ + isPartial?: boolean; + + /** + * Gets a compact string-based representation of `inputRange` that + * maps compatible token source ranges to each other. 
+ */ + sourceRangeKey?: string; +} + /** * Represents cached data about one token (either a word or a unit of whitespace) * in the context and associated correction-search progress and results. */ -export class ContextToken { +export class ContextToken implements ContextTokenLike { /** * Indicates whether or not the token is considered whitespace. */ @@ -54,6 +85,10 @@ export class ContextToken { } private _searchModule: SearchQuotientNode; + /** + * Whether or not the token is likely still being edited by the user (due to + * adjacency of the caret) + */ isPartial: boolean; /** @@ -118,6 +153,14 @@ export class ContextToken { return new ContextToken(searchModule, isPartial); } + /** + * Reports the length in codepoints of corrected text represented by the + * current token. + */ + get codepointLength() { + return this._searchModule.codepointLength; + } + get inputCount() { return this._searchModule.inputCount; } @@ -155,7 +198,7 @@ export class ContextToken { /** * Generates text corresponding to the net effects of the most likely inputs - * received that can correspond to the current instance. + * received that can correspond to the represented token. 
*/ get exampleInput(): string { return this.searchModule.bestExample.text; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 891dd923cc8..b35796e83a4 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -5,9 +5,9 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre import TransformUtils from './transformUtils.js'; import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; +import { ContextTokenLike } from './correction/context-token.js'; import { ContextTokenization } from './correction/context-tokenization.js'; import { ContextTracker } from './correction/context-tracker.js'; -import { ContextToken } from './correction/context-token.js'; import { ContextState, determineContextSlideTransform } from './correction/context-state.js'; import { ContextTransition } from './correction/context-transition.js'; import { ExecutionTimer } from './correction/execution-timer.js'; @@ -75,6 +75,43 @@ export const CORRECTION_SEARCH_THRESHOLDS = { REPLACEMENT_SEARCH_THRESHOLD: 4 as const // e^-4 = 0.0183156388. Allows "80%" of an extra edit. } +/** + * Represents the minimum replacement range and effects required for + * suggestions. + * + * These values are based on properties of the transition from their base + * context-tokenization to their target-tokenization (and its represented + * context variant). + */ +export interface SuggestionReplacement<T extends ContextTokenLike> { + /** + * Tokens lost from the base context-tokenization in the target + * context-tokenization due to the transition event. + * + * These are implicitly replaced when applying Suggestions. 
+ */ + tokensToRemove: T[], + + /** + * Tokens added (after the removed tokens) to the base context-tokenization to + * produce the target context-tokenization. + * + * As these are "new" tokens generated by the transition, Suggestions should represent + * corrections and predictions rooted upon these tokens. + */ + tokensToPredict: T[], + + /** + * Indicates the total range of left-deletion needed when applying suggestions. + */ + deleteLeft: number, + + /** + * Indicates the id of the underlying context transition. + */ + transitionId?: number +} + /** * Collates information related to suggestions during the suggestion generation * process. @@ -397,53 +434,67 @@ export function determineSuggestionAlignment( * @param variantForSuggestions * @returns */ -export function determineSuggestionRange( - userContextTokenization: ContextTokenization, - variantForSuggestions: ContextTokenization -): { tokensToRemove: ContextToken[], tokensToPredict: ContextToken[] } { - // Assumption: spaceIds monotonically increase as new ones are generated. - // Given this, we backtrace on the token tails until finding a spot where the - // spaceIds match, dropping any that are newer than the last found in the - // other. - // - // We full-replace all tokens affected by an applied suggestion, so if there's - // a mismatch between the final form of a token, that implies that suggestions - // would replace the original form of the token anyway. - const tokenSetA = userContextTokenization.tokens.slice(); - const tokenSetB = variantForSuggestions.tokens.slice(); - - const tokensToRemove: ContextToken[] = []; - const tokensToPredict: ContextToken[] = []; - - const tailIdFor = (tokens: ContextToken[]) => tokens[tokens.length-1]?.spaceId ?? 
-1; - let tailOfA = tailIdFor(tokenSetA); - let tailOfB = tailIdFor(tokenSetB); - while(tailOfA != tailOfB) { - if(tailOfA < tailOfB) { - tokensToPredict.push(tokenSetB.pop()); - tailOfB = tailIdFor(tokenSetB); - } else { - tokensToRemove.push(tokenSetA.pop()); - tailOfA = tailIdFor(tokenSetA); +export function determineSuggestionRange<T extends ContextTokenLike>( + userContextTokenization: T[], + variantForSuggestions: T[], + equalityChecker: (a: T, b: T) => boolean +): SuggestionReplacement<T> { + // Add null/undefined guards to the equality checker. + const temp = equalityChecker; + equalityChecker = (a, b) => { + if(!a || !b) { + return false; + } + + return temp(a, b); + } + + const deleteLeftCalc = (tokenSet: T[], predictCount: number) => { + // TODO: once we start activating multi-tokenization for real, only the + // 'reduce' component should remain. + return (predictCount > 1) + ? (tokenSet[tokenSet.length - 1]?.codepointLength ?? 0) + : tokenSet.reduce((prev, curr) => prev + curr.codepointLength, 0); + } + + const tokenSetA = userContextTokenization.slice(); + const tokenSetB = variantForSuggestions.slice(); + + let aHeadIndexInB = tokenSetB.findIndex((t) => equalityChecker(t, tokenSetA[0])); + let bHeadIndexInA = tokenSetA.findIndex((t) => equalityChecker(t, tokenSetB[0])); + + if(aHeadIndexInB == -1 && bHeadIndexInA == -1) { + // Both are full replacements. 
+ return { + tokensToRemove: tokenSetA, + tokensToPredict: tokenSetB, + deleteLeft: deleteLeftCalc(tokenSetA, tokenSetB.length) } + } else if(aHeadIndexInB != 0 && bHeadIndexInA != 0) { + throw new Error("Leading edge of context should not differ in both tokenizations."); + } + + let tailOffset = 0; + while(equalityChecker(tokenSetA[bHeadIndexInA + tailOffset], tokenSetB[aHeadIndexInB + tailOffset])) { + tailOffset++; } - tokensToPredict.reverse(); + const tokensToRemove: T[] = tokenSetA.slice(bHeadIndexInA + tailOffset); + const tokensToPredict: T[] = tokenSetB.slice(aHeadIndexInB + tailOffset); // Can occur when backspacing to the end of a previous word. if(tokensToPredict.length == 0) { if(tokenSetA.length == 0 || tokenSetB.length == 0) { throw new Error("Invalid state - a tokenization is missing expected tokens"); } - tokensToRemove.push(tokenSetA.pop()); - tokensToPredict.push(tokenSetB.pop()); + tokensToRemove.unshift(tokenSetA[bHeadIndexInA + tailOffset - 1]); + tokensToPredict.unshift(tokenSetB[aHeadIndexInB + tailOffset - 1]); } - tokensToRemove.reverse(); - return { tokensToRemove, - tokensToPredict + tokensToPredict, + deleteLeft: deleteLeftCalc(tokensToRemove, tokensToPredict.length) } } diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts index 3445abaadd9..e18174895ec 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-range.tests.ts @@ -170,12 +170,14 @@ function buildQuickBrownFixture() { }; } +const tokenEquality = (a: ContextToken, b: ContextToken) => a.spaceId == b.spaceId; + describe('determineSuggestionRange', () => { it('adjusts the final token if no 
tokenization changes occur', () => { const fixture = buildQuickBrownFixture(); const noChange = fixture.variations.noChange; - const analysis = determineSuggestionRange(fixture.baseTokenization, noChange.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, noChange.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, noChange.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, noChange.range.tokensToPredict); @@ -185,7 +187,7 @@ describe('determineSuggestionRange', () => { const fixture = buildQuickBrownFixture(); const plainInsert = fixture.variations.plainInsert; - const analysis = determineSuggestionRange(fixture.baseTokenization, plainInsert.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, plainInsert.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, plainInsert.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, plainInsert.range.tokensToPredict); @@ -195,7 +197,7 @@ describe('determineSuggestionRange', () => { const fixture = buildQuickBrownFixture(); const newTokenInsert = fixture.variations.newTokenInsert; - const analysis = determineSuggestionRange(fixture.baseTokenization, newTokenInsert.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, newTokenInsert.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, newTokenInsert.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, newTokenInsert.range.tokensToPredict); @@ -205,7 +207,7 @@ describe('determineSuggestionRange', () => { const fixture = buildQuickBrownFixture(); const charReplace = fixture.variations.charReplace; - const analysis = determineSuggestionRange(fixture.baseTokenization, charReplace.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, 
charReplace.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, charReplace.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, charReplace.range.tokensToPredict); @@ -215,7 +217,7 @@ describe('determineSuggestionRange', () => { const fixture = buildQuickBrownFixture(); const del5Insert5 = fixture.variations.del5Insert5; - const analysis = determineSuggestionRange(fixture.baseTokenization, del5Insert5.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, del5Insert5.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, del5Insert5.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, del5Insert5.range.tokensToPredict); @@ -225,7 +227,7 @@ describe('determineSuggestionRange', () => { const fixture = buildQuickBrownFixture(); const eraseToken = fixture.variations.eraseToken; - const analysis = determineSuggestionRange(fixture.baseTokenization, eraseToken.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, eraseToken.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, eraseToken.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, eraseToken.range.tokensToPredict); @@ -235,7 +237,7 @@ describe('determineSuggestionRange', () => { const fixture = buildQuickBrownFixture(); const deleteToBound = fixture.variations.deleteToBound; - const analysis = determineSuggestionRange(fixture.baseTokenization, deleteToBound.tokenization); + const analysis = determineSuggestionRange(fixture.baseTokenization.tokens, deleteToBound.tokenization.tokens, tokenEquality); assert.sameOrderedMembers(analysis.tokensToRemove, deleteToBound.range.tokensToRemove); assert.sameOrderedMembers(analysis.tokensToPredict, deleteToBound.range.tokensToPredict); @@ -255,7 +257,7 @@ describe('determineSuggestionRange', () => { null ) - const analysis = 
determineSuggestionRange(originalQuickBrownTokenization, foxVsAlligatorTokenization); + const analysis = determineSuggestionRange(originalQuickBrownTokenization.tokens, foxVsAlligatorTokenization.tokens, tokenEquality); assert.sameOrderedMembers( analysis.tokensToRemove, @@ -279,7 +281,7 @@ describe('determineSuggestionRange', () => { null ) - const analysis = determineSuggestionRange(originalQuickBrownTokenization, dogsAndCatTokenization); + const analysis = determineSuggestionRange(originalQuickBrownTokenization.tokens, dogsAndCatTokenization.tokens, tokenEquality); assert.sameOrderedMembers( analysis.tokensToRemove,