Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ export class TokenResultMapping implements CorrectionResultMapping<SearchNode>,
return this.node;
}

/**
 * Reports the `inputCount` of the search space this mapping matches against.
 *
 * The value is read straight from `matchingSpace`; no caching or adjustment
 * happens here.
 * NOTE(review): presumably the number of inputs (keystrokes) represented by
 * the space - confirm against the `matchingSpace` type.
 */
get inputCount(): number {
  const { inputCount } = this.matchingSpace;
  return inputCount;
}

/**
 * Exposes the `priorInput` recorded on the wrapped search node as this
 * mapping's input sequence.
 *
 * The array is returned as-is (not copied), so callers share the node's
 * instance.
 */
get inputSequence(): ProbabilityMass<Transform>[] {
  const { priorInput } = this.node;
  return priorInput;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import { MAX_EDIT_THRESHOLD_FACTOR } from "./search-quotient-spur.js";
/**
 * Summary of a correction-search result for a single token.
 *
 * Field notes below reflect how the visible call sites populate the record;
 * other producers may exist elsewhere.
 */
export type TokenResult = {
  /** Text matched for the token (e.g. `lockedResult.text`). */
  matchString: string;
  /** Negative log of the probability of the sampled input (`-Math.log(p)`). */
  inputSamplingCost: number;
  /**
   * Input count associated with the token's search space.
   * Consumers compare it against 1 to detect the single-input case.
   */
  inputCount: number;
  /**
   * Cost already attributed to edits: 0 for uncorrected results, or
   * `MAX_EDIT_THRESHOLD_FACTOR` when search was cut off at the edit threshold.
   */
  knownCost: number;
  /**
   * Overall cost of the result.
   * NOTE(review): appears to be `inputSamplingCost` plus `knownCost` scaled by
   * `EDIT_DISTANCE_COST_SCALE` - confirm against all producers.
   */
  totalCost: number;
};
Expand Down Expand Up @@ -196,6 +197,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
this._generatedTokenResults.set(uncorrectable.spaceId, {
matchString: lockedResult.text,
inputSamplingCost: -Math.log(lockedResult.p),
inputCount: uncorrectable.inputCount,
knownCost: 0,
totalCost: -Math.log(lockedResult.p)
});
Expand Down Expand Up @@ -298,6 +300,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
this._generatedTokenResults.set(correctableToUpdate.spaceId, {
matchString: lockedResult.text,
inputSamplingCost: -Math.log(lockedResult.p),
inputCount: correctableToUpdate.inputCount,
knownCost: MAX_EDIT_THRESHOLD_FACTOR, // we'll use the same threshold at which further search is terminated.
totalCost: -Math.log(lockedResult.p) + MAX_EDIT_THRESHOLD_FACTOR * EDIT_DISTANCE_COST_SCALE
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,15 +221,19 @@ export function determineTraversallessCorrectionSequences(
const match: TokenResult = {
matchString: wordbreak(postContext),
inputSamplingCost: -Math.log(correction.p),
inputCount: 2,
knownCost: 0,
totalCost: -Math.log(correction.p)
};

const suggestionParams = buildCorrectionSequence(transitionEffects, context, match, 1);

const tokenizationMapping = mapWhitespacedTokenization(tokenization.left.map((t) => { return {exampleInput: t.text, codepointLength: KMWString.length(t.text)} }), lexicalModel, correction.sample);
const tokenizedCorrection = tokenizationMapping.tokenizedTransform;
const tokenizedCorrectionEntries = [...tokenizedCorrection.values()];
if(tokenizedCorrection.size > 1 || correction.sample.insert.length >= match.matchString.length) {
match.inputCount = 1;
}

const suggestionParams = buildCorrectionSequence(transitionEffects, context, match);

// IF: array has multiple entries, then build the preservation-transform as below, including the deleteLeft.
// If not, don't make one!
Expand Down Expand Up @@ -451,7 +455,6 @@ export function buildCorrectionSequence(
transitionEffects: ReturnType<typeof determineSuggestionRange>,
context: Context,
match: Readonly<TokenResult>,
costFactor: number
) {
const { deleteLeft } = transitionEffects;

Expand All @@ -463,6 +466,21 @@ export function buildCorrectionSequence(
deleteLeft: 0,
}

/* If we're dealing with the FIRST keystroke of a new sequence, we'll **dramatically** boost
* the exponent to ensure only VERY nearby corrections have a chance of winning, and only if
* there are significantly more likely words. We only need this to allow very minor fat-finger
* adjustments for 100% keystroke-sequence corrections in order to prevent finickiness on
* key borders.
*
* Technically, the probabilities this produces won't be normalized as-is... but there's no
* true NEED to normalize them, even if it'd be 'nice to have'. Consistently tracking when
* to apply normalization could become tricky, so it's simpler to leave it out.
*
* Worst-case, it's possible to temporarily add normalization if a code deep-dive
* is needed in the future.
*/
const costFactor = (match.inputCount <= 1) ? ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT : 1;

const rootCost = match.totalCost;
const predictionRoot = {
sample: correctionTransform,
Expand Down Expand Up @@ -492,13 +510,12 @@ export function buildCorrectionSequence(
export function determineTokenizedCorrectionSequence(
transition: ContextTransition,
tokenization: ContextTokenization,
match: Readonly<TokenResult>,
costFactor: number
match: Readonly<TokenResult>
): PredictionParameters {
const applicationTarget = transition.base.displayTokenization;
const transitionParams = determineSuggestionRange(applicationTarget.tokens, tokenization.tokens, (a, b) => a.spaceId == b.spaceId);

const suggestionParams = buildCorrectionSequence(transitionParams, transition.base.context, match, costFactor);
const suggestionParams = buildCorrectionSequence(transitionParams, transition.base.context, match);

// The correction should always be based on the most recent external
// transform/transcription ID.
Expand Down Expand Up @@ -622,28 +639,13 @@ export async function correctAndEnumerate(
continue;
}

/* If we're dealing with the FIRST keystroke of a new sequence, we'll **dramatically** boost
* the exponent to ensure only VERY nearby corrections have a chance of winning, and only if
* there are significantly more likely words. We only need this to allow very minor fat-finger
* adjustments for 100% keystroke-sequence corrections in order to prevent finickiness on
* key borders.
*
* Technically, the probabilities this produces won't be normalized as-is... but there's no
* true NEED to do so for it, even if it'd be 'nice to have'. Consistently tracking when
* to apply it could become tricky, so it's simpler to leave out.
*
* Worst-case, it's possible to temporarily add normalization if a code deep-dive
* is needed in the future.
*/
const costFactor = (tokenization.tail.inputCount <= 1) ? ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT : 1;

const predictionPrep = determineTokenizedCorrectionSequence(transition, tokenization, match, costFactor);
const predictionPrep = determineTokenizedCorrectionSequence(transition, tokenization, match);
const predictions = predictFromCorrectionSequence(lexicalModel, predictionPrep.tokenizedCorrection, predictionPrep.rootContext, transition.transitionId);
predictions.forEach((p) => predictionPrep.applyInPost(p));

// Only set 'best correction' cost when a correction ACTUALLY YIELDS predictions.
if(predictions.length > 0 && bestCorrectionCost === undefined) {
bestCorrectionCost = match.totalCost * costFactor;
if(predictions.length > 0 && (bestCorrectionCost === undefined || bestCorrectionCost > match.totalCost)) {
bestCorrectionCost = match.totalCost;
}

// If we're getting the same prediction again, it's lower-cost. Update!
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import * as wordBreakers from '@keymanapp/models-wordbreakers';
import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs';
import { KMWString } from '@keymanapp/web-utils';

import { determineTokenizedCorrectionSequence, models, ContextState, ContextToken, ContextTokenization, CorrectionPredictionTuple } from "@keymanapp/lm-worker/test-index";
import { determineTokenizedCorrectionSequence, models, ContextState, ContextToken, ContextTokenization, CorrectionPredictionTuple, ModelCompositor } from "@keymanapp/lm-worker/test-index";

import Context = LexicalModelTypes.Context;
import ProbabilityMass = LexicalModelTypes.ProbabilityMass;
Expand Down Expand Up @@ -54,10 +54,10 @@ describe('determineTokenizedCorrectionSequence', () => {
transition.final.displayTokenization, {
matchString: 'fo',
inputSamplingCost: -Math.log(trueInput.p),
inputCount: 2,
knownCost: 0,
totalCost: -Math.log(trueInput.p)
},
1
}
);

assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
Expand Down Expand Up @@ -104,10 +104,10 @@ describe('determineTokenizedCorrectionSequence', () => {
transition.final.displayTokenization, {
matchString: ' ',
inputSamplingCost: -Math.log(trueInput.p),
inputCount: 1,
knownCost: 0,
totalCost: -Math.log(trueInput.p)
},
1
}
);

assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
Expand All @@ -118,15 +118,12 @@ describe('determineTokenizedCorrectionSequence', () => {
endOfBuffer: true
});

assert.deepEqual(results.tokenizedCorrection, [
{
sample: {
insert: ' ',
deleteLeft: 0
},
p: trueInput.p
}
]);
assert.equal(results.tokenizedCorrection.length, 1);
assert.deepEqual(results.tokenizedCorrection[0].sample, {
insert: ' ',
deleteLeft: 0
});
assert.approximately(results.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000);
});

it(`properly analyzes common-case word-start - beginning a new token`, () => {
Expand Down Expand Up @@ -154,10 +151,10 @@ describe('determineTokenizedCorrectionSequence', () => {
transition.final.displayTokenization, {
matchString: 'f',
inputSamplingCost: -Math.log(trueInput.p),
inputCount: 1,
knownCost: 0,
totalCost: -Math.log(trueInput.p)
},
1
}
);

assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
Expand All @@ -168,15 +165,13 @@ describe('determineTokenizedCorrectionSequence', () => {
endOfBuffer: true
});

assert.deepEqual(results.tokenizedCorrection, [
{
sample: {
insert: 'f',
deleteLeft: 0
},
p: trueInput.p
}
]);

assert.equal(results.tokenizedCorrection.length, 1);
assert.deepEqual(results.tokenizedCorrection[0].sample, {
insert: 'f',
deleteLeft: 0
});
assert.approximately(results.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000);
});

it(`properly analyzes post-merge case`, () => {
Expand Down Expand Up @@ -210,10 +205,10 @@ describe('determineTokenizedCorrectionSequence', () => {
transition.final.displayTokenization, {
matchString: 'can\'t',
inputSamplingCost: -Math.log(trueInput.p),
inputCount: 5,
knownCost: 0,
totalCost: -Math.log(trueInput.p)
},
1
}
);

assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
Expand Down Expand Up @@ -261,10 +256,10 @@ describe('determineTokenizedCorrectionSequence', () => {
transition.final.displayTokenization, {
matchString: ' ',
inputSamplingCost: -Math.log(trueInput.p),
inputCount: 1,
knownCost: 0,
totalCost: -Math.log(trueInput.p)
},
1
}
);

assert.deepEqual({...results.rootContext, casingForm: results.rootContext.casingForm}, {
Expand All @@ -275,15 +270,13 @@ describe('determineTokenizedCorrectionSequence', () => {
endOfBuffer: true
});

assert.deepEqual(results.tokenizedCorrection, [
{
sample: {
insert: ' ',
deleteLeft: 0
},
p: trueInput.p
}
]);

assert.equal(results.tokenizedCorrection.length, 1);
assert.deepEqual(results.tokenizedCorrection[0].sample, {
insert: ' ',
deleteLeft: 0
});
assert.approximately(results.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000);
});

it(`properly analyzes complex transition - multi-token replacement`, () => {
Expand All @@ -310,10 +303,10 @@ describe('determineTokenizedCorrectionSequence', () => {
transition.final.displayTokenization, {
matchString: 'd',
inputSamplingCost: -Math.log(trueInput.p),
inputCount: 1,
knownCost: 0,
totalCost: -Math.log(trueInput.p)
},
1
}
);

// Large-scale deletions will receive enhanced handling soon. But, for now, it's
Expand All @@ -326,15 +319,13 @@ describe('determineTokenizedCorrectionSequence', () => {
endOfBuffer: true
});

assert.deepEqual(results.tokenizedCorrection, [
{
sample: {
insert: 'd',
deleteLeft: 0
},
p: trueInput.p
}
]);

assert.equal(results.tokenizedCorrection.length, 1);
assert.deepEqual(results.tokenizedCorrection[0].sample, {
insert: 'd',
deleteLeft: 0
});
assert.approximately(results.tokenizedCorrection[0].p, Math.pow(trueInput.p, ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT), Number.EPSILON*1000);

const dummiedTuple: CorrectionPredictionTuple = {
prediction: {
Expand Down
Loading
Loading