Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions web/src/engine/predictive-text/templates/src/tokenization.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ export function tokenize(
currentIndex = nextIndex;
}

if(tokenization.left.length == 0) {
tokenization.left.push({text: '', isWhitespace: false});
}

// New step 2: handle any rejoins needed.

// Handle any desired special handling for directly-pre-caret scenarios - where for this
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ export function determineModelTokenizer(model: LexicalModel) {
if(model.wordbreaker) {
return models.tokenize(model.wordbreaker, context);
} else {
return null;
// Not ideal for pre-14.0 models, but it'll do for now.
return models.tokenize(wordBreakers.default, context);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre
import TransformUtils from './transformUtils.js';
import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js';
import { ContextTokenLike } from './correction/context-token.js';
import { ContextTokenization } from './correction/context-tokenization.js';
import { ContextTokenization, mapWhitespacedTokenization } from './correction/context-tokenization.js';
import { ContextTracker } from './correction/context-tracker.js';
import { ContextState, determineContextSlideTransform } from './correction/context-state.js';
import { ContextTransition } from './correction/context-transition.js';
Expand Down Expand Up @@ -191,88 +191,64 @@ export function tupleDisplayOrderSort(a: CorrectionPredictionTuple, b: Correctio
return b.totalProb - a.totalProb;
}

export async function correctAndEnumerateWithoutTraversals(
export function determineTraversallessCorrectionSequences(
lexicalModel: LexicalModel,
transformDistribution: Distribution<Transform>,
corrections: Distribution<Transform>,
context: Context
): Promise<{
/**
* For models that support correction-search caching, this provides the
* cached object corresponding to this method's operation.
*
* Otherwise, is `null`.
*/
postContextState?: ContextState;
): PredictionParameters[] {
let returnedPredictionData: PredictionParameters[] = [];

/**
* The suggestions generated based on the user's input state.
*/
rawPredictions: CorrectionPredictionTuple[];
const tokenizer = determineModelTokenizer(lexicalModel);
const wordbreak = determineModelWordbreaker(lexicalModel);

/**
* The id of a prior ContextTransition event that triggered a Suggestion found
* at the end of the Context. Will be undefined if no edits have occurred
* since the Suggestion was applied.
*/
revertableTransitionId?: number
}> {
const inputTransform = transformDistribution[0].sample;
let rawPredictions: CorrectionPredictionTuple[] = [];
const tokenization = tokenizer(context); // issue at present if no tokens exist!
const tokenMapper = (t: models.Token) => {
return {
exampleInput: t.text,
codepointLength: KMWString.length(t.text)
} as ContextTokenLike;
}

let predictionRoots: ProbabilityMass<Transform>[];
for(let correction of corrections) {
// Step 1: determine tokenization effects. We can't use the
// ContextTokenization pattern due to the model's lack of LexiconTraversal
// support, though.
const transformId = correction.sample.id;
const postContext = models.applyTransform(correction.sample, context);
const postTokenization = tokenizer(postContext);

const transitionEffects = determineSuggestionRange(tokenization.left.map(tokenMapper), postTokenization.left.map(tokenMapper), (a, b) => a.exampleInput == b.exampleInput);
const match: TokenResult = {
matchString: wordbreak(postContext),
inputSamplingCost: -Math.log(correction.p),
knownCost: 0,
totalCost: -Math.log(correction.p)
};

// Only allow new-word suggestions if space was the most likely keypress.
const allowSpace = TransformUtils.isWhitespace(inputTransform);
const allowBksp = TransformUtils.isBackspace(inputTransform);
const suggestionParams = buildCorrectionSequence(transitionEffects, context, match, 1);

// Generates raw prediction distributions for each valid input. Can only 'correct'
// against the final input.
//
// This is the old, 12.0-13.0 'correction' style.
if(allowSpace) {
// Detect start of new word; prevent whitespace loss here.
predictionRoots = [{sample: inputTransform, p: 1.0}];
} else {
predictionRoots = transformDistribution.map((alt) => {
let transform = alt.sample;

// Filter out special keys unless they're expected.
if(TransformUtils.isWhitespace(transform) && !allowSpace) {
return null;
} else if(TransformUtils.isBackspace(transform) && !allowBksp) {
return null;
}
const tokenizationMapping = mapWhitespacedTokenization(tokenization.left.map((t) => { return {exampleInput: t.text, codepointLength: KMWString.length(t.text)} }), lexicalModel, correction.sample);
const tokenizedCorrection = tokenizationMapping.tokenizedTransform;
const tokenizedCorrectionEntries = [...tokenizedCorrection.values()];

return alt;
});
}
// IF: array has multiple entries, then build the preservation-transform as below, including the deleteLeft.
// If not, don't make one!
const preservationTransform = tokenizedCorrectionEntries.slice(0, -1).reduce((accum, curr) => {
return { insert: accum.insert + curr.insert, deleteLeft: accum.deleteLeft + curr.deleteLeft };
}, { insert: '', deleteLeft: 0, id: correction.sample.id});

const wordbreak = determineModelWordbreaker(lexicalModel);
// Remove `null` entries, then determine suggestions.
predictionRoots.forEach((pr) => {
const postContext = models.applyTransform(pr.sample, context);
const tailTokenText = wordbreak(postContext);
const rootContext = models.applyTransform({insert: '', deleteLeft: KMWString.length(tailTokenText)}, postContext);

const results = predictFromCorrectionSequence(lexicalModel, [{
sample: {
insert: tailTokenText,
deleteLeft: 0,
id: pr.sample.id
},
p: pr.p
}], rootContext, pr.sample.id);
results.forEach((r) => rawPredictions.push(r));
})

if(allowSpace) {
rawPredictions.forEach((entry) => entry.preservationTransform = inputTransform);
returnedPredictionData.push({
...suggestionParams,
applyInPost: (p) => {
p.preservationTransform = preservationTransform;
if(transformId) {
p.prediction.sample.transformId = transformId;
}
}
})
}

return {
postContextState: null,
rawPredictions: rawPredictions
};
return returnedPredictionData;
}

/**
Expand Down Expand Up @@ -586,7 +562,14 @@ export async function correctAndEnumerate(
// It's mostly here to support models compiled before Keyman 14.0, which was
// when the `LexiconTraversal` pattern was established.
if(!contextTracker) {
return correctAndEnumerateWithoutTraversals(lexicalModel, transformDistribution, context);
const predictionData = determineTraversallessCorrectionSequences(lexicalModel, transformDistribution, context);
return {
rawPredictions: predictionData.flatMap((entry) => {
const predictions = predictFromCorrectionSequence(lexicalModel, entry.tokenizedCorrection, entry.rootContext, transformDistribution[0]?.sample.id);
predictions.forEach((p) => entry.applyInPost(p));
return predictions;
})
};
}

// 'else': the current, 14.0+ pattern, which is able to leverage
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ describe('Tokenization functions', function() {
});

it('properly handles empty-context cases', function() {
// Wordbreaking on a empty space => no word.
// Wordbreaking on a empty space => no word, but empty initial token.
let context = {
left: '', startOfBuffer: true,
right: '', endOfBuffer: true
Expand All @@ -184,7 +184,7 @@ describe('Tokenization functions', function() {
let tokenization = models.tokenize(wordBreakers.default, context);

let expectedResult: models.Tokenization = {
left: [],
left: [{text: '', isWhitespace: false}],
right: [],
caretSplitsToken: false
};
Expand All @@ -193,11 +193,11 @@ describe('Tokenization functions', function() {
});

it('properly handles null context cases', function() {
// Wordbreaking on a empty space => no word.
// Wordbreaking on a empty space => no word, but empty initial token.
let tokenization = models.tokenize(wordBreakers.default, null);

let expectedResult: models.Tokenization = {
left: [],
left: [{text: '', isWhitespace: false}],
right: [],
caretSplitsToken: false
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ describe('determineTokenizedCorrectionSequence', () => {
});

// Will be handled far better after resolving multi-tokenization handling.
it.skip(`properly analyzes post-split case`, () => {
it.skip(`properly analyzes post-split new-wordbreak case`, () => {
const context: Context = {
left: 'the quick brown fox can\'',
right: '',
Expand Down
Loading
Loading