From 50058504425459f1a3bd79bb8ac51d34316f281c Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Thu, 13 Nov 2025 13:15:06 -0600 Subject: [PATCH 1/3] docs(web): starts internal doc on SearchSpace design, requirements, and analysis Build-bot: skip Test-bot: skip --- .../worker-thread/docs/search-spaces.md | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 web/src/engine/predictive-text/worker-thread/docs/search-spaces.md diff --git a/web/src/engine/predictive-text/worker-thread/docs/search-spaces.md b/web/src/engine/predictive-text/worker-thread/docs/search-spaces.md new file mode 100644 index 00000000000..65be5380a9e --- /dev/null +++ b/web/src/engine/predictive-text/worker-thread/docs/search-spaces.md @@ -0,0 +1,76 @@ +# The SearchSpace types + +The `SearchSpace` interface exists to represent portions of the dynamically-generated graph used for correction-searching within the predictive-text engine. As new input is received, new extensions to previous `SearchSpace`s may be created to extend the graph's reach, appending newly-received input to the context token to be corrected. Loosely speaking, different instances of `SearchSpace` correspond to different potential tokenizations of the input and/or to different requirements for constructing and applying generated suggestions. + +There are two implementations of this interface: +- `SearchPath`, which extends a `SearchSpace` by a single set of recent inputs affecting the range of represented text in the same manner. +- `SearchCluster`, which exists to _group_ `SearchPath` instances that represent the same range of represented text and input set. + +Both are capable of generating potential corrections, utilizing their links to prior `SearchSpace` entries in a manner that shares common corrected prefixes with other `SearchSpace` branches. 
+ +## The Underlying Problem + +### Defining the Problem +It is easily possible for a user to fat-finger, accidentally typing a standard letter instead of the spacebar or similar when the latter is intended. For languages using standard whitespace-based wordbreaking, this implies that the word boundaries seen in the context should not be considered absolute; we should model cases where the word-boundaries land elsewhere due to fat-finger effects. Additionally, we have standing plans to support dictionary-based wordbreaking for languages that do not utilize whitespaces between words - this adds an extra case in which word-boundaries cannot be considered absolute. + +Keyman keyboard rules further complicate matters. They do not need to consider side-effects for predictive-text, and it's easily possible for a rule to output text changes that affect (or even _effect_) multiple text tokens within the context. + +Take Greek, which has special word-final behavior for sigma: +- within a word, `σ` is the lowercase form. +- at the end of a word, it should appear as `ς` instead. + +Keyman keyboards exist with the following behavior: + +```keyman +"σ" + [K_SPACE] > "ς " +``` + +Note that this results in a context-manipulation (a `Transform`) spanning two tokens: +- it alters the end of the word currently at the end of context +- it also adds a whitespace token + +There also exist keyboards like `khmer_angkor` that may perform character reordering, performing significant left-deletions and insertions in a single keystroke. Furthermore, there's little saying that a keyboard can't be written that deletes a full grapheme cluster, rather than an individual key - a process that would add multiple left-deletions without any insertions. + +We need structures and design to cover _all_ such cases - and to do so reliably and consistently. + +### Requirements + +Our model for correction-search requires the following properties: + +1. 
It must be able to model cases where keystrokes do not consistently correspond to the same text tokens. + +Consider typing the word `banner` in English. The `n` key is close to the spacebar, so its fat-finger distribution may contain the spacebar's output as an option. Thus, there is the opportunity for correction to (or from) `ban` + ` ` + `er` instead, replacing the second `n` with a space. The same keystrokes thus may correspond to one or three text tokens. + +Consider the implications of this: +- Perhaps `banner` is the intended word +- Alternatively, `ban` is a legitimate word, and `er` is a perfectly valid English prefix. What if the user is typing a word starting with `er`... do we have good suggestions based on that? + - Suppose this gives us the following suggestions: `banner`, `ban errors`, `ban erosion`. + - How much of the context should be altered when applying these suggestions? + + +2. Each individual `SearchSpace` should only model correction of inputs that result in tokens of the same codepoint length as each other. + - Consider the effects of a long left-deletion input. + - If a `SearchSpace` held variable-length text, a large left-deletion could delete the whole block of text for some cases, but not for others; this gets messy to handle! + - We want to ensure that `deleteLeft` effects can be modeled clearly and consistently, without ambiguity. + +### Analysis + +1. Each keystroke's probability may be considered independent from other keystrokes. + - To be clear, this is a simplifying assumption; attempting to do otherwise would both be speculative and complicate things further. + +2. It is not possible to guarantee that one keystroke will only extend a previous `SearchSpace` in one way. + - If the incoming keystroke produces `Transform`s that have different `insert` lengths without varying the left-deletion count, this _must_ result in multiple `SearchSpace`s, as the total codepoint length will vary accordingly. 
+ - Also of note: if left-deleting, it is possible for a left-deletion to erase the token adjacent to the text insertion point. + +3. When constructing and applying `Suggestion`s, it helps greatly to determine which `SearchSpace` led to it. + - This allows us to determine _which_ keystrokes are being replaced, as well as _what_ parts of the Context will be affected. + - If a correction is only possible if the context were to have a different initial word-boundary than currently exists, this implies a need to correct the Context accordingly as part of applying a suggestion. + +4. Following from the points above, it is possible for two or more `SearchSpace` segments to model input sequences with the same properties, as follows: + - The input length in codepoints is identical. + - The same keystrokes, and portions thereof, are utilized. + +For example, consider a case with two keystrokes, each of which has versions emitting insert strings of one and two characters. Taking two chars from one and one char from the other will result in a `SearchSpace` that models a total of two keystrokes that fully covers the two keys. + +For such cases, any future keystrokes can extend both input sequences in the same manner. While the actual correction-text may differ, the net effect it has on the properties of a token necessary for correction and construction of suggestions is identical. The `SearchCluster` variant of `SearchSpace` exists for such cases, modeling the convergence of multiple `SearchPath`s and extending all of them together at once. 
\ No newline at end of file From 17696a5f50eb54b19f40bb677064451fd48700f9 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Thu, 13 Nov 2025 16:35:37 -0600 Subject: [PATCH 2/3] feat(web): ensure all child spaces are notified when parent reaches end of a correction path Build-bot: skip build:web Test-bot: skip --- .../src/main/correction/search-cluster.ts | 28 ++++++++++++----- .../src/main/correction/search-path.ts | 31 +++++++++++++++++++ .../src/main/correction/search-space.ts | 8 +++++ 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts index 0708ca7fbf8..e46d4a2f341 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts @@ -30,14 +30,21 @@ export class SearchCluster implements SearchSpace { private selectionQueue: PriorityQueue = new PriorityQueue(PATH_QUEUE_COMPARATOR); readonly spaceId: number; - // We use an array and not a PriorityQueue b/c batch-heapifying at a single point in time - // is cheaper than iteratively building a priority queue. /** - * This tracks all paths that have reached the end of a viable input-matching path - even - * those of lower cost that produce the same correction as other paths. + * Holds all `incomingNode` child buffers - buffers to hold nodes processed by + * this SearchCluster but not yet by child SearchSpaces. + */ + private childBuffers: SearchNode[][] = []; + + // We use an array and not a PriorityQueue b/c batch-heapifying at a single + // point in time is cheaper than iteratively building a priority queue. + /** + * This tracks all paths that have reached the end of a viable input-matching + * path - even those of lower cost that produce the same correction as other + * paths. 
* - * When new input is received, its entries are then used to append edges to the path in order - * to find potential paths to reach a new viable end. + * When new input is received, its entries are then used to append edges to + * the path in order to find potential paths to reach a new viable end. */ private completedPaths?: SearchNode[] = []; @@ -152,6 +159,7 @@ export class SearchCluster implements SearchSpace { this.selectionQueue.enqueue(bestPath); if(currentResult.type == 'complete') { + this.bufferNode(currentResult.finalNode); this.completedPaths?.push(currentResult.finalNode); currentResult.spaceId = this.spaceId; } @@ -163,8 +171,12 @@ export class SearchCluster implements SearchSpace { return this.completedPaths?.map((n => new SearchResult(n, this.spaceId))) ?? []; } - public stopTrackingResults() { - delete this.completedPaths; + public addResultBuffer(nodeBuffer: SearchNode[]): void { + this.childBuffers.push(nodeBuffer); + } + + private bufferNode(node: SearchNode) { + this.childBuffers.forEach((buf) => buf.push(node)); } get model(): LexicalModelTypes.LexicalModel { diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts index c3f66e06723..3007787c5ae 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts @@ -30,6 +30,19 @@ export const QUEUE_NODE_COMPARATOR: Comparator = function(arg1, arg2 // Whenever a wordbreak boundary is crossed, a new instance should be made. export class SearchPath implements SearchSpace { private selectionQueue: PriorityQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR); + + /** + * Holds all incoming Nodes generated from a parent `SearchSpace` that have not yet been + * extended with this `SearchSpace`'s input. 
+ */ + private incomingNodes: SearchNode[] = []; + + /** + * Holds all `incomingNode` child buffers - buffers to hold nodes processed by + * this SearchPath but not yet by child SearchSpaces. + */ + private childBuffers: SearchNode[][] = []; + readonly inputs?: Distribution; readonly inputSource?: PathInputProperties; @@ -135,6 +148,7 @@ export class SearchPath implements SearchSpace { this.codepointLength = baseLength + this.edgeLength - deleteLeft; this.addEdgesForNodes(parentSpace.previousResults.map(r => r.node)); + parentSpace.addResultBuffer(this.incomingNodes); return; } @@ -416,6 +430,11 @@ export class SearchPath implements SearchSpace { * @returns */ public handleNextNode(): PathResult { + if(this.incomingNodes.length > 0) { + this.addEdgesForNodes(this.incomingNodes); + this.incomingNodes = []; + } + const parentCost = this.parentSpace?.currentCost ?? Number.POSITIVE_INFINITY; const localCost = this.selectionQueue.peek()?.currentCost ?? Number.POSITIVE_INFINITY; @@ -427,6 +446,9 @@ export class SearchPath implements SearchSpace { } const result = this.parentSpace.handleNextNode(); + // The parent will insert the node into our queue. We don't need it, though + // any siblings certainly will. + this.incomingNodes = []; if(result.type == 'complete') { this.addEdgesForNodes([result.finalNode]); @@ -492,6 +514,7 @@ export class SearchPath implements SearchSpace { } } + this.bufferNode(currentNode); return { type: 'complete', cost: currentNode.currentCost, @@ -508,6 +531,14 @@ export class SearchPath implements SearchSpace { return Object.values(this.returnedValues ?? 
{}).map(v => new SearchResult(v)); } + public addResultBuffer(nodeBuffer: SearchNode[]): void { + this.childBuffers.push(nodeBuffer); + } + + private bufferNode(node: SearchNode) { + this.childBuffers.forEach((buf) => buf.push(node)); + } + public get inputSegments(): InputSegment[] { if(!this.parentSpace) { return []; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts index d2e42364aed..07d8c6e3eb6 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts @@ -237,4 +237,12 @@ export interface SearchSpace { readonly constituentPaths: SearchPath[][]; isSameSpace(space: SearchSpace): boolean; + + /** + * This is used among SearchSpaces to ensure that nodes processed by earlier portions + * of the correction-search dynamic graph are provided to all child SearchSpaces for + * construction of new portions of the graph corresponding to their modeled inputs. 
+ * @param nodeBuffer + */ + addResultBuffer(nodeBuffer: SearchNode[]): void; } \ No newline at end of file From b31bcadb86c569a6427c1c2e2d23711c5c860864 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Mon, 17 Nov 2025 14:08:37 -0600 Subject: [PATCH 3/3] feat(web): add multi-space search unit tests --- .../common/web-utils/src/priority-queue.ts | 23 +- .../src/main/correction/distance-modeler.ts | 1 + .../src/main/correction/search-cluster.ts | 1 + .../src/main/correction/search-path.ts | 20 +- .../correction-search/search-space.tests.ts | 555 +++++++++++++----- 5 files changed, 438 insertions(+), 162 deletions(-) diff --git a/web/src/engine/common/web-utils/src/priority-queue.ts b/web/src/engine/common/web-utils/src/priority-queue.ts index bbbb627291f..66e02fb263d 100644 --- a/web/src/engine/common/web-utils/src/priority-queue.ts +++ b/web/src/engine/common/web-utils/src/priority-queue.ts @@ -47,7 +47,7 @@ export default class PriorityQueue { this.comparator = comparator; this.heap = (initialEntries ?? []).slice(0); - this.heapify(); + this._heapify(); } private static leftChildIndex(index: number): number { @@ -67,13 +67,22 @@ export default class PriorityQueue { * are properly satisfied. * - O(N) when 'heapifying' the whole heap * - O(N) worst-case for partial heap operations (as part of an enqueueAll) - *

*/ - private heapify(): void; - private heapify(start: number, end: number): void; - private heapify(start?: number, end?: number): void { + public heapify(): void { + this._heapify(); + } + + /** + * Maintains internal state, rearranging the internal state until all heap constraints + * are properly satisfied. + * - O(N) when 'heapifying' the whole heap + * - O(N) worst-case for partial heap operations (as part of an enqueueAll) + */ + private _heapify(): void; + private _heapify(start: number, end: number): void; + private _heapify(start?: number, end?: number): void { if(start == undefined || end == undefined) { - this.heapify(0, this.count - 1); + this._heapify(0, this.count - 1); return; } @@ -161,7 +170,7 @@ export default class PriorityQueue { const firstParent = PriorityQueue.parentIndex(firstIndex); // The 'parent' of index 0 will return -1, which is illegal. - this.heapify(firstParent >= 0 ? firstParent : 0, PriorityQueue.parentIndex(this.count-1)); + this._heapify(firstParent >= 0 ? 
firstParent : 0, PriorityQueue.parentIndex(this.count-1)); } /** diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts index 64c559194b1..42cda23f758 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts @@ -653,6 +653,7 @@ export async function *getBestMatches(searchSpaces: SearchSpace[], timer: Execut let bestQueue = spaceQueue.dequeue(); const newResult = bestQueue.handleNextNode(); spaceQueue.enqueue(bestQueue); + spaceQueue.heapify(); if(newResult.type == 'none') { return null; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts index e46d4a2f341..605ce0cc025 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts @@ -157,6 +157,7 @@ export class SearchCluster implements SearchSpace { const bestPath = this.selectionQueue.dequeue(); const currentResult = bestPath.handleNextNode(); this.selectionQueue.enqueue(bestPath); + this.selectionQueue.heapify(); if(currentResult.type == 'complete') { this.bufferNode(currentResult.finalNode); diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts index 3007787c5ae..975991f90c4 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts @@ -400,6 +400,14 @@ export class SearchPath implements SearchSpace { } public get currentCost(): number { + if(this.incomingNodes.length > 
0) { + this.addEdgesForNodes(this.incomingNodes); + + // Preserve the array instance, but trash all entries. + // The array is registered with the parent; do not replace! + this.incomingNodes.splice(0, this.incomingNodes.length); + } + const parentCost = this.parentSpace?.currentCost ?? Number.POSITIVE_INFINITY; const localCost = this.selectionQueue.peek()?.currentCost ?? Number.POSITIVE_INFINITY; @@ -432,13 +440,16 @@ export class SearchPath implements SearchSpace { public handleNextNode(): PathResult { if(this.incomingNodes.length > 0) { this.addEdgesForNodes(this.incomingNodes); - this.incomingNodes = []; + + // Preserve the array instance, but trash all entries. + // The array is registered with the parent; do not replace! + this.incomingNodes.splice(0, this.incomingNodes.length); } const parentCost = this.parentSpace?.currentCost ?? Number.POSITIVE_INFINITY; const localCost = this.selectionQueue.peek()?.currentCost ?? Number.POSITIVE_INFINITY; - if(parentCost <= localCost) { + if(parentCost < localCost) { if(parentCost == Number.POSITIVE_INFINITY) { return { type: 'none' @@ -448,7 +459,10 @@ export class SearchPath implements SearchSpace { const result = this.parentSpace.handleNextNode(); // The parent will insert the node into our queue. We don't need it, though // any siblings certainly will. - this.incomingNodes = []; + + // Preserve the array instance, but trash all entries. + // The array is registered with the parent; do not replace! 
+ this.incomingNodes.splice(0, this.incomingNodes.length); if(result.type == 'complete') { this.addEdgesForNodes([result.finalNode]); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-space.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-space.tests.ts index 9af9bcb746e..eb6fc74563f 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-space.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-space.tests.ts @@ -10,7 +10,7 @@ import { assert } from 'chai'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; -import { correction, getBestMatches, models, SearchPath } from '@keymanapp/lm-worker/test-index'; +import { correction, generateSpaceSeed, getBestMatches, models, SearchCluster, SearchPath } from '@keymanapp/lm-worker/test-index'; import SearchResult = correction.SearchResult; import TrieModel = models.TrieModel; @@ -22,189 +22,440 @@ function buildTestTimer() { } describe('Correction Searching', () => { - const checkRepeatableResults_teh = async (iter: AsyncGenerator) => { - const firstIterResult = await iter.next(); // {value: , done: } - assert.isFalse(firstIterResult.done); - - const firstResult: correction.SearchResult = firstIterResult.value; // Retrieves - // No checks on the first set's cost. - assert.equal(firstResult.matchString, "ten"); - - // All start with 'te' but one, and invoke one edit of the same cost. - // 'th' has an 'h' at the same cost (input 3) of the 'e' (input 2). 
- const secondBatch = [ - 'tec', 'tel', 'tem', - 'ter', 'tes', 'th', - 'te' - ]; - - async function checkBatch(batch: string[], prevCost: number) { - let cost; - while(batch.length > 0) { - const iter_result = await iter.next(); - assert.isFalse(iter_result.done); - - const result = iter_result.value; - assert.isAbove(result.totalCost, prevCost); - if(cost !== undefined) { - assert.equal(result.totalCost, cost); - } else { - cost = result.totalCost; + describe('without multi-tokenization; using a single SearchPath sequence', () => { + const checkRepeatableResults_teh = async (iter: AsyncGenerator) => { + const firstIterResult = await iter.next(); // {value: , done: } + assert.isFalse(firstIterResult.done); + + const firstResult: correction.SearchResult = firstIterResult.value; // Retrieves + // No checks on the first set's cost. + assert.equal(firstResult.matchString, "ten"); + + // All start with 'te' but one, and invoke one edit of the same cost. + // 'th' has an 'h' at the same cost (input 3) of the 'e' (input 2). 
+ const secondBatch = [ + 'tec', 'tel', 'tem', + 'ter', 'tes', 'th', + 'te' + ]; + + async function checkBatch(batch: string[], prevCost: number) { + let cost; + while(batch.length > 0) { + const iter_result = await iter.next(); + assert.isFalse(iter_result.done); + + const result = iter_result.value; + assert.isAbove(result.totalCost, prevCost); + if(cost !== undefined) { + assert.equal(result.totalCost, cost); + } else { + cost = result.totalCost; + } + + const matchIndex = batch.findIndex((entry) => entry == result.matchString); + assert.notEqual(matchIndex, -1, `'${result.matchString}' received as prediction too early`); + batch.splice(matchIndex, 1); } - const matchIndex = batch.findIndex((entry) => entry == result.matchString); - assert.notEqual(matchIndex, -1, `'${result.matchString}' received as prediction too early`); - batch.splice(matchIndex, 1); + return cost; } - return cost; + const secondCost = await checkBatch(secondBatch, firstResult.totalCost); + + // Single hard edit, all other input probability aspects are equal + const thirdBatch = [ + // 't' -> 'b' (sub) + 'beh', + // '' -> 'c' (insertion) + 'tech' + ]; + + await checkBatch(thirdBatch, secondCost); + + // All replace the low-likelihood case for the third input. + const fourthBatch = [ + 'the', 'thi', 'tho', 'thr', + 'thu', 'tha' + ]; + + await checkBatch(fourthBatch, secondCost); + + // Replace the _first_ input's char OR insert an extra char, + // also matching the low-likelihood third-char option. + const fifthBatch = [ + 'cen', 'en', 'gen', + 'ken', 'len', 'men', + 'sen', 'then', 'wen' + ]; + + await checkBatch(fifthBatch, secondCost); } - const secondCost = await checkBatch(secondBatch, firstResult.totalCost); + it('Simple search without input', async () => { + // The combinatorial effect here is a bit much to fully test. 
+ const rootTraversal = testModel.traverseFromRoot(); + assert.isNotEmpty(rootTraversal); + + const searchSpace = new SearchPath(testModel); + + const iter = getBestMatches([searchSpace], buildTestTimer()); + const firstResult = await iter.next(); + assert.isFalse(firstResult.done); + }); + + // Hmm... how best to update this... + it('Simple search (paralleling "Small integration test")', async () => { + // The combinatorial effect here is a bit much to fully test. + const rootTraversal = testModel.traverseFromRoot(); + assert.isNotEmpty(rootTraversal); + + const searchPath = new SearchPath(testModel); + + // VERY artificial distributions. + const synthInput1 = [ + {sample: {insert: 't', deleteLeft: 0}, p: 1} // Transform, probability + ]; + + const synthInput2 = [ + {sample: {insert: 'e', deleteLeft: 0}, p: 0.75}, // Transform, probability + {sample: {insert: 'h', deleteLeft: 0}, p: 0.25} + ]; + + const synthInput3 = [ + {sample: {insert: 'h', deleteLeft: 0}, p: 0.75}, // Transform, probability + {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} + ]; + + const searchPath1 = new SearchPath(searchPath, synthInput1, synthInput1[0]); + const searchPath2 = new SearchPath(searchPath1, synthInput2, synthInput2[0]); + const searchPath3 = new SearchPath(searchPath2, synthInput3, synthInput3[0]); + + assert.notEqual(searchPath1.spaceId, searchPath.spaceId); + assert.notEqual(searchPath2.spaceId, searchPath1.spaceId); + assert.notEqual(searchPath3.spaceId, searchPath2.spaceId); + + const iter = getBestMatches([searchPath3], buildTestTimer()); // disables the correction-search timeout. + await checkRepeatableResults_teh(iter); + }); + + it('Allows reiteration (sequentially)', async () => { + // The combinatorial effect here is a bit much to fully test. + const rootTraversal = testModel.traverseFromRoot(); + assert.isNotEmpty(rootTraversal); + + const searchPath = new SearchPath(testModel); + + // VERY artificial distributions. 
+ const synthInput1 = [ + {sample: {insert: 't', deleteLeft: 0}, p: 1} // Transform, probability + ]; + + const synthInput2 = [ + {sample: {insert: 'e', deleteLeft: 0}, p: 0.75}, // Transform, probability + {sample: {insert: 'h', deleteLeft: 0}, p: 0.25} + ]; + + const synthInput3 = [ + {sample: {insert: 'h', deleteLeft: 0}, p: 0.75}, // Transform, probability + {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} + ]; + + const searchPath1 = new SearchPath(searchPath, synthInput1, synthInput1[0]); + const searchPath2 = new SearchPath(searchPath1, synthInput2, synthInput2[0]); + const searchPath3 = new SearchPath(searchPath2, synthInput3, synthInput3[0]); + + assert.notEqual(searchPath1.spaceId, searchPath.spaceId); + assert.notEqual(searchPath2.spaceId, searchPath1.spaceId); + assert.notEqual(searchPath3.spaceId, searchPath2.spaceId); + + const iter = getBestMatches([searchPath3], buildTestTimer()); // disables the correction-search timeout. + await checkRepeatableResults_teh(iter); + + // The key: do we get the same results the second time? + // Reset the iterator first... + const iter2 = getBestMatches([searchPath3], buildTestTimer()); // disables the correction-search timeout. + await checkRepeatableResults_teh(iter2); + }); + + it('Empty search space, loaded model', async () => { + // The combinatorial effect here is a bit much to fully test. + const rootTraversal = testModel.traverseFromRoot(); + assert.isNotEmpty(rootTraversal); + + const searchSpace = new SearchPath(testModel); + const timer = buildTestTimer(); + const iter = getBestMatches([searchSpace], timer); + + // While there's no input, insertion operations can produce suggestions. + const resultState = await iter.next(); + const result: SearchResult = resultState.value; + + // Just one suggestion root should be returned as the first result. 
+ assert.equal(result.totalCost, 0); // Gives a perfect match + assert.equal(result.inputSequence.length, 0); // for a state with no input and + assert.equal(result.matchString, ''); // an empty match string. + assert.isFalse(resultState.done); + + // Should be able to reach more, though. + const laterResultState = await iter.next(); + const laterResult: SearchResult = laterResultState.value; + + // Edit required: an 'insertion' edge (no input matched, but char pulled + // from lexicon) + assert.isAbove(laterResult.totalCost, 0); + // The most likely word in the lexicon starts with 't'. + assert.equal(laterResult.matchString, 't'); + assert.isFalse(resultState.done); + }); + }); - // Single hard edit, all other input probability aspects are equal - const thirdBatch = [ - // 't' -> 'b' (sub) - 'beh', - // '' -> 'c' (insertion) - 'tech' - ]; + describe('with divergent SearchSpaces', () => { + const buildPathFixture = () => { + const rootPath = new SearchPath(testModel); + + const distrib_t1 = [ + { sample: { insert: 't', deleteLeft: 0, id: 11 }, p: 1 } + ]; + const tPath = new SearchPath(rootPath, distrib_t1, distrib_t1[0]); + + // Note: this does not reflect the actual intended use pattern for these + // types. It's useful for clear testing, though. + // + // In particular, this test is acting as if the following characters + // wouldn't be part of the same TokenizationPath, yet also using the same + // subsetId, as if they were part of the same TokenizationPath. 
+ const distrib_h2 = [ + { sample: { insert: 'h', deleteLeft: 0, id: 12 }, p: 0.5 } + ]; + const distrib_o2 = [ + { sample: { insert: 'o', deleteLeft: 0, id: 12 }, p: 0.3 } + ]; + const distrib_i2 = [ + { sample: { insert: 'r', deleteLeft: 0, id: 12 }, p: 0.2 } + ]; + + const thPath = new SearchPath(tPath, distrib_h2, distrib_h2[0]); + const toPath = new SearchPath(tPath, distrib_o2, thPath.inputSource); + const trPath = new SearchPath(tPath, distrib_i2, thPath.inputSource); + + const twoCharCluster = new SearchCluster([thPath, toPath, trPath]); + + const distrib_v3 = [ + { sample: { insert: 'e', deleteLeft: 0, id: 13 }, p: 0.4 }, + { sample: { insert: 'o', deleteLeft: 0, id: 13 }, p: 0.3 }, + { sample: { insert: 'a', deleteLeft: 0, id: 13 }, p: 0.2 }, + { sample: { insert: 'i', deleteLeft: 0, id: 13 }, p: 0.1 } + ]; + + const thvPath = new SearchPath(thPath, distrib_v3, distrib_v3[0]); + const tovPath = new SearchPath(toPath, distrib_v3, thvPath.inputSource); + const trvPath = new SearchPath(trPath, distrib_v3, thvPath.inputSource); + + const clvPath = new SearchPath(twoCharCluster, distrib_v3, thvPath.inputSource); + + const distrib_n4 = [ + { sample: { insert: 'n', deleteLeft: 0, id: 14 }, p: 0.4 }, + { sample: { insert: 'u', deleteLeft: 0, id: 14 }, p: 0.1 } + ]; + const distrib_v3r = [ + { sample: { insert: 'é', deleteLeft: 1, id: 14 }, p: 0.2 }, + { sample: { insert: 'ó', deleteLeft: 1, id: 14 }, p: 0.15 }, + { sample: { insert: 'á', deleteLeft: 1, id: 14 }, p: 0.1 }, + { sample: { insert: 'í', deleteLeft: 1, id: 14 }, p: 0.05 } + ]; + + const thvnPath = new SearchPath(thvPath, distrib_n4, distrib_n4[0]); + const tovnPath = new SearchPath(tovPath, distrib_n4, thvnPath.inputSource); + const trvnPath = new SearchPath(trvPath, distrib_n4, thvnPath.inputSource); + const clvnPath = new SearchPath(clvPath, distrib_n4, thvnPath.inputSource); + + const thvrPath = new SearchPath(thvPath, distrib_v3r, {...thvnPath.inputSource, subsetId: generateSpaceSeed()}); + const 
tovrPath = new SearchPath(tovPath, distrib_v3r, thvrPath.inputSource); + const trvrPath = new SearchPath(trvPath, distrib_v3r, thvrPath.inputSource); + const clvrPath = new SearchPath(clvPath, distrib_v3r, thvrPath.inputSource); + + const paths = { + clusterless: { + thvnPath, tovnPath, trvnPath, thvrPath, tovrPath, trvrPath + }, + clustered: { + clvnPath, clvrPath + } + }; - await checkBatch(thirdBatch, secondCost); + const clusterVsPaths = { + paths: { + thPath, trPath, toPath + }, + cluster: twoCharCluster + }; - // All replace the low-likelihood case for the third input. - const fourthBatch = [ - 'the', 'thi', 'tho', 'thr', - 'thu', 'tha' - ]; + return {paths, clusterVsPaths}; + } - await checkBatch(fourthBatch, secondCost); + it('correctly searches across multiple paths with common ancestry (clusterless)', async () => { + const paths = buildPathFixture().paths.clusterless; - // Replace the _first_ input's char OR insert an extra char, - // also matching the low-likelihood third-char option. - const fifthBatch = [ - 'cen', 'en', 'gen', - 'ken', 'len', 'men', - 'sen', 'then', 'wen' - ]; + const gen_thvn = getBestMatches([paths.thvnPath], buildTestTimer()); + assert.equal(((await gen_thvn.next()).value as SearchResult).matchString, 'then'); - await checkBatch(fifthBatch, secondCost); - } + // Passes through both t and h, then diverges. + const gen_thvr = getBestMatches([paths.thvrPath], buildTestTimer()); + assert.equal(((await gen_thvr.next()).value as SearchResult).matchString, 'the'); - it('Simple search without input', async () => { - // The combinatorial effect here is a bit much to fully test. 
- const rootTraversal = testModel.traverseFromRoot(); - assert.isNotEmpty(rootTraversal); + // Passes through t, then diverges + const gen_trvn = getBestMatches([paths.trvnPath], buildTestTimer()); + assert.equal(((await gen_trvn.next()).value as SearchResult).matchString, 'trou'); - const searchSpace = new SearchPath(testModel); + // Passes through t and r, then diverges. + const gen_trvr = getBestMatches([paths.trvrPath], buildTestTimer()); - const iter = getBestMatches([searchSpace], buildTestTimer()); - const firstResult = await iter.next(); - assert.isFalse(firstResult.done); - }); + // Do we get further expected results if we keep querying? + assert.equal(((await gen_trvr.next()).value as SearchResult).matchString, 'tre'); + assert.equal(((await gen_trvr.next()).value as SearchResult).matchString, 'tro'); + assert.equal(((await gen_trvr.next()).value as SearchResult).matchString, 'tra'); + }); - // Hmm... how best to update this... - it('Simple search (paralleling "Small integration test")', async () => { - // The combinatorial effect here is a bit much to fully test. - const rootTraversal = testModel.traverseFromRoot(); - assert.isNotEmpty(rootTraversal); + it('correctly searches across multiple paths with common ancestry (clustered)', async () => { + const paths = buildPathFixture().paths.clustered; - const searchPath = new SearchPath(testModel); + const gen_clvn = getBestMatches([paths.clvnPath], buildTestTimer()); + const clvnMatches: string[] = []; - // VERY artificial distributions. 
- const synthInput1 = [ - {sample: {insert: 't', deleteLeft: 0}, p: 1} // Transform, probability - ]; + for(let i=0; i < 10; i++) { + clvnMatches.push(((await gen_clvn.next()).value as SearchResult).matchString); + } + assert.includeMembers(clvnMatches, ['then', 'than', 'thin', 'thou', 'trou']); - const synthInput2 = [ - {sample: {insert: 'e', deleteLeft: 0}, p: 0.75}, // Transform, probability - {sample: {insert: 'h', deleteLeft: 0}, p: 0.25} - ]; + // Passes through both t and h, then diverges. + const gen_clvr = getBestMatches([paths.clvrPath], buildTestTimer()); + const clvrMatches: string[] = []; - const synthInput3 = [ - {sample: {insert: 'h', deleteLeft: 0}, p: 0.75}, // Transform, probability - {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} - ]; + for(let i=0; i < 10; i++) { + clvrMatches.push(((await gen_clvr.next()).value as SearchResult).matchString); + } + assert.includeMembers(clvrMatches, ['the', 'tho', 'tha', 'tre', 'tro', 'thi']); + }); - const searchPath1 = new SearchPath(searchPath, synthInput1, synthInput1[0]); - const searchPath2 = new SearchPath(searchPath1, synthInput2, synthInput2[0]); - const searchPath3 = new SearchPath(searchPath2, synthInput3, synthInput3[0]); + it('correctly searches across multiple paths when search is unevenly staggered', async () => { + const isolatedPaths = buildPathFixture().paths.clustered; - assert.notEqual(searchPath1.spaceId, searchPath.spaceId); - assert.notEqual(searchPath2.spaceId, searchPath1.spaceId); - assert.notEqual(searchPath3.spaceId, searchPath2.spaceId); + const gen_clvn1 = getBestMatches([isolatedPaths.clvnPath], buildTestTimer()); + const isolatedClvnMatches: Set<string> = new Set(); - const iter = getBestMatches([searchPath3], buildTestTimer()); // disables the correction-search timeout. 
- await checkRepeatableResults_teh(iter); - }); + const SET_COUNT = 3; + const COUNT_PER_SET = 4; + const TOTAL_COUNT = SET_COUNT * COUNT_PER_SET; - it('Allows reiteration (sequentially)', async () => { - // The combinatorial effect here is a bit much to fully test. - const rootTraversal = testModel.traverseFromRoot(); - assert.isNotEmpty(rootTraversal); + while(isolatedClvnMatches.size < TOTAL_COUNT) { + isolatedClvnMatches.add(((await gen_clvn1.next()).value as SearchResult).matchString); + } - const searchPath = new SearchPath(testModel); + // Passes through both t and h, then diverges. + const gen_clvr1 = getBestMatches([isolatedPaths.clvrPath], buildTestTimer()); + const isolatedClvrMatches: Set<string> = new Set(); - // VERY artificial distributions. - const synthInput1 = [ - {sample: {insert: 't', deleteLeft: 0}, p: 1} // Transform, probability - ]; + while(isolatedClvrMatches.size < TOTAL_COUNT) { + isolatedClvrMatches.add(((await gen_clvr1.next()).value as SearchResult).matchString); + } - const synthInput2 = [ - {sample: {insert: 'e', deleteLeft: 0}, p: 0.75}, // Transform, probability - {sample: {insert: 'h', deleteLeft: 0}, p: 0.25} - ]; + // Rebuild anew, and stagger searching four at a time on each, landing on 12 in total per. 
+ const paths = buildPathFixture().paths.clustered; - const synthInput3 = [ - {sample: {insert: 'h', deleteLeft: 0}, p: 0.75}, // Transform, probability - {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} - ]; + const gen_clvn2 = getBestMatches([paths.clvnPath], buildTestTimer()); + const gen_clvr2 = getBestMatches([paths.clvrPath], buildTestTimer()); - const searchPath1 = new SearchPath(searchPath, synthInput1, synthInput1[0]); - const searchPath2 = new SearchPath(searchPath1, synthInput2, synthInput2[0]); - const searchPath3 = new SearchPath(searchPath2, synthInput3, synthInput3[0]); + const clvnMatches: Set<string> = new Set(); + const clvrMatches: Set<string> = new Set(); - assert.notEqual(searchPath1.spaceId, searchPath.spaceId); - assert.notEqual(searchPath2.spaceId, searchPath1.spaceId); - assert.notEqual(searchPath3.spaceId, searchPath2.spaceId); + // Follow the search paths in a staggered manner; this may cause some + // results in one to be of higher cost than what's available from the + // other. + for(let s=0; s < SET_COUNT; s++) { + const SET_MAX = (s + 1) * COUNT_PER_SET; + while(clvnMatches.size < SET_MAX) { + clvnMatches.add(((await gen_clvn2.next()).value as SearchResult).matchString); + } - const iter = getBestMatches([searchPath3], buildTestTimer()); - await checkRepeatableResults_teh(iter); + while(clvrMatches.size < SET_MAX) { + clvrMatches.add(((await gen_clvr2.next()).value as SearchResult).matchString); + } + } - // The key: do we get the same results the second time? - // Reset the iterator first... - const iter2 = getBestMatches([searchPath3], buildTestTimer()); // disables the correction-search timeout. 
- await checkRepeatableResults_teh(iter2); - }); + assert.sameDeepMembers([...clvnMatches], [...isolatedClvnMatches]); + assert.sameDeepMembers([...clvrMatches], [...isolatedClvrMatches]); + }); + + it('returns the same results, in order, from SearchCluster as from constituent SearchPaths', async () => { + // See issue #14366 - duplicate results may appear due to a later + // right-delete having a lower-cost total than its parent. We use `Set`s + // here to avoid duplicate issues and look simply at what results arise + // first. + // + // From the fixture's construction, note `distrib_v3` and `distrib_v3r`. + // The "duplicate results" scenario arises when the key selected from + // `distrib_v3` does not match, but is deleted and replaced by a valid key + // from `distrib_v3r`. As the latter is reached later, with lower cost, + // it does get reported again. Resolving #14366 properly should help + // mitigate this issue. + + // --- + + // Build independently; let the cluster own a separate, disconnected copy of the paths. + const {paths: pathTest} = buildPathFixture().clusterVsPaths; + + // Validate that the paths individually return the following match strings. + const gen_th = getBestMatches([pathTest.thPath], buildTestTimer()); + assert.equal(((await gen_th.next()).value as SearchResult).matchString, 'th'); + + const gen_to = getBestMatches([pathTest.toPath], buildTestTimer()); + assert.equal(((await gen_to.next()).value as SearchResult).matchString, 'to'); + + const gen_tr = getBestMatches([pathTest.trPath], buildTestTimer()); + assert.equal(((await gen_tr.next()).value as SearchResult).matchString, 'tr'); + + // And now for the real test. + + const {cluster} = buildPathFixture().clusterVsPaths; + // Build independently; let the cluster own a separate, disconnected copy of the paths. 
+ const {paths} = buildPathFixture().clusterVsPaths; + + const clusterGen = getBestMatches([cluster], buildTestTimer()); + const pathsGen = getBestMatches([...Object.values(paths)], buildTestTimer()); + + const genResults: SearchResult[] = []; + const pathsResults: SearchResult[] = []; + + for(let i=0; i < 10; i++) { + genResults.push(((await clusterGen.next()).value as SearchResult)); + // This one can see duplicates for some prefixes due to some paths having outbound + // paths of lower total cost. + pathsResults.push(((await pathsGen.next()).value as SearchResult)); + } - it('Empty search space, loaded model', async () => { - // The combinatorial effect here is a bit much to fully test. - const rootTraversal = testModel.traverseFromRoot(); - assert.isNotEmpty(rootTraversal); - - const searchSpace = new SearchPath(testModel); - const timer = buildTestTimer(); - const iter = getBestMatches([searchSpace], timer); - - // While there's no input, insertion operations can produce suggestions. - const resultState = await iter.next(); - const result: SearchResult = resultState.value; - - // Just one suggestion root should be returned as the first result. - assert.equal(result.totalCost, 0); // Gives a perfect match - assert.equal(result.inputSequence.length, 0); // for a state with no input and - assert.equal(result.matchString, ''); // an empty match string. - assert.isFalse(resultState.done); - - // Should be able to reach more, though. - const laterResultState = await iter.next(); - const laterResult: SearchResult = laterResultState.value; - - // Edit required: an 'insertion' edge (no input matched, but char pulled - // from lexicon) - assert.isAbove(laterResult.totalCost, 0); - // The most likely word in the lexicon starts with 't'. 
- assert.equal(laterResult.matchString, 't'); - assert.isFalse(resultState.done); + assert.deepEqual(genResults.map(r => r.matchString), pathsResults.map(r => r.matchString)); + + // Ensure that all of the clearly-supported prefixes above show up as results. + assert.sameDeepMembers(pathsResults.slice(0, 3).map(r => r.matchString), ['th', 'to', 'tr']); + // These involve likely-enough corrections that should show, given the model fixture. + assert.includeDeepMembers(pathsResults.map(r => r.matchString), [ + 'ty', // 'type' is quite frequent according to the text fixture. + 't', // Deleting the second keystroke outright lands here. + 'oth', // What if we insert an 'o' early on? 'other' is a very common English word + 'ti' // 'time' is pretty common too. + ]); + + // NOTE: this level of corrections does not yet consider the word likelihood - only + // the raw correction cost. No ordering of "likely word" to "unlikely word" should + // occur yet. + + // 'time': weight 934 + // 'type': weight 540 + const timeResult = pathsResults.find(r => r.matchString == 'ti'); + const typeResult = pathsResults.find(r => r.matchString == 'ty'); + // Correction to either should be equally likely. + assert.equal(timeResult.totalCost, typeResult.totalCost); + }); }); });