From ea8e88fa1116d82c218ed051adbfef7ed5cdb36d Mon Sep 17 00:00:00 2001 From: Colin Brown Date: Wed, 4 Aug 2021 11:04:02 +0100 Subject: [PATCH] Added test to ensure character arrays that include spaces are tokenized as expected. Improved tokenizer to 1. create tokens for spaces when supplied array includes them 2. optimize occurrence counting loop so that iteration stops once nGramMax tokens acquired --- src/classifier.js | 34 ++++++++++++++++++++-------------- test/classifier.js | 16 +++++++++++++--- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/classifier.js b/src/classifier.js index 073f74a..afc0776 100644 --- a/src/classifier.js +++ b/src/classifier.js @@ -68,7 +68,7 @@ class Classifier { tokens = this.vectorize(tokens) } - // Set up an empty entry for the label if it does not exist + // Set up an empty entry for the label if it does not exist if (typeof this._model.data[label] === 'undefined') { this._model.data[label] = {} } @@ -148,7 +148,7 @@ class Classifier { /** * Split a string into an array of lowercase words, with all non-letter characters removed - * + * * @param {string} input * @return {Array} */ @@ -179,7 +179,7 @@ class Classifier { if (!(words instanceof Array)) { throw new Error('input must be either a string or Array') } - + if (this._model.nGramMax < this._model.nGramMin) { throw new Error('Invalid nGramMin/nGramMax combination in model config') } @@ -190,22 +190,28 @@ class Classifier { // based on the models configured min/max values words.forEach((word, index) => { let sequence = '' - - words.slice(index).forEach(nextWord => { + let tokenCount = 0 + let nextWord + + // Create n-gram(s) of between nGramMin and nGramMax words from segment starting at (index) + // Increment the occurrence counter (tokens[sequence]) for each n-gram created + // Stop looping once we have nGramMax words (or reach the end of the segment) + let segment = words.slice(index) + while (tokenCount < this._model.nGramMax && tokenCount < 
segment.length) { + nextWord = segment[tokenCount] sequence += sequence ? (' ' + nextWord) : nextWord - let tokenCount = sequence.split(' ').length + tokenCount++ + if(tokenCount >= this._model.nGramMin && tokenCount <= this._model.nGramMax) { + if (typeof tokens[sequence] === 'undefined') { + tokens[sequence] = 0 + } - if (tokenCount < this._model.nGramMin || tokenCount > this._model.nGramMax) { - return + ++tokens[sequence] } + } + }) - if (typeof tokens[sequence] === 'undefined') { - tokens[sequence] = 0 - } - ++tokens[sequence] - }) - }) return tokens } diff --git a/test/classifier.js b/test/classifier.js index a8b3791..b5573da 100644 --- a/test/classifier.js +++ b/test/classifier.js @@ -74,7 +74,7 @@ describe('Classifier', () => { const classifier = new Classifier() classifier.model.nGramMin = 2 - + expect(() => classifier.tokenize('Hello world!')).to.throw(Error) }) @@ -129,6 +129,16 @@ describe('Classifier', () => { }) }) + it('should create a unigrams for the space character from an array of characters including a space', () => { + const classifier = new Classifier() + + expect(classifier.tokenize([' ','a','b'])).to.eql({ + ' ': 1, + 'a': 1, + 'b': 1 + }) + }) + it('should increment the occurrence of the duplicate tokens', () => { const classifier = new Classifier() @@ -195,7 +205,7 @@ describe('Classifier', () => { expect(() => classifier.train('test', [])).to.throw(Error) }) - + it('should add tokens to the vocabulary (if not configured to false)', () => { const classifier = new Classifier() @@ -254,7 +264,7 @@ describe('Classifier', () => { expect(classifier.train('hello world', 'test')).to.equal(classifier) }) }) - + describe('cosineSimilarity', () => { it('should throw an error if v1 is not an object literal', () => { const classifier = new Classifier()