diff --git a/integration/analyzer_peliasIndexStreetOneEdgeGram.js b/integration/analyzer_peliasIndexStreetOneEdgeGram.js
new file mode 100644
index 00000000..3c789073
--- /dev/null
+++ b/integration/analyzer_peliasIndexStreetOneEdgeGram.js
@@ -0,0 +1,203 @@
+// validate analyzer is behaving as expected
+
+var tape = require('tape'),
+  elastictest = require('elastictest'),
+  schema = require('../schema'),
+  punctuation = require('../punctuation');
+
+module.exports.tests = {};
+
+// Runs one analyze assertion per row of a [label, input text, expected tokens]
+// table, in the order listed; each label names the behaviour the row targets.
+module.exports.tests.analyze = function (test, common) {
+  test('analyze', function (t) {
+    var harness = new elastictest.Suite(common.clientOpts, { schema: schema });
+    var check = analyze.bind(null, harness, t, 'peliasIndexStreetOneEdgeGram');
+
+    // wait for es to bring some shards up
+    harness.action(function (next) {
+      setTimeout(next, 500);
+    });
+
+    var cases = [
+      ['lowercase', 'F', ['f']],
+      ['asciifolding', 'á', ['a']],
+      ['asciifolding', 'ß', ['s', 'ss']],
+      ['asciifolding', 'æ', ['a', 'ae']],
+      ['asciifolding', 'ł', ['l']],
+      ['asciifolding', 'ɰ', ['m']],
+      ['trim', ' f ', ['f']],
+
+      // full_token_address_suffix_expansion
+      ['full_token_address_suffix_expansion', 'rd', ['r', 'rd', 'ro', 'roa', 'road']],
+      ['full_token_address_suffix_expansion', 'ctr', ['c', 'ct', 'ctr', 'ce', 'cen', 'cent', 'cente', 'center']],
+
+      ['peliasIndexStreetOneEdgeGramFilter', '1 a ab abc abcdefghij', ['1', 'a', 'ab', 'abc', 'abcd', 'abcde', 'abcdef', 'abcdefg', 'abcdefgh', 'abcdefghi', 'abcdefghij']],
+      ['removeAllZeroNumericPrefix', '00001', ['1']],
+
+      ['unique', '1 1 1', ['1']],
+      ['notnull', ' / / ', []],
+
+      ['no kstem', 'mcdonalds', ['m', 'mc', 'mcd', 'mcdo', 'mcdon', 'mcdona', 'mcdonal', 'mcdonald', 'mcdonalds']],
+      ['no kstem', 'McDonald\'s', ['m', 'mc', 'mcd', 'mcdo', 'mcdon', 'mcdona', 'mcdonal', 'mcdonald', 'mcdonalds']],
+      ['no kstem', 'peoples', ['p', 'pe', 'peo', 'peop', 'peopl', 'people', 'peoples']],
+
+      // remove punctuation (handled by the char_filter)
+      ['punctuation', punctuation.all.join(''), ['-', '-&']],
+
+      // ensure that very large grams are created
+      ['largeGrams', 'grolmanstrasse', [
+        'g', 'gr', 'gro', 'grol', 'grolm', 'grolma', 'grolman', 'grolmans', 'grolmanst',
+        'grolmanstr', 'grolmanstra', 'grolmanstras', 'grolmanstrass',
+        'grolmanstrasse'
+      ]],
+      ['largeGrams2', 'Flughafeninformation', [
+        'f', 'fl', 'flu', 'flug', 'flugh', 'flugha', 'flughaf', 'flughafe', 'flughafen',
+        'flughafeni', 'flughafenin', 'flughafeninf', 'flughafeninfo', 'flughafeninfor',
+        'flughafeninform', 'flughafeninforma', 'flughafeninformat', 'flughafeninformati',
+        'flughafeninformatio', 'flughafeninformation'
+      ]]
+    ];
+
+    cases.forEach(function (testCase) {
+      check(testCase[0], testCase[1], testCase[2]);
+    });
+
+    harness.run(t.end);
+  });
+};
+
+// address suffix expansions should only be performed in a way that is
+// safe for 'partial tokens'.
+module.exports.tests.address_suffix_expansions = function (test, common) {
+  test('address suffix expansions', function (t) {
+    var suite = new elastictest.Suite(common.clientOpts, { schema: schema });
+    var assertAnalysis = analyze.bind(null, suite, t, 'peliasIndexStreetOneEdgeGram');
+    suite.action(function (done) { setTimeout(done, 500); }); // wait for es to bring some shards up
+
+    assertAnalysis('safe expansions', 'aly', ['a', 'al', 'aly', 'all', 'alle', 'alley']);
+    assertAnalysis('safe expansions', 'xing', [
+      'x', 'xi', 'xin', 'xing', 'c', 'cr', 'cro', 'cros', 'cross', 'crossi', 'crossin', 'crossing'
+    ]);
+    assertAnalysis('safe expansions', 'rd', ['r', 'rd', 'ro', 'roa', 'road']);
+    assertAnalysis('unsafe expansion', 'ct st', [
+      'c', 'ct', 'co', 'cou', 'cour', 'court', 's', 'st', 'str', 'stre', 'stree', 'street'
+    ]);
+
+    suite.run(t.end);
+  });
+};
+
+// stop words should be disabled so that the entire street prefix is indexed as ngrams
+module.exports.tests.stop_words = function (test, common) {
+  test('stop words', function (t) {
+    var suite = new elastictest.Suite(common.clientOpts, { schema: schema });
+    var assertAnalysis = analyze.bind(null, suite, t, 'peliasIndexStreetOneEdgeGram');
+    suite.action(function (done) { setTimeout(done, 500); }); // wait for es to bring some shards up
+
+    assertAnalysis('street suffix', 'AB street', [
+      'a', 'ab', 's', 'st', 'str', 'stre', 'stree', 'street'
+    ]);
+    assertAnalysis('street suffix (abbreviation)', 'AB st', [
+      'a', 'ab', 's', 'st', 'str', 'stre', 'stree', 'street'
+    ]);
+
+    suite.run(t.end);
+  });
+};
+
+// street names, including abbreviated directionals and suffixes ('w' -> 'west', 'st' -> 'street')
+module.exports.tests.street = function (test, common) {
+  test('street', function (t) {
+    var suite = new elastictest.Suite(common.clientOpts, { schema: schema });
+    var assertAnalysis = analyze.bind(null, suite, t, 'peliasIndexStreetOneEdgeGram');
+    suite.action(function (done) { setTimeout(done, 500); }); // wait for es to bring some shards up
+
+    assertAnalysis('street', 'mapzen place', [
+      'm', 'ma', 'map', 'mapz', 'mapze', 'mapzen', 'p', 'pl', 'pla', 'plac', 'place'
+    ]);
+    assertAnalysis('street', 'w 26 st', [
+      'w', 'we', 'wes', 'west', '26', 's', 'st', 'str', 'stre', 'stree', 'street'
+    ]);
+    assertAnalysis('street', '83 st', ['83', 's', 'st', 'str', 'stre', 'stree', 'street']);
+
+    suite.run(t.end);
+  });
+};
+
+// composed and decomposed spellings of the same text must produce the same tokens;
+// in a decomposed spelling the combining accent FOLLOWS the letter it modifies
+// @see: https://github.com/pelias/api/issues/600
+module.exports.tests.unicode = function (test, common) {
+  test('normalization', function (t) {
+    var suite = new elastictest.Suite(common.clientOpts, { schema: schema });
+    var assertAnalysis = analyze.bind(null, suite, t, 'peliasIndexStreetOneEdgeGram');
+    suite.action(function (done) { setTimeout(done, 500); }); // wait for es to bring some shards up
+
+    var latin_large_letter_e_with_acute = String.fromCodePoint(0x00C9);
+    var latin_small_letter_e_with_acute = String.fromCodePoint(0x00E9);
+    var combining_acute_accent = String.fromCodePoint(0x0301);
+    var latin_large_letter_e = String.fromCodePoint(0x0045);
+    var latin_small_letter_e = String.fromCodePoint(0x0065);
+
+    // Chambéry (both forms appear the same)
+    var chamberyComposed = "Chamb" + latin_small_letter_e_with_acute + "ry";
+    var chamberyDecomposed = "Chamb" + latin_small_letter_e + combining_acute_accent + "ry";
+
+    assertAnalysis('composed', chamberyComposed, ['c', 'ch', 'cha', 'cham', 'chamb', 'chambe', 'chamber', 'chambery']);
+    assertAnalysis('decomposed', chamberyDecomposed, ['c', 'ch', 'cha', 'cham', 'chamb', 'chambe', 'chamber', 'chambery']);
+
+    // Één (both forms appear the same)
+    var eenComposed = latin_large_letter_e_with_acute + latin_small_letter_e_with_acute + "n";
+    var eenDecomposed = latin_large_letter_e + combining_acute_accent + latin_small_letter_e + combining_acute_accent + "n";
+
+    assertAnalysis('composed', eenComposed, ['e', 'ee', 'een']);
+    assertAnalysis('decomposed', eenDecomposed, ['e', 'ee', 'een']);
+
+    suite.run(t.end);
+  });
+};
+
+// Invokes every function stored on module.exports.tests, handing each a `test`
+// wrapper that prefixes the test name with the analyzer name before calling tape.
+module.exports.all = function (tape, common) {
+
+  function test(name, testFunction) {
+    return tape('peliasIndexStreetOneEdgeGram: ' + name, testFunction);
+  }
+
+  for (var testCase in module.exports.tests) {
+    module.exports.tests[testCase](test, common);
+  }
+};
+
+// Registers a step with suite.assert() that sends `text` through `analyzer`
+// using suite.client.indices.analyze on index suite.props.index, then
+// deep-compares the returned token strings with `expected` (message: `comment`).
+//
+// If the request reports an error it is logged and recorded with t.fail();
+// the response is not read in that case, and done() is still called.
+function analyze(suite, t, analyzer, comment, text, expected) {
+  suite.assert(function (done) {
+    suite.client.indices.analyze({
+      index: suite.props.index,
+      analyzer: analyzer,
+      text: text
+    }, function (err, res) {
+      if (err) {
+        console.error(err);
+        t.fail(comment + ': analyze request failed');
+        return done();
+      }
+      t.deepEqual(simpleTokens(res.tokens), expected, comment);
+      done();
+    });
+  });
+}
+
+// Maps an array of token objects to the array of their `token` property values.
+function simpleTokens(tokens) {
+  return tokens.map(function (t) {
+    return t.token;
+  });
+}
diff --git a/integration/run.js b/integration/run.js index 8758906a..835cbd7e 100644 --- a/integration/run.js +++ b/integration/run.js @@ -33,6 +33,7 @@ var tests = [ require('./analyzer_peliasHousenumber.js'), require('./analyzer_peliasZip.js'), require('./analyzer_peliasStreet.js'), + require('./analyzer_peliasIndexStreetOneEdgeGram.js'), require('./address_matching.js'), require('./admin_matching.js'), require('./source_layer_sourceid_filtering.js'), diff --git a/mappings/document.js b/mappings/document.js index 60b5ff20..5e2cec32 100644 --- a/mappings/document.js +++ b/mappings/document.js @@ -39,10 +39,30 @@ var schema = { street: { type: 'string', analyzer: 'peliasStreet', + fields: { + ngram: { + type: 'string', + analyzer: 'peliasIndexStreetOneEdgeGram', + doc_values: false, + fielddata: { + format: 'disabled' + } + } + } }, cross_street: { type: 'string', analyzer: 'peliasStreet', + fields: { + ngram: { + type: 'string', + analyzer: 'peliasIndexStreetOneEdgeGram', + doc_values: false, + fielddata: { + format: 'disabled' + } + } + } }, zip: { type: 'string', diff --git a/settings.js b/settings.js index a31096c3..eb62b047 100644 --- a/settings.js +++ b/settings.js @@ -162,7 +162,33 @@ function generate(){ "remove_ordinals", "trim" ] - } + }, + "peliasIndexStreetOneEdgeGram": { + "type": "custom", + "tokenizer": "peliasStreetTokenizer", +
"char_filter": ["punctuation", "nfkc_normalizer"], + "filter": [ + "lowercase", + "trim", + "remove_duplicate_spaces", + "custom_street", + "street_suffix", + "directionals", + "icu_folding", + "remove_ordinals", + "removeAllZeroNumericPrefix", + "trim", + "surround_single_characters_with_word_markers", + "house_number_word_delimiter", + "remove_single_characters", + "surround_house_numbers_with_word_markers", + "peliasOneEdgeGramFilter", + "eliminate_tokens_starting_with_word_marker", + "remove_encapsulating_word_markers", + "unique", + "notnull" + ] + }, }, "filter" : { "notnull" :{ diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 750b79a4..bc51f907 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -161,6 +161,35 @@ "remove_ordinals", "trim" ] + }, + "peliasIndexStreetOneEdgeGram": { + "type": "custom", + "tokenizer": "peliasStreetTokenizer", + "char_filter": [ + "punctuation", + "nfkc_normalizer" + ], + "filter": [ + "lowercase", + "trim", + "remove_duplicate_spaces", + "custom_street", + "street_suffix", + "directionals", + "icu_folding", + "remove_ordinals", + "removeAllZeroNumericPrefix", + "trim", + "surround_single_characters_with_word_markers", + "house_number_word_delimiter", + "remove_single_characters", + "surround_house_numbers_with_word_markers", + "peliasOneEdgeGramFilter", + "eliminate_tokens_starting_with_word_marker", + "remove_encapsulating_word_markers", + "unique", + "notnull" + ] } }, "filter": { @@ -859,11 +888,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + 
"fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -1496,11 +1545,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -2133,11 +2202,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -2770,11 +2859,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -3407,11 +3516,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": 
"peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -4044,11 +4173,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -4681,11 +4830,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -5318,11 +5487,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + 
"analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -5955,11 +6144,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -6592,11 +6801,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -7229,11 +7458,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -7866,11 +8115,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": 
"peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -8503,11 +8772,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -9140,11 +9429,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string", @@ -9777,11 +10086,31 @@ }, "street": { "type": "string", - "analyzer": "peliasStreet" + "analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "cross_street": { "type": "string", - "analyzer": "peliasStreet" + 
"analyzer": "peliasStreet", + "fields": { + "ngram": { + "type": "string", + "analyzer": "peliasIndexStreetOneEdgeGram", + "doc_values": false, + "fielddata": { + "format": "disabled" + } + } + } }, "zip": { "type": "string",