From 19e565086e5a61cdbfa9b8e7f7e09b004a96df75 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 4 Aug 2020 11:13:39 +0200 Subject: [PATCH 1/2] fix(custom_multiword_synonyms): re-enable support for custom multi-word synonyms --- settings.js | 21 +++++++++++++-- synonyms/linter.js | 7 +++++ test/fixtures/expected.json | 52 +++++++++++++++++++++++++++++++++++++ test/settings.js | 4 +++ 4 files changed, 82 insertions(+), 2 deletions(-) diff --git a/settings.js b/settings.js index dca59cd3..84558a3c 100644 --- a/settings.js +++ b/settings.js @@ -34,6 +34,7 @@ function generate(){ "filter": [ "lowercase", "trim", + "synonyms/custom_admin/multiword", "admin_synonyms_multiplexer", "icu_folding", "word_delimiter", @@ -49,6 +50,7 @@ function generate(){ "filter": [ "lowercase", "trim", + "synonyms/custom_name/multiword", "name_synonyms_multiplexer", "icu_folding", "remove_ordinals", @@ -81,6 +83,7 @@ function generate(){ "lowercase", "trim", "remove_duplicate_spaces", + "synonyms/custom_name/multiword", "name_synonyms_multiplexer", "icu_folding", "remove_ordinals", @@ -126,6 +129,7 @@ function generate(){ "lowercase", "trim", "remove_duplicate_spaces", + "synonyms/custom_street/multiword", "street_synonyms_multiplexer", "icu_folding", "remove_ordinals", @@ -227,10 +231,23 @@ function generate(){ // dynamically create filters for all synonym files in the ./synonyms directory. // each filter is given the same name as the file, paths separators are replaced with // underscores and the file extension is removed. - _.each(synonyms, (synonym, name) => { + // note: if no synonym entries are present in the list we use an array + // containing an empty space to avoid elasticsearch schema parsing errors. 
+ _.each(synonyms, (entries, name) => { + + const singleWordEntries = entries.filter(e => !/\s/.test(e)) + const multiWordEntries = entries.filter(e => /\s/.test(e)) + + // generate a filter containing single-word synonyms settings.analysis.filter[`synonyms/${name}`] = { "type": "synonym", - "synonyms": !_.isEmpty(synonym) ? synonym : [''] + "synonyms": !_.isEmpty(singleWordEntries) ? singleWordEntries : [''] + }; + + // generate a filter containing multi-word synonyms + settings.analysis.filter[`synonyms/${name}/multiword`] = { + "type": "synonym", + "synonyms": !_.isEmpty(multiWordEntries) ? multiWordEntries : [''] }; }); diff --git a/synonyms/linter.js b/synonyms/linter.js index f4417899..22730a78 100644 --- a/synonyms/linter.js +++ b/synonyms/linter.js @@ -41,6 +41,7 @@ function linter(synonyms) { letterCasing(line, logprefix, tokens); tokensSanityCheck(line, logprefix, tokens); multiWordCheck(line, logprefix, tokens); + tokenReplacementCheck(line, logprefix); // tokenLengthCheck(line, logprefix, tokens); }) }) @@ -74,6 +75,12 @@ function multiWordCheck(line, logprefix, tokens) { }); } +function tokenReplacementCheck(line, logprefix) { + if (/=>/.test(line)) { + logger.warn(`${logprefix} synonym rule '=>' is not supported, use ',' instead`); + } +} + function tokenLengthCheck(line, logprefix, tokens) { _.each(tokens, token => { if (token.length <= 1) { diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 80ac9b0d..f8693c95 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -30,6 +30,7 @@ "filter": [ "lowercase", "trim", + "synonyms/custom_admin/multiword", "admin_synonyms_multiplexer", "icu_folding", "word_delimiter", @@ -48,6 +49,7 @@ "filter": [ "lowercase", "trim", + "synonyms/custom_name/multiword", "name_synonyms_multiplexer", "icu_folding", "remove_ordinals", @@ -86,6 +88,7 @@ "lowercase", "trim", "remove_duplicate_spaces", + "synonyms/custom_name/multiword", "name_synonyms_multiplexer", "icu_folding", 
"remove_ordinals", @@ -142,6 +145,7 @@ "lowercase", "trim", "remove_duplicate_spaces", + "synonyms/custom_street/multiword", "street_synonyms_multiplexer", "icu_folding", "remove_ordinals", @@ -218,18 +222,36 @@ "" ] }, + "synonyms/custom_admin/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, "synonyms/custom_name": { "type": "synonym", "synonyms": [ "" ] }, + "synonyms/custom_name/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, "synonyms/custom_street": { "type": "synonym", "synonyms": [ "" ] }, + "synonyms/custom_street/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, "synonyms/directionals": { "type": "synonym", "synonyms": [ @@ -304,6 +326,12 @@ "sud,s" ] }, + "synonyms/directionals/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, "synonyms/personal_titles": { "type": "synonym", "synonyms": [ @@ -500,6 +528,12 @@ "veuve,vve" ] }, + "synonyms/personal_titles/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, "synonyms/place_names": { "type": "synonym", "synonyms": [ @@ -819,6 +853,12 @@ "étang,etang" ] }, + "synonyms/place_names/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, "synonyms/punctuation": { "type": "synonym", "synonyms": [ @@ -826,6 +866,12 @@ "&,und" ] }, + "synonyms/punctuation/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] + }, "synonyms/streets": { "type": "synonym", "synonyms": [ @@ -1639,6 +1685,12 @@ "wl,well", "wls,wells" ] + }, + "synonyms/streets/multiword": { + "type": "synonym", + "synonyms": [ + "" + ] } }, "char_filter": { diff --git a/test/settings.js b/test/settings.js index badb401c..26d7272d 100644 --- a/test/settings.js +++ b/test/settings.js @@ -83,6 +83,7 @@ module.exports.tests.peliasAdminAnalyzer = function(test, common) { t.deepEqual(analyzer.filter, [ "lowercase", "trim", + "synonyms/custom_admin/multiword", "admin_synonyms_multiplexer", "icu_folding", "word_delimiter", @@ -130,6 +131,7 @@ 
module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) { t.deepEqual( analyzer.filter, [ "lowercase", "trim", + "synonyms/custom_name/multiword", "name_synonyms_multiplexer", "icu_folding", "remove_ordinals", @@ -186,6 +188,7 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) { "lowercase", "trim", "remove_duplicate_spaces", + "synonyms/custom_name/multiword", "name_synonyms_multiplexer", "icu_folding", "remove_ordinals", @@ -293,6 +296,7 @@ module.exports.tests.peliasStreetAnalyzer = function(test, common) { "lowercase", "trim", "remove_duplicate_spaces", + "synonyms/custom_street/multiword", "street_synonyms_multiplexer", "icu_folding", "remove_ordinals", From 1baeadfe9529e8ede8207c8a688ee5459efe3c48 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 4 Aug 2020 12:50:35 +0200 Subject: [PATCH 2/2] fix(multiword_synonyms): fix tokenizer regex used by linter and synonyms filter(s) --- settings.js | 7 +++++-- synonyms/linter.js | 11 ++++++++--- synonyms/place_names/fr.txt | 2 +- synonyms/streets/en.txt | 8 ++++---- test/fixtures/expected.json | 10 +++++----- 5 files changed, 23 insertions(+), 15 deletions(-) diff --git a/settings.js b/settings.js index 84558a3c..fe214673 100644 --- a/settings.js +++ b/settings.js @@ -235,8 +235,11 @@ function generate(){ // containing an empty space to avoid elasticsearch schema parsing errors. 
_.each(synonyms, (entries, name) => { - const singleWordEntries = entries.filter(e => !/\s/.test(e)) - const multiWordEntries = entries.filter(e => /\s/.test(e)) + // same tokenizer regex as above except without comma + // (which is a delimiter within the synonym files) + const tokenizerRegex = new RegExp('[\\s/\\\\-]+'); + const singleWordEntries = entries.filter(e => !tokenizerRegex.test(e)) + const multiWordEntries = entries.filter(e => tokenizerRegex.test(e)) // generate a filter containing single-word synonyms settings.analysis.filter[`synonyms/${name}`] = { diff --git a/synonyms/linter.js b/synonyms/linter.js index 22730a78..f7ab2a89 100644 --- a/synonyms/linter.js +++ b/synonyms/linter.js @@ -2,6 +2,11 @@ const _ = require('lodash'); const logger = require('pelias-logger').get('schema-synonyms'); const punctuation = require('../punctuation'); +// same tokenizer regex as the schema +const TOKENIZER_REGEX = new RegExp('[\\s,/\\\\-]+'); +const DEMIMETER_REGEX = /,|=>/g +const REPLACEMENT_REGEX = /=>/ + /** * The synonyms linter attempts to warn the user when making * common mistakes with synonyms.
@@ -22,7 +27,7 @@ function linter(synonyms) { logger.debug(`[line] ${line}`); // split the lines by delimeter - let tokens = line.split(/,|=>/g).map(t => t.trim()); + let tokens = line.split(DEMIMETER_REGEX).map(t => t.trim()); // strip blacklisted punctuation from synonyms // the 'punctuation.blacklist' contains a list of characters which are @@ -69,14 +74,14 @@ function tokensSanityCheck(line, logprefix, tokens) { function multiWordCheck(line, logprefix, tokens) { _.each(tokens, token => { - if (/\s/.test(token)){ + if (TOKENIZER_REGEX.test(token)){ logger.warn(`${logprefix} multi word synonyms may cause issues with phrase queries:`, token); } }); } function tokenReplacementCheck(line, logprefix) { - if (/=>/.test(line)) { + if (REPLACEMENT_REGEX.test(line)) { logger.warn(`${logprefix} synonym rule '=>' is not supported, use ',' instead`); } } diff --git a/synonyms/place_names/fr.txt b/synonyms/place_names/fr.txt index 8bc060dc..9a29fc74 100644 --- a/synonyms/place_names/fr.txt +++ b/synonyms/place_names/fr.txt @@ -1,5 +1,5 @@ abbaye, abe -auto-école, autoécole, autoecole +autoécole, autoecole aéroport, aeroport bastide, bstd baston, bast diff --git a/synonyms/streets/en.txt b/synonyms/streets/en.txt index c211d0a3..636c194a 100644 --- a/synonyms/streets/en.txt +++ b/synonyms/streets/en.txt @@ -91,12 +91,12 @@ crest, crst, cst crief, crf croft, cft cross, cs, crss -crossing, crsg, xing, csg, x-ing -crossroad, crd, xroad, x-road, xrd, x-rd +crossing, crsg, xing, csg +crossroad, crd, xroad, xrd crossroads, xrds -crossway, cowy, crwy, xway, xwy, x-way +crossway, cowy, crwy, xway, xwy cruiseway, cuwy, crwy -cul-de-sac, culdesac, cds, cusac, csac +culdesac, cds, cusac, csac curve, cve, crv, crve, curv cutting, cttg, ctg, cutt dale, dle diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index f8693c95..5e80218c 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -776,7 +776,7 @@ "volcan,vlcn", "voluntarios,voluntos", 
"abbaye,abe", - "auto-école,autoécole,autoecole", + "autoécole,autoecole", "aéroport,aeroport", "bastide,bstd", "baston,bast", @@ -988,12 +988,12 @@ "crief,crf", "croft,cft", "cross,cs,crss", - "crossing,crsg,xing,csg,x-ing", - "crossroad,crd,xroad,x-road,xrd,x-rd", + "crossing,crsg,xing,csg", + "crossroad,crd,xroad,xrd", "crossroads,xrds", - "crossway,cowy,crwy,xway,xwy,x-way", + "crossway,cowy,crwy,xway,xwy", "cruiseway,cuwy,crwy", - "cul-de-sac,culdesac,cds,cusac,csac", + "culdesac,cds,cusac,csac", "curve,cve,crv,crve,curv", "cutting,cttg,ctg,cutt", "dale,dle",