From 28135b50ae0a187558652249032dcdac05124fec Mon Sep 17 00:00:00 2001 From: Siarhei Fedartsou Date: Sat, 1 Mar 2025 17:27:56 +0100 Subject: [PATCH 1/4] Move icuTokenizer flag into featureFlags object in config --- .github/workflows/_integration_tests.yml | 2 +- .github/workflows/_unit_tests.yml | 2 +- configValidation.js | 10 ++++++---- integration/analyzer_peliasIndexOneEdgeGram.js | 2 +- integration/analyzer_peliasQuery.js | 2 +- integration/analyzer_peliasStreet.js | 2 +- settings-icu.js | 2 +- settings.js | 2 +- test/fixtures/config-icu-tokenizer.json | 3 +-- test/settings.js | 4 ++-- 10 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/workflows/_integration_tests.yml b/.github/workflows/_integration_tests.yml index ee163d8e..1e594c90 100644 --- a/.github/workflows/_integration_tests.yml +++ b/.github/workflows/_integration_tests.yml @@ -23,7 +23,7 @@ jobs: - name: Run integration tests run: | if [ "${{ matrix.icuTokenizer }}" = "true" ]; then - jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json + jq -n '{ featureFlags: { icuTokenizer: true } }' > $(pwd)/config-icu.json export PELIAS_CONFIG=$(pwd)/config-icu.json fi npm install diff --git a/.github/workflows/_unit_tests.yml b/.github/workflows/_unit_tests.yml index d0ef57f0..2b3081f2 100644 --- a/.github/workflows/_unit_tests.yml +++ b/.github/workflows/_unit_tests.yml @@ -19,7 +19,7 @@ jobs: run: | npm install if [ "${{ matrix.icuTokenizer }}" = "true" ]; then - jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json + jq -n '{ featureFlags: { icuTokenizer: true } }' > $(pwd)/config-icu.json export PELIAS_CONFIG=$(pwd)/config-icu.json fi npm run test \ No newline at end of file diff --git a/configValidation.js b/configValidation.js index 02d462ed..1aa3b282 100644 --- a/configValidation.js +++ b/configValidation.js @@ -2,14 +2,16 @@ const Joi = require('@hapi/joi'); // Schema Configuration // schema.indexName: populated by defaults if not overridden -// schema.icuTokenizer: boolean, optional, defaults to false // esclient: object, validation performed by elasticsearch module +// featureFlags.icuTokenizer: boolean, optional, defaults to false const schema = Joi.object().required().keys({ schema: Joi.object().required().keys({ - indexName: Joi.string().required(), - icuTokenizer: Joi.boolean().optional() + indexName: Joi.string().required() }), - esclient: Joi.object().required() + esclient: Joi.object().required(), + featureFlags: Joi.object().optional().keys({ + icuTokenizer: Joi.boolean().optional() + }).default({}).unknown(true) }).unknown(true); module.exports = { diff --git a/integration/analyzer_peliasIndexOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js index 523c092f..dc2b3c06 100644 --- a/integration/analyzer_peliasIndexOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -85,7 +85,7 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] ); assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] ); - if (config.schema.icuTokenizer) { + if (config.featureFlags.icuTokenizer) { assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [ '0:ซ', '0:ซอ', '0:ซอย', '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ', diff --git a/integration/analyzer_peliasQuery.js b/integration/analyzer_peliasQuery.js index fc5a579d..b68b6e76 100644 --- a/integration/analyzer_peliasQuery.js +++ b/integration/analyzer_peliasQuery.js @@ -68,7 +68,7 @@ module.exports.tests.functional = function(test, common){ assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]); // complicated tokenization for some Asian languages - if (config.schema.icuTokenizer) { + if (config.featureFlags.icuTokenizer) { assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index 7f861f9a..a0061212 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -24,7 +24,7 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] ); // complicated tokenization for some Asian languages - if (config.schema.icuTokenizer) { + if (config.featureFlags.icuTokenizer) { assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); diff --git a/settings-icu.js b/settings-icu.js index 2ebfa2c0..081eb3ba 100644 --- a/settings-icu.js +++ b/settings-icu.js @@ -4,7 +4,7 @@ const _ = require('lodash'); * This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer. * This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages). * - * It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config. + * It can be enabled by setting `config.featureFlags.icuTokenizer` in your `pelias.json` config. * Note: this must be set *before* you create your elasticsearch index or it will have no effect. * * This feature is considered beta, we encourage testing & feedback from the community in order diff --git a/settings.js b/settings.js index be3ad673..21b16073 100644 --- a/settings.js +++ b/settings.js @@ -293,7 +293,7 @@ function generate(){ }); // Experimental ICU tokenizer - if (config.schema.icuTokenizer) { + if (config.featureFlags.icuTokenizer) { settings = settingsICU(settings); } diff --git a/test/fixtures/config-icu-tokenizer.json b/test/fixtures/config-icu-tokenizer.json index 81e70ce7..4112b0e8 100644 --- a/test/fixtures/config-icu-tokenizer.json +++ b/test/fixtures/config-icu-tokenizer.json @@ -8,8 +8,7 @@ } } }, - "schema": { + "featureFlags": { "icuTokenizer": true } } - \ No newline at end of file diff --git a/test/settings.js b/test/settings.js index 37789dfd..14e8fc1d 100644 --- a/test/settings.js +++ b/test/settings.js @@ -50,14 +50,14 @@ module.exports.tests.analysis = function(test, common) { }; function mayBeAmpersandMapper() { - if (config.schema.icuTokenizer) { + if (config.featureFlags.icuTokenizer) { return ['ampersand_mapper']; } return []; } function mayBeAmpersandReplacer() { - if (config.schema.icuTokenizer) { + if (config.featureFlags.icuTokenizer) { return ['ampersand_replacer']; } return []; From 38767a233123e311a6a858d6a9960d9e045bde3e Mon Sep 17 00:00:00 2001 From: Siarhei Fedartsou Date: Sat, 1 Mar 2025 17:41:58 +0100 Subject: [PATCH 2/4] Move icuTokenizer flag into featureFlags object in config --- configValidation.js | 2 +- integration/analyzer_peliasIndexOneEdgeGram.js | 2 +- integration/analyzer_peliasQuery.js | 4 ++-- integration/analyzer_peliasStreet.js | 2 +- settings.js | 2 +- test/fixtures/config-icu-tokenizer.json | 3 ++- test/settings.js | 4 ++-- 7 files changed, 10 insertions(+), 9 deletions(-) diff --git a/configValidation.js b/configValidation.js index 1aa3b282..9699c9c8 100644 --- a/configValidation.js +++ b/configValidation.js @@ -11,7 +11,7 @@ const schema = Joi.object().required().keys({ esclient: Joi.object().required(), featureFlags: Joi.object().optional().keys({ icuTokenizer: Joi.boolean().optional() - }).default({}).unknown(true) + }).unknown(true) }).unknown(true); module.exports = { diff --git a/integration/analyzer_peliasIndexOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js index dc2b3c06..17438ccc 100644 --- a/integration/analyzer_peliasIndexOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -85,7 +85,7 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] ); assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] ); - if (config.featureFlags.icuTokenizer) { + if (config.featureFlags?.icuTokenizer) { assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [ '0:ซ', '0:ซอ', '0:ซอย', '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ', diff --git a/integration/analyzer_peliasQuery.js b/integration/analyzer_peliasQuery.js index b68b6e76..bee117fa 100644 --- a/integration/analyzer_peliasQuery.js +++ b/integration/analyzer_peliasQuery.js @@ -24,7 +24,7 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis('thai_digits', '๐๑๒๓๔๕๖๗ ๘๙', ['01234567', '89']); // leading zero remains assertAnalysis('thai_digits', '๑๒๓๔๕๖๗๐ ๘๙', ['12345670', '89']); assertAnalysis('digit_glued_to_word', 'john doe42', ['john', 'doe42']); - if (config.schema.icuTokenizer) { + if (config.featureFlags?.icuTokenizer) { assertAnalysis('thai_tonemarks', 'ก่ก้ก๊ก๋ข่ข้ข๊ข๋ค่ค้ค๊ค๋ฆ่ฆ้ฆ๊ฆ๋', ['กก', 'กก', 'ขขขขคคคคฆฆฆฆ']); assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); } else { @@ -68,7 +68,7 @@ module.exports.tests.functional = function(test, common){ assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]); // complicated tokenization for some Asian languages - if (config.featureFlags.icuTokenizer) { + if (config.featureFlags?.icuTokenizer) { assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index a0061212..d2960574 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -24,7 +24,7 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] ); // complicated tokenization for some Asian languages - if (config.featureFlags.icuTokenizer) { + if (config.featureFlags?.icuTokenizer) { assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); diff --git a/settings.js b/settings.js index 21b16073..03de559b 100644 --- a/settings.js +++ b/settings.js @@ -293,7 +293,7 @@ function generate(){ }); // Experimental ICU tokenizer - if (config.featureFlags.icuTokenizer) { + if (config.featureFlags?.icuTokenizer) { settings = settingsICU(settings); } diff --git a/test/fixtures/config-icu-tokenizer.json b/test/fixtures/config-icu-tokenizer.json index 4112b0e8..999c298a 100644 --- a/test/fixtures/config-icu-tokenizer.json +++ b/test/fixtures/config-icu-tokenizer.json @@ -9,6 +9,7 @@ } }, "featureFlags": { - "icuTokenizer": true + "icuTokenizer": true, + "some_another_feature_flag": "just_to_check_that_we_can_have_fields_not_defined_in_schema_here" } } diff --git a/test/settings.js b/test/settings.js index 14e8fc1d..b42d8e64 100644 --- a/test/settings.js +++ b/test/settings.js @@ -50,14 +50,14 @@ module.exports.tests.analysis = function(test, common) { }; function mayBeAmpersandMapper() { - if (config.featureFlags.icuTokenizer) { + if (config.featureFlags?.icuTokenizer) { return ['ampersand_mapper']; } return []; } function mayBeAmpersandReplacer() { - if (config.featureFlags.icuTokenizer) { + if (config.featureFlags?.icuTokenizer) { return ['ampersand_replacer']; } return []; From 711979b5399526c7444f31ba46a25b7dea3d3106 Mon Sep 17 00:00:00 2001 From: Siarhei Fedartsou Date: Sat, 1 Mar 2025 17:51:27 +0100 Subject: [PATCH 3/4] Move icuTokenizer flag into featureFlags object in config --- test/configValidation.js | 23 +++++++++++++++++++++++ test/fixtures/config-icu-tokenizer.json | 3 +-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/test/configValidation.js b/test/configValidation.js index f5469b6a..8e6afa75 100644 --- a/test/configValidation.js +++ b/test/configValidation.js @@ -5,6 +5,29 @@ const configValidation = require('../configValidation'); module.exports.tests = {}; module.exports.tests.interface = function(test, common) { + + test('does not throw on unknown feature flags', function(t) { + var config = { + schema: { + indexName: "pelias" + }, + esclient: {}, + featureFlags: { + unknown_feature_flag: { + nested: true + }, + unknown_feature_flag2: true + } + }; + + t.doesNotThrow(function () { + configValidation.validate(config); + }); + t.end(); + + }); + + test('config without schema should throw error', function(t) { var config = { esclient: {} diff --git a/test/fixtures/config-icu-tokenizer.json b/test/fixtures/config-icu-tokenizer.json index 999c298a..4112b0e8 100644 --- a/test/fixtures/config-icu-tokenizer.json +++ b/test/fixtures/config-icu-tokenizer.json @@ -9,7 +9,6 @@ } }, "featureFlags": { - "icuTokenizer": true, - "some_another_feature_flag": "just_to_check_that_we_can_have_fields_not_defined_in_schema_here" + "icuTokenizer": true } } From 09312d59f3447b2eae6c4c2337eca6271f995590 Mon Sep 17 00:00:00 2001 From: Siarhei Fedartsou Date: Sat, 1 Mar 2025 17:52:59 +0100 Subject: [PATCH 4/4] Move icuTokenizer flag into featureFlags object in config --- test/configValidation.js | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/configValidation.js b/test/configValidation.js index 8e6afa75..f65cbd34 100644 --- a/test/configValidation.js +++ b/test/configValidation.js @@ -7,7 +7,7 @@ module.exports.tests = {}; module.exports.tests.interface = function(test, common) { test('does not throw on unknown feature flags', function(t) { - var config = { + const config = { schema: { indexName: "pelias" }, @@ -29,7 +29,7 @@ module.exports.tests.interface = function(test, common) { test('config without schema should throw error', function(t) { - var config = { + const config = { esclient: {} }; @@ -41,7 +41,7 @@ module.exports.tests.interface = function(test, common) { }); test('config without schema.indexName should throw error', function(t) { - var config = { + const config = { schema: {}, esclient: {} }; @@ -55,7 +55,7 @@ module.exports.tests.interface = function(test, common) { test('config with non-string schema.indexName should throw error', function(t) { [null, 17, {}, [], false].forEach((value) => { - var config = { + const config = { schema: { indexName: value, }, @@ -74,7 +74,7 @@ module.exports.tests.interface = function(test, common) { test('config with non-object esclient should throw error', function(t) { [null, 17, [], 'string', true].forEach((value) => { - var config = { + const config = { schema: { indexName: 'example_index', }, @@ -92,7 +92,7 @@ module.exports.tests.interface = function(test, common) { }); test('config with string schema.indexName and object esclient should not throw error', function(t) { - var config = { + const config = { schema: { indexName: 'example_index', }, @@ -115,7 +115,7 @@ module.exports.all = function (tape, common) { return tape('configValidation: ' + name, testFunction); } - for( var testCase in module.exports.tests ){ + for( const testCase in module.exports.tests ){ module.exports.tests[testCase](test, common); } };