From dd9392a5019c4f82794d47a1dba308d711b2abbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 15 Apr 2026 14:23:45 +0200 Subject: [PATCH 1/3] perf(elastic): filter system fields in es instead of in whelk. --- .../elasticsearch/libris_search_boost.json | 1 + .../main/groovy/whelk/search2/ESSettings.java | 24 ++++++++++++++++++- .../src/main/groovy/whelk/search2/Query.java | 12 +--------- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/librisxl-tools/elasticsearch/libris_search_boost.json b/librisxl-tools/elasticsearch/libris_search_boost.json index 7c0bad6dd8..b74e5b39b1 100644 --- a/librisxl-tools/elasticsearch/libris_search_boost.json +++ b/librisxl-tools/elasticsearch/libris_search_boost.json @@ -122,6 +122,7 @@ } ], "source_excludes": [ + "*_4_digits_*", "@reverse.instanceOf.@reverse.itemOf.hasComponent", "@reverse.instanceOf.@reverse.itemOf.itemOf", "@reverse.instanceOf.@reverse.itemOf.librissearch:itemNote", diff --git a/whelk-core/src/main/groovy/whelk/search2/ESSettings.java b/whelk-core/src/main/groovy/whelk/search2/ESSettings.java index 3575e838dd..635a373174 100644 --- a/whelk-core/src/main/groovy/whelk/search2/ESSettings.java +++ b/whelk-core/src/main/groovy/whelk/search2/ESSettings.java @@ -12,12 +12,31 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static whelk.JsonLd.SEARCH_KEY; +import static whelk.component.ElasticSearch.CARD_STR; +import static whelk.component.ElasticSearch.CHIP_STR; +import static whelk.component.ElasticSearch.SEARCH_CARD_STR; +import static whelk.component.ElasticSearch.TOP_STR; import static whelk.search2.QueryUtil.matchAny; import static whelk.util.Jackson.mapper; public class ESSettings { private static final String BOOST_SETTINGS_FILE = "libris_search_boost.json"; + private static final List SYSTEM_SOURCE_EXCLUDES = List.of( + "*.__*", + "*." + SEARCH_KEY, + "_es_id", + "_links", + "_outerEmbellishments", + "_ids", + "_sortKeyByLang", + TOP_STR, + CHIP_STR, + CARD_STR, + SEARCH_CARD_STR + ); + private EsMappings mappings; private final Boost boost; private final List sourceExcludes; @@ -76,7 +95,10 @@ public Boost loadBoostSettings() { private List loadSourceExcludesSettings() { Map settings = toMap(Boost.class.getClassLoader().getResourceAsStream(BOOST_SETTINGS_FILE)); - return getAsStream(settings, "source_excludes").map(String.class::cast).toList(); + return Stream.concat( + SYSTEM_SOURCE_EXCLUDES.stream(), + getAsStream(settings, "source_excludes").map(String.class::cast) + ).toList(); } public static Boost loadBoostSettings(String json) { diff --git a/whelk-core/src/main/groovy/whelk/search2/Query.java b/whelk-core/src/main/groovy/whelk/search2/Query.java index 604335bc60..3360bc22b6 100644 --- a/whelk-core/src/main/groovy/whelk/search2/Query.java +++ b/whelk-core/src/main/groovy/whelk/search2/Query.java @@ -403,17 +403,7 @@ private Map applyLens(Map framedThing) { } private static Map removeSystemInternalProperties(Map framedThing) { - DocumentUtil.traverse(framedThing, (value, path) -> { - if (value instanceof Map m) { - m.keySet().removeIf(k -> - k instanceof String key - && key.startsWith("_") - && !JsonLd.Platform.CATEGORY_BY_COLLECTION.equals(key) - ); - } - - return DocumentUtil.NOP; - }); + framedThing.remove("_id"); return framedThing; } From 69e9cd10ebb756e121c33e57afcea57e37538424 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 16 Apr 2026 08:33:34 +0200 Subject: [PATCH 2/3] refactor(elastic): constants for SystemFields --- .../whelk/sru/servlet/XSearchServlet.java | 5 +- .../whelk/component/ElasticSearch.groovy | 75 ++++++++++++------- .../main/groovy/whelk/search/ESQuery.groovy | 16 ++-- .../groovy/whelk/search/ElasticFind.groovy | 4 +- .../main/groovy/whelk/search2/ESSettings.java | 46 +++++++----- .../groovy/whelk/search2/ObjectQuery.java | 4 +- .../src/main/groovy/whelk/search2/Query.java | 4 +- 7 files changed, 98 insertions(+), 56 deletions(-) diff --git a/sru/src/main/java/whelk/sru/servlet/XSearchServlet.java b/sru/src/main/java/whelk/sru/servlet/XSearchServlet.java index 1205d7b1ae..0f4e7bfb98 100644 --- a/sru/src/main/java/whelk/sru/servlet/XSearchServlet.java +++ b/sru/src/main/java/whelk/sru/servlet/XSearchServlet.java @@ -75,6 +75,7 @@ import static whelk.JsonLd.TYPE_KEY; import static whelk.JsonLd.WORK_KEY; import static whelk.JsonLd.asList; +import static whelk.component.ElasticSearch.SystemFields.SORT_KEY_BY_LANG; import static whelk.util.DocumentUtil.getAtPath; /** @@ -110,8 +111,8 @@ public class XSearchServlet extends WhelkHttpServlet { private static final Map ORDER = Map.of( // "rank" is default - "alphabetical", "_sortKeyByLang.sv", - "-alphabetical", "-_sortKeyByLang.sv", + "alphabetical", SORT_KEY_BY_LANG + ".sv", + "-alphabetical", "-" + SORT_KEY_BY_LANG + ".sv", "chronological", "-publication.year", // reverse of XL "-chronological", "publication.year" ); diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index 4add551f9b..d8dd3e1d71 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -32,6 +32,16 @@ import static whelk.JsonLd.REVERSE_KEY import static whelk.JsonLd.THING_KEY import static whelk.JsonLd.TYPE_KEY import static whelk.JsonLd.asList +import static whelk.component.ElasticSearch.SystemFields.CARD_STR +import static whelk.component.ElasticSearch.SystemFields.CHIP_STR +import static whelk.component.ElasticSearch.SystemFields.ES_ID +import static whelk.component.ElasticSearch.SystemFields.FLATTENED_LANG_MAP_PREFIX +import static whelk.component.ElasticSearch.SystemFields.IDS +import static whelk.component.ElasticSearch.SystemFields.LINKS +import static whelk.component.ElasticSearch.SystemFields.OUTER_EMBELLISHMENTS +import static whelk.component.ElasticSearch.SystemFields.SEARCH_CARD_STR +import static whelk.component.ElasticSearch.SystemFields.SORT_KEY_BY_LANG +import static whelk.component.ElasticSearch.SystemFields.TOP_STR import static whelk.exception.UnexpectedHttpStatusException.isBadRequest import static whelk.exception.UnexpectedHttpStatusException.isNotFound import static whelk.util.FresnelUtil.Options.NO_FALLBACK @@ -42,6 +52,36 @@ import static whelk.util.Jackson.mapper @Log class ElasticSearch { + + static class SystemFields { + /** + In ES up until 7.8 we could use the _id field for aggregations and sorting, but it was discouraged + for performance reasons. In 7.9 such use was deprecated, and since 8.x it's no longer supported, so + we follow the advice and use a separate field. + (https://www.elastic.co/guide/en/elasticsearch/reference/8.8/mapping-id-field.html). */ + public static final String ES_ID = '_es_id' + + public static final String LINKS = '_links' + public static final String OUTER_EMBELLISHMENTS = '_outerEmbellishments' + public static final String SORT_KEY_BY_LANG = '_sortKeyByLang' + + public static final String IDS = '_ids' + public static final String TOP_STR = '_topStr' + public static final String CHIP_STR = '_chipStr' + public static final String CARD_STR = '_cardStr' + public static final String SEARCH_CARD_STR = '_searchCardStr' + + public static final String FLATTENED_LANG_MAP_PREFIX = '__' + } + + private static final Set SEARCH_STRINGS = [ + JsonLd.SEARCH_KEY, + TOP_STR, + CHIP_STR, + CARD_STR, + SEARCH_CARD_STR + ] as Set + static final String BULK_CONTENT_TYPE = "application/x-ndjson" static final String SEARCH_TYPE = "dfs_query_then_fetch" @@ -88,19 +128,6 @@ class ElasticSearch { ) } - public static final String TOP_STR = '_topStr' - public static final String CHIP_STR = '_chipStr' - public static final String CARD_STR = '_cardStr' - public static final String SEARCH_CARD_STR = '_searchCardStr' - - private static final Set SEARCH_STRINGS = [ - JsonLd.SEARCH_KEY, - TOP_STR, - CHIP_STR, - CARD_STR, - SEARCH_CARD_STR - ] as Set - ElasticSearch(Properties props, JsonLd jsonLd) { this( props.getProperty("elasticHost"), @@ -595,8 +622,8 @@ class ElasticSearch { Map searchCard = JsonLd.frame(thingId, copy.data) - searchCard['_links'] = links - searchCard['_outerEmbellishments'] = copy.getEmbellishments() - links + searchCard[LINKS] = links + searchCard[OUTER_EMBELLISHMENTS] = copy.getEmbellishments() - links Map incomingLinkCountByRelation = whelk.getStorage().getIncomingLinkCountByIdAndRelation(stripHash(copy.getShortId())) var totalItems = incomingLinkCountByRelation.values().sum(0) @@ -617,7 +644,7 @@ class ElasticSearch { ] try { - searchCard['_sortKeyByLang'] = buildSortKeyByLang(searchCard, whelk) + searchCard[SORT_KEY_BY_LANG] = buildSortKeyByLang(searchCard, whelk) } catch (Exception e) { log.error("Couldn't create sort key for {}: {}", document.shortId, e, e) } @@ -630,7 +657,7 @@ class ElasticSearch { log.error("Couldn't create search fields for {}: {}", document.shortId, e, e) } - searchCard['_ids'] = collectIds(embellishedGraph, integralIds) + searchCard[IDS] = collectIds(embellishedGraph, integralIds) DocumentUtil.traverse(searchCard) { value, path -> if (path && SEARCH_STRINGS.contains(path.last())) { @@ -695,11 +722,7 @@ class ElasticSearch { return DocumentUtil.NOP } - // In ES up until 7.8 we could use the _id field for aggregations and sorting, but it was discouraged - // for performance reasons. In 7.9 such use was deprecated, and since 8.x it's no longer supported, so - // we follow the advice and use a separate field. - // (https://www.elastic.co/guide/en/elasticsearch/reference/8.8/mapping-id-field.html). - searchCard["_es_id"] = toElasticId(copy.getShortId()) + searchCard[ES_ID] = toElasticId(copy.getShortId()) if (log.isTraceEnabled()) { log.trace("Framed data: ${searchCard}") @@ -710,7 +733,7 @@ class ElasticSearch { @CompileStatic static String flattenedLangMapKey(String key) { - return '__' + key + return FLATTENED_LANG_MAP_PREFIX + key } private static Set collectIds(List embellishedGraph, Collection integralIds) { @@ -939,8 +962,8 @@ class ElasticSearch { * @return an Iterable of system IDs. */ Iterable getAffectedIds(Collection iris) { - def t1 = iris.collect {['term': ['_links': ['value': it]]]} - def t2 = iris.collect {['term': ['_outerEmbellishments': ['value': it]]]} + def t1 = iris.collect {['term': [(LINKS): ['value': it]]]} + def t2 = iris.collect {['term': [(OUTER_EMBELLISHMENTS): ['value': it]]]} Map query = [ 'bool': ['should': t1 + t2 ] ] @@ -1038,7 +1061,7 @@ class ElasticSearch { private abstract class Scroll implements Iterator { final int FETCH_SIZE = 500 - protected final List SORT = [['_es_id': 'asc']] + protected final List SORT = [[(ES_ID): 'asc']] protected final List FILTER_PATH = ['took', 'hits.hits.sort', 'pit_id', 'hits.total.value'] Iterator fetchedItems diff --git a/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy b/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy index f5a0e3fc7e..0b5d493330 100644 --- a/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy +++ b/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy @@ -6,12 +6,16 @@ import groovy.transform.TypeCheckingMode import groovy.util.logging.Log4j2 as Log import whelk.JsonLd import whelk.Whelk +import whelk.component.ElasticSearch import whelk.exception.InvalidQueryException import whelk.util.DocumentUtil import whelk.util.Unicode import java.util.function.Function +import static whelk.component.ElasticSearch.SystemFields.CHIP_STR +import static whelk.component.ElasticSearch.SystemFields.LINKS +import static whelk.component.ElasticSearch.SystemFields.SORT_KEY_BY_LANG import static whelk.component.ElasticSearch.flattenedLangMapKey import static whelk.util.Jackson.mapper import static whelk.util.Unicode.stripPrefix @@ -51,8 +55,8 @@ class ESQuery { private static final String FILTERED_AGG_NAME = 'a' private static final String NESTED_AGG_NAME = 'n' - public static final String SPELL_CHECK_FIELD = '_chipStr.trigram' - private static final String SPELL_CHECK_FIELD_REVERSE = '_chipStr.reverse' + public static final String SPELL_CHECK_FIELD = CHIP_STR + '.trigram' + private static final String SPELL_CHECK_FIELD_REVERSE = CHIP_STR + '.reverse' private static final Map recordsOverCacheRecordsBoost = [ 'bool': ['should': [ @@ -186,7 +190,7 @@ class ESQuery { } if (queryParameters.containsKey('o')) { - queryParameters.put('_links', queryParameters.get('o')) + queryParameters.put(LINKS, queryParameters.get('o')) } q = Unicode.normalizeForSearch(getQueryString(queryParameters)) @@ -271,7 +275,7 @@ class ESQuery { 'bool': [ 'must' : [ 'prefix': [ - ("_sortKeyByLang.${suggest}.keyword".toString()): [ + ("${SORT_KEY_BY_LANG}.${suggest}.keyword".toString()): [ 'value': q ] ] @@ -279,7 +283,7 @@ class ESQuery { ] ], 'sort' : [ - ("_sortKeyByLang.${suggest}.keyword".toString()): 'asc' + ("${SORT_KEY_BY_LANG}.${suggest}.keyword".toString()): 'asc' ] ] } else { @@ -718,7 +722,7 @@ class ESQuery { parameters.each { String key, value -> if (key == 'p') { value.each { - p.put(it, parameters['_links']) + p.put(it, parameters[LINKS]) } } else if (key.startsWith(OR_PREFIX)) { or.put(key.substring(OR_PREFIX.size()), value) diff --git a/whelk-core/src/main/groovy/whelk/search/ElasticFind.groovy b/whelk-core/src/main/groovy/whelk/search/ElasticFind.groovy index f95644a435..b68d961260 100644 --- a/whelk-core/src/main/groovy/whelk/search/ElasticFind.groovy +++ b/whelk-core/src/main/groovy/whelk/search/ElasticFind.groovy @@ -3,6 +3,8 @@ package whelk.search import groovy.transform.CompileStatic import whelk.component.ElasticSearch +import static whelk.component.ElasticSearch.SystemFields.ES_ID + @CompileStatic class ElasticFind { private static final int PAGE_SIZE = 100 @@ -96,7 +98,7 @@ class ElasticFind { p.put("_offset", [Integer.toString(offset)] as String[]) p.put("_limit", [Integer.toString(PAGE_SIZE)] as String[]) - p.putIfAbsent("_sort", ["_es_id"] as String[]) + p.putIfAbsent("_sort", [ES_ID] as String[]) return p } diff --git a/whelk-core/src/main/groovy/whelk/search2/ESSettings.java b/whelk-core/src/main/groovy/whelk/search2/ESSettings.java index 635a373174..2d491992ff 100644 --- a/whelk-core/src/main/groovy/whelk/search2/ESSettings.java +++ b/whelk-core/src/main/groovy/whelk/search2/ESSettings.java @@ -13,30 +13,22 @@ import java.util.stream.Stream; import static whelk.JsonLd.SEARCH_KEY; -import static whelk.component.ElasticSearch.CARD_STR; -import static whelk.component.ElasticSearch.CHIP_STR; -import static whelk.component.ElasticSearch.SEARCH_CARD_STR; -import static whelk.component.ElasticSearch.TOP_STR; +import static whelk.component.ElasticSearch.SystemFields.FLATTENED_LANG_MAP_PREFIX; +import static whelk.component.ElasticSearch.SystemFields.CARD_STR; +import static whelk.component.ElasticSearch.SystemFields.CHIP_STR; +import static whelk.component.ElasticSearch.SystemFields.ES_ID; +import static whelk.component.ElasticSearch.SystemFields.IDS; +import static whelk.component.ElasticSearch.SystemFields.LINKS; +import static whelk.component.ElasticSearch.SystemFields.OUTER_EMBELLISHMENTS; +import static whelk.component.ElasticSearch.SystemFields.SEARCH_CARD_STR; +import static whelk.component.ElasticSearch.SystemFields.SORT_KEY_BY_LANG; +import static whelk.component.ElasticSearch.SystemFields.TOP_STR; import static whelk.search2.QueryUtil.matchAny; import static whelk.util.Jackson.mapper; public class ESSettings { private static final String BOOST_SETTINGS_FILE = "libris_search_boost.json"; - private static final List SYSTEM_SOURCE_EXCLUDES = List.of( - "*.__*", - "*." + SEARCH_KEY, - "_es_id", - "_links", - "_outerEmbellishments", - "_ids", - "_sortKeyByLang", - TOP_STR, - CHIP_STR, - CARD_STR, - SEARCH_CARD_STR - ); - private EsMappings mappings; private final Boost boost; private final List sourceExcludes; @@ -94,9 +86,25 @@ public Boost loadBoostSettings() { } private List loadSourceExcludesSettings() { + var systemSourceExcludes = List.of( + ES_ID, + LINKS, + OUTER_EMBELLISHMENTS, + SORT_KEY_BY_LANG, + + IDS, + TOP_STR, + CHIP_STR, + CARD_STR, + SEARCH_CARD_STR, + + "*." + FLATTENED_LANG_MAP_PREFIX + "*", + "*." + SEARCH_KEY + ); + Map settings = toMap(Boost.class.getClassLoader().getResourceAsStream(BOOST_SETTINGS_FILE)); return Stream.concat( - SYSTEM_SOURCE_EXCLUDES.stream(), + systemSourceExcludes.stream(), getAsStream(settings, "source_excludes").map(String.class::cast) ).toList(); } diff --git a/whelk-core/src/main/groovy/whelk/search2/ObjectQuery.java b/whelk-core/src/main/groovy/whelk/search2/ObjectQuery.java index 4785bd8978..acabee68ce 100644 --- a/whelk-core/src/main/groovy/whelk/search2/ObjectQuery.java +++ b/whelk-core/src/main/groovy/whelk/search2/ObjectQuery.java @@ -2,6 +2,7 @@ import whelk.JsonLd; import whelk.Whelk; +import whelk.component.ElasticSearch; import whelk.exception.InvalidQueryException; import whelk.search2.querytree.And; import whelk.search2.querytree.Condition; @@ -28,6 +29,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static whelk.component.ElasticSearch.SystemFields.LINKS; import static whelk.search2.QueryParams.ApiParams.CUSTOM_SITE_FILTER; import static whelk.search2.QueryParams.ApiParams.OBJECT; import static whelk.search2.QueryParams.ApiParams.PREDICATES; @@ -128,7 +130,7 @@ protected List> predicateLinks() { } private Condition objectFilter() { - return new Condition("_links", Operator.EQUALS, new Term(object.iri())); + return new Condition(LINKS, Operator.EQUALS, new Term(object.iri())); } private Map getPAggQuery(Map> predicateToSubjectTypes) { diff --git a/whelk-core/src/main/groovy/whelk/search2/Query.java b/whelk-core/src/main/groovy/whelk/search2/Query.java index 3360bc22b6..fe20d0e13f 100644 --- a/whelk-core/src/main/groovy/whelk/search2/Query.java +++ b/whelk-core/src/main/groovy/whelk/search2/Query.java @@ -3,6 +3,7 @@ import com.google.common.base.Predicates; import whelk.JsonLd; import whelk.Whelk; +import whelk.component.ElasticSearch; import whelk.exception.InvalidQueryException; import whelk.search2.querytree.And; import whelk.search2.querytree.Condition; @@ -41,6 +42,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static whelk.component.ElasticSearch.SystemFields.ES_ID; import static whelk.component.ElasticSearch.flattenedLangMapKey; import static whelk.search2.EsMappings.FOUR_DIGITS_KEYWORD_SUFFIX; import static whelk.search2.EsMappings.FOUR_DIGITS_SHORT_SUFFIX; @@ -537,7 +539,7 @@ else if (slice.shouldCountTopLevelDocs() && isInsideNested) { "aggs", Map.of( REVERSE_NESTED_AGG_NAME, Map.of( "cardinality", Map.of( - "field", "_es_id" + "field", ES_ID ) ) ) From 10e450c907ad3fdb4a99665e29e5d3c8044f87de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 16 Apr 2026 08:32:43 +0200 Subject: [PATCH 3/3] chore(elastic): remove dead code --- .../main/groovy/whelk/component/ElasticSearch.groovy | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index d8dd3e1d71..f7fe844c2f 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -85,16 +85,6 @@ class ElasticSearch { static final String BULK_CONTENT_TYPE = "application/x-ndjson" static final String SEARCH_TYPE = "dfs_query_then_fetch" - // FIXME: de-KBV/Libris-ify: configurable - static final List REMOVABLE_BASE_URIS = [ - 'http://libris.kb.se/', - 'https://libris.kb.se/', - 'http://id.kb.se/vocab/', - 'https://id.kb.se/vocab/', - 'http://id.kb.se/', - 'https://id.kb.se/', - ] - public int maxResultWindow = 10000 // Elasticsearch default (fallback value) public int maxTermsCount = 65536 // Elasticsearch default (fallback value)