Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions librisxl-tools/elasticsearch/libris_search_boost.json
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@
}
],
"source_excludes": [
"*_4_digits_*",
"@reverse.instanceOf.@reverse.itemOf.hasComponent",
"@reverse.instanceOf.@reverse.itemOf.itemOf",
"@reverse.instanceOf.@reverse.itemOf.librissearch:itemNote",
Expand Down
5 changes: 3 additions & 2 deletions sru/src/main/java/whelk/sru/servlet/XSearchServlet.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
import static whelk.JsonLd.TYPE_KEY;
import static whelk.JsonLd.WORK_KEY;
import static whelk.JsonLd.asList;
import static whelk.component.ElasticSearch.SystemFields.SORT_KEY_BY_LANG;
import static whelk.util.DocumentUtil.getAtPath;

/**
Expand Down Expand Up @@ -110,8 +111,8 @@ public class XSearchServlet extends WhelkHttpServlet {

private static final Map<String, String> ORDER = Map.of(
// "rank" is default
"alphabetical", "_sortKeyByLang.sv",
"-alphabetical", "-_sortKeyByLang.sv",
"alphabetical", SORT_KEY_BY_LANG + ".sv",
"-alphabetical", "-" + SORT_KEY_BY_LANG + ".sv",
"chronological", "-publication.year", // reverse of XL
"-chronological", "publication.year"
);
Expand Down
85 changes: 49 additions & 36 deletions whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,16 @@ import static whelk.JsonLd.REVERSE_KEY
import static whelk.JsonLd.THING_KEY
import static whelk.JsonLd.TYPE_KEY
import static whelk.JsonLd.asList
import static whelk.component.ElasticSearch.SystemFields.CARD_STR
import static whelk.component.ElasticSearch.SystemFields.CHIP_STR
import static whelk.component.ElasticSearch.SystemFields.ES_ID
import static whelk.component.ElasticSearch.SystemFields.FLATTENED_LANG_MAP_PREFIX
import static whelk.component.ElasticSearch.SystemFields.IDS
import static whelk.component.ElasticSearch.SystemFields.LINKS
import static whelk.component.ElasticSearch.SystemFields.OUTER_EMBELLISHMENTS
import static whelk.component.ElasticSearch.SystemFields.SEARCH_CARD_STR
import static whelk.component.ElasticSearch.SystemFields.SORT_KEY_BY_LANG
import static whelk.component.ElasticSearch.SystemFields.TOP_STR
import static whelk.exception.UnexpectedHttpStatusException.isBadRequest
import static whelk.exception.UnexpectedHttpStatusException.isNotFound
import static whelk.util.FresnelUtil.Options.NO_FALLBACK
Expand All @@ -42,19 +52,39 @@ import static whelk.util.Jackson.mapper

@Log
class ElasticSearch {

/**
 * Names of the system-internal fields that the indexer adds to each search document.
 *
 * The fields are declared explicitly {@code public} because they are statically imported
 * as plain constants from Java classes as well as from Groovy classes.
 */
static class SystemFields {

    // --- Search-string keys (all members of SEARCH_STRINGS, together with JsonLd.SEARCH_KEY) ---

    public static final String TOP_STR = '_topStr'
    /** Also the base of the spell-check fields ({@code .trigram} / {@code .reverse}). */
    public static final String CHIP_STR = '_chipStr'
    public static final String CARD_STR = '_cardStr'
    public static final String SEARCH_CARD_STR = '_searchCardStr'

    // --- Identity and link bookkeeping ---

    /**
     * Copy of the Elasticsearch document id, stored as an ordinary field.
     *
     * Up to ES 7.8 the built-in {@code _id} field could be used for aggregations and sorting,
     * although that was discouraged for performance reasons. Such use was deprecated in 7.9
     * and is no longer supported since 8.x, so a separate field is used instead, as advised in
     * https://www.elastic.co/guide/en/elasticsearch/reference/8.8/mapping-id-field.html
     */
    public static final String ES_ID = '_es_id'

    /** Holds the result of collectIds(...) for the embellished graph. */
    public static final String IDS = '_ids'

    /** Holds the document's links; queried (together with outer embellishments) by getAffectedIds. */
    public static final String LINKS = '_links'

    /** Holds the embellishments of the document minus its links. */
    public static final String OUTER_EMBELLISHMENTS = '_outerEmbellishments'

    // --- Sorting and language maps ---

    /** Holds the result of buildSortKeyByLang(...); addressed per language, e.g. with a ".sv" suffix. */
    public static final String SORT_KEY_BY_LANG = '_sortKeyByLang'

    /** Prefix that flattenedLangMapKey(key) prepends to a key. */
    public static final String FLATTENED_LANG_MAP_PREFIX = '__'
}

// Keys that identify "search string" values in a search card: while the card is
// traversed during indexing, a path is matched when its last element is one of
// these keys. NOTE(review): what is done with a matched value is not visible in
// this excerpt — confirm against the traversal closure.
private static final Set<String> SEARCH_STRINGS = [
JsonLd.SEARCH_KEY,
TOP_STR,
CHIP_STR,
CARD_STR,
SEARCH_CARD_STR
] as Set

static final String BULK_CONTENT_TYPE = "application/x-ndjson"
static final String SEARCH_TYPE = "dfs_query_then_fetch"

// FIXME: de-KBV/Libris-ify: configurable
static final List<String> REMOVABLE_BASE_URIS = [
'http://libris.kb.se/',
'https://libris.kb.se/',
'http://id.kb.se/vocab/',
'https://id.kb.se/vocab/',
'http://id.kb.se/',
'https://id.kb.se/',
]

public int maxResultWindow = 10000 // Elasticsearch default (fallback value)
public int maxTermsCount = 65536 // Elasticsearch default (fallback value)

Expand Down Expand Up @@ -88,19 +118,6 @@ class ElasticSearch {
)
}

public static final String TOP_STR = '_topStr'
public static final String CHIP_STR = '_chipStr'
public static final String CARD_STR = '_cardStr'
public static final String SEARCH_CARD_STR = '_searchCardStr'

private static final Set<String> SEARCH_STRINGS = [
JsonLd.SEARCH_KEY,
TOP_STR,
CHIP_STR,
CARD_STR,
SEARCH_CARD_STR
] as Set

ElasticSearch(Properties props, JsonLd jsonLd) {
this(
props.getProperty("elasticHost"),
Expand Down Expand Up @@ -595,8 +612,8 @@ class ElasticSearch {

Map searchCard = JsonLd.frame(thingId, copy.data)

searchCard['_links'] = links
searchCard['_outerEmbellishments'] = copy.getEmbellishments() - links
searchCard[LINKS] = links
searchCard[OUTER_EMBELLISHMENTS] = copy.getEmbellishments() - links

Map<String, Long> incomingLinkCountByRelation = whelk.getStorage().getIncomingLinkCountByIdAndRelation(stripHash(copy.getShortId()))
var totalItems = incomingLinkCountByRelation.values().sum(0)
Expand All @@ -617,7 +634,7 @@ class ElasticSearch {
]

try {
searchCard['_sortKeyByLang'] = buildSortKeyByLang(searchCard, whelk)
searchCard[SORT_KEY_BY_LANG] = buildSortKeyByLang(searchCard, whelk)
} catch (Exception e) {
log.error("Couldn't create sort key for {}: {}", document.shortId, e, e)
}
Expand All @@ -630,7 +647,7 @@ class ElasticSearch {
log.error("Couldn't create search fields for {}: {}", document.shortId, e, e)
}

searchCard['_ids'] = collectIds(embellishedGraph, integralIds)
searchCard[IDS] = collectIds(embellishedGraph, integralIds)

DocumentUtil.traverse(searchCard) { value, path ->
if (path && SEARCH_STRINGS.contains(path.last())) {
Expand Down Expand Up @@ -695,11 +712,7 @@ class ElasticSearch {
return DocumentUtil.NOP
}

// In ES up until 7.8 we could use the _id field for aggregations and sorting, but it was discouraged
// for performance reasons. In 7.9 such use was deprecated, and since 8.x it's no longer supported, so
// we follow the advice and use a separate field.
// (https://www.elastic.co/guide/en/elasticsearch/reference/8.8/mapping-id-field.html).
searchCard["_es_id"] = toElasticId(copy.getShortId())
searchCard[ES_ID] = toElasticId(copy.getShortId())

if (log.isTraceEnabled()) {
log.trace("Framed data: ${searchCard}")
Expand All @@ -710,7 +723,7 @@ class ElasticSearch {

@CompileStatic
static String flattenedLangMapKey(String key) {
return '__' + key
return FLATTENED_LANG_MAP_PREFIX + key
}

private static Set<String> collectIds(List embellishedGraph, Collection<String> integralIds) {
Expand Down Expand Up @@ -939,8 +952,8 @@ class ElasticSearch {
* @return an Iterable of system IDs.
*/
Iterable<String> getAffectedIds(Collection<String> iris) {
def t1 = iris.collect {['term': ['_links': ['value': it]]]}
def t2 = iris.collect {['term': ['_outerEmbellishments': ['value': it]]]}
def t1 = iris.collect {['term': [(LINKS): ['value': it]]]}
def t2 = iris.collect {['term': [(OUTER_EMBELLISHMENTS): ['value': it]]]}
Map query = [
'bool': ['should': t1 + t2 ]
]
Expand Down Expand Up @@ -1038,7 +1051,7 @@ class ElasticSearch {
private abstract class Scroll<T> implements Iterator<T> {
final int FETCH_SIZE = 500

protected final List SORT = [['_es_id': 'asc']]
protected final List SORT = [[(ES_ID): 'asc']]
protected final List FILTER_PATH = ['took', 'hits.hits.sort', 'pit_id', 'hits.total.value']

Iterator<T> fetchedItems
Expand Down
16 changes: 10 additions & 6 deletions whelk-core/src/main/groovy/whelk/search/ESQuery.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@ import groovy.transform.TypeCheckingMode
import groovy.util.logging.Log4j2 as Log
import whelk.JsonLd
import whelk.Whelk
import whelk.component.ElasticSearch
import whelk.exception.InvalidQueryException
import whelk.util.DocumentUtil
import whelk.util.Unicode

import java.util.function.Function

import static whelk.component.ElasticSearch.SystemFields.CHIP_STR
import static whelk.component.ElasticSearch.SystemFields.LINKS
import static whelk.component.ElasticSearch.SystemFields.SORT_KEY_BY_LANG
import static whelk.component.ElasticSearch.flattenedLangMapKey
import static whelk.util.Jackson.mapper
import static whelk.util.Unicode.stripPrefix
Expand Down Expand Up @@ -51,8 +55,8 @@ class ESQuery {
private static final String FILTERED_AGG_NAME = 'a'
private static final String NESTED_AGG_NAME = 'n'

public static final String SPELL_CHECK_FIELD = '_chipStr.trigram'
private static final String SPELL_CHECK_FIELD_REVERSE = '_chipStr.reverse'
public static final String SPELL_CHECK_FIELD = CHIP_STR + '.trigram'
private static final String SPELL_CHECK_FIELD_REVERSE = CHIP_STR + '.reverse'

private static final Map recordsOverCacheRecordsBoost = [
'bool': ['should': [
Expand Down Expand Up @@ -186,7 +190,7 @@ class ESQuery {
}

if (queryParameters.containsKey('o')) {
queryParameters.put('_links', queryParameters.get('o'))
queryParameters.put(LINKS, queryParameters.get('o'))
}

q = Unicode.normalizeForSearch(getQueryString(queryParameters))
Expand Down Expand Up @@ -271,15 +275,15 @@ class ESQuery {
'bool': [
'must' : [
'prefix': [
("_sortKeyByLang.${suggest}.keyword".toString()): [
("${SORT_KEY_BY_LANG}.${suggest}.keyword".toString()): [
'value': q
]
]
]
]
],
'sort' : [
("_sortKeyByLang.${suggest}.keyword".toString()): 'asc'
("${SORT_KEY_BY_LANG}.${suggest}.keyword".toString()): 'asc'
]
]
} else {
Expand Down Expand Up @@ -718,7 +722,7 @@ class ESQuery {
parameters.each { String key, value ->
if (key == 'p') {
value.each {
p.put(it, parameters['_links'])
p.put(it, parameters[LINKS])
}
} else if (key.startsWith(OR_PREFIX)) {
or.put(key.substring(OR_PREFIX.size()), value)
Expand Down
4 changes: 3 additions & 1 deletion whelk-core/src/main/groovy/whelk/search/ElasticFind.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package whelk.search
import groovy.transform.CompileStatic
import whelk.component.ElasticSearch

import static whelk.component.ElasticSearch.SystemFields.ES_ID

@CompileStatic
class ElasticFind {
private static final int PAGE_SIZE = 100
Expand Down Expand Up @@ -96,7 +98,7 @@ class ElasticFind {
p.put("_offset", [Integer.toString(offset)] as String[])
p.put("_limit", [Integer.toString(PAGE_SIZE)] as String[])

p.putIfAbsent("_sort", ["_es_id"] as String[])
p.putIfAbsent("_sort", [ES_ID] as String[])

return p
}
Expand Down
32 changes: 31 additions & 1 deletion whelk-core/src/main/groovy/whelk/search2/ESSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,17 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static whelk.JsonLd.SEARCH_KEY;
import static whelk.component.ElasticSearch.SystemFields.FLATTENED_LANG_MAP_PREFIX;
import static whelk.component.ElasticSearch.SystemFields.CARD_STR;
import static whelk.component.ElasticSearch.SystemFields.CHIP_STR;
import static whelk.component.ElasticSearch.SystemFields.ES_ID;
import static whelk.component.ElasticSearch.SystemFields.IDS;
import static whelk.component.ElasticSearch.SystemFields.LINKS;
import static whelk.component.ElasticSearch.SystemFields.OUTER_EMBELLISHMENTS;
import static whelk.component.ElasticSearch.SystemFields.SEARCH_CARD_STR;
import static whelk.component.ElasticSearch.SystemFields.SORT_KEY_BY_LANG;
import static whelk.component.ElasticSearch.SystemFields.TOP_STR;
import static whelk.search2.QueryUtil.matchAny;
import static whelk.util.Jackson.mapper;

Expand Down Expand Up @@ -75,8 +86,27 @@ public Boost loadBoostSettings() {
}

private List<String> loadSourceExcludesSettings() {
var systemSourceExcludes = List.of(
ES_ID,
LINKS,
OUTER_EMBELLISHMENTS,
SORT_KEY_BY_LANG,

IDS,
TOP_STR,
CHIP_STR,
CARD_STR,
SEARCH_CARD_STR,

"*." + FLATTENED_LANG_MAP_PREFIX + "*",
"*." + SEARCH_KEY
);

Map<?, ?> settings = toMap(Boost.class.getClassLoader().getResourceAsStream(BOOST_SETTINGS_FILE));
return getAsStream(settings, "source_excludes").map(String.class::cast).toList();
return Stream.concat(
systemSourceExcludes.stream(),
getAsStream(settings, "source_excludes").map(String.class::cast)
).toList();
}

public static Boost loadBoostSettings(String json) {
Expand Down
4 changes: 3 additions & 1 deletion whelk-core/src/main/groovy/whelk/search2/ObjectQuery.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import whelk.JsonLd;
import whelk.Whelk;
import whelk.component.ElasticSearch;
import whelk.exception.InvalidQueryException;
import whelk.search2.querytree.And;
import whelk.search2.querytree.Condition;
Expand All @@ -28,6 +29,7 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static whelk.component.ElasticSearch.SystemFields.LINKS;
import static whelk.search2.QueryParams.ApiParams.CUSTOM_SITE_FILTER;
import static whelk.search2.QueryParams.ApiParams.OBJECT;
import static whelk.search2.QueryParams.ApiParams.PREDICATES;
Expand Down Expand Up @@ -128,7 +130,7 @@ protected List<Map<String, Object>> predicateLinks() {
}

private Condition objectFilter() {
return new Condition("_links", Operator.EQUALS, new Term(object.iri()));
return new Condition(LINKS, Operator.EQUALS, new Term(object.iri()));
}

private Map<String, Object> getPAggQuery(Map<Property, List<String>> predicateToSubjectTypes) {
Expand Down
16 changes: 4 additions & 12 deletions whelk-core/src/main/groovy/whelk/search2/Query.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.google.common.base.Predicates;
import whelk.JsonLd;
import whelk.Whelk;
import whelk.component.ElasticSearch;
import whelk.exception.InvalidQueryException;
import whelk.search2.querytree.And;
import whelk.search2.querytree.Condition;
Expand Down Expand Up @@ -41,6 +42,7 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static whelk.component.ElasticSearch.SystemFields.ES_ID;
import static whelk.component.ElasticSearch.flattenedLangMapKey;
import static whelk.search2.EsMappings.FOUR_DIGITS_KEYWORD_SUFFIX;
import static whelk.search2.EsMappings.FOUR_DIGITS_SHORT_SUFFIX;
Expand Down Expand Up @@ -403,17 +405,7 @@ private Map<String, Object> applyLens(Map<String, Object> framedThing) {
}

private static Map<String, Object> removeSystemInternalProperties(Map<String, Object> framedThing) {
DocumentUtil.traverse(framedThing, (value, path) -> {
if (value instanceof Map<?, ?> m) {
m.keySet().removeIf(k ->
k instanceof String key
&& key.startsWith("_")
&& !JsonLd.Platform.CATEGORY_BY_COLLECTION.equals(key)
);
}

return DocumentUtil.NOP;
});
framedThing.remove("_id");
return framedThing;
}

Expand Down Expand Up @@ -547,7 +539,7 @@ else if (slice.shouldCountTopLevelDocs() && isInsideNested) {
"aggs", Map.of(
REVERSE_NESTED_AGG_NAME, Map.of(
"cardinality", Map.of(
"field", "_es_id"
"field", ES_ID
)
)
)
Expand Down
Loading