diff --git a/importers/src/main/groovy/whelk/reindexer/ElasticReindexer.groovy b/importers/src/main/groovy/whelk/reindexer/ElasticReindexer.groovy index 2ca9a8f426..19e56bc197 100644 --- a/importers/src/main/groovy/whelk/reindexer/ElasticReindexer.groovy +++ b/importers/src/main/groovy/whelk/reindexer/ElasticReindexer.groovy @@ -110,7 +110,9 @@ class ElasticReindexer { threadPool.awaitAllAndShutdown() log.info("Done! $counter documents reindexed in ${(System.currentTimeMillis() - startTime) / 1000} seconds.") - log.info("New number of mappings/fields in ES: ${whelk.elastic.getFieldCount()}") + for (var ix : whelk.elastic.allIndexNames()) { + log.info("New number of mappings/fields in ES: ${ix} : ${whelk.elastic.getFieldCount(ix)}") + } whelk.storage.logStats() } catch (Throwable e) { log.error("Reindex failed with: ${e}", e) diff --git a/whelk-core/src/main/groovy/whelk/Document.groovy b/whelk-core/src/main/groovy/whelk/Document.groovy index 70a4611a5d..487a71c072 100644 --- a/whelk-core/src/main/groovy/whelk/Document.groovy +++ b/whelk-core/src/main/groovy/whelk/Document.groovy @@ -2,6 +2,7 @@ package whelk import groovy.transform.CompileStatic import groovy.util.logging.Log4j2 as Log +import whelk.exception.WhelkRuntimeException import whelk.util.DocumentUtil import whelk.util.LegacyIntegrationTools import whelk.util.PropertyLoader @@ -14,6 +15,7 @@ import java.time.ZonedDateTime import java.time.format.DateTimeFormatter import java.util.function.Predicate +import static whelk.JsonLd.TYPE_KEY import static whelk.util.Jackson.mapper /** @@ -54,6 +56,7 @@ class Document { static final List recordPath = ["@graph", 0] static final List recordIdPath = ["@graph", 0, "@id"] static final List workIdPath = ["@graph", 1, "instanceOf", "@id"] + static final List workTypePath = ["@graph", 1, "instanceOf", "@type"] static final List thingMetaPath = ["@graph", 1, "meta", "@id"] static final List recordSameAsPath = ["@graph", 0, "sameAs"] static final List recordTypedIDsPath = ["@graph", 0, "identifiedBy"] @@ -929,6 +932,22 @@ class Document { return getRecordIdentifiers().first().endsWith("#work-record") } + // FIXME + // All these "virtual record" methods are hardcoded for blank Works + String singleThingTypeOrVirtualThingType() { + var types = isVirtual() + ? _get(workTypePath, data) + : _get(thingTypePath, data) + + if (types instanceof String) { + return (String) types + } + else if (types instanceof List) { + return (String) ((List) types).getFirst() + } + throw new WhelkRuntimeException("Unexpected ${TYPE_KEY}: ${types}") + } + // All these "virtual record" methods are hardcoded for blank Works void centerOnVirtualMainEntity() { if (!isVirtual()) { diff --git a/whelk-core/src/main/groovy/whelk/JsonLd.groovy b/whelk-core/src/main/groovy/whelk/JsonLd.groovy index 9f3ba50cb1..59ab7946c3 100644 --- a/whelk-core/src/main/groovy/whelk/JsonLd.groovy +++ b/whelk-core/src/main/groovy/whelk/JsonLd.groovy @@ -743,6 +743,12 @@ class JsonLd { return superTermOf } + /** + * + * @param type + * @param baseType + * @return true if types is a subclass of baseType OR type == baseType + */ boolean isSubClassOf(String type, String baseType) { if (!type) { return false diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index f56b855643..aa3121bfeb 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -78,6 +78,12 @@ class Whelk { boolean skipIndexDependers = false boolean skipSparql = false + enum EsMode { + ELASTIC_ENABLED, + ELASTIC_DISABLED + } + private EsMode esMode + // useCache may be set to true only when doing initial imports (temporary processes with the rest of Libris down). // Any other use of this results in a "local" cache, which will not be invalidated when data changes elsewhere, // resulting in potential serving of stale data. @@ -97,15 +103,14 @@ class Whelk { } static Whelk createLoadedSearchWhelk(Properties configuration, boolean useCache = false) { - Whelk whelk = new Whelk(configuration, useCache) + Whelk whelk = new Whelk(configuration, EsMode.ELASTIC_ENABLED, useCache) whelk.configureAndLoad(configuration) return whelk } - Whelk(PostgreSQLComponent pg, ElasticSearch es) { + Whelk(PostgreSQLComponent pg, EsMode esMode) { this(pg) - this.elastic = es - log.info("Using index: $elastic") + this.esMode = esMode } Whelk(PostgreSQLComponent pg) { @@ -114,8 +119,8 @@ class Whelk { log.info("Started with storage: $storage") } - Whelk(Properties conf, useCache = false) { - this(useCache ? new CachingPostgreSQLComponent(conf) : new PostgreSQLComponent(conf), new ElasticSearch(conf)) + Whelk(Properties conf, EsMode esMode, useCache = false) { + this(useCache ? new CachingPostgreSQLComponent(conf) : new PostgreSQLComponent(conf), esMode) } private void configureAndLoad(Properties configuration) { @@ -150,6 +155,11 @@ class Whelk { loadCoreData(systemContextUri) + if (this.esMode == EsMode.ELASTIC_ENABLED) { + this.elastic = new ElasticSearch(configuration, jsonld) + elasticFind = new ElasticFind(new ESQuery(this)) + } + sparqlUpdater = SparqlUpdater.build(storage, jsonld.context, configuration) sparqlQueryClient = new SparqlQueryClient(configuration.getProperty('sparqlEndpoint', null), jsonld); } @@ -241,9 +251,6 @@ class Whelk { void setJsonld(JsonLd jsonld) { this.jsonld = jsonld storage.setJsonld(jsonld) - if (elastic) { - elasticFind = new ElasticFind(new ESQuery(this)) - } initDocumentNormalizers() this.fresnelUtil = new FresnelUtil(jsonld) } @@ -366,7 +373,8 @@ class Whelk { removedLinks.each { link -> String id = storage.getSystemIdByIri(link.iri) if (id) { - elastic.decrementReverseLinks(id, link.relation) + Document doc = storage.load(id) + elastic.decrementReverseLinks(doc, link.relation) } } @@ -386,7 +394,7 @@ class Whelk { reindexAffectedReverseIntegral(doc) } else { // just update link counter - elastic.incrementReverseLinks(id, link.relation) + elastic.incrementReverseLinks(doc, link.relation) } } } diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index 64d7862e89..74bc25795c 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -1,6 +1,7 @@ package whelk.component import groovy.json.JsonOutput +import groovy.transform.Memoized import groovy.util.logging.Log4j2 as Log import org.apache.commons.codec.binary.Base64 import se.kb.libris.utils.isbn.ConvertException @@ -57,7 +58,9 @@ class ElasticSearch { public int maxResultWindow = 10000 // Elasticsearch default (fallback value) public int maxTermsCount = 65536 // Elasticsearch default (fallback value) - String defaultIndex = null + String mainIndex = null + Map baseTypeToSubIndex = new HashMap<>(); + Map subIndexToBaseType = new HashMap<>(); private List elasticHosts private String elasticUser private String elasticPassword @@ -65,9 +68,12 @@ class ElasticSearch { private ElasticClient bulkClient private boolean isPitApiAvailable = false private static final int ES_LOG_MIN_DURATION = 2000 // Only log queries taking at least this amount of milliseconds + private static final String SUB_IX_SEPARATOR = '-' private final Queue indexingRetryQueue = new LinkedBlockingQueue<>() + private final JsonLd jsonLd + private static final class DerivedLenses { public static final FresnelUtil.Lens CARD_ONLY = new FresnelUtil.Lens( FresnelUtil.CARD_CHAIN, @@ -95,20 +101,33 @@ class ElasticSearch { SEARCH_CARD_STR ] as Set - ElasticSearch(Properties props) { + ElasticSearch(Properties props, JsonLd jsonLd) { this( props.getProperty("elasticHost"), props.getProperty("elasticIndex"), + Optional.ofNullable(props.getProperty("elasticSubIndexTypes")) + .map { it.split(",") as List } + .map { it.collect(s -> s.trim()) } + .orElse(Collections.emptyList()), props.getProperty("elasticUser"), - props.getProperty("elasticPassword") + props.getProperty("elasticPassword"), + jsonLd ) } - ElasticSearch(String elasticHost, String elasticIndex, String elasticUser, String elasticPassword) { + private ElasticSearch( + String elasticHost, + String elasticIndex, + List elasticSubIndexTypes, + String elasticUser, + String elasticPassword, + JsonLd jsonLd) + { this.elasticHosts = getElasticHosts(elasticHost) - this.defaultIndex = elasticIndex + this.mainIndex = elasticIndex this.elasticUser = elasticUser this.elasticPassword = elasticPassword + this.jsonLd = jsonLd client = ElasticClient.withDefaultHttpClient(elasticHosts, elasticUser, elasticPassword) bulkClient = ElasticClient.withBulkHttpClient(elasticHosts, elasticUser, elasticPassword) @@ -124,6 +143,26 @@ class ElasticSearch { }, 60*1000, 10*1000) initSettings() + + // initSettings() waits for ES available. Do this after + for (var type : elasticSubIndexTypes) { + // is this a physical index with a numerical suffix? it is the case when indexing to a new index + var suffix = mainIndex.find('_\\d$') + var base = suffix ? Unicode.stripSuffix(mainIndex, suffix) : mainIndex + // elastic index names must be lowercase + var ix = base + SUB_IX_SEPARATOR + type.toLowerCase() + (suffix ?: '') + + if (indexExists(ix)) { + baseTypeToSubIndex.put(type, ix); + subIndexToBaseType.put(ix, type); + } else { + log.info("Could not find subindex ${ix}. Disabled it.") + } + } + + log.info("Hosts: ${elasticHosts}") + log.info("Index: ${mainIndex}") + log.info("Subindices: ${getSubIndexNames()}") } void initSettings() { @@ -168,18 +207,35 @@ class ElasticSearch { return hosts } - String getIndexName() { defaultIndex } + String getIndexName() { mainIndex } + + Collection getSubIndexNames() { + subIndexToBaseType.keySet() + } + + @Memoized + List allIndexNames() { + [getIndexName()] + getSubIndexNames() + } - /** - * Get ES mappings for associated index - * - */ - Map getMappings() { + String getBaseTypeForSubIndex(String subIndex) { + return subIndexToBaseType[subIndex] + } + + List> getAllMappings() { + allIndexNames().collect{ getMappings(it) } + } + + /** + * Get ES mappings for associated index + * + */ + Map getMappings(String index) { Map response try { - response = client.performRequest('GET', "/${indexName}/_mappings", '') + response = client.performRequest('GET', "/${index}/_mappings", '') } catch (UnexpectedHttpStatusException e) { - log.warn("Got unexpected status code ${e.statusCode} when getting ES mappings: ${e.message}", e) + log.warn("Got unexpected status code ${e.statusCode} when getting ES mappings for ${index}: ${e.message}", e) return [:] } @@ -188,20 +244,37 @@ class ElasticSearch { List keys = response.keySet() as List if (keys.size() == 1 && response[(keys[0])].containsKey('mappings')) { - return response[(keys[0])]['mappings'] + return (Map) response[(keys[0])]['mappings'] } else { - log.warn("Couldn't get mappings from ES index ${indexName}, response was ${response}.") + log.warn("Couldn't get mappings from ES index ${index}, response was ${response}.") return [:] } } + boolean indexExists(String index) { + try { + client.performRequest('GET', "/${index}/_settings", '') + return true + } catch (UnexpectedHttpStatusException e) { + if (e.statusCode == 404) { + return false + } + throw e + } + } + /** * Get ES settings for associated index + * NOTE assumes that all subindices have the same settings */ Map getSettings() { + return getSettings(mainIndex) + } + + Map getSettings(String index) { Map response try { - response = client.performRequest('GET', "/${indexName}/_settings", '') + response = client.performRequest('GET', "/${index}/_settings", '') } catch (UnexpectedHttpStatusException e) { // When ES is starting up there is a time when it accepts connections but cannot yet authenticate // users because the security index is unavailable. This results in a 401 Unauthorized (with the @@ -224,24 +297,41 @@ class ElasticSearch { throw new RuntimeException("Couldn't get settings from ES index ${indexName}, response was ${response}.") } } - - int getFieldCount() { + + int getFieldCount(String index) { Map response try { - response = client.performRequest('GET', "/${indexName}/_field_caps?fields=*", '') + response = client.performRequest('GET', "/${index}/_field_caps?fields=*", '') } catch (Exception e) { - log.warn("Error getting fields from ES: $e", e) + log.warn("Error getting fields from ES for ${index}: $e", e) return -1 } try { return response.fields.size() } catch (Exception e) { - log.warn("Error parsing response when getting number of fields from ES: $e", e) + log.warn("Error parsing response when getting number of fields from ES for ${index}: $e", e) return -1 } } + String getIndexForDoc(Document doc) { + return getIndexForType(doc.singleThingTypeOrVirtualThingType()) + } + + @Memoized + String getIndexForType(String type) { + for (var e : subIndexToBaseType.entrySet()) { + var subIndex = e.getKey() + var baseType = e.getValue() + if (jsonLd.isSubClassOf(type, baseType)) { + return subIndex + } + } + + return mainIndex + } + void bulkIndex(Collection docs, Whelk whelk) { if (docs) { String bulkString = docs.findResults{ doc -> @@ -330,7 +420,7 @@ class ElasticSearch { } String createActionRow(Document doc) { - def action = ["index" : [ "_index" : indexName, + def action = ["index" : [ "_index" : getIndexForDoc(doc), "_id" : toElasticId(doc.getShortId()) ]] return mapper.writeValueAsString(action) } @@ -341,10 +431,10 @@ class ElasticSearch { try { Map responseMap = client.performRequest( 'PUT', - "/${indexName}/_doc/${toElasticId(doc.getShortId())}", + "/${getIndexForDoc(doc)}/_doc/${toElasticId(doc.getShortId())}", getShapeForIndex(doc, whelk)) if (log.isDebugEnabled()) { - log.debug("Indexed the document ${doc.getShortId()} as ${indexName}/_doc/${responseMap['_id']} as version ${responseMap['_version']}") + log.debug("Indexed the document ${doc.getShortId()} as ${getIndexForDoc(doc)}/_doc/${responseMap['_id']} as version ${responseMap['_version']}") } } catch (Exception e) { if (!isBadRequest(e)) { @@ -357,15 +447,15 @@ class ElasticSearch { } } - void incrementReverseLinks(String shortId, String relation) { - updateReverseLinkCounter(shortId, relation, 1) + void incrementReverseLinks(Document doc, String relation) { + updateReverseLinkCounter(doc.shortId, relation, 1, getIndexForDoc(doc)) } - void decrementReverseLinks(String shortId, String relation) { - updateReverseLinkCounter(shortId, relation, -1) + void decrementReverseLinks(Document doc, String relation) { + updateReverseLinkCounter(doc.shortId, relation, -1, getIndexForDoc(doc)) } - private void updateReverseLinkCounter(String shortId, String relation, int deltaCount) { + private void updateReverseLinkCounter(String shortId, String relation, int deltaCount, String index) { // An indexed document will always have reverseLinks.totalItems set to an integer, // and reverseLinks.totalItemsByRelation set to a map, but reverseLinks.totalItemsByRelation['foo'] // doesn't necessarily exist at this time; hence the null check before trying to update the link counter. @@ -383,7 +473,7 @@ class ElasticSearch { try { client.performRequest( 'POST', - "/${indexName}/_update/${toElasticId(shortId)}", + "/${index}/_update/${toElasticId(shortId)}", body) } catch (Exception e) { @@ -397,7 +487,7 @@ class ElasticSearch { } else { log.warn("Failed to update reverse link counter ($deltaCount) for $shortId: $e, placing in retry queue.", e) - indexingRetryQueue.add({ -> updateReverseLinkCounter(shortId, relation, deltaCount) }) + indexingRetryQueue.add({ -> updateReverseLinkCounter(shortId, relation, deltaCount, index) }) } } } @@ -409,7 +499,7 @@ class ElasticSearch { def dsl = ["query":["term":["_id":toElasticId(identifier)]]] try { Map responseMap = client.performRequest('POST', - "/${indexName}/_delete_by_query", + "/${allIndexNames().join(',')}/_delete_by_query", JsonOutput.toJson(dsl)) if (log.isDebugEnabled()) { @@ -493,7 +583,6 @@ class ElasticSearch { boolean isVirtualWork = copy.isVirtual() if (isVirtualWork) { copy.centerOnVirtualMainEntity() - } copy.setThingMeta(document.getCompleteId()) List thingIds = copy.getThingIdentifiers() @@ -811,24 +900,24 @@ class ElasticSearch { isnis.findAll{ it.size() == 16 }.collect { Unicode.formatIsni(it) } } - Map multiQuery(List jsonDslList) { + Map multiQuery(List jsonDslList, Collection indexNames = Collections.emptyList()) { return performQuery( jsonDslList.collect { [[:], it].collect { JsonOutput.toJson(it) + '\n' } }.flatten().join(), - getMultiSearchQueryUrl() + getMultiSearchQueryUrl(indexNames) ) } - Map query(Map jsonDsl) { - return performQuery(JsonOutput.toJson(jsonDsl), getQueryUrl()) + Map query(Map jsonDsl, Collection indexNames = Collections.emptyList()) { + return performQuery(JsonOutput.toJson(jsonDsl), getQueryUrl([], indexNames)) } - Map queryIds(Map jsonDsl) { + Map queryIds(Map jsonDsl, Collection indexNames = Collections.emptyList()) { return performQuery( JsonOutput.toJson(jsonDsl), - getQueryUrl(['took','hits.total','hits.hits._id']) + getQueryUrl(['took','hits.total','hits.hits._id'], indexNames) ) } - + /** * Find all other documents that need to be re-indexed because * of changes in linked document(s) @@ -893,18 +982,27 @@ class ElasticSearch { getQueryUrl(filterPath, null) } - private getMultiSearchQueryUrl() { - return getQueryUrl([], indexName, true) + private getMultiSearchQueryUrl(Collection indexNames) { + return getQueryUrl([], indexNames, true) } - private String getQueryUrl(filterPath = [], index = indexName, multiSearch = false) { - def url = (index ? "/${index}" : '') + (multiSearch ? '/_msearch' : '/_search') + "?search_type=$SEARCH_TYPE" + private String getQueryUrl(filterPath = [], Collection indexNames, multiSearch = false) { + boolean noIndex = indexNames == null + var ix = noIndex ? '' : "/${indexString(indexNames)}" + def url = ix + (multiSearch ? '/_msearch' : '/_search') + "?search_type=$SEARCH_TYPE" if (filterPath) { url += "&filter_path=${filterPath.join(',')}" } return url.toString() } + private String indexString(Collection indexNames) { + var ixs = indexNames.isEmpty() ? allIndexNames() : indexNames + ixs.size() == 1 + ? ixs.first() + : new HashSet<>(ixs).join(",") + } + static String toElasticId(String id) { if (id.contains("/")) { return Base64.encodeBase64URLSafeString(id.getBytes("UTF-8")) @@ -947,7 +1045,7 @@ class ElasticSearch { abstract Map nextRequest() String queryPath() { - getQueryUrl(filterPath) + getQueryUrl(filterPath, Collections.emptyList()) } @Override @@ -1064,7 +1162,7 @@ class ElasticSearch { String queryPath() { isPitApiAvailable // point in time is created on index and then index cannot be specified here ? getQueryUrlWithoutIndex(filterPath) - : getQueryUrl(filterPath) + : getQueryUrl(filterPath, Collections.emptyList()) } @Override @@ -1092,10 +1190,11 @@ class ElasticSearch { return request } } - + + // TODO support specifying indices? private String createPointInTime(String keepAlive = "1m") { try { - return performRequest('POST', "/$indexName/_pit?keep_alive=$keepAlive").id + return performRequest('POST', "/${allIndexNames().join(',')}/_pit?keep_alive=$keepAlive").id } catch (Exception e) { log.warn("Failed to create Point In Time: $e") diff --git a/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy b/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy index 060d64599b..fe5ddabad0 100644 --- a/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy +++ b/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy @@ -10,6 +10,8 @@ import whelk.exception.InvalidQueryException import whelk.util.DocumentUtil import whelk.util.Unicode +import java.util.function.Function + import static whelk.component.ElasticSearch.flattenedLangMapKey import static whelk.util.Jackson.mapper import static whelk.util.Unicode.stripPrefix @@ -86,15 +88,17 @@ class ESQuery { void initFieldMappings(Whelk whelk) { if (whelk.elastic) { - Map mappings = whelk.elastic.getMappings() - this.keywordFields = getKeywordFields(mappings) - this.fourDigitShortFields = getFourDigitShortFields(mappings) - this.fourDigitKeywordFields = getFourDigitKeywordFields(mappings) - this.dateFields = getFieldsOfType('date', mappings) - this.nestedFields = getFieldsOfType('nested', mappings) - this.nestedNotInParentFields = nestedFields - getFieldsWithSetting('include_in_parent', true, mappings) - - if (DocumentUtil.getAtPath(mappings, ['properties', '_sortKeyByLang', 'properties', 'sv', 'fields', 'trigram'], null)) { + var allMappings = whelk.elastic.getAllMappings() + var union = (Function f) -> (Set) allMappings.collect{f(it)}.sum() + this.keywordFields = union{ Map m -> getKeywordFields(m) } + this.fourDigitShortFields = union{ Map m -> getFourDigitShortFields(m) } + this.fourDigitKeywordFields = union{ Map m -> getFourDigitKeywordFields(m) } + this.dateFields = union{ Map m -> getFieldsOfType('date', m) } + this.nestedFields = union{ Map m -> getFieldsOfType('nested', m) } + var includeInParent = union{ Map m -> getFieldsWithSetting('include_in_parent', true, m) } + this.nestedNotInParentFields = nestedFields - includeInParent + + if (allMappings.any {DocumentUtil.getAtPath(it, ['properties', '_sortKeyByLang', 'properties', 'sv', 'fields', 'trigram'], null) }) { ENABLE_SPELL_CHECK = true } log.info("ENABLE_SPELL_CHECK = ${ENABLE_SPELL_CHECK}") diff --git a/whelk-core/src/main/groovy/whelk/search2/ESSettings.java b/whelk-core/src/main/groovy/whelk/search2/ESSettings.java index 4491185322..f33e3964c9 100644 --- a/whelk-core/src/main/groovy/whelk/search2/ESSettings.java +++ b/whelk-core/src/main/groovy/whelk/search2/ESSettings.java @@ -24,7 +24,7 @@ public class ESSettings { public ESSettings(Whelk whelk) { if (whelk.elastic != null) { - this.mappings = new EsMappings(whelk.elastic.getMappings()); + this.mappings = new EsMappings(whelk.elastic.getAllMappings()); this.maxItems = whelk.elastic.maxResultWindow; } this.boost = loadBoostSettings(); diff --git a/whelk-core/src/main/groovy/whelk/search2/EsMappings.java b/whelk-core/src/main/groovy/whelk/search2/EsMappings.java index 451bfbd7ef..92e36534f6 100644 --- a/whelk-core/src/main/groovy/whelk/search2/EsMappings.java +++ b/whelk-core/src/main/groovy/whelk/search2/EsMappings.java @@ -3,9 +3,11 @@ import whelk.util.DocumentUtil; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.function.BiPredicate; +import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -28,16 +30,17 @@ public class EsMappings { public static String FOUR_DIGITS_KEYWORD_SUFFIX = "_4_digits_keyword"; public static String FOUR_DIGITS_SHORT_SUFFIX = "_4_digits_short"; - public EsMappings(Map mappings) { - this.keywordSubfieldFields = getKeywordSubfieldFields(mappings); - this.fourDigitsKeywordFields = getFourDigitsKeywordFields(mappings); - this.fourDigitsShortFields = getFourDigitsShortFields(mappings); - this.keywordTypeFields = getFieldsOfType("keyword", mappings); - this.dateTypeFields = getFieldsOfType("date", mappings); - this.nestedTypeFields = getFieldsOfType("nested", mappings); - this.longTypeFields = getFieldsOfType("long", mappings); + public EsMappings(List> mappings) { + this.keywordSubfieldFields = union(mappings, EsMappings::getKeywordSubfieldFields); + this.fourDigitsKeywordFields = union(mappings, EsMappings::getFourDigitsKeywordFields); + this.fourDigitsShortFields = union(mappings, EsMappings::getFourDigitsShortFields); + this.keywordTypeFields = union(mappings, m -> getFieldsOfType("keyword", m)); + this.dateTypeFields = union(mappings, m -> getFieldsOfType("date", m)); + this.nestedTypeFields = union(mappings, m -> getFieldsOfType("nested", m)); + this.longTypeFields = union(mappings, m -> getFieldsOfType("long", m)); this.nestedNotInParentFields = new HashSet<>(nestedTypeFields); - this.nestedNotInParentFields.removeAll(getFieldsWithSetting("include_in_parent", true, mappings)); + var includeInParent = union(mappings, m -> getFieldsWithSetting("include_in_parent", true, m)); + this.nestedNotInParentFields.removeAll(includeInParent); } public boolean hasKeywordSubfield(String fieldPath) { @@ -130,4 +133,11 @@ private static Set getFieldsByCondition(Map mappings, BiPredicate< DocumentUtil.traverse(mappings.get("properties"), visitor); return fields; } + + private static Set union (List mappings, Function> f) { + return mappings.stream() + .map(f) + .reduce(new HashSet<>(), (a,b) -> {a.addAll(b); return a;}); + } + } diff --git a/whelk-core/src/main/groovy/whelk/search2/ObjectQuery.java b/whelk-core/src/main/groovy/whelk/search2/ObjectQuery.java index faf25b2704..a59ee02def 100644 --- a/whelk-core/src/main/groovy/whelk/search2/ObjectQuery.java +++ b/whelk-core/src/main/groovy/whelk/search2/ObjectQuery.java @@ -5,6 +5,7 @@ import whelk.exception.InvalidQueryException; import whelk.search2.querytree.And; import whelk.search2.querytree.Condition; +import whelk.search2.querytree.EsQuery; import whelk.search2.querytree.EsQueryTree; import whelk.search2.querytree.ExpandedQueryTree; import whelk.search2.querytree.Link; @@ -44,7 +45,7 @@ public ObjectQuery(QueryParams queryParams, AppParams appParams, VocabMappings v } @Override - protected Map doGetEsQueryDsl() { + protected EsQuery doGetEsQuery() { JsonLd ld = whelk.getJsonld(); ReducedQueryTree queryTree = (ReducedQueryTree) getFullQueryTree().add(objectFilter()); @@ -84,7 +85,7 @@ protected Map doGetEsQueryDsl() { if (!queryParams.stats.on) { esQueryDsl.put("aggs", getPAggQuery(predicateToSubjectTypes)); - return esQueryDsl; + return new EsQuery(esQueryDsl, Collections.emptyList()); } List subjectTypes = Stream.concat(givenSubjectTypes.stream(), inferredSubjectTypes.stream()).toList(); @@ -92,7 +93,7 @@ protected Map doGetEsQueryDsl() { aggQuery.putAll(getPAggQuery(predicateToSubjectTypes)); esQueryDsl.put("aggs", aggQuery); - return esQueryDsl; + return new EsQuery(esQueryDsl, Collections.emptyList()); } @Override diff --git a/whelk-core/src/main/groovy/whelk/search2/PredicateObjectQuery.java b/whelk-core/src/main/groovy/whelk/search2/PredicateObjectQuery.java index 51089b9cf6..54b5677cbb 100644 --- a/whelk-core/src/main/groovy/whelk/search2/PredicateObjectQuery.java +++ b/whelk-core/src/main/groovy/whelk/search2/PredicateObjectQuery.java @@ -5,6 +5,7 @@ import whelk.exception.InvalidQueryException; import whelk.search2.querytree.And; import whelk.search2.querytree.Condition; +import whelk.search2.querytree.EsQuery; import whelk.search2.querytree.EsQueryTree; import whelk.search2.querytree.ExpandedQueryTree; import whelk.search2.querytree.Node; @@ -15,6 +16,7 @@ import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -28,7 +30,7 @@ public PredicateObjectQuery(QueryParams queryParams, AppParams appParams, VocabM } @Override - protected Map doGetEsQueryDsl() { + protected EsQuery doGetEsQuery() { JsonLd ld = whelk.getJsonld(); Map> predicatesByInferredSubjectType = new HashMap<>(); @@ -63,7 +65,7 @@ protected Map doGetEsQueryDsl() { Map esQueryDsl = buildEsQueryDsl(esQueryTree.getMainQuery()); if (!queryParams.stats.on) { - return esQueryDsl; + return new EsQuery(esQueryDsl, Collections.emptyList()); } Set subjectTypes = Stream.concat(queryTree.getRdfSubjectTypesList().stream(), predicatesByInferredSubjectType.keySet().stream()) @@ -71,7 +73,7 @@ protected Map doGetEsQueryDsl() { var aggQuery = getEsAggQuery(subjectTypes); esQueryDsl.put("aggs", aggQuery); - return esQueryDsl; + return new EsQuery(esQueryDsl, Collections.emptyList()); } private Node predicateObjectFilter(Collection predicates) { diff --git a/whelk-core/src/main/groovy/whelk/search2/Query.java b/whelk-core/src/main/groovy/whelk/search2/Query.java index 0984afb3cc..dea6e7e5bc 100644 --- a/whelk-core/src/main/groovy/whelk/search2/Query.java +++ b/whelk-core/src/main/groovy/whelk/search2/Query.java @@ -7,6 +7,7 @@ import whelk.exception.InvalidQueryException; import whelk.search2.querytree.And; import whelk.search2.querytree.Condition; +import whelk.search2.querytree.EsQuery; import whelk.search2.querytree.EsQueryTree; import whelk.search2.querytree.ExpandedQueryTree; import whelk.search2.querytree.FilterAlias; @@ -18,6 +19,7 @@ import whelk.search2.querytree.QueryTree; import whelk.search2.querytree.ReducedQueryTree; import whelk.search2.querytree.Resource; +import whelk.search2.querytree.Type; import whelk.search2.querytree.Value; import whelk.search2.querytree.YearRange; @@ -65,7 +67,7 @@ public class Query { private final Stats stats; private SelectedFacets selectedFacets; - private Map esQueryDsl; + private EsQuery esQuery; private QueryResult queryResult; private ReducedQueryTree fullQueryTree; @@ -119,18 +121,32 @@ public String findUrl() { return QueryUtil.makeFindUrl(qTree.toQueryString(), queryParams); } - protected Map doGetEsQueryDsl() { + protected EsQuery doGetEsQuery() { JsonLd ld = whelk.getJsonld(); - ExpandedQueryTree expandedQueryTree = getFullQueryTree().expand(ld); + var fullQueryTree = getFullQueryTree(); + + var indexNames = fullQueryTree.getRdfSubjectTypesList().stream().map(whelk.elastic::getIndexForType).toList(); + /* TODO? + // remove type condition that exactly matches subindex content + if (indexNames.size() == 1 && !indexNames.getFirst().equals(whelk.elastic.getBaseIndex())) { + var baseType = whelk.elastic.getBaseTypeForSubIndex(indexNames.getFirst()); + var removeFromTopLevel = new Type(base, whelk.getJsonld()) + ... + } + */ + + ExpandedQueryTree expandedQueryTree = fullQueryTree.expand(ld); ESSettings currentEsSettings = queryParams.boost != null ? esSettings.withBoostSettings(queryParams.boost) : esSettings; if (!queryParams.stats.on) { EsQueryTree esQueryTree = new EsQueryTree(expandedQueryTree, currentEsSettings); - return buildEsQueryDsl(esQueryTree.getMainQuery()); + var esQueryDsl = buildEsQueryDsl(esQueryTree.getMainQuery()); + return new EsQuery(esQueryDsl, Collections.emptyList()); } + EsQueryTree esQueryTree = new EsQueryTree(expandedQueryTree, currentEsSettings, getSelectedFacets()); - Map esQueryDsl = buildEsQueryDsl(esQueryTree.getMainQuery(), esQueryTree.getPostFilter()); + var esQueryDsl = buildEsQueryDsl(esQueryTree.getMainQuery(), esQueryTree.getPostFilter()); esQueryDsl.put("aggs", getEsAggQuery(getFullQueryTree().getRdfSubjectTypesList())); - return esQueryDsl; + return new EsQuery(esQueryDsl, indexNames); } protected Map buildEsQueryDsl(Map mainQuery) { @@ -154,7 +170,7 @@ protected List> predicateLinks() { protected QueryResult getQueryResult() { if (queryResult == null) { - this.queryResult = new QueryResult(doQuery(getEsQueryDsl()), queryParams.debug); + this.queryResult = new QueryResult(doQuery(getEsQuery()), queryParams.debug); } return queryResult; } @@ -215,7 +231,7 @@ protected Map getPartialCollectionView() { view.put("maxItems", esSettings.maxItems()); if (queryParams.debug.contains(QueryParams.Debug.ES_QUERY)) { - view.put(QueryParams.ApiParams.DEBUG, Map.of(QueryParams.Debug.ES_QUERY, getEsQueryDsl())); + view.put(QueryParams.ApiParams.DEBUG, Map.of(QueryParams.Debug.ES_QUERY, getEsQuery().dsl())); } linkLoader.loadChips(); @@ -283,10 +299,10 @@ private List> getSearchMapping() { return mappings; } - private Map doQuery(Object dsl) { - return dsl instanceof List l - ? whelk.elastic.multiQuery(l) - : whelk.elastic.query((Map) dsl); + private Map doQuery(EsQuery esQuery) { + return esQuery.dsl() instanceof List l + ? whelk.elastic.multiQuery(l, esQuery.indexNames()) + : whelk.elastic.query((Map) esQuery.dsl(), esQuery.indexNames()); } private List collectOptionalFilters() { @@ -316,11 +332,11 @@ private static SearchMode getSearchMode(QueryParams queryParams) { return SearchMode.STANDARD_SEARCH; } - private Map getEsQueryDsl() { - if (esQueryDsl == null) { - this.esQueryDsl = doGetEsQueryDsl(); + private EsQuery getEsQuery() { + if (esQuery == null) { + this.esQuery = doGetEsQuery(); } - return esQueryDsl; + return esQuery; } private QueryTree mergeTrees(QueryTree baseTree, List other) { diff --git a/whelk-core/src/main/groovy/whelk/search2/QueryUtil.java b/whelk-core/src/main/groovy/whelk/search2/QueryUtil.java index bd97c275b1..c0de7c1a53 100644 --- a/whelk-core/src/main/groovy/whelk/search2/QueryUtil.java +++ b/whelk-core/src/main/groovy/whelk/search2/QueryUtil.java @@ -125,7 +125,11 @@ public static Map boolWrap(Map m) { } public static Map nestedWrap(String nestedPath, Map query) { - return Map.of("nested", Map.of("path", nestedPath, "query", query)); + return Map.of("nested", Map.of( + "ignore_unmapped", true, // otherwise can fail when searching multiple indices + "path", nestedPath, + "query", query + )); } public static Map matchAny() { diff --git a/whelk-core/src/main/groovy/whelk/search2/SuggestQuery.java b/whelk-core/src/main/groovy/whelk/search2/SuggestQuery.java index e7111e86c6..08f7f6e3cc 100644 --- a/whelk-core/src/main/groovy/whelk/search2/SuggestQuery.java +++ b/whelk-core/src/main/groovy/whelk/search2/SuggestQuery.java @@ -2,9 +2,22 @@ import whelk.Whelk; import whelk.exception.InvalidQueryException; -import whelk.search2.querytree.*; +import whelk.search2.querytree.And; +import whelk.search2.querytree.Condition; +import whelk.search2.querytree.EsQuery; +import whelk.search2.querytree.EsQueryTree; +import whelk.search2.querytree.FreeText; +import whelk.search2.querytree.Link; +import whelk.search2.querytree.Node; +import whelk.search2.querytree.Or; +import whelk.search2.querytree.Property; +import whelk.search2.querytree.QueryTree; +import whelk.search2.querytree.QueryTreeBuilder; +import whelk.search2.querytree.Selector; +import whelk.search2.querytree.Token; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; @@ -95,12 +108,12 @@ protected Map getPartialCollectionView() { } @Override - protected Map doGetEsQueryDsl() { + protected EsQuery doGetEsQuery() { var queryTree = getFullQueryTree(suggestQueryTree).expand(whelk.getJsonld()); var esQueryTree = new EsQueryTree(queryTree, esSettings); var queryDsl = buildEsQueryDsl(esQueryTree.getMainQuery()); queryDsl.remove("sort"); - return queryDsl; + return new EsQuery(queryDsl, Collections.emptyList()); } private List getApplicablePredicates(Map item, Map propertyByKey) { diff --git a/whelk-core/src/main/groovy/whelk/search2/querytree/EsQuery.java b/whelk-core/src/main/groovy/whelk/search2/querytree/EsQuery.java new file mode 100644 index 0000000000..fcbc230c78 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/search2/querytree/EsQuery.java @@ -0,0 +1,6 @@ +package whelk.search2.querytree; + +import java.util.List; + +public record EsQuery(Object dsl, List indexNames) { +} diff --git a/whelk-core/src/test/groovy/whelk/search2/ObjectQuerySpec.groovy b/whelk-core/src/test/groovy/whelk/search2/ObjectQuerySpec.groovy index aa922c7154..c179c830b4 100644 --- a/whelk-core/src/test/groovy/whelk/search2/ObjectQuerySpec.groovy +++ b/whelk-core/src/test/groovy/whelk/search2/ObjectQuerySpec.groovy @@ -32,6 +32,7 @@ class ObjectQuerySpec extends Specification { ], "p6": [ "nested": [ + "ignore_unmapped" : true, "path" : "p3", "query": [ "bool": [ diff --git a/whelk-core/src/test/groovy/whelk/search2/querytree/ConditionSpec.groovy b/whelk-core/src/test/groovy/whelk/search2/querytree/ConditionSpec.groovy index a5a8ac026c..4d1babb764 100644 --- a/whelk-core/src/test/groovy/whelk/search2/querytree/ConditionSpec.groovy +++ b/whelk-core/src/test/groovy/whelk/search2/querytree/ConditionSpec.groovy @@ -121,6 +121,7 @@ class ConditionSpec extends Specification { "bool": [ "must_not": [ "nested": [ + "ignore_unmapped" : true, "query": [ "bool": [ "filter": [ diff --git a/whelk-core/src/test/groovy/whelk/search2/querytree/EsQueryTreeSpec.groovy b/whelk-core/src/test/groovy/whelk/search2/querytree/EsQueryTreeSpec.groovy index fd7f4a2b9f..770a0fb7fd 100644 --- a/whelk-core/src/test/groovy/whelk/search2/querytree/EsQueryTreeSpec.groovy +++ b/whelk-core/src/test/groovy/whelk/search2/querytree/EsQueryTreeSpec.groovy @@ -193,6 +193,7 @@ class EsQueryTreeSpec extends Specification { expect: esQueryTree.getMainQuery() == [ "nested": [ + "ignore_unmapped" : true, "path" : "p3", "query": [ "bool": [ @@ -229,6 +230,7 @@ class EsQueryTreeSpec extends Specification { expect: esQueryTree.getMainQuery() == [ "nested": [ + "ignore_unmapped" : true, "path" : "p3", "query": [ "bool": [ @@ -267,6 +269,7 @@ class EsQueryTreeSpec extends Specification { "bool": [ "must": [[ "nested": [ + "ignore_unmapped" : true, "query": [ "bool": [ "must": [[ @@ -313,6 +316,7 @@ class EsQueryTreeSpec extends Specification { expect: esQueryTree.getMainQuery() == [ "nested": [ + "ignore_unmapped" : true, "path" : "p3", "query": [ "bool": [ @@ -362,6 +366,7 @@ class EsQueryTreeSpec extends Specification { expect: esQueryTree.getMainQuery() == [ "nested": [ + "ignore_unmapped" : true, "query": [ "bool": [ "must": [[ @@ -432,6 +437,7 @@ class EsQueryTreeSpec extends Specification { ] esQueryTree.getPostFilter() == [ "nested": [ + "ignore_unmapped" : true, "path" : "p3", "query": [ "bool": [ @@ -486,6 +492,7 @@ class EsQueryTreeSpec extends Specification { ] esQueryTree.getPostFilter() == [ "nested": [ + "ignore_unmapped" : true, "path" : "p3", "query": [ "bool": [ diff --git a/whelk-core/src/test/groovy/whelk/search2/querytree/TestData.groovy b/whelk-core/src/test/groovy/whelk/search2/querytree/TestData.groovy index 5b19a918c6..2c03eff637 100644 --- a/whelk-core/src/test/groovy/whelk/search2/querytree/TestData.groovy +++ b/whelk-core/src/test/groovy/whelk/search2/querytree/TestData.groovy @@ -283,6 +283,7 @@ class TestData { '@reverse.instanceOf.category.@id' : ['type': 'keyword'] ] ] - return new EsMappings(mappings) + // TODO + return new EsMappings(List.of(mappings)) } } diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy index 4ac6bca8f9..e912de7bff 100644 --- a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy @@ -809,7 +809,7 @@ class WhelkTool { if (whelk.elastic) { log " ElasticSearch:" log " hosts: ${whelk.elastic.elasticHosts}" - log " index: ${whelk.elastic.defaultIndex}" + log " index: ${whelk.elastic.mainIndex}" } log "Using script: $script" log "Using report dir: $reportsDir"