diff --git a/rest/src/main/groovy/whelk/rest/api/Crud.groovy b/rest/src/main/groovy/whelk/rest/api/Crud.groovy index 8107c3e27b..e38f9defca 100644 --- a/rest/src/main/groovy/whelk/rest/api/Crud.groovy +++ b/rest/src/main/groovy/whelk/rest/api/Crud.groovy @@ -191,6 +191,13 @@ class Crud extends HttpServlet { sendGetResponse(response, body, eTag, request.getPath(), request.getContentType(), request.getId()) } else { ETag eTag + + if (doc.isPlaceholder()) { + whelk.external.getEphemeral(doc.getThingIdentifiers().first()).ifPresent({ ext -> + doc.setThing(ext.getThing()) + }) + } + if (request.shouldEmbellish()) { String plainChecksum = doc.getChecksum(jsonld) whelk.embellish(doc) @@ -679,7 +686,13 @@ class Crud extends HttpServlet { try { if (doc) { String activeSigel = request.getHeader(XL_ACTIVE_SIGEL_HEADER) + String collection = doc.getLegacyCollection(jsonld) + + if (doc.isCacheRecord()) { + throw new BadRequestException("Cannot POST/PUT cache record") + } + if (isUpdate) { // You are not allowed to change collection when updating a record diff --git a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy new file mode 100644 index 0000000000..faa1771149 --- /dev/null +++ b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy @@ -0,0 +1,140 @@ +package whelk.rest.api + +import whelk.Document +import whelk.JsonLd +import whelk.Whelk +import whelk.external.Wikidata +import whelk.util.WhelkFactory + +import javax.servlet.ServletException +import javax.servlet.http.HttpServlet +import javax.servlet.http.HttpServletRequest +import javax.servlet.http.HttpServletResponse +import java.util.function.Predicate + +import static whelk.JsonLd.CONTEXT_KEY +import static whelk.JsonLd.TYPE_KEY + +class ExternalEntitiesSearchAPI extends HttpServlet { + Whelk whelk + + @Override + void init() { + whelk = WhelkFactory.getSingletonWhelk() + } + + @Override + protected void 
doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + String q = request.getParameter('q')?.trim() ?: '' + def types = request.getParameterMap().get(TYPE_KEY) as List ?: [] + def language = request.getParameter('_lang') ?: 'sv' + + def items = JsonLd.looksLikeIri(q) + ? selectExternal(q, types) + : searchExternal(q, types, language) + + SearchUtils.Lookup lookup = new SearchUtils.Lookup(whelk) + + def mappings = [] + if (q) { + mappings << ['variable' : 'q', + 'predicate': lookup.chip('textQuery'), + 'value' : q] + } + def (paramMappings, _) = SearchUtils.mapParams(lookup, request.getParameterMap()) + mappings.addAll(paramMappings) + + def result = [ + (CONTEXT_KEY): Crud.CONTEXT_PATH, + (TYPE_KEY) : 'PartialCollectionView', + 'itemOffset' : 0, + 'totalItems' : items.size(), + 'search' : [ + 'mapping': mappings + ], + 'items' : items + ] + + lookup.run() + + HttpTools.sendResponse(response, result, MimeTypes.JSONLD) + } + + List searchExternal(String q, Collection types, languageTag) { + def typeFilter = typeFilter(types) + + def uris = Wikidata.query(q, languageTag, 5) + uris.removeAll(whelk.external.getBannedImports()) + + def inWhelk = whelk.getCards(uris) + + uris + .collect { uri -> + if (inWhelk[uri]) { + def doc = new Document(inWhelk[uri]) + insertReverseLinkCount(doc) + doc + } + else { + whelk.external.getEphemeral(uri).orElse(null) + } + } + .grep() + .findAll {typeFilter.test(it) } + .collect { doc -> + whelk.embellish(doc) + JsonLd.frame(doc.getThingIdentifiers().first(), doc.data) + } + } + + private Predicate typeFilter(Collection types) { + boolean isAnyTypeOk = !types || types.any { it == '*' } + return { Document doc -> + def extType = doc.getThingType() + isAnyTypeOk || types.any { it == extType || whelk.jsonld.isSubClassOf(extType, (String) it)} + } + } + + List selectExternal(String iri, Collection types) { + def theTypeFilter = typeFilter(types) + + def inWhelk = whelk.getCards([iri]) + if 
(inWhelk[iri]) { + return whelkResult(inWhelk[iri], theTypeFilter) + } + + return whelk.external.getEphemeral(iri).map ({ doc -> + def extId = doc.getThingIdentifiers().first() + inWhelk = whelk.getCards([extId]) + if (inWhelk[extId]) { // iri was an alias/sameAs + return whelkResult(inWhelk[extId], theTypeFilter) + } + + if (theTypeFilter.test(doc)) { + whelk.embellish(doc) + [JsonLd.frame(doc.getThingIdentifiers().first(), doc.data)] + } else { + [] + } + }).orElse([]) + } + + List whelkResult(Map data, typeFilter) { + Document doc = new Document(data) + if (!typeFilter.test(doc)) { + return [] + } + insertReverseLinkCount(doc) + whelk.embellish(doc) + def framed = JsonLd.frame(doc.getThingIdentifiers().first(), doc.data) + return [framed] + } + + void insertReverseLinkCount(Document doc) { + whelk.elastic.retrieveIndexedDocument(doc.getShortId())?.with { + if (it.reverseLinks) { + doc.data[JsonLd.GRAPH_KEY][1]['reverseLinks'] = it.reverseLinks + } + } + } +} diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy index 499a7f9e1f..61f3e12fac 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy @@ -8,7 +8,6 @@ import groovy.util.logging.Log4j2 as Log import whelk.Document import whelk.JsonLd import whelk.Whelk -import whelk.component.DocumentNormalizer import whelk.exception.InvalidQueryException import whelk.exception.WhelkRuntimeException import whelk.search.ESQuery @@ -16,6 +15,10 @@ import whelk.search.ElasticFind import whelk.search.RangeParameterPrefix import whelk.util.DocumentUtil +import static whelk.JsonLd.GRAPH_KEY +import static whelk.JsonLd.ID_KEY +import static whelk.JsonLd.TYPE_KEY + @Log class SearchUtils { @@ -35,7 +38,7 @@ class SearchUtils { Whelk whelk JsonLd ld ESQuery esQuery - URI vocabUri + SearchUtils(Whelk whelk) { this(whelk.jsonld) @@ -45,9 +48,6 @@ class SearchUtils { SearchUtils(JsonLd 
jsonld) { this.ld = jsonld - if (ld.vocabId) { - vocabUri = new URI(ld.vocabId) - } } Map doSearch(Map queryParameters) { @@ -130,9 +130,10 @@ class SearchUtils { // // TODO Only manipulate `_limit` in one place queryParameters['_limit'] = [limit.toString()] - + Map esResult = esQuery.doQuery(queryParameters, suggest) - Lookup lookup = new Lookup() + + Lookup lookup = new Lookup(whelk) List mappings = [] if (query) { @@ -159,9 +160,10 @@ class SearchUtils { item.identifiedBy?.with { List ids -> ids.removeAll { (Document.isIsni(it) || Document.isOrcid(it) ) && it.value?.size() == 16+3 } } // This object must be re-added because it might get filtered out in applyLens(). - item['reverseLinks'] = it['reverseLinks'] - if (item['reverseLinks'] != null) + if (it['reverseLinks']) { + item['reverseLinks'] = it['reverseLinks'] item['reverseLinks'][JsonLd.ID_KEY] = Document.getBASE_URI().resolve('find?o=' + URLEncoder.encode(it['@id'], 'UTF-8').toString()).toString() + } return item } } @@ -173,13 +175,14 @@ class SearchUtils { k = stripPrefix((String) k, ESQuery.OR_PREFIX) ((List) aggregations[k]?['buckets'])?.removeIf { it['key'] in v } } - + Map stats = null if (addStats == null || (addStats == 'true' || addStats == 'on')) { stats = buildStats(lookup, aggregations, makeFindUrl(SearchType.ELASTIC, stripNonStatsParams(pageParams)), (total > 0 && !predicates) ? reverseObject : null) } + if (!stats) { log.debug("No stats found for query: ${queryParameters}") } @@ -187,7 +190,7 @@ class SearchUtils { (query ? mappings.tail() : mappings).each { Map mapping -> Map params = removeMappingFromParams(pageParams, mapping) String upUrl = makeFindUrl(SearchType.ELASTIC, params, offset) - mapping['up'] = [ (JsonLd.ID_KEY): upUrl ] + mapping['up'] = [ (ID_KEY): upUrl ] } if (reverseObject) { @@ -195,7 +198,7 @@ class SearchUtils { mappings << [ 'variable' : 'o', 'object' : lookup.chip(reverseObject), // TODO: object/predicate/??? 
- 'up' : [(JsonLd.ID_KEY): upUrl], + 'up' : [(ID_KEY): upUrl], ] } @@ -205,7 +208,7 @@ class SearchUtils { 'variable' : 'p', 'object' : reverseObject, 'predicate': lookup.chip(predicates.first()), - 'up' : [(JsonLd.ID_KEY): upUrl], + 'up' : [(ID_KEY): upUrl], ] } @@ -222,20 +225,20 @@ class SearchUtils { result['_debug'] = esResult['_debug'] } - result['maxItems'] = esQuery.getMaxItems().toString() + result['maxItems'] = esQuery.getMaxItems() lookup.run() return result } - + Map removeMappingFromParams(Map pageParams, Map mapping) { Map params = pageParams.clone() String variable = mapping['variable'] def param = params[variable] List values = param instanceof List ? param.clone() : param ? [param] : [] if ('object' in mapping) { - def value = mapping.object[JsonLd.ID_KEY] + def value = mapping.object[ID_KEY] values.remove(value) } else if ('value' in mapping) { def value = mapping.value @@ -275,8 +278,8 @@ class SearchUtils { private Map assembleSearchResults(SearchType st, List items, List mappings, Map pageParams, int limit, int offset, int total) { - Map result = [(JsonLd.TYPE_KEY): 'PartialCollectionView'] - result[(JsonLd.ID_KEY)] = makeFindUrl(st, pageParams, offset) + Map result = [(TYPE_KEY): 'PartialCollectionView'] + result[(ID_KEY)] = makeFindUrl(st, pageParams, offset) result['itemOffset'] = offset result['itemsPerPage'] = limit result['totalItems'] = total @@ -292,20 +295,6 @@ class SearchUtils { return result } - /** - * Create ES filter for specified siteBaseUri. - * - */ - Map makeSiteFilter(String siteBaseUri) { - return ['should': [ - ['prefix': [(JsonLd.ID_KEY): siteBaseUri]], - // ideally, we'd use ID_KEY here too, but that - // breaks the test case :/ - ['prefix': ['sameAs.@id': siteBaseUri]] - ], - 'minimum_should_match': 1] - } - /** * Build the term aggregation part of an ES query. 
* @@ -357,7 +346,7 @@ class SearchUtils { String searchPageUrl = "${baseUrlForKey}&${ESQuery.AND_PREFIX}${makeParam(key, itemId)}" Map observation = ['totalItems': bucket.getAt('doc_count'), - 'view': [(JsonLd.ID_KEY): searchPageUrl], + 'view': [(ID_KEY): searchPageUrl], 'object': lookup.chip(itemId)] observations << observation @@ -377,7 +366,7 @@ class SearchUtils { 'dimension' : JsonLd.REVERSE_KEY, 'observation': counts.collect { List relations, long count -> def viewUrl = baseUrl + '&' + - relations.collect{ makeParam('p', it + '.' + JsonLd.ID_KEY) }.join('&') + relations.collect{ makeParam('p', it + '.' + ID_KEY) }.join('&') [ 'totalItems': count, 'view' : ['@id': viewUrl], @@ -456,7 +445,7 @@ class SearchUtils { private int numberOfIncomingLinks(String iri) { try { - def doc = new ElasticFind(esQuery).find([(JsonLd.ID_KEY): [iri]]).first() + def doc = new ElasticFind(esQuery).find([(ID_KEY): [iri]]).first() return doc['reverseLinks']['totalItems'] } catch (Exception e) { @@ -465,9 +454,21 @@ class SearchUtils { } } - private class Lookup { + static class Lookup { private Multimap iriPos = ArrayListMultimap.create() + + private Whelk whelk + private JsonLd ld + private URI vocabUri + Lookup(Whelk whelk) { + this.whelk = whelk + this.ld = whelk.jsonld + if (ld.vocabId) { + vocabUri = new URI(ld.vocabId) + } + } + Map chip(String itemRepr) { boolean matchesTerm = false def itemId = itemRepr @@ -480,8 +481,8 @@ class SearchUtils { if (termKey in ld.vocabIndex) { return ld.vocabIndex[termKey] } - - if (!itemId.startsWith('http') && itemId.contains('.')) { + + if (!JsonLd.looksLikeIri(itemId) && itemId.contains('.')) { String[] parts = itemId.split('\\.') List chain = parts .findAll { it != JsonLd.ID_KEY } @@ -525,37 +526,37 @@ class SearchUtils { it.value.putAll(chip) } } - } - - private Map dummyChip(String itemId) { - [(JsonLd.ID_KEY): itemId, 'label': itemId] - } - /* - * Read vocab term data from storage. - * - * Returns null if not found. 
- * - */ - private String getFullUri(String id) { - try { - if (vocabUri) { - return vocabUri.resolve(id).toString() + private Map dummyChip(String itemId) { + [(ID_KEY): itemId, 'label': itemId] + } + + /* + * Read vocab term data from storage. + * + * Returns null if not found. + * + */ + private String getFullUri(String id) { + try { + if (vocabUri) { + return vocabUri.resolve(id).toString() + } + } + catch (IllegalArgumentException e) { + // Couldn't resolve, which means id isn't a valid IRI. + // No need to check the db. + return null } } - catch (IllegalArgumentException e) { - // Couldn't resolve, which means id isn't a valid IRI. - // No need to check the db. - return null + + // FIXME move to Document or JsonLd + private Map getEntry(Map jsonLd, String entryId) { + // we rely on this convention for the time being. + return jsonLd[(GRAPH_KEY)].find { it[ID_KEY] == entryId } } } - // FIXME move to Document or JsonLd - private Map getEntry(Map jsonLd, String entryId) { - // we rely on this convention for the time being. - return jsonLd[(JsonLd.GRAPH_KEY)].find { it[JsonLd.ID_KEY] == entryId } - } - /** * Create a URL for '/find' with the specified query parameters. 
* @@ -623,20 +624,20 @@ class SearchUtils { Offsets offsets = new Offsets(total, limit, offset) - result['first'] = [(JsonLd.ID_KEY): makeFindUrl(st, pageParams)] - result['last'] = [(JsonLd.ID_KEY): makeFindUrl(st, pageParams, offsets.last)] + result['first'] = [(ID_KEY): makeFindUrl(st, pageParams)] + result['last'] = [(ID_KEY): makeFindUrl(st, pageParams, offsets.last)] if (offsets.prev != null) { if (offsets.prev == 0) { result['previous'] = result['first'] } else { - result['previous'] = [(JsonLd.ID_KEY): makeFindUrl(st, pageParams, + result['previous'] = [(ID_KEY): makeFindUrl(st, pageParams, offsets.prev)] } } if (offsets.next) { - result['next'] = [(JsonLd.ID_KEY): makeFindUrl(st, pageParams, + result['next'] = [(ID_KEY): makeFindUrl(st, pageParams, offsets.next)] } @@ -710,7 +711,7 @@ class SearchUtils { * filtered out. * */ - private Tuple2 mapParams(Lookup lookup, Map params) { + static Tuple2 mapParams(Lookup lookup, Map params) { List result = [] Map pageParams = [:] List reservedParams = getReservedParameters() @@ -723,11 +724,11 @@ class SearchUtils { String valueProp String termKey def value - if (param == JsonLd.TYPE_KEY || param == JsonLd.ID_KEY) { + if (param == TYPE_KEY || param == ID_KEY) { valueProp = 'object' termKey = param value = lookup.chip(val).with { it[JsonLd.ID_KEY] = val; return it } - } else if (param.endsWith(".${JsonLd.ID_KEY}")) { + } else if (param.endsWith(".${ID_KEY}")) { valueProp = 'object' termKey = param[0..-5] value = lookup.chip(val).with { it[JsonLd.ID_KEY] = val; return it } @@ -761,7 +762,7 @@ class SearchUtils { /* * Return a list of reserved query params */ - private List getReservedParameters() { + private static List getReservedParameters() { return ['q', 'p', 'o', 'value', '_limit', '_offset', '_suggest'] } diff --git a/rest/src/main/webapp/WEB-INF/web.xml b/rest/src/main/webapp/WEB-INF/web.xml index 345a555382..b3665c0cd4 100644 --- a/rest/src/main/webapp/WEB-INF/web.xml +++ 
b/rest/src/main/webapp/WEB-INF/web.xml @@ -62,6 +62,10 @@ RemoteSearch whelk.rest.api.RemoteSearchAPI + + ExternalEntitiesSearchAPI + whelk.rest.api.ExternalEntitiesSearchAPI + MarcConverter whelk.rest.api.ConverterAPI @@ -116,6 +120,11 @@ /_remotesearch + + ExternalEntitiesSearchAPI + /_externalentities + + MarcConverter /_convert diff --git a/rest/src/test/groovy/whelk/rest/api/SearchUtilsSpec.groovy b/rest/src/test/groovy/whelk/rest/api/SearchUtilsSpec.groovy index 1383d7a429..2942566a5e 100644 --- a/rest/src/test/groovy/whelk/rest/api/SearchUtilsSpec.groovy +++ b/rest/src/test/groovy/whelk/rest/api/SearchUtilsSpec.groovy @@ -23,16 +23,6 @@ class SearchUtilsSpec extends Specification { assert !(urir =~ pattern) } - def "Should make site filter"() { - when: - String url = "http://example.com" - Map expected = ['should': [['prefix': ['@id': url]], - ['prefix': ['sameAs.@id': url]]], - 'minimum_should_match': 1] - then: - assert search.makeSiteFilter(url) == expected - } - def "Should build aggregation query"() { when: Map tree = ['@type': []] diff --git a/whelk-core/build.gradle b/whelk-core/build.gradle index 9a91be1367..1c7f2169b8 100644 --- a/whelk-core/build.gradle +++ b/whelk-core/build.gradle @@ -95,7 +95,8 @@ dependencies { api 'commons-io:commons-io:2.11.0' implementation "org.apache.httpcomponents:httpclient:${httpComponentsClientVersion}" implementation "org.apache.httpcomponents:httpcore:${httpComponentsCoreVersion}" - api 'org.apache.jena:apache-jena-libs:3.0.1' + api 'org.apache.jena:apache-jena-libs:3.17.0' + implementation 'org.apache.jena:apache-jena-libs:3.17.0' api "org.codehaus.groovy:groovy-json:${groovyVersion}" api "org.codehaus.groovy:groovy-xml:${groovyVersion}" api "org.codehaus.groovy:groovy-yaml:${groovyVersion}" diff --git a/whelk-core/src/main/groovy/whelk/Document.groovy b/whelk-core/src/main/groovy/whelk/Document.groovy index 0580d6de04..9bee8c7661 100644 --- a/whelk-core/src/main/groovy/whelk/Document.groovy +++ 
b/whelk-core/src/main/groovy/whelk/Document.groovy @@ -50,6 +50,7 @@ class Document { static final List thingCarrierTypesPath = ["@graph", 1, "carrierType"] static final List thingInSchemePath = ["@graph",1,"inScheme","@id"] static final List recordIdPath = ["@graph", 0, "@id"] + static final List recordTypePath = ["@graph", 0, "@type"] static final List workIdPath = ["@graph", 1, "instanceOf", "@id"] static final List thingMetaPath = ["@graph", 1, "meta", "@id"] static final List recordSameAsPath = ["@graph", 0, "sameAs"] @@ -171,12 +172,22 @@ class Document { String getThingType() { get(thingTypePath) } + String getRecordType() { get(recordTypePath) } + + String setRecordType(type) { set(recordTypePath, type) } + String getRecordStatus() { return get(statusPath) } void setRecordStatus(status) { set(statusPath, status) } void setThingMeta(meta) { set(thingMetaPath, meta) } + Map getThing() { (Map) get(thingPath) } + + void setThing(thing) { _removeLeafObject(thingPath, data); set(thingPath, thing) } + + void setRecordId(id) { set(recordIdPath, id) } + /** * Will have base URI prepended if not already there */ @@ -342,6 +353,14 @@ class Document { String getLegacyCollection(JsonLd jsonld) { LegacyIntegrationTools.determineLegacyCollection(this, jsonld) } + + boolean isPlaceholder() { + return getRecordType() == JsonLd.PLACEHOLDER_RECORD_TYPE + } + + boolean isCacheRecord() { + return getRecordType() == JsonLd.CACHE_RECORD_TYPE + } String getHeldBySigel() { String uri = get(sigelPath) @@ -714,22 +733,22 @@ class Document { return _get(path, data) } - static Object _get(List path, Object root) { + static Object _get(List path, Object root, Object defaultTo = null) { // Start at root data node Object node = root for (Object step : path) { if ((node instanceof Map) && !(step instanceof String)) { log.warn("Needed string as map key, but was given: " + step + ". 
(path was: " + path + ")") - return null + return defaultTo } else if ((node instanceof List) && !(step instanceof Integer)) { log.warn("Needed integer as list index, but was given: " + step + ". (path was: " + path + ")") - return null + return defaultTo } node = node[step] if (node == null) { - return null + return defaultTo } } @@ -894,8 +913,16 @@ class Document { private static boolean isSet(String key, JsonLd jsonLd) { jsonLd && key && jsonLd.isSetContainer(key) } - - public String toVerboseString() { + + String toVerboseString() { return "{completeId=" + getCompleteId() + ", baseUri=" + baseUri.toString() + ", base identifiers:" + getRecordIdentifiers().join(','); } + + void replaceLinks(Map oldToNew) { + DocumentUtil.findKey(data, JsonLd.ID_KEY) { value, path -> + if (oldToNew.containsKey(value)) { + new DocumentUtil.Replace(oldToNew[(String) value]) + } + } + } } diff --git a/whelk-core/src/main/groovy/whelk/JsonLd.groovy b/whelk-core/src/main/groovy/whelk/JsonLd.groovy index 038bb8234c..ac9ab7671d 100644 --- a/whelk-core/src/main/groovy/whelk/JsonLd.groovy +++ b/whelk-core/src/main/groovy/whelk/JsonLd.groovy @@ -42,6 +42,8 @@ class JsonLd { static final String RECORD_TYPE = 'Record' static final String CACHE_RECORD_TYPE = 'CacheRecord' + static final String PLACEHOLDER_RECORD_TYPE = 'PlaceholderRecord' + static final String PLACEHOLDER_ENTITY_TYPE = 'Resource' static final String SEARCH_KEY = "_str" @@ -548,7 +550,7 @@ class JsonLd { static List asList(o) { return (o instanceof List) ? (List) o : o != null ? 
[o] : [] } - + static boolean looksLikeIri(String s) { s && (s.startsWith('https://') || s.startsWith('http://')) } diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 51f08c9095..442522326e 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -2,6 +2,8 @@ package whelk import com.google.common.collect.Iterables import groovy.transform.CompileStatic +import groovy.transform.TypeChecked +import groovy.transform.TypeCheckingMode import groovy.util.logging.Log4j2 as Log import se.kb.libris.Normalizers import whelk.component.CachingPostgreSQLComponent @@ -13,17 +15,21 @@ import whelk.component.SparqlUpdater import whelk.converter.marc.MarcFrameConverter import whelk.converter.marc.RomanizationStep import whelk.exception.StorageCreateFailedException -import whelk.filter.LanguageLinker import whelk.exception.WhelkException +import whelk.external.ExternalEntities +import whelk.filter.LanguageLinker import whelk.filter.LinkFinder import whelk.filter.NormalizerChain import whelk.meta.WhelkConstants import whelk.search.ESQuery import whelk.search.ElasticFind +import whelk.util.LegacyIntegrationTools import whelk.util.PropertyLoader import whelk.util.Romanizer import java.time.ZoneId +import java.util.function.Consumer +import java.util.function.Function /** * The Whelk is the root component of the XL system. @@ -57,6 +63,7 @@ class Whelk { RomanizationStep.LanguageResources languageResources ElasticFind elasticFind Relations relations + ExternalEntities external DocumentNormalizer normalizer Romanizer romanizer @@ -215,6 +222,7 @@ class Whelk { elasticFind = new ElasticFind(new ESQuery(this)) initDocumentNormalizers(elasticFind) } + external = new ExternalEntities(this) } // FIXME: de-KBV/Libris-ify: some of these are KBV specific, is that a problem? 
@@ -332,7 +340,7 @@ class Whelk { private void reindexAffected(Document document, Set preUpdateLinks, Set postUpdateLinks) { Set addedLinks = (postUpdateLinks - preUpdateLinks) Set removedLinks = (preUpdateLinks - postUpdateLinks) - + removedLinks.findResults { storage.getSystemIdByIri(it.iri) } .each{id -> elastic.decrementReverseLinks(id) } @@ -428,6 +436,7 @@ class Whelk { throw new StorageCreateFailedException(document.getShortId(), "Document considered a duplicate of : " + collidingIDs) } + createCacheRecordsAndPlaceholders(changedBy, document) boolean success = storage.createDocument(document, changedIn, changedBy, collection, deleted) if (success) { indexAsyncOrSync { @@ -457,6 +466,7 @@ class Whelk { preUpdateDoc = doc.clone() updateAgent.update(doc) normalize(doc) + createCacheRecordsAndPlaceholders(changedBy, doc, preUpdateDoc) }) if (updated == null || preUpdateDoc == null) { @@ -472,6 +482,8 @@ class Whelk { void storeAtomicUpdate(Document doc, boolean minorUpdate, boolean writeIdenticalVersions, String changedIn, String changedBy, String oldChecksum) { normalize(doc) Document preUpdateDoc = storage.load(doc.shortId) + + createCacheRecordsAndPlaceholders(changedBy, doc, preUpdateDoc) Document updated = storage.storeAtomicUpdate(doc, minorUpdate, writeIdenticalVersions, changedIn, changedBy, oldChecksum) if (updated == null) { @@ -481,7 +493,7 @@ class Whelk { reindexUpdated(updated, preUpdateDoc) sparqlUpdater?.pollNow() } - + /** * This is a variant of createDocument that does no or minimal denormalization or indexing. * It should NOT be used to create records in a production environment. 
Its intended purpose is @@ -508,10 +520,11 @@ class Whelk { updated.getThingIdentifiers()[0] && updated.getThingIdentifiers()[0] != preUpdateDoc.getThingIdentifiers()[0] } - + @TypeChecked(TypeCheckingMode.SKIP) void embellish(Document document, List levels = null) { - def docsByIris = { List iris -> bulkLoad(iris).values().collect{ it.data } } - Embellisher e = new Embellisher(jsonld, docsByIris, storage.&getCards, relations.&getByReverse) + def getDocs = andGetExternal({ List iris -> bulkLoad(iris).values().collect{ it.data } }) + def getCards = andGetExternal(storage.&getCards, true) + Embellisher e = new Embellisher(jsonld, getDocs, getCards, relations.&getByReverse) if (levels) { e.setEmbellishLevels(levels) @@ -523,7 +536,40 @@ class Whelk { e.embellish(document) } - + + //FIXME + @TypeChecked(TypeCheckingMode.SKIP) + private def andGetExternal(Function, Iterable> f, cards = false) { + def thingId = { graph -> (String) Document._get(Document.thingIdPath, graph) } + + return { Iterable iris -> + def result = f.apply(iris).collect { + def d = new Document(it) + if (d.isPlaceholder()) { + external.getEphemeral(d.getThingIdentifiers().first()).ifPresent({ ext -> + d.setThing(cards ? jsonld.toCard(ext.getThing(), false) : ext.getThing()) + }) + d.data + } else { + it + } + } + + // get external for IRIs that don't have placeholders + // TODO: only needed if we don't store placeholders for everything + def found = result.collect(thingId) + def missing = ((iris as Set) - (found as Set)) + def ext = missing + .collect{ external.getEphemeral(it) } + .findAll{ it.isPresent() } + .collect {cards ? 
jsonld.toCard(it.get().data) : it.get().data } + + result += ext + + return result + } + } + /** * Get cards * @param iris @@ -591,4 +637,68 @@ class Whelk { ZoneId getTimezone() { return timezone } + + private void createCacheRecordsAndPlaceholders(String changedBy, Document postUpdateDoc, Document preUpdateDoc = null) { + Set postUpdateLinks = postUpdateDoc.getExternalRefs() + Set preUpdateLinks = preUpdateDoc?.getExternalRefs() ?: new HashSet() //Collections.EMPTY_SET groovy compiler...? + + def iris = { Set s -> s.collect { it.iri } as Set } + Set addedIris = iris(postUpdateLinks) - iris(preUpdateLinks) + + def redirects = createCacheRecordsAndPlaceholders(changedBy, addedIris, !postUpdateDoc.isCacheRecord()) + if (redirects) { + postUpdateDoc.replaceLinks(redirects) + } + } + + private Map createCacheRecordsAndPlaceholders(String changedBy, Set iris, boolean tryFetchExternal) { + Set brokenOrExternalIris = iris - storage.getSystemIdsByIris(iris).keySet() + + boolean minorUpdate = true + def changedIn = 'xl' // FIXME + def collection = LegacyIntegrationTools.NO_MARC_COLLECTION + def deleted = false + + Map redirectedIris = [:] + + brokenOrExternalIris.each { iri -> + def doc = tryFetchExternal + ? 
external.get(iri).orElse(ExternalEntities.getPlaceholder(iri)) + : ExternalEntities.getPlaceholder(iri) + + if (doc.getThingIdentifiers().first() != iri) { + redirectedIris[iri] = doc.getThingIdentifiers().first() + } + + try { + createDocument(doc, changedIn, changedBy, collection, deleted) + } + catch (StorageCreateFailedException ignored) { + // Another transaction already created it -> OK + } + } + + // Check if old placeholder records can be replaced with cache records + bulkLoad(iris - brokenOrExternalIris).values() + .findAll{doc -> doc.isPlaceholder() } + .each { doc -> + try { + String iri = doc.getThingIdentifiers().first() + external.getEphemeral(iri).ifPresent( (Consumer) { Document extDoc -> + def checkSum = doc.getChecksum(jsonld) + extDoc.setRecordId(doc.getRecordIdentifiers().first()) + if (extDoc.getThingIdentifiers().first() != iri) { + redirectedIris[iri] = extDoc.getThingIdentifiers().first() + extDoc.addThingIdentifier(iri) + } + storeAtomicUpdate(extDoc, minorUpdate, false, changedIn, changedBy, checkSum) + }) + } + catch (Exception e) { // TODO + log.warn("Failed to update ${doc.shortId}: $e", e) + } + } + + return redirectedIris + } } diff --git a/whelk-core/src/main/groovy/whelk/component/DependencyCache.groovy b/whelk-core/src/main/groovy/whelk/component/DependencyCache.groovy index 9863eceb38..244047ffb6 100644 --- a/whelk-core/src/main/groovy/whelk/component/DependencyCache.groovy +++ b/whelk-core/src/main/groovy/whelk/component/DependencyCache.groovy @@ -7,10 +7,10 @@ import com.google.common.util.concurrent.ListenableFuture import com.google.common.util.concurrent.ListenableFutureTask import com.google.common.util.concurrent.ThreadFactoryBuilder import groovy.util.logging.Log4j2 as Log -import io.prometheus.client.guava.cache.CacheMetricsCollector import whelk.Document import whelk.Link import whelk.exception.MissingMainIriException +import whelk.util.Metrics import java.util.concurrent.Callable import java.util.concurrent.Executor @@ 
-25,9 +25,7 @@ import static whelk.component.PostgreSQLComponent.NotificationType.DEPENDENCY_CA class DependencyCache { private static final int CACHE_SIZE = 50_000 private static final int REFRESH_INTERVAL_MINUTES = 5 - - private static final CacheMetricsCollector cacheMetrics = new CacheMetricsCollector().register() - + PostgreSQLComponent storage private Executor cacheRefresher = Executors.newSingleThreadExecutor( @@ -48,8 +46,8 @@ class DependencyCache { DependencyCache(PostgreSQLComponent storage) { this.storage = storage - cacheMetrics.addCache('dependersCache', dependersCache) - cacheMetrics.addCache('dependencyCache', dependenciesCache) + Metrics.cacheMetrics.addCache('dependersCache', dependersCache) + Metrics.cacheMetrics.addCache('dependencyCache', dependenciesCache) } Set getDependenciesOfType(String iri, String typeOfRelation) { diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index 3a796e0030..4d911e8497 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -283,7 +283,7 @@ class ElasticSearch { } } } - + void remove(String identifier) { if (log.isDebugEnabled()) { log.debug("Deleting object with identifier ${toElasticId(identifier)}.") @@ -306,8 +306,28 @@ class ElasticSearch { log.warn("Record with id $identifier was not deleted from the Elasticsearch index: $e") } } + + Map retrieveIndexedDocument(String systemId) { + try { + mapper.readValue(client.performRequest('GET', + "/${indexName}/_doc/$systemId/_source", ''), Map) + } catch (UnexpectedHttpStatusException e) { + if (isMissingDocument(e)) { + return null + } + else { + throw e + } + } + } String getShapeForIndex(Document document, Whelk whelk) { + if (document.isPlaceholder()) { + whelk.external.getEphemeral(document.getThingIdentifiers().first()).ifPresent({ ext -> + document.setThing(ext.getThing()) + }) 
+ } + Document copy = document.clone() whelk.embellish(copy, ['search-chips']) diff --git a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy index cba3868c74..2d4f5413fd 100644 --- a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy +++ b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy @@ -935,6 +935,10 @@ class PostgreSQLComponent { } } + if (preUpdateDoc.isCacheRecord() && !doc.isCacheRecord()) { + throw new RuntimeException("Cannot change cache record to not be cache record (${doc.getShortId()})") + } + if (doVerifyDocumentIdRetention) { verifyDocumentIdRetention(preUpdateDoc, doc, connection) } @@ -1078,7 +1082,7 @@ class PostgreSQLComponent { } } - if (sparqlQueueEnabled) { + if (sparqlQueueEnabled && !doc.isCacheRecord() && !doc.isPlaceholder()) { sparqlQueueAdd(doc.getShortId(), connection) } } diff --git a/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy new file mode 100644 index 0000000000..8bdea561e4 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy @@ -0,0 +1,115 @@ +package whelk.external + +import com.google.common.cache.CacheBuilder +import com.google.common.cache.CacheLoader +import com.google.common.cache.LoadingCache +import whelk.Document +import whelk.IdGenerator +import whelk.JsonLd +import whelk.Whelk +import whelk.util.Metrics + +class ExternalEntities { + private final List mappers + + private static final int CACHE_SIZE = 10_000 + private final Set bannedImports + + private LoadingCache> cache = CacheBuilder.newBuilder() + .maximumSize(CACHE_SIZE) + .recordStats() + .build(new CacheLoader>() { + @Override + Optional load(String iri) throws Exception { + return getInternal(iri) + } + }) + + ExternalEntities(Whelk whelk) { + Map countryMappings = loadCountryMappings(whelk) + mappers = [ + 
new Wikidata(countryMappings),
+        ]
+        bannedImports = Collections.unmodifiableSet(countryMappings.keySet())
+
+        Metrics.cacheMetrics.addCache('external-entities', cache)
+    }
+
+    // Look up an external entity, served from the loading cache.
+    // Returns a defensive copy so callers cannot mutate the cached Document.
+    Optional<Document> get(String iri) {
+        if (mappers.any { it.mightHandle(iri) }) {
+            cache.get(iri).map { it.clone() }
+        }
+        else {
+            Optional.empty()
+        }
+    }
+
+    // Same as get(), but rewrites the record to a placeholder record with a
+    // derived '#record' id, i.e. a document that is not meant to be stored as-is.
+    Optional<Document> getEphemeral(String iri) {
+        get(iri).map { doc ->
+            doc.setRecordId("${doc.getThingIdentifiers().first()}#record".toString())
+            doc.setRecordType(JsonLd.PLACEHOLDER_RECORD_TYPE)
+            doc
+        }
+    }
+
+    // IRIs that must never be imported (they already map to local country records).
+    Set<String> getBannedImports() {
+        return bannedImports
+    }
+
+    // Cache loader: ask each mapper in turn; first hit wins.
+    private Optional<Document> getInternal(String iri) {
+        Document d = mappers.findResult { mapper ->
+            mapper.getThing(iri).map { document(it, JsonLd.CACHE_RECORD_TYPE, mapper.datasetId()) }.orElse(null)
+        }
+
+        return Optional.ofNullable(d)
+    }
+
+    // Minimal stand-in document for an IRI we could not (or chose not to) fetch.
+    static Document getPlaceholder(String iri) {
+        def thing = [
+                '@id'  : iri,
+                '@type': JsonLd.PLACEHOLDER_ENTITY_TYPE
+        ]
+
+        document(thing, JsonLd.PLACEHOLDER_RECORD_TYPE)
+    }
+
+    // Wrap a thing in a record + thing @graph structure, with a freshly minted record id.
+    private static Document document(Map thing, String recordType, String dataset = null) {
+        def record = [
+                '@id'       : Document.BASE_URI.toString() + IdGenerator.generate(),
+                '@type'     : recordType,
+                'mainEntity': ['@id': thing.'@id']
+        ]
+
+        if (dataset) {
+            record.inDataset = ['@id': dataset]
+        }
+
+        new Document([
+                '@graph': [
+                        record,
+                        thing
+                ]
+        ])
+    }
+
+    // Map external country IRIs (exactMatch links) to local country record IRIs.
+    // Empty when no Elasticsearch is available (e.g. in tooling contexts).
+    private static Map<String, String> loadCountryMappings(Whelk whelk) {
+        if (!whelk.elasticFind) {
+            return [:]
+        }
+
+        def query = [
+                (JsonLd.TYPE_KEY): ['Country'],
+                "q"              : ["*"],
+                '_sort'          : [JsonLd.ID_KEY]
+        ]
+
+        Map<String, String> result = [:]
+        def recordIds = whelk.elasticFind.findIds(query).collect { whelk.baseUri.toString() + it }
+        // 'each' rather than 'collect': we populate 'result' for its side
+        // effect; the original built a mapped list that was never used.
+        whelk.bulkLoad(recordIds).each { id, doc ->
+            JsonLd.asList(doc.getThing()['exactMatch']).each { match ->
+                result[(String) match[JsonLd.ID_KEY]] = doc.getThingIdentifiers().first()
+            }
+        }
+        return result
+    }
+}
diff --git a/whelk-core/src/main/groovy/whelk/external/Mapper.groovy 
b/whelk-core/src/main/groovy/whelk/external/Mapper.groovy
new file mode 100644
index 0000000000..65f4a23fbd
--- /dev/null
+++ b/whelk-core/src/main/groovy/whelk/external/Mapper.groovy
@@ -0,0 +1,10 @@
+package whelk.external
+
+// Adapter for an external data source: decides whether an IRI belongs to
+// the source, converts the external entity to a KBV-shaped thing, and
+// names the dataset converted records belong to.
+interface Mapper {
+    boolean mightHandle(String iri)
+    Optional<Map> getThing(String iri)
+    String datasetId()
+}
diff --git a/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy
new file mode 100644
index 0000000000..2573abfec1
--- /dev/null
+++ b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy
@@ -0,0 +1,98 @@
+package whelk.external
+
+import org.apache.jena.query.ParameterizedSparqlString
+import org.apache.jena.query.Query
+import org.apache.jena.query.QueryExecution
+import org.apache.jena.query.QueryExecutionFactory
+import org.apache.jena.query.ResultSet
+import org.apache.jena.query.ResultSetFactory
+import org.apache.jena.rdf.model.Model
+import org.apache.jena.rdf.model.RDFNode
+import org.apache.jena.shared.PrefixMapping
+
+// Thin wrapper around Jena for running SPARQL against an in-memory model
+// or a remote endpoint, with the namespace prefixes our queries rely on.
+class QueryRunner {
+    static final Map<String, String> nsPrefixes =
+            [
+                    "bd"      : "http://www.bigdata.com/rdf#",
+                    "kbv"     : "https://id.kb.se/vocab/",
+                    "p"       : "http://www.wikidata.org/prop/",
+                    "pq"      : "http://www.wikidata.org/prop/qualifier/",
+                    "ps"      : "http://www.wikidata.org/prop/statement/",
+                    "rdfs"    : "http://www.w3.org/2000/01/rdf-schema#",
+                    "sdo"     : "http://schema.org/",
+                    "skos"    : "http://www.w3.org/2004/02/skos/core#",
+                    "wd"      : "http://www.wikidata.org/entity/",
+                    "wdt"     : "http://www.wikidata.org/prop/direct/",
+                    "wdtn"    : "http://www.wikidata.org/prop/direct-normalized/",
+                    "wikibase": "http://wikiba.se/ontology#"
+            ]
+
+    static PrefixMapping prefixMapping = PrefixMapping.Factory.create().setNsPrefixes(nsPrefixes)
+
+    // SELECT against a local (already loaded) model.
+    static ResultSet localSelectResult(String queryString, Model graph) {
+        Query q = prepareQuery(queryString)
+        QueryExecution qExec = localQueryExec(q, graph)
+        ResultSet rs = selectQuery(qExec)
+
+        
return rs
+    }
+
+    // SELECT against a remote SPARQL endpoint.
+    static ResultSet remoteSelectResult(String queryString, String sparqlEndpoint) {
+        Query q = prepareQuery(queryString)
+        QueryExecution qExec = remoteQueryExec(q, sparqlEndpoint)
+        ResultSet rs = selectQuery(qExec)
+
+        return rs
+    }
+
+    // Runs a SELECT and copies the result so the execution can be closed
+    // before the caller iterates. The original caught and println'ed any
+    // exception and fell through returning null, which just deferred the
+    // failure to an NPE at the call site; rethrow with the cause instead.
+    static ResultSet selectQuery(QueryExecution qe) {
+        try {
+            ResultSet results = qe.execSelect()
+            return ResultSetFactory.copyResults(results)
+        } catch (Exception ex) {
+            throw new RuntimeException("SPARQL select failed: ${ex.getMessage()}", ex)
+        } finally {
+            qe.close()
+        }
+    }
+
+    static Model constructQuery(QueryExecution qe) {
+        try {
+            return qe.execConstruct()
+        } catch (Exception ex) {
+            throw new RuntimeException("SPARQL construct failed: ${ex.getMessage()}", ex)
+        } finally {
+            qe.close()
+        }
+    }
+
+    // Note: returning null from the swallowed-exception path of a boolean
+    // method was never safe; propagate the error instead.
+    static boolean askQuery(QueryExecution qe) {
+        try {
+            return qe.execAsk()
+        } catch (Exception ex) {
+            throw new RuntimeException("SPARQL ask failed: ${ex.getMessage()}", ex)
+        } finally {
+            qe.close()
+        }
+    }
+
+    static QueryExecution remoteQueryExec(Query query, String sparqlEndpoint) {
+        return QueryExecutionFactory.sparqlService(sparqlEndpoint, query)
+    }
+
+    static QueryExecution localQueryExec(Query query, Model graph) {
+        return QueryExecutionFactory.create(query, graph)
+    }
+
+    // Parse a query string with our prefix mapping; optional positional params.
+    static Query prepareQuery(String command, Collection values = null) {
+        ParameterizedSparqlString paramString = new ParameterizedSparqlString(command, prefixMapping)
+        values?.eachWithIndex { v, i ->
+            paramString.setParam(i, v)
+        }
+        return paramString.asQuery()
+    }
+}
diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy
new file mode 100644
index 0000000000..b3efd4cbc1
--- /dev/null
+++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy
@@ -0,0 +1,389 @@
+package whelk.external
+
+import groovy.transform.Memoized
+import org.apache.jena.query.QuerySolution
+import org.apache.jena.query.ResultSet
+import org.apache.jena.rdf.model.Model
+import org.apache.jena.rdf.model.ModelFactory
+import org.apache.jena.rdf.model.RDFNode
+import 
whelk.component.ElasticSearch +import whelk.exception.WhelkRuntimeException +import whelk.util.Metrics +import groovy.util.logging.Log4j2 as Log + +import java.net.http.HttpClient +import java.net.http.HttpRequest +import java.net.http.HttpResponse +import java.nio.charset.StandardCharsets +import java.time.Duration + +import static whelk.util.Jackson.mapper + +@Log +class Wikidata implements Mapper { + Map countryMap + + Wikidata(Map countryMap) { + this.countryMap = countryMap + log.info("Initialized with ${countryMap.size()} country mappings") + } + + @Override + Optional getThing(String iri) { + if (!isWikidata(iri)) { + return Optional.empty() + } + + WikidataEntity wdEntity = new WikidataEntity(iri, countryMap) + + return Optional.ofNullable(wdEntity.convert()) + } + + @Override + boolean mightHandle(String iri) { + return isWikidata(iri) + } + + @Override + String datasetId() { + 'https://id.kb.se/datasets/wikidata' + } + + static boolean isWikidata(String iri) { + iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") + } + + static List query(String query, String langTag, int limit) { + try { + performQuery(query, langTag, limit) + } + catch (Exception e) { + throw new WhelkRuntimeException("Error querying wikidata: $e", e) + } + } + + /** + * Search Wikidata using the wbsearchentities API + * Documented here: https://www.wikidata.org/w/api.php?action=help&modules=wbsearchentities + * + * Language parameter: "Search in this language. This only affects how entities are selected, not + * the language in which the results are returned: this is controlled by the "uselang" parameter." 
+ * + * @param query the query string + * @param langTag language code for language to search in + * @param limit max number of hits + * @return a list of entity URIs + */ + private static List performQuery(String query, String langTag, int limit) { + HttpClient client = HttpClient.newBuilder().followRedirects(HttpClient.Redirect.NORMAL).build() + def base = 'https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json' + def q = URLEncoder.encode(query, StandardCharsets.UTF_8) + String uri = "$base&limit=$limit&language=$langTag&uselang=$langTag&search=$q" + + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(uri)) + .timeout(Duration.ofSeconds(30)) + .GET() + .build() + + def httpResponse = client.send(request, HttpResponse.BodyHandlers.ofString()) + def result = mapper.readValue(httpResponse.body(), Map.class) + .get('search') + .collect { (String) it['concepturi'] } + + return result + } +} + +class WikidataEntity { + static final String WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql" + static final String WIKIDATA_ENTITY_NS = "http://www.wikidata.org/entity/" + + // Wikidata property short ids + static final String COUNTRY = "P17" + static final String DDC = "P1036" + static final String EDITION = "P747" + static final String END_TIME = "P582" + static final String FAST = "P2163" + static final String FREEBASE = "P646" + static final String GEONAMES = "P1566" + static final String GETTY = "P1667" + static final String INSTANCE_OF = "P31" + static final String LC_AUTH = "P244" + static final String LOCATED_IN = "P131" // located in the administrative territorial entity + static final String SUBCLASS_OF = "P279" + static final String TORA = "P4820" + static final String YSO = "P2347" + + // Wikidata class short ids + static final String GEO_FEATURE = "Q618123" + static final String HUMAN = "Q5" + static final String SWEDISH_MUNI = "Q127448" + static final String SWEDISH_COUNTY = "Q200547" + + enum KbvType { + PLACE(GEO_FEATURE), 
+ PERSON(HUMAN), + OTHER('') + + String wikidataType + + private KbvType(String wikidataType) { + this.wikidataType = wikidataType + } + } + + Model graph + + String entityIri + String shortId + + Map countryMap + + WikidataEntity(String iri, Map countryMap) { + try { + graph = ModelFactory.createDefaultModel() + this.shortId = getShortId(iri) + this.entityIri = WIKIDATA_ENTITY_NS + shortId + loadGraph() + } + catch (ExceptionInInitializerError e) { + e.printStackTrace() + } + this.countryMap = countryMap + } + + private void loadGraph() { + try { + Metrics.clientTimer.labels(Wikidata.class.getSimpleName(), 'ttl-dump').time { + graph.read("https://www.wikidata.org/wiki/Special:EntityData/${shortId}.ttl?flavor=dump", "Turtle") + } + } catch (Exception ex) { + println("Unable to load graph for entity ${entityIri}") + } + } + + Map convert() { + switch (type()) { + case KbvType.PLACE: return convertPlace() + case KbvType.PERSON: return convertPerson() + default: return null + } + } + + Map convertPlace() { + Map place = + [ + '@id' : entityIri, + '@type': "Place" + ] + + List prefLabel = getPrefLabel().findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } + if (!prefLabel.isEmpty()) + place['prefLabelByLang'] = prefLabel.collectEntries { [it.getLanguage(), it.getLexicalForm()] } + + List description = getDescription().findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } + if (!description.isEmpty()) + place['descriptionByLang'] = description.collectEntries { [it.getLanguage(), it.getLexicalForm()] } + + List country = getCountry().findAll { it.toString() != entityIri } + if (!country.isEmpty()) + place['country'] = country.collect { ['@id': replaceIfCountry(it.toString())] } + + List locatedIn = getLocatedIn() - country + if (!locatedIn.isEmpty()) + place['locatedIn'] = locatedIn.collect { ['@id': replaceIfCountry(it.toString())] } + + List ddc = getDdc().collect { code, edition -> + Map bNode = + [ + '@type': "ClassificationDdc", + 'code' : 
code.toString() + ] + if (edition) + bNode['edition'] = ['@id': edition.toString()] + + return bNode + } + + List lcsh = getLcsh().collect { + ['@id': it.toString()] + } + + List fast = getFast().collect { + ['@id': it.toString()] + } + + List getty = getGetty().collect { + ['@id': it.toString()] + } + + List closeMatches = ddc + lcsh + fast + getty + + if (closeMatches) { + place['closeMatch'] = closeMatches + } + + List identifiers = getPlaceIdentifiers() + if (!identifiers.isEmpty()) + place['exactMatch'] = identifiers.collect { ['@id': it.toString()] } + + return place + } + + String replaceIfCountry(String id) { + return countryMap.get(id, id) + } + + Map convertPerson() { + Map person = + [ + '@id' : entityIri, + '@type': "Person" + ] + + List prefLabel = getPrefLabel().findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } + if (!prefLabel.isEmpty()) + person['prefLabelByLang'] = prefLabel.collectEntries { [it.getLanguage(), it.getLexicalForm()] } + + return person + } + + List getPrefLabel() { + String queryString = "SELECT ?prefLabel { wd:${shortId} skos:prefLabel ?prefLabel }" + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { it.get("prefLabel") } + } + + List getDescription() { + String queryString = "SELECT ?description { wd:${shortId} sdo:description ?description }" + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { it.get("description") } + } + + List getCountry() { + String queryString = "SELECT ?country { wd:${shortId} wdt:${COUNTRY} ?country }" + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { it.get("country") } + } + + List getLocatedIn() { + String queryString = """ + SELECT DISTINCT ?place { + wd:${shortId} p:${LOCATED_IN} ?stmt . + ?stmt ps:${LOCATED_IN} ?place . 
+                FILTER NOT EXISTS { ?stmt pq:${END_TIME} ?endTime }
+            }
+        """

+
+        ResultSet rs = QueryRunner.localSelectResult(queryString, graph)
+
+        return rs.collect { it.get("place") }
+    }
+
+    // DDC codes with their optional edition qualifier. Qualifiers hang off
+    // the statement node (p:/ps:), not the truthy wdt: value: the original
+    // query bound ?stmt to the code literal itself, so the edition OPTIONAL
+    // could never match, and the self-join duplicated rows.
+    List<List<RDFNode>> getDdc() {
+        String queryString = """
+            SELECT ?code ?edition {
+                wd:${shortId} p:${DDC} ?stmt .
+                ?stmt ps:${DDC} ?code .
+                OPTIONAL { ?stmt pq:${EDITION} ?edition }
+            }
+        """
+
+        ResultSet rs = QueryRunner.localSelectResult(queryString, graph)
+
+        return rs.collect { [it.get("code"), it.get("edition")] }
+    }
+
+    // LCSH URIs built from the P244 short id; "sh" prefix filters out name
+    // authority ids that share the property.
+    List<RDFNode> getLcsh() {
+        String queryString = """
+            SELECT ?id {
+                wd:${shortId} wdt:${LC_AUTH} ?shortId .
+                bind(iri(concat("http://id.loc.gov/authorities/subjects/", ?shortId)) as ?id)
+                FILTER(strstarts(?shortId, "sh"))
+            }
+        """
+
+        ResultSet rs = QueryRunner.localSelectResult(queryString, graph)
+
+        return rs.collect { it.get("id") }
+    }
+
+    List<RDFNode> getFast() {
+        String queryString = """
+            SELECT ?fastId {
+                wd:${shortId} wdtn:${FAST} ?fastId
+            }
+        """
+
+        ResultSet rs = QueryRunner.localSelectResult(queryString, graph)
+
+        return rs.collect { it.get("fastId") }
+    }
+
+    // The original projected ?fastId (copy-paste from getFast()) while the
+    // pattern bound ?gettyId, so the selected column was always unbound.
+    List<RDFNode> getGetty() {
+        String queryString = """
+            SELECT ?gettyId {
+                wd:${shortId} wdtn:${GETTY} ?gettyId
+            }
+        """
+
+        ResultSet rs = QueryRunner.localSelectResult(queryString, graph)
+
+        return rs.collect { it.get("gettyId") }
+    }
+
+    // Collect all present external place identifiers from the single result
+    // row (the VALUES clause plus OPTIONALs yield at most one row).
+    List<RDFNode> getPlaceIdentifiers() {
+        String queryString = """
+            SELECT ?freebaseId ?geonamesId ?toraId ?ysoId {
+                VALUES ?place { wd:${shortId} }
+
+                OPTIONAL { ?place wdtn:${FREEBASE} ?freebaseId }
+                OPTIONAL { ?place wdtn:${GEONAMES} ?geonamesId }
+                OPTIONAL { ?place wdt:${TORA} ?toraShortId }
+                OPTIONAL { ?place wdtn:${YSO} ?ysoId }
+
+                bind(iri(concat("https://data.riksarkivet.se/tora/", ?toraShortId)) as ?toraId)
+            }
+        """
+
+        ResultSet rs = QueryRunner.localSelectResult(queryString, graph)
+
+        // Guard against an empty result set instead of calling next() blindly.
+        QuerySolution singleRowResult = rs.hasNext() ? rs.next() : null
+
+        return rs.getResultVars().findResults { singleRowResult?.get(it) }
+    }
+
+    KbvType type() {
+        String queryString = "SELECT ?type { 
wd:${shortId} wdt:${INSTANCE_OF} ?type }" + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + Set wdTypes = rs.collect { it.get("type").toString() } as Set + + return KbvType.values().find { getSubclasses(it).intersect(wdTypes) } ?: KbvType.OTHER + } + + @Memoized + static Set getSubclasses(KbvType type) { + if (type == KbvType.OTHER) { + return Collections.EMPTY_SET + } + + String queryString = "SELECT ?class { ?class wdt:${SUBCLASS_OF}* wd:${type.wikidataType} }" + + ResultSet rs = QueryRunner.remoteSelectResult(queryString, WIKIDATA_ENDPOINT) + + return rs.collect { it.get("class").toString() }.toSet() + } + + String getShortId(String iri) { + iri.replaceAll(/.*\//, '') + } +} + diff --git a/whelk-core/src/main/groovy/whelk/util/Metrics.groovy b/whelk-core/src/main/groovy/whelk/util/Metrics.groovy index 46a741dfb6..7256dacefa 100644 --- a/whelk-core/src/main/groovy/whelk/util/Metrics.groovy +++ b/whelk-core/src/main/groovy/whelk/util/Metrics.groovy @@ -2,8 +2,11 @@ package whelk.util import io.prometheus.client.Counter import io.prometheus.client.Summary +import io.prometheus.client.guava.cache.CacheMetricsCollector class Metrics { + static final CacheMetricsCollector cacheMetrics = new CacheMetricsCollector().register() + static final Summary clientTimer = Summary.build() .labelNames("target", "method") .quantile(0.5, 0.05) diff --git a/whelktool/scripts/analysis/lxl-2483-broken-or-external-links.groovy b/whelktool/scripts/analysis/lxl-2483-broken-or-external-links.groovy new file mode 100644 index 0000000000..f600ea8433 --- /dev/null +++ b/whelktool/scripts/analysis/lxl-2483-broken-or-external-links.groovy @@ -0,0 +1,31 @@ +import groovy.transform.Memoized +import whelk.util.DocumentUtil + +whelk = getWhelk() + +selectBySqlWhere('deleted is false', silent: true) { doc -> + DocumentUtil.findKey(doc.graph, "@id") { value, path -> + if (is404(value)) { + incrementStats('404', value) + } + } +} + +@Memoized +boolean is404(iri) { + String 
systemId = whelk.storage.getSystemIdByIri(iri) + return !systemId +} + +def getWhelk() { + // A little hack to get a handle to whelk... + def whelk = null + selectByIds(['https://id.kb.se/marc']) { docItem -> + whelk = docItem.whelk + } + if (!whelk) { + throw new RuntimeException("Could not get Whelk") + } + return whelk +} + diff --git a/whelktool/scripts/analysis/lxl-2483-publication-place.groovy b/whelktool/scripts/analysis/lxl-2483-publication-place.groovy new file mode 100644 index 0000000000..0a235479b6 --- /dev/null +++ b/whelktool/scripts/analysis/lxl-2483-publication-place.groovy @@ -0,0 +1,46 @@ +import whelk.Document + +errors = getReportWriter("errors.txt") + +prod = ['publication', 'production', 'manufacture'] + +selectByCollection('bib') { doc -> + try { + process(doc) + } + catch (Exception e) { + def m = "${doc.doc.shortId} $e" + println(m) + errors.println(m) + } +} + +void process(doc) { + prod.each { p -> + getPathSafe(doc.graph, [1, p], []).each { + def place = asList(getPathSafe(it, ['place', 'label'])).flatten().join(' | ') + if (place) { + incrementStats(p, place) + } + } + } +} + +Object getPathSafe(item, path, defaultTo = null) { + for (p in path) { + if (item[p] != null) { + item = item[p] + } else { + return defaultTo + } + } + return item +} + +private List asList(Object o) { + if (o == null) + return [] + if (o instanceof List) + return o + return [o] +} \ No newline at end of file