From 82553e9a7d960cdf75288e206698d1061b4d72f5 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 29 Nov 2023 14:17:53 +0100 Subject: [PATCH 1/7] Use constants, add comments, improve naming etc. --- librisworks/run.sh | 10 +- .../add-missing-contribution-data.groovy | 6 + .../scripts/contributions-to-instance.groovy | 9 ++ .../drop-anonymous-translations.groovy | 4 + librisworks/scripts/find-work-clusters.groovy | 36 +++-- .../scripts/language-in-work-title.groovy | 5 + .../lxl-4150-deduplicate-contribution.groovy | 45 +++++- librisworks/scripts/merge-works.groovy | 41 +++-- librisworks/scripts/swedish-fiction.groovy | 5 + librisworks/scripts/title-clusters.groovy | 5 + .../se/kb/libris/mergeworks/DisplayDoc.groovy | 105 +++++++++---- .../groovy/se/kb/libris/mergeworks/Doc.groovy | 147 +++++++++++------- .../se/kb/libris/mergeworks/Util.groovy | 116 ++++++++------ .../libris/mergeworks/WorkComparator.groovy | 9 +- .../mergeworks/compare/ContentType.groovy | 3 +- .../mergeworks/compare/GenreForm.groovy | 38 +++-- .../compare/IntendedAudience.groovy | 2 + .../mergeworks/compare/TranslationOf.groovy | 23 ++- 18 files changed, 416 insertions(+), 193 deletions(-) diff --git a/librisworks/run.sh b/librisworks/run.sh index bed301cfba..2cd0b8fa16 100755 --- a/librisworks/run.sh +++ b/librisworks/run.sh @@ -1,6 +1,9 @@ #!/bin/bash set -eu +# Find, match and merge work descriptions that describe the same work. +# Usage example: ./run.sh qa --num-threads 8 + count_lines() { if [ -f $1 ]; then wc -l $1 | cut -d ' ' -f 1 @@ -47,15 +50,16 @@ ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/4-roles-to-instance echo "Finding new clusters..." 
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \ $ARGS --report $ALL/$WHELKTOOL_REPORT $SCRIPTS_DIR/find-work-clusters.groovy >$ALL/$CLUSTER_TSV 2>/dev/null + +# Filter out duplicates +sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV + NUM_CLUSTERS=$(count_lines $ALL/$CLUSTER_TSV) echo "$NUM_CLUSTERS clusters found" if [ $NUM_CLUSTERS == 0 ]; then exit 0 fi -# Filter out duplicates -sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV - echo echo "Finding title clusters..." time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$ALL/$CLUSTER_TSV -jar $JAR_FILE \ diff --git a/librisworks/scripts/add-missing-contribution-data.groovy b/librisworks/scripts/add-missing-contribution-data.groovy index e9f87032cd..05eead6637 100644 --- a/librisworks/scripts/add-missing-contribution-data.groovy +++ b/librisworks/scripts/add-missing-contribution-data.groovy @@ -1,3 +1,8 @@ +/** + * Use various methods for completing and normalizing contributions within a work cluster. + * See individual methods for details. + */ + import groovy.transform.Memoized import org.apache.commons.lang3.StringUtils @@ -132,6 +137,7 @@ selectByIds(clusters.flatten()) { bib -> modified |= tryLinkAgent(c, id) // if there are more roles stated in responsibilityStatement other than the existing ones in this contribution, add those modified |= tryAddRolesFromRespStatement(c, contributionsInRespStatement, respStatement, id) + // if two local agents match on name and one of them has lifeSpan and the other doesn't, add that lifeSpan to the one missing it. 
modified |= tryAddLifeSpanToLocalAgent(c, id) } diff --git a/librisworks/scripts/contributions-to-instance.groovy b/librisworks/scripts/contributions-to-instance.groovy index c981ae4517..a19c6df4a8 100644 --- a/librisworks/scripts/contributions-to-instance.groovy +++ b/librisworks/scripts/contributions-to-instance.groovy @@ -1,3 +1,12 @@ +/** + * Move contribution to instance if the role's domain is (or is subclass of) Embodiment. + * Also move illustrator to instance if none of the following criteria is met: + * - The illustrator is the primary contributor (PrimaryContribution) + * - Classification indicates a picture book or comics + * - Genre/form indicates a picture book or comics + * See isComics() and isPictureBook() below for details. + */ + import whelk.Whelk import java.util.concurrent.ConcurrentHashMap diff --git a/librisworks/scripts/drop-anonymous-translations.groovy b/librisworks/scripts/drop-anonymous-translations.groovy index 34df52aabd..e522d8bc28 100644 --- a/librisworks/scripts/drop-anonymous-translations.groovy +++ b/librisworks/scripts/drop-anonymous-translations.groovy @@ -1,3 +1,7 @@ +/** + * Drop works that are translations but lacking a translator in contribution from clusters. + */ + import se.kb.libris.mergeworks.Doc new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> diff --git a/librisworks/scripts/find-work-clusters.groovy b/librisworks/scripts/find-work-clusters.groovy index b69a4b0ffb..47681a374b 100644 --- a/librisworks/scripts/find-work-clusters.groovy +++ b/librisworks/scripts/find-work-clusters.groovy @@ -1,7 +1,21 @@ /** + * Find clusters of records that may contain descriptions of the same work. + * In short, similar descriptions are found by, for each bib record, querying Elastic for other records + * having the same instance or work title and the same agent(s) in work contribution. + * The ids found by the query become a cluster. + * See script for more details. 
+ * + * (When running, redirect STDERR to avoid annoying prints from whelktool) */ +import static se.kb.libris.mergeworks.Util.AGENT +import static se.kb.libris.mergeworks.Util.HAS_TITLE +import static se.kb.libris.mergeworks.Util.MAIN_TITLE +import static se.kb.libris.mergeworks.Util.PRIMARY +import static se.kb.libris.mergeworks.Util.CONTRIBUTION +import static whelk.JsonLd.ID_KEY +import static whelk.JsonLd.TYPE_KEY + PrintWriter failedQueries = getReportWriter("failed-queries") PrintWriter tooLargeResult = getReportWriter("too-large-result") @@ -12,6 +26,7 @@ def process = { bib -> if (!work) return + // Get mainTitle from both instance and work (we want to search for both when they differ) def titles = [instance, work].grep().collect { title(it) }.grep().unique() Set ids = [] @@ -27,6 +42,7 @@ def process = { bib -> if (ids.size() > 1000) { tooLargeResult.println("Results: ${ids.size()} Id: ${bib.doc.shortId} Titles: ${titles}") } else if (ids.size() > 1) { + // Sort so that duplicate clusters can easily be identified println(ids.sort().join('\t')) } } @@ -51,6 +67,7 @@ Map> buildQuery(Map work, String title) { insertLinkedAgents(work) def card = getWhelk().jsonld.toCard(work, false, true) + // If there is a primary contributor, include only that agent in the query def author = primaryContributor(card).collect { esSafe(it) } if (author) { query["or-instanceOf.contribution._str"] = author @@ -58,6 +75,7 @@ Map> buildQuery(Map work, String title) { return query } + // If no primary contributor, include all agents in the query def allContributors = contributors(card).collect { esSafe(it) } if (allContributors) { query["or-instanceOf.contribution._str"] = allContributors @@ -69,29 +87,29 @@ Map> buildQuery(Map work, String title) { } private void insertLinkedAgents(work) { - asList(work['contribution']).each { - def agent = asList(it.agent).find() - if (agent && agent['@id']) { - it.agent = loadThing(agent['@id']) + asList(work[CONTRIBUTION]).each { + def agent = 
asList(it[AGENT]).find() + if (agent && agent[ID_KEY]) { + it.agent = loadThing(agent[ID_KEY]) } } } private String title(Map thing) { - return getAtPath(thing, ['hasTitle', 0, 'mainTitle']) + return getAtPath(thing, [HAS_TITLE, 0, MAIN_TITLE]) } private List primaryContributor(work) { - contributorStrings(asList(work['contribution']).find { it['@type'] == "PrimaryContribution" }) + contributorStrings(asList(work[CONTRIBUTION]).find { it[TYPE_KEY] == PRIMARY }) } private List contributors(work) { - asList(work['contribution']).collect { contributorStrings(it) }.grep().flatten() + asList(work[CONTRIBUTION]).collect { contributorStrings(it) }.grep().flatten() } //getAtPath(contribution, ['_str'])?.with { String s -> s.replaceAll(/[^ \p{IsAlphabetic}]/, '') } private List contributorStrings(contribution) { - List variants = asList(contribution?.agent) + asList(getAtPath(contribution, ['agent', 'hasVariant'])) + List variants = asList(contribution?[AGENT]) + asList(getAtPath(contribution, [AGENT, 'hasVariant'])) variants.grep().collect { name(it) }.grep() } @@ -108,7 +126,7 @@ private String esSafe(String s) { } private loadIfLink(Map work) { - work?['@id'] ? loadThing(work['@id']) : work + work?[ID_KEY] ? loadThing(work[ID_KEY]) : work } private Map loadThing(def id) { diff --git a/librisworks/scripts/language-in-work-title.groovy b/librisworks/scripts/language-in-work-title.groovy index c05faa2d74..08b889dda3 100644 --- a/librisworks/scripts/language-in-work-title.groovy +++ b/librisworks/scripts/language-in-work-title.groovy @@ -1,3 +1,8 @@ +/** + * Remove language appearing as a substring of work main title if already described in the language property. + * E.g. 
"Pippi Långstrump (Svenska)" --> "Pippi Långstrump" when work.language = [{'@id': 'https://id.kb.se/language/swe'}] + */ + import groovy.transform.Memoized import whelk.util.DocumentUtil diff --git a/librisworks/scripts/lxl-4150-deduplicate-contribution.groovy b/librisworks/scripts/lxl-4150-deduplicate-contribution.groovy index 8174b3d5d5..a949d3160f 100644 --- a/librisworks/scripts/lxl-4150-deduplicate-contribution.groovy +++ b/librisworks/scripts/lxl-4150-deduplicate-contribution.groovy @@ -1,26 +1,57 @@ +/** + * Merge contributions having the same agent. Prefer merging into PrimaryContribution. + * Example: + * [ + * { + * "@type": "PrimaryContribution", + * "agent": {"@id": "https://libris.kb.se/X#it"}, + * "role": [{"@id": "https://id.kb.se/relator/author"}] + * }, + * { + * "@type": "Contribution", + * "agent": {"@id": "https://libris.kb.se/X#it"}, + * "role": [{"@id": "https://id.kb.se/relator/illustrator"}] + * } + * ] + * ---> + * [ + * { + * "@type": "PrimaryContribution", + * "agent": {"@id": "https://libris.kb.se/X#it"}, + * "role": [{"@id": "https://id.kb.se/relator/author"}, {"@id": "https://id.kb.se/relator/illustrator"}] + * } + * ] + */ + +import static se.kb.libris.mergeworks.Util.AGENT +import static se.kb.libris.mergeworks.Util.CONTRIBUTION +import static se.kb.libris.mergeworks.Util.PRIMARY +import static se.kb.libris.mergeworks.Util.ROLE +import static whelk.JsonLd.TYPE_KEY + def ids = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }.flatten() selectByIds(ids) { bib -> def work = bib.graph[1].instanceOf - def contribution = work?.contribution + def contribution = work?[CONTRIBUTION] if (!contribution) return - def duplicates = contribution.countBy { asList(it.agent) }.findResults { it.value > 1 ? it.key : null } + def duplicates = contribution.countBy { asList(it[AGENT]) }.findResults { it.value > 1 ? 
it.key : null } duplicates.each { d -> - def primaryContributionIdx = contribution.findIndexOf { asList(it.agent) == d && it['@type'] == 'PrimaryContribution' } + def primaryContributionIdx = contribution.findIndexOf { asList(it[AGENT]) == d && it[TYPE_KEY] == PRIMARY } def mergeIntoIdx = primaryContributionIdx > -1 ? primaryContributionIdx - : contribution.findIndexOf { asList(it.agent) == d } + : contribution.findIndexOf { asList(it[AGENT]) == d } def mergeInto = contribution[mergeIntoIdx] - def roles = contribution.findResults { asList(it.agent) == d ? asList(it.role) : null }.flatten().unique() - if (roles) mergeInto['role'] = roles + def roles = contribution.findResults { asList(it[AGENT]) == d ? asList(it[ROLE]) : null }.flatten().unique() + if (roles) mergeInto[ROLE] = roles def idx = 0 contribution.removeAll { - def removeIf = asList(it.agent) == d && idx != mergeIntoIdx + def removeIf = asList(it[AGENT]) == d && idx != mergeIntoIdx idx += 1 return removeIf } diff --git a/librisworks/scripts/merge-works.groovy b/librisworks/scripts/merge-works.groovy index 1bdfc81ee7..aebffa87a4 100644 --- a/librisworks/scripts/merge-works.groovy +++ b/librisworks/scripts/merge-works.groovy @@ -1,8 +1,27 @@ +/** + * Match and merge works. + * + * First create clusters of works that are considered equal according to given criteria. + * If a work cluster contains only local works (two or more), merge those and create a new linkable work. + * If a work cluster contains exactly one linked work and at least one local work, merge the local work(s) into the linked one. + * If a work cluster contains two or more linked works, report. There should be no duplicate linked works. + * + * If multiple work clusters are found, add closeMatch links from each unique work to each resulting linked work. + * + * See script for details. 
+ */ + import se.kb.libris.mergeworks.Html import se.kb.libris.mergeworks.WorkComparator import se.kb.libris.mergeworks.Doc +import static whelk.JsonLd.GRAPH_KEY +import static whelk.JsonLd.ID_KEY + import static se.kb.libris.mergeworks.Util.workClusters +import static whelk.JsonLd.THING_KEY +import static whelk.JsonLd.TYPE_KEY +import static whelk.JsonLd.WORK_KEY maybeDuplicates = getReportWriter("maybe-duplicate-linked-works.tsv") multiWorkReport = getReportWriter("multi-work-clusters.html") @@ -34,6 +53,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> List>> uniqueWorksAndTheirInstances = [] workClusters(docs, c).each { wc -> + // Only local works have instance data in the same record def (localWorks, linkedWorks) = wc.split { it.instanceData } if (linkedWorks.isEmpty()) { if (localWorks.size() == 1) { @@ -64,7 +84,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> } // New merged work if (!workDoc.existsInStorage && !workDoc.instanceData) { - addAdminMetadata(workDoc, instanceDocs.collect { ['@id': it.recordIri()] }) + addAdminMetadata(workDoc, instanceDocs.collect { [(ID_KEY): it.recordIri()] }) addCloseMatch(workDoc, linkableWorks) saveAndLink(workDoc, instanceDocs, workDoc.existsInStorage) // writeWorkReport(docs, workDoc, instanceDocs, WorkStatus.NEW) @@ -76,6 +96,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> } } + // Multiple unique works in same title cluster, save report showing how they differ. 
if (uniqueWorksAndTheirInstances.size() > 1) { def (workDocs, instanceDocs) = uniqueWorksAndTheirInstances.transpose() multiWorkReport.print(Html.hubTable(workDocs, instanceDocs) + Html.HORIZONTAL_RULE) @@ -101,20 +122,20 @@ void saveAndLink(Doc workDoc, Collection instanceDocs = [], boolean existsI if (!instanceDocs.isEmpty()) { selectByIds(instanceDocs.collect { it.shortId() }) { - it.graph[1]['instanceOf'] = ['@id': workDoc.thingIri()] + it.graph[1][WORK_KEY] = [(ID_KEY): workDoc.thingIri()] it.scheduleSave(changedBy: changedBy, generationProcess: generationProcess) } } } Doc createNewWork(Map workData) { - workData['@id'] = "TEMPID#it" + workData[ID_KEY] = "TEMPID#it" Map data = [ - "@graph": [ + (GRAPH_KEY): [ [ - "@id" : "TEMPID", - "@type" : "Record", - "mainEntity": ["@id": "TEMPID#it"], + (ID_KEY) : "TEMPID", + (TYPE_KEY) : "Record", + (THING_KEY): [(ID_KEY): "TEMPID#it"], ], workData @@ -127,12 +148,12 @@ Doc createNewWork(Map workData) { void addAdminMetadata(Doc doc, List derivedFrom) { doc.record()['hasChangeNote'] = [ [ - '@type': 'CreateNote', + (TYPE_KEY): 'CreateNote', 'tool' : ['@id': 'https://id.kb.se/generator/mergeworks'] ] ] doc.record()['derivedFrom'] = derivedFrom - doc.record()['descriptionLanguage'] = ['@id': 'https://id.kb.se/language/swe'] + doc.record()['descriptionLanguage'] = [(ID_KEY): 'https://id.kb.se/language/swe'] } void writeWorkReport(Collection titleCluster, Doc derivedWork, Collection derivedFrom, WorkStatus workStatus) { @@ -175,7 +196,7 @@ boolean addCloseMatch(Doc workDoc, List linkableWorks) { def linkTo = linkableWorks.findAll { d -> d.workIri() != workDoc.thingIri() && d.primaryContributor() == workDoc.primaryContributor() - }.collect { ['@id': it.workIri()] } + }.collect { [(ID_KEY): it.workIri()] } def closeMatch = asList(workDoc.workData['closeMatch']) diff --git a/librisworks/scripts/swedish-fiction.groovy b/librisworks/scripts/swedish-fiction.groovy index 76a65c7e7c..e3d1f28f42 100644 --- 
a/librisworks/scripts/swedish-fiction.groovy +++ b/librisworks/scripts/swedish-fiction.groovy @@ -1,3 +1,8 @@ +/** + * Filter clusters: Keep only clusters that are part of the SVSK selection. + * SVSK ≈ fiction in Swedish, see detailed criteria in script. + */ + import se.kb.libris.mergeworks.Doc new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) {cluster -> diff --git a/librisworks/scripts/title-clusters.groovy b/librisworks/scripts/title-clusters.groovy index 28e6a0eab8..0b599833ae 100644 --- a/librisworks/scripts/title-clusters.groovy +++ b/librisworks/scripts/title-clusters.groovy @@ -1,3 +1,7 @@ +/** + * Partition each cluster into smaller clusters based on strict title matching. + */ + import se.kb.libris.mergeworks.Doc import static se.kb.libris.mergeworks.Util.partition @@ -15,6 +19,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) {cluster -> Collection> titleClusters(Collection docs) { return partitionByTitle(docs) .findAll { !it.any { doc -> doc.hasGenericTitle() } } + // Replace instances sharing the same linked work with only the linked work .collect { loadUniqueLinkedWorks(it) + it.findAll {d -> !d.workIri() } } .findAll { it.size() > 1 } .sort { a, b -> a.first().view.instanceDisplayTitle() <=> b.first().view.instanceDisplayTitle() } diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy index 3fed6e1987..7b9c549204 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy @@ -2,8 +2,37 @@ package se.kb.libris.mergeworks import whelk.Document import whelk.JsonLd +import whelk.Whelk import whelk.util.DocumentUtil +import static Util.AGENT +import static Util.CLASSIFICATION +import static Util.CODE +import static Util.CONTRIBUTION +import static Util.EDITION_STATEMENT +import static Util.ENCODING_LEVEL +import static 
Util.EXTENT +import static Util.FAMILY_NAME +import static Util.FLAT_TITLE +import static Util.GIVEN_NAME +import static Util.HAS_TITLE +import static Util.IDENTIFIED_BY +import static Util.IN_SCHEME +import static Util.LABEL +import static Util.LIFE_SPAN +import static Util.NAME +import static Util.PHYS_NOTE +import static Util.PRIMARY +import static Util.PUBLICATION +import static Util.REPRODUCTION_OF +import static Util.RESP_STATEMENT +import static Util.ROLE +import static Util.VERSION +import static whelk.JsonLd.ID_KEY +import static whelk.JsonLd.TYPE_KEY +import static whelk.JsonLd.WORK_KEY + + class DisplayDoc { Doc doc Map framed @@ -13,60 +42,76 @@ class DisplayDoc { } private static String displayTitle(Map thing) { - thing['hasTitle'].collect { it['@type'] + ": " + it['flatTitle'] }.join(', ') + thing[HAS_TITLE].collect { it[TYPE_KEY] + ": " + it[FLAT_TITLE] }.join(', ') } String instanceDisplayTitle() { - displayTitle(['hasTitle': Util.flatTitles(doc.instanceTitle())]) + displayTitle([(HAS_TITLE): Util.flatTitles(doc.instanceTitle())]) } - // TODO... String getDisplayText(String field) { - if (field == 'contribution') { + if (field == CONTRIBUTION) { return contributorStrings().join("
") - } else if (field == 'classification') { + } else if (field == CLASSIFICATION) { return classificationStrings().join("
") } else if (field == 'instance title') { return doc.instanceTitle() ?: '' } else if (field == 'instance type') { return doc.instanceType() ?: '' - } else if (field == 'editionStatement') { + } else if (field == EDITION_STATEMENT) { return doc.editionStatement() ?: '' - } else if (field == 'responsibilityStatement') { + } else if (field == RESP_STATEMENT) { return doc.responsibilityStatement() ?: '' - } else if (field == 'encodingLevel') { + } else if (field == ENCODING_LEVEL) { return doc.encodingLevel() - } else if (field == 'publication') { + } else if (field == PUBLICATION) { return chipString(doc.publication()) - } else if (field == 'identifiedBy') { + } else if (field == IDENTIFIED_BY) { return chipString(doc.identifiedBy()) - } else if (field == 'extent') { + } else if (field == EXTENT) { return chipString(doc.extent() ?: []) - } else if (field == 'reproductionOf') { + } else if (field == REPRODUCTION_OF) { return reproductionOfLink() - } else if (field == 'physicalDetailsNote') { + } else if (field == PHYS_NOTE) { return doc.physicalDetailsNote() ?: '' } else { return chipString(doc.workData.getOrDefault(field, [])) } } - protected String chipString(def thing) { - Util.chipString(thing, doc.whelk) + private String chipString(def thing) { + if (thing instanceof Integer) { + return thing + } + + def chips = doc.whelk.jsonld.toChip(thing) + if (chips.size() < 2) { + chips = thing + } + if (chips instanceof List) { + return chips.collect { valuesString(it) }.sort().join('
') + } + return valuesString(chips) + } + + private String valuesString(def thing) { + if (thing instanceof List) { + return thing.collect { valuesString(it) }.join(' • ') + } + if (thing instanceof Map) { + return thing.findAll { k, v -> k != TYPE_KEY }.values().collect { valuesString(it) }.join(' • ') + } + return thing.toString() } private String reproductionOfLink() { def base = Document.getBASE_URI().toString() def shortId = doc.reproductionOf() - ? doc.reproductionOf()[0]['@id'].substring(base.length()).replace('#it', '') + ? doc.reproductionOf()[0][ID_KEY].substring(base.length()).replace('#it', '') : '' return "$shortId" } - String tooltip(String string, String tooltip) { - """${string}""" - } - String link() { String base = Document.getBASE_URI().toString() String kat = "katalogisering/" @@ -75,7 +120,7 @@ class DisplayDoc { } private List contributorStrings() { - List path = doc.instanceData ? ['instanceOf', 'contribution'] : ['contribution'] + List path = doc.instanceData ? [WORK_KEY, CONTRIBUTION] : [CONTRIBUTION] List contribution = DocumentUtil.getAtPath(getFramed(), path, []) return contribution.collect { Map c -> @@ -86,14 +131,14 @@ class DisplayDoc { private String contributionStr(Map contribution) { StringBuilder s = new StringBuilder() - if (contribution['@type'] == 'PrimaryContribution') { + if (contribution[TYPE_KEY] == PRIMARY) { s.append('') } - s.append(flatMaybeLinked(contribution['role'], ['code', 'label']).with { it.isEmpty() ? it : it + ': ' }) - s.append(flatMaybeLinked(contribution['agent'], ['givenName', 'familyName', 'lifeSpan', 'name'])) + s.append(flatMaybeLinked(contribution[ROLE], [CODE, LABEL]).with { it.isEmpty() ? 
it : it + ': ' }) + s.append(flatMaybeLinked(contribution[AGENT], [GIVEN_NAME, FAMILY_NAME, LIFE_SPAN, NAME])) - if (contribution['@type'] == 'PrimaryContribution') { + if (contribution[TYPE_KEY] == PRIMARY) { s.append('') } @@ -101,13 +146,13 @@ class DisplayDoc { } List classificationStrings() { - List path = doc.instanceData ? ['instanceOf', 'classification'] : ['classification'] + List path = doc.instanceData ? [WORK_KEY, CLASSIFICATION] : [CLASSIFICATION] List classification = DocumentUtil.getAtPath(getFramed(), path, []) classification.collect { c -> StringBuilder s = new StringBuilder() - s.append(flatMaybeLinked(c['inScheme'], ['code', 'version']).with { it.isEmpty() ? it : it + ': ' }) - s.append(flatMaybeLinked(c, ['code'])) + s.append(flatMaybeLinked(c[IN_SCHEME], [CODE, VERSION]).with { it.isEmpty() ? it : it + ': ' }) + s.append(flatMaybeLinked(c, [CODE])) return s.toString() } } @@ -121,8 +166,8 @@ class DisplayDoc { } String s = flatten(thing, order, ', ') - thing['@id'] - ? """$s""" + thing[ID_KEY] + ? 
"""$s""" : s } diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy index ee7468beb2..2378fa9b1e 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy @@ -1,14 +1,49 @@ package se.kb.libris.mergeworks import whelk.Document -import whelk.JsonLd import whelk.Whelk import whelk.datatool.DocumentItem import whelk.util.DocumentUtil import static Util.asList import static Util.Relator - +import static Util.AGENT +import static Util.CLASSIFICATION +import static Util.CODE +import static Util.CONTENT_TYPE +import static Util.CONTRIBUTION +import static Util.EDITION_STATEMENT +import static Util.ENCODING_LEVEL +import static Util.EXTENT +import static Util.GENRE_FORM +import static Util.HAS_PART +import static Util.HAS_TITLE +import static Util.IDENTIFIED_BY +import static Util.INTENDED_AUDIENCE +import static Util.IN_SCHEME +import static Util.LABEL +import static Util.MAIN_TITLE +import static Util.PART_NAME +import static Util.PART_NUMBER +import static Util.PHYS_NOTE +import static Util.PRIMARY +import static Util.PUBLICATION +import static Util.REPRODUCTION_OF +import static Util.RESP_STATEMENT +import static Util.ROLE +import static Util.SUBTITLE +import static Util.TITLE +import static Util.TITLE_REMAINDER +import static Util.TRANSLATION_OF +import static whelk.JsonLd.GRAPH_KEY +import static whelk.JsonLd.ID_KEY +import static whelk.JsonLd.TYPE_KEY +import static whelk.JsonLd.WORK_KEY + + +/** + * Wrapper around a whelk.Document for easy access to various entities/properties + */ class Doc { public static final String SAOGF_SKÖN = 'https://id.kb.se/term/saogf/Sk%C3%B6nlitteratur' public static final List MARC_FICTION = [ @@ -66,9 +101,9 @@ class Doc { } void setData() { - if (mainEntity()['instanceOf']) { + if (mainEntity()[WORK_KEY]) { instanceData = mainEntity() - workData = 
asList(instanceData['instanceOf']).find() + workData = asList(instanceData[WORK_KEY]).find() } else { workData = mainEntity() } @@ -83,11 +118,11 @@ class Doc { } Map record() { - document.data['@graph'][0] + document.data[GRAPH_KEY][0] } Map mainEntity() { - document.data['@graph'][1] + document.data[GRAPH_KEY][1] } String shortId() { @@ -103,15 +138,15 @@ class Doc { } String encodingLevel() { - return record()['encodingLevel'] ?: '' + return record()[ENCODING_LEVEL] ?: '' } String workIri() { - workData['@id'] + workData[ID_KEY] } List workTitle() { - asList(workData['hasTitle']) + asList(workData[HAS_TITLE]) } List flatWorkTitle() { @@ -123,7 +158,7 @@ class Doc { } List instanceTitle() { - asList(instanceData?.hasTitle) + asList(instanceData?[HAS_TITLE]) } List flatInstanceTitle() { @@ -135,67 +170,67 @@ class Doc { } String workType() { - workData['@type'] + workData[TYPE_KEY] } String instanceType() { - instanceData?.'@type' + instanceData?[TYPE_KEY] } List translationOf() { - asList(workData['translationOf']) + asList(workData[TRANSLATION_OF]) } List contribution() { - asList(workData['contribution']) + asList(workData[CONTRIBUTION]) } List classification() { - asList(workData['classification']) + asList(workData[CLASSIFICATION]) } List genreForm() { - asList(workData['genreForm']) + asList(workData[GENRE_FORM]) } List intendedAudience() { - asList(workData['intendedAudience']) + asList(workData[INTENDED_AUDIENCE]) } List publication() { - asList(instanceData?.publication) + asList(instanceData?[PUBLICATION]) } List identifiedBy() { - asList(instanceData?.identifiedBy) + asList(instanceData?[IDENTIFIED_BY]) } List extent() { - asList(instanceData?.extent) + asList(instanceData?[EXTENT]) } List reproductionOf() { - asList(instanceData?.reproductionOf) + asList(instanceData?[REPRODUCTION_OF]) } Map primaryContributor() { - contribution().findResult { it['@type'] == 'PrimaryContribution' ? 
asList(it.agent).find() : null } + contribution().findResult { it[TYPE_KEY] == PRIMARY ? asList(it[AGENT]).find() : null } } String editionStatement() { - instanceData?.editionStatement + instanceData?[EDITION_STATEMENT] } String responsibilityStatement() { - instanceData?.responsibilityStatement + instanceData?[RESP_STATEMENT] } String physicalDetailsNote() { - instanceData?.physicalDetailsNote + instanceData?[PHYS_NOTE] } int numPages() { - String extent = DocumentUtil.getAtPath(extent(), [0, 'label', 0]) ?: DocumentUtil.getAtPath(extent(), [0, 'label'], '') + String extent = DocumentUtil.getAtPath(extent(), [0, LABEL, 0]) ?: DocumentUtil.getAtPath(extent(), [0, LABEL], '') return numPages(extent) } @@ -219,40 +254,41 @@ class Doc { } boolean isManuscript() { - instanceType() == 'Manuscript' || [['@id': 'https://id.kb.se/term/saogf/Manuskript'], ['@id': 'https://id.kb.se/term/saogf/Handskrifter']].intersect(genreForm()) + instanceType() == 'Manuscript' || [[(ID_KEY): 'https://id.kb.se/term/saogf/Manuskript'], [(ID_KEY): 'https://id.kb.se/term/saogf/Handskrifter']].intersect(genreForm()) } boolean isInSb17Bibliography() { - asList(record()['bibliography']).contains(['@id': 'https://libris.kb.se/library/SB17']) + asList(record()['bibliography']).contains([(ID_KEY): 'https://libris.kb.se/library/SB17']) } boolean isMaybeAggregate() { - hasPart() - || classification().any { it.inScheme?.code =~ /[Kk]ssb/ && it.code?.contains('(s)') } - || !contribution().any { it['@type'] == 'PrimaryContribution' && it['agent'] } + hasParts() + // (s) means "samlingsverk" + || classification().any { it[IN_SCHEME]?[CODE] =~ /[Kk]ssb/ && it[CODE]?.contains('(s)') } + || !contribution().any { it[TYPE_KEY] == PRIMARY && it[AGENT] } || hasRelationshipWithContribution() } boolean intendedForMarcPreAdolescent() { - intendedAudience().contains(['@id': 'https://id.kb.se/marc/PreAdolescent']) + intendedAudience().contains([(ID_KEY): 'https://id.kb.se/marc/PreAdolescent']) } - boolean hasPart() 
{ - workData['hasPart'] || instanceData['hasTitle'].findAll { it['@type'] == 'Title' }.any { - it.hasPart?.size() > 1 - || it.hasPart?.any { p -> asList(p.partName).size() > 1 - || asList(p.partNumber).size() > 1 } + boolean hasParts() { + workData[HAS_PART] || instanceData[HAS_TITLE].findAll { it[TYPE_KEY] == TITLE }.any { + it[HAS_PART]?.size() > 1 + || it[HAS_PART]?.any { p -> asList(p[PART_NAME]).size() > 1 + || asList(p[PART_NUMBER]).size() > 1 } // space+semicolon indicates an aggregate if it is not preceded by a slash // aggregate: Måsen ; Onkel Vanja ; Körsbärsträdgården // not aggregate: En visa för de döda / Patrick Dunne ; översättning: Hans Lindeberg - || [it.mainTitle, it.titleRemainder, it.subtitle].findAll().toString() =~ /(? asList(r['entity']).any { e -> - e.containsKey('contribution') + e.containsKey(CONTRIBUTION) } } } @@ -262,19 +298,19 @@ class Doc { } boolean isMarcFiction() { - genreForm().any { it['@id'] in MARC_FICTION } + genreForm().any { it[ID_KEY] in MARC_FICTION } } boolean isMarcNotFiction() { - genreForm().any { it['@id'] in MARC_NOT_FICTION } + genreForm().any { it[ID_KEY] in MARC_NOT_FICTION } } boolean isSaogfFiction() { - genreForm().any { it['@id'] == SAOGF_SKÖN || whelk.relations.isImpliedBy(SAOGF_SKÖN, it['@id'] ?: '') } + genreForm().any { whelk.relations.isImpliedBy(SAOGF_SKÖN, it[ID_KEY] ?: '') } } boolean isSabFiction() { - classification().any { it.inScheme?.code =~ /[Kk]ssb/ && it.code =~ /^(H|h|uH|ufH|ugH)/ } + classification().any { it[IN_SCHEME]?[CODE] =~ /[Kk]ssb/ && it[CODE] =~ /^(H|h|uH|ufH|ugH)/ } } boolean isNotFiction() { @@ -283,7 +319,7 @@ class Doc { } boolean isText() { - workData['@type'] == 'Text' + workData[TYPE_KEY] == 'Text' } boolean isAnonymousTranslation() { @@ -292,7 +328,7 @@ class Doc { boolean hasAnyRole(List relators) { contribution().any { - asList(it['role']).intersect(relators.collect { [(JsonLd.ID_KEY): it.iri] }) + asList(it[ROLE]).intersect(relators.collect { [(ID_KEY): it.iri] }) } } @@ 
-301,11 +337,11 @@ class Doc { } boolean isSabDrama() { - classification().any { it.code?.contains('Hc.02') || it.code?.contains('Hce.02') } + classification().any { it[CODE]?.contains('Hc.02') || it[CODE]?.contains('Hce.02') } } boolean isGfDrama() { - asList(genreForm()).any { it['@id'] in DRAMA_GF } + asList(genreForm()).any { it[ID_KEY] in DRAMA_GF } } boolean isNotRegularText() { @@ -320,24 +356,23 @@ class Doc { 'https://id.kb.se/term/barngf/Bliss%20%28symbolspr%C3%A5k%29' ] as Set - def saogfTactile = 'https://id.kb.se/term/saogf/Taktila%20verk' - - asList(workData['contentType']).contains(['@id': 'https://id.kb.se/term/rda/TactileText']) - || asList(instanceData?.carrierType).any { it['@id'] in ['https://id.kb.se/marc/Braille', 'https://id.kb.se/marc/TacMaterialType-b'] } - || genreForm().any {it['@id'] in barnGfs || it['@id'] == saogfTactile || whelk.relations.isImpliedBy(saogfTactile, it['@id']) } + asList(workData[CONTENT_TYPE]).contains([(ID_KEY): 'https://id.kb.se/term/rda/TactileText']) + || asList(instanceData?.carrierType).any { it[ID_KEY] in ['https://id.kb.se/marc/Braille', 'https://id.kb.se/marc/TacMaterialType-b'] } + || genreForm().any {it[ID_KEY] in barnGfs + || whelk.relations.isImpliedBy('https://id.kb.se/term/saogf/Taktila%20verk', it[ID_KEY]) } } boolean isThesis() { - genreForm().any { it == ['@id': 'https://id.kb.se/marc/Thesis'] } + genreForm().any { it == [(ID_KEY): 'https://id.kb.se/marc/Thesis'] } } boolean hasDistinguishingEdition() { - (instanceData?.editionStatement ?: '').toString().toLowerCase().contains("förk") + (instanceData?[EDITION_STATEMENT] ?: '').toString().toLowerCase().contains("förk") } void addComparisonProps() { if (hasDistinguishingEdition()) { - workData['_editionStatement'] = instanceData['editionStatement'] + workData['_editionStatement'] = instanceData[EDITION_STATEMENT] } workData['_numPages'] = numPages() } @@ -349,6 +384,6 @@ class Doc { void sortContribution() { // PrimaryContribution first - 
contribution()?.sort {it['@type'] != 'PrimaryContribution' } + contribution()?.sort { it[TYPE_KEY] != PRIMARY } } } \ No newline at end of file diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index 664abff825..3e7dac45ac 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -6,9 +6,53 @@ import whelk.util.DocumentUtil import whelk.util.Unicode import static se.kb.libris.mergeworks.compare.IntendedAudience.preferredComparisonOrder +import static whelk.JsonLd.ID_KEY +import static whelk.JsonLd.TYPE_KEY class Util { - static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName'] + static final String CLASSIFICATION = 'classification' + static final String IN_SCHEME = 'inScheme' + static final String VERSION = 'version' + + static final String PRIMARY = 'PrimaryContribution' + static final String CONTRIBUTION = 'contribution' + static final String AGENT = 'agent' + static final String GIVEN_NAME = 'givenName' + static final String FAMILY_NAME = 'familyName' + static final String NAME = 'name' + static final String ROLE = 'role' + static final String LIFE_SPAN = 'lifeSpan' + + static final String TITLE = 'Title' + static final String HAS_TITLE = 'hasTitle' + static final String MAIN_TITLE = 'mainTitle' + static final String SUBTITLE = 'subtitle' + static final String TITLE_REMAINDER = 'titleRemainder' + static final String PART_NUMBER = 'partNumber' + static final String PART_NAME = 'partName' + static final String FLAT_TITLE = 'flatTitle' + static final String SOURCE = 'source' + + static final String CODE = 'code' + static final String LABEL = 'label' + static final String HAS_PART = 'hasPart' + + static final String TRANSLATION_OF = 'translationOf' + static final String GENRE_FORM = 'genreForm' + static final String 
INTENDED_AUDIENCE = 'intendedAudience' + static final String CONTENT_TYPE = 'contentType' + + static final String PUBLICATION = 'publication' + static final String EXTENT = 'extent' + static final String REPRODUCTION_OF = 'reproductionOf' + static final String IDENTIFIED_BY = 'identifiedBy' + static final String EDITION_STATEMENT = 'editionStatement' + static final String RESP_STATEMENT = 'responsibilityStatement' + static final String PHYS_NOTE = 'physicalDetailsNote' + + static final String ENCODING_LEVEL = 'encodingLevel' + + static def titleComponents = [MAIN_TITLE, TITLE_REMAINDER, SUBTITLE, HAS_PART, PART_NUMBER, PART_NAME] static enum Relator { TRANSLATOR('https://id.kb.se/relator/translator'), @@ -79,15 +123,15 @@ class Util { } static boolean hasGenericTitle(List hasTitle) { - hasTitle.any { it['mainTitle'] && normalize((String) it['mainTitle']) in GENERIC_TITLES } + hasTitle.any { it[MAIN_TITLE] && normalize((String) it[MAIN_TITLE]) in GENERIC_TITLES } } static List dropGenericSubTitles(List hasTitle) { hasTitle.collect { def copy = new TreeMap(it) - if (copy['subtitle'] || copy['titleRemainder']) { + if (copy[SUBTITLE] || copy[TITLE_REMAINDER]) { DocumentUtil.traverse(copy) { value, path -> - if (('subtitle' in path || 'titleRemainder' in path) && value instanceof String) { + if ((SUBTITLE in path || TITLE_REMAINDER in path) && value instanceof String) { if (genericSubtitle(value)) { new DocumentUtil.Remove() } else { @@ -109,9 +153,9 @@ class Util { static List flatTitles(List hasTitle) { dropGenericSubTitles(hasTitle).collect { def title = new TreeMap<>() - title['flatTitle'] = normalize(DisplayDoc.flatten(it, titleComponents)) - if (it['@type']) { - title['@type'] = it['@type'] + title[FLAT_TITLE] = normalize(DisplayDoc.flatten(it, titleComponents)) + if (it[TYPE_KEY]) { + title[TYPE_KEY] = it[TYPE_KEY] } title @@ -133,32 +177,7 @@ class Util { static List getFlatTitle(List hasTitle) { flatTitles(hasTitle) .grep(isTitle) - .collect { it['flatTitle'] } 
- } - - static String chipString(def thing, Whelk whelk) { - if (thing instanceof Integer) { - return thing - } - - def chips = whelk.jsonld.toChip(thing) - if (chips.size() < 2) { - chips = thing - } - if (chips instanceof List) { - return chips.collect { valuesString(it) }.sort().join('
') - } - return valuesString(chips) - } - - private static String valuesString(def thing) { - if (thing instanceof List) { - return thing.collect { valuesString(it) }.join(' • ') - } - if (thing instanceof Map) { - return thing.findAll { k, v -> k != '@type' }.values().collect { valuesString(it) }.join(' • ') - } - return thing.toString() + .collect { it[FLAT_TITLE] } } // (docs on some of these levels are normally filtered out before we reach here) @@ -177,19 +196,20 @@ class Util { static void appendTitlePartsToMainTitle(Map title, String partNumber, String partName = null) { def part = [partNumber, partName].grep().join(', ') if (part) { - title['mainTitle'] += "${title['mainTitle'][-1] == '.' ? '' : '.'} $part" + title[MAIN_TITLE] += "${title[MAIN_TITLE][-1] == '.' ? '' : '.'} $part" } } static String findTitlePart(List title, String prop) { // partName/partNumber is usually found in hasPart but not always - def partNumber = title.findResult { Map t -> t[prop] ?: t['hasPart'].findResult { it[prop] } } - return asList(partNumber).find() + def titlePart = title.findResult { Map t -> t[prop] ?: t[HAS_PART].findResult { it[prop] } } + return asList(titlePart).find() } // Return the most common title for the best encodingLevel static def bestTitle(Collection docs) { - def linkedWorkTitle = docs.findResult { it.workIri() ? it.workData['hasTitle'] : null } + // Always keep title on existing linked work as is + def linkedWorkTitle = docs.findResult { it.workIri() ? 
it.workData[HAS_TITLE] : null } if (linkedWorkTitle) { return linkedWorkTitle } @@ -197,12 +217,14 @@ class Util { def bestInstanceTitle = mostCommonHighestEncodingLevel(docs, this.&mostCommonInstanceTitle) def bestWorkTitle = mostCommonHighestEncodingLevel(docs, this.&mostCommonWorkTitle) - def partNumber = findTitlePart(bestInstanceTitle, 'partNumber') - def partName = findTitlePart(bestInstanceTitle, 'partName') + def partNumber = findTitlePart(bestInstanceTitle, PART_NUMBER) + def partName = findTitlePart(bestInstanceTitle, PART_NAME) - def workTitleShape = { it.subMap(['@type', 'mainTitle', 'subtitle', 'titleRemainder', 'source', 'marc:nonfilingChars']) } + def workTitleShape = { it.subMap([TYPE_KEY, MAIN_TITLE, SUBTITLE, TITLE_REMAINDER, SOURCE, 'marc:nonfilingChars']) } + // Prefer existing work title over instance titles if (bestWorkTitle) { + // Include part number in work title if present in instance titles. return bestWorkTitle.each { appendTitlePartsToMainTitle(it, partNumber) } .collect(workTitleShape) } @@ -236,7 +258,7 @@ class Util { static def mostCommonOriginalTitle(Collection docs) { return mostCommonWorkTitle(docs) { Doc d -> - d.translationOf().findResult { it['hasTitle'] }?.findAll(isTitle) + d.translationOf().findResult { it[HAS_TITLE] }?.findAll(isTitle) } } @@ -254,7 +276,7 @@ class Util { static def mostCommonInstanceTitle(Collection docs) { def addSource = { t, d -> - return t.collect { it.plus(['source': [d.instanceData.subMap('@id')]]) } + return t.collect { it.plus([(SOURCE): [d.instanceData.subMap(ID_KEY)]]) } } def instanceTitles = docs.collect { it.instanceTitle().findAll(isTitle) } @@ -263,6 +285,7 @@ class Util { if (instanceTitles.grep()) { def instanceTitleToDoc = [instanceTitles, docs].transpose().collectEntries() def best = mostCommon(instanceTitles.grep()) + // Source is picked arbitrary among the instances having the most common title return addSource(best, instanceTitleToDoc[best]) } @@ -277,14 +300,15 @@ class Util { 
.first() } - static def isTitle = { it.'@type' == 'Title' } + static def isTitle = { it[TYPE_KEY] == TITLE } static String name(Map agent) { - (agent.givenName && agent.familyName) - ? normalize("${agent.givenName} ${agent.familyName}") - : agent.name ? normalize("${agent.name}") : null + (agent[GIVEN_NAME] && agent[FAMILY_NAME]) + ? normalize("${agent[GIVEN_NAME]} ${agent[FAMILY_NAME]}") + : agent[NAME] ? normalize("${agent[NAME]}") : null } + // Cluster records that seem to describe the same work static Collection> workClusters(Collection docs, WorkComparator c) { docs.each { if (it.instanceData) { diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy index 121b83cb1b..1fba97296d 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy @@ -4,6 +4,7 @@ import datatool.util.DocumentComparator import se.kb.libris.mergeworks.compare.* import static Util.bestTitle +import static se.kb.libris.mergeworks.Util.HAS_TITLE class WorkComparator { Set fields @@ -22,6 +23,7 @@ class WorkComparator { '@id' : new Id() ] + // These properties are not considered when comparing works, nor are they included when merging static Set ignore = ['closeMatch'] static FieldHandler DEFAULT = new Default() @@ -61,19 +63,20 @@ class WorkComparator { } } - if (!result['hasTitle']) { + // Usually none of the works that are merged have a title already, so hasTitle needs to be added separately. + if (!result[HAS_TITLE]) { def bestTitle = bestTitle(docs) if (bestTitle) { - result['hasTitle'] = bestTitle + result[HAS_TITLE] = bestTitle } } + // There is only room for one Dewey code in the classification property. Move any additional to additionalClassificationDdc. Classification.moveAdditionalDewey(result, docs) return result } - // TODO: preserve order? e.g. 
subject private Object mergeField(String field, FieldHandler h, Collection docs) { Object value = docs.first().workData.get(field) def rest = docs.drop(1) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/ContentType.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/ContentType.groovy index 4d0d3e498b..158ba13664 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/ContentType.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/ContentType.groovy @@ -1,12 +1,13 @@ package se.kb.libris.mergeworks.compare import static se.kb.libris.mergeworks.Util.asList +import static whelk.JsonLd.ID_KEY class ContentType extends StuffSet { private static def allowedValues = ['https://id.kb.se/term/rda/StillImage', 'https://id.kb.se/term/rda/Text'] @Override boolean isCompatible(Object a, Object b) { - asList(a).every { it['@id'] in allowedValues } && asList(b).every { it['@id'] in allowedValues } + asList(a).every { it[ID_KEY] in allowedValues } && asList(b).every { it[ID_KEY] in allowedValues } } } diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/GenreForm.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/GenreForm.groovy index 9e30a2d2bd..205e613a71 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/GenreForm.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/GenreForm.groovy @@ -1,29 +1,27 @@ package se.kb.libris.mergeworks.compare -import datatool.util.DocumentComparator +import static whelk.JsonLd.ID_KEY -//FIXME class GenreForm extends StuffSet { - private static final DocumentComparator c = new DocumentComparator() - - // Terms that will be merged (values precede keys) - private static def norm = [ - (['@id': 'https://id.kb.se/marc/NotFictionNotFurtherSpecified']): [ - ['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified'], - ['@id': 'https://id.kb.se/marc/Autobiography'], - ['@id': 
'https://id.kb.se/marc/Biography'] + // When merging, the values in this map are preferred over the keys. + // E.g. 'https://id.kb.se/marc/Novel' overwrites 'https://id.kb.se/marc/FictionNotFurtherSpecified' + private static def precedenceRules = [ + ([(ID_KEY): 'https://id.kb.se/marc/NotFictionNotFurtherSpecified']): [ + [(ID_KEY): 'https://id.kb.se/marc/FictionNotFurtherSpecified'], + [(ID_KEY): 'https://id.kb.se/marc/Autobiography'], + [(ID_KEY): 'https://id.kb.se/marc/Biography'] ], - (['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified']) : [ - ['@id': 'https://id.kb.se/marc/Poetry'], - ['@id': 'https://id.kb.se/marc/Novel'] + ([(ID_KEY): 'https://id.kb.se/marc/FictionNotFurtherSpecified']) : [ + [(ID_KEY): 'https://id.kb.se/marc/Poetry'], + [(ID_KEY): 'https://id.kb.se/marc/Novel'] ], ] @Override boolean isCompatible(Object a, Object b) { def lattLast = { - it['@id'] == 'https://id.kb.se/term/saogf/L%C3%A4ttl%C3%A4st' - || it['@id'] == 'https://id.kb.se/term/barngf/L%C3%A4ttl%C3%A4sta%20b%C3%B6cker' + it[ID_KEY] == 'https://id.kb.se/term/saogf/L%C3%A4ttl%C3%A4st' + || it[ID_KEY] == 'https://id.kb.se/term/barngf/L%C3%A4ttl%C3%A4sta%20b%C3%B6cker' || it['prefLabel'] == 'Lättläst' } a.any(lattLast) == b.any(lattLast) @@ -32,15 +30,15 @@ class GenreForm extends StuffSet { @Override Object merge(Object a, Object b) { return mergeCompatibleElements(super.merge(a, b)) { gf1, gf2 -> - if (n(gf1, gf2)) { - gf2 - } else if (n(gf2, gf1)) { + if (precedes(gf1, gf2)) { gf1 + } else if (precedes(gf2, gf1)) { + gf2 } } } - boolean n(a, b) { - norm[a]?.any { it == b || n(it, b) } + boolean precedes(a, b) { + precedenceRules[b]?.any { it == a || precedes(a, it) } } } \ No newline at end of file diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy index ce2569f866..d3ae1b1ce8 100644 --- 
a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy @@ -14,6 +14,8 @@ class IntendedAudience extends StuffSet { || !(asList(a) + asList(b)).findResults { it == ADULT }.containsAll([true, false]) } + // Sort docs so that those with marc:Adult, marc:General or nothing in intendedAudience come first, + // since we prefer marc:Adult rather than e.g. marc:Juvenile clustered with marc:General/empty. static void preferredComparisonOrder(Collection docs) { docs.sort { Doc d -> d.intendedAudience().with { diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/TranslationOf.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/TranslationOf.groovy index dd0dc578d7..e762805453 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/TranslationOf.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/TranslationOf.groovy @@ -5,13 +5,17 @@ import se.kb.libris.mergeworks.Doc import se.kb.libris.mergeworks.Util import org.apache.commons.lang3.NotImplementedException +import static se.kb.libris.mergeworks.Util.HAS_TITLE +import static se.kb.libris.mergeworks.Util.TRANSLATION_OF +import static whelk.JsonLd.TYPE_KEY + class TranslationOf implements ValuePicker { DocumentComparator c = new DocumentComparator() @Override boolean isCompatible(Object a, Object b) { // @type is sometimes Work, sometimes Text. Should not matter for comparison - // We assume that there are never more than one object in translationOf + // We assume that there is never more than one object in translationOf a = Util.asList(a)[0] b = Util.asList(b)[0] a && b && c.isEqual(noTypeNoTitle(a), noTypeNoTitle(b)) && noTitleOrSameTitle(a, b) @@ -24,23 +28,26 @@ class TranslationOf implements ValuePicker { @Override Object pick(Collection values) { - // TODO: which title to pick when matched with already existing linked work? 
- def translationOf = values.first().workData['translationOf'] + def linkedWorkTranslationOf = values.findResult { it.workIri() ? it.translationOf() : null } + if (linkedWorkTranslationOf) { + return linkedWorkTranslationOf + } + def translationOf = values.first().workData[TRANSLATION_OF] def title = Util.bestOriginalTitle(values) if (title) { - Util.asList(translationOf)[0]['hasTitle'] = title + Util.asList(translationOf)[0][HAS_TITLE] = title } return translationOf } Map noTypeNoTitle(Map m) { - m.findAll { k, v -> !(k in ['@type', 'hasTitle']) } + m.findAll { k, v -> !(k in [TYPE_KEY, HAS_TITLE]) } } boolean noTitleOrSameTitle(Map a, Map b) { - !a['hasTitle'] - || !b['hasTitle'] - || !Util.getFlatTitle(a['hasTitle']).intersect(Util.getFlatTitle(b['hasTitle'])).isEmpty() + !a[HAS_TITLE] + || !b[HAS_TITLE] + || !Util.getFlatTitle(a[HAS_TITLE]).intersect(Util.getFlatTitle(b[HAS_TITLE])).isEmpty() } } From 546d53e0cf6d0f7e319b10b3e46982cb96d0ccc5 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 29 Nov 2023 14:19:33 +0100 Subject: [PATCH 2/7] Add unit tests --- librisworks/build.gradle | 2 +- .../se/kb/libris/mergeworks/DocSpec.groovy | 78 ++++++ .../se/kb/libris/mergeworks/UtilSpec.groovy | 224 ++++++++++++++++++ .../mergeworks/compare/ContentTypeSpec.groovy | 19 ++ .../mergeworks/compare/GenreFormSpec.groovy | 38 +++ .../compare/IntendedAudienceSpec.groovy | 45 ++++ .../compare/TranslationOfSpec.groovy | 34 +++ 7 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 librisworks/src/test/groovy/se/kb/libris/mergeworks/DocSpec.groovy create mode 100644 librisworks/src/test/groovy/se/kb/libris/mergeworks/UtilSpec.groovy create mode 100644 librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ContentTypeSpec.groovy create mode 100644 librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/GenreFormSpec.groovy create mode 100644 librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/IntendedAudienceSpec.groovy create mode 100644
librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/TranslationOfSpec.groovy diff --git a/librisworks/build.gradle b/librisworks/build.gradle index b01041315d..4d92a5b15c 100644 --- a/librisworks/build.gradle +++ b/librisworks/build.gradle @@ -16,8 +16,8 @@ repositories { dependencies { implementation project(':whelktool') + implementation project(':whelk-core') compileOnly "org.codehaus.groovy:groovy:${groovyVersion}" - compileOnly project(':whelk-core') scriptsCompileOnly sourceSets.main.output scriptsCompileOnly project(':whelk-core') testImplementation "org.spockframework:spock-core:${spockVersion}" diff --git a/librisworks/src/test/groovy/se/kb/libris/mergeworks/DocSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/DocSpec.groovy new file mode 100644 index 0000000000..6a0c178ac8 --- /dev/null +++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/DocSpec.groovy @@ -0,0 +1,78 @@ +package se.kb.libris.mergeworks + +import whelk.Document +import whelk.Whelk + +import spock.lang.Specification + +class DocSpec extends Specification { + static def whelk = null + static { + try { + whelk = Whelk.createLoadedSearchWhelk() + } catch (Exception e) { + System.err.println("Unable to instantiate whelk: $e") + } + } + + def "work has multiple parts"() { + given: + Map data = ['@graph': [[:], mainEntity]] + Doc doc = new Doc(whelk, new Document(data)) + + expect: + doc.hasParts() == result + + where: + mainEntity || result + ['instanceOf': ['hasPart': [['@type': 'Text']]]] || true + ['hasTitle': [['@type': 'Title', 'hasPart': [[:], [:]]]], 'instanceOf': ['@type': 'Text']] || true + ['hasTitle': [['@type': 'Title', 'hasPart': [[:]]]], 'instanceOf': ['@type': 'Text']] || false + ['hasTitle': [['@type': 'Title', 'hasPart': [['partNumber': ['1', '2']]]]], 'instanceOf': ['@type': 'Text']] || true + ['hasTitle': [['@type': 'Title', 'hasPart': [['partNumber': ['1']]]]], 'instanceOf': ['@type': 'Text']] || false + ['hasTitle': [['@type': 'Title', 
'mainTitle': 'x ; y']], 'instanceOf': ['@type': 'Text']] || true + ['hasTitle': [['@type': 'Title', 'mainTitle': 'x / y ; z']], 'instanceOf': ['@type': 'Text']] || false + ['hasTitle': [['@type': 'Title', 'mainTitle': 'x ;', 'subtitle': 'y']], 'instanceOf': ['@type': 'Text']] || true + ['hasTitle': [['@type': 'Title', 'mainTitle': 'x / y ;', 'titleRemainder': 'z']], 'instanceOf': ['@type': 'Text']] || false + } + + def "work is not regular text"() { + given: + Map data = ['@graph': [[:], mainEntity]] + Doc doc = new Doc(whelk, new Document(data)) + + expect: + doc.isNotRegularText() == result + + where: + mainEntity || result + ['instanceOf': ['@type': 'Text']] || false + ['instanceOf': ['@type': 'Text', 'contentType': [['@id': 'https://id.kb.se/term/rda/TactileText']]]] || true + ['carrierType': [['@id': 'https://id.kb.se/marc/Braille']], 'instanceOf': ['@type': 'Text']] || true + ['instanceOf': ['@type': 'Text', 'genreForm': [['@id': 'https://id.kb.se/term/barngf/Mekaniska%20b%C3%B6cker']]]] || true + ['instanceOf': ['@type': 'Text', 'genreForm': [['@id': 'https://id.kb.se/term/saogf/Taktila%20verk']]]] || true + ['instanceOf': ['@type': 'Text', 'genreForm': [['@id': 'https://id.kb.se/term/saogf/Punktskriftsb%C3%B6cker']]]] || true + } + + def "parse extent"() { + expect: + Doc.numPages(extent) == pages + + where: + extent | pages + "" | -1 + "114, [1] s." | 114 + "[4], 105, [2] s." | 105 + "21 s., ([4], 21, [5] s.)" | 21 + "[108] s., (Ca 110 s.)" | 110 + "80 s., (80, [3] s., [8] pl.-bl. i färg)" | 80 + "622, [8] s." | 622 + "[2] s., s. 635-919, [7] s." | 919 // ?? + "[1], iv, 295 s." | 295 + "3 vol." | -1 + //"249, (1) s." | 249 + //"[8] s., s. 11-370" | 370 + //[12] s., s. 
15-256 | 256 + "25 onumrerade sidor" | 25 + } +} diff --git a/librisworks/src/test/groovy/se/kb/libris/mergeworks/UtilSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/UtilSpec.groovy new file mode 100644 index 0000000000..8474b0e441 --- /dev/null +++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/UtilSpec.groovy @@ -0,0 +1,224 @@ +package se.kb.libris.mergeworks + +import spock.lang.Specification +import whelk.Document +import whelk.Whelk + +class UtilSpec extends Specification { + static def whelk = null + static { + try { + whelk = Whelk.createLoadedSearchWhelk() + } catch (Exception e) { + System.err.println("Unable to instantiate whelk: $e") + } + } + + def "generic mainTitle"() { + expect: + Util.hasGenericTitle([['mainTitle': mainTitle]]) == result + + where: + mainTitle || result + 'dikter' || true + 'Samlade verk' || true + 'Tusen och en natt' || true + 'en lite ovanligare titel' || false + } + + def "drop generic subtitle"() { + given: + def hasTitle = [['mainTitle': 'x', 'subtitle': subtitle]] + def dropped = Util.dropGenericSubTitles(hasTitle) + + expect: + dropped[0]['subtitle'] == result && hasTitle == [['mainTitle': 'x', 'subtitle': subtitle]] + + where: + subtitle || result + 'en äktenskapshistoria' || null + 'avhandlingar' || null + 'En Roland Hassel-thriller' || null + 'En lite ovanligare titel : Roman' || 'En lite ovanligare titel' + 'En lite ovanligare titel' || 'En lite ovanligare titel' + } + + def "flatten titles"() { + expect: + Util.flatTitles(title)[0]['flatTitle'] == result + + where: + title || result + [['mainTitle': 'x']] || 'x' + [['mainTitle': 'x', 'subtitle': 'y']] || 'x y' + [['mainTitle': 'x', 'subtitle': 'roman']] || 'x' + [['mainTitle': 'x', 'hasPart': ['partNumber': '1', 'partName': 'y']]] || 'x 1 y' + [['mainTitle': 'x-y.', 'hasPart': ['partNumber': '[1]'], 'subtitle': 'é ']] || 'x y e 1' + } + + def "find title parts"() { + expect: + Util.findTitlePart(title, 'partNumber') == partNumber + 
Util.findTitlePart(title, 'partName') == partName + + where: + title || partNumber || partName + [['hasPart': [['partNumber': '1', 'partName': ['x']]]]] || '1' || 'x' + [['hasPart': [['partNumber': '1']]]] || '1' || null + [['hasPart': [['partName': 'x']]]] || null || 'x' + [['partNumber': '1', 'partName': 'x']] || '1' || 'x' + [['mainTitle': 'x']] || null || null + } + + def "append title parts to main title"() { + given: + Util.appendTitlePartsToMainTitle(title, partNumber, partName) + + expect: + title == result + + where: + title || partNumber || partName || result + ['mainTitle': 'x'] || '1' || 'y' || ['mainTitle': 'x. 1, y'] + ['mainTitle': 'x.'] || '1' || null || ['mainTitle': 'x. 1'] + ['mainTitle': 'x.'] || null || 'y' || ['mainTitle': 'x. y'] + ['mainTitle': 'x.'] || null || null || ['mainTitle': 'x.'] + } + + def "pick best work title"() { + def fl = 'marc:FullLevel' + def ml = 'marc:MinimalLevel' + + def createDoc = { tuple, i -> + def (instanceTitle, encodingLevel) = tuple + + def data = [ + '@graph': [ + [ + 'encodingLevel': encodingLevel + ], + [ + '@id' : "https://libris.kb.se/x$i", + 'hasTitle' : [['@type': 'Title', 'mainTitle': instanceTitle]], + 'instanceOf': ['@type': 'Text'] + ] + ] + ] + return new Doc(whelk, new Document(data)) + } + + def collectDocs = { instanceTitles, encodingLevels -> + [instanceTitles, encodingLevels].transpose().withIndex().collect(createDoc) + } + + // Same encoding level, no work titles --> pick most common instance title + when: + def instanceTitles1 = ['T', 'T.', 't', 't'] + def encodingLevels1 = [fl, fl, fl, fl] + def docs1 = collectDocs([instanceTitles1, encodingLevels1]) + + then: + Util.bestTitle(docs1) == [['@type': 'Title', 'mainTitle': 't', 'source': [['@id': 'https://libris.kb.se/x3']]]] + + // Different encoding levels, no work titles --> pick most common instance title among docs with highest level + when: + def instanceTitles2 = ['T', 'T', 't', 't', 't'] + def encodingLevels2 = [fl, fl, fl, ml, ml] + def 
docs2 = collectDocs([instanceTitles2, encodingLevels2]) + + then: + Util.bestTitle(docs2) == [['@type': 'Title', 'mainTitle': 'T', 'source': [['@id': 'https://libris.kb.se/x1']]]] + + // Pick existing work title over instance titles + when: + def instanceTitles3 = ['t', 't', 't', 't'] + def encodingLevels3 = [ml, fl, ml, fl] + def docs3 = collectDocs([instanceTitles3, encodingLevels3]) + docs3[0].workData['hasTitle'] = [['@type': 'Title', 'mainTitle': 'T']] + + then: + Util.bestTitle(docs3) == [['@type': 'Title', 'mainTitle': 'T']] + + // Pick from linkable work over local works + when: + def instanceTitles4 = ['t', 't', 't'] + def encodingLevels4 = [null, fl, fl] + def docs4 = collectDocs([instanceTitles4, encodingLevels4]) + docs4[0].workData['hasTitle'] = [['@type': 'Title', 'mainTitle': 'T']] + docs4[0].workData['@id'] = 'https://libris.kb.se/y' + docs4[1].workData['hasTitle'] = [['@type': 'Title', 'mainTitle': 't.']] + docs4[2].workData['hasTitle'] = [['@type': 'Title', 'mainTitle': 't.']] + + then: + Util.bestTitle(docs4) == [['@type': 'Title', 'mainTitle': 'T']] + + // Ignore generic subtitles + when: + def instanceTitles5 = ['t', 't', 't', 'T', 'T'] + def encodingLevels5 = [fl, fl, fl, fl, fl] + def docs5 = collectDocs([instanceTitles5, encodingLevels5]) + docs5[0].instanceTitle()[0]['subtitle'] = 'roman' + docs5[1].instanceTitle()[0]['subtitle'] = 'en roman' + + then: + Util.bestTitle(docs5) == [['@type': 'Title', 'mainTitle': 't', 'source': [['@id': 'https://libris.kb.se/x2']]]] + + // Don't ignore any subtitle + when: + def instanceTitles6 = ['t', 'T', 'T'] + def encodingLevels6 = [fl, fl, fl] + def docs6 = collectDocs([instanceTitles6, encodingLevels6]).each { + it.instanceTitle()[0]['subtitle'] = 'en lite ovanligare titel' + } + + then: + Util.bestTitle(docs6) == [['@type': 'Title', 'mainTitle': 'T', 'subtitle': 'en lite ovanligare titel', 'source': [['@id': 'https://libris.kb.se/x2']]]] + + // Append parts to mainTitle + when: + def instanceTitles7 = 
['T', 'T', 'T'] + def encodingLevels7 = [fl, fl, fl] + def docs7 = collectDocs([instanceTitles7, encodingLevels7]).each { + it.instanceTitle()[0]['hasPart'] = [['partNumber': '1', 'partName': 'Delens titel']] + } + + then: + Util.bestTitle(docs7) == [['@type': 'Title', 'mainTitle': 'T. 1, Delens titel', 'source': [['@id': 'https://libris.kb.se/x2']]]] + + // Append only partNumber to existing work mainTitle + when: + def instanceTitles8 = ['t', 't', 't'] + def encodingLevels8 = [fl, fl, fl] + def docs8 = collectDocs([instanceTitles8, encodingLevels8]).each { + it.instanceTitle()[0]['hasPart'] = [['partNumber': '1', 'partName': 'Delens titel']] + } + docs8[0].workData['hasTitle'] = [['@type': 'Title', 'mainTitle': 'T.']] + + then: + Util.bestTitle(docs8) == [['@type': 'Title', 'mainTitle': 'T. 1']] + } + + def "pick best original title"() { + given: + def fl = 'marc:FullLevel' + def ml = 'marc:MinimalLevel' + def origTitles = ['t', 't', 'T', 'T', 'T.', 'T.'] + def encodingLevels = [fl, ml, fl, fl, ml, fl] + def docs = [origTitles, encodingLevels].transpose().collect { origTitle, encodingLevel -> + def data = [ + '@graph': [ + [ + 'encodingLevel': encodingLevel + ], + [ + 'instanceOf': ['translationOf': ['hasTitle': [['@type': 'Title', 'mainTitle': origTitle]]]] + ] + ] + ] + return new Doc(whelk, new Document(data)) + } + + expect: + Util.bestOriginalTitle(docs) == [['@type': 'Title', 'mainTitle': 'T']] + } +} diff --git a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ContentTypeSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ContentTypeSpec.groovy new file mode 100644 index 0000000000..c32ad4b90d --- /dev/null +++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ContentTypeSpec.groovy @@ -0,0 +1,19 @@ +package se.kb.libris.mergeworks.compare + +import spock.lang.Specification + +class ContentTypeSpec extends Specification { + def "is compatible"() { + expect: + new ContentType().isCompatible(a, b) == result + + 
where: + a || b || result + [['@id': 'https://id.kb.se/term/rda/Text']] || [['@id': 'https://id.kb.se/term/rda/Text']] || true + [['@id': 'https://id.kb.se/term/rda/StillImage']] || [['@id': 'https://id.kb.se/term/rda/Text']] || true + [] || [['@id': 'https://id.kb.se/term/rda/Text']] || true + [['@id': 'https://id.kb.se/term/rda/X']] || [['@id': 'https://id.kb.se/term/rda/Text']] || false + [['@id': 'https://id.kb.se/term/rda/X']] || [] || false + [['label': 'x']] || [['@id': 'https://id.kb.se/term/rda/Text']] || false + } +} diff --git a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/GenreFormSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/GenreFormSpec.groovy new file mode 100644 index 0000000000..4808c10b31 --- /dev/null +++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/GenreFormSpec.groovy @@ -0,0 +1,38 @@ +package se.kb.libris.mergeworks.compare + +import spock.lang.Specification + +class GenreFormSpec extends Specification { + private static def x = ['@id': 'https://id.kb.se/x'] + private static def y = ['@id': 'https://id.kb.se/y'] + private static def z = ['prefLabel': 'z'] + private static def saoGfLatt = ['@id': 'https://id.kb.se/term/saogf/L%C3%A4ttl%C3%A4st'] + private static def marcFiction = ['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified'] + private static def marcNotFiction = ['@id': 'https://id.kb.se/marc/NotFictionNotFurtherSpecified'] + private static def marcNovel = ['@id': 'https://id.kb.se/marc/Novel'] + + def "is compatible"() { + expect: + new GenreForm().isCompatible(a, b) == result + + where: + a || b || result + [x] || [y] || true + [x] || [x, y, z] || true + [x] || [y, saoGfLatt] || false + [x, saoGfLatt] || [y, saoGfLatt] || true + } + + def "merge"() { + expect: + new GenreForm().merge(a, b) as Set == result as Set + + where: + a || b || result + [x] || [y] || [x, y] + [x] || [x, y, z] || [x, y, z] + [x, saoGfLatt] || [y, saoGfLatt] || [x, y, saoGfLatt] + [x, 
marcFiction] || [marcNotFiction] || [x, marcFiction] + [x, marcNovel] || [marcNotFiction, y] || [x, y, marcNovel] + } +} diff --git a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/IntendedAudienceSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/IntendedAudienceSpec.groovy new file mode 100644 index 0000000000..aa7ac5eac1 --- /dev/null +++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/IntendedAudienceSpec.groovy @@ -0,0 +1,45 @@ +package se.kb.libris.mergeworks.compare + +import se.kb.libris.mergeworks.Doc +import spock.lang.Specification +import whelk.Document +import whelk.Whelk + +class IntendedAudienceSpec extends Specification { + private static def general = ['@id': 'https://id.kb.se/marc/General'] + private static def adult = ['@id': 'https://id.kb.se/marc/Adult'] + private static def juvenile = ['@id': 'https://id.kb.se/marc/Juvenile'] + private static def blank = ['label': 'x'] + + def "is compatible"() { + expect: + new IntendedAudience().isCompatible(a, b) == result + + where: + a || b || result + [juvenile] || [juvenile] || true + [juvenile] || [general] || true + [juvenile] || [] || true + [juvenile] || [blank, general, juvenile] || true + [juvenile] || [adult] || false + [adult] || [adult] || true + [adult] || [general] || true + [adult] || [juvenile, general] || false + [adult] || [blank] || false + [adult] || [] || true + } + + def "preferred comparison order"() { + given: + Whelk whelk = Whelk.createLoadedSearchWhelk() + def intendedAudience = [[juvenile], [adult], [juvenile], [], [adult], [general]] + List docs = intendedAudience.collect { + def data = ['@graph': [[], ['instanceOf': ['intendedAudience': it]]]] + return new Doc(whelk, new Document(data)) + } + IntendedAudience.preferredComparisonOrder(docs) + + expect: + docs*.intendedAudience() == [[general], [adult], [], [adult], [juvenile], [juvenile]] + } +} \ No newline at end of file diff --git 
a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/TranslationOfSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/TranslationOfSpec.groovy new file mode 100644 index 0000000000..82aa45a864 --- /dev/null +++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/TranslationOfSpec.groovy @@ -0,0 +1,34 @@ +package se.kb.libris.mergeworks.compare + +import se.kb.libris.mergeworks.Doc +import spock.lang.Specification +import whelk.Document + +class TranslationOfSpec extends Specification { + def "is compatible"() { + given: + def a = [:] + def b = [:] + if (typeA) a['@type'] = typeA + if (typeB) b['@type'] = typeB + if (langA) a['language'] = [['@id': 'https://id.kb.se/language/' + langA]] + if (langB) b['language'] = [['@id': 'https://id.kb.se/language/' + langB]] + if (titleA) a['hasTitle'] = [['@type': 'Title', 'mainTitle': titleA]] + if (titleB) b['hasTitle'] = [['@type': 'Title', 'mainTitle': titleB]] + + expect: + new TranslationOf().isCompatible(a, b) == result + + where: + typeA || typeB || langA || langB || titleA || titleB || result + 'Work' || 'Work' || 'swe' || 'swe' || 'x' || 'x' || true + 'Work' || 'Text' || 'swe' || 'swe' || 'x' || 'x' || true + 'Work' || 'Work' || 'swe' || 'swe' || null || 'x' || true + 'Work' || 'Work' || 'swe' || 'swe' || null || null || true + 'Work' || 'Work' || 'swe' || 'fre' || null || null || false + 'Work' || 'Work' || 'swe' || null || null || null || false + 'Work' || 'Work' || null || null || null || null || true + 'Work' || 'Work' || 'swe' || 'swe' || 'x' || 'X.' 
|| true + 'Work' || 'Work' || 'swe' || 'swe' || 'x' || 'y' || false + } +} From 0f0808ce768c3a8d501593e43c5448211071155e Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 29 Nov 2023 14:32:45 +0100 Subject: [PATCH 3/7] Organize scripts --- librisworks/run.sh | 13 +++++++------ .../scripts/{ => display}/display-clusters.groovy | 0 .../scripts/{ => display}/display-works.groovy | 0 .../{ => svsk}/add-missing-contribution-data.groovy | 0 .../{ => svsk}/contributions-to-instance.groovy | 0 .../{ => svsk}/drop-anonymous-translations.groovy | 0 .../{ => svsk}/language-in-work-title.groovy | 0 .../lxl-4150-deduplicate-contribution.groovy | 0 .../scripts/{ => svsk}/swedish-fiction.groovy | 0 9 files changed, 7 insertions(+), 6 deletions(-) rename librisworks/scripts/{ => display}/display-clusters.groovy (100%) rename librisworks/scripts/{ => display}/display-works.groovy (100%) rename librisworks/scripts/{ => svsk}/add-missing-contribution-data.groovy (100%) rename librisworks/scripts/{ => svsk}/contributions-to-instance.groovy (100%) rename librisworks/scripts/{ => svsk}/drop-anonymous-translations.groovy (100%) rename librisworks/scripts/{ => svsk}/language-in-work-title.groovy (100%) rename librisworks/scripts/{ => svsk}/lxl-4150-deduplicate-contribution.groovy (100%) rename librisworks/scripts/{ => svsk}/swedish-fiction.groovy (100%) diff --git a/librisworks/run.sh b/librisworks/run.sh index 2cd0b8fa16..f074a824db 100755 --- a/librisworks/run.sh +++ b/librisworks/run.sh @@ -27,6 +27,7 @@ WHELKTOOL_REPORT=whelktool-report CLUSTER_TSV=clusters.tsv SCRIPTS_DIR=scripts +SVSK_DIR=$SCRIPTS_DIR/svsk REPORT_DIR=reports/merge-works/$ENV-$(date +%Y%m%d) CLUSTERS_DIR=$REPORT_DIR/clusters @@ -84,7 +85,7 @@ fi echo echo "Filtering on Swedish fiction..." 
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \ - $ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SCRIPTS_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null + $ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SVSK_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null NUM_CLUSTERS=$(count_lines $SWEDISH_FICTION/$CLUSTER_TSV) echo "Found $NUM_CLUSTERS title clusters with Swedish fiction" if [ $NUM_CLUSTERS == 0 ]; then @@ -95,31 +96,31 @@ fi echo echo "Removing language from work titles..." time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \ - $ARGS --report $LANGUAGE_IN_TITLE $SCRIPTS_DIR/language-in-work-title.groovy 2>/dev/null + $ARGS --report $LANGUAGE_IN_TITLE $SVSK_DIR/language-in-work-title.groovy 2>/dev/null echo "$(count_lines $LANGUAGE_IN_TITLE/MODIFIED.txt) records affected, report in $LANGUAGE_IN_TITLE" echo echo "Merging contribution objects with same agent..." time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \ - $ARGS --report $DEDUPLICATE_CONTRIBUTIONS $SCRIPTS_DIR/lxl-4150-deduplicate-contribution.groovy 2>/dev/null + $ARGS --report $DEDUPLICATE_CONTRIBUTIONS $SVSK_DIR/lxl-4150-deduplicate-contribution.groovy 2>/dev/null echo "$(count_lines $DEDUPLICATE_CONTRIBUTIONS/MODIFIED.txt) records affected, report in $DEDUPLICATE_CONTRIBUTIONS" echo echo "Adding missing contribution data..." 
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \ - $ARGS --report $ADD_MISSING_CONTRIBUTION_DATA $SCRIPTS_DIR/add-missing-contribution-data.groovy 2>/dev/null + $ARGS --report $ADD_MISSING_CONTRIBUTION_DATA $SVSK_DIR/add-missing-contribution-data.groovy 2>/dev/null echo "$(count_lines $ADD_MISSING_CONTRIBUTION_DATA/MODIFIED.txt) records affected, report in $ADD_MISSING_CONTRIBUTION_DATA" echo echo "Moving roles to instance..." time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \ - $ARGS --report $ROLES_TO_INSTANCE $SCRIPTS_DIR/contributions-to-instance.groovy 2>/dev/null + $ARGS --report $ROLES_TO_INSTANCE $SVSK_DIR/contributions-to-instance.groovy 2>/dev/null echo "$(count_lines $ROLES_TO_INSTANCE/MODIFIED.txt) records affected, report in $ROLES_TO_INSTANCE" # Filter: Drop anonymous translations echo "Filtering out anonymous translations..." time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \ - $ARGS --report $NO_ANONYMOUS_TRANSLATIONS/$WHELKTOOL_REPORT $SCRIPTS_DIR/drop-anonymous-translations.groovy \ + $ARGS --report $NO_ANONYMOUS_TRANSLATIONS/$WHELKTOOL_REPORT $SVSK_DIR/drop-anonymous-translations.groovy \ >$NO_ANONYMOUS_TRANSLATIONS/$CLUSTER_TSV 2>/dev/null NUM_CLUSTERS=$(count_lines $NO_ANONYMOUS_TRANSLATIONS/$CLUSTER_TSV) echo "$NUM_CLUSTERS clusters ready for merge" diff --git a/librisworks/scripts/display-clusters.groovy b/librisworks/scripts/display/display-clusters.groovy similarity index 100% rename from librisworks/scripts/display-clusters.groovy rename to librisworks/scripts/display/display-clusters.groovy diff --git a/librisworks/scripts/display-works.groovy b/librisworks/scripts/display/display-works.groovy similarity index 100% rename from librisworks/scripts/display-works.groovy rename to librisworks/scripts/display/display-works.groovy diff 
--git a/librisworks/scripts/add-missing-contribution-data.groovy b/librisworks/scripts/svsk/add-missing-contribution-data.groovy similarity index 100% rename from librisworks/scripts/add-missing-contribution-data.groovy rename to librisworks/scripts/svsk/add-missing-contribution-data.groovy diff --git a/librisworks/scripts/contributions-to-instance.groovy b/librisworks/scripts/svsk/contributions-to-instance.groovy similarity index 100% rename from librisworks/scripts/contributions-to-instance.groovy rename to librisworks/scripts/svsk/contributions-to-instance.groovy diff --git a/librisworks/scripts/drop-anonymous-translations.groovy b/librisworks/scripts/svsk/drop-anonymous-translations.groovy similarity index 100% rename from librisworks/scripts/drop-anonymous-translations.groovy rename to librisworks/scripts/svsk/drop-anonymous-translations.groovy diff --git a/librisworks/scripts/language-in-work-title.groovy b/librisworks/scripts/svsk/language-in-work-title.groovy similarity index 100% rename from librisworks/scripts/language-in-work-title.groovy rename to librisworks/scripts/svsk/language-in-work-title.groovy diff --git a/librisworks/scripts/lxl-4150-deduplicate-contribution.groovy b/librisworks/scripts/svsk/lxl-4150-deduplicate-contribution.groovy similarity index 100% rename from librisworks/scripts/lxl-4150-deduplicate-contribution.groovy rename to librisworks/scripts/svsk/lxl-4150-deduplicate-contribution.groovy diff --git a/librisworks/scripts/swedish-fiction.groovy b/librisworks/scripts/svsk/swedish-fiction.groovy similarity index 100% rename from librisworks/scripts/swedish-fiction.groovy rename to librisworks/scripts/svsk/swedish-fiction.groovy From f89dc74c771cfa5a48233d269cd5b98c4c6ca8a4 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 29 Nov 2023 14:35:24 +0100 Subject: [PATCH 4/7] Enable running on EDU environment --- librisworks/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librisworks/run.sh b/librisworks/run.sh index 
f074a824db..e6bc750100 100755 --- a/librisworks/run.sh +++ b/librisworks/run.sh @@ -12,7 +12,7 @@ count_lines() { fi } -if ! [[ "$1" =~ ^(local|dev|dev2|qa|stg|prod)$ ]]; then +if ! [[ "$1" =~ ^(local|dev|dev2|qa|stg|prod|edu)$ ]]; then echo "Missing or invalid environment" exit 1 fi From ecb2fb30c89850e0f3eb79c7e2b733bd270d3731 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 29 Nov 2023 14:56:34 +0100 Subject: [PATCH 5/7] Fix typo --- librisworks/scripts/find-work-clusters.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librisworks/scripts/find-work-clusters.groovy b/librisworks/scripts/find-work-clusters.groovy index 47681a374b..e4841f7a7a 100644 --- a/librisworks/scripts/find-work-clusters.groovy +++ b/librisworks/scripts/find-work-clusters.groovy @@ -2,7 +2,7 @@ * Find clusters of records that may contain descriptions of the same work. * In short, similar descriptions are found by, for each bib record, querying Elastic for other records * having the same instance or work title and the same agent(s) in work contribution. - * The ids found by the query becomes a cluster. + * The ids found by the query become a cluster. * See script for more details. 
* * (When running, redirect STDERR to avoid annoying prints from whelktool) From 36dae9669c05579a721acf637b81754f149f82bd Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 29 Nov 2023 15:20:59 +0100 Subject: [PATCH 6/7] Sync with recent commit on develop (don't overwrite changes) --- .../src/main/groovy/se/kb/libris/mergeworks/Doc.groovy | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy index 2378fa9b1e..c2b829a2bf 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy @@ -306,7 +306,7 @@ class Doc { } boolean isSaogfFiction() { - genreForm().any { whelk.relations.isImpliedBy(SAOGF_SKÖN, it[ID_KEY] ?: '') } + genreForm().any { it[ID_KEY] == SAOGF_SKÖN || whelk.relations.isImpliedBy(SAOGF_SKÖN, it[ID_KEY] ?: '') } } boolean isSabFiction() { @@ -356,10 +356,11 @@ class Doc { 'https://id.kb.se/term/barngf/Bliss%20%28symbolspr%C3%A5k%29' ] as Set + def saoGfTactile = 'https://id.kb.se/term/saogf/Taktila%20verk' + asList(workData[CONTENT_TYPE]).contains([(ID_KEY): 'https://id.kb.se/term/rda/TactileText']) || asList(instanceData?.carrierType).any { it[ID_KEY] in ['https://id.kb.se/marc/Braille', 'https://id.kb.se/marc/TacMaterialType-b'] } - || genreForm().any {it[ID_KEY] in barnGfs - || whelk.relations.isImpliedBy('https://id.kb.se/term/saogf/Taktila%20verk', it[ID_KEY]) } + || genreForm().any {it[ID_KEY] in barnGfs || it[ID_KEY] == saoGfTactile || whelk.relations.isImpliedBy(saoGfTactile, it[ID_KEY]) } } boolean isThesis() { From 1bbaccfdbf60b9b9dc191e7cadf61de0f8eb4565 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 29 Nov 2023 15:25:08 +0100 Subject: [PATCH 7/7] Fix instantiating Whelk --- .../mergeworks/compare/IntendedAudienceSpec.groovy | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git 
a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/IntendedAudienceSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/IntendedAudienceSpec.groovy index aa7ac5eac1..b5971c919d 100644 --- a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/IntendedAudienceSpec.groovy +++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/IntendedAudienceSpec.groovy @@ -11,6 +11,15 @@ class IntendedAudienceSpec extends Specification { private static def juvenile = ['@id': 'https://id.kb.se/marc/Juvenile'] private static def blank = ['label': 'x'] + static def whelk = null + static { + try { + whelk = Whelk.createLoadedSearchWhelk() + } catch (Exception e) { + System.err.println("Unable to instantiate whelk: $e") + } + } + def "is compatible"() { expect: new IntendedAudience().isCompatible(a, b) == result @@ -31,7 +40,7 @@ class IntendedAudienceSpec extends Specification { def "preferred comparison order"() { given: - Whelk whelk = Whelk.createLoadedSearchWhelk() +// Whelk whelk = Whelk.createLoadedSearchWhelk() def intendedAudience = [[juvenile], [adult], [juvenile], [], [adult], [general]] List docs = intendedAudience.collect { def data = ['@graph': [[], ['instanceOf': ['intendedAudience': it]]]]