#!/bin/bash
#
# librisxl-tools/scripts/merge-works.sh
#
# Runs the work-merging pipeline against one environment:
#   1. find work clusters, 2. merge overlapping clusters, 3. find title
#   clusters, 4. keep Swedish fiction, 5. run the normalization scripts,
#   6. drop anonymous translations, 7. merge.
# The pipeline stops with exit status 0 as soon as a clustering/filter step
# yields no clusters.
#
# Usage: merge-works.sh <local|dev|dev2|qa|stg|prod> [extra tool args...]
#
# Run from whelktool dir (build/libs, scripts/ and src/ are referenced with
# relative paths). Credentials are read from $HOME/secret.properties-<env>.
# Output goes to reports/merge-works/<env>-<yyyymmdd>/.

#######################################
# Print the number of lines in a file.
# Arguments: $1 - path to the file
# Outputs:   the line count, or 0 when $1 is not a regular file
#######################################
count_lines() {
  local file=$1
  local count=0
  if [[ -f "$file" ]]; then
    # Read via stdin so wc prints the number only; strip the padding that
    # some wc implementations put in front of it.
    count=$(wc -l <"$file") || count=0
    count=${count//[[:space:]]/}
  fi
  printf '%s\n' "${count:-0}"
}

#######################################
# Run a whelktool script (java -jar whelktool.jar).
# Globals:   SECRET_PROPERTIES, WHELKTOOL_JAR, ARGS (read)
# Arguments: $1 - clusters file passed as -Dclusters=..., or "" for none
#            $2.. - arguments placed after the pass-through ARGS
#######################################
run_whelktool() {
  local clusters=$1
  shift
  local jvm_opts=("-Dxl.secret.properties=$SECRET_PROPERTIES")
  if [[ -n "$clusters" ]]; then
    jvm_opts+=("-Dclusters=$clusters")
  fi
  java "${jvm_opts[@]}" -jar "$WHELKTOOL_JAR" "${ARGS[@]}" "$@"
}

#######################################
# Run datatool.WorkTool from the whelktool jar.
# Globals:   SECRET_PROPERTIES, WHELKTOOL_JAR, ARGS (read)
# Arguments: $@ - arguments placed after the pass-through ARGS
#######################################
run_work_tool() {
  java "-Dxl.secret.properties=$SECRET_PROPERTIES" \
    -cp "$WHELKTOOL_JAR" datatool.WorkTool "${ARGS[@]}" "$@"
}

#######################################
# Run one normalization script and report how many records it modified.
# Arguments: $1 - progress message
#            $2 - clusters file for -Dclusters=..., or "" for none
#            $3 - report directory (MODIFIED.txt is counted afterwards)
#            $4 - path to the groovy script
#######################################
normalize() {
  local message=$1
  local clusters=$2
  local report_dir=$3
  local script=$4
  echo
  echo "$message"
  # stderr of the tool is discarded on purpose, as in the other whelktool runs.
  time run_whelktool "$clusters" --report "$report_dir" "$script" 2>/dev/null
  echo "$(count_lines "$report_dir/MODIFIED.txt") records affected, report in $report_dir"
}

#######################################
# Exit successfully when a step produced nothing to continue with.
# Arguments: $1 - number of clusters produced by the previous step
#######################################
exit_if_none() {
  if (( $1 == 0 )); then
    exit 0
  fi
}

if ! [[ "${1:-}" =~ ^(local|dev|dev2|qa|stg|prod)$ ]]; then
  echo "Missing or invalid environment" >&2
  echo "Usage: ${0##*/} <local|dev|dev2|qa|stg|prod> [extra tool args...]" >&2
  exit 1
fi

ENV=$1
# Keep the pass-through arguments as an array so each one stays a single word.
ARGS=("${@:2}")
NUM_CLUSTERS=0

SECRET_PROPERTIES=$HOME/secret.properties-$ENV
WHELKTOOL_JAR=build/libs/whelktool.jar

REPORT_DIR=reports/merge-works/$ENV-$(date +%Y%m%d)

if ! mkdir -p "$REPORT_DIR"/{clusters,normalizations,merged-works}; then
  echo "Could not create report directories below $REPORT_DIR" >&2
  exit 1
fi

CLUSTERS_DIR=$REPORT_DIR/clusters
NORMALIZATIONS_DIR=$REPORT_DIR/normalizations

FIND_CLUSTERS=$CLUSTERS_DIR/find-clusters
ALL_CLUSTERS=$CLUSTERS_DIR/1-all.tsv
MERGED_CLUSTERS=$CLUSTERS_DIR/2-merged.tsv
TITLE_CLUSTERS=$CLUSTERS_DIR/3-title-clusters.tsv
SWEDISH_FICTION=$CLUSTERS_DIR/4-swedish-fiction.tsv
NO_ANONYMOUS_TRANSLATIONS=$CLUSTERS_DIR/5-no-anonymous-translations.tsv

LANGUAGE_IN_TITLE=$NORMALIZATIONS_DIR/1-titles-with-language
ELIB_DESIGNERS=$NORMALIZATIONS_DIR/2-elib-cover-designer
CONTRIBUTION=$NORMALIZATIONS_DIR/3-contribution
ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/4-roles-to-instance

NORMALIZE_SCRIPTS=src/main/groovy/datatool/scripts/mergeworks/normalize

# Clustering step 1
# TODO: run only on recently updated records after first run
echo "Finding new clusters..."
time run_whelktool "" --report "$FIND_CLUSTERS" \
  scripts/analysis/find-work-clusters.groovy >"$ALL_CLUSTERS" 2>/dev/null
NUM_CLUSTERS=$(count_lines "$ALL_CLUSTERS")
echo "$NUM_CLUSTERS clusters found"
exit_if_none "$NUM_CLUSTERS"

# Clustering step 2
echo
echo "Merging clusters..."
time run_whelktool "$ALL_CLUSTERS" \
  scripts/analysis/merge-clusters.groovy >"$MERGED_CLUSTERS" 2>/dev/null
NUM_CLUSTERS=$(count_lines "$MERGED_CLUSTERS")
echo "Merged into $NUM_CLUSTERS clusters"
exit_if_none "$NUM_CLUSTERS"

# Clustering step 3
echo
echo "Finding title clusters..."
time run_work_tool -tc "$MERGED_CLUSTERS" >"$TITLE_CLUSTERS"
NUM_CLUSTERS=$(count_lines "$TITLE_CLUSTERS")
echo "$NUM_CLUSTERS title clusters found"
exit_if_none "$NUM_CLUSTERS"

# Filter: Swedish fiction
echo
echo "Filtering on Swedish fiction..."
time run_work_tool -f "$TITLE_CLUSTERS" >"$SWEDISH_FICTION"
NUM_CLUSTERS=$(count_lines "$SWEDISH_FICTION")
echo "Found $NUM_CLUSTERS title clusters with Swedish fiction"
exit_if_none "$NUM_CLUSTERS"

# Normalization
normalize "Removing language from work titles..." \
  "$SWEDISH_FICTION" "$LANGUAGE_IN_TITLE" \
  "$NORMALIZE_SCRIPTS/language-in-work-title.groovy"

# This cleanup takes no clusters file.
normalize "Specifying designer roles in Elib records..." \
  "" "$ELIB_DESIGNERS" \
  scripts/cleanups/2023/05/lxl-4183-elib-cover-designer.groovy

normalize "Normalizing contribution..." \
  "$SWEDISH_FICTION" "$CONTRIBUTION" \
  "$NORMALIZE_SCRIPTS/contribution.groovy"

normalize "Moving roles to instance..." \
  "$SWEDISH_FICTION" "$ROLES_TO_INSTANCE" \
  "$NORMALIZE_SCRIPTS/contributions-to-instance.groovy"

# Filter: Drop anonymous translations
echo "Filtering out anonymous translations..."
time run_work_tool -tr "$SWEDISH_FICTION" >"$NO_ANONYMOUS_TRANSLATIONS"
NUM_CLUSTERS=$(count_lines "$NO_ANONYMOUS_TRANSLATIONS")
echo "$NUM_CLUSTERS clusters ready for merge"
exit_if_none "$NUM_CLUSTERS"

# Merge
echo
echo "Merging..."
time run_work_tool -r "$REPORT_DIR/merged-works" -m "$NO_ANONYMOUS_TRANSLATIONS"
+time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -cp build/libs/whelktool.jar datatool.WorkTool \ + $ARGS -r $REPORT_DIR/merged-works -m $NO_ANONYMOUS_TRANSLATIONS \ No newline at end of file diff --git a/whelk-core/src/main/groovy/se/kb/libris/Normalizers.groovy b/whelk-core/src/main/groovy/se/kb/libris/Normalizers.groovy index 3261567c58..7ccdbf20d6 100644 --- a/whelk-core/src/main/groovy/se/kb/libris/Normalizers.groovy +++ b/whelk-core/src/main/groovy/se/kb/libris/Normalizers.groovy @@ -15,6 +15,7 @@ import whelk.util.Romanizer import static whelk.JsonLd.GRAPH_KEY import static whelk.JsonLd.ID_KEY import static whelk.JsonLd.TYPE_KEY +import static whelk.JsonLd.WORK_KEY import static whelk.JsonLd.asList import static whelk.util.DocumentUtil.traverse @@ -192,12 +193,19 @@ class Normalizers { } } - static Map getWork(JsonLd jsonLd, Document doc) { - def (_record, thing) = doc.data['@graph'] - if (thing && isInstanceOf(jsonLd, thing, 'Work')) { + static Map getWork(Whelk whelk, Document doc) { + def (_record, thing) = doc.data[GRAPH_KEY] + if (thing && isInstanceOf(whelk.jsonld, thing, 'Work')) { return thing - } else if (thing && thing['instanceOf'] && isInstanceOf(jsonLd, thing['instanceOf'], 'Work')) { - return thing['instanceOf'] + } + else if (thing && thing[WORK_KEY]) { + def linked = thing[WORK_KEY][ID_KEY] + if (linked) { + return getWork(whelk, whelk.storage.getDocumentByIri(linked)) + } + if (isInstanceOf(whelk.jsonld, thing[WORK_KEY], 'Work')) { + return thing[WORK_KEY] + } } return null } diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index a604efecc5..bdf4e46b4c 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -62,7 +62,7 @@ class Whelk { URI baseUri = null boolean skipIndex = false boolean skipIndexDependers = false - + // useCache may be set to true only when doing initial imports (temporary processes with the rest 
of Libris down). // Any other use of this results in a "local" cache, which will not be invalidated when data changes elsewhere, // resulting in potential serving of stale data. @@ -263,31 +263,29 @@ class Whelk { def systemId = Document.BASE_URI.resolve(id).getPath().substring(1) idMap[systemId] = id systemIds << systemId - } - else if (JsonLd.looksLikeIri(id)) { + } else if (JsonLd.looksLikeIri(id)) { otherIris << id - } - else { + } else { systemIds << id } } if (otherIris) { Map idToIri = storage.getSystemIdsByIris(otherIris) .collectEntries { k, v -> [(v): k] } - + systemIds.addAll(idToIri.keySet()) idMap.putAll(idToIri) } - + return storage.bulkLoad(systemIds) .findAll { id, doc -> !doc.deleted } - .collectEntries { id, doc -> [(idMap.getOrDefault(id, id)) : doc]} + .collectEntries { id, doc -> [(idMap.getOrDefault(id, id)): doc] } } - + private void reindexUpdated(Document updated, Document preUpdateDoc) { indexAsyncOrSync { elastic.index(updated, this) - + if (!skipIndexDependers) { if (hasChangedMainEntityId(updated, preUpdateDoc)) { reindexAllLinks(updated.shortId) @@ -297,17 +295,17 @@ class Whelk { } } } - + private void indexAsyncOrSync(Runnable runnable) { if (skipIndex) { return } - - if(!elastic) { + + if (!elastic) { log.warn("Elasticsearch not configured when trying to reindex") return } - + Runnable reindex = { try { runnable.run() @@ -316,7 +314,7 @@ class Whelk { log.error("Error reindexing: $e", e) } } - + if (isBatchJobThread()) { // Update them synchronously reindex.run() @@ -337,30 +335,29 @@ class Whelk { Set removedLinks = (preUpdateLinks - postUpdateLinks) removedLinks.findResults { storage.getSystemIdByIri(it.iri) } - .each{id -> elastic.decrementReverseLinks(id) } + .each { id -> elastic.decrementReverseLinks(id) } addedLinks.each { link -> String id = storage.getSystemIdByIri(link.iri) if (id) { Document doc = storage.load(id) def lenses = ['chips', 'cards', 'full'] - def reverseRelations = lenses.collect{ 
jsonld.getInverseProperties(doc.data, it) }.flatten() + def reverseRelations = lenses.collect { jsonld.getInverseProperties(doc.data, it) }.flatten() if (reverseRelations.contains(link.relation)) { // we added a link to a document that includes us in its @reverse relations, reindex it elastic.index(doc, this) - } - else { + } else { // just update link counter elastic.incrementReverseLinks(id) } } } - + if (storage.isCardChangedOrNonexistent(document.getShortId())) { bulkIndex(elastic.getAffectedIds(document.getThingIdentifiers() + document.getRecordIdentifiers())) } } - + private void bulkIndex(Iterable ids) { Iterables.partition(ids, 100).each { elastic.bulkIndexWithRetry(it, this) @@ -377,12 +374,12 @@ class Whelk { // Identifiers-table lookup on: List uriIDs = document.getRecordIdentifiers() - uriIDs.addAll( document.getThingIdentifiers() ) + uriIDs.addAll(document.getThingIdentifiers()) for (String uriID : uriIDs) { String systemId = storage.getSystemIdByIri(uriID) if (systemId != null && systemId != document.getShortId()) { log.info("Determined that " + document.getShortId() + " is duplicate of " + systemId + " due to collision on URI: " + uriID) - collidingSystemIDs.add( new Tuple2(systemId, "on URI: " + uriID) ) + collidingSystemIDs.add(new Tuple2(systemId, "on URI: " + uriID)) } } @@ -403,7 +400,7 @@ class Whelk { if (includingTypedIDs) { for (String collision : collisions) { if (collision != document.getShortId()) - collidingSystemIDs.add( new Tuple2(collision, "on typed id: " + type + "," + graphIndex + "," + value) ) + collidingSystemIDs.add(new Tuple2(collision, "on typed id: " + type + "," + graphIndex + "," + value)) } } else { @@ -423,7 +420,7 @@ class Whelk { */ boolean createDocument(Document document, String changedIn, String changedBy, String collection, boolean deleted) { normalize(document) - + boolean detectCollisionsOnTypedIDs = false List> collidingIDs = getIdCollisions(document, detectCollisionsOnTypedIDs) if (!collidingIDs.isEmpty()) { @@ 
-465,7 +462,7 @@ class Whelk { if (updated == null || preUpdateDoc == null) { return false } - + reindexUpdated(updated, preUpdateDoc) sparqlUpdater?.pollNow() @@ -480,7 +477,7 @@ class Whelk { if (updated == null) { return } - + reindexUpdated(updated, preUpdateDoc) sparqlUpdater?.pollNow() } @@ -493,15 +490,22 @@ class Whelk { boolean quickCreateDocument(Document document, String changedIn, String changedBy, String collection) { return storage.quickCreateDocument(document, changedIn, changedBy, collection) } - - void remove(String id, String changedIn, String changedBy, boolean force=false) { + + void remove(String id, String changedIn, String changedBy, boolean force = false) { log.debug "Deleting ${id} from Whelk" - Document doc = storage.load(id) - storage.remove(id, changedIn, changedBy, force) - indexAsyncOrSync { - elastic.remove(id) - if (!skipIndexDependers) { - reindexAffected(doc, doc.getExternalRefs(), Collections.emptySet()) + Document doc + try { + doc = storage.load(id) + } catch (Exception e) { + log.warn "Could not remove object from whelk. 
No entry with id $id found" + } + if (doc) { + storage.remove(id, changedIn, changedBy, force) + indexAsyncOrSync { + elastic.remove(id) + if (!skipIndexDependers) { + reindexAffected(doc, doc.getExternalRefs(), Collections.emptySet()) + } } } } @@ -513,13 +517,12 @@ class Whelk { } void embellish(Document document, List levels = null) { - def docsByIris = { List iris -> bulkLoad(iris).values().collect{ it.data } } + def docsByIris = { List iris -> bulkLoad(iris).values().collect { it.data } } Embellisher e = new Embellisher(jsonld, docsByIris, storage.&getCards, relations.&getByReverse) if (levels) { e.setEmbellishLevels(levels) - } - else if (document.getThingType() == 'Item') { + } else if (document.getThingType() == 'Item') { e.setEmbellishLevels(['cards']) e.setFollowInverse(false) } @@ -545,7 +548,7 @@ class Whelk { } } } - + return result } diff --git a/whelk-core/src/main/groovy/whelk/util/Statistics.groovy b/whelk-core/src/main/groovy/whelk/util/Statistics.groovy index e99b258421..411f821015 100644 --- a/whelk-core/src/main/groovy/whelk/util/Statistics.groovy +++ b/whelk-core/src/main/groovy/whelk/util/Statistics.groovy @@ -13,7 +13,7 @@ class Statistics { ThreadLocal> context = ThreadLocal.withInitial({ -> null }) int numExamples - + Statistics(int numExamples = 1) { this.numExamples = numExamples } diff --git a/whelk-core/src/main/groovy/whelk/util/Unicode.groovy b/whelk-core/src/main/groovy/whelk/util/Unicode.groovy index c5620c1f5f..ad97201b19 100644 --- a/whelk-core/src/main/groovy/whelk/util/Unicode.groovy +++ b/whelk-core/src/main/groovy/whelk/util/Unicode.groovy @@ -50,6 +50,8 @@ class Unicode { [(it): Normalizer.normalize(it, Normalizer.Form.NFKC)] } + STRIP_UNICODE_CHARS.collectEntries { [(it): ''] } } + + private static final Pattern UNICODE_MARK = Pattern.compile('\\p{M}') static boolean isNormalized(String s) { return Normalizer.isNormalized(s, Normalizer.Form.NFC) && !EXTRA_NORMALIZATION_MAP.keySet().any{ s.contains(it) } @@ -90,11 +92,11 @@ 
class Unicode { def m = s =~ /[^${w}]*(.*)/ return m.matches() ? m.group(1) : s } - + static String trim(String s) { s.replaceFirst(LEADING_SPACE, '').replaceFirst(TRAILING_SPACE, '') } - + static Optional guessScript(String s) { s = s.replaceAll(~/\p{IsCommon}|\p{IsInherited}|\p{IsUnknown}/, '') @@ -178,4 +180,8 @@ class Unicode { 'Vaii', ].each { add15924scriptCode(it) } } + + static String asciiFold(String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll(UNICODE_MARK, '') + } } diff --git a/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy b/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy index 4e6a59be6e..8ff47084b9 100644 --- a/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy +++ b/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy @@ -40,7 +40,7 @@ class UnicodeSpec extends Specification { ' _.:;|(Überzetsung)|;:. ' | '(Überzetsung)' ' _.:;| Ü b e r - z e t - s u n g |;:. ' | 'Ü b e r - z e t - s u n g' } - + def "trim"() { expect: Unicode.trim(dirty) == clean @@ -58,7 +58,7 @@ class UnicodeSpec extends Specification { '\r\nkeep leading line breaks' | '\r\nkeep leading line breaks' } - + def "double quotation marks"() { expect: Unicode.isNormalizedDoubleQuotes(dirty) == (dirty == clean) @@ -99,5 +99,23 @@ class UnicodeSpec extends Specification { Optional.of('Armn') | 'Պիպին նավի վրա' Optional.of('Kana') | 'デスノート' Optional.of('Hira') | 'とんとんとんと' + + def "u"() { + given: + String s = "übers" //uU+CC88 + String nfc = "übers" //U+C3BC + expect: + Unicode.isNormalized(s) == false + Unicode.normalize(s) == nfc + } + + def "asciiFold"() { + expect: + Unicode.asciiFold(unicode) == ascii + + where: + unicode | ascii + 'Désidéria' | 'Desideria' + 'Антон Павлович' | 'Антон Павлович' } } \ No newline at end of file diff --git a/whelktool/build.gradle b/whelktool/build.gradle index 2847670c82..c68523f273 100644 --- a/whelktool/build.gradle +++ b/whelktool/build.gradle @@ -53,6 +53,7 @@ dependencies { implementation 
"org.codehaus.groovy:groovy-jsr223:${groovyVersion}" implementation "org.codehaus.groovy:groovy:${groovyVersion}" implementation 'org.codehaus.jackson:jackson-mapper-asl:1.9.12' + implementation 'commons-codec:commons-codec:1.7' implementation group: 'xml-apis', name: 'xml-apis', version: '1.4.01' } diff --git a/whelktool/scripts/analysis/bib-249.groovy b/whelktool/scripts/analysis/bib-249.groovy new file mode 100644 index 0000000000..628d5acfcf --- /dev/null +++ b/whelktool/scripts/analysis/bib-249.groovy @@ -0,0 +1,81 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +s = new Statistics().printOnShutdown() + +selectByCollection('bib') { bib -> + try { + process(bib) + } + catch(Exception e) { + System.err.println(e) + e.printStackTrace() + } + +} + +void process(bib) { + def (record, thing, work) = bib.graph + + if(!work) { + return + } + + if(thing['marc:hasBib249']) { + boolean marcTrl = work['marc:languageNote'] == "marc:ItemIsOrIncludesATranslation" + + String hasTitle = hasTitle(thing, work) + + if (hasTitle == "diff") { + println (""" + ${bib.doc.getURI()} + ${thing['marc:hasBib249']} + marc:ItemIsOrIncludesATranslation ${marcTrl} + ${work.hasTitle} + """.stripIndent()) + } + + s.increment('hasTitle', hasTitle) + s.increment('shape', maybeList(thing['marc:hasBib249']) { map -> new TreeSet(map.keySet()) }) + s.increment('marc:ItemIsOrIncludesATranslation', "${marcTrl}") + } +} + +String hasTitle(thing, work) { + if (work.hasTitle) { + isSameTitle(thing, work) ? "match" : "diff" + } + else { + "no" + } +} + +boolean isSameTitle(def thing, def work) { + String t = getPathSafe(thing, ['marc:hasBib249', 'marc:originalTitle'], "TT") + String w = getPathSafe(work, ['hasTitle', 0, 'mainTitle'], "WT") + trim(w.toLowerCase()) == trim(t.toLowerCase()) +} + +Object maybeList(Object o, Closure c) { + o instanceof List + ? 
o.collect(c) + : c(o) +} + +private Object getPathSafe(item, path, defaultTo = null) { + for (p in path) { + if (item[p] != null) { + item = item[p] + } else { + return defaultTo + } + } + return item +} + +String trim(String s) { + // remove leading and trailing non-"alpha, digit or parentheses" + def w = /\(\)\p{IsAlphabetic}\p{Digit}/ + def m = s =~ /[^${w}]*([${w}- ]*[${w}])[^${w}]*/ + return m.matches() ? m.group(1) : s +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/bib-976.groovy b/whelktool/scripts/analysis/bib-976.groovy new file mode 100644 index 0000000000..5e26dfca23 --- /dev/null +++ b/whelktool/scripts/analysis/bib-976.groovy @@ -0,0 +1,114 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +class Script { + static PrintWriter notIn084 + static PrintWriter in084 + static PrintWriter noCode + static PrintWriter report + static PrintWriter errors +} +Script.notIn084 = getReportWriter("not-in-084.txt") +Script.in084 = getReportWriter("in-084.txt") +Script.noCode = getReportWriter("no-code.txt") +Script.report = getReportWriter("report.txt") +Script.errors = getReportWriter("errors.txt") + +s = new Statistics().printOnShutdown() + +selectByCollection('bib') { bib -> + try { + process(bib) + } + catch(Exception e) { + Script.errors.println("${bib.doc.shortId} $e") + e.printStackTrace(Script.errors) + } + +} + +void process(bib) { + def work = bib.graph[1]['instanceOf'] + + if(!work) { + return + } + + def bib976 = asList(work['marc:hasBib976']) + if(!bib976) { + return + } + + def (code, noCode) = bib976.split { it['marc:bib976-a'] } + def bib81 = sab(work) + + handleWithSabCode(bib, work, bib81, code) + handleWithoutSabCode(bib, work, bib81, noCode) +} + +void handleWithSabCode(bib, work, bib084, bib976) { + + bib976.each { + def (in084, notIn084) = bib976.split { x -> + def code = x['marc:bib976-a'] + bib084.findAll{ it.startsWith((code)) } + } + + in084.each { + s.increment('bib976-a', 'in classification') + } + + 
notIn084.each { + s.increment('bib976-a', 'not in classification') + s.increment('bib976-a not in classification', it) + } + + if (notIn084) { + Script.notIn084.println(""" + ${bib.doc.getURI()} + bib-976: ${notIn084.collect{ "${it['marc:bib976-a']} (${it['marc:bib976-b']})" }} + classification/kssb: $bib084 + """.stripIndent()) + } + + if (in084) { + Script.in084.println(""" + ${bib.doc.getURI()} + bib-976: ${in084.collect{ "${it['marc:bib976-a']} (${it['marc:bib976-b']})" }} + classification/kssb: $bib084 + """.stripIndent()) + } + + Script.report.println("${bib.doc.shortId} ${handled(in084, notIn084)}") + } +} + +String handled(in084, notIn084) { + if (!in084 && notIn084) { + return "ingen" + } + if (in084 && !notIn084) { + return "alla" + } + return "delvis" +} + +void handleWithoutSabCode(bib, work, bib084, bib976) { + if (bib976) { + def creator = bib.graph[0]['descriptionCreator']['@id'] + s.increment('bib976 without code', creator) + + bib976.each { + def label = it['marc:bib976-b'] + Script.noCode.println("${bib.doc.getURI()} $creator $label") + } + } +} + +List sab(work) { + asList(work['classification']).findAll{ it['inScheme'] ?: '' == 'kssb' }.collect{ it['code'] } +} + +def asList(x) { + (x ?: []).with {it instanceof List ? 
it : [it] } +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/broader-gf.groovy b/whelktool/scripts/analysis/broader-gf.groovy new file mode 100644 index 0000000000..a0e526466c --- /dev/null +++ b/whelktool/scripts/analysis/broader-gf.groovy @@ -0,0 +1,45 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +s = new Statistics(5).printOnShutdown() + +selectByCollection('bib') { bib -> + def work = getWork(bib) + + if(!work) { + return + } + + if(work['genreForm']) { + List ids = work['genreForm']['@id'] + if (ids.size() > 1) { + [ids, ids].combinations{ a,b -> + if (a != b) { + check(bib.whelk, a, b) + check(bib.whelk, b, a) + } + } + } + } +} + +void check(whelk, String a, String b) { + if (whelk.relations.isImpliedBy(a, b)) { + s.increment(a, b) + s.increment('#broader', a) + } +} + +Map getWork(def bib) { + def (record, thing, work) = bib.graph + if (thing && isInstanceOf(thing, 'Work')) { + return thing + } + else if(thing && thing['instanceOf'] && isInstanceOf(thing['instanceOf'], 'Work')) { + return thing['instanceOf'] + } + else if (work && isInstanceOf(work, 'Work')) { + return work + } + return null +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/find-work-clusters.groovy b/whelktool/scripts/analysis/find-work-clusters.groovy new file mode 100644 index 0000000000..bd9c61d24d --- /dev/null +++ b/whelktool/scripts/analysis/find-work-clusters.groovy @@ -0,0 +1,169 @@ +/** + * (When running, redirect STDERR to avoid annoying prints from whelktool) + */ + + +import java.text.SimpleDateFormat +import java.util.concurrent.ConcurrentHashMap +import se.kb.libris.Normalizers + +PrintWriter failedQueries = getReportWriter("failed-queries") +PrintWriter tooLargeResult = getReportWriter("too-large-result") + +def yesterday = new SimpleDateFormat('yyyy-MM-dd').with { sdf -> + Calendar.getInstance().with { c -> + c.add(Calendar.DATE, -1) + sdf.format(c.getTime()) + } +} + +def where = """ + collection = '%s' + AND 
(modified = '$yesterday' + OR (data#>>'{@graph,0,generationDate}')::date = '$yesterday') +""" + +visited = Collections.newSetFromMap(new ConcurrentHashMap()) // TODO: remove? +instancesOfUpdatedLinkedWorks = Collections.synchronizedSet([] as Set) + +selectBySqlWhere(String.format(where, 'auth')) { + def thing = it.graph[1] + if (Normalizers.isInstanceOf(it.whelk.jsonld, thing, 'Work')) { + selectBySqlWhere("collection = 'bib' and data#>>'{@graph,1,instanceOf,@id}' = '${thing['@id']}'") { + instancesOfUpdatedLinkedWorks.add(it.doc.shortId) + } + } +} + +def process = { bib -> + if (!visited.add(bib.doc.shortId)) + return + + try { + def q = buildQuery(bib) + if (!q) { + return + } + + List ids = queryIds(q).collect() + + if (ids.size() > 200) { + tooLargeResult.println("Results: ${ids.size()} Query: ${q}") + } + else if (ids.size() > 1) { + visited.addAll(ids) + println(ids.join('\t')) + } + } + catch (Exception e) { + failedQueries.println(e) + e.printStackTrace() + return + } +} + +selectByIds(instancesOfUpdatedLinkedWorks) { + process(it) +} + +// TODO: Change when starting to run regularly +//selectBySqlWhere(String.format(where, 'bib')) { bib -> +selectByCollection('bib') { + process(it) +} + +Map> buildQuery(bib) { + def title = title(bib) + + if (!title) + return null + + Map> query = [ + "q" : ["*"], + "@type" : ["Instance"], + "hasTitle.mainTitle": [esSafe(title)], + ] + + insertLinkedAgents(bib) + def card = bib.asCard(true) + + def author = primaryContributor(card).collect{ esSafe(it) } + if (author) { + query["or-instanceOf.contribution._str"] = author + query["or-instanceOf.contribution.agent._str"] = author + return query + } + + def allContributors = contributors(card).collect{ esSafe(it) } + if (allContributors) { + query["or-instanceOf.contribution._str"] = allContributors + query["or-instanceOf.contribution.agent._str"] = allContributors + return query + } + return null +} + +private void insertLinkedAgents(bib) { + getPathSafe(bib.doc.data, 
['@graph', 1, 'instanceOf', 'contribution']).each { + if (it.agent && it.agent['@id']) { + it.agent = loadThing(it.agent['@id']) + } + } +} + +private String title(bib) { + return getPathSafe(bib.doc.data, ['@graph', 1, 'hasTitle', 0, 'mainTitle']) +} + +private List primaryContributor(bib) { + contributorStrings(getPathSafe(bib, ['@graph', 1, 'instanceOf', 'contribution'], []).find { it['@type'] == "PrimaryContribution" }) +} + +private List contributors(bib) { + getPathSafe(bib, ['@graph', 1, 'instanceOf', 'contribution'], []).collect { contributorStrings(it) }.grep().flatten() +} + +//getPathSafe(contribution, ['_str'])?.with { String s -> s.replaceAll(/[^ \p{IsAlphabetic}]/, '') } +private List contributorStrings(contribution) { + List variants = asList(contribution?.agent) + asList(getPathSafe(contribution, ['agent', 'hasVariant'])) + + variants.collect { name(it) }.grep() +} + +private String name(Map agent) { + agent.givenName && agent.familyName + ? "${agent.givenName} ${agent.familyName}" + : agent.name +} + +// Remove ES query operators from string +private String esSafe(String s) { + s.replaceAll('[+|"\\-*~]', " ") +} + +private Object getPathSafe(item, path, defaultTo = null) { + if (!item) { + return defaultTo + } + + for (p in path) { + if (item[p] != null) { + item = item[p] + } else { + return defaultTo + } + } + return item +} + +private Map loadThing(def id) { + def thing = [:] + selectByIds([id]) { t -> + thing = t.graph[1] + } + return thing +} + +private static List asList(Object o) { + (o ?: []).with { it instanceof List ? it : [it] } +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/local-broader.groovy b/whelktool/scripts/analysis/local-broader.groovy new file mode 100644 index 0000000000..10a87e7bde --- /dev/null +++ b/whelktool/scripts/analysis/local-broader.groovy @@ -0,0 +1,61 @@ +/** + * Find unlinked 'broader' + * + * See LXL-3213 for more information. 
+ */ + + +import groovy.transform.Memoized +import whelk.util.DocumentUtil + +class Script { + static PrintWriter report + static PrintWriter selfRef + static PrintWriter is404 + static PrintWriter error +} +Script.report = getReportWriter("report.txt") +Script.selfRef = getReportWriter("self-ref.txt") +Script.error = getReportWriter("error.txt") +Script.is404 = getReportWriter("404.txt") + +selectByCollection('auth') { auth -> + try { + process(auth) + } + catch(Exception e) { + //Script.error. + println("${auth.doc.shortId} $e") + e.printStackTrace() + } +} + +void process(auth) { + Map thing = auth.graph[1] + String id = thing['@id'] + List broader = thing['broader'] + + if (!broader) { + return + } + + broader.findAll{ !it['@id'] }.each { Map b -> + Script.report.println("$id $b") + } + + broader.findAll{ id == it['@id'] }.each { Map b -> + Script.selfRef.println("$id") + } + broader.findAll{ it['@id'] && is404(it['@id']) }.each { Map b -> + Script.is404.println("$id $b") + } +} + +@Memoized +boolean is404(String id) { + Map thing = null + selectByIds([id]) { auth -> + thing = auth.graph[1] + } + return thing == null +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/mediaterm.groovy b/whelktool/scripts/analysis/mediaterm.groovy new file mode 100644 index 0000000000..4e015e8079 --- /dev/null +++ b/whelktool/scripts/analysis/mediaterm.groovy @@ -0,0 +1,27 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +Statistics s = new Statistics(5) +s.printOnShutdown() + +selectByCollection('bib') { bib -> + try { + DocumentUtil.findKey(bib.doc.data, 'marc:mediaTerm') { String value, path -> + if (value.contains(']')) { + String mediaType = value.substring(0, value.indexOf(']')) + String suffix = value.substring(value.indexOf(']') + 1) + if (!suffix.isBlank()) { + String id = bib.doc.shortId + s.increment('ALL', suffix, id) + s.increment(mediaType, suffix, id) + s.increment('TOTAL', 'TOTAL') + } + } + + } + } + catch(Exception e) { + 
println(e) + e.printStackTrace() + } +} \ No newline at end of file diff --git a/whelktool/scripts/examples/merge-clusters.groovy b/whelktool/scripts/analysis/merge-clusters.groovy similarity index 58% rename from whelktool/scripts/examples/merge-clusters.groovy rename to whelktool/scripts/analysis/merge-clusters.groovy index c5af74531a..cbb150e180 100644 --- a/whelktool/scripts/examples/merge-clusters.groovy +++ b/whelktool/scripts/analysis/merge-clusters.groovy @@ -1,15 +1,11 @@ import datatool.util.DisjointSets -String dir = System.getProperty('clustersDir') -mergeClusters( - new File(dir, 'clusters.tsv'), - new File(dir, 'clusters-merged.tsv')) +mergeClusters(new File(System.getProperty('clusters'))) -void mergeClusters(File input, File output) throws FileNotFoundException { +void mergeClusters(File clusters) throws FileNotFoundException { DisjointSets sets = new DisjointSets<>() - PrintWriter p = new PrintWriter(output) - input.eachLine() { + clusters.eachLine { sets.addSet(Arrays.asList(it.split(/[\t ]+/))) } @@ -19,13 +15,13 @@ void mergeClusters(File input, File output) throws FileNotFoundException { void nextElement(String e) { if(!first) print('\t') - p.print(e) + print(e) first = false } @Override void closeSet() { - p.println() + println() first = true } }) diff --git a/whelktool/scripts/analysis/oversattning-without.trl.groovy b/whelktool/scripts/analysis/oversattning-without.trl.groovy new file mode 100644 index 0000000000..18e008e764 --- /dev/null +++ b/whelktool/scripts/analysis/oversattning-without.trl.groovy @@ -0,0 +1,49 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +Statistics s = new Statistics().printOnShutdown() + +selectByCollection('bib') { bib -> + def work = getWork(bib) + + if(!work) { + return + } + + if(work['marc:languageNote'] == "marc:ItemIsOrIncludesATranslation" + && noTranslator(work.contribution ?: []) + && (bib.graph[1].responsibilityStatement ?: "").contains('övers') + ) { + println (""" + 
${bib.doc.getURI()} + ${work.contribution} + ${bib.graph[1].responsibilityStatement} + + """.stripIndent()) + s.increment('tot', 'tot') + } +} + +boolean noTranslator(def contribution) { + boolean found = false + DocumentUtil.findKey(contribution, '@id') { value, path -> + if (value == 'https://id.kb.se/relator/translator') { + found = true + } + DocumentUtil.NOP + } + + return !found +} + + +Map getWork(def bib) { + def (record, thing, work) = bib.graph + if (thing && isInstanceOf(thing, 'Work')) { + return thing + } + else if (work && isInstanceOf(work, 'Work')) { + return work + } + return null +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/respStatement-to-contribution.groovy b/whelktool/scripts/analysis/respStatement-to-contribution.groovy new file mode 100644 index 0000000000..1dc2201096 --- /dev/null +++ b/whelktool/scripts/analysis/respStatement-to-contribution.groovy @@ -0,0 +1,178 @@ +import groovy.transform.Memoized +import whelk.util.Statistics + +import static datatool.scripts.mergeworks.Util.asList +import static datatool.scripts.mergeworks.Util.parseRespStatement +import static datatool.scripts.mergeworks.Util.getPathSafe +import static datatool.scripts.mergeworks.Util.Relator +import static datatool.scripts.mergeworks.Util.bestEncodingLevel +import static datatool.scripts.mergeworks.WorkToolJob.nameMatch + +PrintWriter allStatements = getReportWriter("all-statements.csv") +PrintWriter notParsed = getReportWriter("not-parsed.txt") +PrintWriter roleSpecified = getReportWriter("role-specified.tsv") +PrintWriter agentFoundInCluster = getReportWriter("agent-found-in-cluster.tsv") +PrintWriter parsedButUnmatched = getReportWriter("parsed-but-unmatched.tsv") +PrintWriter pseudonyms = getReportWriter("pseudonyms") + +Statistics s = new Statistics().printOnShutdown() + +def clusters = System.getProperty('clustersDir') + .with {new File(it, 'clusters.tsv') } + .collect { it.split() as List } + +clusters.each { cluster -> + 
s.increment('fetch contribution from respStatement', 'clusters checked') + + selectByIds(cluster) { bib -> + def data = bib.doc.data + def id = bib.doc.shortId + def respStatement = getPathSafe(data, ['@graph', 1, 'responsibilityStatement']) + def encodingLevel = getPathSafe(data, ['@graph', 0, 'encodingLevel']) + + if (!respStatement) + return + + s.increment('fetch contribution from respStatement', 'docs checked') + allStatements.println(respStatement) + + def contributionsInRespStmt = parseRespStatement(respStatement) + def contribution = getPathSafe(data, ['@graph', 1, 'instanceOf', 'contribution'], []) + + if (contributionsInRespStmt.isEmpty()) { + notParsed.println([respStatement, id].join('\t')) + return + } + + contribution.each { Map c -> + asList(c.agent).each { a -> + def matchedOnName = contributionsInRespStmt.find { n, r -> + nameMatch(n, loadIfLink(a)) + } + + if (!matchedOnName) + return + + // Contributor found locally, omit from further search + contributionsInRespStmt.remove(matchedOnName.key) + + + def dontAdd = { Relator relator, boolean isFirstStmtPart -> + relator == Relator.UNSPECIFIED_CONTRIBUTOR + || isFirstStmtPart && relator == Relator.AUTHOR + && c.'@type' != 'PrimaryContribution' + } + + def rolesInRespStatement = matchedOnName.value + .findResults { dontAdd(it) ? null : it.getV1() } + + if (rolesInRespStatement.isEmpty()) + return + + def rolesInContribution = asList(c.role).findAll { it.'@id' != Relator.UNSPECIFIED_CONTRIBUTOR.iri } + def roleShort = { it.split('/').last() } + def joinRoles = { roles -> roles.collect { r -> r.'@id' ? 
roleShort(r.'@id') : 'BLANK' }.join('|') } + + rolesInRespStatement.removeAll { r -> + r == Relator.EDITOR && rolesInContribution.findIndexOf { + it.'@id' == Relator.ADAPTER.iri + }.with { + if (it == -1) { + return false + } else { + rolesInContribution[it]['@id'] = Relator.EDITOR.iri + return true + } + } + } + + if (rolesInRespStatement.size() <= rolesInContribution.size()) + return + + rolesInRespStatement.each { r -> + def idLink = ['@id': r.iri] + if (!(idLink in rolesInContribution)) { + rolesInContribution << idLink + s.increment('fetch contribution from respStatement', "${roleShort(r.iri)} roles specified") + roleSpecified.println([id, joinRoles(asList(c.role)), joinRoles(rolesInContribution), matchedOnName.key, respStatement].join('\t')) + } + } + } + } + + def comparable = { + it*.getV1().findResults { Relator r -> + r != Relator.UNSPECIFIED_CONTRIBUTOR + ? ['@id': r.iri] + : null + } + } + + contributionsInRespStmt.each { name, roles -> + def roleShort = { it.getV1().iri.split('/').last() } + def concat = { it.collect { r -> roleShort(r) }.join('|') } + + def found = false + + for (String otherId : cluster) { + def doc = loadDoc(otherId) + if (!doc) + continue + def otherEncodingLevel = getPathSafe(doc.data, ['@graph', 0, 'encodingLevel']) + + def matched = getPathSafe(doc.data, ['@graph', 1, 'instanceOf', 'contribution'], []) + .find { Map c -> + asList(c.agent).any { a -> + nameMatch(name, loadIfLink(a)) + && comparable(roles).with { r -> !r.isEmpty() && asList(c.role).containsAll(r) } + && bestEncodingLevel.indexOf(encodingLevel) <= bestEncodingLevel.indexOf(otherEncodingLevel) + } + } + + if (matched) { + def isPseudonym = { + asList(it.agent).any { a -> + loadIfLink(a).description =~ /(?i)pseud/ + } + } + + if (isPseudonym(matched)) { + pseudonyms.println([id, concat(roles), name, otherId].join('\t')) + continue + } + + roles.each { s.increment('fetch contribution from respStatement', "${roleShort(it)} found in cluster") } + 
agentFoundInCluster.println([id, concat(roles), name, otherId, respStatement].join('\t')) + + found = true + break + } + } + + if (!found) + parsedButUnmatched.println([id, concat(roles), name, respStatement].join('\t')) + } + } +} + +def loadIfLink(Map agent) { + agent['@id'] ? loadThing(agent['@id']) : agent +} + +@Memoized +def loadThing(String id) { + def thing = [:] + selectByIds([id]) { t -> + thing = t.graph[1] + } + return thing +} + +@Memoized +def loadDoc(String id) { + def doc + selectByIds([id]) { d -> + doc = d.doc + } + return doc +} diff --git a/whelktool/scripts/analysis/responsibilityStatement.groovy b/whelktool/scripts/analysis/responsibilityStatement.groovy new file mode 100644 index 0000000000..b9f5aafdc5 --- /dev/null +++ b/whelktool/scripts/analysis/responsibilityStatement.groovy @@ -0,0 +1,12 @@ +selectByCollection('bib') { bib -> + def (record, thing) = bib.graph + if (thing.responsibilityStatement) { + int numContribution = asList(thing.instanceOf?.contribution).size() + String title = thing.hasTitle?.mainTitle ?: (thing.hasTitle ?: '') + println(String.format("%s\t%3s\t%s\t\t%s", bib.doc.shortId, numContribution, thing.responsibilityStatement, title)) + } +} + +List asList(Object o) { + (o ?: []).with { it instanceof List ? 
it : [it] } +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/subject-404.groovy b/whelktool/scripts/analysis/subject-404.groovy new file mode 100644 index 0000000000..e453172dd6 --- /dev/null +++ b/whelktool/scripts/analysis/subject-404.groovy @@ -0,0 +1,38 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +class Script { + static Statistics s = new Statistics().printOnShutdown() +} + +selectByCollection('bib') { bib -> + try { + process(bib) + } + catch(Exception e) { + System.err.println(e) + e.printStackTrace() + } + +} + +void process(bib) { + def (record, thing) = bib.graph + + Map work = thing['instanceOf'] + + if(!work) { + return + } + + if(work['subject']) { + for (Map subject in (work['subject'] as List)) { + if(subject['@type'] != 'ComplexSubject') { + if (subject['sameAs'] && subject['sameAs'][0] && subject['sameAs'][0]['@id'] && subject['sameAs'][0]['@id'].contains('id.kb.se')) { + Script.s.increment('sameAs', subject['sameAs'][0]['@id'], bib.doc.shortId) + } + } + } + + } +} \ No newline at end of file diff --git a/whelktool/scripts/examples/works.groovy b/whelktool/scripts/analysis/works.groovy similarity index 100% rename from whelktool/scripts/examples/works.groovy rename to whelktool/scripts/analysis/works.groovy diff --git a/whelktool/scripts/analysis/works3.groovy b/whelktool/scripts/analysis/works3.groovy deleted file mode 100644 index 41b37f8b77..0000000000 --- a/whelktool/scripts/analysis/works3.groovy +++ /dev/null @@ -1,122 +0,0 @@ -/** - * (When running, redirect STDERR to avoid annoying prints from whelktool) - */ - -import java.util.concurrent.ConcurrentHashMap - -clusterLog = getReportWriter("clusters.tsv") - -visited = Collections.newSetFromMap(new ConcurrentHashMap()) - -selectByCollection('bib') { bib -> - if (!visited.add(bib.doc.shortId)) - return - - try { - def q = buildQuery(bib) - if (!q) { - return - } - - List ids = queryIds(q).collect() - - if (ids.size() > 1) { - visited.addAll(ids) - 
clusterLog.println(ids.join('\t')) - } - } - catch (Exception e) { - println(e) - return - } -} - -exit() - -List>> buildQueries(bib) { - def title = title(bib) - - if (!title) - return null - - -} - -Map> buildQuery(bib) { - - - Map> query = [ - "q" : ["*"], - "@type" : ["*"], - "hasTitle.mainTitle" : [title + "~"], - ] - - def author = primaryContributorId(bib) - if (author) { - query["instanceOf.contribution.agent.@id"] = [author] - return query - } - - def contributors = contributorStrings(bib) - if (contributors) { - query["instanceOf.contribution._str"] = contributors.collect{ it + "~" } - return query - } - - return null -} - -synchronized void exit() { - System.exit(0) -} - -private String title(bib) { - return getPathSafe(bib.doc.data, ['@graph', 1, 'hasTitle', 0, 'mainTitle']) -} - -private String primaryContributorId(bib) { - def primary = getPathSafe(bib.doc.data, ['@graph', 1, 'instanceOf', 'contribution'], []).grep{ it['@type'] == "PrimaryContribution"} - return getPathSafe(primary, [0, 'agent', '@id']) -} - -private List contributorStrings(bib) { - return getPathSafe(bib.asCard(true), ['@graph', 1, 'instanceOf', 'contribution'], [])['_str'].grep{it} -} - -private String flatTitle(bib) { - return flatten( - bib.doc.data['@graph'][1]['hasTitle'], - ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', ] - ) -} - -private String flatten(Object o, List order) { - if (o instanceof String) { - return o - } - if (o instanceof List) { - return o - .collect{ flatten(it, order) } - .join(' || ') - } - if (o instanceof Map) { - return order - .collect{ o.get(it, null) } - .grep{ it != null } - .collect{ flatten(it, order) } - .join(' | ') - } - - throw new RuntimeException(String.format("unexpected type: %s for %s", o.class.getName(), o)) -} - -private Object getPathSafe(item, path, defaultTo = null) { - for (p in path) { - if (item[p] != null) { - item = item[p] - } else { - return defaultTo - } - } - return item -} diff --git 
a/whelktool/scripts/cleanups/2021/09/lxl-3303-instance-summary.groovy b/whelktool/scripts/cleanups/2021/09/lxl-3303-instance-summary.groovy new file mode 100644 index 0000000000..53c6d9e01b --- /dev/null +++ b/whelktool/scripts/cleanups/2021/09/lxl-3303-instance-summary.groovy @@ -0,0 +1,48 @@ +/** + * Move summary supplied by some providers from work to instance + * + * See LXL-3303 for more information + */ + +providers = [ + '[Barnbokskatalogen]', + '[Elib]', + '[Publit]', + 'Provided by publisher' +] + +def where = """ + collection = 'bib' + AND data#>>'{@graph, 1, instanceOf, @type}' = 'Text' + AND data#>>'{@graph, 1, instanceOf, summary}' IS NOT NULL + AND deleted = false + """ + +Set shape = ['@type', 'label'] as Set + +selectBySqlWhere(where) { bib -> + List summaries = bib.graph[1]['instanceOf']['summary'] + def (toInstance, toWork) = summaries.split { Map s -> + s.keySet() == shape + && providers.any { p -> asList(s.label).any { l -> l.contains(p) } } + } + + if (toInstance) { + if (toWork) { + bib.graph[1]['instanceOf']['summary'] = toWork + } else { + bib.graph[1]['instanceOf'].remove('summary') + } + + bib.graph[1]['summary'] = (bib.graph[1]['summary'] ?: []) + toInstance + bib.scheduleSave() + } +} + +private List asList(Object o) { + if (o == null) + return [] + if (o instanceof List) + return o + return [o] +} diff --git a/whelktool/scripts/cleanups/2021/09/lxl-3376-elib-cover-designer.groovy b/whelktool/scripts/cleanups/2021/09/lxl-3376-elib-cover-designer.groovy new file mode 100644 index 0000000000..cf7ab8d140 --- /dev/null +++ b/whelktool/scripts/cleanups/2021/09/lxl-3376-elib-cover-designer.groovy @@ -0,0 +1,76 @@ +PrintWriter unhandled = getReportWriter("unhandled.txt") + +def where = """ + collection = 'bib' + AND (data#>>'{@graph, 1, instanceOf, summary}' like '%ormgivare:%[Elib]%' OR data#>>'{@graph, 1, summary}' like '%ormgivare:%[Elib]%') + AND deleted = false + """ + +ROLES = [ + 'Formgivare:' : 'https://id.kb.se/relator/designer', + 
'Omslagsformgivare:' : 'https://id.kb.se/relator/coverDesigner' +] + +OTHER = [['@id': 'https://id.kb.se/relator/unspecifiedContributor']] + +selectBySqlWhere(where) { bib -> + def summary = asList(bib.graph[1]['instanceOf']['summary']) + asList(bib.graph[1]['summary']) + def nameToRoles = summary + .findResults { it['label']} + .join(' ') + .with { parseDesigners(it) } + + List workContribution = bib.graph[1]['instanceOf']['contribution'] + if (workContribution.removeAll { !it.agent }) { + bib.scheduleSave() + } + + def coverDesigners = workContribution.findAll { + def a = it.role && ROLES.values().containsAll(it.role) + def b = nameToRoles.containsKey(name(it.agent)) && (it.role == OTHER || !it.role) + a || b + } + + if (!coverDesigners) { + unhandled.println("${bib.doc.shortId} c:$workContribution d:$nameToRoles") + return + } + + workContribution.removeAll(coverDesigners) + + coverDesigners.each { it['role'] = nameToRoles[name(it.agent)].collect { ['@id' : it] } } + + bib.graph[1]['contribution'] = (bib.graph[1]['contribution'] ?: []) + coverDesigners + + bib.scheduleSave() +} + +private Map parseDesigners(String summary) { + def roleToNames = ROLES.collectEntries { s, id -> + def names = summary + .findAll(/$s[^\[,]+/) + .collect { it.substring(s.size()) } + .collect { it.trim() } + + [(id) : names] + } + + def nameToRoles = [:] + roleToNames.each { role, names -> + names.each { n -> nameToRoles[n] = nameToRoles.getOrDefault(n, []) + [role] } + } + + return nameToRoles +} + +private String name(Map agent) { + "${agent.givenName} ${agent.familyName}" +} + +private List asList(Object o) { + if (o == null) + return [] + if (o instanceof List) + return o + return [o] +} \ No newline at end of file diff --git a/whelktool/scripts/cleanups/2021/09/lxl-3376-language-in-work-title.groovy b/whelktool/scripts/cleanups/2021/09/lxl-3376-language-in-work-title.groovy new file mode 100644 index 0000000000..0a0b1e2c08 --- /dev/null +++ 
b/whelktool/scripts/cleanups/2021/09/lxl-3376-language-in-work-title.groovy @@ -0,0 +1,63 @@ +import groovy.transform.Memoized +import whelk.util.DocumentUtil + +PrintWriter report = getReportWriter("report.txt") + +def ids = new File(System.getProperty('ids')) + .readLines() + .collect { it.split('\t').collect { it.trim()} } + .flatten() + +selectByIds(ids) { bib -> + def langs = [ + [1, 'instanceOf', 'language', 0, '@id'], + [1, 'instanceOf', 'translationOf', 0, 'language', 0, '@id'] + ].collect { + langName(getPathSafe(bib.graph, it, '')).toLowerCase() + } + + boolean changed = DocumentUtil.traverse(bib.graph[1].instanceOf) { value, path -> + if (path && 'mainTitle' in path && value instanceof String) { + for (lang in langs) { + String r = value.replaceAll(/(?i)\s*\(\(?\s*${lang}\s*\)\)?\s*$/, '') + if (value != r) { + report.println("$value -> $r") + return new DocumentUtil.Replace(r) + } + } + } + return DocumentUtil.NOP + } + + if (changed) { + bib.scheduleSave() + } +} + +private Object getPathSafe(item, path, defaultTo = null) { + if (!item) { + return defaultTo + } + + for (p in path) { + if (item[p] != null) { + item = item[p] + } else { + return defaultTo + } + } + return item +} + +@Memoized +private String langName(def id) { + getPathSafe(loadThing(id), ['prefLabelByLang', 'sv'], "NOT FOUND") +} + +private Map loadThing(def id) { + def thing = [:] + selectByIds([id]) { t -> + thing = t.graph[1] + } + return thing +} \ No newline at end of file diff --git a/whelktool/scripts/cleanups/2023/05/lxl-4183-elib-cover-designer.groovy b/whelktool/scripts/cleanups/2023/05/lxl-4183-elib-cover-designer.groovy new file mode 100644 index 0000000000..f9006c7f33 --- /dev/null +++ b/whelktool/scripts/cleanups/2023/05/lxl-4183-elib-cover-designer.groovy @@ -0,0 +1,171 @@ +import groovy.transform.Memoized + +import java.util.concurrent.ConcurrentHashMap + +PrintWriter matchedAndSpecified = getReportWriter("matched.tsv") +PrintWriter unmatchedSpecifiedAnyway = 
getReportWriter("mismatched.tsv") +PrintWriter matchedInOtherWork = getReportWriter("matched-in-other-work.tsv") +PrintWriter notSpecifiedMovedToInstance = getReportWriter("not-specified-moved-to-instance.txt") + +def where = """ + collection = 'bib' + AND data#>>'{@graph, 0, identifiedBy}' LIKE '%Elib%' + AND (data#>>'{@graph, 1, instanceOf, summary}' is not null OR data#>>'{@graph, 1, summary}' is not null) + AND deleted = false + """ + +ROLES = [ + 'Formgivare:' : 'https://id.kb.se/relator/bookDesigner', + 'Omslag:' : 'https://id.kb.se/relator/coverDesigner', + 'Omslagsformgivare:': 'https://id.kb.se/relator/coverDesigner', +] + +OTHER = [['@id': 'https://id.kb.se/relator/unspecifiedContributor']] + +Map> knownNames = new ConcurrentHashMap(['https://id.kb.se/relator/bookDesigner' : new ConcurrentHashMap().newKeySet(), + 'https://id.kb.se/relator/coverDesigner': new ConcurrentHashMap().newKeySet()]) +Map> knownAgents = new ConcurrentHashMap(['https://id.kb.se/relator/bookDesigner' : new ConcurrentHashMap().newKeySet(), + 'https://id.kb.se/relator/coverDesigner': new ConcurrentHashMap().newKeySet()]) +Set handled = new ConcurrentHashMap().newKeySet() + +selectBySqlWhere(where) { bib -> + def id = bib.doc.shortId + def instance = bib.graph[1] + def summary = asList(instance['instanceOf']['summary']) + asList(bib.graph[1]['summary']) + + def nameToRoles = summary + .findResults { it['label'] } + .join(' ') + .with { parseRoles(it) } + .each { name, roles -> + knownNames.computeIfAbsent(name, f -> []).add(roles) + } + + List workContribution = instance['instanceOf']['contribution'] + if (!workContribution) { + return + } + + def modified = workContribution.removeAll { !it.agent } + + Set existingRoles = workContribution.collect { asList(it.role)*.'@id' }.grep().flatten() + + if (existingRoles.contains('https://id.kb.se/relator/unspecifiedContributor') && nameToRoles) { + workContribution.each { c -> + if (asList(c.role) == OTHER) { + def agentName = 
name(loadIfLink(c.agent)) + def roles = nameToRoles[agentName] + if (roles) { + c['role'] = roles.collect { ['@id': it] } + matchedAndSpecified.println([id, c.agent, roles].join('\t')) + nameToRoles.remove(agentName) + modified = true + } + } + } + + def other = workContribution.findAll { asList(it.role) == OTHER } + + if (nameToRoles.size() == 1 && other.size() == 1) { + def c = other[0] + def name = nameToRoles.keySet()[0] + def roles = nameToRoles[name] + other[0]['role'] = roles.collect { ['@id': it] } + other.clear() + unmatchedSpecifiedAnyway.println([id, c.agent, name, roles].join('\t')) + modified = true + } + + if (other.isEmpty()) { + handled.add(id) + } + } + + workContribution.each { c -> + def roles = asList(c.role)*.'@id' + if (knownAgents.keySet().intersect(roles)) { + knownAgents.computeIfAbsent(c.agent, f -> []).add(roles) + } + } + + if (modified) { + bib.scheduleSave() + } +} + +selectBySqlWhere("collection = 'bib' AND data#>>'{@graph, 0, identifiedBy}' LIKE '%Elib%' AND deleted = false") { bib -> + def id = bib.doc.shortId + if (id in handled) { + return + } + def instance = bib.graph[1] + List workContribution = instance['instanceOf']['contribution'] + if (!workContribution) { + return + } + + workContribution.removeAll { !it.agent } + + workContribution.each { c -> + if (asList(c.role) == OTHER) { + def roles = knownAgents[c.agent] ?: knownNames[name(loadIfLink(c.agent))] + if (roles) { + def countByRole = roles.countBy { it }.sort { -it.value } + if (countByRole.size() == 1) { + countByRole.find { it.value > 2 }?.with { + def role = it.key.find() + def count = it.value + c['role'] = [['@id': role]] + matchedInOtherWork.println([id, c.agent, role, count].join('\t')) + bib.scheduleSave() + } + } + } + } + } + + workContribution.removeAll { c -> + if (asList(c.role) == OTHER) { + instance['contribution'] = asList(instance['contribution']) + c + notSpecifiedMovedToInstance.println(id) + bib.scheduleSave() + return true + } + return false + } +} + 
+private Map parseRoles(String summary) { + def roleToNames = ROLES.collectEntries { s, id -> + def names = summary + .findAll(/$s[^\[,"]+/) + .collect { it.substring(s.size()) } + .collect { it.trim() } + + [(id): names] + } + + def nameToRoles = [:] + roleToNames.each { role, names -> + names.each { n -> nameToRoles[n] = nameToRoles.getOrDefault(n, []) + [role] } + } + + return nameToRoles +} + +private String name(Map agent) { + agent.name ?: "${agent.givenName} ${agent.familyName}" +} + +private Map loadIfLink(Map m) { + m['@id'] ? loadThing(m['@id']) : m +} + +@Memoized +private Map loadThing(def id) { + def thing = [:] + selectByIds([id]) { t -> + thing = t.graph[1] + } + return thing +} \ No newline at end of file diff --git a/whelktool/scripts/examples/contribution-role.groovy b/whelktool/scripts/examples/contribution-role.groovy new file mode 100644 index 0000000000..32a9015d01 --- /dev/null +++ b/whelktool/scripts/examples/contribution-role.groovy @@ -0,0 +1,43 @@ +import whelk.util.DocumentUtil +import datatool.util.Statistics + +Statistics s = new Statistics() +s.printOnShutdown() + +selectByCollection('bib') { bib -> + try { + DocumentUtil.findKey(bib.doc.data, 'role') { Object value, path -> + count(s, value) + } + } + catch(Exception e) { + println(e) + e.printStackTrace() + } +} + + +private String normalize(String s) { + def noise = [",", '"', "'", '[', ']', ',', '.', '.', ':', ';', '-', '(', ')', '-', '–', '+', '!', '?'].collectEntries { [it, ''] } + return s.toLowerCase().replace(noise).trim() +} + +void count(Statistics s, Object role) { + if (role instanceof Map && !role['@id']) { + count1(s, role, 'code') + count1(s, role, 'label') + } + else if (role instanceof String) { + s.increment('string', role.toString()) + } + else if (role instanceof List) { + s.increment('list size', role.size()) + role.each { count(s, it) } + } +} + +void count1(Statistics s, Map thing, String prop) { + if (thing[prop]) { + s.increment(prop, 
normalize(thing[prop].toString())) + } +} \ No newline at end of file diff --git a/whelktool/scripts/examples/works2.groovy b/whelktool/scripts/examples/works2.groovy deleted file mode 100644 index 8f31ad43f0..0000000000 --- a/whelktool/scripts/examples/works2.groovy +++ /dev/null @@ -1,116 +0,0 @@ -/** - * (When running, redirect STDERR to avoid annoying prints from whelktool) - */ - -import java.util.concurrent.ConcurrentHashMap - -clusterLog = getReportWriter("clusters.tsv") - -visited = Collections.newSetFromMap(new ConcurrentHashMap()) - -selectByCollection('bib') { bib -> - if (!visited.add(bib.doc.shortId)) - return - - try { - def q = buildQuery(bib) - if (!q) { - return - } - - List ids = queryIds(q).collect() - - if (ids.size() > 1) { - visited.addAll(ids) - clusterLog.println(ids.join('\t')) - } - } - catch (Exception e) { - println(e) - return - } -} - -exit() - -Map> buildQuery(bib) { - def title = title(bib) - - if (!title) - return null - - Map> query = [ - "q" : ["*"], - "@type" : ["*"], - "hasTitle.mainTitle" : [title + "~"], - ] - - def author = primaryContributorId(bib) - if (author) { - query["instanceOf.contribution.agent.@id"] = [author] - return query - } - - def contributors = contributorStrings(bib) - if (contributors) { - query["instanceOf.contribution._str"] = contributors.collect{ it + "~" } - return query - } - - return null -} - -synchronized void exit() { - System.exit(0) -} - -private String title(bib) { - return getPathSafe(bib.doc.data, ['@graph', 1, 'hasTitle', 0, 'mainTitle']) -} - -private String primaryContributorId(bib) { - def primary = getPathSafe(bib.doc.data, ['@graph', 2, 'contribution'], []).grep{ it['@type'] == "PrimaryContribution"} - return getPathSafe(primary, [0, 'agent', '@id']) -} - -private List contributorStrings(bib) { - return getPathSafe(bib.asCard(true), ['@graph',2,'contribution'], [])['_str'].grep{it} -} - -private String flatTitle(bib) { - return flatten( - bib.doc.data['@graph'][1]['hasTitle'], - 
['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', ] - ) -} - -private String flatten(Object o, List order) { - if (o instanceof String) { - return o - } - if (o instanceof List) { - return o - .collect{ flatten(it, order) } - .join(' || ') - } - if (o instanceof Map) { - return order - .collect{ o.get(it, null) } - .grep{ it != null } - .collect{ flatten(it, order) } - .join(' | ') - } - - throw new RuntimeException(String.format("unexpected type: %s for %s", o.class.getName(), o)) -} - -private Object getPathSafe(item, path, defaultTo = null) { - for (p in path) { - if (item[p] != null) { - item = item[p] - } else { - return defaultTo - } - } - return item -} diff --git a/whelktool/src/main/groovy/datatool/WorkTool.groovy b/whelktool/src/main/groovy/datatool/WorkTool.groovy new file mode 100644 index 0000000000..b734f879c9 --- /dev/null +++ b/whelktool/src/main/groovy/datatool/WorkTool.groovy @@ -0,0 +1,75 @@ +package datatool + +import datatool.scripts.mergeworks.Doc +import groovy.cli.commons.CliBuilder +import datatool.scripts.mergeworks.WorkToolJob + +/** + 1) find clusters + $ ENV=local && time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --dry-run scripts/analysis/find-work-clusters.groovy + + 2) merge overlapping clusters, output file is placed in same directory as input + + $ CLUSTERSDIR=reports/local-2021... 
+ $ ENV=local && time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -DclustersDir=$CLUSTERSDIR -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --dry-run scripts/analysis/merge-clusters.groovy + + 3) + ENV=local && time java -Xmx4G -Dxl.secret.properties=$HOME/secret.properties-$ENV -cp build/libs/whelktool.jar datatool.WorkTool --dry-run -s reports/1000-swedishFiction.tsv + + + */ + +class WorkTool { + static void main(String[] args) { + def cli = new CliBuilder(usage: 'WorkTool [options] ') + cli.h(longOpt: 'help', 'Print this help message and exit.') + cli.I(longOpt: 'skip-index', 'Do not index any changes, only write to storage.') + cli.d(longOpt: 'dry-run', 'Do not save any modifications.') + cli.a(longOpt: 'allow-loud', 'Do loud modifications.') + cli.nt(longOpt:'num-threads', args:1, argName:'N', "Override default number of threads.") + cli.v(longOpt: 'verbose', '.') + cli.r(longOpt: 'report', args: 1, argName: 'report dir', 'Save reports in this directory') + + cli.m(longOpt: 'merge', 'Merge and extract matching works') + cli.s(longOpt: 'show', 'Show. Generate HTML report with title clusters') + cli.s2(longOpt: 'showWorks', 'Show. Generate HTML report with works') + cli.sh(longOpt: 'showHubs', 'Show. Generate HTML report with title clusters containing different works') + cli.f(longOpt: 'swedishFiction', 'Filter: output clusters containing swedish fiction') + cli.tr(longOpt: 'anonymousTranslation', 'Filter: remove translations without translator') + cli.tc(longOpt: 'title-clusters', 'Filter: output title clusters') + + def options = cli.parse(args) + if (options.h) { + cli.usage() + System.exit 0 + } + + def clustersPath = options.arguments()[0] + def m = new WorkToolJob(new File(clustersPath)) + m.skipIndex = options.I + m.dryRun = options.d + m.loud = options.a + m.verbose = options.v + m.reportDir = options.r ? new File(options.r) : m.reportDir + m.numThreads = options.nt ? 
Integer.parseInt(options.nt) : -1 + + if (options.m) { + m.merge() + } else if (options.s) { + m.show() + } else if (options.s2) { + m.showWorks() + } else if (options.sh) { + m.showHubs() + } else if (options.f) { + m.swedishFiction() + } else if (options.tr) { + m.filterClusters({ Doc d -> !d.isAnonymousTranslation() }) + } else if (options.tc) { + m.outputTitleClusters() + } else { + cli.usage() + System.exit 1 + } + } +} diff --git a/whelktool/src/main/groovy/datatool/WorkTool.md b/whelktool/src/main/groovy/datatool/WorkTool.md new file mode 100644 index 0000000000..c977784e55 --- /dev/null +++ b/whelktool/src/main/groovy/datatool/WorkTool.md @@ -0,0 +1,35 @@ +* language Swedish +* Fiction +* issuanceType Monograph +* No hasPart +* encodingLevel is not marc:PartialPreliminaryLevel or marc:PrepublicationLevel + TODO: specify a minimal set of properties that must exist? + + +fiction +------- + + + +properties +---------- + +* **classification** Always take the sum of all works. + * SAB/kssb - Merge codes that are the same or prefixes. Take the longer code. Take the latest SAB version. Example: kssb/8 Hc + kssb/7 Hc.02 = kssb/8 Hc.02 + * Dewey - Merge equal codes with different editionEnumeration, use the newest. +* **contentType** Allow missing or `https://id.kb.se/term/rda/Text` +* **subject** Always take the sum of all works. + * TODO: preserve order? +* **hasTitle** Take from one random work. + * TODO: Take the most common one? Some other metric of "best"? +* **genreForm** Take from all works. Only keep the right one if both occur of the following: + * marc/NotFictionNotFurtherSpecified -> marc/FictionNotFurtherSpecified (i.e. 
actually fiction) + * marc/FictionNotFurtherSpecified -> marc/Novel + * marc/FictionNotFurtherSpecified -> marc/Poetry + * marc/NotFictionNotFurtherSpecified -> marc/Autobiography + * marc/NotFictionNotFurtherSpecified -> marc/Biography + +Instance properties +* **editionStatement** Added to comparison if it contains "förk" (förkortad = abbreviated). Then it must be the exact same string. +* **extent** Number of pages parsed from extent may not differ more than 30%. + * TODO: allow missing extent? \ No newline at end of file diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/DisplayDoc.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/DisplayDoc.groovy new file mode 100644 index 0000000000..b71ba3ad4c --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/DisplayDoc.groovy @@ -0,0 +1,158 @@ +package datatool.scripts.mergeworks + +import whelk.Document +import whelk.JsonLd + +class DisplayDoc { + Doc doc + Map framed + + DisplayDoc(Doc doc) { + this.doc = doc + } + + private static String displayTitle(Map thing) { + thing['hasTitle'].collect { it['@type'] + ": " + it['flatTitle'] }.join(', ') + } + + String instanceDisplayTitle() { + displayTitle(['hasTitle': Util.flatTitles(doc.instanceTitle())]) + } + + // TODO... + String getDisplayText(String field) { + if (field == 'contribution') { + return contributorStrings().join("
") + } else if (field == 'classification') { + return classificationStrings().join("
") + } else if (field == 'instance title') { + return doc.instanceTitle() ?: '' + } else if (field == 'instance type') { + return doc.instanceType() ?: '' + } else if (field == 'editionStatement') { + return doc.editionStatement() ?: '' + } else if (field == 'responsibilityStatement') { + return doc.responsibilityStatement() ?: '' + } else if (field == 'encodingLevel') { + return doc.encodingLevel() + } else if (field == 'publication') { + return chipString(doc.publication()) + } else if (field == 'identifiedBy') { + return chipString(doc.identifiedBy()) + } else if (field == 'extent') { + return chipString(doc.extent() ?: []) + } else if (field == 'reproductionOf') { + return reproductionOfLink() + } else { + return chipString(doc.workData.getOrDefault(field, [])) + } + } + + protected String chipString(def thing) { + Util.chipString(thing, doc.whelk) + } + + private String reproductionOfLink() { + def base = Document.getBASE_URI().toString() + def shortId = doc.reproductionOf() + ? doc.reproductionOf()[0]['@id'].substring(base.length()).replace('#it', '') + : '' + return "$shortId" + } + + String tooltip(String string, String tooltip) { + """${string}""" + } + + String link() { + String base = Document.getBASE_URI().toString() + String kat = "katalogisering/" + String id = doc.document.shortId + return base + kat + id + } + + private List contributorStrings() { + List path = doc.instanceData ? ['instanceOf', 'contribution'] : ['contribution'] + List contribution = Util.getPathSafe(getFramed(), path, []) + + return contribution.collect { Map c -> + contributionStr(c) + } + } + + private String contributionStr(Map contribution) { + StringBuilder s = new StringBuilder() + + if (contribution['@type'] == 'PrimaryContribution') { + s.append('') + } + + s.append(flatMaybeLinked(contribution['role'], ['code', 'label']).with { it.isEmpty() ? 
it : it + ': ' }) + s.append(flatMaybeLinked(contribution['agent'], ['givenName', 'familyName', 'lifeSpan', 'name'])) + + if (contribution['@type'] == 'PrimaryContribution') { + s.append('') + } + + return s.toString() + } + + List classificationStrings() { + List path = doc.instanceData ? ['instanceOf', 'classification'] : ['classification'] + List classification = Util.getPathSafe(getFramed(), path, []) + + classification.collect { c -> + StringBuilder s = new StringBuilder() + s.append(flatMaybeLinked(c['inScheme'], ['code', 'version']).with { it.isEmpty() ? it : it + ': ' }) + s.append(flatMaybeLinked(c, ['code'])) + return s.toString() + } + } + + private static String flatMaybeLinked(Object thing, List order) { + if (!thing) + return '' + + if (thing instanceof List) { + return thing.collect { flatMaybeLinked(it, order) }.join(' | ') + } + String s = flatten(thing, order, ', ') + + thing['@id'] + ? """$s""" + : s + } + + static String flatten(Object o, List order, String mapSeparator = ': ') { + if (o instanceof String) { + return o + } + if (o instanceof List) { + return o + .collect { flatten(it, order) } + .join(' || ') + } + if (o instanceof Map) { + return order + .findResults { ((Map) o).get(it) } + .collect { flatten(it, order) } + .join(mapSeparator) + } + + throw new RuntimeException(String.format("unexpected type: %s for %s", o.class.getName(), o)) + } + + Map getFramed() { + if (!framed) { + if (doc.existsInStorage) { + framed = JsonLd.frame(doc.thingIri(), doc.whelk.loadEmbellished(doc.shortId()).data) + } else { + Document copy = doc.document.clone() + doc.whelk.embellish(copy) + framed = JsonLd.frame(doc.thingIri(), copy.data) + } + } + + return framed + } +} diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/Doc.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Doc.groovy new file mode 100644 index 0000000000..8a1373e4e7 --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Doc.groovy @@ -0,0 
+1,305 @@
package datatool.scripts.mergeworks


import whelk.Document
import whelk.Whelk

import static whelk.JsonLd.ID_KEY
import static datatool.scripts.mergeworks.Util.asList
import datatool.scripts.mergeworks.Util.Relator

/**
 * Wrapper around a whelk {@link Document} used by the merge-works tooling.
 *
 * The wrapped document's main entity ({@code @graph[1]}) is either an instance
 * that carries its work under {@code instanceOf}, or a work on its own. In the
 * first case both {@link #instanceData} and {@link #workData} are set; in the
 * second only {@link #workData} is set and {@link #instanceData} stays null.
 * All instance accessors are therefore null-safe.
 */
class Doc {
    public static final String SAOGF_SKÖN = 'https://id.kb.se/term/saogf/Sk%C3%B6nlitteratur'
    public static final List MARC_FICTION = [
            'https://id.kb.se/marc/FictionNotFurtherSpecified',
            'https://id.kb.se/marc/Drama',
            'https://id.kb.se/marc/Essay',
            'https://id.kb.se/marc/Novel',
            'https://id.kb.se/marc/HumorSatiresEtc',
            'https://id.kb.se/marc/Letter',
            'https://id.kb.se/marc/ShortStory',
            'https://id.kb.se/marc/MixedForms',
            'https://id.kb.se/marc/Poetry',
    ]
    public static final List MARC_NOT_FICTION = [
            'https://id.kb.se/marc/NotFictionNotFurtherSpecified',
            'https://id.kb.se/marc/Biography'
    ]
    public static final List DRAMA_GF = [
            'https://id.kb.se/term/saogf/Dramatik',
            'https://id.kb.se/marc/Drama'
    ]


    Whelk whelk
    Document document

    // Set by the merge job on a work Doc: the instance Docs that should be linked to it.
    Collection unlinkedInstances

    // Null when the main entity is a work rather than an instance.
    Map instanceData
    Map workData

    // Lazily computed cache for flatInstanceTitle().
    List flatInstanceTitle

    // Lazily created view, see getView().
    DisplayDoc display

    // Checksum taken at construction time, before any modification by this tool.
    String preUpdateChecksum

    boolean existsInStorage = true
    boolean modified = false

    Doc(Whelk whelk, Document document) {
        this.whelk = whelk
        this.document = document
        this.preUpdateChecksum = document.getChecksum(whelk.getJsonld())
        setData()
    }

    /**
     * Points {@link #instanceData} / {@link #workData} at the right parts of the
     * main entity, depending on whether it has an {@code instanceOf}.
     */
    void setData() {
        Map entity = mainEntity()
        if (entity['instanceOf']) {
            instanceData = entity
            workData = instanceData['instanceOf']
        } else {
            workData = entity
        }
    }

    /** Returns the (lazily created) display helper for this document. */
    DisplayDoc getView() {
        if (!display) {
            display = new DisplayDoc(this)
        }

        return display
    }

    /** First element of {@code @graph}. */
    Map record() {
        document.data['@graph'][0]
    }

    /** Second element of {@code @graph}. */
    Map mainEntity() {
        document.data['@graph'][1]
    }

    String shortId() {
        document.shortId
    }

    String thingIri() {
        document.getThingIdentifiers().first()
    }

    /** The record's {@code encodingLevel}, or the empty string when absent. */
    String encodingLevel() {
        return record()['encodingLevel'] ?: ''
    }

    /** The {@code @id} of the work data; null when the work has no id. */
    String workIri() {
        workData['@id']
    }

    List workTitle() {
        asList(workData['hasTitle'])
    }

    List instanceTitle() {
        asList(instanceData?.hasTitle)
    }

    /** Flattened instance titles (via Util.getFlatTitle), computed once and cached. */
    List flatInstanceTitle() {
        if (!flatInstanceTitle) {
            flatInstanceTitle = Util.getFlatTitle(instanceTitle())
        }

        return flatInstanceTitle
    }

    String workType() {
        workData['@type']
    }

    String instanceType() {
        instanceData?.'@type'
    }

    List translationOf() {
        asList(workData['translationOf'])
    }

    List contribution() {
        asList(workData['contribution'])
    }

    List classification() {
        asList(workData['classification'])
    }

    List genreForm() {
        asList(workData['genreForm'])
    }

    List publication() {
        asList(instanceData?.publication)
    }

    List identifiedBy() {
        asList(instanceData?.identifiedBy)
    }

    List extent() {
        asList(instanceData?.extent)
    }

    List reproductionOf() {
        asList(instanceData?.reproductionOf)
    }

    String editionStatement() {
        instanceData?.editionStatement
    }

    String responsibilityStatement() {
        instanceData?.responsibilityStatement
    }

    /**
     * Page count parsed from the label of the first extent.
     * The label may be a list (first element is used) or a single value.
     *
     * @return the parsed page count, or -1 when none can be found
     */
    int numPages() {
        String extent = Util.getPathSafe(extent(), [0, 'label', 0]) ?: Util.getPathSafe(extent(), [0, 'label'], '')
        return numPages(extent)
    }

    // TODO: improve parsing https://metadatabyran.kb.se/beskrivning/materialtyper-arbetsfloden/tryckta-monografier/omfang-for-tryckta-monografier
    /**
     * Extracts a page count from an extent label.
     *
     * The word 'onumrerade' is removed first. Every run of digits that is
     * followed (after any mix of commas, spaces, square brackets and digits)
     * by an 's' or a 'p' is a candidate; the largest candidate wins.
     *
     * @param extentLabel the label text; null is treated as "no information"
     * @return the largest candidate, or -1 when there is none
     */
    static int numPages(String extentLabel) {
        if (extentLabel == null) {
            return -1
        }

        def label = extentLabel.replace('onumrerade', '')
        def matcher = label =~ /(\d+)(?=[, \[\]0-9]*[sp])/
        List pages = []
        while (matcher.find()) {
            try {
                pages << Integer.parseInt(matcher.group(1))
            } catch (NumberFormatException ignored) {
                // A digit run too long for an int cannot be a page count; skip it
                // instead of aborting the whole cluster.
            }
        }
        pages ? pages.max() : -1
    }

    boolean hasGenericTitle() {
        Util.hasGenericTitle(instanceTitle())
    }

    boolean isMonograph() {
        instanceData?.issuanceType == 'Monograph'
    }

    boolean isManuscript() {
        instanceType() == 'Manuscript' || [['@id': 'https://id.kb.se/term/saogf/Manuskript'], ['@id': 'https://id.kb.se/term/saogf/Handskrifter']].intersect(genreForm())
    }

    boolean isInSb17Bibliography() {
        asList(record()['bibliography']).contains(['@id': 'https://libris.kb.se/library/SB17'])
    }

    /**
     * True when any of the following holds: the document has parts, a
     * classification string contains both 'kssb' and '(s)', no contribution is
     * a PrimaryContribution, or a relationship entity carries a contribution.
     */
    boolean isMaybeAggregate() {
        hasPart() ||
                getView().classificationStrings().any { it.contains('kssb') && it.contains('(s)') } ||
                !contribution().any { it['@type'] == 'PrimaryContribution' } ||
                hasRelationshipWithContribution()
    }

    /**
     * True when the work has {@code hasPart}, or when an instance title of type
     * 'Title' has more than one part, or a part with more than one
     * partName/partNumber.
     */
    boolean hasPart() {
        if (workData['hasPart']) {
            return true
        }

        // instanceTitle() is null-safe and always a list, unlike a raw
        // instanceData['hasTitle'] lookup which fails for work-only documents.
        instanceTitle().findAll { it['@type'] == 'Title' }.any {
            it.hasPart?.size() > 1 || it.hasPart?.any { p -> asList(p.partName).size() > 1 || asList(p.partNumber).size() > 1 }
        }
    }

    boolean hasRelationshipWithContribution() {
        asList(workData['relationship']).any { r ->
            asList(r['entity']).any { e ->
                e.containsKey('contribution')
            }
        }
    }

    boolean isFiction() {
        isMarcFiction() || isSaogfFiction() || isSabFiction()
    }

    boolean isMarcFiction() {
        genreForm().any { it['@id'] in MARC_FICTION }
    }

    boolean isMarcNotFiction() {
        genreForm().any { it['@id'] in MARC_NOT_FICTION }
    }

    boolean isSaogfFiction() {
        genreForm().any { whelk.relations.isImpliedBy(SAOGF_SKÖN, it['@id'] ?: '') }
    }

    /** True when a classification whose scheme mentions 'kssb' has a code starting with H, uH, ufH or ugH. */
    boolean isSabFiction() {
        classification().any { it.inScheme.toString() =~ /kssb/ && it.code =~ /^(H|uH|ufH|ugH)/ }
    }

    boolean isNotFiction() {
        // A lot of fiction has marc/NotFictionNotFurtherSpecified but then classification is usually empty
        isMarcNotFiction() && (!getView().classificationStrings().isEmpty() && !isSabFiction())
    }

    boolean isText() {
        workData['@type'] == 'Text'
    }

    /** A translation with no contribution in a translator, editor or adapter role. */
    boolean isAnonymousTranslation() {
        translationOf() && !hasAnyRole([Relator.TRANSLATOR, Relator.EDITOR, Relator.ADAPTER])
    }

    /**
     * @param relators the relators to look for
     * @return true when any contribution has a role linking to one of them
     */
    boolean hasAnyRole(List relators) {
        // Loop-invariant: build the link objects once, not once per contribution.
        def wantedRoles = relators.collect { [(ID_KEY): it.iri] }
        contribution().any { c ->
            asList(c['role']).intersect(wantedRoles)
        }
    }

    boolean isDrama() {
        isSabDrama() || isGfDrama()
    }

    boolean isSabDrama() {
        getView().classificationStrings().any { it.contains(': Hc.02') || it.contains(': Hce.02') }
    }

    boolean isGfDrama() {
        genreForm().any { it['@id'] in DRAMA_GF }
    }

    boolean isTactile() {
        asList(workData['contentType']).contains(['@id': 'https://id.kb.se/term/rda/TactileText']) ||
                asList(instanceData?.carrierType).any { it['@id'] in ['https://id.kb.se/marc/Braille', 'https://id.kb.se/marc/TacMaterialType-b'] }
    }

    boolean isThesis() {
        genreForm().any { it == ['@id': 'https://id.kb.se/marc/Thesis'] }
    }

    /** True when the lower-cased edition statement contains "förk" (presumably abridged editions; confirm with cataloguers). */
    boolean hasDistinguishingEdition() {
        (instanceData?.editionStatement ?: '').toString().toLowerCase().contains("förk")
    }

    /**
     * Copies instance-level values into the work data under underscore-prefixed
     * keys so that they take part in work comparison.
     * Undo with {@link #removeComparisonProps()}.
     */
    void addComparisonProps() {
        if (hasDistinguishingEdition()) {
            workData['_editionStatement'] = instanceData['editionStatement']
        }
        workData['_numPages'] = numPages()
    }

    void removeComparisonProps() {
        workData.remove('_editionStatement')
        workData.remove('_numPages')
    }

    /** Replaces the content of the work data map in place and marks the document as modified. */
    void replaceWorkData(Map replacement) {
        workData.clear()
        workData.putAll(replacement)
        modified = true
    }

    /**
     * Adds links to the given works (except this document's own work) to
     * {@code closeMatch}, without duplicates. Marks the document as modified
     * when the resulting list is non-empty.
     */
    void addCloseMatch(List workIds) {
        def closeMatch = (asList(workData['closeMatch']) + (workIds - workIri()).collect { ['@id': it] }).unique()
        if (closeMatch) {
            workData['closeMatch'] = closeMatch
            modified = true
        }
    }
}
\ No newline at end of file diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/FieldStatus.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/FieldStatus.groovy new file mode 100644 index 0000000000..8f41f4f393 --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/FieldStatus.groovy @@ -0,0 +1,7 @@ +package datatool.scripts.mergeworks + +enum
FieldStatus { + EQUAL, + COMPATIBLE, + DIFF +} \ No newline at end of file diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/Html.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Html.groovy new file mode 100644 index 0000000000..e967151d8b --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Html.groovy @@ -0,0 +1,111 @@ +package datatool.scripts.mergeworks + +import org.apache.commons.codec.digest.DigestUtils + +import static datatool.scripts.mergeworks.FieldStatus.COMPATIBLE +import static datatool.scripts.mergeworks.FieldStatus.DIFF +import static datatool.scripts.mergeworks.FieldStatus.EQUAL + +class Html { + private static String CSS = Html.class.getClassLoader() + .getResourceAsStream('merge-works/table.css').getText("UTF-8") + + static final String START = """ + + + """ + static final String END = '' + static final String HORIZONTAL_RULE = "

\n" + + static def infoFields = ['reproductionOf', 'instance title', 'instance type', 'editionStatement', 'responsibilityStatement', 'encodingLevel', 'publication', 'identifiedBy', 'extent', 'physicalDetailsNote'] + + static String clusterTable(Collection cluster) { + String id = clusterId(cluster.collect { it.shortId() }) + String header = """ + + ${id} + ${cluster.collect { doc -> "${doc.shortId()}" }.join('\n')} + + + + ${cluster.collect { doc -> "${doc.view.instanceDisplayTitle()}" }.join('\n')} + + """.stripIndent() + + def statuses = WorkComparator.compare(cluster) + + String info = infoFields.collect(fieldRows(cluster, "info")).join('\n') + String equal = statuses.get(EQUAL, []).collect(fieldRows(cluster, cluster.size() > 1 ? EQUAL.toString() : "")).join('\n') + String compatible = statuses.get(COMPATIBLE, []).collect(fieldRows(cluster, COMPATIBLE.toString())).join('\n') + String diff = statuses.get(DIFF, []).collect(fieldRows(cluster, DIFF.toString())).join('\n') + + return """ + + ${header} + ${equal} + ${compatible} + ${diff} + ${info} +
+

+ """ + } + + static String hubTable(Collection works) { + def instanceDocs = works.collect { work -> work.unlinkedInstances ?: work } + def clusterId = clusterId(instanceDocs.flatten().collect { Doc d -> d.shortId() }) + + String header = """ + + ${clusterId} + ${works.collect { it.workIri() + ? "${it.shortId()}" + : "" } + .join('\n')} + + """.stripIndent() + + def link = { Doc d -> "${d.shortId()}" } + + String instances = + """ + + _instances + ${instanceDocs.collect { "${it.collect(link).join('
')}" }.join('\n')} + + """.stripIndent() + + def statuses = WorkComparator.compare(works) + + String equal = statuses.get(EQUAL, []).collect(fieldRows(works, works.size() > 1 ? EQUAL.toString() : "")).join('\n') + String compatible = statuses.get(COMPATIBLE, []).collect(fieldRows(works, COMPATIBLE.toString())).join('\n') + String diff = statuses.get(DIFF, []).collect(fieldRows(works, DIFF.toString())).join('\n') + + return """ + + ${header} + ${equal} + ${compatible} + ${diff} + ${instances} +
+

+ """ + } + + static String clusterId(Collection cluster) { + cluster + ? DigestUtils.md5Hex(cluster.sort().first()).toUpperCase().substring(0, 12) + : "" + } + + private static def fieldRows(Collection cluster, String cls) { + { field -> + """ + + ${field} + ${cluster.collect { "${it.view.getDisplayText(field)}" }.join('\n')} + """.stripIndent() + } + } +} diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/Util.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Util.groovy new file mode 100644 index 0000000000..a065b0e1e9 --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Util.groovy @@ -0,0 +1,341 @@ +package datatool.scripts.mergeworks + +import org.apache.commons.lang3.StringUtils +import whelk.Document +import whelk.IdGenerator +import whelk.Whelk +import whelk.util.DocumentUtil +import whelk.util.Unicode + +class Util { + static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', 'marc:parallelTitle', 'marc:equalTitle'] + + static def titleVariant = ['Title', 'ParallelTitle'] + // removed 'VariantTitle', 'CoverTitle' since they sometimes contain random generic stuff like "Alibis filmroman", "Kompisböcker för de yngsta" + + static enum Relator { + TRANSLATOR('https://id.kb.se/relator/translator'), + AUTHOR('https://id.kb.se/relator/author'), + ILLUSTRATOR('https://id.kb.se/relator/illustrator'), + AUTHOR_OF_INTRO('https://id.kb.se/relator/authorOfIntroduction'), + ADAPTER('https://id.kb.se/relator/adapter'), + COVER_DESIGNER('https://id.kb.se/relator/coverDesigner'), + COMPILER('https://id.kb.se/relator/compiler'), + AUTHOR_OF_AFTERWORD('https://id.kb.se/relator/authorOfAfterwordColophonEtc'), + PHOTOGRAPHER('https://id.kb.se/relator/photographer'), + EDITOR('https://id.kb.se/relator/editor'), + UNSPECIFIED_CONTRIBUTOR('https://id.kb.se/relator/unspecifiedContributor'), + PRIMARY_RIGHTS_HOLDER('https://id.kb.se/relator/primaryRightsHolder'), + 
ABRIDGER('https://id.kb.se/relator/abridger'), + IMPLICIT_AUTHOR('https://id.kb.se/relator/author') + + String iri + + private Relator(String iri) { + this.iri = iri + } + } + + private static Set IGNORED_SUBTITLES = WorkToolJob.class.getClassLoader() + .getResourceAsStream('merge-works/ignored-subtitles.txt') + .readLines().grep().collect(Util.&normalize) as Set + + private static Set GENERIC_TITLES = WorkToolJob.class.getClassLoader() + .getResourceAsStream('merge-works/generic-titles.txt') + .readLines().grep().collect(Util.&normalize) as Set + + static def noise = + [",", '"', "'", "ʹ", "ʼ", '[', ']', ',', '.', '.', ':', ';', '-', '(', ')', ' the ', '-', '–', '+', '!', '?',].collectEntries { [it, ' '] } + + + static List asList(Object o) { + (o ?: []).with { it instanceof List ? it : [it] } + } + + /** + * Partition a collection based on equality condition + * + * NOTE: O(n^2)... + */ + static Collection> partition(Collection collection, Closure matcher) { + List> result = [] + + for (T t : collection) { + boolean match = false + for (List group : result) { + if (groupMatches(t, group, matcher)) { + group.add(t) + match = true + break + } + } + + if (!match) { + result.add([t]) + } + } + return result + } + + static boolean groupMatches(T t, List group, Closure matcher) { + group.every { other -> matcher(other, t) } + } + + static boolean hasGenericTitle(List hasTitle) { + hasTitle.any { it['mainTitle'] && normalize((String) it['mainTitle']) in GENERIC_TITLES } + } + + static List dropGenericSubTitles(List hasTitle) { + hasTitle.collect { + def copy = new TreeMap(it) + if (copy['subtitle'] || copy['titleRemainder']) { + DocumentUtil.traverse(copy) { value, path -> + if (('subtitle' in path || 'titleRemainder' in path) && value instanceof String) { + if (genericSubtitle(value)) { + new DocumentUtil.Remove() + } else { + ((List) value.split(':')).with { + if (it.size() > 1 && genericSubtitle(it.last().trim())) { + new 
DocumentUtil.Replace(value.replaceFirst(~/\s*:.+$/, '')) + } + } + } + } + } + } + copy + } + } + + static List flatTitles(List hasTitle) { + dropGenericSubTitles(hasTitle).collect { + def title = new TreeMap<>() + title['flatTitle'] = normalize(DisplayDoc.flatten(it, titleComponents)) + if (it['@type']) { + title['@type'] = it['@type'] + } + + title + } + } + + private static boolean genericSubtitle(String s) { + s = Util.normalize(s) + if (s.startsWith("en ")) { + s = s.substring("en ".length()) + } + return s in IGNORED_SUBTITLES + } + + static String normalize(String s) { + return Unicode.asciiFold(Unicode.normalizeForSearch(StringUtils.normalizeSpace(" $s ".toLowerCase().replace(noise)))) + } + + static Object getPathSafe(item, path, defaultTo = null) { + for (p in path) { + if ((item instanceof Collection || item instanceof Map) && item[p] != null) { + item = item[p] + } else { + return defaultTo + } + } + return item + } + + static List getFlatTitle(List hasTitle) { + flatTitles(hasTitle) + .grep(isTitle) + .collect { it['flatTitle'] } + } + + static String chipString(def thing, Whelk whelk) { + if (thing instanceof Integer) { + return thing + } + + def chips = whelk.jsonld.toChip(thing) + if (chips.size() < 2) { + chips = thing + } + if (chips instanceof List) { + return chips.collect { valuesString(it) }.sort().join('
') + } + return valuesString(chips) + } + + private static String valuesString(def thing) { + if (thing instanceof List) { + return thing.collect { valuesString(it) }.join(' • ') + } + if (thing instanceof Map) { + return thing.findAll { k, v -> k != '@type' }.values().collect { valuesString(it) }.join(' • ') + } + return thing.toString() + } + + // (docs on some of these levels are normally filtered out before we reach here) + static List bestEncodingLevel = [ + 'marc:FullLevel', + 'marc:FullLevelMaterialNotExamined', + 'marc:MinimalLevel', + 'marc:LessThanFullLevelMaterialNotExamined', + 'marc:CoreLevel', + 'marc:AbbreviatedLevel', + 'marc:PartialPreliminaryLevel', + 'marc:PrepublicationLevel', + null + ] + + static def toWorkTitleForm = { Map title -> + // partName/partNumber is usually in hasPart but not always + def partName = title['partName'] + def partNumber = title['partNumber'] + + def hasPart = title['hasPart'] + if (hasPart) { + partName = hasPart[0]['partName'] + partNumber = hasPart[0]['partNumber'] + } + + partName = asList(partName)[0] + partNumber = asList(partNumber)[0] + + if (partNumber && partName) { + title['mainTitle'] += ". $partNumber, $partName" + } else if (partNumber) { + title['mainTitle'] += ". $partNumber" + } else if (partName) { + title['mainTitle'] += ". $partName" + } + + return title.subMap(['@type', 'mainTitle', 'source']) + } + + // Return the most common title for the best encodingLevel + static def bestTitle(Collection docs) { + // TODO: which title to pick when matched with already existing linked work? + def linkedWorkTitle = docs.findResult { it.workIri() ? 
it.workData['hasTitle'] : null } + if (linkedWorkTitle) { + return linkedWorkTitle + } + + for (def level : bestEncodingLevel) { + def onLevel = docs.findAll { it.encodingLevel() == level } + def bestWorkTitle = mostCommonWorkTitle(onLevel) + if (bestWorkTitle) { + return bestWorkTitle + } + } + + for (def level : bestEncodingLevel) { + def onLevel = docs.findAll { it.encodingLevel() == level } + def bestInstanceTitle = mostCommonInstanceTitle(onLevel) + if (bestInstanceTitle) { + return bestInstanceTitle.collect(toWorkTitleForm) + } + } + + return null + } + + static def bestOriginalTitle(Collection docs) { + for (def level : bestEncodingLevel) { + def onLevel = docs.findAll { it.encodingLevel() == level } + def bestOrigTitle = mostCommonOriginalTitle(onLevel) + if (bestOrigTitle) { + return bestOrigTitle + } + } + + return null + } + + static def mostCommonOriginalTitle(Collection docs) { + return mostCommonWorkTitle(docs) { Doc d -> + d.translationOf().findResult { it['hasTitle'] }?.findAll(isTitle) + } + } + + static def mostCommonWorkTitle(Collection docs, Closure getTitle = { it.workTitle().findAll(isTitle) }) { + def workTitles = docs.collect(getTitle) + .grep() + .collect { dropGenericSubTitles(it) } + + if (workTitles) { + return mostCommon(workTitles) + } + + return null + } + + static def mostCommonInstanceTitle(Collection docs) { + def addSource = { t, d -> + return t.collect { it.plus(['source': [d.instanceData.subMap('@id')]]) } + } + + def instanceTitles = docs.collect { it.instanceTitle().findAll(isTitle) } + .collect { dropGenericSubTitles(it) } + + if (instanceTitles.grep()) { + def instanceTitleToDoc = [instanceTitles, docs].transpose().collectEntries() + def best = mostCommon(instanceTitles.grep()) + return addSource(best, instanceTitleToDoc[best]) + } + + return null + } + + static def mostCommon(titles) { + return partition(titles, { a, b -> a == b }) + .sort { it.size() } + .reverse() + .first() + .first() + } + + static def isTitle = { 
it.'@type' == 'Title' } + + static boolean nameMatch(Object local, Map agent) { + def variants = [agent] + asList(agent.hasVariant) + + def localName = local instanceof Map ? name(local) : normalize(local) + + localName && variants.any { + name(it) && localName == name(it) + } + } + + static String name(Map agent) { + (agent.givenName && agent.familyName) + ? normalize("${agent.givenName} ${agent.familyName}") + : agent.name ? normalize("${agent.name}") : null + } + + static Document buildWorkDocument(Map workData, File reportDir) { + String workId = IdGenerator.generate() + def reportUri = "http://xlbuild.libris.kb.se/works/${reportDir.getPath()}/new/${workId}.html" + + workData['@id'] = "TEMPID#it" + Document d = new Document([ + "@graph": [ + [ + "@id" : "TEMPID", + "@type" : "Record", + "mainEntity" : ["@id": "TEMPID#it"], + "technicalNote": [[ + "@type" : "TechnicalNote", + "hasNote": [[ + "@type": "Note", + "label": ["Maskinellt utbrutet verk... TODO"] + ]], + "uri" : [reportUri] + ] + ]], + workData + ] + ]) + + d.deepReplaceId(Document.BASE_URI.toString() + workId) + return d + } +} \ No newline at end of file diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkComparator.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkComparator.groovy new file mode 100644 index 0000000000..64aacb9c21 --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkComparator.groovy @@ -0,0 +1,137 @@ +package datatool.scripts.mergeworks + +import datatool.scripts.mergeworks.compare.Classification +import datatool.scripts.mergeworks.compare.Id +import datatool.scripts.mergeworks.compare.SameOrEmpty +import datatool.scripts.mergeworks.compare.Default +import datatool.scripts.mergeworks.compare.Extent +import datatool.scripts.mergeworks.compare.FieldHandler +import datatool.scripts.mergeworks.compare.GenreForm +import datatool.scripts.mergeworks.compare.StuffSet +import datatool.scripts.mergeworks.compare.Subject +import 
datatool.scripts.mergeworks.compare.TranslationOf +import datatool.scripts.mergeworks.compare.ValuePicker +import datatool.scripts.mergeworks.compare.WorkTitle +import datatool.util.DocumentComparator + +import static datatool.scripts.mergeworks.Util.bestTitle + +class WorkComparator { + Set fields + DocumentComparator c = new DocumentComparator() + + Map comparators = [ + 'classification' : new Classification(), + 'contentType' : new SameOrEmpty('https://id.kb.se/term/rda/Text'), + 'genreForm' : new GenreForm(), + 'hasTitle' : new WorkTitle(), + 'intendedAudience': new SameOrEmpty('https://id.kb.se/marc/Juvenile'), + '_numPages' : new Extent(), + 'subject' : new Subject(), + 'summary' : new StuffSet(), + 'translationOf' : new TranslationOf(), + '@id' : new Id() + ] + + static Set ignore = ['closeMatch'] + + static FieldHandler DEFAULT = new Default() + + WorkComparator(Set fields) { + this.fields = new HashSet<>(fields) + } + + boolean sameWork(Doc a, Doc b) { + fields.every { compare(a, b, it).with { it == EQUAL || it == COMPATIBLE } } + } + + FieldStatus compare(Doc a, Doc b, String field) { + Object oa = a.workData.get(field) + Object ob = b.workData.get(field) + + if (oa == null && ob == null) { + return FieldStatus.EQUAL + } + + compareExact(oa, ob, field) == FieldStatus.EQUAL + ? FieldStatus.EQUAL + : compareDiff(a, b, field) + } + + Map merge(Collection docs) { + Map result = [:] + + fields.each { field -> + FieldHandler h = comparators.getOrDefault(field, DEFAULT) + def value = h instanceof ValuePicker + ? h.pick(docs) + : mergeField(field, h, docs) + + if (value) { + result[field] = value + } + } + + if (!result['hasTitle']) { + def bestTitle = bestTitle(docs) + if (bestTitle) { + result['hasTitle'] = bestTitle + } + } + + return result + } + + // TODO: preserve order? e.g. 
subject + private Object mergeField(String field, FieldHandler h, Collection docs) { + Object value = docs.first().workData.get(field) + def rest = docs.drop(1) + rest.each { + value = h.merge(value, it.workData.get(field)) + } + return value + } + + private FieldStatus compareDiff(Doc a, Doc b, String field) { + comparators.getOrDefault(field, DEFAULT).isCompatible(a.workData.get(field), b.workData.get(field)) + ? FieldStatus.COMPATIBLE + : FieldStatus.DIFF + } + + private FieldStatus compareExact(Object oa, Object ob, String field) { + c.isEqual([(field): oa], [(field): ob]) ? FieldStatus.EQUAL : FieldStatus.DIFF + } + + static Map> compare(Collection cluster) { + WorkComparator c = new WorkComparator(allFields(cluster)) + + Map> result = [:] + c.fieldStatuses(cluster).each { f, s -> result.get(s, []) << f } + return result + } + + static Set allFields(Collection cluster) { + Set fields = new HashSet<>() + cluster.each { fields.addAll(it.workData.keySet()) } + return fields - ignore + } + + Map fieldStatuses(Collection cluster) { + fields.collectEntries { [it, fieldStatus(cluster, it)] } + } + + FieldStatus fieldStatus(Collection cluster, String field) { + boolean anyCompat = false + [cluster, cluster].combinations().findResult { List combination -> + Doc a = combination.first() + Doc b = combination.last() + + def c = compare(a, b, field) + if (c == FieldStatus.COMPATIBLE) { + anyCompat = true + } + c == FieldStatus.DIFF ? c : null + } ?: (anyCompat ? 
FieldStatus.COMPATIBLE : FieldStatus.EQUAL) + } + +} \ No newline at end of file diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkToolJob.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkToolJob.groovy new file mode 100644 index 0000000000..8498fbcae8 --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkToolJob.groovy @@ -0,0 +1,415 @@ +package datatool.scripts.mergeworks + + +import whelk.IdGenerator +import whelk.Whelk +import whelk.exception.WhelkRuntimeException +import whelk.meta.WhelkConstants +import whelk.util.LegacyIntegrationTools +import whelk.util.Statistics + +import java.text.SimpleDateFormat +import java.util.concurrent.ExecutorService +import java.util.concurrent.LinkedBlockingDeque +import java.util.concurrent.ThreadFactory +import java.util.concurrent.ThreadPoolExecutor +import java.util.concurrent.TimeUnit +import java.util.concurrent.atomic.AtomicInteger +import java.util.function.Function + +import static datatool.scripts.mergeworks.Util.buildWorkDocument +import static datatool.scripts.mergeworks.Util.getPathSafe +import static datatool.scripts.mergeworks.Util.partition + +class WorkToolJob { + Whelk whelk + Statistics statistics + File clusters + + String date = new SimpleDateFormat('yyyyMMdd-HHmmss').format(new Date()) + String jobId = IdGenerator.generate() + File reportDir = new File("reports/merged-works/$date") + + String changedIn = "xl" + String changedBy = "SEK" + String generationProcess = 'https://libris.kb.se/sys/merge-works' + + boolean dryRun = true + boolean skipIndex = false + boolean loud = false + boolean verbose = false + int numThreads = -1 + + private enum WorkStatus { + NEW('new'), + UPDATED('updated') + + String status + + private WorkStatus(String status) { + this.status = status + } + } + + WorkToolJob(File clusters) { + this.clusters = clusters + + this.whelk = Whelk.createLoadedSearchWhelk('secret', true) + this.statistics = new Statistics() + } + + 
public static Closure qualityMonographs = { Doc doc -> + (doc.isText() + && doc.isMonograph() + && !doc.isManuscript() + && !doc.isMaybeAggregate() + && (doc.encodingLevel() != 'marc:PartialPreliminaryLevel' && doc.encodingLevel() != 'marc:PrepublicationLevel')) + && !doc.isTactile() + && !doc.isDrama() + && !doc.isThesis() + && !doc.isInSb17Bibliography() + } + + void show() { + println(Html.START) + run({ cluster -> + return { + try { + if (cluster.size() > 1) { + Collection docs = loadLastUnlinkedVersion(cluster).each { it.addComparisonProps() } + .sort { a, b -> a.workType() <=> b.workType() } + .sort { it.numPages() } + + println(Html.clusterTable(docs) + Html.HORIZONTAL_RULE) + } + } + catch (NoWorkException e) { + System.err.println(e.getMessage()) + } + catch (Exception e) { + System.err.println(e.getMessage()) + e.printStackTrace(System.err) + } + } + }) + println(Html.END) + } + + void showWorks() { + println(Html.START) + run({ cluster -> + return { + try { + def merged = uniqueWorks(loadLastUnlinkedVersion(cluster)).findAll { !it.existsInStorage } + if (merged) { + println(merged.collect { [it] + it.unlinkedInstances } + .collect { Html.clusterTable(it) } + .join('') + Html.HORIZONTAL_RULE + ) + } + } + catch (Exception e) { + System.err.println(e.getMessage()) + e.printStackTrace(System.err) + } + } + }) + println(Html.END) + } + + void showHubs() { + println(Html.START) + run({ cluster -> + return { + try { + def hub = uniqueWorks(loadLastUnlinkedVersion(cluster)) + if (hub.size() > 1) { + println(Html.hubTable(hub) + Html.HORIZONTAL_RULE) + } + } + catch (Exception e) { + System.err.println(e.getMessage()) + e.printStackTrace(System.err) + } + } + }) + println(Html.END) + } + + void merge() { + def s = statistics.printOnShutdown() + def multiWorkClusters = Collections.synchronizedList([]) + + run({ cluster -> + return { + def docs = loadDocs(cluster) + def works = uniqueWorks(docs) + def createdOrUpdated = works.findAll { it.unlinkedInstances } + + 
WorkStatus.values().each { + new File(reportDir, it.status).tap { it.mkdirs() } + } + writeSingleWorkReport(docs, createdOrUpdated, s) + + if (works.size() > 1) { + multiWorkClusters.add(works) + } + + if (!dryRun) { + def linkableWorkIris = works.findResults { it.workIri() } + works.each { doc -> + doc.addCloseMatch(linkableWorkIris) + store(doc) + doc.unlinkedInstances?.each { + it.replaceWorkData(['@id': doc.thingIri()]) + store(it) + } + } + } + } + }) + + writeMultiWorkReport(multiWorkClusters) + } + + void store(Doc doc) { + whelk.setSkipIndex(skipIndex) + doc.document.setGenerationDate(new Date()) + doc.document.setGenerationProcess(generationProcess) + + if (!doc.existsInStorage) { + if (!whelk.createDocument(doc.document, changedIn, changedBy, + LegacyIntegrationTools.determineLegacyCollection(doc.document, whelk.getJsonld()), false)) { + throw new WhelkRuntimeException("Could not store new work: ${doc.shortId()}") + } + } else if (doc.modified) { + whelk.storeAtomicUpdate(doc.document, !loud, false, changedIn, generationProcess, doc.preUpdateChecksum) + } + } + + void writeSingleWorkReport(Collection titleClusters, Collection derivedWorks, Statistics s) { + String report = htmlReport(titleClusters, derivedWorks) + derivedWorks.each { + def status = it.existsInStorage ? WorkStatus.UPDATED.status : WorkStatus.NEW.status + new File(reportDir, "$status/${it.shortId()}.html") << report + s.increment("num derivedFrom ($status works)", "${it.unlinkedInstances.size()}", it.shortId()) + } + } + + void writeMultiWorkReport(Collection> workClusters) { + new File(reportDir, "multi-work-clusters.html").with { f -> + f.append(Html.START) + workClusters.each { + f.append(Html.hubTable(it) + Html.HORIZONTAL_RULE) + } + f.append(Html.END) + } + } + + String htmlReport(Collection titleCluster, Collection works) { + StringBuilder s = new StringBuilder() + + s.append(Html.START) + + s.append("

Title cluster

") + titleCluster + .each { it.addComparisonProps() } + .sort { a, b -> a.workType() <=> b.workType() } + .sort { it.numPages() } + s.append(Html.clusterTable(titleCluster)) + s.append(Html.HORIZONTAL_RULE) + + titleCluster.each { + it.removeComparisonProps() + } + + s.append("

Extracted works

") + works.collect { [it] + it.unlinkedInstances } + .each { s.append(Html.clusterTable(it)) } + + s.append(Html.END) + + return s.toString() + } + + private Collection uniqueWorks(Collection titleCluster) { + def works = [] + + prepareForCompare(titleCluster) + + WorkComparator c = new WorkComparator(WorkComparator.allFields(titleCluster)) + + def workClusters = partition(titleCluster, { Doc a, Doc b -> c.sameWork(a, b) }) + .each { work -> work.each { doc -> doc.removeComparisonProps() } } + + workClusters.each { Collection wc -> + def (local, linked) = wc.split { it.instanceData } + if (!linked) { + if (local.size() == 1) { + works.add(local.first()) + } else { + def newWork = new Doc(whelk, buildWorkDocument(c.merge(local), reportDir)).tap { + it.existsInStorage = false + it.unlinkedInstances = local + } + works.add(newWork) + } + } else if (linked.size() == 1) { + def existingWork = linked.first().tap { Doc d -> + if (local) { + d.replaceWorkData(c.merge(linked + local)) + d.unlinkedInstances = local + } + } + works.add(existingWork) + } else { + System.err.println("Local works ${local.collect { it.shortId() }} match multiple linked works: ${linked.collect { it.shortId() }}. 
Duplicate linked works?") + } + } + + return works + } + + void swedishFiction() { + def swedish = { Doc doc -> + Util.asList(doc.workData['language']).collect { it['@id'] } == ['https://id.kb.se/language/swe'] + } + + run({ cluster -> + return { + def c = loadDocs(cluster).split { it.instanceData } + .with { local, linked -> + linked + local.findAll(qualityMonographs).findAll(swedish) + } + + if (c.size() > 1 && c.any { Doc d -> d.isFiction() } && !c.any { Doc d -> d.isNotFiction() }) { + println(c.collect { Doc d -> d.shortId() }.join('\t')) + } + } + }) + } + + void filterClusters(Closure predicate) { + run({ cluster -> + return { + def c = loadDocs(cluster).findAll(predicate) + if (c.size() > 1) { + println(c.collect { it.shortId() }.join('\t')) + } + } + }) + } + + void outputTitleClusters() { + run({ cluster -> + return { + titleClusters(loadDocs(cluster)).findAll { it.size() > 1 }.each { + println(it.collect { it.shortId() }.join('\t')) + } + } + }) + } + + private void run(Function, Runnable> f) { + ExecutorService s = createExecutorService() + + AtomicInteger i = new AtomicInteger() + clusters.eachLine() { + List cluster = Arrays.asList(it.split(/[\t ]+/)) + + s.submit({ + try { + f.apply(cluster).run() + int n = i.incrementAndGet() + if (n % 100 == 0) { + System.err.println("$n") + } + } + catch (NoWorkException e) { + //println("No work:" + e.getMessage()) + } + catch (Exception e) { + e.printStackTrace() + } + }) + } + + s.shutdown() + s.awaitTermination(1, TimeUnit.DAYS) + } + + private def createExecutorService() { + int poolSize = numThreads > 1 ? 
numThreads : defaultNumThreads() + def linkedBlockingDeque = new LinkedBlockingDeque((int) (poolSize * 1.5)) + + def executorService = new ThreadPoolExecutor(poolSize, poolSize, + 1, TimeUnit.DAYS, + linkedBlockingDeque, new ThreadPoolExecutor.CallerRunsPolicy()) + + executorService.setThreadFactory(new ThreadFactory() { + ThreadGroup group = new ThreadGroup(WhelkConstants.BATCH_THREAD_GROUP) + + @Override + Thread newThread(Runnable runnable) { + return new Thread(group, runnable) + } + }) + + return executorService + } + + private static int defaultNumThreads() { + Runtime.getRuntime().availableProcessors() * 4 + } + + private Collection loadDocs(Collection cluster) { + whelk + .bulkLoad(cluster).values() + .collect { new Doc(whelk, it) } + } + + private Collection loadLastUnlinkedVersion(Collection cluster) { + cluster.findResults { + whelk.storage. + loadAllVersions(it) + .reverse() + .find { getPathSafe(it.data, it.workIdPath) == null } + ?.with { new Doc(whelk, it) } + } + } + + def loadUniqueLinkedWorks = { Collection docs -> + docs.findResults { it.workIri() } + .unique() + .collect { new Doc(whelk, whelk.storage.getDocumentByIri(it)) } + .plus(docs.findAll { !it.workIri() }) + } + + private Collection> titleClusters(Collection docs) { + partitionByTitle(docs) + .findAll { !it.any { doc -> doc.hasGenericTitle() } } + .collect(loadUniqueLinkedWorks) + .findAll { it.size() > 1 } + .sort { a, b -> a.first().view.instanceDisplayTitle() <=> b.first().view.instanceDisplayTitle() } + } + + Collection> partitionByTitle(Collection docs) { + return partition(docs) { Doc a, Doc b -> + !a.flatInstanceTitle().intersect(b.flatInstanceTitle()).isEmpty() + } + } + + private Collection prepareForCompare(Collection docs) { + docs.each { + if (it.instanceData) { + it.addComparisonProps() + } + }.sort { it.numPages() } + } +} + +class NoWorkException extends RuntimeException { + NoWorkException(String msg) { + super(msg) + } +} diff --git 
a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Classification.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Classification.groovy
new file mode 100644
index 0000000000..bc2d85a0e9
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Classification.groovy
@@ -0,0 +1,112 @@
+package datatool.scripts.mergeworks.compare
+
+/**
+ * Field handler for classification values: unions two value sets and then collapses
+ * pairs of kssb or ClassificationDdc entries whose codes are considered the same.
+ */
+class Classification extends StuffSet {
+    // Terms that will be merged (values precede keys)
+    private static def norm = [
+            'uHc' : ['Hc,u'],
+            'uHce': ['Hce,u'],
+            'Hc'  : ['Hc.01', 'Hc.02', 'Hc.03'],
+            'Hc,u': ['Hcf', 'Hcg']
+    ]
+
+    /**
+     * Unions a and b (entries without a 'code' are dropped) and merges compatible pairs.
+     * Two kssb codes merge when they are equal after whitespace removal or related through
+     * norm; two Dewey codes merge when one starts with the other one stripped of '/'.
+     *
+     * @return the list of classifications left after merging
+     */
+    @Override
+    Object merge(Object a, Object b) {
+        return mergeCompatibleElements(super.merge(a, b).findAll { it['code'] }) { c1, c2 ->
+            String code1 = c1['code']
+            String code2 = c2['code']
+            if (!code1 || !code2) {
+                return
+            }
+            code1 = code1.replaceAll(/\s+/, "")
+            code2 = code2.replaceAll(/\s+/, "")
+
+            if (isSab(c1) && isSab(c2)) {
+                def code = code1 == code2 || n(code2, code1)
+                        ? code1
+                        : (n(code1, code2) ? code2 : null)
+                if (code) {
+                    def result = [
+                            '@type' : 'Classification',
+                            // the code selected above; this is code2 when code2 takes precedence
+                            'code'  : code,
+                            inScheme: [
+                                    '@type': 'ConceptScheme',
+                                    'code' : 'kssb'
+                            ]
+                    ]
+                    def version = maxSabVersion(c1, c2)
+                    if (version) {
+                        result['inScheme']['version'] = version
+                    }
+                    return result
+                }
+            } else if (isDewey(c1) && isDewey(c2)) {
+                def code = code1.startsWith(code2.replace("/", ""))
+                        ? code1
+                        : (code2.startsWith(code1.replace("/", "")) ? code2 : null)
+                if (code) {
+                    Map result = [:]
+                    result.putAll(c1)
+                    result.putAll(c2)
+                    result['code'] = code
+                    result['editionEnumeration'] = maxDeweyEdition(c1, c2)
+                    return result
+                }
+            }
+        }
+    }
+
+    /** True if c has an inScheme whose code is 'kssb'. */
+    boolean isSab(Map c) {
+        c['inScheme'] && c['inScheme']['code'] == 'kssb'
+    }
+
+    /**
+     * Picks the numerically higher of the two inScheme versions (c2's on a tie).
+     *
+     * @return the version string, or null when neither classification has one
+     */
+    String maxSabVersion(c1, c2) {
+        def v1 = c1['inScheme']['version']
+        def v2 = c2['inScheme']['version']
+        if (!v1 || !v2) {
+            // return null rather than a placeholder so that merge() never records a bogus version
+            return v1 ?: v2
+        }
+        Integer.parseInt(v1) > Integer.parseInt(v2) ? v1 : v2
+    }
+
+    /** True if c has @type 'ClassificationDdc'. */
+    boolean isDewey(Map c) {
+        c['@type'] == 'ClassificationDdc'
+    }
+
+    /** Returns the editionEnumeration with the higher edition number (c2's on a tie). */
+    String maxDeweyEdition(c1, c2) {
+        def v1 = c1['editionEnumeration']
+        def v2 = c2['editionEnumeration']
+        deweyEdition(v1) > deweyEdition(v2) ? v1 : v2
+    }
+
+    /** Digits of an edition string as a number; 0 when the edition is missing or has no digits. */
+    int deweyEdition(String edition) {
+        def digits = (edition ?: "0").replaceAll("[^0-9]", "")
+        digits ? Integer.parseInt(digits) : 0
+    }
+
+    /** True if b is listed under a in norm, directly or transitively. */
+    boolean n(a, b) {
+        norm[a]?.any { it == b || n(it, b) }
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Default.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Default.groovy
new file mode 100644
index 0000000000..dfacdb001e
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Default.groovy
@@ -0,0 +1,13 @@
+package datatool.scripts.mergeworks.compare
+
+class Default implements FieldHandler {
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        return false
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        return a
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Extent.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Extent.groovy
new file mode 100644
index 0000000000..2390e77f5a
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Extent.groovy
@@ -0,0 +1,15 @@
+package datatool.scripts.mergeworks.compare;
+
+class Extent implements FieldHandler {
+
+    // TODO: allow one side missing extent (-1)?
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        return true // a * 0.7 < b && a * 1.3 > b
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        return b; // not part of final work
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/FieldHandler.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/FieldHandler.groovy
new file mode 100644
index 0000000000..17f440bc12
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/FieldHandler.groovy
@@ -0,0 +1,12 @@
+package datatool.scripts.mergeworks.compare
+
+import datatool.scripts.mergeworks.Doc
+
+interface FieldHandler {
+    boolean isCompatible(Object a, Object b)
+    Object merge(Object a, Object b)
+}
+
+interface ValuePicker extends FieldHandler {
+    Object pick(Collection values)
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/GenreForm.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/GenreForm.groovy
new file mode 100644
index 0000000000..5efe34df33
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/GenreForm.groovy
@@ -0,0 +1,62 @@
+package datatool.scripts.mergeworks.compare
+
+import datatool.util.DocumentComparator
+
+//FIXME
+/**
+ * Field handler for genreForm values. Two values are compatible only if both or neither
+ * contain a Lättläst term (matched by @id or prefLabel); merging unions the values and
+ * collapses terms that are related through norm.
+ */
+class GenreForm extends StuffSet {
+    private static final DocumentComparator c = new DocumentComparator()
+
+    // Terms that will be merged (values precede keys)
+    private static def norm = [
+            (['@id': 'https://id.kb.se/marc/NotFictionNotFurtherSpecified']): [
+                    ['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified'],
+                    ['@id': 'https://id.kb.se/marc/Autobiography'],
+                    ['@id': 'https://id.kb.se/marc/Biography']
+            ],
+            (['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified']) : [
+                    ['@id': 'https://id.kb.se/marc/Poetry'],
+                    ['@id': 'https://id.kb.se/marc/Novel']
+            ],
+    ]
+
+    /**
+     * @return true if a and b agree on whether they contain a Lättläst term
+     */
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        def lattLast = {
+            it['@id'] == 'https://id.kb.se/term/saogf/L%C3%A4ttl%C3%A4st'
+                    || it['@id'] == 'https://id.kb.se/term/barngf/L%C3%A4ttl%C3%A4sta%20b%C3%B6cker'
+                    || it['prefLabel'] == 'Lättläst'
+        }
+
+        // any() inspects every element on both sides; findResult() would stop at the first
+        // element because the closure's false result is non-null
+        a.any(lattLast) == b.any(lattLast)
+    }
+
+    /**
+     * Unions a and b, then replaces each pair of terms related through norm with the
+     * one that is listed as a value there.
+     */
+    @Override
+    Object merge(Object a, Object b) {
+        return mergeCompatibleElements(super.merge(a, b)) { gf1, gf2 ->
+            if (n(gf1, gf2)) {
+                gf2
+            } else if (n(gf2, gf1)) {
+                gf1
+            }
+        }
+    }
+
+    /** True if b is listed under a in norm, directly or transitively. */
+    boolean n(a, b) {
+        norm[a]?.any { it == b || n(it, b) }
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Id.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Id.groovy
new file mode 100644
index 0000000000..fc3305148b
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Id.groovy
@@ -0,0 +1,22 @@
+package datatool.scripts.mergeworks.compare
+
+import datatool.scripts.mergeworks.Doc
+import org.apache.commons.lang3.NotImplementedException
+
+class Id implements ValuePicker {
+
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        return true
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        throw new NotImplementedException('')
+    }
+
+    @Override
+    Object pick(Collection values) {
+        return values.findResult { it.workIri() }
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/SameOrEmpty.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/SameOrEmpty.groovy
new file mode 100644
index 0000000000..3fcd988d93
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/SameOrEmpty.groovy
@@ -0,0 +1,21 @@
+package datatool.scripts.mergeworks.compare
+
+import static datatool.scripts.mergeworks.Util.asList
+
+class SameOrEmpty implements FieldHandler {
+    Object link
+
+    SameOrEmpty(String iri) {
+        this.link = [['@id': iri]]
+    }
+
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        (!a && asList(b) == link) || (!b && asList(a) == link)
+    }
+
+    
@Override + Object merge(Object a, Object b) { + return a ?: b + } +} \ No newline at end of file diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/StuffSet.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/StuffSet.groovy new file mode 100644 index 0000000000..ecf119de9f --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/StuffSet.groovy @@ -0,0 +1,38 @@ +package datatool.scripts.mergeworks.compare + + +import java.util.function.BiFunction + +import static datatool.scripts.mergeworks.Util.asList + +class StuffSet implements FieldHandler { + @Override + boolean isCompatible(Object a, Object b) { + true + } + + @Override + Object merge(Object a, Object b) { + return ((asList(a) as Set) + (asList(b) as Set)).collect() + } + + static Object mergeCompatibleElements(Object o, BiFunction s) { + boolean changed = false + List result = [] + asList(o).each { + def merged = null + for (int i = 0 ; i < result.size() ; i++) { + merged = s.apply(result[i], it) + if (merged) { + result[i] = merged + changed = true + break + } + } + if (merged == null) { + result << it + } + } + return changed ? 
mergeCompatibleElements(result, s) : result + } +} \ No newline at end of file diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Subject.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Subject.groovy new file mode 100644 index 0000000000..0434d32d98 --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Subject.groovy @@ -0,0 +1,8 @@ +package datatool.scripts.mergeworks.compare + +class Subject extends StuffSet { + @Override + Object merge(Object a, Object b) { + return super.merge(a, b) + } +} diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/TranslationOf.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/TranslationOf.groovy new file mode 100644 index 0000000000..7bd26ebe7d --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/TranslationOf.groovy @@ -0,0 +1,46 @@ +package datatool.scripts.mergeworks.compare + +import datatool.scripts.mergeworks.Doc +import datatool.scripts.mergeworks.Util +import datatool.util.DocumentComparator +import org.apache.commons.lang3.NotImplementedException + +class TranslationOf implements ValuePicker { + DocumentComparator c = new DocumentComparator() + + @Override + boolean isCompatible(Object a, Object b) { + // @type is sometimes Work, sometimes Text. Should not matter for comparison + // We assume that there are never more than one object in translationOf + a = Util.asList(a)[0] + b = Util.asList(b)[0] + (!a && !b) || (a && b && c.isEqual(noTypeNoTitle(a), noTypeNoTitle(b)) && noTitleOrSameTitle(a, b)) + } + + @Override + Object merge(Object a, Object b) { + throw new NotImplementedException('') + } + + @Override + Object pick(Collection values) { + // TODO: which title to pick when matched with already existing linked work? 
+ def translationOf = values.first().workData['translationOf'] + def title = Util.bestOriginalTitle(values) + if (title) { + Util.asList(translationOf)[0]['hasTitle'] = title + } + + return translationOf + } + + Map noTypeNoTitle(Map m) { + m.findAll { k, v -> !(k in ['@type', 'hasTitle']) } + } + + boolean noTitleOrSameTitle(Map a, Map b) { + !a['hasTitle'] + || !b['hasTitle'] + || !Util.getFlatTitle(a['hasTitle']).intersect(Util.getFlatTitle(b['hasTitle'])).isEmpty() + } +} diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/WorkTitle.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/WorkTitle.groovy new file mode 100644 index 0000000000..b1608b64aa --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/WorkTitle.groovy @@ -0,0 +1,23 @@ +package datatool.scripts.mergeworks.compare + +import datatool.scripts.mergeworks.Doc +import datatool.scripts.mergeworks.Util +import org.apache.commons.lang3.NotImplementedException + +class WorkTitle implements ValuePicker { + + @Override + boolean isCompatible(Object a, Object b) { + return !a || !b || !Util.getFlatTitle(a).intersect(Util.getFlatTitle(b)).isEmpty() + } + + @Override + Object merge(Object a, Object b) { + throw new NotImplementedException('') + } + + @Override + Object pick(Collection values) { + return Util.bestTitle(values) + } +} diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contribution.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contribution.groovy new file mode 100644 index 0000000000..7d6aeb2966 --- /dev/null +++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contribution.groovy @@ -0,0 +1,613 @@ +package datatool.scripts.mergeworks.normalize + +import groovy.transform.Memoized +import org.apache.commons.lang3.StringUtils +import whelk.Document +import whelk.JsonLd + +import java.util.concurrent.ConcurrentHashMap +import java.util.regex.Pattern + +import 
groovy.json.JsonBuilder +import groovy.json.JsonSlurper + +import static datatool.scripts.mergeworks.Util.asList +import static datatool.scripts.mergeworks.Util.name +import static datatool.scripts.mergeworks.Util.normalize +import static datatool.scripts.mergeworks.Util.Relator +import static whelk.JsonLd.ID_KEY +import static whelk.JsonLd.looksLikeIri + +/** + Example: + $ ENV=qa && time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters="reports/clusters.tsv" -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --dry-run src/main/groovy/datatool/scripts/mergeworks/normalize/fetch-contribution-from-respStatement.groovy + */ + +linkedFoundInCluster = getReportWriter("linked-agent-found-in-cluster.tsv") +linkedFoundInCluster.println(['id', 'matched agent', 'agent occurs in (examples)'].join('\t')) + +roleAddedFromRespStatement = getReportWriter("role-added-from-respStatement.tsv") +roleAddedFromRespStatement.println(['id', 'agent name', 'added roles', 'resp statement'].join('\t')) + +respStatementLinkedAgentFoundInCluster = getReportWriter("respStatement-linked-agent-found-in-cluster.tsv") +respStatementLinkedAgentFoundInCluster.println(['id', 'agent name', 'matched agent', 'resp statement roles', 'agent occurs in (examples)', 'resp statement'].join('\t')) + +respStatementLocalAgentFoundInCluster = getReportWriter("respStatement-local-agent-found-in-cluster.tsv") +respStatementLocalAgentFoundInCluster.println(['id', 'agent name', 'resp statement roles', 'agent occurs in (examples)', 'resp statement'].join('\t')) + +unmatchedContributionsInRespStatement = getReportWriter("unmatched-contributions-in-resp-statement.tsv") +unmatchedContributionsInRespStatement.println(['id', 'agent name', 'roles', 'resp statement'].join('\t')) + +roleFoundInCluster = getReportWriter("role-found-in-cluster.tsv") +roleFoundInCluster.println(['id', 'agent', 'added role', 'agent occurs with role in (examples)'].join('\t')) + +titleMovedToTranslationOf 
= getReportWriter("title-moved-to-translationOf.tsv") + +originalWorkFoundInCluster = getReportWriter("original-work-found-in-cluster.tsv") +originalWorkFoundInCluster.println(['id', 'added translationOf', 'translationOf occurs in (examples)'].join('\t')) + +def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } } + +idToCluster = initIdToCluster(clusters) +nameToAgents = new ConcurrentHashMap() +agentToRolesToIds = new ConcurrentHashMap>() +agentToLifeSpan = new ConcurrentHashMap() +idToTranslationOf = new ConcurrentHashMap() + +// Populate maps +selectByIds(clusters.flatten()) { bib -> + def id = bib.doc.shortId + def work = bib.graph[1].instanceOf + + if (!work || work[ID_KEY]) return + + work.contribution?.each { Map c -> + asList(c.agent).each { Map agent -> + def agentStr = toString(agent) + def loadedAgent = loadIfLink(agent) + if (agent.containsKey('@id')) { + agentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent)) + } + ([loadedAgent] + asList(loadedAgent.hasVariant)).each { a -> + String agentName = name(a) + if (agentName) { + nameToAgents.computeIfAbsent(agentName, f -> new ConcurrentHashMap().newKeySet()).add(agentStr) + } + } + def roleToIds = agentToRolesToIds.computeIfAbsent(agentStr, f -> new ConcurrentHashMap()) + asList(c.role).with { + if (it.isEmpty()) { + roleToIds.computeIfAbsent(([:]), f -> new ConcurrentHashMap().newKeySet()).add(id) + } else { + it.each { r -> + roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id) + } + } + } + } + } + + if (work['translationOf']) { + idToTranslationOf[id] = work['translationOf'] + } +} + +agentToNames = initAgentToNames(nameToAgents) + +selectByIds(clusters.flatten()) { bib -> + Map thing = bib.graph[1] + def id = bib.doc.shortId + + def respStatement = thing.responsibilityStatement + def work = thing.instanceOf + + if (!work || work[ID_KEY]) return + + def contribution = work.contribution + + if (!contribution) return 
+ + // extract names + roles from responsibilityStatement + // normalize the names for comparison but also save the original strings for later use + def normalizedNameToName = [:] + def contributionsInRespStatement = parseRespStatement(respStatement).collectEntries { name, roles -> + def normalizedName = normalize(name) + normalizedNameToName[normalizedName] = name + [normalizedName, roles] + } + + // remove useless contributions + def modified = contribution.removeAll { !it.agent } + + contribution.each { Map c -> + // match local agent against linked ones in same cluster + modified |= tryLinkAgent(c, id) + // if there are more roles stated in responsibilityStatement other than the existing ones in this contribution, add those + modified |= tryAddRolesFromRespStatement(c, contributionsInRespStatement, respStatement, id) + } + + // drop "implicit authors", e.g. Astrid Lindgren in "Astrid Lindgren ; illustrerad av Ilon Wikland" (likely to already exist) + contributionsInRespStatement.removeAll { _, roles -> roles == [Relator.IMPLICIT_AUTHOR] } + + // agents in responsibilityStatement that are not in contribution? 
match against linked agents in same cluster + modified |= tryAddLinkedAgentContributionsFromRespStatement(contribution, contributionsInRespStatement, respStatement, id) + + // drop unmatched agents that are likely to already exist (agent with same initials exists or contribution with same role exists) + def existingNames = contribution.findResults { agentToNames[toString(asList(it.agent).find())] }.flatten() + contributionsInRespStatement.removeAll { String name, List roles -> + existingNames.any { similarName(it, name) } + || roles.collect { [(ID_KEY): it.iri] }.intersect(contribution.collect { it.role }.flatten()) + } + + // match remaining against local agents in same cluster + modified |= tryAddLocalAgentContributionsFromRespStatement(contribution, contributionsInRespStatement, respStatement, id) + // if still no match, add constructed local Contribution with agent + roles extracted from responsibilityStatement + modified |= addRemainingContributionsFromRespStatement(contribution, contributionsInRespStatement, normalizedNameToName, respStatement, id) + + contribution.each { Map c -> + // add roles from contributions in same cluster with matching agent + modified |= tryAddRole(c, id) + } + + // works with translators should have translationOf, add if missing + modified |= tryAddMissingTranslationOf(work, contribution, id) + + if (modified) { + bib.scheduleSave() + } +} + +def initIdToCluster(List> clusters) { + def idToCluster = [:] + clusters.each { cluster -> + cluster.each { id -> + idToCluster[id] = cluster as Set - id + } + } + return idToCluster +} + +static Map initAgentToNames(Map> nameToAgents) { + def agentToNames = [:] + nameToAgents.each { name, agents -> + agents.each { + agentToNames.computeIfAbsent(it, f -> [] as Set).add(name) + } + } + return agentToNames +} + +boolean tryLinkAgent(Map contribution, String id) { + def modified = false + + asList(contribution.agent).each { Map agent -> + if (!agent.containsKey(ID_KEY)) { + // get agent name 
variants + def names = agentToNames[toString(agent)] + if (!names) return + // get linked agents with matching name + def matchingLinkedAgents = nameToAgents.subMap(names).values().flatten().toSet().findAll { a -> + JsonLd.looksLikeIri(a) && !yearMismatch(lifeSpan(agent), agentToLifeSpan[a]) + } + for (agentIri in matchingLinkedAgents) { + // roles that the linked agent appears as and in which records respectively + Map roleToIds = agentToRolesToIds[agentIri] + // records in same cluster where the linked agent appears + def inClusterWithAgent = roleToIds.findResults { _, ids -> idToCluster[id].intersect(ids) }.flatten() as Set + if (inClusterWithAgent) { + // matching linked agent appears in same cluster -> add link + agent.clear() + agent[ID_KEY] = agentIri + // report + def examples = inClusterWithAgent.take(3) + def currentRoles = asList(contribution.role).findResults { roleShort(it[ID_KEY]) }.sort() + linkedFoundInCluster.println([id, idShort(agentIri), examples].join('\t')) + incrementStats('linked agent found in cluster', currentRoles) + // add this id to "records that the agent appears in" for each role + asList(contribution.role).each { r -> + roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id) + } + return modified = true + } + } + } + } + + return modified +} + +boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespStatement, String respStatement, String id) { + if (contributionsInRespStatement.isEmpty()) return false + + String agent = toString(asList(contribution.agent).find()) + + // any matching agent (name) in responsibilityStatement? 
+    def matching = contributionsInRespStatement.subMap(agentToNames[agent] ?: [])
+    if (!matching) return false
+
+    // matched and will be handled, remove
+    matching.each { name, _ -> contributionsInRespStatement.remove(name) }
+
+    def firstMatch = matching.find()
+    String name = firstMatch.key
+    List rolesInRespStatement = firstMatch.value
+
+    Map roleToIds = agentToRolesToIds[agent]
+    if (!roleToIds) return false
+
+    def currentRoles = asList(contribution.role)
+    // the kind of contribution is stored under '@type' (cf. the '@type': 'Contribution' maps built in this file),
+    // not under '@id'
+    def isPrimaryContribution = contribution['@type'] == 'PrimaryContribution'
+    // author role needs to be explicitly stated in responsibilityStatement to be added to "regular" Contribution
+    def rolesOfInterest = rolesInRespStatement.findResults { Relator relator ->
+        relator == Relator.IMPLICIT_AUTHOR && !isPrimaryContribution
+                ? null
+                : [(ID_KEY): relator.iri]
+    }
+    def newRoles = rolesOfInterest - currentRoles
+    if (newRoles) {
+        // add new roles (replace existing unspecifiedContributor)
+        contribution['role'] = noRole(currentRoles) ? newRoles : currentRoles + newRoles
+        // report
+        def newRolesShort = newRoles.findResults { roleShort(it[ID_KEY]) }
+        roleAddedFromRespStatement.println([id, name, newRolesShort, respStatement].join('\t'))
+        incrementStats("roles added from responsibilityStatement", newRolesShort.sort(), id)
+        // add this id to "records that the agent appears in" for each added role
+        newRoles.each { r ->
+            roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id)
+        }
+        return true
+    }
+
+    return false
+}
+
+/**
+ * For each name in contributionsInRespStatement, looks for a linked agent (an entry in
+ * nameToAgents that looks like an IRI) that occurs in another record of the same cluster.
+ * When one is found, a Contribution with that agent and the roles from the responsibility
+ * statement is appended to contribution (unless an equal one is already present), the match
+ * is reported, and the name is removed from contributionsInRespStatement.
+ *
+ * @return true if at least one name was matched and removed
+ */
+boolean tryAddLinkedAgentContributionsFromRespStatement(List contribution, Map contributionsInRespStatement, String respStatement, String id) {
+    if (contributionsInRespStatement.isEmpty()) return false
+
+    return contributionsInRespStatement.removeAll { String name, List roles ->
+        // get agents with matching name
+        def agents = nameToAgents[name]
+        if (!agents) return false
+
+        // get only linked agents
+        def linkedAgents = agents.findAll { looksLikeIri(it) }
+
+        for (agentIri in linkedAgents) {
+            Map roleToIds = agentToRolesToIds[agentIri]
+            // records in the same cluster where this linked agent appears, in any role
+            def inClusterWithAgent = roleToIds.findResults { _, ids -> idToCluster[id].intersect(ids) }.flatten() as Set
+            if (inClusterWithAgent) {
+                def newContribution =
+                        [
+                                '@type': 'Contribution',
+                                'agent': [(ID_KEY): agentIri]
+                        ]
+
+                roles = roles.collect { r -> [(ID_KEY): r.iri] }
+
+                if (roles) {
+                    newContribution['role'] = roles
+                }
+
+                if (!contribution.contains(newContribution)) {
+                    contribution.add(newContribution)
+                }
+
+                def rolesShort = roles.collect { r -> roleShort(r[ID_KEY]) }.sort()
+                def examples = inClusterWithAgent.take(3)
+                respStatementLinkedAgentFoundInCluster.println([id, name, idShort(agentIri), rolesShort, examples, respStatement].join('\t'))
+                incrementStats('linked agents from respStatement (found in cluster)', rolesShort, id)
+
+                // add this id to "records that the agent appears in" for each role
+                roles.each { r ->
+                    roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id)
+                }
+
+                return true
+            }
+        }
+
+        return false
+    }
+}
+
+boolean
tryAddLocalAgentContributionsFromRespStatement(List contribution, Map contributionsInRespStatement, String respStatement, String id) { + if (contributionsInRespStatement.isEmpty()) return false + + return contributionsInRespStatement.removeAll { String name, List roles -> + def agents = nameToAgents[name] + if (!agents) return false + + def localAgents = agents.findAll { !looksLikeIri(it) } + + for (localAgent in localAgents) { + Map roleToIds = agentToRolesToIds[localAgent] + def inClusterWithAgent = roleToIds.findResults { _, ids -> idToCluster[id].intersect(ids) }.flatten() as Set + if (inClusterWithAgent) { + def newContribution = + [ + '@type': 'Contribution', + 'agent': toMap(localAgent) + ] + + roles = roles.collect { r -> [(ID_KEY): r.iri] } + + if (roles) { + newContribution['role'] = roles + } + + contribution.add(newContribution) + + def rolesShort = roles.collect { r -> roleShort(r[ID_KEY]) } + def examples = inClusterWithAgent.take(3) + respStatementLocalAgentFoundInCluster.println([id, name, rolesShort, examples, respStatement].join('\t')) + incrementStats('local agents from respStatement (found in cluster)', rolesShort, id) + + roles.each { r -> + roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id) + } + + return true + } + } + + return false + } +} + +boolean addRemainingContributionsFromRespStatement(List contribution, Map contributionsInRespStatement, Map normalizedNames, String respStatement, String id) { + if (contributionsInRespStatement.isEmpty()) return false + + return contributionsInRespStatement.removeAll { name, roles -> + def translatorEditor = roles.findResults { r -> r == Relator.TRANSLATOR || r == Relator.EDITOR ? 
[(ID_KEY): r.iri] : null } + + if (translatorEditor) { + def newContribution = + [ + '@type': 'Contribution', + 'agent': ['@type': 'Person', 'name': normalizedNames[name]], + 'role' : translatorEditor + ] + + contribution.add(newContribution) + + def rolesShort = translatorEditor.collect { roleShort(it[ID_KEY]) }.sort() + unmatchedContributionsInRespStatement.println([id, normalizedNames[name], rolesShort, respStatement].join('\t')) + incrementStats('unmatched agents in respStatement', rolesShort, id) + + def roleToIds = agentToRolesToIds.computeIfAbsent(toString(newContribution.agent), f -> new ConcurrentHashMap()) + translatorEditor.each { r -> + roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id) + } + + return true + } + } +} + + +boolean tryAddRole(Map contribution, String id) { + def agent = asList(contribution.agent).find() + def agentStr = toString(agent) + + Map roleToIds = agentToRolesToIds[agentStr] + if (!roleToIds) return false + + def adapterEditor = [Relator.EDITOR, Relator.ADAPTER].collect { [(ID_KEY): it.iri] } + + def currentRoles = asList(contribution.role) + // find roles in cluster that can be added (certain conditions need to be met) + def rolesInCluster = roleToIds.findAll { r, ids -> + def inCluster = idToCluster[id] + def inClusterWithRole = ids.intersect(idToCluster[id]) + return inClusterWithRole + && !noRole([r]) + && (inClusterWithRole.size() >= inCluster.size() + || noRole(currentRoles) + || r == [(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri] + || (r in adapterEditor && currentRoles.intersect(adapterEditor))) + }.collect { it.key } + + def newRoles = rolesInCluster - currentRoles + if (newRoles) { + contribution['role'] = noRole(currentRoles) ? newRoles : currentRoles + newRoles + newRoles.each { r -> + def shortRole = roleShort(r[ID_KEY]) + def examples = roleToIds[r].intersect(idToCluster[id]).take(3) + def agentShort = agent[ID_KEY] ? 
idShort(agentStr) : agentToNames[agentStr]?.getAt(0) + roleFoundInCluster.println([id, agentShort, shortRole, examples].join('\t')) + incrementStats('role found in cluster', shortRole, id) + roleToIds[r].add(id) + } + return true + } + + return false +} + +boolean tryAddMissingTranslationOf(Map work, List contribution, String id) { + def trl = [(ID_KEY): Relator.TRANSLATOR.iri] + def translators = contribution.findResults { asList(it.role).contains(trl) ? toString(asList(it.agent).find()) : null } + + if (!translators || work['translationOf']) return false + + def title = work.remove('hasTitle') + if (title) { + // the title should be in translationOf, construct a new local work and put the title there + work['translationOf'] = ['@type': 'Work', 'hasTitle': title] + incrementStats('add missing translationOf', "title moved to new translationOf", id) + titleMovedToTranslationOf.println([id, work['translationOf']].join('\t')) + return true + } + + for (String translator : translators) { + def roleToIds = agentToRolesToIds[translator] + def inClusterSameTranslator = roleToIds[trl].intersect(idToCluster[id]) + def origWorks = inClusterSameTranslator.findResults { idToTranslationOf[it] } + + if (origWorks) { + // translationOf found on other work in cluster with matching translator, add to this work (pick the most common if several) + work['translationOf'] = origWorks.countBy { it }.max { it.value }?.key + def examples = inClusterSameTranslator.findAll { idToTranslationOf.containsKey(it) }.take(3) + incrementStats('add missing translationOf', 'original work found in cluster (same translator)', id) + originalWorkFoundInCluster.println([id, work['translationOf'], examples].join('\t')) + return true + } + } + + return false +} + +boolean noRole(List roles) { + roles.isEmpty() || roles == [[:]] || roles == [[(ID_KEY): Relator.UNSPECIFIED_CONTRIBUTOR.iri]] +} + +private Map loadIfLink(Map m) { + m[ID_KEY] ? 
loadThing(m[ID_KEY]) : m
+}
+
+/**
+ * Loads graph[1] of the document with the given id via selectByIds.
+ * Returns an empty map when the selection yields nothing. Results are memoized per id.
+ */
+@Memoized
+private Map loadThing(def id) {
+    def thing = [:]
+    selectByIds([id]) { t ->
+        thing = t.graph[1]
+    }
+    return thing
+}
+
+/**
+ * Parses a responsibility statement into a map from agent name to the roles stated for that name.
+ *
+ * The statement is split on ';', whitespace in each part is normalized and each part is parsed
+ * with parseSwedishFictionContribution (only the first part is flagged as first). Roles found
+ * for the same name in several parts are accumulated.
+ *
+ * @param respStatement the responsibility statement, may be null or empty
+ * @return name to roles, restricted to names that contain whitespace (i.e. more than one word)
+ */
+Map<String, List<Relator>> parseRespStatement(String respStatement) {
+    def parsedContributions = [:]
+
+    if (respStatement) {
+        respStatement.split(';').eachWithIndex { part, i ->
+            parseSwedishFictionContribution(StringUtils.normalizeSpace(part), i == 0).each { name, roles ->
+                parsedContributions
+                        .computeIfAbsent(name, r -> [])
+                        .addAll(roles)
+            }
+        }
+    }
+
+    return parsedContributions.findAll { name, _ -> name =~ /\s/ }
+}
+
+/**
+ * Extracts names and their roles from one part of a responsibility statement.
+ *
+ * Role words are matched case-insensitively against roleToPattern, either before the names
+ * (followed by ":", " af ", " av " or " by ") or in parentheses after them. Names joined by
+ * "och", "&" or "and" all get the same roles.
+ *
+ * @param contribution one part of the statement (the text between ';' separators)
+ * @param isFirstStmtPart true for the first part; a name there gets IMPLICIT_AUTHOR when no role
+ *        matched and the part contains no role-introducing phrase
+ * @return name to roles for every contribution matched in the part
+ */
+static Map<String, List<Relator>> parseSwedishFictionContribution(String contribution, boolean isFirstStmtPart) {
+    def roleToPattern =
+            [
+                    (Relator.TRANSLATOR)         : ~/(bemynd(\w+|\.)? )?öf?v(\.|ers(\.|\p{L}+)?)( (till|från) \p{L}+)?|(till svenskan?|från \p{L}+)|svensk text/,
+                    (Relator.AUTHOR)             : ~/^(text(e[nr])?|skriven|written)/,
+                    // "bild" or "bilder"; the longer form must match in full to be followed by ":"/" av "
+                    (Relator.ILLUSTRATOR)        : ~/\bbild(er)?|ill(\.|ustr(\.|\w+)?)|\bvi(gn|nj)ett(er|ill)?|ritad/,
+                    (Relator.AUTHOR_OF_INTRO)    : ~/förord|inl(edn(\.|ing)|edd)/,
+                    (Relator.COVER_DESIGNER)     : ~/omslag/,
+                    (Relator.AUTHOR_OF_AFTERWORD): ~/efter(ord|skrift)/,
+                    (Relator.PHOTOGRAPHER)       : ~/\bfoto\w*\.?/,
+                    (Relator.EDITOR)             : ~/red(\.(?! av)|aktör(er)?)|\bbearb(\.|\w+)?|återberättad|sammanställ\w*/,
+            ]
+
+    def rolePattern = ~/((?iu)${roleToPattern.values().join('|')})/
+    def followsRolePattern = ~/(:| a[fv]| by) /
+    def initialPattern = ~/\p{Lu}/
+    def namePattern = ~/\p{Lu}:?\p{Ll}+('\p{Ll})?(,? [Jj](r|unior))?/
+    def betweenNamesPattern = ~/-| |\. ?| ([Dd]e(l| la)?|von|van( de[nr])?|v\.|le|af|du|dos) | [ODdLl]'/
+    def fullNamePattern = ~/(($initialPattern|$namePattern)($betweenNamesPattern)?)*$namePattern/
+    def conjPattern = ~/ (och|&|and) /
+    def roleAfterNamePattern = ~/( ?\(($rolePattern$conjPattern)?$rolePattern\))/
+    def fullContributionPattern = ~/(($rolePattern($conjPattern|\/))*$rolePattern$followsRolePattern)?$fullNamePattern($conjPattern$fullNamePattern)*$roleAfterNamePattern?/
+
+    // Make roles lower case so that they can't be mistaken for names
+    contribution = (contribution =~ rolePattern)*.first()
+            .collectEntries { [it, it.toLowerCase()] }
+            .with { contribution.replace(it) }
+
+    def nameToRoles = [:]
+
+    def matched = (contribution =~ fullContributionPattern)*.first()
+
+    matched.each { m ->
+        // Extract roles from the contribution
+        def roles = roleToPattern.findResults { role, pattern -> m =~ /(?iu)$pattern/ ? role : null }
+
+        // Author should be the role if first part of respStatement (before ';') and no role seems to be stated
+        if (roles.isEmpty() && isFirstStmtPart && !(contribution =~ /.+$followsRolePattern/)) {
+            roles << Relator.IMPLICIT_AUTHOR
+        }
+
+        // Extract names from the contribution
+        def names = parseNames(fullNamePattern, conjPattern, m)
+
+        // Assign the roles to each name
+        nameToRoles.putAll(names.collectEntries { [it, roles] })
+    }
+
+    return nameToRoles
+}
+
+/**
+ * Returns the names in s that match namePattern, in order of appearance.
+ *
+ * When a single-word name is directly followed by a conjunction and a multi-word name
+ * ("Jan och Maria Larsson"), the last word of the second name is appended to the first.
+ *
+ * @param namePattern pattern matching one full name
+ * @param conjPattern pattern matching the conjunction between two names
+ * @param s the text to extract names from
+ */
+static List parseNames(Pattern namePattern, Pattern conjPattern, String s) {
+    def names = []
+
+    (s =~ namePattern).each {
+        def name = it.first()
+        // Handle the case of "Jan och Maria Larsson"
+        def previousName = names.isEmpty() ? null : names.last()
+        if (previousName?.split()?.size() == 1 && s =~ /$previousName$conjPattern$name/) {
+            def nameParts = name.split()
+            if (nameParts.size() > 1) {
+                names[-1] += " ${nameParts.last()}"
+            }
+        }
+        names << name
+    }
+
+    return names
+}
+
+@Memoized
+def getWhelk() {
+    // A little hack to get a handle to whelk... 
+    def whelk = null
+    selectByIds(['https://id.kb.se/marc']) { docItem ->
+        whelk = docItem.whelk
+    }
+    if (!whelk) {
+        throw new RuntimeException("Could not get Whelk")
+    }
+    return whelk
+}
+
+/** True when both values are non-empty and differ. */
+static boolean yearMismatch(String a, String b) {
+    a && b && a != b
+}
+
+/** Returns agent.lifeSpan reduced to digits and single hyphens, or null when lifeSpan is absent. */
+static String lifeSpan(Map agent) {
+    agent.lifeSpan?.replaceAll(~/[^\-0-9]/, '')?.replaceAll(~/-+/, '-')
+}
+
+/**
+ * String key for an agent: for a linked agent its @id with everything up to the last '/'
+ * replaced by Document.BASE_URI, for a local agent its JSON serialization.
+ */
+static String toString(Map agent) {
+    agent[ID_KEY]?.replaceFirst(".+/", Document.BASE_URI.toString()) ?: new JsonBuilder(agent).toString()
+}
+
+/** Parses a JSON string, such as the one toString produces for a local agent. */
+static toMap(String agent) {
+    new JsonSlurper().parseText(agent)
+}
+
+/** Returns the second to last segment of the IRI when split on '#' and '/'. */
+static String idShort(String iri) {
+    iri.split("[#/]").dropRight(1).last()
+}
+
+/** Returns the part of the IRI after the last '/', or 'NO ROLE' when the IRI is null or empty. */
+static String roleShort(String iri) {
+    iri?.split("/")?.last() ?: 'NO ROLE'
+}
+
+/**
+ * Loose name comparison. When either name consists of a single part the names are similar if
+ * they share a part; otherwise they are similar if the initials of one contain all initials
+ * of the other.
+ */
+static boolean similarName(String a, String b) {
+    [nameParts(a), nameParts(b)].with { n1, n2 ->
+        n1.size() == 1 || n2.size() == 1
+                ? n1.intersect(n2)
+                : [initials(n1), initials(n2)].with { i1, i2 -> i1.containsAll(i2) || i2.containsAll(i1) }
+    }
+}
+
+/** First character of each name part. */
+static List initials(List nameParts) {
+    nameParts.collect { it[0] }
+}
+
+/** Splits a name on whitespace and hyphens. */
+static List nameParts(String s) {
+    s.split(/\s+|-/) as List
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contributions-to-instance.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contributions-to-instance.groovy
new file mode 100644
index 0000000000..d3fb3e9bf3
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contributions-to-instance.groovy
@@ -0,0 +1,114 @@
+import datatool.scripts.mergeworks.Util.Relator
+
+import whelk.Whelk
+import static whelk.JsonLd.ID_KEY
+import static whelk.JsonLd.TYPE_KEY
+
+// Moves roles that belong on the instance from the work's contribution to the instance's
+// contribution, for every record id listed in the tab-separated file given by -Dclusters.
+
+report = getReportWriter('report.tsv')
+
+def ids = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }.flatten()
+
+def whelk = getWhelk()
+// Relators whose domain is a subclass of Embodiment
+def instanceRolesByDomain = whelk.resourceCache.relators.findResults {
+    if (it.domain) {
+        def domain = whelk.jsonld.toTermKey(it.domain[ID_KEY])
+        if (whelk.jsonld.isSubClassOf(domain, 'Embodiment')) it.subMap([ID_KEY])
+    }
+}
+def instanceRoles = instanceRolesByDomain + [Relator.ILLUSTRATOR, Relator.AUTHOR_OF_INTRO, Relator.AUTHOR_OF_AFTERWORD].collect { [(ID_KEY): it.iri] }
+
+selectByIds(ids) { bib ->
+    Map instance = bib.graph[1]
+    def work = instance.instanceOf
+    def contribution = work?.contribution
+
+    if (!contribution) return
+
+    def ill = [(ID_KEY): Relator.ILLUSTRATOR.iri]
+
+    def modified = false
+
+    contribution.removeAll { c ->
+        if (isPrimaryContribution(c)) return false
+
+        def toInstance = asList(c.role).intersect(instanceRoles)
+        // The illustrator role stays on the work for rights holders, picture books, comics and still images
+        if (toInstance.contains(ill)) {
+            if (has9pu(c) || isPictureBook(work) || isComics(work, bib.whelk) || isStillImage(work)) {
+                toInstance.remove(ill)
+            }
+        }
+        if (toInstance) {
+            instance['contribution'] = asList(instance['contribution']) + c.clone().tap { it['role'] = toInstance }
+            c['role'] = asList(c.role) - toInstance
+            modified = true
+            report.println([bib.doc.shortId, toInstance.collect { it[ID_KEY].split('/').last() }].join('\t'))
+            incrementStats('moved to instance', toInstance)
+            // Remove the work contribution altogether when no roles are left on it
+            return c.role.isEmpty()
+        }
+
+        return false
+    }
+
+    if (contribution.isEmpty()) {
+        work.remove('contribution')
+    }
+
+    if (modified) {
+        bib.scheduleSave()
+    }
+}
+
+/** True for contributions of type PrimaryContribution. */
+boolean isPrimaryContribution(Map contribution) {
+    contribution[TYPE_KEY] == 'PrimaryContribution'
+}
+
+/** True when the contribution has the PRIMARY_RIGHTS_HOLDER role. */
+boolean has9pu(Map contribution) {
+    asList(contribution.role).contains([(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri])
+}
+
+/** True when the work's contentType includes the RDA StillImage term. */
+boolean isStillImage(Map work) {
+    asList(work.contentType).contains([(ID_KEY): 'https://id.kb.se/term/rda/StillImage'])
+}
+
+/** True when any genreForm of the work is one of the two barngf picture book terms. */
+boolean isPictureBook(Map work) {
+    def picBookTerms = [
+            'https://id.kb.se/term/barngf/Bilderb%C3%B6cker',
+            'https://id.kb.se/term/barngf/Sm%C3%A5barnsbilderb%C3%B6cker'
+    ].collect { [(ID_KEY): it] }
+
+    return asList(work.genreForm).any { it in picBookTerms }
+}
+
+/**
+ * True when the work is comics: a classification code starts with 'Hci', a genreForm is one
+ * of the listed comics terms, or a linked genreForm is implied by saogf/Tecknade serier
+ * according to whelk.relations.
+ */
+boolean isComics(Map work, Whelk whelk) {
+    def comicsTerms = [
+            'https://id.kb.se/term/saogf/Tecknade%20serier',
+            'https://id.kb.se/term/barngf/Tecknade%20serier',
+            'https://id.kb.se/term/gmgpc/swe/Tecknade%20serier',
+            'https://id.kb.se/marc/ComicOrGraphicNovel',
+            'https://id.kb.se/marc/ComicStrip'
+    ].collect { [(ID_KEY): it] }
+
+    // The classification does not depend on any genreForm entry, so it is tested once and
+    // outside the genreForm loop; this way it also applies to works without genreForm.
+    if (asList(work.classification).any { it.code?.startsWith('Hci') }) return true
+
+    return asList(work.genreForm).any {
+        it in comicsTerms
+                || it[ID_KEY] && whelk.relations.isImpliedBy('https://id.kb.se/term/saogf/Tecknade%20serier', it[ID_KEY])
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy
new file mode 100644
index 0000000000..d59da20dd0
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy
@@ -0,0 +1,67 @@
+package datatool.scripts.mergeworks.normalize
+
+import groovy.transform.Memoized
+import whelk.util.DocumentUtil
+
+import static datatool.scripts.mergeworks.Util.getPathSafe
+
+/**
+ Strips a trailing "(<language>)" or "((<language>))" suffix from every mainTitle under instanceOf,
+ for each record id listed in the tab-separated file given by -Dclusters.
+
+ Example:
+ $ ENV=qa && time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters="reports/clusters.tsv" -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --dry-run src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy
+ */
+
+PrintWriter report = getReportWriter("report.txt")
+
+def ids = new File(System.getProperty('clusters'))
+        .readLines()
+        .collect { it.split('\t').collect { it.trim()} }
+        .flatten()
+
+selectByIds(ids) { bib ->
+    // Lower-cased Swedish labels of the work's language and of the language of translationOf.
+    // A path that is missing in the record is skipped rather than looked up as an empty id.
+    def langs = [
+            [1, 'instanceOf', 'language', 0, '@id'],
+            [1, 'instanceOf', 'translationOf', 0, 'language', 0, '@id']
+    ].findResults {
+        def langId = getPathSafe(bib.graph, it, '')
+        langId ? langName(langId).toLowerCase() : null
+    }
+
+    boolean changed = DocumentUtil.traverse(bib.graph[1].instanceOf) { value, path ->
+        if (path && 'mainTitle' in path && value instanceof String) {
+            for (lang in langs) {
+                // Quote the label so that regex metacharacters in it are matched literally
+                String quotedLang = java.util.regex.Pattern.quote(lang)
+                String r = value.replaceAll(/(?i)\s*\(\(?\s*${quotedLang}\s*\)\)?\s*$/, '')
+                if (value != r) {
+                    report.println("$value -> $r")
+                    return new DocumentUtil.Replace(r)
+                }
+            }
+        }
+        return DocumentUtil.NOP
+    }
+
+    if (changed) {
+        bib.scheduleSave()
+    }
+}
+
+/** Swedish label (prefLabelByLang.sv) of the thing with the given id, "NOT FOUND" when absent; memoized. */
+@Memoized
+private String langName(def id) {
+    getPathSafe(loadThing(id), ['prefLabelByLang', 'sv'], "NOT FOUND")
+}
+
+/** Loads graph[1] of the document with the given id; an empty map when nothing is selected. */
+private Map loadThing(def id) {
+    def thing = [:]
+    selectByIds([id]) { t ->
+        thing = t.graph[1]
+    }
+    return thing
+}
\ No newline at end of file
diff --git a/whelktool/src/main/java/datatool/util/DocumentComparator.java b/whelktool/src/main/java/datatool/util/DocumentComparator.java
index c7590a84e9..22b818d6c5 100644
--- a/whelktool/src/main/java/datatool/util/DocumentComparator.java
+++ b/whelktool/src/main/java/datatool/util/DocumentComparator.java
@@ -35,9 +35,13 @@ public boolean isEqual(Map a, Map b) {
     }
 
     private boolean isEqual(Object a, Object b, Object key) {
-        if (a == null || b == null || a.getClass() != b.getClass()) {
+        if (a == null || b == null) {
             return false;
         }
+        else if (a.getClass() != b.getClass()) {
+            return (isSingleItemList(a) && isEqual(((List) a).get(0), b, key)
+                    || (isSingleItemList(b) && isEqual(a, ((List) b).get(0), key)));
+        }
         else if (a instanceof Map) {
             return isEqual((Map) a, (Map) b);
         }
@@ -53,6 +57,10 @@ else if (a instanceof List) {
         }
     }
 
+    private boolean isSingleItemList(Object o) {
+        return o instanceof List && ((List) o).size() == 1;
+    }
+
     private boolean isEqualOrdered(List a, List b) {
         if (a.size() != b.size()) {
             return false;
diff --git a/whelktool/src/main/resources/merge-works/generic-titles.txt b/whelktool/src/main/resources/merge-works/generic-titles.txt
new file mode 100644
index 0000000000..afc73c6ba8
--- /dev/null
+++ b/whelktool/src/main/resources/merge-works/generic-titles.txt
@@ -0,0 +1,34 @@
+artiklar
+collected plays
+dagböcker
+dikter
+dramatik
+essäer
+folksagor
+folkvisor
+fragment
+korrespondens
+krönikor
+lyrik
+memoarer +noveller +pjäser +plays +poems +poesi +prosa +publication +publications +rapport +report +romaner +sagor +samlade dikter +samlade pjäser +samlade skrifter +samlade verk +skrifter +skådespel +sonetter +tecknade serier +urval \ No newline at end of file diff --git a/whelktool/src/main/resources/merge-works/ignored-subtitles.txt b/whelktool/src/main/resources/merge-works/ignored-subtitles.txt new file mode 100644 index 0000000000..4dea8de2e6 --- /dev/null +++ b/whelktool/src/main/resources/merge-works/ignored-subtitles.txt @@ -0,0 +1,77 @@ +a comedy +a history +a novel +a play +a romance +a tale +aforismer +berättelse +berättelse för barn +berättelse för flickor +berättelse för pojkar +berättelse för unga flickor +berättelser +berättelser för barn +bilderbok +comédie +contos +deckare +deckarroman +detektivroman +dikt +dikter +drama +efterlämnade dikter +ein coq-rouge-thriller +ein roman +eine erzählung +erzählung +erzählungen +essays +essäer +ett fall för kay scarpetta +fortælling +historisk roman +homandeckare +jack reacher-thriller +komedi +komedi i fyra akter +krimi +kriminalroman +kärlekshistoria +kärleksroman +kåserier +lustspel i en akt +nouvelles +novela +novell +novelle +noveller +pjäs +polisroman +povesti +powieść +poėma +reseguide +resehandbok +rikosromaani +romaani +romaani rikoksesta +roman +roman om ett brott +roman om skivvärlden +romanas +romance +romanzo +rövarroman +runoja +saga +sagor +sann historia +skildringar +skáldsaga +spänningsroman +stories +thriller +ungdomsroman +(Efterlämnade dikter.) 
diff --git a/whelktool/src/main/resources/merge-works/table.css b/whelktool/src/main/resources/merge-works/table.css
new file mode 100644
index 0000000000..e6378ea2ee
--- /dev/null
+++ b/whelktool/src/main/resources/merge-works/table.css
@@ -0,0 +1,30 @@
+    table { /* Table styles for merge-works output; presumably the HTML comparison report - TODO confirm */
+        border-collapse: collapse;
+    }
+    table, th, td {
+        border: 1px solid grey;
+    }
+    th {
+        text-align: left;
+    }
+    tr.info td { /* Row classes below set the background colour of all cells in the row */
+        background-color: lightgrey;
+    }
+    tr.DIFF td {
+        background-color: lightpink;
+    }
+    tr.COMPATIBLE td {
+        background-color: greenyellow;
+    }
+    tr.EQUAL td {
+        background-color: lightgreen;
+    }
+    td {
+        vertical-align: top;
+    }
+    hr {
+        border: 4px solid;
+    }
+    a:target { /* Highlights the anchor that the URL fragment points to */
+        background-color: coral;
+    }
\ No newline at end of file
diff --git a/whelktool/src/test/groovy/datatool/scripts/mergeworks/DocSpec.groovy b/whelktool/src/test/groovy/datatool/scripts/mergeworks/DocSpec.groovy
new file mode 100644
index 0000000000..407372c419
--- /dev/null
+++ b/whelktool/src/test/groovy/datatool/scripts/mergeworks/DocSpec.groovy
@@ -0,0 +1,27 @@
+package datatool.scripts.mergeworks
+import spock.lang.Specification
+import whelk.util.Unicode
+
+class DocSpec extends Specification { // Spock specification for Doc
+
+    def "parse extent"() { // Doc.numPages turns an extent string into a page count
+        expect:
+        Doc.numPages(extent) == pages
+        where:
+        extent | pages
+        "" | -1 // -1 is expected when the extent holds no page count
+        "114, [1] s." | 114
+        "[4], 105, [2] s." | 105
+        "21 s., ([4], 21, [5] s.)" | 21
+        "[108] s., (Ca 110 s.)" | 110
+        "80 s., (80, [3] s., [8] pl.-bl. i färg)" | 80
+        "622, [8] s." | 622
+        "[2] s., s. 635-919, [7] s." | 919 // ?? NOTE(review): expects the end of the page range, not the number of pages in it - confirm
+        "[1], iv, 295 s." | 295
+        "3 vol." | -1
+        //"249, (1) s." | 249 (disabled case)
+        //"[8] s., s. 11-370" | 370 (disabled case)
+        //"[12] s., s. 15-256" | 256 (disabled case)
+        "25 onumrerade sidor" | 25
+    }
+}
diff --git a/whelktool/src/test/groovy/datatool/util/DocumentComparatorSpec.groovy b/whelktool/src/test/groovy/datatool/util/DocumentComparatorSpec.groovy
index 436bc07372..7417638205 100644
--- a/whelktool/src/test/groovy/datatool/util/DocumentComparatorSpec.groovy
+++ b/whelktool/src/test/groovy/datatool/util/DocumentComparatorSpec.groovy
@@ -13,6 +13,7 @@ class DocumentComparatorSpec extends Specification {
 
         expect:
         d.isEqual(a, b) == eq
+        d.isEqual(b, a) == eq // the comparison must give the same result with the arguments swapped
 
         where:
         a | b || eq
@@ -30,6 +31,9 @@ class DocumentComparatorSpec extends Specification {
         ["ordered": [1, 2]] | ["ordered": [1, 2]] || true
         ["ordered": [1, 2]] | ["ordered": [2, 1]] || false
 
+        // one element list equals element
+        ["x": ["a"]] | ["x": "a"] || true // single-item list vs. plain string
+        ["x": [["n": 2]]] | ["x": ["n": 2]] || true // single-item list vs. map
     }
 
     def "isSubset"() {