From 962ed16544d8da886a0082d1f7549bb27126e19d Mon Sep 17 00:00:00 2001 From: Jannis Mohlin Tsiroyannis Date: Wed, 2 Nov 2022 10:45:31 +0100 Subject: [PATCH 1/9] WIP work merging. --- .../whelk/importer/DatasetImporter.groovy | 4 + .../src/main/groovy/whelk/Document.groovy | 10 ++ .../src/main/groovy/whelk/WorkMerging.java | 138 ++++++++++++++++++ 3 files changed, 152 insertions(+) create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging.java diff --git a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy index c966ce8e3a..c9cfe318cc 100644 --- a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy +++ b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy @@ -58,6 +58,10 @@ class DatasetImporter { Map aliasMap = [:] DatasetImporter(Whelk whelk, String datasetUri, Map flags=[:], String datasetDescPath=null) { + System.err.println("WARNING: Do not ever cancel an ongoing dataset loading operation using CTRL+C (or equivalent).\n"+ + "Doing so poses a danger because dataset loading may under some circumstances need to perform\n"+ + "operations that are not atomic, and loss of data can occur if the process is interrupted during\n"+ + "such an operation.") this.whelk = whelk this.datasetUri = datasetUri if (datasetDescPath != null) { diff --git a/whelk-core/src/main/groovy/whelk/Document.groovy b/whelk-core/src/main/groovy/whelk/Document.groovy index 0580d6de04..3c1932e986 100644 --- a/whelk-core/src/main/groovy/whelk/Document.groovy +++ b/whelk-core/src/main/groovy/whelk/Document.groovy @@ -20,6 +20,10 @@ import static whelk.util.Jackson.mapper * A document is represented as a data Map (containing Maps, Lists and Value objects). * * This class serves as a wrapper around such a map, with access methods for specific parts of the data. + * + * TODO: + * Many of the accessors of this class assumes the underlying data is an INSTANCE. 
We may want to break that + * assumption up at some point down the line, or check that it is actually the case in the accessors themselves. */ @Log class Document { @@ -51,6 +55,7 @@ class Document { static final List thingInSchemePath = ["@graph",1,"inScheme","@id"] static final List recordIdPath = ["@graph", 0, "@id"] static final List workIdPath = ["@graph", 1, "instanceOf", "@id"] + static final List workPath = ["@graph", 1, "instanceOf"] static final List thingMetaPath = ["@graph", 1, "meta", "@id"] static final List recordSameAsPath = ["@graph", 0, "sameAs"] static final List recordTypedIDsPath = ["@graph", 0, "identifiedBy"] @@ -177,6 +182,11 @@ class Document { void setThingMeta(meta) { set(thingMetaPath, meta) } + Map getWorkEntity() { return get(workPath) } + + void setWorkEntity(work) { set(workPath, work) } + + /** * Will have base URI prepended if not already there */ diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging.java b/whelk-core/src/main/groovy/whelk/WorkMerging.java new file mode 100644 index 0000000000..3747160792 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging.java @@ -0,0 +1,138 @@ +package whelk; + +import whelk.component.PostgreSQLComponent; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class WorkMerging { + + /** + * Merge the works of all listed instances into one. The listed instances + * may or may not have external works already. Orphaned work records will be + * deleted. + * + * This is _not_ one atomic operation, but rather a series of operations. + * This means that it is possible to observe the process halfway though from the + * outside. It also means that should the process be stopped halfway through, + * results may look odd (but will still obey basic data integrity rules). 
+ * + * In the worst case scenario, if the process is interrupted just after the orphans + * have been deleted, but their sameAs-uris have not yet been moved to the merged + * work, those sameAs-uris will be lost. This risk cannot be avoided without compromising + * the URI integrity checks of the underlying code (two records are never allowed to + * have the same URI at the same time). + * + * Returns the URI of the one remaining (or new) work that all of the instances + * now link to. + */ + public static String mergeWorksOf(List instanceIDs, Whelk whelk) { + + List instances = collectInstancesOfThisWork(instanceIDs, whelk); + + Document baseWork = selectBaseWork(instances, whelk); + String baseWorkUri = baseWork.getThingIdentifiers().get(0); + + // Relink the instances and collect all work aliases + Map linkEntity = new HashMap(); + linkEntity.put("@id", baseWorkUri); + List workAlternateUris = new ArrayList<>(); + for (Document instance : instances) { + workAlternateUris.addAll( instance.getThingIdentifiers() ); + if (!instance.getWorkEntity().equals(linkEntity)) { // If not already linked to the correct record + whelk.storeAtomicUpdate(instance.getShortId(), true, false, true, "xl", null, (Document doc) -> { + doc.setWorkEntity(linkEntity); + }); + } + } + + // Merge other works into the baseWork. This must be done first, before any orphans can be deleted, + // or we risk loosing data if the process is interrupted. 
+ whelk.storeAtomicUpdate(baseWork.getShortId(), true, false, true, "xl", null, (Document doc) -> { + // TODO MERGE HERE + }); + + // Cleanup no longer linked work records + for (Document instance : instances) { + Map workEntity = instance.getWorkEntity(); + String workUri = (String) workEntity.get("@id"); + String workId = whelk.getStorage().getSystemIdByIri(workUri); + if (workEntity.size() == 1 + && workEntity.containsKey("@id") + && !workEntity.equals(linkEntity) + && whelk.getStorage().getDependers(workId).isEmpty()) { + String orphanID = whelk.getStorage().getSystemIdByIri((String)workEntity.get("@id")); + whelk.remove(orphanID, "xl", null); + } + } + + // We must now save the baseWork a second time, to add all of the sameAs identifiers. + // These could not be added the first time, because they still belonged to other records + // that were not yet deleted. + whelk.storeAtomicUpdate(baseWork.getShortId(), true, false, true, "xl", null, (Document doc) -> { + for (String uri : workAlternateUris) + baseWork.addThingIdentifier(uri); + }); + + return baseWorkUri; + } + + /** + * Find the set of instances that should link to the merged work. This of course includes the + * passed instanceIDs, but also any other instances already sharing a work with one of those IDs. + */ + private static List collectInstancesOfThisWork(List instanceIDs, Whelk whelk) { + List instances = new ArrayList<>(instanceIDs.size()); + for (String instanceID : instanceIDs) { + Document instance = whelk.getDocument(instanceID); + instances.add( instance ); + + // Are there other instances linking to the same work as 'instance' ? If so add them to the + // collection to (possibly) re-link as well. 
+ Map workEntity = instance.getWorkEntity(); + if (workEntity.size() == 1 && workEntity.containsKey("@id")) { + String workUri = (String) workEntity.get("@id"); + String workId = whelk.getStorage().getSystemIdByIri(workUri); + for (String otherInstanceId : whelk.getStorage().getDependers(workId)) { + Document otherInstance = whelk.getDocument(otherInstanceId); + instances.add( otherInstance ); + } + } + } + return instances; + } + + /** + * Select (or create+save) a work record that should be used going forward for + * all of the passed instances. + */ + private static Document selectBaseWork(List instances, Whelk whelk) { + // Find all the works + List linkedWorkURIs = new ArrayList<>(); + List embeddedWorks = new ArrayList<>(); + for (Document instance : instances) { + Map workEntity = instance.getWorkEntity(); + if (workEntity.size() == 1 && workEntity.containsKey("@id")) { + linkedWorkURIs.add( (String) workEntity.get("@id")); + } else { + embeddedWorks.add(workEntity); + } + } + + // Pick a linked one if any such exist, otherwise break off an embedded one + String baseWorkUri = null; + if (!linkedWorkURIs.isEmpty()) { + baseWorkUri = linkedWorkURIs.get(0); // TODO: Be a little smarter about _which_ work we pick? + } else { + Document newWork = new Document(embeddedWorks.get(0)); // TODO: Be a little smarter about _which_ work we break off? 
+ newWork.deepReplaceId(Document.getBASE_URI().toString() + IdGenerator.generate()); + newWork.setControlNumber(newWork.getShortId()); + whelk.createDocument(newWork, "xl", null, "auth", false); + baseWorkUri = newWork.getThingIdentifiers().get(0); + } + + return whelk.getStorage().loadDocumentByMainId(baseWorkUri); + } +} From ba10ba71e9bf5a7689706054c086dd4918aa3eb8 Mon Sep 17 00:00:00 2001 From: Jannis Mohlin Tsiroyannis Date: Wed, 2 Nov 2022 13:24:05 +0100 Subject: [PATCH 2/9] Do removal of orphaned works and transfer of mainEntity IDs within a transaction This is necessary because the fundamental data integrity rules do not allow two records to hold the same URI at the same time. In other words: 1. We can't first write URIs from the disappearing record to the remaining one, as that would mean both records holding the URIs at the same time (forbidden). 2. We can't delete the disappearing record first, and then write the URIs to the remaining one, because that would require holding the URIs in volatile memory only for a short while in between. If the process were to die or be cancelled during this window, the URIs would be permanently lost. Therefore the removal and transfer of URIs need to happen within one and the same transaction. 
--- .../whelk/importer/DatasetImporter.groovy | 4 ---- .../src/main/groovy/whelk/WorkMerging.java | 20 ++----------------- .../component/PostgreSQLComponent.groovy | 13 ++++++++++++ 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy index c9cfe318cc..c966ce8e3a 100644 --- a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy +++ b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy @@ -58,10 +58,6 @@ class DatasetImporter { Map aliasMap = [:] DatasetImporter(Whelk whelk, String datasetUri, Map flags=[:], String datasetDescPath=null) { - System.err.println("WARNING: Do not ever cancel an ongoing dataset loading operation using CTRL+C (or equivalent).\n"+ - "Doing so poses a danger because dataset loading may under some circumstances need to perform\n"+ - "operations that are not atomic, and loss of data can occur if the process is interrupted during\n"+ - "such an operation.") this.whelk = whelk this.datasetUri = datasetUri if (datasetDescPath != null) { diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging.java b/whelk-core/src/main/groovy/whelk/WorkMerging.java index 3747160792..c8a95f1088 100644 --- a/whelk-core/src/main/groovy/whelk/WorkMerging.java +++ b/whelk-core/src/main/groovy/whelk/WorkMerging.java @@ -19,12 +19,6 @@ public class WorkMerging { * outside. It also means that should the process be stopped halfway through, * results may look odd (but will still obey basic data integrity rules). * - * In the worst case scenario, if the process is interrupted just after the orphans - * have been deleted, but their sameAs-uris have not yet been moved to the merged - * work, those sameAs-uris will be lost. This risk cannot be avoided without compromising - * the URI integrity checks of the underlying code (two records are never allowed to - * have the same URI at the same time). 
- * * Returns the URI of the one remaining (or new) work that all of the instances * now link to. */ @@ -35,12 +29,10 @@ public static String mergeWorksOf(List instanceIDs, Whelk whelk) { Document baseWork = selectBaseWork(instances, whelk); String baseWorkUri = baseWork.getThingIdentifiers().get(0); - // Relink the instances and collect all work aliases + // Relink the instances Map linkEntity = new HashMap(); linkEntity.put("@id", baseWorkUri); - List workAlternateUris = new ArrayList<>(); for (Document instance : instances) { - workAlternateUris.addAll( instance.getThingIdentifiers() ); if (!instance.getWorkEntity().equals(linkEntity)) { // If not already linked to the correct record whelk.storeAtomicUpdate(instance.getShortId(), true, false, true, "xl", null, (Document doc) -> { doc.setWorkEntity(linkEntity); @@ -64,18 +56,10 @@ public static String mergeWorksOf(List instanceIDs, Whelk whelk) { && !workEntity.equals(linkEntity) && whelk.getStorage().getDependers(workId).isEmpty()) { String orphanID = whelk.getStorage().getSystemIdByIri((String)workEntity.get("@id")); - whelk.remove(orphanID, "xl", null); + whelk.getStorage().removeAndTransferMainEntityURIs(orphanID, baseWork.getShortId()); } } - // We must now save the baseWork a second time, to add all of the sameAs identifiers. - // These could not be added the first time, because they still belonged to other records - // that were not yet deleted. 
- whelk.storeAtomicUpdate(baseWork.getShortId(), true, false, true, "xl", null, (Document doc) -> { - for (String uri : workAlternateUris) - baseWork.addThingIdentifier(uri); - }); - return baseWorkUri; } diff --git a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy index 2b00823fc6..9c176d75cc 100644 --- a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy +++ b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy @@ -991,6 +991,19 @@ class PostgreSQLComponent { return doc } + public removeAndTransferMainEntityURIs(String removeID, String inheritsAliasesID) { + withDbConnection { + Connection connection = getMyConnection() + Document from = lockAndLoad(removeID, connection) + remove(from.getShortId(), "xl", null, false) + storeUpdate(inheritsAliasesID, true, false, true, "xl", null, { to -> + from.getThingIdentifiers().each { + to.addThingIdentifier(it) + } + }) + } + } + private Document lockAndLoad(String id, Connection connection) throws DocumentNotFoundException { PreparedStatement statement = null ResultSet resultSet = null From 7908be9151344097c02744184b5424d3a45b56d2 Mon Sep 17 00:00:00 2001 From: Jannis Mohlin Tsiroyannis Date: Wed, 2 Nov 2022 13:49:44 +0100 Subject: [PATCH 3/9] WIP merging works. 
--- .../src/main/groovy/whelk/WorkMerging.java | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging.java b/whelk-core/src/main/groovy/whelk/WorkMerging.java index c8a95f1088..1d7686567d 100644 --- a/whelk-core/src/main/groovy/whelk/WorkMerging.java +++ b/whelk-core/src/main/groovy/whelk/WorkMerging.java @@ -28,15 +28,17 @@ public static String mergeWorksOf(List instanceIDs, Whelk whelk) { Document baseWork = selectBaseWork(instances, whelk); String baseWorkUri = baseWork.getThingIdentifiers().get(0); + Map correctLinkEntity = new HashMap(); + correctLinkEntity.put("@id", baseWorkUri); - // Relink the instances - Map linkEntity = new HashMap(); - linkEntity.put("@id", baseWorkUri); + // Collect all already existing external works (different from our target) before relinking + List orphanIDs = new ArrayList<>(); for (Document instance : instances) { - if (!instance.getWorkEntity().equals(linkEntity)) { // If not already linked to the correct record - whelk.storeAtomicUpdate(instance.getShortId(), true, false, true, "xl", null, (Document doc) -> { - doc.setWorkEntity(linkEntity); - }); + Map workEntity = instance.getWorkEntity(); + if (workEntity.size() == 1 && !workEntity.equals(correctLinkEntity)) { + String workUri = (String) workEntity.get("@id"); + String workId = whelk.getStorage().getSystemIdByIri(workUri); + orphanIDs.add(workId); } } @@ -46,17 +48,22 @@ public static String mergeWorksOf(List instanceIDs, Whelk whelk) { // TODO MERGE HERE }); - // Cleanup no longer linked work records + // Relink the instances for (Document instance : instances) { - Map workEntity = instance.getWorkEntity(); - String workUri = (String) workEntity.get("@id"); - String workId = whelk.getStorage().getSystemIdByIri(workUri); - if (workEntity.size() == 1 - && workEntity.containsKey("@id") - && !workEntity.equals(linkEntity) - && whelk.getStorage().getDependers(workId).isEmpty()) { - String orphanID = 
whelk.getStorage().getSystemIdByIri((String)workEntity.get("@id")); + if (!instance.getWorkEntity().equals(correctLinkEntity)) { // If not already linked to the correct record + whelk.storeAtomicUpdate(instance.getShortId(), true, false, true, "xl", null, (Document doc) -> { + doc.setWorkEntity(correctLinkEntity); + }); + } + } + + // Cleanup no longer linked work records + for (String orphanID : orphanIDs) { + try { whelk.getStorage().removeAndTransferMainEntityURIs(orphanID, baseWork.getShortId()); + } catch (RuntimeException e) { + // Expected possible cause of exception: A new link was added to this work, _after_ we collected + // and relinked the instances of it. In this (theoretical) case, just leave the old work in place. } } From 74784f6dbca2edfb769d3df549a02ab4bd87ed8c Mon Sep 17 00:00:00 2001 From: Jannis Mohlin Tsiroyannis Date: Mon, 7 Nov 2022 12:44:42 +0100 Subject: [PATCH 4/9] WIP: work merging by dataset loader. --- .../whelk/importer/DatasetImporter.groovy | 45 +++++++++--- .../src/main/groovy/whelk/WorkMerging.java | 72 ++++++++++++++----- 2 files changed, 91 insertions(+), 26 deletions(-) diff --git a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy index c966ce8e3a..9bd99cd915 100644 --- a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy +++ b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy @@ -2,6 +2,8 @@ package whelk.importer import groovy.util.logging.Log4j2 as Log import groovy.transform.CompileStatic +import whelk.WorkMerging + import static groovy.transform.TypeCheckingMode.SKIP import whelk.Document @@ -113,15 +115,23 @@ class DatasetImporter { Document incomingDoc = completeRecord(data, recordType, true) idsInInput.add(incomingDoc.getShortId()) - // This race condition should be benign. 
If there is a document with - // the same ID created in between the check and the creation, we'll - // get an exception and fail early (unfortunate but acceptable). - switch (createOrUpdateDocument(incomingDoc)) { - case WRITE_RESULT.CREATED: - createdCount++; - break; - case WRITE_RESULT.UPDATED: - updatedCount++; + if (data.get("@type") != null && + whelk.getJsonld().isSubClassOf( incomingDoc.getThingType(), "Work" )) { + + createOrUpdateWork(incomingDoc) + + } else { // Not a work + + // This race condition should be benign. If there is a document with + // the same ID created in between the check and the creation, we'll + // get an exception and fail early (unfortunate but acceptable). + switch (createOrUpdateDocument(incomingDoc)) { + case WRITE_RESULT.CREATED: + createdCount++; + break; + case WRITE_RESULT.UPDATED: + updatedCount++; + } } if ( lineCount % 100 == 0 ) { @@ -286,8 +296,8 @@ class DatasetImporter { } private WRITE_RESULT createOrUpdateDocument(Document incomingDoc) { - Document storedDoc = whelk.getDocument(incomingDoc.getShortId()) WRITE_RESULT result + Document storedDoc = whelk.getDocument(incomingDoc.getShortId()) if (storedDoc != null) { if (whelk.storeAtomicUpdate(incomingDoc.getShortId(), true, false, refreshDependers, "xl", null, { doc -> doc.data = incomingDoc.data @@ -300,9 +310,24 @@ class DatasetImporter { whelk.createDocument(incomingDoc, "xl", null, collection, false) result = WRITE_RESULT.CREATED } + return result } + private void createOrUpdateWork(Document incomingWork) { + List bibIDs = [] + List graphList = incomingWork.data.get("@graph") + Map mainEntity = graphList[1] + mainEntity.get("@reverse", [:]).get("instanceOf", []).each { bib -> + String instanceID = whelk.getStorage().getSystemIdByIri( (String) bib["@id"] ) + if (instanceID != null) + bibIDs.add(instanceID) + } + if (!bibIDs.isEmpty()) { + WorkMerging.mergeWorksOf(bibIDs, [incomingWork], whelk) + } + } + private long removeDeleted(Set idsInInput, List needsRetry) { 
// Clear out anything that was previously stored in this dataset, but was not in the in-data now. // If faced with "can't delete depended on stuff", retry again later, after more other deletes have diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging.java b/whelk-core/src/main/groovy/whelk/WorkMerging.java index 1d7686567d..3435bc43b0 100644 --- a/whelk-core/src/main/groovy/whelk/WorkMerging.java +++ b/whelk-core/src/main/groovy/whelk/WorkMerging.java @@ -2,17 +2,14 @@ import whelk.component.PostgreSQLComponent; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; public class WorkMerging { /** * Merge the works of all listed instances into one. The listed instances * may or may not have external works already. Orphaned work records will be - * deleted. + * deleted. Extra (previously unsaved) works may optionally be supplied. * * This is _not_ one atomic operation, but rather a series of operations. * This means that it is possible to observe the process halfway though from the @@ -22,11 +19,11 @@ public class WorkMerging { * Returns the URI of the one remaining (or new) work that all of the instances * now link to. */ - public static String mergeWorksOf(List instanceIDs, Whelk whelk) { + public static String mergeWorksOf(List instanceIDs, List extraWorks, Whelk whelk) { List instances = collectInstancesOfThisWork(instanceIDs, whelk); - Document baseWork = selectBaseWork(instances, whelk); + Document baseWork = selectBaseWork(instances, extraWorks, whelk); String baseWorkUri = baseWork.getThingIdentifiers().get(0); Map correctLinkEntity = new HashMap(); correctLinkEntity.put("@id", baseWorkUri); @@ -42,11 +39,13 @@ public static String mergeWorksOf(List instanceIDs, Whelk whelk) { } } + System.err.println("**** SELECTED BASE: " + baseWork.getThingIdentifiers().get(0)); + // Merge other works into the baseWork. 
This must be done first, before any orphans can be deleted, // or we risk loosing data if the process is interrupted. - whelk.storeAtomicUpdate(baseWork.getShortId(), true, false, true, "xl", null, (Document doc) -> { + /*whelk.storeAtomicUpdate(baseWork.getShortId(), true, false, true, "xl", null, (Document doc) -> { // TODO MERGE HERE - }); + });*/ // Relink the instances for (Document instance : instances) { @@ -99,7 +98,7 @@ private static List collectInstancesOfThisWork(List instanceID * Select (or create+save) a work record that should be used going forward for * all of the passed instances. */ - private static Document selectBaseWork(List instances, Whelk whelk) { + private static Document selectBaseWork(List instances, List extraWorks, Whelk whelk) { // Find all the works List linkedWorkURIs = new ArrayList<>(); List embeddedWorks = new ArrayList<>(); @@ -112,16 +111,57 @@ private static Document selectBaseWork(List instances, Whelk whelk) { } } - // Pick a linked one if any such exist, otherwise break off an embedded one + // Order of priority: + // 1. Any pre existing linked work records + // 2. Any supplied extra works + // 3. Any embedded work from one of the instances + + // Pick a linked one if any such exist (1) String baseWorkUri = null; if (!linkedWorkURIs.isEmpty()) { baseWorkUri = linkedWorkURIs.get(0); // TODO: Be a little smarter about _which_ work we pick? - } else { - Document newWork = new Document(embeddedWorks.get(0)); // TODO: Be a little smarter about _which_ work we break off? 
- newWork.deepReplaceId(Document.getBASE_URI().toString() + IdGenerator.generate()); - newWork.setControlNumber(newWork.getShortId()); + } else if(!extraWorks.isEmpty()) { // Any supplied extra work (2) + Document selectedWork = extraWorks.get(0); + + String slug = IdGenerator.generate(); + String recordId = Document.getBASE_URI().toString() + slug; + String mainEntityId = recordId + "#it"; + Document._set(Document.getRecordIdPath(), recordId, selectedWork.data); + Document._set(Document.getThingIdPath(), mainEntityId, selectedWork.data); + Document._set(Document.getThingIdPath2(), mainEntityId, selectedWork.data); + + ((Map)(((List)selectedWork.data.get("@graph")).get(1))).remove("@reverse"); // ugh + + whelk.createDocument(selectedWork, "xl", null, "auth", false); + baseWorkUri = selectedWork.getThingIdentifiers().get(0); + } else { // Otherwise break off an embedded one (3) + String slug = IdGenerator.generate(); + String recordId = Document.getBASE_URI().toString() + slug; + String mainEntityId = recordId + "#it"; + + Map chosenEmbedded = embeddedWorks.get(0); // TODO: Be a little smarter about _which_ work we break off? + + Map docMap = new HashMap(); + List graph = new ArrayList(); + Map record = new HashMap(); + docMap.put("@graph", graph); + + graph.add(record); + record.put("@id", Document.getBASE_URI().toString() + slug); + record.put("@type", "Record"); + Map mainEntityLink = new HashMap(); + mainEntityLink.put("@id", mainEntityId); + record.put("mainEntity", mainEntityLink); + + graph.add(chosenEmbedded); + chosenEmbedded.put("@id", mainEntityId); + + Document newWork = new Document(docMap); + newWork.setControlNumber(slug); + newWork.setGenerationDate(new Date()); + //newWork.setGenerationProcess("https://id.kb.se/datasetimporter"); // TODO: KOLLA MED FORMAT!! 
whelk.createDocument(newWork, "xl", null, "auth", false); - baseWorkUri = newWork.getThingIdentifiers().get(0); + baseWorkUri = mainEntityId; } return whelk.getStorage().loadDocumentByMainId(baseWorkUri); From b27d41425c2aa2d0553bc2b23211f8e25f2fe6ba Mon Sep 17 00:00:00 2001 From: Jannis Mohlin Tsiroyannis Date: Mon, 7 Nov 2022 13:19:25 +0100 Subject: [PATCH 5/9] WIP work merging. Can't use new IDs, or we lose syncability of datasets. --- whelk-core/src/main/groovy/whelk/WorkMerging.java | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging.java b/whelk-core/src/main/groovy/whelk/WorkMerging.java index 3435bc43b0..c1f67eef3c 100644 --- a/whelk-core/src/main/groovy/whelk/WorkMerging.java +++ b/whelk-core/src/main/groovy/whelk/WorkMerging.java @@ -123,15 +123,8 @@ private static Document selectBaseWork(List instances, List } else if(!extraWorks.isEmpty()) { // Any supplied extra work (2) Document selectedWork = extraWorks.get(0); - String slug = IdGenerator.generate(); - String recordId = Document.getBASE_URI().toString() + slug; - String mainEntityId = recordId + "#it"; - Document._set(Document.getRecordIdPath(), recordId, selectedWork.data); - Document._set(Document.getThingIdPath(), mainEntityId, selectedWork.data); - Document._set(Document.getThingIdPath2(), mainEntityId, selectedWork.data); - ((Map)(((List)selectedWork.data.get("@graph")).get(1))).remove("@reverse"); // ugh - + whelk.createDocument(selectedWork, "xl", null, "auth", false); baseWorkUri = selectedWork.getThingIdentifiers().get(0); } else { // Otherwise break off an embedded one (3) From 0b01c79d0e744ccddad4d7b558f49dfd3ba59609 Mon Sep 17 00:00:00 2001 From: Jannis Mohlin Tsiroyannis Date: Mon, 7 Nov 2022 13:41:29 +0100 Subject: [PATCH 6/9] Track updates/creates for work records too. 
--- .../whelk/importer/DatasetImporter.groovy | 9 ++++- .../src/main/groovy/whelk/WorkMerging.java | 34 ++++++++++++------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy index 9bd99cd915..078a6f6985 100644 --- a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy +++ b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy @@ -99,6 +99,7 @@ class DatasetImporter { long updatedCount = 0 long createdCount = 0 + long lineCount = 1 // The data sets self describing first record also counts. boolean first = true @@ -118,7 +119,13 @@ class DatasetImporter { if (data.get("@type") != null && whelk.getJsonld().isSubClassOf( incomingDoc.getThingType(), "Work" )) { - createOrUpdateWork(incomingDoc) + switch (createOrUpdateWork(incomingDoc)) { + case WorkMerging.WRITE_RESULT.CREATED: + createdCount++; + break; + case WorkMerging.WRITE_RESULT.UPDATED: + updatedCount++; + } } else { // Not a work diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging.java b/whelk-core/src/main/groovy/whelk/WorkMerging.java index c1f67eef3c..6d9fb2e6d0 100644 --- a/whelk-core/src/main/groovy/whelk/WorkMerging.java +++ b/whelk-core/src/main/groovy/whelk/WorkMerging.java @@ -1,11 +1,20 @@ package whelk; -import whelk.component.PostgreSQLComponent; - import java.util.*; public class WorkMerging { + public enum WRITE_RESULT { + ALREADY_UP_TO_DATE, + UPDATED, + CREATED + } + + // No proper pointers or multiple return values in Java :( + private static class WriteResultReference { + public WRITE_RESULT result = WRITE_RESULT.ALREADY_UP_TO_DATE; + } + /** * Merge the works of all listed instances into one. The listed instances * may or may not have external works already. Orphaned work records will be @@ -15,15 +24,14 @@ public class WorkMerging { * This means that it is possible to observe the process halfway though from the * outside. 
It also means that should the process be stopped halfway through, * results may look odd (but will still obey basic data integrity rules). - * - * Returns the URI of the one remaining (or new) work that all of the instances - * now link to. */ - public static String mergeWorksOf(List instanceIDs, List extraWorks, Whelk whelk) { + public static WRITE_RESULT mergeWorksOf(List instanceIDs, List extraWorks, Whelk whelk) { + + WriteResultReference result = new WriteResultReference(); List instances = collectInstancesOfThisWork(instanceIDs, whelk); - Document baseWork = selectBaseWork(instances, extraWorks, whelk); + Document baseWork = selectBaseWork(instances, extraWorks, result, whelk); String baseWorkUri = baseWork.getThingIdentifiers().get(0); Map correctLinkEntity = new HashMap(); correctLinkEntity.put("@id", baseWorkUri); @@ -39,12 +47,10 @@ public static String mergeWorksOf(List instanceIDs, List extra } } - System.err.println("**** SELECTED BASE: " + baseWork.getThingIdentifiers().get(0)); - // Merge other works into the baseWork. This must be done first, before any orphans can be deleted, // or we risk loosing data if the process is interrupted. /*whelk.storeAtomicUpdate(baseWork.getShortId(), true, false, true, "xl", null, (Document doc) -> { - // TODO MERGE HERE + // TODO MERGE HERE AND DONT FORGET TO SET result.result IF ANYTHING CHANGES! });*/ // Relink the instances @@ -66,7 +72,7 @@ public static String mergeWorksOf(List instanceIDs, List extra } } - return baseWorkUri; + return result.result; } /** @@ -98,7 +104,7 @@ private static List collectInstancesOfThisWork(List instanceID * Select (or create+save) a work record that should be used going forward for * all of the passed instances. 
*/ - private static Document selectBaseWork(List instances, List extraWorks, Whelk whelk) { + private static Document selectBaseWork(List instances, List extraWorks, WriteResultReference result, Whelk whelk) { // Find all the works List linkedWorkURIs = new ArrayList<>(); List embeddedWorks = new ArrayList<>(); @@ -126,6 +132,7 @@ private static Document selectBaseWork(List instances, List ((Map)(((List)selectedWork.data.get("@graph")).get(1))).remove("@reverse"); // ugh whelk.createDocument(selectedWork, "xl", null, "auth", false); + result.result = WRITE_RESULT.CREATED; baseWorkUri = selectedWork.getThingIdentifiers().get(0); } else { // Otherwise break off an embedded one (3) String slug = IdGenerator.generate(); @@ -152,8 +159,9 @@ private static Document selectBaseWork(List instances, List Document newWork = new Document(docMap); newWork.setControlNumber(slug); newWork.setGenerationDate(new Date()); - //newWork.setGenerationProcess("https://id.kb.se/datasetimporter"); // TODO: KOLLA MED FORMAT!! + //newWork.setGenerationProcess("https://id.kb.se/workmerger"); // TODO: KOLLA MED FORMAT!! whelk.createDocument(newWork, "xl", null, "auth", false); + result.result = WRITE_RESULT.CREATED; baseWorkUri = mainEntityId; } From 61da6d6e81fba6edcc3b6d5bde5f4147bac5ed66 Mon Sep 17 00:00:00 2001 From: Jannis Mohlin Tsiroyannis Date: Mon, 7 Nov 2022 13:51:27 +0100 Subject: [PATCH 7/9] Cleanup --- importers/src/main/groovy/whelk/importer/DatasetImporter.groovy | 1 - 1 file changed, 1 deletion(-) diff --git a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy index 078a6f6985..3fddb229c8 100644 --- a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy +++ b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy @@ -99,7 +99,6 @@ class DatasetImporter { long updatedCount = 0 long createdCount = 0 - long lineCount = 1 // The data sets self describing first record also counts. 
boolean first = true From 8758d0fad8843d28ae70ff7676a5a0cc6d47f91a Mon Sep 17 00:00:00 2001 From: Jannis Mohlin Tsiroyannis Date: Tue, 8 Nov 2022 14:53:27 +0100 Subject: [PATCH 8/9] Move already existing work merging code in from an experimental branch The code in question was taken as is from commit 3d2882cce and moved into whelk-core. --- .../whelk/importer/DatasetImporter.groovy | 3 +- .../whelk/WorkMerging/DisjointSets.java | 220 +++++ .../main/groovy/whelk/WorkMerging/Doc.groovy | 363 ++++++++ .../whelk/WorkMerging/DocumentComparator.java | 231 +++++ .../whelk/WorkMerging/FieldStatus.groovy | 7 + .../main/groovy/whelk/WorkMerging/Html.groovy | 111 +++ .../main/groovy/whelk/WorkMerging/Util.groovy | 306 +++++++ .../whelk/WorkMerging/WorkComparator.groovy | 136 +++ .../whelk/{ => WorkMerging}/WorkMerging.java | 6 +- .../whelk/WorkMerging/WorkToolJob.groovy | 797 ++++++++++++++++++ .../WorkMerging/compare/Classification.groovy | 63 ++ .../whelk/WorkMerging/compare/Default.groovy | 13 + .../whelk/WorkMerging/compare/Extent.groovy | 15 + .../WorkMerging/compare/FieldHandler.groovy | 12 + .../WorkMerging/compare/GenreForm.groovy | 36 + .../WorkMerging/compare/SameOrEmpty.groovy | 21 + .../whelk/WorkMerging/compare/StuffSet.groovy | 38 + .../whelk/WorkMerging/compare/Subject.groovy | 8 + .../WorkMerging/compare/TranslationOf.groovy | 22 + .../WorkMerging/compare/WorkTitle.groovy | 23 + 20 files changed, 2428 insertions(+), 3 deletions(-) create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/DisjointSets.java create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/Doc.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/DocumentComparator.java create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/FieldStatus.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/Html.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/Util.groovy create mode 100644 
whelk-core/src/main/groovy/whelk/WorkMerging/WorkComparator.groovy rename whelk-core/src/main/groovy/whelk/{ => WorkMerging}/WorkMerging.java (98%) create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/WorkToolJob.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/Classification.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/Default.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/Extent.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/FieldHandler.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/GenreForm.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/SameOrEmpty.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/StuffSet.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/Subject.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/TranslationOf.groovy create mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/WorkTitle.groovy diff --git a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy index 3fddb229c8..394037cf73 100644 --- a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy +++ b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy @@ -2,7 +2,7 @@ package whelk.importer import groovy.util.logging.Log4j2 as Log import groovy.transform.CompileStatic -import whelk.WorkMerging +import whelk.WorkMerging.WorkMerging import static groovy.transform.TypeCheckingMode.SKIP @@ -11,7 +11,6 @@ import whelk.JsonLd import whelk.TargetVocabMapper import whelk.Whelk import whelk.converter.TrigToJsonLdParser -import whelk.exception.CancelUpdateException import whelk.util.DocumentUtil import static whelk.util.LegacyIntegrationTools.NO_MARC_COLLECTION diff --git 
a/whelk-core/src/main/groovy/whelk/WorkMerging/DisjointSets.java b/whelk-core/src/main/groovy/whelk/WorkMerging/DisjointSets.java new file mode 100644 index 0000000000..7cd3e33a6c --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/DisjointSets.java @@ -0,0 +1,220 @@ +package whelk.WorkMerging; + +import java.util.List; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +/** + * This class keeps track of a set of disjoint (non-overlapping) sets. + */ +public class DisjointSets { + /** + * Sets as forest of rooted trees. + * Pointer to parent in tree, root points to itself. + */ + List forest; + + /** + * Rank of each tree node (keeps trees balanced when merging). + */ + List ranks; + + /** + * Sets as circular linked lists (so that we can find all elements in a set). + * Pointer to the next element in the set. + */ + List sets; + + /** + * Map from set element value to index + */ + Map ixs; + + /** + * Map from set element index to value + */ + List ixToValue; + + public DisjointSets(int initialCapacity) { + forest = new ArrayList<>(initialCapacity); + ranks = new ArrayList<>(initialCapacity); + sets = new ArrayList<>(initialCapacity); + ixs = new HashMap<>(initialCapacity); + ixToValue = new ArrayList<>(initialCapacity); + } + + public DisjointSets() { + this(20); + } + + /** + * Create a new set if it doesn't already exist. 
+ * + * @param e initial element in set + */ + public void createSet(T e) { + if (ixs.containsKey(e)) { + return; + } + + int ix = forest.size(); + ixs.put(e, ix); + forest.add(ix); + ranks.add(0); + sets.add(ix); + ixToValue.add(e); + + if (ix == Integer.MAX_VALUE) { + throw new IllegalStateException("size > Integer.MAX_VALUE"); + } + } + + /** + * Add a set, merging it with existing intersecting sets + * + * @param set a set to be added + */ + public void addSet(Iterable set) { + Iterator i = set.iterator(); + if (!i.hasNext()) { + return; + } + + T first = i.next(); + while (i.hasNext()) { + mergeSets(first, i.next()); + } + } + + /** + * Merge two sets identified by elements. + * Sets will be created if they don't exist + * + * @param a an element of the first set + * @param b an element of the second set + */ + public void mergeSets(T a, T b) { + if (!ixs.containsKey(a)) { + createSet(a); + } + if (!ixs.containsKey(b)) { + createSet(b); + } + + int ixA = ixs.get(a); + int ixB = ixs.get(b); + + int rootA = root(ixA); + int rootB = root(ixB); + + if (rootA == rootB) { + return; + } + + int rankA = ranks.get(rootA); + int rankB = ranks.get(rootB); + + if (rankA > rankB) { + forest.set(rootB, rootA); + } else { + forest.set(rootA, rootB); + if (rankA == rankB) { + ranks.set(rootB, rankB + 1); + } + } + + int link = sets.get(rootA); + sets.set(rootA, sets.get(rootB)); + sets.set(rootB, link); + } + + /** + * Lookup a set based on an element in the set + * + * @param e an element in the set + * @return the set + */ + public Set getSet(T e) { + if (!ixs.containsKey(e)) { + throw new IllegalArgumentException("No set with element: " + e); + } + + Set result = new HashSet<>(); + int start = sets.get(ixs.get(e)); + int node = start; + do { + result.add(ixToValue.get(node)); + node = sets.get(node); + } while (node != start); + + return result; + } + + /** + * Iterate over all sets + * + * @param visitor + */ + public void iterateAllSets(SetVisitor visitor) { + boolean[] 
visited = new boolean[sets.size()]; + + for (int ix : sets) { + if (visited[ix]) { + continue; + } + + int start = sets.get(ix); + int node = start; + do { + visited[node] = true; + visitor.nextElement(ixToValue.get(node)); + node = sets.get(node); + } while (node != start); + + visitor.closeSet(); + } + } + + /** + * @return a set with all sets + */ + public Set> allSets() { + final Set> result = new HashSet<>(); + + iterateAllSets(new SetVisitor() { + Set current = new HashSet<>(); + + public void closeSet() { + result.add(current); + current = new HashSet<>(); + } + + public void nextElement(T e) { + current.add(e); + } + }); + + return result; + } + + private int root(int node) { + while (node != forest.get(node)) { + int parent = forest.get(node); + //path splitting - point node to grandparent + forest.set(node, forest.get(parent)); + node = parent; + } + + return node; + } + + public interface SetVisitor { + void nextElement(T e); + + void closeSet(); + } +} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/Doc.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/Doc.groovy new file mode 100644 index 0000000000..c50a7abf85 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/Doc.groovy @@ -0,0 +1,363 @@ +package whelk.WorkMerging + +import se.kb.libris.Normalizers +import whelk.Document +import whelk.JsonLd +import whelk.Whelk + +import static whelk.WorkMerging.Util.asList + +class Doc { + public static final String SAOGF_SKÖN = 'https://id.kb.se/term/saogf/Sk%C3%B6nlitteratur' + public static final List MARC_FICTION = [ + 'https://id.kb.se/marc/FictionNotFurtherSpecified', + 'https://id.kb.se/marc/Drama', + 'https://id.kb.se/marc/Essay', + 'https://id.kb.se/marc/Novel', + 'https://id.kb.se/marc/HumorSatiresEtc', + 'https://id.kb.se/marc/Letter', + 'https://id.kb.se/marc/ShortStory', + 'https://id.kb.se/marc/MixedForms', + 'https://id.kb.se/marc/Poetry', + ] + public static final List MARC_NOT_FICTION = [ + 
'https://id.kb.se/marc/NotFictionNotFurtherSpecified', + 'https://id.kb.se/marc/Biography' + ] + public static final List DRAMA_GF = [ + 'https://id.kb.se/term/saogf/Dramatik', + 'https://id.kb.se/marc/Drama' + ] + + Whelk whelk + Document doc + Map work + Map framed + List titles + + //FIXME + Document ogDoc + + Doc(Whelk whelk, Document doc) { + this.whelk = whelk + this.doc = doc + this.ogDoc = doc.clone() + } + + Map getWork() { + if (!work) { + work = getWork(whelk, doc) + } + + return work + } + + static Map getWork(Whelk whelk, Document d) { + Map work = Normalizers.getWork(whelk.jsonld, d) + if (!work) { + throw new NoWorkException(d.shortId) + } + work = new HashMap<>(work) + + //TODO 'marc:fieldref' + + work.remove('@id') + return work + } + + Map workCopy() { + return getWork(whelk, doc.clone()) + } + + Map getMainEntity() { + return doc.data['@graph'][1] + } + + boolean isInstance() { + return getMainEntity().containsKey('instanceOf') + } + + List getTitleVariants() { + if (!titles) { + titles = Util.getTitleVariants(getMainEntity()['hasTitle']) + } + + return titles + } + + boolean hasGenericTitle() { + Util.hasGenericTitle(getMainEntity()['hasTitle']) + } + + private static String displayTitle(Map thing) { + thing['hasTitle'].collect { it['@type'] + ": " + it['flatTitle'] }.join(', ') + } + + String mainEntityDisplayTitle() { + displayTitle(['hasTitle': Util.flatTitles(getMainEntity()['hasTitle'])]) + } + + String link() { + String base = Document.getBASE_URI().toString() + String kat = "katalogisering/" + String id = doc.shortId + return base + kat + id + } + + boolean isMonograph() { + getMainEntity()['issuanceType'] == 'Monograph' + } + + boolean hasPart() { + getWork()['hasPart'] != null + } + + String encodingLevel() { + return doc.data['@graph'][0]['encodingLevel'] ?: '' + } + + int numPages() { + String extent = Util.getPathSafe(getMainEntity(), ['extent', 0, 'label', 0]) ?: Util.getPathSafe(getMainEntity(), ['extent', 0, 'label'], '') + return 
numPages(extent) + } + + // TODO: improve parsing https://metadatabyran.kb.se/beskrivning/materialtyper-arbetsfloden/tryckta-monografier/omfang-for-tryckta-monografier + static int numPages(String extentLabel) { + def l = extentLabel.replace('onumrerade', '') + def matcher = l =~ /(\d+)(?=[, \[\]0-9]*[sp])/ + List pages = [] + while (matcher.find()) { + pages << Integer.parseInt(matcher.group(1)) + } + pages ? pages.max() : -1 + } + + // TODO... + String getDisplayText(String field) { + if (field == 'contribution') { + return contributorStrings().join("
") + } else if (field == 'classification') { + return classificationStrings().join("
") + } else if (field == 'instance title') { + return isInstance() ? (getMainEntity()['hasTitle'] ?: '') : '' + } else if (field == 'work title') { + // To load hasTitle from linked work in instanceOf we can use getFramed() + // However we then need to handle that getFramed() loads linked instances in hasTitle.source + // Prefer getMainEntity() for now + return isInstance() ? (getMainEntity()['instanceOf']['hasTitle'] ?: '') : (getMainEntity()['hasTitle'] ?: '') + } else if (field == 'instance type') { + return isInstance() ? getMainEntity()['@type'] : '' + } else if (field == 'editionStatement') { + return getMainEntity()['editionStatement'] ?: '' + } else if (field == 'responsibilityStatement') { + return getMainEntity()['responsibilityStatement'] ?: '' + } else if (field == 'encodingLevel') { + return encodingLevel() + } else if (field == 'publication') { + return chipString(getMainEntity()['publication'] ?: []) + } else if (field == 'identifiedBy') { + return chipString(getMainEntity()['identifiedBy'] ?: []) + } else if (field == 'extent') { + return chipString(getMainEntity()['extent'] ?: []) + } else if (field == 'reproductionOf') { + return reproductionOfLink() + } else { + return chipString(getWork().getOrDefault(field, [])) + } + } + + protected String chipString(def thing) { + Util.chipString(thing, whelk) + } + + String tooltip(String string, String tooltip) { + """${string}""" + } + + private String reproductionOfLink() { + def shortId = Util.getPathSafe(getMainEntity(), ['reproductionOf', '@id']) + ?.tokenize("/#") + ?.dropRight(1) + ?.last() ?: '' + + return "$shortId" + } + + private List classificationStrings() { + List path = isInstance() ? ['instanceOf', 'classification'] : ['classification'] + List classification = Util.getPathSafe(getFramed(), path, []) + classification.collect() { c -> + StringBuilder s = new StringBuilder() + s.append(flatMaybeLinked(c['inScheme'], ['code', 'version']).with { it.isEmpty() ? 
it : it + ': ' }) + s.append(flatMaybeLinked(c, ['code'])) + return s.toString() + } + } + + private List contributorStrings() { + List path = isInstance() ? ['instanceOf', 'contribution'] : ['contribution'] + List contribution = Util.getPathSafe(getFramed(), path, []) + + return contribution.collect { Map c -> + contributionStr(c) + } + } + + protected Map getFramed() { + if (!framed) { + if (isInstance()) { + framed = JsonLd.frame(doc.getThingIdentifiers().first(), whelk.loadEmbellished(doc.shortId).data) + } else { + Document copy = doc.clone() + whelk.embellish(copy) + framed = JsonLd.frame(doc.getThingIdentifiers().first(), copy.data) + } + } + + return framed + } + + private String contributionStr(Map contribution) { + StringBuilder s = new StringBuilder() + + if (contribution['@type'] == 'PrimaryContribution') { + s.append('') + } + + s.append(flatMaybeLinked(contribution['role'], ['code', 'label']).with { it.isEmpty() ? it : it + ': ' }) + s.append(flatMaybeLinked(contribution['agent'], ['givenName', 'familyName', 'lifeSpan', 'name'])) + + if (contribution['@type'] == 'PrimaryContribution') { + s.append('') + } + + return s.toString() + } + + static String flatten(Object o, List order, String mapSeparator = ': ') { + if (o instanceof String) { + return o + } + if (o instanceof List) { + return o + .collect { flatten(it, order) } + .join(' || ') + } + if (o instanceof Map) { + return order + .findResults { ((Map) o).get(it) } + .collect { flatten(it, order) } + .join(mapSeparator) + } + + throw new RuntimeException(String.format("unexpected type: %s for %s", o.class.getName(), o)) + } + + private String flatMaybeLinked(Object thing, List order) { + if (!thing) + return '' + + if (thing instanceof List) { + return thing.collect { flatMaybeLinked(it, order) }.join(' | ') + } + String s = flatten(thing, order, ', ') + + thing['@id'] + ? 
"""$s""" + : s + } + + boolean isFiction() { + isMarcFiction() || isSaogfFiction() || isSabFiction() + } + + boolean isMarcFiction() { + (getWork()['genreForm'] ?: []).any { it['@id'] in MARC_FICTION } + } + + boolean isMarcNotFiction() { + (getWork()['genreForm'] ?: []).any { it['@id'] in MARC_NOT_FICTION } + } + + boolean isSaogfFiction() { + (getWork()['genreForm'] ?: []).any { whelk.relations.isImpliedBy(SAOGF_SKÖN, it['@id'] ?: '') } + } + + boolean isSabFiction() { + classificationStrings().any { it.contains('kssb') && it.contains(': H') } + } + + boolean isNotFiction() { + // A lot of fiction has marc/NotFictionNotFurtherSpecified but then classification is usually empty + isMarcNotFiction() && (!classificationStrings().isEmpty() && !isSabFiction()) + } + + boolean isText() { + getWork()['@type'] == 'Text' + } + + boolean isTranslationWithoutTranslator() { + isTranslation() && !hasTranslator() + } + + boolean isTranslation() { + getWork()['translationOf'] + } + + boolean isSabDrama() { + classificationStrings().any { it.contains(': Hc.02') || it.contains(': Hce.02') } + } + + boolean isGfDrama() { + asList(getWork()['genreForm']).any { it['@id'] in DRAMA_GF } + } + + boolean isDrama() { + isSabDrama() || isGfDrama() + } + + boolean hasRole(String relatorIri) { + asList(getWork()['contribution']).any { + asList(it['role']).contains(['@id': relatorIri]) + } + } + + boolean hasTranslator() { + hasRole('https://id.kb.se/relator/translator') + } + + boolean hasDistinguishingEdition() { + (getMainEntity()['editionStatement'] ?: '').toString().toLowerCase().contains("förk") + } + + boolean hasRelationshipWithContribution() { + asList(getWork()['relationship']).any { r -> + asList(r['entity']).any { e -> + e.containsKey('contribution') + } + } + } + + void addComparisonProps() { + if (hasDistinguishingEdition()) { + addToWork('editionStatement') + } + getWork()['_numPages'] = numPages() + } + + void moveSummaryToInstance() { + if (getWork()['summary']) { + 
getMainEntity()['summary'] = asList(getMainEntity()['summary']) + asList(getWork()['summary']) + getWork().remove('summary') + } + } + + void addToWork(String field) { + getWork()[field] = getMainEntity()[field] + } + + void removeComparisonProps() { + getWork().remove('editionStatement') + getWork().remove('_numPages') + } +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/DocumentComparator.java b/whelk-core/src/main/groovy/whelk/WorkMerging/DocumentComparator.java new file mode 100644 index 0000000000..2adb902b0b --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/DocumentComparator.java @@ -0,0 +1,231 @@ +package whelk.WorkMerging; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Stack; +import java.util.function.Function; + +public class DocumentComparator { + private static final Comparator BY_HASH = (o1, o2) -> o2.hashCode() - o1.hashCode(); + + private final Function isOrderedList; + + public DocumentComparator() { + this(o -> "termComponentList".equals(o)); + } + + public DocumentComparator(Function isOrderedList) { + if (isOrderedList == null) + throw new NullPointerException(); + this.isOrderedList = isOrderedList; + } + + public boolean isEqual(Map a, Map b) { + if (a == null || b == null || a.size() != b.size()) { + return false; + } + for (Object key : a.keySet()) { + if (!isEqual(a.get(key), b.get(key), key)) { + return false; + } + } + return true; + } + + private boolean isEqual(Object a, Object b, Object key) { + if (a == null || b == null) { + return false; + } + else if (a.getClass() != b.getClass()) { + return (isSingleItemList(a) && isEqual(((List) a).get(0), b, key) + || (isSingleItemList(b) && isEqual(a, ((List) b).get(0), key))); + } + else if (a instanceof Map) { + return isEqual((Map) a, (Map) b); + } + else if (a instanceof List) { + if (isOrderedList.apply(key)) { + return isEqualOrdered((List) a, (List) b); + 
} else { + return isEqualUnordered((List) a, (List) b); + } + } + else { + return a.equals(b); + } + } + + private boolean isSingleItemList(Object o) { + return o instanceof List && ((List) o).size() == 1; + } + + private boolean isEqualOrdered(List a, List b) { + if (a.size() != b.size()) { + return false; + } + for (int i = 0; i < a.size(); i++) { + if (!isEqual(a.get(i), b.get(i), null)) { + return false; + } + } + return true; + } + + private boolean isEqualUnordered(List a, List b) { + if (a.size() != b.size()) { + return false; + } + + a.sort(BY_HASH); + b.sort(BY_HASH); + + List taken = new ArrayList<>(a.size()); + nextA: for (int i = 0 ; i < a.size() ; i++) { + for (int j = 0 ; j < b.size() ; j++) { + if (!taken.contains(j) && isEqual(a.get(i), b.get(j), null)) { + taken.add(j); + continue nextA; + } + } + return false; + } + + return true; + } + + public boolean isSubset(Map a, Map b) { + if (a == null || b == null || a.size() > b.size()) { + return false; + } + for (Object key : a.keySet()) { + if (!isSubset(a.get(key), b.get(key), key)) { + return false; + } + } + return true; + } + + private boolean isSubset(Object a, Object b, Object key) { + if (a == null || b == null || a.getClass() != b.getClass()) { + return false; + } + else if (a instanceof Map) { + return isSubset((Map) a, (Map) b); + } + else if (a instanceof List) { + if (isOrderedList.apply(key)) { + return isOrderedSubset((List) a, (List) b); + } else { + return isUnorderedSubset((List) a, (List) b); + } + } + else { + return a.equals(b); + } + } + + private boolean isOrderedSubset(List a, List b) { + if (a.size() > b.size()) { + return false; + } + int ixB = 0; + for (int ixA = 0; ixA < a.size(); ixA++) { + if (ixB == b.size()) { + return false; + } + + while (!isSubset(a.get(ixA), b.get(ixB++), null)) { + if (ixB == b.size()) { + return false; + } + } + } + return true; + } + + private boolean isUnorderedSubset(List a, List b) { + return new UnorderedListComparator(a, b).isSubset(); + } + 
+ private class UnorderedListComparator { + List a; + List b; + + Stack stack; + Stack matched; + boolean anyMatch; + Boolean[][] cache; + + UnorderedListComparator(List a, List b) { + this.a = a; + this.b = b; + cache = new Boolean[a.size()][b.size()]; + } + + boolean isSubset() { + // since elements in 'a' might be subsets of more than one element + // in 'b' we must try different ways of matching elements + stack = new Stack<>(); + matched = new Stack<>(); + + nextA(); + while (stack.size() > 0) { + boolean match = isSubset(ixA(), ixB()); + nextB(); + if (match) { + anyMatch = true; + if (!matched.contains(ixB())) { + matched.push(ixB()); + if (matched.size() == a.size()) { + return true; + } + nextA(); + } + } + + while (ixB() == b.size()) { + if (!anyMatch) { + return false; + } + previousA(); + } + } + + return false; + } + + private boolean isSubset(int ixA, int ixB) { + if (cache[ixA][ixB] == null) { + cache[ixA][ixB] = DocumentComparator.this.isSubset(a.get(ixA), b.get(ixB), null); + } + + return cache[ixA][ixB]; + } + + private void previousA() { + stack.pop(); + if (matched.size() > 0) { + matched.pop(); + } + } + + private void nextA() { + stack.push(0); + anyMatch = false; + } + + private void nextB() { + stack.push(stack.pop() + 1); + } + + private int ixA() { + return stack.size() - 1; + } + + private int ixB() { + return stack.size() > 0 ? 
stack.peek() : -1; + } + } +} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/FieldStatus.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/FieldStatus.groovy new file mode 100644 index 0000000000..a33445d1b4 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/FieldStatus.groovy @@ -0,0 +1,7 @@ +package whelk.WorkMerging + +enum FieldStatus { + EQUAL, + COMPATIBLE, + DIFF +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/Html.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/Html.groovy new file mode 100644 index 0000000000..c313415618 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/Html.groovy @@ -0,0 +1,111 @@ +package whelk.WorkMerging + +import org.apache.commons.codec.digest.DigestUtils + +import static whelk.WorkMerging.FieldStatus.COMPATIBLE +import static whelk.WorkMerging.FieldStatus.DIFF +import static whelk.WorkMerging.FieldStatus.EQUAL + +class Html { + private static String CSS = Html.class.getClassLoader() + .getResourceAsStream('merge-works/table.css').getText("UTF-8") + + static final String START = """ + + + """ + static final String END = '' + static final String HORIZONTAL_RULE = "

\n" + + static def infoFields = ['reproductionOf', 'instance title', 'work title', 'instance type', 'editionStatement', 'responsibilityStatement', 'encodingLevel', 'publication', 'identifiedBy', 'extent'] + + static String clusterTable(Collection cluster) { + String id = clusterId(cluster.collect { it.doc.shortId }) + String header = """ + + ${id} + ${cluster.collect { doc -> "${doc.doc.shortId}" }.join('\n')} + + + + ${cluster.collect { doc -> "${doc.mainEntityDisplayTitle()}" }.join('\n')} + + """.stripIndent() + + def statuses = WorkComparator.compare(cluster) + + String info = infoFields.collect(fieldRows(cluster, "info")).join('\n') + String equal = statuses.get(EQUAL, []).collect(fieldRows(cluster, cluster.size() > 1 ? EQUAL.toString() : "")).join('\n') + String compatible = statuses.get(COMPATIBLE, []).collect(fieldRows(cluster, COMPATIBLE.toString())).join('\n') + String diff = statuses.get(DIFF, []).collect(fieldRows(cluster, DIFF.toString())).join('\n') + + return """ + + ${header} + ${equal} + ${compatible} + ${diff} + ${info} +
+

+ """ + } + + static String hubTable(List> docs) { + def mergedWorks = docs*.first() + def ids = docs.collect { group -> + group.drop(1).collectEntries { doc -> + [doc.doc.shortId, doc.link()] + } + } + def clusterId = clusterId(ids*.keySet().flatten()) + + String header = """ + + ${clusterId} + ${mergedWorks.collect { "" }.join('\n')} + + """.stripIndent() + + String derivedFrom = + """ + + _derivedFrom + ${ids.collect { "${it.collect { id, link -> "$id" }.join('\n')}" }.join('\n')} + + """.stripIndent() + + def statuses = WorkComparator.compare(mergedWorks) + + String equal = statuses.get(EQUAL, []).collect(fieldRows(mergedWorks, mergedWorks.size() > 1 ? EQUAL.toString() : "")).join('\n') + String compatible = statuses.get(COMPATIBLE, []).collect(fieldRows(mergedWorks, COMPATIBLE.toString())).join('\n') + String diff = statuses.get(DIFF, []).collect(fieldRows(mergedWorks, DIFF.toString())).join('\n') + + return """ + + ${header} + ${equal} + ${compatible} + ${diff} + ${derivedFrom} +
+

+ """ + } + + static String clusterId(Collection cluster) { + cluster + ? DigestUtils.md5Hex(cluster.sort().first()).toUpperCase().substring(0, 12) + : "" + } + + private static def fieldRows(Collection cluster, String cls) { + { field -> + """ + + ${field} + ${cluster.collect { "${it.getDisplayText(field)}" }.join('\n')} + """.stripIndent() + } + } +} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/Util.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/Util.groovy new file mode 100644 index 0000000000..07a876cff3 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/Util.groovy @@ -0,0 +1,306 @@ +package whelk.WorkMerging + +import org.apache.commons.lang3.StringUtils +import whelk.Whelk +import whelk.util.Unicode + +import java.util.regex.Pattern + +class Util { + static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', 'marc:parallelTitle', 'marc:equalTitle'] + + static def titleVariant = ['Title', 'ParallelTitle'] + // removed 'VariantTitle', 'CoverTitle' since they sometimes contain random generic stuff like "Alibis filmroman", "Kompisböcker för de yngsta" + + static enum Relator { + TRANSLATOR('https://id.kb.se/relator/translator'), + AUTHOR('https://id.kb.se/relator/author'), + ILLUSTRATOR('https://id.kb.se/relator/illustrator'), + AUTHOR_OF_INTRO('https://id.kb.se/relator/authorOfIntroduction'), + ADAPTER('https://id.kb.se/relator/adapter'), + COVER_DESIGNER('https://id.kb.se/relator/coverDesigner'), + COMPILER('https://id.kb.se/relator/compiler'), + AUTHOR_OF_AFTERWORD('https://id.kb.se/relator/authorOfAfterwordColophonEtc'), + PHOTOGRAPHER('https://id.kb.se/relator/photographer'), + EDITOR('https://id.kb.se/relator/editor'), + UNSPECIFIED_CONTRIBUTOR('https://id.kb.se/relator/unspecifiedContributor'), + PRIMARY_RIGHTS_HOLDER('https://id.kb.se/relator/primaryRightsHolder') + + String iri + + private Relator(String iri) { + this.iri = iri + } + } + +// private static Set 
IGNORED_SUBTITLES = WorkToolJob.class.getClassLoader() +// .getResourceAsStream('merge-works/ignored-subtitles.txt') +// .readLines().grep().collect(Util.&normalize) as Set + + private static Set GENERIC_TITLES = WorkToolJob.class.getClassLoader() + .getResourceAsStream('merge-works/generic-titles.txt') + .readLines().grep().collect(Util.&normalize) as Set + + static def noise = + [",", '"', "'", '[', ']', ',', '.', '.', ':', ';', '-', '(', ')', ' the ', '-', '–', '+', '!', '?'].collectEntries { [it, ' '] } + + + static List asList(Object o) { + (o ?: []).with { it instanceof List ? it : [it] } + } + + /** + * Partition a collection based on equality condition + * + * NOTE: O(n^2)... + */ + static Collection> partition(Collection collection, Closure matcher) { + List> result = [] + + for (T t : collection) { + boolean match = false + for (List group : result) { + if (groupMatches(t, group, matcher)) { + group.add(t) + match = true + break + } + } + + if (!match) { + result.add([t]) + } + } + return result + } + + static boolean groupMatches(T t, List group, Closure matcher) { + for (T other : group) { + if (matcher(other, t)) { + return true + } + } + return false + } + + static boolean hasGenericTitle(List hasTitle) { + hasTitle.any { it['mainTitle'] && normalize((String) it['mainTitle']) in GENERIC_TITLES } + } + + static List dropSubTitles(List hasTitle) { + hasTitle.collect { t -> + def copy = new TreeMap(t) + copy.subMap(copy.keySet() - ['subtitle', 'titleRemainder']) + } + } + +// static List dropGenericSubTitles(List hasTitle) { +// hasTitle.collect { +// def copy = new TreeMap(it) +// if (copy['subtitle'] || copy['titleRemainder']) { +// DocumentUtil.traverse(copy) { value, path -> +// if (('subtitle' in path || 'titleRemainder' in path) && value instanceof String && genericSubtitle(value)) { +// new DocumentUtil.Remove() +// } +// } +// } +// copy +// } +// } + + static List flatTitles(List hasTitle) { + dropSubTitles(hasTitle).collect { + def title = new 
TreeMap<>() + title['flatTitle'] = normalize(Doc.flatten(it, titleComponents)) + if (it['@type']) { + title['@type'] = it['@type'] + } + + title + } + } + +// private static boolean genericSubtitle(String s) { +// s = Util.normalize(s) +// if (s.startsWith("en ")) { +// s = s.substring("en ".length()) +// } +// return s in IGNORED_SUBTITLES +// } + + static String normalize(String s) { + return Unicode.asciiFold(Unicode.normalizeForSearch(StringUtils.normalizeSpace(" $s ".toLowerCase().replace(noise)))) + } + + static Object getPathSafe(item, path, defaultTo = null) { + for (p in path) { + if ((item instanceof Collection || item instanceof Map) && item[p] != null) { + item = item[p] + } else { + return defaultTo + } + } + return item + } + + + static List getTitleVariants(List hasTitle) { + flatTitles(hasTitle) + .grep { it['@type'] in titleVariant } + .collect { it['flatTitle'] } + } + + static String chipString(def thing, Whelk whelk) { + if (thing instanceof Integer) { + return thing + } + + def chips = whelk.jsonld.toChip(thing) + if (chips.size() < 2) { + chips = thing + } + if (chips instanceof List) { + return chips.collect { valuesString(it) }.sort().join('
') + } + return valuesString(chips) + } + + private static String valuesString(def thing) { + if (thing instanceof List) { + return thing.collect { valuesString(it) }.join(' • ') + } + if (thing instanceof Map) { + return thing.findAll { k, v -> k != '@type' }.values().collect { valuesString(it) }.join(' • ') + } + return thing.toString() + } + + // (docs on some of these levels are normally filtered out before we reach here) + static List bestEncodingLevel = [ + 'marc:FullLevel', + 'marc:FullLevelMaterialNotExamined', + 'marc:MinimalLevel', + 'marc:LessThanFullLevelMaterialNotExamined', + 'marc:CoreLevel', + 'marc:AbbreviatedLevel', + 'marc:PartialPreliminaryLevel', + 'marc:PrepublicationLevel', + null + ] + + // Return the most common title for the best encodingLevel + static Object bestTitle(Collection docs) { + def isTitle = { it.'@type' == 'Title' } + def addSource = { t, d -> t.plus(['source': [d.getMainEntity().subMap('@id')]]) } + + for (def level : bestEncodingLevel) { + def titles = docs + .findAll { it.encodingLevel() == level } + .collect { d -> + d.getWork().get('hasTitle')?.findAll(isTitle) + ?: d.getMainEntity().get('hasTitle')?.findResults { isTitle(it) ? 
addSource(it, d) : null } + } + .grep() + + if (!titles) { + continue + } + + titles = titles.collect(Util.&dropSubTitles) + return partition(titles, { a, b -> a == b }).sort { it.size() }.reverse().first().first() + } + + return null + } + + static Map>> parseRespStatement(String respStatement) { + def parsedContributions = [:] + + respStatement.split(';').eachWithIndex { part, i -> + // TODO: generalize for other material types + parseSwedishFictionContribution(StringUtils.normalizeSpace(part), i == 0).each { name, roles -> + parsedContributions + .computeIfAbsent(name, r -> []) + .addAll(roles) + } + } + + return parsedContributions + } + + private static Map>> parseSwedishFictionContribution(String contribution, boolean isFirstPart) { + def roleToPattern = + [ + (Relator.TRANSLATOR) : ~/(bemynd(\w+|\.)? )?öf?v(\.|ers(\.|\p{L}+)?)( (till|från) \p{L}+)?|(till svenskan?|från \p{L}+)|svensk text/, + (Relator.AUTHOR) : ~/^(text(e[nr])?|skriven|written)/, + (Relator.ILLUSTRATOR) : ~/\bbild(er)?|ill(\.|ustr(\.|\w+)?)|\bvi(gn|nj)ett(er|ill)?|ritad/, + (Relator.AUTHOR_OF_INTRO) : ~/förord|inl(edn(\.|ing)|edd)/, + (Relator.COVER_DESIGNER) : ~/omslag/, + (Relator.AUTHOR_OF_AFTERWORD): ~/efter(ord|skrift)/, + (Relator.PHOTOGRAPHER) : ~/\bfoto\w*\.?/, + (Relator.EDITOR) : ~/red(\.(?! av)|aktör(er)?)|\bbearb(\.|\w+)?|återberättad|sammanställ\w*/, + ] + + def rolePattern = ~/((?iu)${roleToPattern.values().join('|')})/ + def followsRolePattern = ~/(:| a[fv]| by) / + def initialPattern = ~/\p{Lu}/ + def namePattern = ~/\p{Lu}:?\p{Ll}+('\p{Ll})?(,? [Jj](r|unior))?/ + def betweenNamesPattern = ~/-| |\. 
?| (de(l| la)?|von|van( de[nr])?|v\.|le|af|du|dos) | [ODdLl]'/ + def fullNamePattern = ~/(($initialPattern|$namePattern)($betweenNamesPattern)?)*$namePattern/ + def conjPattern = ~/ (och|&|and) / + def roleAfterNamePattern = ~/( ?\(($rolePattern$conjPattern)?$rolePattern\))/ + def fullContributionPattern = ~/(($rolePattern($conjPattern|\/))*$rolePattern$followsRolePattern)?$fullNamePattern($conjPattern$fullNamePattern)*$roleAfterNamePattern?/ + + // Make roles lower case so that they can't be mistaken for names + contribution = (contribution =~ rolePattern)*.first() + .collectEntries { [it, it.toLowerCase()] } + .with { contribution.replace(it) } + + def nameToRoles = [:] + + def matched = (contribution =~ fullContributionPattern)*.first() + + matched.each { m -> + // Extract roles from the contribution + def roles = roleToPattern + .findAll { k, v -> m =~ /(?iu)$v/ } + .with { + it.isEmpty() && contribution =~ /.+$followsRolePattern/ + ? [new Tuple2(Relator.UNSPECIFIED_CONTRIBUTOR, isFirstPart)] + : it.collect { role, pattern -> new Tuple2(role, isFirstPart) } + } + + // Author should be the role if first part of respStatement (before ';') and no role seems to be stated + if (roles.isEmpty() && isFirstPart) { + roles << new Tuple2(Relator.AUTHOR, isFirstPart) + } + + // Extract names from the contribution + def names = parseNames(fullNamePattern, conjPattern, m) + + // Assign the roles to each name + nameToRoles.putAll(names.collectEntries { [it, roles] }) + } + + return nameToRoles + } + + private static List parseNames(Pattern namePattern, Pattern conjPattern, String s) { + def names = [] + + (s =~ namePattern).each { + def name = it.first() + // Handle the case of "Jan och Maria Larsson" + def previousName = names.isEmpty() ? 
null : names.last() + if (previousName?.split()?.size() == 1 && s =~ /$previousName$conjPattern$name/) { + def nameParts = name.split() + if (nameParts.size() > 1) { + names[-1] += " ${nameParts.last()}" + } + } + names << name + } + + return names + } +} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/WorkComparator.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/WorkComparator.groovy new file mode 100644 index 0000000000..faa369e9d9 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/WorkComparator.groovy @@ -0,0 +1,136 @@ +package whelk.WorkMerging + +import whelk.WorkMerging.compare.Classification +import whelk.WorkMerging.compare.SameOrEmpty +import whelk.WorkMerging.compare.Default +import whelk.WorkMerging.compare.Extent +import whelk.WorkMerging.compare.FieldHandler +import whelk.WorkMerging.compare.GenreForm +import whelk.WorkMerging.compare.StuffSet +import whelk.WorkMerging.compare.Subject +import whelk.WorkMerging.compare.TranslationOf +import whelk.WorkMerging.compare.ValuePicker +import whelk.WorkMerging.compare.WorkTitle + +import static whelk.WorkMerging.Util.bestTitle + +class WorkComparator { + Set fields + DocumentComparator c = new DocumentComparator() + + Map comparators = [ + 'classification' : new Classification(), + 'contentType' : new SameOrEmpty('https://id.kb.se/term/rda/Text'), + 'genreForm' : new GenreForm(), + 'hasTitle' : new WorkTitle(), + 'intendedAudience': new SameOrEmpty('https://id.kb.se/marc/Juvenile'), + '_numPages' : new Extent(), + 'subject' : new Subject(), + 'summary' : new StuffSet(), + 'translationOf' : new TranslationOf(), + ] + + static FieldHandler DEFAULT = new Default() + + WorkComparator(Set fields) { + this.fields = new HashSet<>(fields) + } + + boolean sameWork(Doc a, Doc b) { + fields.every { compare(a, b, it).with { it == EQUAL || it == COMPATIBLE } } + } + + FieldStatus compare(Doc a, Doc b, String field) { + Object oa = a.getWork().get(field) + Object ob = b.getWork().get(field) + 
+ if (oa == null && ob == null) { + return FieldStatus.EQUAL + } + + compareExact(oa, ob, field) == FieldStatus.EQUAL + ? FieldStatus.EQUAL + : compareDiff(a, b, field) + } + + Map merge(Collection docs) { + Map result = [:] + + if (docs.size() > 1) { + fields.each { field -> + FieldHandler h = comparators.getOrDefault(field, DEFAULT) + def value = h instanceof ValuePicker + ? h.pick(docs) + : mergeField(field, h, docs) + + if (value) { + result[field] = value + } + } + } else { + result = docs[0].workCopy() + } + + if (!result['hasTitle']) { + def bestTitle = bestTitle(docs) + if (bestTitle) { + result['hasTitle'] = bestTitle + } + } + + return result + } + + // TODO: preserve order? e.g. subject + private Object mergeField(String field, FieldHandler h, Collection docs) { + Object value = docs.first().getWork().get(field) + def rest = docs.drop(1) + rest.each { + value = h.merge(value, it.getWork().get(field)) + } + return value + } + + private FieldStatus compareDiff(Doc a, Doc b, String field) { + comparators.getOrDefault(field, DEFAULT).isCompatible(a.getWork().get(field), b.getWork().get(field)) + ? FieldStatus.COMPATIBLE + : FieldStatus.DIFF + } + + private FieldStatus compareExact(Object oa, Object ob, String field) { + c.isEqual([(field): oa], [(field): ob]) ? 
FieldStatus.EQUAL : FieldStatus.DIFF + } + + static Map> compare(Collection cluster) { + WorkComparator c = new WorkComparator(allFields(cluster)) + + Map> result = [:] + c.fieldStatuses(cluster).each { f, s -> result.get(s, []) << f } + return result + } + + static Set allFields(Collection cluster) { + Set fields = new HashSet<>() + cluster.each { fields.addAll(it.getWork().keySet()) } + return fields - 'summary' // - 'summary' only temporary, remove when summaries have been moved to instance (LXL-3303) + } + + Map fieldStatuses(Collection cluster) { + fields.collectEntries { [it, fieldStatus(cluster, it)] } + } + + FieldStatus fieldStatus(Collection cluster, String field) { + boolean anyCompat = false + [cluster, cluster].combinations().findResult { List combination -> + Doc a = combination.first() + Doc b = combination.last() + + def c = compare(a, b, field) + if (c == FieldStatus.COMPATIBLE) { + anyCompat = true + } + c == FieldStatus.DIFF ? c : null + } ?: (anyCompat ? FieldStatus.COMPATIBLE : FieldStatus.EQUAL) + } + +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging.java b/whelk-core/src/main/groovy/whelk/WorkMerging/WorkMerging.java similarity index 98% rename from whelk-core/src/main/groovy/whelk/WorkMerging.java rename to whelk-core/src/main/groovy/whelk/WorkMerging/WorkMerging.java index 6d9fb2e6d0..99a2105145 100644 --- a/whelk-core/src/main/groovy/whelk/WorkMerging.java +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/WorkMerging.java @@ -1,4 +1,8 @@ -package whelk; +package whelk.WorkMerging; + +import whelk.Document; +import whelk.IdGenerator; +import whelk.Whelk; import java.util.*; diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/WorkToolJob.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/WorkToolJob.groovy new file mode 100644 index 0000000000..d30deeb9ca --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/WorkToolJob.groovy @@ -0,0 +1,797 @@ +package whelk.WorkMerging + + +import 
whelk.Document +import whelk.IdGenerator +import whelk.JsonLd +import whelk.Whelk +import whelk.exception.WhelkRuntimeException +import whelk.util.LegacyIntegrationTools +import whelk.util.Statistics + +import java.text.SimpleDateFormat +import java.util.concurrent.ExecutorService +import java.util.concurrent.Executors +import java.util.concurrent.TimeUnit +import java.util.concurrent.atomic.AtomicInteger +import java.util.function.Function + +import static whelk.WorkMerging.FieldStatus.DIFF + +import static whelk.WorkMerging.Util.asList +import static whelk.WorkMerging.Util.chipString +import static whelk.WorkMerging.Util.getPathSafe +import static whelk.WorkMerging.Util.normalize +import static whelk.WorkMerging.Util.partition +import static whelk.WorkMerging.Util.parseRespStatement +import static whelk.WorkMerging.Util.Relator + +class WorkToolJob { + Whelk whelk + Statistics statistics + File clusters + + String date = new SimpleDateFormat('yyyyMMdd-HHmmss').format(new Date()) + String jobId = IdGenerator.generate() + File reportDir = new File("reports/$date/merged-works") + + String changedIn = "xl" + String changedBy = "SEK" + String generationProcess = 'https://libris.kb.se/sys/merge-works' + boolean dryRun = true + boolean skipIndex = false + boolean loud = false + boolean verbose = false + + WorkToolJob(File clusters) { + this.clusters = clusters + + this.whelk = Whelk.createLoadedSearchWhelk('secret', true) + this.statistics = new Statistics() + } + + public static Closure qualityMonographs = { Doc doc -> + (doc.isText() + && doc.isMonograph() + && !doc.hasPart() + && (doc.encodingLevel() != 'marc:PartialPreliminaryLevel' && doc.encodingLevel() != 'marc:PrepublicationLevel')) + && !doc.hasRelationshipWithContribution() + } + + void show() { + println(Html.START) + run({ cluster -> + return { + try { + Collection> docs = titleClusters(cluster) + + if (docs.isEmpty() || docs.size() == 1 && docs.first().size() == 1) { + return + } + + println(docs + .collect 
{ it.sort { a, b -> a.getWork()['@type'] <=> b.getWork()['@type'] } } + .collect { it.sort { it.numPages() } } + .collect { Html.clusterTable(it) } + .join('') + Html.HORIZONTAL_RULE + ) + } + catch (NoWorkException e) { + System.err.println(e.getMessage()) + } + catch (Exception e) { + System.err.println(e.getMessage()) + e.printStackTrace(System.err) + } + } + }) + println(Html.END) + } + + void showWorks() { + println(Html.START) + run({ cluster -> + return { + try { + println(mergedWorks(titleClusters(cluster)).findAll { it.derivedFrom.size() > 1 } + .collect { [new Doc(whelk, it.work)] + it.derivedFrom } + .collect { Html.clusterTable(it) } + .join('') + Html.HORIZONTAL_RULE + ) + } + catch (Exception e) { + System.err.println(e.getMessage()) + e.printStackTrace(System.err) + } + } + }) + println(Html.END) + } + + void showHubs() { + println(Html.START) + run({ cluster -> + return { + try { + def hub = mergedWorks(titleClusters(cluster)) + .collect { [new Doc(whelk, it.work)] + it.derivedFrom } + if (hub.size() > 1) { + println(Html.hubTable(hub) + Html.HORIZONTAL_RULE) + } + } + catch (Exception e) { + System.err.println(e.getMessage()) + e.printStackTrace(System.err) + } + } + }) + println(Html.END) + } + + void merge() { + def s = statistics.printOnShutdown() + reportDir.mkdirs() + + run({ cluster -> + return { + def titles = titleClusters(cluster) + def works = mergedWorks(titles) + + works.each { + if (it.derivedFrom.size() > 1) { + store(it) + } + } + + String report = htmlReport(titles, works) + + new File(reportDir, "${Html.clusterId(cluster)}.html") << report + works.each { + s.increment('num derivedFrom', "${it.derivedFrom.size()}", it.work.shortId) + new File(reportDir, "${it.work.shortId}.html") << report + } + } + }) + } + + void revert() { + run({ cluster -> + return { + def docs = cluster.collect(whelk.&getDocument).grep() + + Set works = [] + + docs.each { Document d -> + def sum = d.getChecksum(whelk.jsonld) + works << getPathSafe(d.data, 
d.workIdPath) + def revertTo = whelk.storage.loadAllVersions(d.shortId) + .reverse() + .find { v -> getPathSafe(v.data, v.workIdPath) == null } + d.data = revertTo.data + d.setGenerationDate(new Date()) + d.setGenerationProcess(generationProcess) + whelk.storeAtomicUpdate(d, !loud, changedIn, changedBy, sum) + } + + works.grep().each { + def shortId = it.split("[#/]")[-2] + whelk.remove(shortId, changedIn, changedBy) + } + } + }) + } + + String htmlReport(Collection> titleClusters, Collection works) { + if (titleClusters.isEmpty() || titleClusters.size() == 1 && titleClusters.first().size() == 1) { + return "" + } + + StringBuilder s = new StringBuilder() + + s.append(Html.START) + s.append("

Title cluster(s)

") + titleClusters.each { it.each { it.addComparisonProps() } } + + titleClusters + .collect { it.sort { a, b -> a.getWork()['@type'] <=> b.getWork()['@type'] } } + .collect { it.sort { it.numPages() } } + .each { + s.append(Html.clusterTable(it)) + s.append(Html.HORIZONTAL_RULE) + } + titleClusters.each { it.each { it.removeComparisonProps() } } + + s.append("

Extracted works

") + works.collect { [new Doc(whelk, it.work)] + it.derivedFrom } + .each { s.append(Html.clusterTable(it)) } + + s.append(Html.END) + return s.toString() + } + + class MergedWork { + Document work + Collection derivedFrom + } + + private Document buildWorkDocument(Map workData) { + String workId = IdGenerator.generate() + + workData['@id'] = "TEMPID#it" + Document d = new Document([ + "@graph": [ + [ + "@id" : "TEMPID", + "@type" : "Record", + "mainEntity" : ["@id": "TEMPID#it"], + "technicalNote": [[ + "@type" : "TechnicalNote", + "hasNote": [[ + "@type": "Note", + "label": ["Maskinellt utbrutet verk... TODO"] + ]], + "uri" : ["http://xlbuild.libris.kb.se/works/$date/merged-works/${workId}.html".toString()] + + ] + ]], + workData + ] + ]) + + d.setGenerationDate(new Date()) + d.setGenerationProcess(generationProcess) + d.deepReplaceId(Document.BASE_URI.toString() + workId) + return d + } + + private void store(MergedWork work) { + if (!dryRun) { + whelk.setSkipIndex(skipIndex) + if (!whelk.createDocument(work.work, changedIn, changedBy, + LegacyIntegrationTools.determineLegacyCollection(work.work, whelk.getJsonld()), false)) { + throw new WhelkRuntimeException("Could not store new work: ${work.work.shortId}") + } + + String workIri = work.work.thingIdentifiers.first() + + work.derivedFrom + .collect { it.ogDoc } + .each { + def sum = it.getChecksum(whelk.jsonld) + it.data[JsonLd.GRAPH_KEY][1]['instanceOf'] = [(JsonLd.ID_KEY): workIri] + it.setGenerationDate(new Date()) + it.setGenerationProcess(generationProcess) + whelk.storeAtomicUpdate(it, !loud, changedIn, changedBy, sum) + } + } + } + + private Collection mergedWorks(Collection titleClusters) { + def works = [] + titleClusters.each { titleCluster -> + titleCluster.sort { it.numPages() } + WorkComparator c = new WorkComparator(WorkComparator.allFields(titleCluster)) + + works.addAll(partition(titleCluster, { Doc a, Doc b -> c.sameWork(a, b) }) + .each { work -> work.each { doc -> doc.removeComparisonProps() } 
} + .collect { new MergedWork(work: buildWorkDocument(c.merge(it)), derivedFrom: it) }) + } + + return works + } + + + void subTitles() { + statistics.printOnShutdown(10) + run({ cluster -> + return { + String titles = cluster.collect(whelk.&getDocument).collect { + getPathSafe(it.data, ['@graph', 1, 'hasTitle', 0, 'subtitle']) + }.grep().join('\n') + + if (!titles.isBlank()) { + println(titles + '\n') + } + } + }) + } + + void printInstanceValue(String field) { + run({ cluster -> + return { + String values = cluster.collect(whelk.&getDocument).collect { + "${it.shortId}\t${getPathSafe(it.data, ['@graph', 1, field])}" + }.join('\n') + + println(values + '\n') + } + }) + } + + void fictionNotFiction() { + run({ cluster -> + return { + Collection> titleClusters = titleClusters(cluster) + + for (titleCluster in titleClusters) { + if (titleCluster.size() > 1) { + def statuses = WorkComparator.compare(cluster) + if (!statuses[DIFF].contains('contribution')) { + String gf = titleCluster.collect { it.getDisplayText('genreForm') }.join(' ') + if (gf.contains('marc/FictionNotFurtherSpecified') && gf.contains('marc/NotFictionNotFurtherSpecified')) { + println(titleCluster.collect { it.getDoc().shortId }.join('\t')) + } + } + } + } + } + }) + } + + void swedishFiction() { + def swedish = { Doc doc -> + Util.asList(doc.getWork()['language']).collect { it['@id'] } == ['https://id.kb.se/language/swe'] + } + + run({ cluster -> + return { + def c = loadDocs(cluster) + .findAll(qualityMonographs) + .findAll(swedish) + .findAll { d -> !d.isDrama() } + + if (c.any { it.isFiction() } && !c.any { it.isNotFiction() }) { + println(c.collect { it.doc.shortId }.join('\t')) + } + } + }) + } + + void filterClusters(Closure> predicate) { + run({ cluster -> + return { + if (predicate(loadDocs(cluster))) { + println(cluster.join('\t')) + } + } + }) + } + + void filterDocs(Closure predicate) { + run({ cluster -> + return { + def c = loadDocs(cluster).findAll(predicate) + if (c.size() > 0) { + 
println(c.collect { it.doc.shortId }.join('\t')) + } + } + }) + } + + void translationNoTranslator() { + run({ cluster -> + return { + def c = loadDocs(cluster) + + if (c) { + if (c.any { it.isTranslation() }) { + if (c.any { it.hasTranslator() }) { + c = c.findAll { !it.isTranslationWithoutTranslator() } + } else { + int pages = c.first().numPages() + if (c.any { it.numPages() != pages }) { + return // drop cluster + } + } + } + } + + if (c.size() > 0) { + println(c.collect { it.doc.shortId }.join('\t')) + } + } + }) + } + + void outputTitleClusters() { + run({ cluster -> + return { + titleClusters(cluster).findAll { it.size() > 1 }.each { + println(it.collect { it.doc.shortId }.join('\t')) + } + } + }) + } + + void add9pu() { + statistics.printOnShutdown() + run({ cluster -> + return { + statistics.increment('add 9pu', 'clusters checked') + def docs = cluster + .collect(whelk.&getDocument) + .findAll() + .collect { [doc: it, checksum: it.getChecksum(whelk.jsonld), changed: false] } + + def ill = ['@id': Relator.ILLUSTRATOR.iri] + def pu = ['@id': Relator.PRIMARY_RIGHTS_HOLDER.iri] + def path = ['@graph', 1, 'instanceOf', 'contribution'] + + docs.each { + Document d = it.doc + + statistics.increment('add 9pu', 'docs checked') + + getPathSafe(d.data, path, []).each { Map c -> + def r = asList(c.role) + + if (pu in r || !(ill in r) || c.'@type' == 'PrimaryContribution') + return + + for (Map other : docs) { + Document od = other.doc + + def found9pu = false + + getPathSafe(od.data, path, []).each { Map oc -> + if (asList(c.agent) == asList(oc.agent) && asList(oc.role).containsAll([ill, pu])) { + c.role = asList(c.role) + pu + found9pu = true + statistics.increment('add 9pu', "9pu added") + if (verbose) { + println("${d.shortId} <- ${od.shortId}") + } + return + } + } + + if (found9pu) { + println(c) + it.changed = true + break + } + } + } + } + + docs.each { + if (!dryRun && it.changed) { + Document d = it.doc + d.setGenerationDate(new Date()) + 
d.setGenerationProcess(generationProcess) + whelk.storeAtomicUpdate(d, !loud, changedIn, changedBy, it.checksum) + } + } + } + }) + } + + void fetchContributionFromRespStatement() { + def loadThingByIri = { String iri -> + // TODO: fix whelk, add load by IRI method + whelk.storage.loadDocumentByMainId(iri)?.with { doc -> + return (Map) doc.data['@graph'][1] + } + } + + def loadIfLink = { it['@id'] ? loadThingByIri(it['@id']) : it } + + statistics.printOnShutdown() + run({ cluster -> + return { + statistics.increment('fetch contribution from respStatement', 'clusters checked') + def docs = cluster + .collect(whelk.&getDocument) + .findAll() + .collect { [doc: it, checksum: it.getChecksum(whelk.jsonld), changed: false] } + + docs.each { + Document d = it.doc + def respStatement = getPathSafe(d.data, ['@graph', 1, 'responsibilityStatement']) + if (!respStatement) + return + + statistics.increment('fetch contribution from respStatement', 'docs checked') + + def contributionsInRespStmt = parseRespStatement(respStatement) + def contribution = getPathSafe(d.data, ['@graph', 1, 'instanceOf', 'contribution'], []) + + contribution.each { Map c -> + asList(c.agent).each { a -> + def matchedOnName = contributionsInRespStmt.find { n, r -> + nameMatch(n, loadIfLink(a)) + } + + if (!matchedOnName) + return + + // Contributor found locally, omit from further search + contributionsInRespStmt.remove(matchedOnName.key) + + def dontAdd = { Relator relator, boolean isFirstStmtPart -> + relator == Relator.UNSPECIFIED_CONTRIBUTOR + || isFirstStmtPart && relator == Relator.AUTHOR + && c.'@type' != 'PrimaryContribution' + } + + def rolesInRespStatement = matchedOnName.value + .findResults { dontAdd(it) ? 
null : it.getV1() } + + if (rolesInRespStatement.isEmpty()) + return + + def rolesInContribution = asList(c.role).findAll { it.'@id' != Relator.UNSPECIFIED_CONTRIBUTOR.iri } + + // Replace Adapter with Editor + it.changed |= rolesInRespStatement.removeAll { r -> + r == Relator.EDITOR && rolesInContribution.findIndexOf { + it.'@id' == Relator.ADAPTER.iri + }.with { + if (it == -1) { + return false + } else { + rolesInContribution[it]['@id'] = Relator.EDITOR.iri + return true + } + } + } + + if (rolesInRespStatement.size() <= rolesInContribution.size()) + return + + rolesInRespStatement.each { r -> + def idLink = ['@id': r.iri] + if (!(idLink in rolesInContribution)) { + rolesInContribution << idLink + it.changed = true + def roleShort = r.iri.split('/').last() + statistics.increment('fetch contribution from respStatement', "$roleShort roles specified") + if (verbose) { + println("${chipString(c, whelk)} (${d.shortId}) <- $roleShort") + } + } + } + + c.role = rolesInContribution + } + } + + def comparable = { + it*.getV1().findResults { Relator r -> + r != Relator.UNSPECIFIED_CONTRIBUTOR + ? 
['@id': r.iri] + : null + } + } + + contributionsInRespStmt.each { name, roles -> + for (Map other : docs) { + Document od = other.doc + def matched = getPathSafe(od.data, ['@graph', 1, 'instanceOf', 'contribution'], []) + .find { Map c -> + asList(c.agent).any { a -> + loadIfLink(a).with { nameMatch(name, it) && !(it.description =~ /(?i)pseud/) } + && comparable(roles).with { r -> !r.isEmpty() && asList(c.role).containsAll(r) } + && Util.bestEncodingLevel.indexOf(d.getEncodingLevel()) <= Util.bestEncodingLevel.indexOf(od.getEncodingLevel()) + } + } + if (matched) { + contribution << matched + roles.each { + def roleShort = it.getV1().iri.split('/').last() + statistics.increment('fetch contribution from respStatement', "$roleShort found in cluster") + } + if (verbose) { + println("${d.shortId} <- ${chipString(matched, whelk)} (${od.shortId})") + } + it.changed = true + break + } + } + } + } + + docs.each { + if (!dryRun && it.changed) { + Document d = it.doc + d.setGenerationDate(new Date()) + d.setGenerationProcess(generationProcess) + whelk.storeAtomicUpdate(d, !loud, changedIn, changedBy, it.checksum) + } + } + } + } + + ) + } + + void linkContribution() { + def loadThingByIri = { String iri -> + // TODO: fix whelk, add load by IRI method + whelk.storage.loadDocumentByMainId(iri)?.with { doc -> + return (Map) doc.data['@graph'][1] + } + } + + def loadIfLink = { it['@id'] ? loadThingByIri(it['@id']) : it } + + statistics.printOnShutdown() + run({ cluster -> + return { + statistics.increment('link contribution', 'clusters checked') + // TODO: check work language? 
+ def docs = cluster + .collect(whelk.&getDocument) + .collect { [doc: it, checksum: it.getChecksum(whelk.jsonld), changed: false] } + + List linked = [] + docs.each { d -> + def contribution = getPathSafe(d.doc.data, ['@graph', 1, 'instanceOf', 'contribution'], []) + contribution.each { Map c -> + if (c.agent && c.agent['@id']) { + loadThingByIri(c.agent['@id'])?.with { Map agent -> + agent.roles = asList(c.role) + linked << agent + } + } + } + statistics.increment('link contribution', 'docs checked') + } + + docs.each { + Document d = it.doc + def contribution = getPathSafe(d.data, ['@graph', 1, 'instanceOf', 'contribution'], []) + contribution.each { Map c -> + if (c.agent && !c.agent['@id']) { + def l = linked.find { + agentMatches(c.agent, it) && (!c.role || it.roles.containsAll(c.role)) + } + if (l) { + println("${d.shortId} ${chipString(c, whelk)} --> ${chipString(l, whelk)}") + c.agent = ['@id': l['@id']] + it.changed = true + statistics.increment('link contribution', 'agents linked') + } else if (verbose) { + println("${d.shortId} NO MATCH: ${chipString(c, whelk)} ??? 
${linked.collect { chipString(it, whelk) }}") + } + } + } + } + + List primaryAutAgents = [] + docs.each { + def contribution = getPathSafe(it.doc.data, ['@graph', 1, 'instanceOf', 'contribution'], []) + def p = contribution.findAll() + contribution.each { + if (it['@type'] == 'PrimaryContribution' && it['role'] == ['@id': 'https://id.kb.se/relator/author'] && it['agent']) { + Map agent = loadIfLink(it['agent']) + if (agent) { + primaryAutAgents << agent + } + } + } + } + + docs.each { + Document d = it.doc + def contribution = getPathSafe(d.data, ['@graph', 1, 'instanceOf', 'contribution'], []) + contribution.each { Map c -> + if (c['@type'] == 'PrimaryContribution' && !c.role) { + if (c.agent) { + def agent = loadIfLink(c.agent) + if (primaryAutAgents.any { agentMatches(agent, it) }) { + c.role = ['@id': 'https://id.kb.se/relator/author'] + it.changed = true + statistics.increment('link contribution', 'author role added to primary contribution') + } + } + } + } + } + + docs.each { + if (!dryRun && it.changed) { + Document d = it.doc + d.setGenerationDate(new Date()) + d.setGenerationProcess(generationProcess) + whelk.storeAtomicUpdate(d, !loud, changedIn, changedBy, it.checksum) + } + } + } + }) + } + + static boolean agentMatches(Map local, Map linked) { + nameMatch(local, linked) && !yearMismatch(local, linked) + } + + static boolean nameMatch(Object local, Map agent) { + def variants = [agent] + asList(agent.hasVariant) + def name = { + Map p -> + (p.givenName && p.familyName) + ? normalize("${p.givenName} ${p.familyName}") + : p.name ? normalize("${p.name}") : null + } + + def localName = local instanceof Map ? 
name(local) : normalize(local) + + localName && variants.any { + name(it) && localName == name(it) + } + } + + static boolean yearMismatch(Map local, Map linked) { + def birth = { Map p -> p.lifeSpan?.with { (it.replaceAll(/[^\-0-9]/, '').split('-') as List)[0] } } + def death = { Map p -> p.lifeSpan?.with { (it.replaceAll(/[^\-0-9]/, '').split('-') as List)[1] } } + def b = birth(local) && birth(linked) && birth(local) != birth(linked) + def d = death(local) && death(linked) && death(local) != death(linked) + b || d + } + + private void run(Function, Runnable> f) { + ExecutorService s = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 4) + + AtomicInteger i = new AtomicInteger() + clusters.eachLine() { + List cluster = Arrays.asList(it.split(/[\t ]+/)) + + s.submit({ + try { + f.apply(cluster).run() + int n = i.incrementAndGet() + if (n % 100 == 0) { + System.err.println("$n") + } + } + catch (NoWorkException e) { + //println("No work:" + e.getMessage()) + } + catch (Exception e) { + e.printStackTrace() + } + }) + } + + s.shutdown() + s.awaitTermination(1, TimeUnit.DAYS) + } + + private Collection loadDocs(Collection cluster) { + whelk + .bulkLoad(cluster).values() + .collect { new Doc(whelk, it) } + } + + private Collection> titleClusters(Collection cluster) { + loadDocs(cluster) + .findAll(qualityMonographs) + .each { it.addComparisonProps() } + .with { partitionByTitle(it) } + .findAll { it.size() > 1 } + .findAll { !it.any { doc -> doc.hasGenericTitle() } } + .sort { a, b -> a.first().mainEntityDisplayTitle() <=> b.first().mainEntityDisplayTitle() } + } + + Collection> partitionByTitle(Collection docs) { + return partition(docs) { Doc a, Doc b -> + !a.getTitleVariants().intersect(b.getTitleVariants()).isEmpty() + } + } + +} + +class NoWorkException extends RuntimeException { + NoWorkException(String msg) { + super(msg) + } +} + + + + + + + + + diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Classification.groovy 
b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Classification.groovy new file mode 100644 index 0000000000..2dd9a10e7e --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Classification.groovy @@ -0,0 +1,63 @@ +package whelk.WorkMerging.compare + +class Classification extends StuffSet { + @Override + Object merge(Object a, Object b) { + return mergeCompatibleElements(super.merge(a, b)) { c1, c2 -> + String code1 = c1['code'] + String code2 = c2['code'] + if (!code1 || !code2) { + return + } + code1 = code1.trim() + code2 = code2.trim() + + if (isSab(c1) && isSab(c2) && (code1.startsWith(code2) || code2.startsWith(code1))) { + def result = [ + '@type' : 'Classification', + 'code' : code1.size() > code2.size() ? code1 : code2, + inScheme: [ + '@type' : 'ConceptScheme', + 'code' : 'kssb' + ] + ] + def version = maxSabVersion(c1, c2) + if (version) { + result['inScheme']['version'] = version + } + return result + } + else if (isDewey(c1) && isDewey(c2) && code1 == code2) { + Map result = [:] + result.putAll(c1) + result.putAll(c2) + result['editionEnumeration'] = maxDeweyEdition(c1, c2) + return result + } + } + } + + boolean isSab(Map c) { + c['inScheme'] && c['inScheme']['code'] == 'kssb' + } + + String maxSabVersion(c1, c2) { + def v1 = c1['inScheme']['version'] ?: "-1" + def v2 = c2['inScheme']['version'] ?: "-1" + Integer.parseInt(v1) > Integer.parseInt(v2) ? v1 : v2 + } + + boolean isDewey(Map c) { + c['@type'] == 'ClassificationDdc' + } + + String maxDeweyEdition(c1, c2) { + def v1 = c1['editionEnumeration'] + def v2 = c2['editionEnumeration'] + deweyEdition(v1) > deweyEdition(v2) ? 
v1 : v2 + } + + int deweyEdition(String edition) { + Integer.parseInt((edition ?: "0").replaceAll("[^0-9]", "")) + } +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Default.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Default.groovy new file mode 100644 index 0000000000..07e0635234 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Default.groovy @@ -0,0 +1,13 @@ +package whelk.WorkMerging.compare + +class Default implements FieldHandler { + @Override + boolean isCompatible(Object a, Object b) { + return false + } + + @Override + Object merge(Object a, Object b) { + return a + } +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Extent.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Extent.groovy new file mode 100644 index 0000000000..078a3fee78 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Extent.groovy @@ -0,0 +1,15 @@ +package whelk.WorkMerging.compare; + +class Extent implements FieldHandler { + + // TODO: allow one side missing extent (-1)? 
+ @Override + boolean isCompatible(Object a, Object b) { + return true // a * 0.7 < b && a * 1.3 > b + } + + @Override + Object merge(Object a, Object b) { + return b; // not part of final work + } +} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/FieldHandler.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/FieldHandler.groovy new file mode 100644 index 0000000000..22a95fd2a9 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/FieldHandler.groovy @@ -0,0 +1,12 @@ +package whelk.WorkMerging.compare + +import whelk.WorkMerging.Doc + +interface FieldHandler { + boolean isCompatible(Object a, Object b) + Object merge(Object a, Object b) +} + +interface ValuePicker extends FieldHandler { + Object pick(Collection values) +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/GenreForm.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/GenreForm.groovy new file mode 100644 index 0000000000..690e9353ff --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/GenreForm.groovy @@ -0,0 +1,36 @@ +package whelk.WorkMerging.compare + +import whelk.WorkMerging.DocumentComparator + +//FIXME +class GenreForm extends StuffSet { + private static final DocumentComparator c = new DocumentComparator() + + // Terms that will be merged (values precede keys) + private static def norm = [ + (['@id': 'https://id.kb.se/marc/NotFictionNotFurtherSpecified']): [ + ['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified'], + ['@id': 'https://id.kb.se/marc/Autobiography'], + ['@id': 'https://id.kb.se/marc/Biography'] + ], + (['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified']) : [ + ['@id': 'https://id.kb.se/marc/Poetry'], + ['@id': 'https://id.kb.se/marc/Novel'] + ], + ] + + @Override + Object merge(Object a, Object b) { + return mergeCompatibleElements(super.merge(a, b).findAll { it.'@id' }) { gf1, gf2 -> + if (n(gf1, gf2)) { + gf2 + } else if (n(gf2, gf1)) { + gf1 
+ } + } + } + + boolean n(a, b) { + norm[a]?.any { it == b || n(it, b) } + } +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/SameOrEmpty.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/SameOrEmpty.groovy new file mode 100644 index 0000000000..f36f580773 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/SameOrEmpty.groovy @@ -0,0 +1,21 @@ +package whelk.WorkMerging.compare + +import static whelk.WorkMerging.Util.asList + +class SameOrEmpty implements FieldHandler { + Object link + + SameOrEmpty(String iri) { + this.link = [['@id': iri]] + } + + @Override + boolean isCompatible(Object a, Object b) { + (!a && asList(b) == link) || (!b && asList(a) == link) + } + + @Override + Object merge(Object a, Object b) { + return a ?: b + } +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/StuffSet.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/StuffSet.groovy new file mode 100644 index 0000000000..92262086f7 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/StuffSet.groovy @@ -0,0 +1,38 @@ +package whelk.WorkMerging.compare + + +import java.util.function.BiFunction + +import static whelk.WorkMerging.Util.asList + +class StuffSet implements FieldHandler { + @Override + boolean isCompatible(Object a, Object b) { + true + } + + @Override + Object merge(Object a, Object b) { + return ((asList(a) as Set) + (asList(b) as Set)).collect() + } + + static Object mergeCompatibleElements(Object o, BiFunction s) { + boolean changed = false + List result = [] + asList(o).each { + def merged = null + for (int i = 0 ; i < result.size() ; i++) { + merged = s.apply(result[i], it) + if (merged) { + result[i] = merged + changed = true + break + } + } + if (merged == null) { + result << it + } + } + return changed ? 
mergeCompatibleElements(result, s) : result + } +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Subject.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Subject.groovy new file mode 100644 index 0000000000..e69fb633e7 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Subject.groovy @@ -0,0 +1,8 @@ +package whelk.WorkMerging.compare + +class Subject extends StuffSet { + @Override + Object merge(Object a, Object b) { + return super.merge(a, b).findAll { it.'@id' || it.'@type' == 'ComplexSubject' } + } +} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/TranslationOf.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/TranslationOf.groovy new file mode 100644 index 0000000000..73836e6fee --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/TranslationOf.groovy @@ -0,0 +1,22 @@ +package whelk.WorkMerging.compare + +import whelk.WorkMerging.DocumentComparator + +class TranslationOf implements FieldHandler { + DocumentComparator c = new DocumentComparator() + + @Override + boolean isCompatible(Object a, Object b) { + // @type is sometimes Work, sometimes Text. Should not matter for comparison + (!a && !b) || a && b && a instanceof Map && b instanceof Map && c.isEqual(noType(a), noType(b)) + } + + @Override + Object merge(Object a, Object b) { + return a // TODO: prefer one @type over another? 
+ } + + Map noType(Map m) { + m.findAll { k, v -> k != '@type' } + } +} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/WorkTitle.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/WorkTitle.groovy new file mode 100644 index 0000000000..4c948af25a --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/WorkTitle.groovy @@ -0,0 +1,23 @@ +package whelk.WorkMerging.compare + +import whelk.WorkMerging.Doc +import whelk.WorkMerging.Util +import org.apache.commons.lang3.NotImplementedException + +class WorkTitle implements ValuePicker { + + @Override + boolean isCompatible(Object a, Object b) { + return !a || !b || !Util.getTitleVariants(a).intersect(Util.getTitleVariants(b)).isEmpty() + } + + @Override + Object merge(Object a, Object b) { + throw new NotImplementedException('') + } + + @Override + Object pick(Collection values) { + return Util.bestTitle(values) + } +} From 68bd85011ce2f6e4519eba44dc938504c224d040 Mon Sep 17 00:00:00 2001 From: Jannis Mohlin Tsiroyannis Date: Wed, 9 Nov 2022 10:10:20 +0100 Subject: [PATCH 9/9] Revert "Move already existing work merging code in from an experimental branch" This reverts commit 8758d0fad8843d28ae70ff7676a5a0cc6d47f91a. 
--- .../whelk/importer/DatasetImporter.groovy | 3 +- .../whelk/{WorkMerging => }/WorkMerging.java | 6 +- .../whelk/WorkMerging/DisjointSets.java | 220 ----- .../main/groovy/whelk/WorkMerging/Doc.groovy | 363 -------- .../whelk/WorkMerging/DocumentComparator.java | 231 ----- .../whelk/WorkMerging/FieldStatus.groovy | 7 - .../main/groovy/whelk/WorkMerging/Html.groovy | 111 --- .../main/groovy/whelk/WorkMerging/Util.groovy | 306 ------- .../whelk/WorkMerging/WorkComparator.groovy | 136 --- .../whelk/WorkMerging/WorkToolJob.groovy | 797 ------------------ .../WorkMerging/compare/Classification.groovy | 63 -- .../whelk/WorkMerging/compare/Default.groovy | 13 - .../whelk/WorkMerging/compare/Extent.groovy | 15 - .../WorkMerging/compare/FieldHandler.groovy | 12 - .../WorkMerging/compare/GenreForm.groovy | 36 - .../WorkMerging/compare/SameOrEmpty.groovy | 21 - .../whelk/WorkMerging/compare/StuffSet.groovy | 38 - .../whelk/WorkMerging/compare/Subject.groovy | 8 - .../WorkMerging/compare/TranslationOf.groovy | 22 - .../WorkMerging/compare/WorkTitle.groovy | 23 - 20 files changed, 3 insertions(+), 2428 deletions(-) rename whelk-core/src/main/groovy/whelk/{WorkMerging => }/WorkMerging.java (98%) delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/DisjointSets.java delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/Doc.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/DocumentComparator.java delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/FieldStatus.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/Html.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/Util.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/WorkComparator.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/WorkToolJob.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/Classification.groovy delete mode 100644 
whelk-core/src/main/groovy/whelk/WorkMerging/compare/Default.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/Extent.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/FieldHandler.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/GenreForm.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/SameOrEmpty.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/StuffSet.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/Subject.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/TranslationOf.groovy delete mode 100644 whelk-core/src/main/groovy/whelk/WorkMerging/compare/WorkTitle.groovy diff --git a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy index 394037cf73..3fddb229c8 100644 --- a/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy +++ b/importers/src/main/groovy/whelk/importer/DatasetImporter.groovy @@ -2,7 +2,7 @@ package whelk.importer import groovy.util.logging.Log4j2 as Log import groovy.transform.CompileStatic -import whelk.WorkMerging.WorkMerging +import whelk.WorkMerging import static groovy.transform.TypeCheckingMode.SKIP @@ -11,6 +11,7 @@ import whelk.JsonLd import whelk.TargetVocabMapper import whelk.Whelk import whelk.converter.TrigToJsonLdParser +import whelk.exception.CancelUpdateException import whelk.util.DocumentUtil import static whelk.util.LegacyIntegrationTools.NO_MARC_COLLECTION diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/WorkMerging.java b/whelk-core/src/main/groovy/whelk/WorkMerging.java similarity index 98% rename from whelk-core/src/main/groovy/whelk/WorkMerging/WorkMerging.java rename to whelk-core/src/main/groovy/whelk/WorkMerging.java index 99a2105145..6d9fb2e6d0 100644 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/WorkMerging.java 
+++ b/whelk-core/src/main/groovy/whelk/WorkMerging.java @@ -1,8 +1,4 @@ -package whelk.WorkMerging; - -import whelk.Document; -import whelk.IdGenerator; -import whelk.Whelk; +package whelk; import java.util.*; diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/DisjointSets.java b/whelk-core/src/main/groovy/whelk/WorkMerging/DisjointSets.java deleted file mode 100644 index 7cd3e33a6c..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/DisjointSets.java +++ /dev/null @@ -1,220 +0,0 @@ -package whelk.WorkMerging; - -import java.util.List; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; - -/** - * This class keeps track of a set of disjoint (non-overlapping) sets. - */ -public class DisjointSets { - /** - * Sets as forest of rooted trees. - * Pointer to parent in tree, root points to itself. - */ - List forest; - - /** - * Rank of each tree node (keeps trees balanced when merging). - */ - List ranks; - - /** - * Sets as circular linked lists (so that we can find all elements in a set). - * Pointer to the next element in the set. - */ - List sets; - - /** - * Map from set element value to index - */ - Map ixs; - - /** - * Map from set element index to value - */ - List ixToValue; - - public DisjointSets(int initialCapacity) { - forest = new ArrayList<>(initialCapacity); - ranks = new ArrayList<>(initialCapacity); - sets = new ArrayList<>(initialCapacity); - ixs = new HashMap<>(initialCapacity); - ixToValue = new ArrayList<>(initialCapacity); - } - - public DisjointSets() { - this(20); - } - - /** - * Create a new set if it doesn't already exist. 
- * - * @param e initial element in set - */ - public void createSet(T e) { - if (ixs.containsKey(e)) { - return; - } - - int ix = forest.size(); - ixs.put(e, ix); - forest.add(ix); - ranks.add(0); - sets.add(ix); - ixToValue.add(e); - - if (ix == Integer.MAX_VALUE) { - throw new IllegalStateException("size > Integer.MAX_VALUE"); - } - } - - /** - * Add a set, merging it with existing intersecting sets - * - * @param set a set to be added - */ - public void addSet(Iterable set) { - Iterator i = set.iterator(); - if (!i.hasNext()) { - return; - } - - T first = i.next(); - while (i.hasNext()) { - mergeSets(first, i.next()); - } - } - - /** - * Merge two sets identified by elements. - * Sets will be created if they don't exist - * - * @param a an element of the first set - * @param b an element of the second set - */ - public void mergeSets(T a, T b) { - if (!ixs.containsKey(a)) { - createSet(a); - } - if (!ixs.containsKey(b)) { - createSet(b); - } - - int ixA = ixs.get(a); - int ixB = ixs.get(b); - - int rootA = root(ixA); - int rootB = root(ixB); - - if (rootA == rootB) { - return; - } - - int rankA = ranks.get(rootA); - int rankB = ranks.get(rootB); - - if (rankA > rankB) { - forest.set(rootB, rootA); - } else { - forest.set(rootA, rootB); - if (rankA == rankB) { - ranks.set(rootB, rankB + 1); - } - } - - int link = sets.get(rootA); - sets.set(rootA, sets.get(rootB)); - sets.set(rootB, link); - } - - /** - * Lookup a set based on an element in the set - * - * @param e an element in the set - * @return the set - */ - public Set getSet(T e) { - if (!ixs.containsKey(e)) { - throw new IllegalArgumentException("No set with element: " + e); - } - - Set result = new HashSet<>(); - int start = sets.get(ixs.get(e)); - int node = start; - do { - result.add(ixToValue.get(node)); - node = sets.get(node); - } while (node != start); - - return result; - } - - /** - * Iterate over all sets - * - * @param visitor - */ - public void iterateAllSets(SetVisitor visitor) { - boolean[] 
visited = new boolean[sets.size()]; - - for (int ix : sets) { - if (visited[ix]) { - continue; - } - - int start = sets.get(ix); - int node = start; - do { - visited[node] = true; - visitor.nextElement(ixToValue.get(node)); - node = sets.get(node); - } while (node != start); - - visitor.closeSet(); - } - } - - /** - * @return a set with all sets - */ - public Set> allSets() { - final Set> result = new HashSet<>(); - - iterateAllSets(new SetVisitor() { - Set current = new HashSet<>(); - - public void closeSet() { - result.add(current); - current = new HashSet<>(); - } - - public void nextElement(T e) { - current.add(e); - } - }); - - return result; - } - - private int root(int node) { - while (node != forest.get(node)) { - int parent = forest.get(node); - //path splitting - point node to grandparent - forest.set(node, forest.get(parent)); - node = parent; - } - - return node; - } - - public interface SetVisitor { - void nextElement(T e); - - void closeSet(); - } -} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/Doc.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/Doc.groovy deleted file mode 100644 index c50a7abf85..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/Doc.groovy +++ /dev/null @@ -1,363 +0,0 @@ -package whelk.WorkMerging - -import se.kb.libris.Normalizers -import whelk.Document -import whelk.JsonLd -import whelk.Whelk - -import static whelk.WorkMerging.Util.asList - -class Doc { - public static final String SAOGF_SKÖN = 'https://id.kb.se/term/saogf/Sk%C3%B6nlitteratur' - public static final List MARC_FICTION = [ - 'https://id.kb.se/marc/FictionNotFurtherSpecified', - 'https://id.kb.se/marc/Drama', - 'https://id.kb.se/marc/Essay', - 'https://id.kb.se/marc/Novel', - 'https://id.kb.se/marc/HumorSatiresEtc', - 'https://id.kb.se/marc/Letter', - 'https://id.kb.se/marc/ShortStory', - 'https://id.kb.se/marc/MixedForms', - 'https://id.kb.se/marc/Poetry', - ] - public static final List MARC_NOT_FICTION = [ - 
'https://id.kb.se/marc/NotFictionNotFurtherSpecified', - 'https://id.kb.se/marc/Biography' - ] - public static final List DRAMA_GF = [ - 'https://id.kb.se/term/saogf/Dramatik', - 'https://id.kb.se/marc/Drama' - ] - - Whelk whelk - Document doc - Map work - Map framed - List titles - - //FIXME - Document ogDoc - - Doc(Whelk whelk, Document doc) { - this.whelk = whelk - this.doc = doc - this.ogDoc = doc.clone() - } - - Map getWork() { - if (!work) { - work = getWork(whelk, doc) - } - - return work - } - - static Map getWork(Whelk whelk, Document d) { - Map work = Normalizers.getWork(whelk.jsonld, d) - if (!work) { - throw new NoWorkException(d.shortId) - } - work = new HashMap<>(work) - - //TODO 'marc:fieldref' - - work.remove('@id') - return work - } - - Map workCopy() { - return getWork(whelk, doc.clone()) - } - - Map getMainEntity() { - return doc.data['@graph'][1] - } - - boolean isInstance() { - return getMainEntity().containsKey('instanceOf') - } - - List getTitleVariants() { - if (!titles) { - titles = Util.getTitleVariants(getMainEntity()['hasTitle']) - } - - return titles - } - - boolean hasGenericTitle() { - Util.hasGenericTitle(getMainEntity()['hasTitle']) - } - - private static String displayTitle(Map thing) { - thing['hasTitle'].collect { it['@type'] + ": " + it['flatTitle'] }.join(', ') - } - - String mainEntityDisplayTitle() { - displayTitle(['hasTitle': Util.flatTitles(getMainEntity()['hasTitle'])]) - } - - String link() { - String base = Document.getBASE_URI().toString() - String kat = "katalogisering/" - String id = doc.shortId - return base + kat + id - } - - boolean isMonograph() { - getMainEntity()['issuanceType'] == 'Monograph' - } - - boolean hasPart() { - getWork()['hasPart'] != null - } - - String encodingLevel() { - return doc.data['@graph'][0]['encodingLevel'] ?: '' - } - - int numPages() { - String extent = Util.getPathSafe(getMainEntity(), ['extent', 0, 'label', 0]) ?: Util.getPathSafe(getMainEntity(), ['extent', 0, 'label'], '') - return 
numPages(extent) - } - - // TODO: improve parsing https://metadatabyran.kb.se/beskrivning/materialtyper-arbetsfloden/tryckta-monografier/omfang-for-tryckta-monografier - static int numPages(String extentLabel) { - def l = extentLabel.replace('onumrerade', '') - def matcher = l =~ /(\d+)(?=[, \[\]0-9]*[sp])/ - List pages = [] - while (matcher.find()) { - pages << Integer.parseInt(matcher.group(1)) - } - pages ? pages.max() : -1 - } - - // TODO... - String getDisplayText(String field) { - if (field == 'contribution') { - return contributorStrings().join("
") - } else if (field == 'classification') { - return classificationStrings().join("
") - } else if (field == 'instance title') { - return isInstance() ? (getMainEntity()['hasTitle'] ?: '') : '' - } else if (field == 'work title') { - // To load hasTitle from linked work in instanceOf we can use getFramed() - // However we then need to handle that getFramed() loads linked instances in hasTitle.source - // Prefer getMainEntity() for now - return isInstance() ? (getMainEntity()['instanceOf']['hasTitle'] ?: '') : (getMainEntity()['hasTitle'] ?: '') - } else if (field == 'instance type') { - return isInstance() ? getMainEntity()['@type'] : '' - } else if (field == 'editionStatement') { - return getMainEntity()['editionStatement'] ?: '' - } else if (field == 'responsibilityStatement') { - return getMainEntity()['responsibilityStatement'] ?: '' - } else if (field == 'encodingLevel') { - return encodingLevel() - } else if (field == 'publication') { - return chipString(getMainEntity()['publication'] ?: []) - } else if (field == 'identifiedBy') { - return chipString(getMainEntity()['identifiedBy'] ?: []) - } else if (field == 'extent') { - return chipString(getMainEntity()['extent'] ?: []) - } else if (field == 'reproductionOf') { - return reproductionOfLink() - } else { - return chipString(getWork().getOrDefault(field, [])) - } - } - - protected String chipString(def thing) { - Util.chipString(thing, whelk) - } - - String tooltip(String string, String tooltip) { - """${string}""" - } - - private String reproductionOfLink() { - def shortId = Util.getPathSafe(getMainEntity(), ['reproductionOf', '@id']) - ?.tokenize("/#") - ?.dropRight(1) - ?.last() ?: '' - - return "$shortId" - } - - private List classificationStrings() { - List path = isInstance() ? ['instanceOf', 'classification'] : ['classification'] - List classification = Util.getPathSafe(getFramed(), path, []) - classification.collect() { c -> - StringBuilder s = new StringBuilder() - s.append(flatMaybeLinked(c['inScheme'], ['code', 'version']).with { it.isEmpty() ? 
it : it + ': ' }) - s.append(flatMaybeLinked(c, ['code'])) - return s.toString() - } - } - - private List contributorStrings() { - List path = isInstance() ? ['instanceOf', 'contribution'] : ['contribution'] - List contribution = Util.getPathSafe(getFramed(), path, []) - - return contribution.collect { Map c -> - contributionStr(c) - } - } - - protected Map getFramed() { - if (!framed) { - if (isInstance()) { - framed = JsonLd.frame(doc.getThingIdentifiers().first(), whelk.loadEmbellished(doc.shortId).data) - } else { - Document copy = doc.clone() - whelk.embellish(copy) - framed = JsonLd.frame(doc.getThingIdentifiers().first(), copy.data) - } - } - - return framed - } - - private String contributionStr(Map contribution) { - StringBuilder s = new StringBuilder() - - if (contribution['@type'] == 'PrimaryContribution') { - s.append('') - } - - s.append(flatMaybeLinked(contribution['role'], ['code', 'label']).with { it.isEmpty() ? it : it + ': ' }) - s.append(flatMaybeLinked(contribution['agent'], ['givenName', 'familyName', 'lifeSpan', 'name'])) - - if (contribution['@type'] == 'PrimaryContribution') { - s.append('') - } - - return s.toString() - } - - static String flatten(Object o, List order, String mapSeparator = ': ') { - if (o instanceof String) { - return o - } - if (o instanceof List) { - return o - .collect { flatten(it, order) } - .join(' || ') - } - if (o instanceof Map) { - return order - .findResults { ((Map) o).get(it) } - .collect { flatten(it, order) } - .join(mapSeparator) - } - - throw new RuntimeException(String.format("unexpected type: %s for %s", o.class.getName(), o)) - } - - private String flatMaybeLinked(Object thing, List order) { - if (!thing) - return '' - - if (thing instanceof List) { - return thing.collect { flatMaybeLinked(it, order) }.join(' | ') - } - String s = flatten(thing, order, ', ') - - thing['@id'] - ? 
"""$s""" - : s - } - - boolean isFiction() { - isMarcFiction() || isSaogfFiction() || isSabFiction() - } - - boolean isMarcFiction() { - (getWork()['genreForm'] ?: []).any { it['@id'] in MARC_FICTION } - } - - boolean isMarcNotFiction() { - (getWork()['genreForm'] ?: []).any { it['@id'] in MARC_NOT_FICTION } - } - - boolean isSaogfFiction() { - (getWork()['genreForm'] ?: []).any { whelk.relations.isImpliedBy(SAOGF_SKÖN, it['@id'] ?: '') } - } - - boolean isSabFiction() { - classificationStrings().any { it.contains('kssb') && it.contains(': H') } - } - - boolean isNotFiction() { - // A lot of fiction has marc/NotFictionNotFurtherSpecified but then classification is usually empty - isMarcNotFiction() && (!classificationStrings().isEmpty() && !isSabFiction()) - } - - boolean isText() { - getWork()['@type'] == 'Text' - } - - boolean isTranslationWithoutTranslator() { - isTranslation() && !hasTranslator() - } - - boolean isTranslation() { - getWork()['translationOf'] - } - - boolean isSabDrama() { - classificationStrings().any { it.contains(': Hc.02') || it.contains(': Hce.02') } - } - - boolean isGfDrama() { - asList(getWork()['genreForm']).any { it['@id'] in DRAMA_GF } - } - - boolean isDrama() { - isSabDrama() || isGfDrama() - } - - boolean hasRole(String relatorIri) { - asList(getWork()['contribution']).any { - asList(it['role']).contains(['@id': relatorIri]) - } - } - - boolean hasTranslator() { - hasRole('https://id.kb.se/relator/translator') - } - - boolean hasDistinguishingEdition() { - (getMainEntity()['editionStatement'] ?: '').toString().toLowerCase().contains("förk") - } - - boolean hasRelationshipWithContribution() { - asList(getWork()['relationship']).any { r -> - asList(r['entity']).any { e -> - e.containsKey('contribution') - } - } - } - - void addComparisonProps() { - if (hasDistinguishingEdition()) { - addToWork('editionStatement') - } - getWork()['_numPages'] = numPages() - } - - void moveSummaryToInstance() { - if (getWork()['summary']) { - 
getMainEntity()['summary'] = asList(getMainEntity()['summary']) + asList(getWork()['summary']) - getWork().remove('summary') - } - } - - void addToWork(String field) { - getWork()[field] = getMainEntity()[field] - } - - void removeComparisonProps() { - getWork().remove('editionStatement') - getWork().remove('_numPages') - } -} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/DocumentComparator.java b/whelk-core/src/main/groovy/whelk/WorkMerging/DocumentComparator.java deleted file mode 100644 index 2adb902b0b..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/DocumentComparator.java +++ /dev/null @@ -1,231 +0,0 @@ -package whelk.WorkMerging; - -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Stack; -import java.util.function.Function; - -public class DocumentComparator { - private static final Comparator BY_HASH = (o1, o2) -> o2.hashCode() - o1.hashCode(); - - private final Function isOrderedList; - - public DocumentComparator() { - this(o -> "termComponentList".equals(o)); - } - - public DocumentComparator(Function isOrderedList) { - if (isOrderedList == null) - throw new NullPointerException(); - this.isOrderedList = isOrderedList; - } - - public boolean isEqual(Map a, Map b) { - if (a == null || b == null || a.size() != b.size()) { - return false; - } - for (Object key : a.keySet()) { - if (!isEqual(a.get(key), b.get(key), key)) { - return false; - } - } - return true; - } - - private boolean isEqual(Object a, Object b, Object key) { - if (a == null || b == null) { - return false; - } - else if (a.getClass() != b.getClass()) { - return (isSingleItemList(a) && isEqual(((List) a).get(0), b, key) - || (isSingleItemList(b) && isEqual(a, ((List) b).get(0), key))); - } - else if (a instanceof Map) { - return isEqual((Map) a, (Map) b); - } - else if (a instanceof List) { - if (isOrderedList.apply(key)) { - return isEqualOrdered((List) a, (List) 
b); - } else { - return isEqualUnordered((List) a, (List) b); - } - } - else { - return a.equals(b); - } - } - - private boolean isSingleItemList(Object o) { - return o instanceof List && ((List) o).size() == 1; - } - - private boolean isEqualOrdered(List a, List b) { - if (a.size() != b.size()) { - return false; - } - for (int i = 0; i < a.size(); i++) { - if (!isEqual(a.get(i), b.get(i), null)) { - return false; - } - } - return true; - } - - private boolean isEqualUnordered(List a, List b) { - if (a.size() != b.size()) { - return false; - } - - a.sort(BY_HASH); - b.sort(BY_HASH); - - List taken = new ArrayList<>(a.size()); - nextA: for (int i = 0 ; i < a.size() ; i++) { - for (int j = 0 ; j < b.size() ; j++) { - if (!taken.contains(j) && isEqual(a.get(i), b.get(j), null)) { - taken.add(j); - continue nextA; - } - } - return false; - } - - return true; - } - - public boolean isSubset(Map a, Map b) { - if (a == null || b == null || a.size() > b.size()) { - return false; - } - for (Object key : a.keySet()) { - if (!isSubset(a.get(key), b.get(key), key)) { - return false; - } - } - return true; - } - - private boolean isSubset(Object a, Object b, Object key) { - if (a == null || b == null || a.getClass() != b.getClass()) { - return false; - } - else if (a instanceof Map) { - return isSubset((Map) a, (Map) b); - } - else if (a instanceof List) { - if (isOrderedList.apply(key)) { - return isOrderedSubset((List) a, (List) b); - } else { - return isUnorderedSubset((List) a, (List) b); - } - } - else { - return a.equals(b); - } - } - - private boolean isOrderedSubset(List a, List b) { - if (a.size() > b.size()) { - return false; - } - int ixB = 0; - for (int ixA = 0; ixA < a.size(); ixA++) { - if (ixB == b.size()) { - return false; - } - - while (!isSubset(a.get(ixA), b.get(ixB++), null)) { - if (ixB == b.size()) { - return false; - } - } - } - return true; - } - - private boolean isUnorderedSubset(List a, List b) { - return new UnorderedListComparator(a, b).isSubset(); 
- } - - private class UnorderedListComparator { - List a; - List b; - - Stack stack; - Stack matched; - boolean anyMatch; - Boolean[][] cache; - - UnorderedListComparator(List a, List b) { - this.a = a; - this.b = b; - cache = new Boolean[a.size()][b.size()]; - } - - boolean isSubset() { - // since elements in 'a' might be subsets of more than one element - // in 'b' we must try different ways of matching elements - stack = new Stack<>(); - matched = new Stack<>(); - - nextA(); - while (stack.size() > 0) { - boolean match = isSubset(ixA(), ixB()); - nextB(); - if (match) { - anyMatch = true; - if (!matched.contains(ixB())) { - matched.push(ixB()); - if (matched.size() == a.size()) { - return true; - } - nextA(); - } - } - - while (ixB() == b.size()) { - if (!anyMatch) { - return false; - } - previousA(); - } - } - - return false; - } - - private boolean isSubset(int ixA, int ixB) { - if (cache[ixA][ixB] == null) { - cache[ixA][ixB] = DocumentComparator.this.isSubset(a.get(ixA), b.get(ixB), null); - } - - return cache[ixA][ixB]; - } - - private void previousA() { - stack.pop(); - if (matched.size() > 0) { - matched.pop(); - } - } - - private void nextA() { - stack.push(0); - anyMatch = false; - } - - private void nextB() { - stack.push(stack.pop() + 1); - } - - private int ixA() { - return stack.size() - 1; - } - - private int ixB() { - return stack.size() > 0 ? 
stack.peek() : -1; - } - } -} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/FieldStatus.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/FieldStatus.groovy deleted file mode 100644 index a33445d1b4..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/FieldStatus.groovy +++ /dev/null @@ -1,7 +0,0 @@ -package whelk.WorkMerging - -enum FieldStatus { - EQUAL, - COMPATIBLE, - DIFF -} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/Html.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/Html.groovy deleted file mode 100644 index c313415618..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/Html.groovy +++ /dev/null @@ -1,111 +0,0 @@ -package whelk.WorkMerging - -import org.apache.commons.codec.digest.DigestUtils - -import static whelk.WorkMerging.FieldStatus.COMPATIBLE -import static whelk.WorkMerging.FieldStatus.DIFF -import static whelk.WorkMerging.FieldStatus.EQUAL - -class Html { - private static String CSS = Html.class.getClassLoader() - .getResourceAsStream('merge-works/table.css').getText("UTF-8") - - static final String START = """ - - - """ - static final String END = '' - static final String HORIZONTAL_RULE = "

\n" - - static def infoFields = ['reproductionOf', 'instance title', 'work title', 'instance type', 'editionStatement', 'responsibilityStatement', 'encodingLevel', 'publication', 'identifiedBy', 'extent'] - - static String clusterTable(Collection cluster) { - String id = clusterId(cluster.collect { it.doc.shortId }) - String header = """ - - ${id} - ${cluster.collect { doc -> "${doc.doc.shortId}" }.join('\n')} - - - - ${cluster.collect { doc -> "${doc.mainEntityDisplayTitle()}" }.join('\n')} - - """.stripIndent() - - def statuses = WorkComparator.compare(cluster) - - String info = infoFields.collect(fieldRows(cluster, "info")).join('\n') - String equal = statuses.get(EQUAL, []).collect(fieldRows(cluster, cluster.size() > 1 ? EQUAL.toString() : "")).join('\n') - String compatible = statuses.get(COMPATIBLE, []).collect(fieldRows(cluster, COMPATIBLE.toString())).join('\n') - String diff = statuses.get(DIFF, []).collect(fieldRows(cluster, DIFF.toString())).join('\n') - - return """ - - ${header} - ${equal} - ${compatible} - ${diff} - ${info} -
-

- """ - } - - static String hubTable(List> docs) { - def mergedWorks = docs*.first() - def ids = docs.collect { group -> - group.drop(1).collectEntries { doc -> - [doc.doc.shortId, doc.link()] - } - } - def clusterId = clusterId(ids*.keySet().flatten()) - - String header = """ - - ${clusterId} - ${mergedWorks.collect { "" }.join('\n')} - - """.stripIndent() - - String derivedFrom = - """ - - _derivedFrom - ${ids.collect { "${it.collect { id, link -> "$id" }.join('\n')}" }.join('\n')} - - """.stripIndent() - - def statuses = WorkComparator.compare(mergedWorks) - - String equal = statuses.get(EQUAL, []).collect(fieldRows(mergedWorks, mergedWorks.size() > 1 ? EQUAL.toString() : "")).join('\n') - String compatible = statuses.get(COMPATIBLE, []).collect(fieldRows(mergedWorks, COMPATIBLE.toString())).join('\n') - String diff = statuses.get(DIFF, []).collect(fieldRows(mergedWorks, DIFF.toString())).join('\n') - - return """ - - ${header} - ${equal} - ${compatible} - ${diff} - ${derivedFrom} -
-

- """ - } - - static String clusterId(Collection cluster) { - cluster - ? DigestUtils.md5Hex(cluster.sort().first()).toUpperCase().substring(0, 12) - : "" - } - - private static def fieldRows(Collection cluster, String cls) { - { field -> - """ - - ${field} - ${cluster.collect { "${it.getDisplayText(field)}" }.join('\n')} - """.stripIndent() - } - } -} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/Util.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/Util.groovy deleted file mode 100644 index 07a876cff3..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/Util.groovy +++ /dev/null @@ -1,306 +0,0 @@ -package whelk.WorkMerging - -import org.apache.commons.lang3.StringUtils -import whelk.Whelk -import whelk.util.Unicode - -import java.util.regex.Pattern - -class Util { - static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', 'marc:parallelTitle', 'marc:equalTitle'] - - static def titleVariant = ['Title', 'ParallelTitle'] - // removed 'VariantTitle', 'CoverTitle' since they sometimes contain random generic stuff like "Alibis filmroman", "Kompisböcker för de yngsta" - - static enum Relator { - TRANSLATOR('https://id.kb.se/relator/translator'), - AUTHOR('https://id.kb.se/relator/author'), - ILLUSTRATOR('https://id.kb.se/relator/illustrator'), - AUTHOR_OF_INTRO('https://id.kb.se/relator/authorOfIntroduction'), - ADAPTER('https://id.kb.se/relator/adapter'), - COVER_DESIGNER('https://id.kb.se/relator/coverDesigner'), - COMPILER('https://id.kb.se/relator/compiler'), - AUTHOR_OF_AFTERWORD('https://id.kb.se/relator/authorOfAfterwordColophonEtc'), - PHOTOGRAPHER('https://id.kb.se/relator/photographer'), - EDITOR('https://id.kb.se/relator/editor'), - UNSPECIFIED_CONTRIBUTOR('https://id.kb.se/relator/unspecifiedContributor'), - PRIMARY_RIGHTS_HOLDER('https://id.kb.se/relator/primaryRightsHolder') - - String iri - - private Relator(String iri) { - this.iri = iri - } - } - -// private static Set 
IGNORED_SUBTITLES = WorkToolJob.class.getClassLoader() -// .getResourceAsStream('merge-works/ignored-subtitles.txt') -// .readLines().grep().collect(Util.&normalize) as Set - - private static Set GENERIC_TITLES = WorkToolJob.class.getClassLoader() - .getResourceAsStream('merge-works/generic-titles.txt') - .readLines().grep().collect(Util.&normalize) as Set - - static def noise = - [",", '"', "'", '[', ']', ',', '.', '.', ':', ';', '-', '(', ')', ' the ', '-', '–', '+', '!', '?'].collectEntries { [it, ' '] } - - - static List asList(Object o) { - (o ?: []).with { it instanceof List ? it : [it] } - } - - /** - * Partition a collection based on equality condition - * - * NOTE: O(n^2)... - */ - static Collection> partition(Collection collection, Closure matcher) { - List> result = [] - - for (T t : collection) { - boolean match = false - for (List group : result) { - if (groupMatches(t, group, matcher)) { - group.add(t) - match = true - break - } - } - - if (!match) { - result.add([t]) - } - } - return result - } - - static boolean groupMatches(T t, List group, Closure matcher) { - for (T other : group) { - if (matcher(other, t)) { - return true - } - } - return false - } - - static boolean hasGenericTitle(List hasTitle) { - hasTitle.any { it['mainTitle'] && normalize((String) it['mainTitle']) in GENERIC_TITLES } - } - - static List dropSubTitles(List hasTitle) { - hasTitle.collect { t -> - def copy = new TreeMap(t) - copy.subMap(copy.keySet() - ['subtitle', 'titleRemainder']) - } - } - -// static List dropGenericSubTitles(List hasTitle) { -// hasTitle.collect { -// def copy = new TreeMap(it) -// if (copy['subtitle'] || copy['titleRemainder']) { -// DocumentUtil.traverse(copy) { value, path -> -// if (('subtitle' in path || 'titleRemainder' in path) && value instanceof String && genericSubtitle(value)) { -// new DocumentUtil.Remove() -// } -// } -// } -// copy -// } -// } - - static List flatTitles(List hasTitle) { - dropSubTitles(hasTitle).collect { - def title = new 
TreeMap<>() - title['flatTitle'] = normalize(Doc.flatten(it, titleComponents)) - if (it['@type']) { - title['@type'] = it['@type'] - } - - title - } - } - -// private static boolean genericSubtitle(String s) { -// s = Util.normalize(s) -// if (s.startsWith("en ")) { -// s = s.substring("en ".length()) -// } -// return s in IGNORED_SUBTITLES -// } - - static String normalize(String s) { - return Unicode.asciiFold(Unicode.normalizeForSearch(StringUtils.normalizeSpace(" $s ".toLowerCase().replace(noise)))) - } - - static Object getPathSafe(item, path, defaultTo = null) { - for (p in path) { - if ((item instanceof Collection || item instanceof Map) && item[p] != null) { - item = item[p] - } else { - return defaultTo - } - } - return item - } - - - static List getTitleVariants(List hasTitle) { - flatTitles(hasTitle) - .grep { it['@type'] in titleVariant } - .collect { it['flatTitle'] } - } - - static String chipString(def thing, Whelk whelk) { - if (thing instanceof Integer) { - return thing - } - - def chips = whelk.jsonld.toChip(thing) - if (chips.size() < 2) { - chips = thing - } - if (chips instanceof List) { - return chips.collect { valuesString(it) }.sort().join('
') - } - return valuesString(chips) - } - - private static String valuesString(def thing) { - if (thing instanceof List) { - return thing.collect { valuesString(it) }.join(' • ') - } - if (thing instanceof Map) { - return thing.findAll { k, v -> k != '@type' }.values().collect { valuesString(it) }.join(' • ') - } - return thing.toString() - } - - // (docs on some of these levels are normally filtered out before we reach here) - static List bestEncodingLevel = [ - 'marc:FullLevel', - 'marc:FullLevelMaterialNotExamined', - 'marc:MinimalLevel', - 'marc:LessThanFullLevelMaterialNotExamined', - 'marc:CoreLevel', - 'marc:AbbreviatedLevel', - 'marc:PartialPreliminaryLevel', - 'marc:PrepublicationLevel', - null - ] - - // Return the most common title for the best encodingLevel - static Object bestTitle(Collection docs) { - def isTitle = { it.'@type' == 'Title' } - def addSource = { t, d -> t.plus(['source': [d.getMainEntity().subMap('@id')]]) } - - for (def level : bestEncodingLevel) { - def titles = docs - .findAll { it.encodingLevel() == level } - .collect { d -> - d.getWork().get('hasTitle')?.findAll(isTitle) - ?: d.getMainEntity().get('hasTitle')?.findResults { isTitle(it) ? 
addSource(it, d) : null } - } - .grep() - - if (!titles) { - continue - } - - titles = titles.collect(Util.&dropSubTitles) - return partition(titles, { a, b -> a == b }).sort { it.size() }.reverse().first().first() - } - - return null - } - - static Map>> parseRespStatement(String respStatement) { - def parsedContributions = [:] - - respStatement.split(';').eachWithIndex { part, i -> - // TODO: generalize for other material types - parseSwedishFictionContribution(StringUtils.normalizeSpace(part), i == 0).each { name, roles -> - parsedContributions - .computeIfAbsent(name, r -> []) - .addAll(roles) - } - } - - return parsedContributions - } - - private static Map>> parseSwedishFictionContribution(String contribution, boolean isFirstPart) { - def roleToPattern = - [ - (Relator.TRANSLATOR) : ~/(bemynd(\w+|\.)? )?öf?v(\.|ers(\.|\p{L}+)?)( (till|från) \p{L}+)?|(till svenskan?|från \p{L}+)|svensk text/, - (Relator.AUTHOR) : ~/^(text(e[nr])?|skriven|written)/, - (Relator.ILLUSTRATOR) : ~/\bbild(er)?|ill(\.|ustr(\.|\w+)?)|\bvi(gn|nj)ett(er|ill)?|ritad/, - (Relator.AUTHOR_OF_INTRO) : ~/förord|inl(edn(\.|ing)|edd)/, - (Relator.COVER_DESIGNER) : ~/omslag/, - (Relator.AUTHOR_OF_AFTERWORD): ~/efter(ord|skrift)/, - (Relator.PHOTOGRAPHER) : ~/\bfoto\w*\.?/, - (Relator.EDITOR) : ~/red(\.(?! av)|aktör(er)?)|\bbearb(\.|\w+)?|återberättad|sammanställ\w*/, - ] - - def rolePattern = ~/((?iu)${roleToPattern.values().join('|')})/ - def followsRolePattern = ~/(:| a[fv]| by) / - def initialPattern = ~/\p{Lu}/ - def namePattern = ~/\p{Lu}:?\p{Ll}+('\p{Ll})?(,? [Jj](r|unior))?/ - def betweenNamesPattern = ~/-| |\. 
?| (de(l| la)?|von|van( de[nr])?|v\.|le|af|du|dos) | [ODdLl]'/ - def fullNamePattern = ~/(($initialPattern|$namePattern)($betweenNamesPattern)?)*$namePattern/ - def conjPattern = ~/ (och|&|and) / - def roleAfterNamePattern = ~/( ?\(($rolePattern$conjPattern)?$rolePattern\))/ - def fullContributionPattern = ~/(($rolePattern($conjPattern|\/))*$rolePattern$followsRolePattern)?$fullNamePattern($conjPattern$fullNamePattern)*$roleAfterNamePattern?/ - - // Make roles lower case so that they can't be mistaken for names - contribution = (contribution =~ rolePattern)*.first() - .collectEntries { [it, it.toLowerCase()] } - .with { contribution.replace(it) } - - def nameToRoles = [:] - - def matched = (contribution =~ fullContributionPattern)*.first() - - matched.each { m -> - // Extract roles from the contribution - def roles = roleToPattern - .findAll { k, v -> m =~ /(?iu)$v/ } - .with { - it.isEmpty() && contribution =~ /.+$followsRolePattern/ - ? [new Tuple2(Relator.UNSPECIFIED_CONTRIBUTOR, isFirstPart)] - : it.collect { role, pattern -> new Tuple2(role, isFirstPart) } - } - - // Author should be the role if first part of respStatement (before ';') and no role seems to be stated - if (roles.isEmpty() && isFirstPart) { - roles << new Tuple2(Relator.AUTHOR, isFirstPart) - } - - // Extract names from the contribution - def names = parseNames(fullNamePattern, conjPattern, m) - - // Assign the roles to each name - nameToRoles.putAll(names.collectEntries { [it, roles] }) - } - - return nameToRoles - } - - private static List parseNames(Pattern namePattern, Pattern conjPattern, String s) { - def names = [] - - (s =~ namePattern).each { - def name = it.first() - // Handle the case of "Jan och Maria Larsson" - def previousName = names.isEmpty() ? 
null : names.last() - if (previousName?.split()?.size() == 1 && s =~ /$previousName$conjPattern$name/) { - def nameParts = name.split() - if (nameParts.size() > 1) { - names[-1] += " ${nameParts.last()}" - } - } - names << name - } - - return names - } -} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/WorkComparator.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/WorkComparator.groovy deleted file mode 100644 index faa369e9d9..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/WorkComparator.groovy +++ /dev/null @@ -1,136 +0,0 @@ -package whelk.WorkMerging - -import whelk.WorkMerging.compare.Classification -import whelk.WorkMerging.compare.SameOrEmpty -import whelk.WorkMerging.compare.Default -import whelk.WorkMerging.compare.Extent -import whelk.WorkMerging.compare.FieldHandler -import whelk.WorkMerging.compare.GenreForm -import whelk.WorkMerging.compare.StuffSet -import whelk.WorkMerging.compare.Subject -import whelk.WorkMerging.compare.TranslationOf -import whelk.WorkMerging.compare.ValuePicker -import whelk.WorkMerging.compare.WorkTitle - -import static whelk.WorkMerging.Util.bestTitle - -class WorkComparator { - Set fields - DocumentComparator c = new DocumentComparator() - - Map comparators = [ - 'classification' : new Classification(), - 'contentType' : new SameOrEmpty('https://id.kb.se/term/rda/Text'), - 'genreForm' : new GenreForm(), - 'hasTitle' : new WorkTitle(), - 'intendedAudience': new SameOrEmpty('https://id.kb.se/marc/Juvenile'), - '_numPages' : new Extent(), - 'subject' : new Subject(), - 'summary' : new StuffSet(), - 'translationOf' : new TranslationOf(), - ] - - static FieldHandler DEFAULT = new Default() - - WorkComparator(Set fields) { - this.fields = new HashSet<>(fields) - } - - boolean sameWork(Doc a, Doc b) { - fields.every { compare(a, b, it).with { it == EQUAL || it == COMPATIBLE } } - } - - FieldStatus compare(Doc a, Doc b, String field) { - Object oa = a.getWork().get(field) - Object ob = 
b.getWork().get(field) - - if (oa == null && ob == null) { - return FieldStatus.EQUAL - } - - compareExact(oa, ob, field) == FieldStatus.EQUAL - ? FieldStatus.EQUAL - : compareDiff(a, b, field) - } - - Map merge(Collection docs) { - Map result = [:] - - if (docs.size() > 1) { - fields.each { field -> - FieldHandler h = comparators.getOrDefault(field, DEFAULT) - def value = h instanceof ValuePicker - ? h.pick(docs) - : mergeField(field, h, docs) - - if (value) { - result[field] = value - } - } - } else { - result = docs[0].workCopy() - } - - if (!result['hasTitle']) { - def bestTitle = bestTitle(docs) - if (bestTitle) { - result['hasTitle'] = bestTitle - } - } - - return result - } - - // TODO: preserve order? e.g. subject - private Object mergeField(String field, FieldHandler h, Collection docs) { - Object value = docs.first().getWork().get(field) - def rest = docs.drop(1) - rest.each { - value = h.merge(value, it.getWork().get(field)) - } - return value - } - - private FieldStatus compareDiff(Doc a, Doc b, String field) { - comparators.getOrDefault(field, DEFAULT).isCompatible(a.getWork().get(field), b.getWork().get(field)) - ? FieldStatus.COMPATIBLE - : FieldStatus.DIFF - } - - private FieldStatus compareExact(Object oa, Object ob, String field) { - c.isEqual([(field): oa], [(field): ob]) ? 
FieldStatus.EQUAL : FieldStatus.DIFF - } - - static Map> compare(Collection cluster) { - WorkComparator c = new WorkComparator(allFields(cluster)) - - Map> result = [:] - c.fieldStatuses(cluster).each { f, s -> result.get(s, []) << f } - return result - } - - static Set allFields(Collection cluster) { - Set fields = new HashSet<>() - cluster.each { fields.addAll(it.getWork().keySet()) } - return fields - 'summary' // - 'summary' only temporary, remove when summaries have been moved to instance (LXL-3303) - } - - Map fieldStatuses(Collection cluster) { - fields.collectEntries { [it, fieldStatus(cluster, it)] } - } - - FieldStatus fieldStatus(Collection cluster, String field) { - boolean anyCompat = false - [cluster, cluster].combinations().findResult { List combination -> - Doc a = combination.first() - Doc b = combination.last() - - def c = compare(a, b, field) - if (c == FieldStatus.COMPATIBLE) { - anyCompat = true - } - c == FieldStatus.DIFF ? c : null - } ?: (anyCompat ? FieldStatus.COMPATIBLE : FieldStatus.EQUAL) - } - -} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/WorkToolJob.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/WorkToolJob.groovy deleted file mode 100644 index d30deeb9ca..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/WorkToolJob.groovy +++ /dev/null @@ -1,797 +0,0 @@ -package whelk.WorkMerging - - -import whelk.Document -import whelk.IdGenerator -import whelk.JsonLd -import whelk.Whelk -import whelk.exception.WhelkRuntimeException -import whelk.util.LegacyIntegrationTools -import whelk.util.Statistics - -import java.text.SimpleDateFormat -import java.util.concurrent.ExecutorService -import java.util.concurrent.Executors -import java.util.concurrent.TimeUnit -import java.util.concurrent.atomic.AtomicInteger -import java.util.function.Function - -import static whelk.WorkMerging.FieldStatus.DIFF - -import static whelk.WorkMerging.Util.asList -import static 
whelk.WorkMerging.Util.chipString -import static whelk.WorkMerging.Util.getPathSafe -import static whelk.WorkMerging.Util.normalize -import static whelk.WorkMerging.Util.partition -import static whelk.WorkMerging.Util.parseRespStatement -import static whelk.WorkMerging.Util.Relator - -class WorkToolJob { - Whelk whelk - Statistics statistics - File clusters - - String date = new SimpleDateFormat('yyyyMMdd-HHmmss').format(new Date()) - String jobId = IdGenerator.generate() - File reportDir = new File("reports/$date/merged-works") - - String changedIn = "xl" - String changedBy = "SEK" - String generationProcess = 'https://libris.kb.se/sys/merge-works' - boolean dryRun = true - boolean skipIndex = false - boolean loud = false - boolean verbose = false - - WorkToolJob(File clusters) { - this.clusters = clusters - - this.whelk = Whelk.createLoadedSearchWhelk('secret', true) - this.statistics = new Statistics() - } - - public static Closure qualityMonographs = { Doc doc -> - (doc.isText() - && doc.isMonograph() - && !doc.hasPart() - && (doc.encodingLevel() != 'marc:PartialPreliminaryLevel' && doc.encodingLevel() != 'marc:PrepublicationLevel')) - && !doc.hasRelationshipWithContribution() - } - - void show() { - println(Html.START) - run({ cluster -> - return { - try { - Collection> docs = titleClusters(cluster) - - if (docs.isEmpty() || docs.size() == 1 && docs.first().size() == 1) { - return - } - - println(docs - .collect { it.sort { a, b -> a.getWork()['@type'] <=> b.getWork()['@type'] } } - .collect { it.sort { it.numPages() } } - .collect { Html.clusterTable(it) } - .join('') + Html.HORIZONTAL_RULE - ) - } - catch (NoWorkException e) { - System.err.println(e.getMessage()) - } - catch (Exception e) { - System.err.println(e.getMessage()) - e.printStackTrace(System.err) - } - } - }) - println(Html.END) - } - - void showWorks() { - println(Html.START) - run({ cluster -> - return { - try { - println(mergedWorks(titleClusters(cluster)).findAll { it.derivedFrom.size() > 1 } 
- .collect { [new Doc(whelk, it.work)] + it.derivedFrom } - .collect { Html.clusterTable(it) } - .join('') + Html.HORIZONTAL_RULE - ) - } - catch (Exception e) { - System.err.println(e.getMessage()) - e.printStackTrace(System.err) - } - } - }) - println(Html.END) - } - - void showHubs() { - println(Html.START) - run({ cluster -> - return { - try { - def hub = mergedWorks(titleClusters(cluster)) - .collect { [new Doc(whelk, it.work)] + it.derivedFrom } - if (hub.size() > 1) { - println(Html.hubTable(hub) + Html.HORIZONTAL_RULE) - } - } - catch (Exception e) { - System.err.println(e.getMessage()) - e.printStackTrace(System.err) - } - } - }) - println(Html.END) - } - - void merge() { - def s = statistics.printOnShutdown() - reportDir.mkdirs() - - run({ cluster -> - return { - def titles = titleClusters(cluster) - def works = mergedWorks(titles) - - works.each { - if (it.derivedFrom.size() > 1) { - store(it) - } - } - - String report = htmlReport(titles, works) - - new File(reportDir, "${Html.clusterId(cluster)}.html") << report - works.each { - s.increment('num derivedFrom', "${it.derivedFrom.size()}", it.work.shortId) - new File(reportDir, "${it.work.shortId}.html") << report - } - } - }) - } - - void revert() { - run({ cluster -> - return { - def docs = cluster.collect(whelk.&getDocument).grep() - - Set works = [] - - docs.each { Document d -> - def sum = d.getChecksum(whelk.jsonld) - works << getPathSafe(d.data, d.workIdPath) - def revertTo = whelk.storage.loadAllVersions(d.shortId) - .reverse() - .find { v -> getPathSafe(v.data, v.workIdPath) == null } - d.data = revertTo.data - d.setGenerationDate(new Date()) - d.setGenerationProcess(generationProcess) - whelk.storeAtomicUpdate(d, !loud, changedIn, changedBy, sum) - } - - works.grep().each { - def shortId = it.split("[#/]")[-2] - whelk.remove(shortId, changedIn, changedBy) - } - } - }) - } - - String htmlReport(Collection> titleClusters, Collection works) { - if (titleClusters.isEmpty() || titleClusters.size() == 
1 && titleClusters.first().size() == 1) { - return "" - } - - StringBuilder s = new StringBuilder() - - s.append(Html.START) - s.append("

Title cluster(s)

") - titleClusters.each { it.each { it.addComparisonProps() } } - - titleClusters - .collect { it.sort { a, b -> a.getWork()['@type'] <=> b.getWork()['@type'] } } - .collect { it.sort { it.numPages() } } - .each { - s.append(Html.clusterTable(it)) - s.append(Html.HORIZONTAL_RULE) - } - titleClusters.each { it.each { it.removeComparisonProps() } } - - s.append("

Extracted works

") - works.collect { [new Doc(whelk, it.work)] + it.derivedFrom } - .each { s.append(Html.clusterTable(it)) } - - s.append(Html.END) - return s.toString() - } - - class MergedWork { - Document work - Collection derivedFrom - } - - private Document buildWorkDocument(Map workData) { - String workId = IdGenerator.generate() - - workData['@id'] = "TEMPID#it" - Document d = new Document([ - "@graph": [ - [ - "@id" : "TEMPID", - "@type" : "Record", - "mainEntity" : ["@id": "TEMPID#it"], - "technicalNote": [[ - "@type" : "TechnicalNote", - "hasNote": [[ - "@type": "Note", - "label": ["Maskinellt utbrutet verk... TODO"] - ]], - "uri" : ["http://xlbuild.libris.kb.se/works/$date/merged-works/${workId}.html".toString()] - - ] - ]], - workData - ] - ]) - - d.setGenerationDate(new Date()) - d.setGenerationProcess(generationProcess) - d.deepReplaceId(Document.BASE_URI.toString() + workId) - return d - } - - private void store(MergedWork work) { - if (!dryRun) { - whelk.setSkipIndex(skipIndex) - if (!whelk.createDocument(work.work, changedIn, changedBy, - LegacyIntegrationTools.determineLegacyCollection(work.work, whelk.getJsonld()), false)) { - throw new WhelkRuntimeException("Could not store new work: ${work.work.shortId}") - } - - String workIri = work.work.thingIdentifiers.first() - - work.derivedFrom - .collect { it.ogDoc } - .each { - def sum = it.getChecksum(whelk.jsonld) - it.data[JsonLd.GRAPH_KEY][1]['instanceOf'] = [(JsonLd.ID_KEY): workIri] - it.setGenerationDate(new Date()) - it.setGenerationProcess(generationProcess) - whelk.storeAtomicUpdate(it, !loud, changedIn, changedBy, sum) - } - } - } - - private Collection mergedWorks(Collection titleClusters) { - def works = [] - titleClusters.each { titleCluster -> - titleCluster.sort { it.numPages() } - WorkComparator c = new WorkComparator(WorkComparator.allFields(titleCluster)) - - works.addAll(partition(titleCluster, { Doc a, Doc b -> c.sameWork(a, b) }) - .each { work -> work.each { doc -> doc.removeComparisonProps() } 
} - .collect { new MergedWork(work: buildWorkDocument(c.merge(it)), derivedFrom: it) }) - } - - return works - } - - - void subTitles() { - statistics.printOnShutdown(10) - run({ cluster -> - return { - String titles = cluster.collect(whelk.&getDocument).collect { - getPathSafe(it.data, ['@graph', 1, 'hasTitle', 0, 'subtitle']) - }.grep().join('\n') - - if (!titles.isBlank()) { - println(titles + '\n') - } - } - }) - } - - void printInstanceValue(String field) { - run({ cluster -> - return { - String values = cluster.collect(whelk.&getDocument).collect { - "${it.shortId}\t${getPathSafe(it.data, ['@graph', 1, field])}" - }.join('\n') - - println(values + '\n') - } - }) - } - - void fictionNotFiction() { - run({ cluster -> - return { - Collection> titleClusters = titleClusters(cluster) - - for (titleCluster in titleClusters) { - if (titleCluster.size() > 1) { - def statuses = WorkComparator.compare(cluster) - if (!statuses[DIFF].contains('contribution')) { - String gf = titleCluster.collect { it.getDisplayText('genreForm') }.join(' ') - if (gf.contains('marc/FictionNotFurtherSpecified') && gf.contains('marc/NotFictionNotFurtherSpecified')) { - println(titleCluster.collect { it.getDoc().shortId }.join('\t')) - } - } - } - } - } - }) - } - - void swedishFiction() { - def swedish = { Doc doc -> - Util.asList(doc.getWork()['language']).collect { it['@id'] } == ['https://id.kb.se/language/swe'] - } - - run({ cluster -> - return { - def c = loadDocs(cluster) - .findAll(qualityMonographs) - .findAll(swedish) - .findAll { d -> !d.isDrama() } - - if (c.any { it.isFiction() } && !c.any { it.isNotFiction() }) { - println(c.collect { it.doc.shortId }.join('\t')) - } - } - }) - } - - void filterClusters(Closure> predicate) { - run({ cluster -> - return { - if (predicate(loadDocs(cluster))) { - println(cluster.join('\t')) - } - } - }) - } - - void filterDocs(Closure predicate) { - run({ cluster -> - return { - def c = loadDocs(cluster).findAll(predicate) - if (c.size() > 0) { - 
println(c.collect { it.doc.shortId }.join('\t')) - } - } - }) - } - - void translationNoTranslator() { - run({ cluster -> - return { - def c = loadDocs(cluster) - - if (c) { - if (c.any { it.isTranslation() }) { - if (c.any { it.hasTranslator() }) { - c = c.findAll { !it.isTranslationWithoutTranslator() } - } else { - int pages = c.first().numPages() - if (c.any { it.numPages() != pages }) { - return // drop cluster - } - } - } - } - - if (c.size() > 0) { - println(c.collect { it.doc.shortId }.join('\t')) - } - } - }) - } - - void outputTitleClusters() { - run({ cluster -> - return { - titleClusters(cluster).findAll { it.size() > 1 }.each { - println(it.collect { it.doc.shortId }.join('\t')) - } - } - }) - } - - void add9pu() { - statistics.printOnShutdown() - run({ cluster -> - return { - statistics.increment('add 9pu', 'clusters checked') - def docs = cluster - .collect(whelk.&getDocument) - .findAll() - .collect { [doc: it, checksum: it.getChecksum(whelk.jsonld), changed: false] } - - def ill = ['@id': Relator.ILLUSTRATOR.iri] - def pu = ['@id': Relator.PRIMARY_RIGHTS_HOLDER.iri] - def path = ['@graph', 1, 'instanceOf', 'contribution'] - - docs.each { - Document d = it.doc - - statistics.increment('add 9pu', 'docs checked') - - getPathSafe(d.data, path, []).each { Map c -> - def r = asList(c.role) - - if (pu in r || !(ill in r) || c.'@type' == 'PrimaryContribution') - return - - for (Map other : docs) { - Document od = other.doc - - def found9pu = false - - getPathSafe(od.data, path, []).each { Map oc -> - if (asList(c.agent) == asList(oc.agent) && asList(oc.role).containsAll([ill, pu])) { - c.role = asList(c.role) + pu - found9pu = true - statistics.increment('add 9pu', "9pu added") - if (verbose) { - println("${d.shortId} <- ${od.shortId}") - } - return - } - } - - if (found9pu) { - println(c) - it.changed = true - break - } - } - } - } - - docs.each { - if (!dryRun && it.changed) { - Document d = it.doc - d.setGenerationDate(new Date()) - 
d.setGenerationProcess(generationProcess) - whelk.storeAtomicUpdate(d, !loud, changedIn, changedBy, it.checksum) - } - } - } - }) - } - - void fetchContributionFromRespStatement() { - def loadThingByIri = { String iri -> - // TODO: fix whelk, add load by IRI method - whelk.storage.loadDocumentByMainId(iri)?.with { doc -> - return (Map) doc.data['@graph'][1] - } - } - - def loadIfLink = { it['@id'] ? loadThingByIri(it['@id']) : it } - - statistics.printOnShutdown() - run({ cluster -> - return { - statistics.increment('fetch contribution from respStatement', 'clusters checked') - def docs = cluster - .collect(whelk.&getDocument) - .findAll() - .collect { [doc: it, checksum: it.getChecksum(whelk.jsonld), changed: false] } - - docs.each { - Document d = it.doc - def respStatement = getPathSafe(d.data, ['@graph', 1, 'responsibilityStatement']) - if (!respStatement) - return - - statistics.increment('fetch contribution from respStatement', 'docs checked') - - def contributionsInRespStmt = parseRespStatement(respStatement) - def contribution = getPathSafe(d.data, ['@graph', 1, 'instanceOf', 'contribution'], []) - - contribution.each { Map c -> - asList(c.agent).each { a -> - def matchedOnName = contributionsInRespStmt.find { n, r -> - nameMatch(n, loadIfLink(a)) - } - - if (!matchedOnName) - return - - // Contributor found locally, omit from further search - contributionsInRespStmt.remove(matchedOnName.key) - - def dontAdd = { Relator relator, boolean isFirstStmtPart -> - relator == Relator.UNSPECIFIED_CONTRIBUTOR - || isFirstStmtPart && relator == Relator.AUTHOR - && c.'@type' != 'PrimaryContribution' - } - - def rolesInRespStatement = matchedOnName.value - .findResults { dontAdd(it) ? 
null : it.getV1() } - - if (rolesInRespStatement.isEmpty()) - return - - def rolesInContribution = asList(c.role).findAll { it.'@id' != Relator.UNSPECIFIED_CONTRIBUTOR.iri } - - // Replace Adapter with Editor - it.changed |= rolesInRespStatement.removeAll { r -> - r == Relator.EDITOR && rolesInContribution.findIndexOf { - it.'@id' == Relator.ADAPTER.iri - }.with { - if (it == -1) { - return false - } else { - rolesInContribution[it]['@id'] = Relator.EDITOR.iri - return true - } - } - } - - if (rolesInRespStatement.size() <= rolesInContribution.size()) - return - - rolesInRespStatement.each { r -> - def idLink = ['@id': r.iri] - if (!(idLink in rolesInContribution)) { - rolesInContribution << idLink - it.changed = true - def roleShort = r.iri.split('/').last() - statistics.increment('fetch contribution from respStatement', "$roleShort roles specified") - if (verbose) { - println("${chipString(c, whelk)} (${d.shortId}) <- $roleShort") - } - } - } - - c.role = rolesInContribution - } - } - - def comparable = { - it*.getV1().findResults { Relator r -> - r != Relator.UNSPECIFIED_CONTRIBUTOR - ? 
['@id': r.iri] - : null - } - } - - contributionsInRespStmt.each { name, roles -> - for (Map other : docs) { - Document od = other.doc - def matched = getPathSafe(od.data, ['@graph', 1, 'instanceOf', 'contribution'], []) - .find { Map c -> - asList(c.agent).any { a -> - loadIfLink(a).with { nameMatch(name, it) && !(it.description =~ /(?i)pseud/) } - && comparable(roles).with { r -> !r.isEmpty() && asList(c.role).containsAll(r) } - && Util.bestEncodingLevel.indexOf(d.getEncodingLevel()) <= Util.bestEncodingLevel.indexOf(od.getEncodingLevel()) - } - } - if (matched) { - contribution << matched - roles.each { - def roleShort = it.getV1().iri.split('/').last() - statistics.increment('fetch contribution from respStatement', "$roleShort found in cluster") - } - if (verbose) { - println("${d.shortId} <- ${chipString(matched, whelk)} (${od.shortId})") - } - it.changed = true - break - } - } - } - } - - docs.each { - if (!dryRun && it.changed) { - Document d = it.doc - d.setGenerationDate(new Date()) - d.setGenerationProcess(generationProcess) - whelk.storeAtomicUpdate(d, !loud, changedIn, changedBy, it.checksum) - } - } - } - } - - ) - } - - void linkContribution() { - def loadThingByIri = { String iri -> - // TODO: fix whelk, add load by IRI method - whelk.storage.loadDocumentByMainId(iri)?.with { doc -> - return (Map) doc.data['@graph'][1] - } - } - - def loadIfLink = { it['@id'] ? loadThingByIri(it['@id']) : it } - - statistics.printOnShutdown() - run({ cluster -> - return { - statistics.increment('link contribution', 'clusters checked') - // TODO: check work language? 
- def docs = cluster - .collect(whelk.&getDocument) - .collect { [doc: it, checksum: it.getChecksum(whelk.jsonld), changed: false] } - - List linked = [] - docs.each { d -> - def contribution = getPathSafe(d.doc.data, ['@graph', 1, 'instanceOf', 'contribution'], []) - contribution.each { Map c -> - if (c.agent && c.agent['@id']) { - loadThingByIri(c.agent['@id'])?.with { Map agent -> - agent.roles = asList(c.role) - linked << agent - } - } - } - statistics.increment('link contribution', 'docs checked') - } - - docs.each { - Document d = it.doc - def contribution = getPathSafe(d.data, ['@graph', 1, 'instanceOf', 'contribution'], []) - contribution.each { Map c -> - if (c.agent && !c.agent['@id']) { - def l = linked.find { - agentMatches(c.agent, it) && (!c.role || it.roles.containsAll(c.role)) - } - if (l) { - println("${d.shortId} ${chipString(c, whelk)} --> ${chipString(l, whelk)}") - c.agent = ['@id': l['@id']] - it.changed = true - statistics.increment('link contribution', 'agents linked') - } else if (verbose) { - println("${d.shortId} NO MATCH: ${chipString(c, whelk)} ??? 
${linked.collect { chipString(it, whelk) }}") - } - } - } - } - - List primaryAutAgents = [] - docs.each { - def contribution = getPathSafe(it.doc.data, ['@graph', 1, 'instanceOf', 'contribution'], []) - def p = contribution.findAll() - contribution.each { - if (it['@type'] == 'PrimaryContribution' && it['role'] == ['@id': 'https://id.kb.se/relator/author'] && it['agent']) { - Map agent = loadIfLink(it['agent']) - if (agent) { - primaryAutAgents << agent - } - } - } - } - - docs.each { - Document d = it.doc - def contribution = getPathSafe(d.data, ['@graph', 1, 'instanceOf', 'contribution'], []) - contribution.each { Map c -> - if (c['@type'] == 'PrimaryContribution' && !c.role) { - if (c.agent) { - def agent = loadIfLink(c.agent) - if (primaryAutAgents.any { agentMatches(agent, it) }) { - c.role = ['@id': 'https://id.kb.se/relator/author'] - it.changed = true - statistics.increment('link contribution', 'author role added to primary contribution') - } - } - } - } - } - - docs.each { - if (!dryRun && it.changed) { - Document d = it.doc - d.setGenerationDate(new Date()) - d.setGenerationProcess(generationProcess) - whelk.storeAtomicUpdate(d, !loud, changedIn, changedBy, it.checksum) - } - } - } - }) - } - - static boolean agentMatches(Map local, Map linked) { - nameMatch(local, linked) && !yearMismatch(local, linked) - } - - static boolean nameMatch(Object local, Map agent) { - def variants = [agent] + asList(agent.hasVariant) - def name = { - Map p -> - (p.givenName && p.familyName) - ? normalize("${p.givenName} ${p.familyName}") - : p.name ? normalize("${p.name}") : null - } - - def localName = local instanceof Map ? 
name(local) : normalize(local) - - localName && variants.any { - name(it) && localName == name(it) - } - } - - static boolean yearMismatch(Map local, Map linked) { - def birth = { Map p -> p.lifeSpan?.with { (it.replaceAll(/[^\-0-9]/, '').split('-') as List)[0] } } - def death = { Map p -> p.lifeSpan?.with { (it.replaceAll(/[^\-0-9]/, '').split('-') as List)[1] } } - def b = birth(local) && birth(linked) && birth(local) != birth(linked) - def d = death(local) && death(linked) && death(local) != death(linked) - b || d - } - - private void run(Function, Runnable> f) { - ExecutorService s = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 4) - - AtomicInteger i = new AtomicInteger() - clusters.eachLine() { - List cluster = Arrays.asList(it.split(/[\t ]+/)) - - s.submit({ - try { - f.apply(cluster).run() - int n = i.incrementAndGet() - if (n % 100 == 0) { - System.err.println("$n") - } - } - catch (NoWorkException e) { - //println("No work:" + e.getMessage()) - } - catch (Exception e) { - e.printStackTrace() - } - }) - } - - s.shutdown() - s.awaitTermination(1, TimeUnit.DAYS) - } - - private Collection loadDocs(Collection cluster) { - whelk - .bulkLoad(cluster).values() - .collect { new Doc(whelk, it) } - } - - private Collection> titleClusters(Collection cluster) { - loadDocs(cluster) - .findAll(qualityMonographs) - .each { it.addComparisonProps() } - .with { partitionByTitle(it) } - .findAll { it.size() > 1 } - .findAll { !it.any { doc -> doc.hasGenericTitle() } } - .sort { a, b -> a.first().mainEntityDisplayTitle() <=> b.first().mainEntityDisplayTitle() } - } - - Collection> partitionByTitle(Collection docs) { - return partition(docs) { Doc a, Doc b -> - !a.getTitleVariants().intersect(b.getTitleVariants()).isEmpty() - } - } - -} - -class NoWorkException extends RuntimeException { - NoWorkException(String msg) { - super(msg) - } -} - - - - - - - - - diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Classification.groovy 
b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Classification.groovy deleted file mode 100644 index 2dd9a10e7e..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Classification.groovy +++ /dev/null @@ -1,63 +0,0 @@ -package whelk.WorkMerging.compare - -class Classification extends StuffSet { - @Override - Object merge(Object a, Object b) { - return mergeCompatibleElements(super.merge(a, b)) { c1, c2 -> - String code1 = c1['code'] - String code2 = c2['code'] - if (!code1 || !code2) { - return - } - code1 = code1.trim() - code2 = code2.trim() - - if (isSab(c1) && isSab(c2) && (code1.startsWith(code2) || code2.startsWith(code1))) { - def result = [ - '@type' : 'Classification', - 'code' : code1.size() > code2.size() ? code1 : code2, - inScheme: [ - '@type' : 'ConceptScheme', - 'code' : 'kssb' - ] - ] - def version = maxSabVersion(c1, c2) - if (version) { - result['inScheme']['version'] = version - } - return result - } - else if (isDewey(c1) && isDewey(c2) && code1 == code2) { - Map result = [:] - result.putAll(c1) - result.putAll(c2) - result['editionEnumeration'] = maxDeweyEdition(c1, c2) - return result - } - } - } - - boolean isSab(Map c) { - c['inScheme'] && c['inScheme']['code'] == 'kssb' - } - - String maxSabVersion(c1, c2) { - def v1 = c1['inScheme']['version'] ?: "-1" - def v2 = c2['inScheme']['version'] ?: "-1" - Integer.parseInt(v1) > Integer.parseInt(v2) ? v1 : v2 - } - - boolean isDewey(Map c) { - c['@type'] == 'ClassificationDdc' - } - - String maxDeweyEdition(c1, c2) { - def v1 = c1['editionEnumeration'] - def v2 = c2['editionEnumeration'] - deweyEdition(v1) > deweyEdition(v2) ? 
v1 : v2 - } - - int deweyEdition(String edition) { - Integer.parseInt((edition ?: "0").replaceAll("[^0-9]", "")) - } -} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Default.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Default.groovy deleted file mode 100644 index 07e0635234..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Default.groovy +++ /dev/null @@ -1,13 +0,0 @@ -package whelk.WorkMerging.compare - -class Default implements FieldHandler { - @Override - boolean isCompatible(Object a, Object b) { - return false - } - - @Override - Object merge(Object a, Object b) { - return a - } -} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Extent.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Extent.groovy deleted file mode 100644 index 078a3fee78..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Extent.groovy +++ /dev/null @@ -1,15 +0,0 @@ -package whelk.WorkMerging.compare; - -class Extent implements FieldHandler { - - // TODO: allow one side missing extent (-1)? 
- @Override - boolean isCompatible(Object a, Object b) { - return true // a * 0.7 < b && a * 1.3 > b - } - - @Override - Object merge(Object a, Object b) { - return b; // not part of final work - } -} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/FieldHandler.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/FieldHandler.groovy deleted file mode 100644 index 22a95fd2a9..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/FieldHandler.groovy +++ /dev/null @@ -1,12 +0,0 @@ -package whelk.WorkMerging.compare - -import whelk.WorkMerging.Doc - -interface FieldHandler { - boolean isCompatible(Object a, Object b) - Object merge(Object a, Object b) -} - -interface ValuePicker extends FieldHandler { - Object pick(Collection values) -} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/GenreForm.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/GenreForm.groovy deleted file mode 100644 index 690e9353ff..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/GenreForm.groovy +++ /dev/null @@ -1,36 +0,0 @@ -package whelk.WorkMerging.compare - -import whelk.WorkMerging.DocumentComparator - -//FIXME -class GenreForm extends StuffSet { - private static final DocumentComparator c = new DocumentComparator() - - // Terms that will be merged (values precede keys) - private static def norm = [ - (['@id': 'https://id.kb.se/marc/NotFictionNotFurtherSpecified']): [ - ['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified'], - ['@id': 'https://id.kb.se/marc/Autobiography'], - ['@id': 'https://id.kb.se/marc/Biography'] - ], - (['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified']) : [ - ['@id': 'https://id.kb.se/marc/Poetry'], - ['@id': 'https://id.kb.se/marc/Novel'] - ], - ] - - @Override - Object merge(Object a, Object b) { - return mergeCompatibleElements(super.merge(a, b).findAll { it.'@id' }) { gf1, gf2 -> - if (n(gf1, gf2)) { - gf2 - } else if (n(gf2, gf1)) 
{ - gf1 - } - } - } - - boolean n(a, b) { - norm[a]?.any { it == b || n(it, b) } - } -} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/SameOrEmpty.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/SameOrEmpty.groovy deleted file mode 100644 index f36f580773..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/SameOrEmpty.groovy +++ /dev/null @@ -1,21 +0,0 @@ -package whelk.WorkMerging.compare - -import static whelk.WorkMerging.Util.asList - -class SameOrEmpty implements FieldHandler { - Object link - - SameOrEmpty(String iri) { - this.link = [['@id': iri]] - } - - @Override - boolean isCompatible(Object a, Object b) { - (!a && asList(b) == link) || (!b && asList(a) == link) - } - - @Override - Object merge(Object a, Object b) { - return a ?: b - } -} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/StuffSet.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/StuffSet.groovy deleted file mode 100644 index 92262086f7..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/StuffSet.groovy +++ /dev/null @@ -1,38 +0,0 @@ -package whelk.WorkMerging.compare - - -import java.util.function.BiFunction - -import static whelk.WorkMerging.Util.asList - -class StuffSet implements FieldHandler { - @Override - boolean isCompatible(Object a, Object b) { - true - } - - @Override - Object merge(Object a, Object b) { - return ((asList(a) as Set) + (asList(b) as Set)).collect() - } - - static Object mergeCompatibleElements(Object o, BiFunction s) { - boolean changed = false - List result = [] - asList(o).each { - def merged = null - for (int i = 0 ; i < result.size() ; i++) { - merged = s.apply(result[i], it) - if (merged) { - result[i] = merged - changed = true - break - } - } - if (merged == null) { - result << it - } - } - return changed ? 
mergeCompatibleElements(result, s) : result - } -} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Subject.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Subject.groovy deleted file mode 100644 index e69fb633e7..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/Subject.groovy +++ /dev/null @@ -1,8 +0,0 @@ -package whelk.WorkMerging.compare - -class Subject extends StuffSet { - @Override - Object merge(Object a, Object b) { - return super.merge(a, b).findAll { it.'@id' || it.'@type' == 'ComplexSubject' } - } -} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/TranslationOf.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/TranslationOf.groovy deleted file mode 100644 index 73836e6fee..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/TranslationOf.groovy +++ /dev/null @@ -1,22 +0,0 @@ -package whelk.WorkMerging.compare - -import whelk.WorkMerging.DocumentComparator - -class TranslationOf implements FieldHandler { - DocumentComparator c = new DocumentComparator() - - @Override - boolean isCompatible(Object a, Object b) { - // @type is sometimes Work, sometimes Text. Should not matter for comparison - (!a && !b) || a && b && a instanceof Map && b instanceof Map && c.isEqual(noType(a), noType(b)) - } - - @Override - Object merge(Object a, Object b) { - return a // TODO: prefer one @type over another? 
- } - - Map noType(Map m) { - m.findAll { k, v -> k != '@type' } - } -} diff --git a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/WorkTitle.groovy b/whelk-core/src/main/groovy/whelk/WorkMerging/compare/WorkTitle.groovy deleted file mode 100644 index 4c948af25a..0000000000 --- a/whelk-core/src/main/groovy/whelk/WorkMerging/compare/WorkTitle.groovy +++ /dev/null @@ -1,23 +0,0 @@ -package whelk.WorkMerging.compare - -import whelk.WorkMerging.Doc -import whelk.WorkMerging.Util -import org.apache.commons.lang3.NotImplementedException - -class WorkTitle implements ValuePicker { - - @Override - boolean isCompatible(Object a, Object b) { - return !a || !b || !Util.getTitleVariants(a).intersect(Util.getTitleVariants(b)).isEmpty() - } - - @Override - Object merge(Object a, Object b) { - throw new NotImplementedException('') - } - - @Override - Object pick(Collection values) { - return Util.bestTitle(values) - } -}