diff --git a/librisxl-tools/scripts/merge-works.sh b/librisxl-tools/scripts/merge-works.sh
new file mode 100755
index 0000000000..299dcbd752
--- /dev/null
+++ b/librisxl-tools/scripts/merge-works.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+
+# Run from whelktool dir
+
+# Print the number of lines in the file $1, or 0 if $1 is not a regular file.
+count_lines() {
+  [ -f "$1" ] || { echo 0; return; }
+  # Feed the file on stdin so wc prints only the count (no file name), and
+  # strip the left padding that BSD/macOS wc adds to its output.
+  wc -l <"$1" | tr -d ' '
+}
+
+if ! [[ "$1" =~ ^(local|dev|dev2|qa|stg|prod)$ ]]; then
+  echo "Missing or invalid environment" >&2
+  exit 1
+fi
+
+ENV=$1
+ARGS=("${@:2}") # kept as an array so forwarded arguments survive word splitting
+NUM_CLUSTERS=0
+
+REPORT_DIR="reports/merge-works/$ENV-$(date +%Y%m%d)"
+
+mkdir -p "$REPORT_DIR"/{clusters,normalizations,merged-works}
+
+CLUSTERS_DIR=$REPORT_DIR/clusters
+NORMALIZATIONS_DIR=$REPORT_DIR/normalizations
+
+FIND_CLUSTERS=$CLUSTERS_DIR/find-clusters
+ALL_CLUSTERS=$CLUSTERS_DIR/1-all.tsv
+MERGED_CLUSTERS=$CLUSTERS_DIR/2-merged.tsv
+TITLE_CLUSTERS=$CLUSTERS_DIR/3-title-clusters.tsv
+SWEDISH_FICTION=$CLUSTERS_DIR/4-swedish-fiction.tsv
+NO_ANONYMOUS_TRANSLATIONS=$CLUSTERS_DIR/5-no-anonymous-translations.tsv
+
+LANGUAGE_IN_TITLE=$NORMALIZATIONS_DIR/1-titles-with-language
+ELIB_DESIGNERS=$NORMALIZATIONS_DIR/2-elib-cover-designer
+CONTRIBUTION=$NORMALIZATIONS_DIR/3-contribution
+ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/4-roles-to-instance
+# NOTE: stderr of the whelktool runs is discarded (2>/dev/null), so a failed run only shows up as an empty result.
+# Clustering step 1 TODO: run only on recently updated records after first run
+echo "Finding new clusters..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" -jar build/libs/whelktool.jar \
+  "${ARGS[@]}" --report "$FIND_CLUSTERS" scripts/analysis/find-work-clusters.groovy >"$ALL_CLUSTERS" 2>/dev/null
+NUM_CLUSTERS=$(count_lines "$ALL_CLUSTERS")
+echo "$NUM_CLUSTERS clusters found"
+if [ "$NUM_CLUSTERS" -eq 0 ]; then
+  exit 0
+fi
+
+# Clustering step 2
+echo
+echo "Merging clusters..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" "-Dclusters=$ALL_CLUSTERS" -jar build/libs/whelktool.jar \
+  "${ARGS[@]}" scripts/analysis/merge-clusters.groovy >"$MERGED_CLUSTERS" 2>/dev/null
+NUM_CLUSTERS=$(count_lines "$MERGED_CLUSTERS")
+echo "Merged into $NUM_CLUSTERS clusters"
+if [ "$NUM_CLUSTERS" -eq 0 ]; then
+  exit 0
+fi
+
+# Clustering step 3
+echo
+echo "Finding title clusters..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" -cp build/libs/whelktool.jar datatool.WorkTool \
+  "${ARGS[@]}" -tc "$MERGED_CLUSTERS" >"$TITLE_CLUSTERS"
+NUM_CLUSTERS=$(count_lines "$TITLE_CLUSTERS")
+echo "$NUM_CLUSTERS title clusters found"
+if [ "$NUM_CLUSTERS" -eq 0 ]; then
+  exit 0
+fi
+
+# Filter: Swedish fiction
+echo
+echo "Filtering on Swedish fiction..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" -cp build/libs/whelktool.jar datatool.WorkTool \
+  "${ARGS[@]}" -f "$TITLE_CLUSTERS" >"$SWEDISH_FICTION"
+NUM_CLUSTERS=$(count_lines "$SWEDISH_FICTION")
+echo "Found $NUM_CLUSTERS title clusters with Swedish fiction"
+if [ "$NUM_CLUSTERS" -eq 0 ]; then
+  exit 0
+fi
+
+# Normalization
+echo
+echo "Removing language from work titles..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" "-Dclusters=$SWEDISH_FICTION" -jar build/libs/whelktool.jar \
+  "${ARGS[@]}" --report "$LANGUAGE_IN_TITLE" src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy 2>/dev/null
+echo "$(count_lines "$LANGUAGE_IN_TITLE/MODIFIED.txt") records affected, report in $LANGUAGE_IN_TITLE"
+
+echo
+echo "Specifying designer roles in Elib records..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" -jar build/libs/whelktool.jar \
+  "${ARGS[@]}" --report "$ELIB_DESIGNERS" scripts/cleanups/2023/05/lxl-4183-elib-cover-designer.groovy 2>/dev/null
+echo "$(count_lines "$ELIB_DESIGNERS/MODIFIED.txt") records affected, report in $ELIB_DESIGNERS"
+
+echo
+echo "Normalizing contribution..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" "-Dclusters=$SWEDISH_FICTION" -jar build/libs/whelktool.jar \
+  "${ARGS[@]}" --report "$CONTRIBUTION" src/main/groovy/datatool/scripts/mergeworks/normalize/contribution.groovy 2>/dev/null
+echo "$(count_lines "$CONTRIBUTION/MODIFIED.txt") records affected, report in $CONTRIBUTION"
+
+echo
+echo "Moving roles to instance..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" "-Dclusters=$SWEDISH_FICTION" -jar build/libs/whelktool.jar \
+  "${ARGS[@]}" --report "$ROLES_TO_INSTANCE" src/main/groovy/datatool/scripts/mergeworks/normalize/contributions-to-instance.groovy 2>/dev/null
+echo "$(count_lines "$ROLES_TO_INSTANCE/MODIFIED.txt") records affected, report in $ROLES_TO_INSTANCE"
+
+# Filter: Drop anonymous translations
+echo "Filtering out anonymous translations..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" -cp build/libs/whelktool.jar datatool.WorkTool \
+  "${ARGS[@]}" -tr "$SWEDISH_FICTION" >"$NO_ANONYMOUS_TRANSLATIONS"
+NUM_CLUSTERS=$(count_lines "$NO_ANONYMOUS_TRANSLATIONS")
+echo "$NUM_CLUSTERS clusters ready for merge"
+if [ "$NUM_CLUSTERS" -eq 0 ]; then
+  exit 0
+fi
+
+# Merge
+echo
+echo "Merging..."
+time java "-Dxl.secret.properties=$HOME/secret.properties-$ENV" -cp build/libs/whelktool.jar datatool.WorkTool \
+  "${ARGS[@]}" -r "$REPORT_DIR/merged-works" -m "$NO_ANONYMOUS_TRANSLATIONS"
\ No newline at end of file
diff --git a/whelk-core/src/main/groovy/se/kb/libris/Normalizers.groovy b/whelk-core/src/main/groovy/se/kb/libris/Normalizers.groovy
index 3261567c58..7ccdbf20d6 100644
--- a/whelk-core/src/main/groovy/se/kb/libris/Normalizers.groovy
+++ b/whelk-core/src/main/groovy/se/kb/libris/Normalizers.groovy
@@ -15,6 +15,7 @@ import whelk.util.Romanizer
 import static whelk.JsonLd.GRAPH_KEY
 import static whelk.JsonLd.ID_KEY
 import static whelk.JsonLd.TYPE_KEY
+import static whelk.JsonLd.WORK_KEY
 import static whelk.JsonLd.asList
 import static whelk.util.DocumentUtil.traverse
 
@@ -192,12 +193,19 @@ class Normalizers {
         }
     }
 
-    static Map getWork(JsonLd jsonLd, Document doc) {
-        def (_record, thing) = doc.data['@graph']
-        if (thing && isInstanceOf(jsonLd, thing, 'Work')) {
+    /** Returns doc's main entity if it is a Work, otherwise the work under WORK_KEY (loaded by IRI when linked), or null. */
+    static Map getWork(Whelk whelk, Document doc) {
+        def (_record, thing) = doc.data[GRAPH_KEY]
+        if (thing && isInstanceOf(whelk.jsonld, thing, 'Work')) {
             return thing
-        } else if (thing && thing['instanceOf'] && isInstanceOf(jsonLd, thing['instanceOf'], 'Work')) {
-            return thing['instanceOf']
+        } else if (thing && thing[WORK_KEY]) {
+            if (thing[WORK_KEY][ID_KEY]) {
+                Document linkedDoc = whelk.storage.getDocumentByIri(thing[WORK_KEY][ID_KEY]) // NOTE(review): presumably null for a dangling link — confirm
+                return linkedDoc ? getWork(whelk, linkedDoc) : null
+            }
+            if (isInstanceOf(whelk.jsonld, thing[WORK_KEY], 'Work')) {
+                return thing[WORK_KEY]
+            }
         }
         return null
     }
diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy
index a604efecc5..bdf4e46b4c 100644
--- a/whelk-core/src/main/groovy/whelk/Whelk.groovy
+++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy
@@ -62,7 +62,7 @@ class Whelk {
     URI baseUri = null
     boolean skipIndex = false
     boolean skipIndexDependers = false
-    
+
     // useCache may be set to true only when doing initial imports (temporary processes with the rest of Libris down).
     // Any other use of this results in a "local" cache, which will not be invalidated when data changes elsewhere,
     // resulting in potential serving of stale data.
@@ -263,31 +263,29 @@ class Whelk {
                 def systemId = Document.BASE_URI.resolve(id).getPath().substring(1)
                 idMap[systemId] = id
                 systemIds << systemId
-            }
-            else if (JsonLd.looksLikeIri(id)) {
+            } else if (JsonLd.looksLikeIri(id)) {
                 otherIris << id
-            }
-            else {
+            } else {
                 systemIds << id
             }
         }
         if (otherIris) {
             Map<String, String> idToIri = storage.getSystemIdsByIris(otherIris)
                     .collectEntries { k, v -> [(v): k] }
-            
+
             systemIds.addAll(idToIri.keySet())
             idMap.putAll(idToIri)
         }
-        
+
         return storage.bulkLoad(systemIds)
                 .findAll { id, doc -> !doc.deleted }
-                .collectEntries { id, doc -> [(idMap.getOrDefault(id, id)) : doc]}
+                .collectEntries { id, doc -> [(idMap.getOrDefault(id, id)): doc] }
     }
-    
+
     private void reindexUpdated(Document updated, Document preUpdateDoc) {
         indexAsyncOrSync {
             elastic.index(updated, this)
-            
+
             if (!skipIndexDependers) {
                 if (hasChangedMainEntityId(updated, preUpdateDoc)) {
                     reindexAllLinks(updated.shortId)
@@ -297,17 +295,17 @@ class Whelk {
             }
         }
     }
-    
+
     private void indexAsyncOrSync(Runnable runnable) {
         if (skipIndex) {
             return
         }
-        
-        if(!elastic) {
+
+        if (!elastic) {
             log.warn("Elasticsearch not configured when trying to reindex")
             return
         }
-        
+
         Runnable reindex = {
             try {
                 runnable.run()
@@ -316,7 +314,7 @@ class Whelk {
                 log.error("Error reindexing: $e", e)
             }
         }
-        
+
         if (isBatchJobThread()) {
             // Update them synchronously
             reindex.run()
@@ -337,30 +335,29 @@ class Whelk {
         Set<Link> removedLinks = (preUpdateLinks - postUpdateLinks)
 
         removedLinks.findResults { storage.getSystemIdByIri(it.iri) }
-                .each{id -> elastic.decrementReverseLinks(id) }
+                .each { id -> elastic.decrementReverseLinks(id) }
 
         addedLinks.each { link ->
             String id = storage.getSystemIdByIri(link.iri)
             if (id) {
                 Document doc = storage.load(id)
                 def lenses = ['chips', 'cards', 'full']
-                def reverseRelations = lenses.collect{ jsonld.getInverseProperties(doc.data, it) }.flatten()
+                def reverseRelations = lenses.collect { jsonld.getInverseProperties(doc.data, it) }.flatten()
                 if (reverseRelations.contains(link.relation)) {
                     // we added a link to a document that includes us in its @reverse relations, reindex it
                     elastic.index(doc, this)
-                }
-                else {
+                } else {
                     // just update link counter
                     elastic.incrementReverseLinks(id)
                 }
             }
         }
-        
+
         if (storage.isCardChangedOrNonexistent(document.getShortId())) {
             bulkIndex(elastic.getAffectedIds(document.getThingIdentifiers() + document.getRecordIdentifiers()))
         }
     }
-    
+
     private void bulkIndex(Iterable<String> ids) {
         Iterables.partition(ids, 100).each {
             elastic.bulkIndexWithRetry(it, this)
@@ -377,12 +374,12 @@ class Whelk {
 
         // Identifiers-table lookup on:
         List<String> uriIDs = document.getRecordIdentifiers()
-        uriIDs.addAll( document.getThingIdentifiers() )
+        uriIDs.addAll(document.getThingIdentifiers())
         for (String uriID : uriIDs) {
             String systemId = storage.getSystemIdByIri(uriID)
             if (systemId != null && systemId != document.getShortId()) {
                 log.info("Determined that " + document.getShortId() + " is duplicate of " + systemId + " due to collision on URI: " + uriID)
-                collidingSystemIDs.add( new Tuple2(systemId, "on URI: " + uriID) )
+                collidingSystemIDs.add(new Tuple2(systemId, "on URI: " + uriID))
             }
         }
 
@@ -403,7 +400,7 @@ class Whelk {
                 if (includingTypedIDs) {
                     for (String collision : collisions) {
                         if (collision != document.getShortId())
-                        collidingSystemIDs.add( new Tuple2(collision, "on typed id: " + type + "," + graphIndex + "," + value) )
+                            collidingSystemIDs.add(new Tuple2(collision, "on typed id: " + type + "," + graphIndex + "," + value))
                     }
                 } else {
 
@@ -423,7 +420,7 @@ class Whelk {
      */
     boolean createDocument(Document document, String changedIn, String changedBy, String collection, boolean deleted) {
         normalize(document)
-        
+
         boolean detectCollisionsOnTypedIDs = false
         List<Tuple2<String, String>> collidingIDs = getIdCollisions(document, detectCollisionsOnTypedIDs)
         if (!collidingIDs.isEmpty()) {
@@ -465,7 +462,7 @@ class Whelk {
         if (updated == null || preUpdateDoc == null) {
             return false
         }
-   
+
         reindexUpdated(updated, preUpdateDoc)
         sparqlUpdater?.pollNow()
 
@@ -480,7 +477,7 @@ class Whelk {
         if (updated == null) {
             return
         }
-        
+
         reindexUpdated(updated, preUpdateDoc)
         sparqlUpdater?.pollNow()
     }
@@ -493,15 +490,22 @@ class Whelk {
     boolean quickCreateDocument(Document document, String changedIn, String changedBy, String collection) {
         return storage.quickCreateDocument(document, changedIn, changedBy, collection)
     }
-  
-    void remove(String id, String changedIn, String changedBy, boolean force=false) {
+
+    /** Removes the document with the given id from storage and (via indexAsyncOrSync) from elastic; a document that cannot be loaded is left alone. */
+    void remove(String id, String changedIn, String changedBy, boolean force = false) {
         log.debug "Deleting ${id} from Whelk"
-        Document doc = storage.load(id)
-        storage.remove(id, changedIn, changedBy, force)
-        indexAsyncOrSync {
-            elastic.remove(id)
-            if (!skipIndexDependers) {
-                reindexAffected(doc, doc.getExternalRefs(), Collections.emptySet())
+        Document doc = null
+        try {
+            doc = storage.load(id)
+        } catch (Exception e) {
+            log.warn("Could not remove object from whelk. Failed to load entry with id $id: $e", e)
+        }
+        if (!doc) return
+        storage.remove(id, changedIn, changedBy, force)
+        indexAsyncOrSync {
+            elastic.remove(id)
+            if (!skipIndexDependers) {
+                reindexAffected(doc, doc.getExternalRefs(), Collections.emptySet())
             }
         }
     }
@@ -513,13 +517,12 @@ class Whelk {
     }
 
     void embellish(Document document, List<String> levels = null) {
-        def docsByIris = { List<String> iris -> bulkLoad(iris).values().collect{ it.data } }
+        def docsByIris = { List<String> iris -> bulkLoad(iris).values().collect { it.data } }
         Embellisher e = new Embellisher(jsonld, docsByIris, storage.&getCards, relations.&getByReverse)
 
         if (levels) {
             e.setEmbellishLevels(levels)
-        }
-        else if (document.getThingType() == 'Item') {
+        } else if (document.getThingType() == 'Item') {
             e.setEmbellishLevels(['cards'])
             e.setFollowInverse(false)
         }
@@ -545,7 +548,7 @@ class Whelk {
                 }
             }
         }
-        
+
         return result
     }
 
diff --git a/whelk-core/src/main/groovy/whelk/util/Statistics.groovy b/whelk-core/src/main/groovy/whelk/util/Statistics.groovy
index e99b258421..411f821015 100644
--- a/whelk-core/src/main/groovy/whelk/util/Statistics.groovy
+++ b/whelk-core/src/main/groovy/whelk/util/Statistics.groovy
@@ -13,7 +13,7 @@ class Statistics {
     ThreadLocal<Stack<Object>> context = ThreadLocal.withInitial({ -> null })
 
     int numExamples
-
+    
     Statistics(int numExamples = 1) {
         this.numExamples = numExamples
     }
diff --git a/whelk-core/src/main/groovy/whelk/util/Unicode.groovy b/whelk-core/src/main/groovy/whelk/util/Unicode.groovy
index c5620c1f5f..ad97201b19 100644
--- a/whelk-core/src/main/groovy/whelk/util/Unicode.groovy
+++ b/whelk-core/src/main/groovy/whelk/util/Unicode.groovy
@@ -50,6 +50,8 @@ class Unicode {
             [(it): Normalizer.normalize(it, Normalizer.Form.NFKC)]
         } + STRIP_UNICODE_CHARS.collectEntries { [(it): ''] }
     }
+
+    private static final Pattern UNICODE_MARK = Pattern.compile('\\p{M}')
     
     static boolean isNormalized(String s) {
         return Normalizer.isNormalized(s, Normalizer.Form.NFC) && !EXTRA_NORMALIZATION_MAP.keySet().any{ s.contains(it) }
@@ -90,11 +92,11 @@ class Unicode {
         def m = s =~ /[^${w}]*(.*)/
         return m.matches() ? m.group(1) : s
     }
-    
+
     static String trim(String s) {
         s.replaceFirst(LEADING_SPACE, '').replaceFirst(TRAILING_SPACE, '')
     }
-    
+
     static Optional<Character.UnicodeScript> guessScript(String s) {
         s = s.replaceAll(~/\p{IsCommon}|\p{IsInherited}|\p{IsUnknown}/, '')
 
@@ -178,4 +180,8 @@ class Unicode {
                 'Vaii',
         ].each { add15924scriptCode(it) }
     }
+    /** Decomposes s to NFD and deletes every combining mark (\p{M}), so that e.g. 'é' becomes 'e'. */
+    static String asciiFold(String s) {
+        return UNICODE_MARK.matcher(Normalizer.normalize(s, Normalizer.Form.NFD)).replaceAll('')
+    }
 }
diff --git a/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy b/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy
index 4e6a59be6e..8ff47084b9 100644
--- a/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy
+++ b/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy
@@ -40,7 +40,7 @@ class UnicodeSpec extends Specification {
         ' _.:;|(Überzetsung)|;:. '               | '(Überzetsung)'
         ' _.:;| Ü b e r - z e t - s u n g |;:. ' | 'Ü b e r - z e t - s u n g'
     }
-
+    
     def "trim"() {
         expect:
         Unicode.trim(dirty) == clean
@@ -58,7 +58,7 @@ class UnicodeSpec extends Specification {
         '\r\nkeep leading line breaks'                              | '\r\nkeep leading line breaks'
         
     }
-    
+
     def "double quotation marks"() {
         expect:
         Unicode.isNormalizedDoubleQuotes(dirty) == (dirty == clean)
@@ -99,5 +99,23 @@ class UnicodeSpec extends Specification {
         Optional.of('Armn')                           | 'Պիպին նավի վրա'
         Optional.of('Kana')                           | 'デスノート'
         Optional.of('Hira')                           | 'とんとんとんと'
+    }
+    def "u"() {
+        given:
+        String s = "übers"   //uU+CC88
+        String nfc = "übers" //U+C3BC
+        expect:
+        Unicode.isNormalized(s) == false
+        Unicode.normalize(s) == nfc
+    }
+
+    def "asciiFold"() {
+        expect:
+        Unicode.asciiFold(unicode) == ascii
+
+        where:
+        unicode          | ascii
+        'Désidéria'      | 'Desideria'
+        'Антон Павлович' | 'Антон Павлович'
     }
 }
\ No newline at end of file
diff --git a/whelktool/build.gradle b/whelktool/build.gradle
index 2847670c82..c68523f273 100644
--- a/whelktool/build.gradle
+++ b/whelktool/build.gradle
@@ -53,6 +53,7 @@ dependencies {
     implementation "org.codehaus.groovy:groovy-jsr223:${groovyVersion}"
     implementation "org.codehaus.groovy:groovy:${groovyVersion}"
     implementation 'org.codehaus.jackson:jackson-mapper-asl:1.9.12'
+    implementation 'commons-codec:commons-codec:1.7'
     implementation group: 'xml-apis', name: 'xml-apis', version: '1.4.01'
 }
 
diff --git a/whelktool/scripts/analysis/bib-249.groovy b/whelktool/scripts/analysis/bib-249.groovy
new file mode 100644
index 0000000000..628d5acfcf
--- /dev/null
+++ b/whelktool/scripts/analysis/bib-249.groovy
@@ -0,0 +1,81 @@
+import whelk.util.DocumentUtil
+import whelk.util.Statistics
+
+s = new Statistics().printOnShutdown()
+
+selectByCollection('bib') { bib ->
+    try {
+        process(bib)
+    }
+    catch(Exception e) {
+        System.err.println(e)
+        e.printStackTrace()
+    }
+
+}
+
+void process(bib) {
+    def (record, thing, work) = bib.graph
+
+    if(!work) {
+        return
+    }
+
+    if(thing['marc:hasBib249']) {
+        boolean marcTrl = work['marc:languageNote'] == "marc:ItemIsOrIncludesATranslation"
+
+        String hasTitle = hasTitle(thing, work)
+
+        if (hasTitle == "diff") {
+            println ("""
+            ${bib.doc.getURI()}
+            ${thing['marc:hasBib249']}
+            marc:ItemIsOrIncludesATranslation ${marcTrl}
+            ${work.hasTitle}
+            """.stripIndent())
+        }
+
+        s.increment('hasTitle', hasTitle)
+        s.increment('shape', maybeList(thing['marc:hasBib249']) { map -> new TreeSet(map.keySet()) })
+        s.increment('marc:ItemIsOrIncludesATranslation', "${marcTrl}")
+    }
+}
+
+// "no" when the work has no hasTitle, otherwise "match" or "diff" as decided by isSameTitle.
+String hasTitle(thing, work) {
+    if (!work.hasTitle) {
+        return "no"
+    }
+    boolean sameTitle = isSameTitle(thing, work)
+    return sameTitle ? "match" : "diff"
+}
+
+boolean isSameTitle(def thing, def work) {
+    String t = getPathSafe(thing, ['marc:hasBib249', 'marc:originalTitle'], "TT")
+    String w = getPathSafe(work, ['hasTitle', 0, 'mainTitle'], "WT")
+    trim(w.toLowerCase()) == trim(t.toLowerCase())
+}
+
+// Applies c to o, or to each element of o when o is a List (returning the collected results).
+Object maybeList(Object o, Closure c) {
+    if (o instanceof List) { return o.collect(c) }
+    return c(o)
+}
+
+// Follows the keys/indices in path from item; returns defaultTo as soon as a step is null.
+private Object getPathSafe(item, path, defaultTo = null) {
+    for (step in path) {
+        item = item[step]
+        if (item == null) {
+            return defaultTo
+        }
+    }
+    return item
+}
+
+String trim(String s) {
+    // remove leading and trailing non-"alpha, digit or parentheses"
+    def w = /\(\)\p{IsAlphabetic}\p{Digit}/
+    def m = s =~ /[^${w}]*([${w}- ]*[${w}])[^${w}]*/
+    return m.matches() ? m.group(1) : s
+}
\ No newline at end of file
diff --git a/whelktool/scripts/analysis/bib-976.groovy b/whelktool/scripts/analysis/bib-976.groovy
new file mode 100644
index 0000000000..5e26dfca23
--- /dev/null
+++ b/whelktool/scripts/analysis/bib-976.groovy
@@ -0,0 +1,114 @@
+import whelk.util.DocumentUtil
+import whelk.util.Statistics
+
+class Script {
+    static PrintWriter notIn084
+    static PrintWriter in084
+    static PrintWriter noCode
+    static PrintWriter report
+    static PrintWriter errors
+}
+Script.notIn084 = getReportWriter("not-in-084.txt")
+Script.in084 = getReportWriter("in-084.txt")
+Script.noCode = getReportWriter("no-code.txt")
+Script.report = getReportWriter("report.txt")
+Script.errors = getReportWriter("errors.txt")
+
+s = new Statistics().printOnShutdown()
+
+selectByCollection('bib') { bib ->
+    try {
+        process(bib)
+    }
+    catch(Exception e) {
+        Script.errors.println("${bib.doc.shortId} $e")
+        e.printStackTrace(Script.errors)
+    }
+
+}
+
+void process(bib) {
+    def work = bib.graph[1]['instanceOf']
+
+    if(!work) {
+        return
+    }
+
+    def bib976 = asList(work['marc:hasBib976'])
+    if(!bib976) {
+        return
+    }
+
+    def (code, noCode) = bib976.split { it['marc:bib976-a'] }
+    def bib81 = sab(work)
+
+    handleWithSabCode(bib, work, bib81, code)
+    handleWithoutSabCode(bib, work, bib81, noCode)
+}
+
+// Reports, once per record, which coded 976 entries have a code that some entry of bib084 starts with.
+void handleWithSabCode(bib, work, bib084, bib976) {
+    if (bib976) { // evaluate once: the partition below already covers every entry
+        def (in084, notIn084) = bib976.split { x ->
+            def code = x['marc:bib976-a']
+            bib084.any { it.startsWith((code)) }
+        }
+
+        in084.each {
+            s.increment('bib976-a', 'in classification')
+        }
+
+        notIn084.each {
+            s.increment('bib976-a', 'not in classification')
+            s.increment('bib976-a not in classification', it)
+        }
+
+        if (notIn084) {
+            Script.notIn084.println("""
+                ${bib.doc.getURI()}
+                bib-976: ${notIn084.collect{ "${it['marc:bib976-a']} (${it['marc:bib976-b']})" }}
+                classification/kssb: $bib084
+            """.stripIndent())
+        }
+
+        if (in084) {
+            Script.in084.println("""
+                ${bib.doc.getURI()}
+                bib-976: ${in084.collect{ "${it['marc:bib976-a']} (${it['marc:bib976-b']})" }}
+                classification/kssb: $bib084
+            """.stripIndent())
+        }
+
+        Script.report.println("${bib.doc.shortId} ${handled(in084, notIn084)}")
+    }
+}
+
+// Coverage summary: "ingen" (none) if only unmatched entries, "alla" (all) if only matched, else "delvis" (partly).
+String handled(in084, notIn084) {
+    boolean anyMatched = in084 as boolean
+    boolean anyUnmatched = notIn084 as boolean
+    if (anyMatched != anyUnmatched) {
+        return anyMatched ? "alla" : "ingen"
+    }
+    return "delvis"
+}
+
+void handleWithoutSabCode(bib, work, bib084, bib976) {
+    if (bib976) {
+        def creator = bib.graph[0]['descriptionCreator']['@id']
+        s.increment('bib976 without code', creator)
+
+        bib976.each {
+            def label = it['marc:bib976-b']
+            Script.noCode.println("${bib.doc.getURI()} $creator $label")
+        }
+    }
+}
+// Codes of the work's classifications. NOTE(review): '==' binds tighter than '?:', so this keeps every entry with a truthy inScheme instead of comparing it to 'kssb' — confirm intent
+List sab(work) {
+    asList(work['classification']).findAll{ it['inScheme'] ?: '' == 'kssb' }.collect{ it['code'] }
+}
+
+def asList(x) { // a list is returned as is, any other truthy value is wrapped, falsy values become []
+    x ? (x instanceof List ? x : [x]) : []
+}
\ No newline at end of file
diff --git a/whelktool/scripts/analysis/broader-gf.groovy b/whelktool/scripts/analysis/broader-gf.groovy
new file mode 100644
index 0000000000..a0e526466c
--- /dev/null
+++ b/whelktool/scripts/analysis/broader-gf.groovy
@@ -0,0 +1,45 @@
+import whelk.util.DocumentUtil
+import whelk.util.Statistics
+
+s = new Statistics(5).printOnShutdown()
+
+// Runs check() on every ordered pair of distinct genreForm ids found on each bib record's work.
+selectByCollection('bib') { bib ->
+    def work = getWork(bib)
+
+    if (!work || !work['genreForm']) {
+        return
+    }
+
+    List<String> ids = work['genreForm']['@id']
+    if (ids.size() > 1) {
+        // combinations() already yields every ordered pair, i.e. both (a, b) and (b, a),
+        // so one directed check per pair covers both directions without double counting.
+        [ids, ids].combinations { a, b ->
+            if (a != b) {
+                check(bib.whelk, a, b)
+            }
+        }
+    }
+}
+
+// Counts b under a, and a under '#broader', when whelk.relations.isImpliedBy(a, b) holds.
+void check(whelk, String a, String b) {
+    if (!whelk.relations.isImpliedBy(a, b)) return
+    s.increment(a, b)
+    s.increment('#broader', a)
+}
+
+// Returns the first of these that is a Work (per isInstanceOf), or null when none is:
+// the second @graph item, its 'instanceOf' value, the third @graph item.
+Map getWork(def bib) {
+    def (record, thing, work) = bib.graph
+    def candidates = [
+            thing,
+            thing ? thing['instanceOf'] : null,
+            work,
+    ]
+    return candidates.find { candidate ->
+        candidate && isInstanceOf(candidate, 'Work')
+    }
+}
\ No newline at end of file
diff --git a/whelktool/scripts/analysis/find-work-clusters.groovy b/whelktool/scripts/analysis/find-work-clusters.groovy
new file mode 100644
index 0000000000..bd9c61d24d
--- /dev/null
+++ b/whelktool/scripts/analysis/find-work-clusters.groovy
@@ -0,0 +1,169 @@
+/**
+ * (When running, redirect STDERR to avoid annoying prints from whelktool)
+ */
+
+
+import java.text.SimpleDateFormat
+import java.util.concurrent.ConcurrentHashMap
+import se.kb.libris.Normalizers
+
+PrintWriter failedQueries = getReportWriter("failed-queries")
+PrintWriter tooLargeResult = getReportWriter("too-large-result")
+
+def yesterday = new SimpleDateFormat('yyyy-MM-dd').with { sdf ->
+    Calendar.getInstance().with { c ->
+        c.add(Calendar.DATE, -1)
+        sdf.format(c.getTime())
+    }
+}
+// WHERE template ('%s' = collection). NOTE(review): 'modified' is compared to a bare date while generationDate is cast with ::date — if 'modified' is a timestamp this matches midnight only; confirm
+def where = """
+    collection = '%s'
+    AND (modified = '$yesterday'
+        OR (data#>>'{@graph,0,generationDate}')::date = '$yesterday')
+"""
+
+visited = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>())  // TODO: remove?
+instancesOfUpdatedLinkedWorks = Collections.synchronizedSet([] as Set)
+
+selectBySqlWhere(String.format(where, 'auth')) {
+    def thing = it.graph[1]
+    if (Normalizers.isInstanceOf(it.whelk.jsonld, thing, 'Work')) {
+        selectBySqlWhere("collection = 'bib' and data#>>'{@graph,1,instanceOf,@id}' = '${thing['@id']}'") {
+            instancesOfUpdatedLinkedWorks.add(it.doc.shortId)
+        }
+    }
+}
+
+def process = { bib ->
+    if (!visited.add(bib.doc.shortId))
+        return
+
+    try {
+        def q = buildQuery(bib)
+        if (!q) {
+            return
+        }
+
+        List ids = queryIds(q).collect()
+
+        if (ids.size() > 200) {
+            tooLargeResult.println("Results: ${ids.size()} Query: ${q}")
+        }
+        else if (ids.size() > 1) {
+            visited.addAll(ids)
+            println(ids.join('\t'))
+        }
+    }
+    catch (Exception e) {
+        failedQueries.println(e)
+        e.printStackTrace()
+        return
+    }
+}
+
+selectByIds(instancesOfUpdatedLinkedWorks) {
+    process(it)
+}
+
+// TODO: Change when starting to run regularly
+//selectBySqlWhere(String.format(where, 'bib')) { bib ->
+selectByCollection('bib') {
+    process(it)
+}
+
+// Builds the query for bib from @type Instance, the record's main title and contributor names.
+// Returns null when the record has no main title or yields no contributor names.
+Map<String, List<String>> buildQuery(bib) {
+    def title = title(bib)
+    if (!title) {
+        return null
+    }
+
+    // Replace linked agents with their loaded descriptions before building the card
+    insertLinkedAgents(bib)
+    def card = bib.asCard(true)
+
+    // Prefer the primary contributor's names; fall back to all contributors only when there are none
+    def names = primaryContributor(card).collect { esSafe(it) }
+    if (!names) {
+        names = contributors(card).collect { esSafe(it) }
+    }
+    if (!names) {
+        return null
+    }
+
+    // The same names are used for both contribution fields
+    return [
+            "q"                                    : ["*"],
+            "@type"                                : ["Instance"],
+            "hasTitle.mainTitle"                   : [esSafe(title)],
+            "or-instanceOf.contribution._str"      : names,
+            "or-instanceOf.contribution.agent._str": names,
+    ]
+}
+
+private void insertLinkedAgents(bib) {
+    getPathSafe(bib.doc.data, ['@graph', 1, 'instanceOf', 'contribution']).each {
+        if (it.agent && it.agent['@id']) {
+            it.agent = loadThing(it.agent['@id'])
+        }
+    }
+}
+
+private String title(bib) {
+    return getPathSafe(bib.doc.data, ['@graph', 1, 'hasTitle', 0, 'mainTitle'])
+}
+
+private List primaryContributor(bib) {
+    contributorStrings(getPathSafe(bib, ['@graph', 1, 'instanceOf', 'contribution'], []).find { it['@type'] == "PrimaryContribution" })
+}
+
+private List contributors(bib) {
+    getPathSafe(bib, ['@graph', 1, 'instanceOf', 'contribution'], []).collect { contributorStrings(it) }.grep().flatten()
+}
+
+//getPathSafe(contribution, ['_str'])?.with { String s -> s.replaceAll(/[^ \p{IsAlphabetic}]/, '') }
+private List contributorStrings(contribution) {
+    List variants = asList(contribution?.agent) + asList(getPathSafe(contribution, ['agent', 'hasVariant']))
+
+    variants.collect { name(it) }.grep()
+}
+
+// "givenName familyName" when the agent has both, otherwise the agent's 'name' (which may be null).
+private String name(Map agent) {
+    if (agent.givenName && agent.familyName) { return "${agent.givenName} ${agent.familyName}" }
+    return agent.name
+}
+
+// Remove ES query operators from string
+private String esSafe(String s) {
+    s.replaceAll('[+|"\\-*~]', " ")
+}
+
+// Follows the keys/indices in path from item. Returns defaultTo when item is falsy
+// or when any step along the way is null.
+private Object getPathSafe(item, path, defaultTo = null) {
+    if (!item) {
+        return defaultTo
+    }
+    for (step in path) {
+        item = item[step]
+        if (item == null) {
+            return defaultTo
+        }
+    }
+    return item
+}
+
+// Returns the second @graph item of the record selected by id,
+// or an empty map when the selection closure is never invoked.
+private Map loadThing(def id) {
+    def loaded = [:]
+    selectByIds([id]) { loaded = it.graph[1] }
+    return loaded
+}
+
+private static List asList(Object o) { // a list is returned as is, any other truthy value is wrapped, falsy values become []
+    o ? (o instanceof List ? o : [o]) : []
+}
\ No newline at end of file
diff --git a/whelktool/scripts/analysis/local-broader.groovy b/whelktool/scripts/analysis/local-broader.groovy
new file mode 100644
index 0000000000..10a87e7bde
--- /dev/null
+++ b/whelktool/scripts/analysis/local-broader.groovy
@@ -0,0 +1,61 @@
+/**
+ * Find unlinked 'broader'
+ *
+ * See LXL-3213 for more information.
+ */
+
+
+import groovy.transform.Memoized
+import whelk.util.DocumentUtil
+
+class Script {
+    static PrintWriter report
+    static PrintWriter selfRef
+    static PrintWriter is404
+    static PrintWriter error
+}
+Script.report = getReportWriter("report.txt")
+Script.selfRef = getReportWriter("self-ref.txt")
+Script.error = getReportWriter("error.txt")
+Script.is404 = getReportWriter("404.txt")
+
+selectByCollection('auth') { auth ->
+    try {
+        process(auth)
+    }
+    catch(Exception e) {
+        // Record failures in the error report (opened above, previously never written to)
+        Script.error.println("${auth.doc.shortId} $e")
+        e.printStackTrace(Script.error)
+    }
+}
+
+void process(auth) { // reports problematic 'broader' entries of one auth record
+    Map thing = auth.graph[1]
+    String id = thing['@id']
+    List broader = thing['broader']
+
+    if (!broader) {
+        return
+    }
+    // broader entries without an '@id'
+    broader.findAll{ !it['@id'] }.each { Map b ->
+        Script.report.println("$id $b")
+    }
+    // broader entries whose '@id' equals the record's own '@id'
+    broader.findAll{ id == it['@id'] }.each { Map b ->
+        Script.selfRef.println("$id")
+    }
+    broader.findAll{ it['@id'] && is404(it['@id']) }.each { Map b -> // has an '@id', but is404() could not load it
+        Script.is404.println("$id $b")
+    }
+}
+
+@Memoized
+boolean is404(String id) { // true when no @graph[1] entity could be loaded for id; result is memoized per id
+    Map thing = null
+    selectByIds([id]) { auth ->
+        thing = auth.graph[1]
+    }
+    return thing == null
+}
\ No newline at end of file
diff --git a/whelktool/scripts/analysis/mediaterm.groovy b/whelktool/scripts/analysis/mediaterm.groovy
new file mode 100644
index 0000000000..4e015e8079
--- /dev/null
+++ b/whelktool/scripts/analysis/mediaterm.groovy
@@ -0,0 +1,27 @@
+import whelk.util.DocumentUtil
+import whelk.util.Statistics
+
+Statistics s = new Statistics(5)
+s.printOnShutdown()
+
+selectByCollection('bib') { bib ->
+    try {
+        // Tally whatever non-blank text follows the first ']' of each marc:mediaTerm
+        // value, both overall ('ALL') and per text before that ']'.
+        DocumentUtil.findKey(bib.doc.data, 'marc:mediaTerm') { String value, path ->
+            int bracketAt = value.indexOf(']')
+            String trailing = bracketAt < 0 ? '' : value.substring(bracketAt + 1)
+            if (!trailing.isBlank()) {
+                String mediaType = value.substring(0, bracketAt)
+                String shortId = bib.doc.shortId
+                s.increment('ALL', trailing, shortId)
+                s.increment(mediaType, trailing, shortId)
+                s.increment('TOTAL', 'TOTAL')
+            }
+        }
+    }
+    catch(Exception e) {
+        println(e)
+        e.printStackTrace()
+    }
+}
\ No newline at end of file
diff --git a/whelktool/scripts/examples/merge-clusters.groovy b/whelktool/scripts/analysis/merge-clusters.groovy
similarity index 58%
rename from whelktool/scripts/examples/merge-clusters.groovy
rename to whelktool/scripts/analysis/merge-clusters.groovy
index c5af74531a..cbb150e180 100644
--- a/whelktool/scripts/examples/merge-clusters.groovy
+++ b/whelktool/scripts/analysis/merge-clusters.groovy
@@ -1,15 +1,11 @@
 import datatool.util.DisjointSets
 
-String dir = System.getProperty('clustersDir')
-mergeClusters(
-        new File(dir, 'clusters.tsv'),
-        new File(dir, 'clusters-merged.tsv'))
+mergeClusters(new File(System.getProperty('clusters')))
 
-void mergeClusters(File input, File output) throws FileNotFoundException {
+void mergeClusters(File clusters) throws FileNotFoundException {
     DisjointSets<String> sets = new DisjointSets<>()
-    PrintWriter p = new PrintWriter(output)
 
-    input.eachLine() {
+    clusters.eachLine {
         sets.addSet(Arrays.asList(it.split(/[\t ]+/)))
     }
 
@@ -19,13 +15,13 @@ void mergeClusters(File input, File output) throws FileNotFoundException {
         void nextElement(String e) {
             if(!first)
                 print('\t')
-            p.print(e)
+            print(e)
             first = false
         }
 
         @Override
         void closeSet() {
-            p.println()
+            println()
             first = true
         }
     })
diff --git a/whelktool/scripts/analysis/oversattning-without.trl.groovy b/whelktool/scripts/analysis/oversattning-without.trl.groovy
new file mode 100644
index 0000000000..18e008e764
--- /dev/null
+++ b/whelktool/scripts/analysis/oversattning-without.trl.groovy
@@ -0,0 +1,49 @@
+import whelk.util.DocumentUtil
+import whelk.util.Statistics
+
+Statistics s = new Statistics().printOnShutdown()
+
+selectByCollection('bib') { bib ->
+    def work = getWork(bib)
+
+    if(!work) {
+        return
+    }
+    // Print works flagged as translations that have no translator relator although the responsibilityStatement contains 'övers'
+    if(work['marc:languageNote'] == "marc:ItemIsOrIncludesATranslation"
+            && noTranslator(work.contribution ?: [])
+            && (bib.graph[1].responsibilityStatement ?: "").contains('övers')
+    ) {
+        println ("""
+            ${bib.doc.getURI()}
+            ${work.contribution}
+            ${bib.graph[1].responsibilityStatement}
+
+            """.stripIndent())
+        s.increment('tot', 'tot')
+    }
+}
+
+boolean noTranslator(def contribution) { // true unless some '@id' value visited by findKey is the translator relator IRI
+    boolean found = false
+    DocumentUtil.findKey(contribution, '@id') { value, path ->
+        if (value == 'https://id.kb.se/relator/translator') {
+            found = true
+        }
+        DocumentUtil.NOP
+    }
+
+    return !found
+}
+
+
+// First of @graph[1] and @graph[2] that isInstanceOf(..., 'Work'); null if neither is.
+Map getWork(def bib) {
+    def graph = bib.graph
+    for (candidate in [graph[1], graph[2]]) {
+        if (candidate && isInstanceOf(candidate, 'Work')) {
+            return candidate
+        }
+    }
+    return null
+}
\ No newline at end of file
diff --git a/whelktool/scripts/analysis/respStatement-to-contribution.groovy b/whelktool/scripts/analysis/respStatement-to-contribution.groovy
new file mode 100644
index 0000000000..1dc2201096
--- /dev/null
+++ b/whelktool/scripts/analysis/respStatement-to-contribution.groovy
@@ -0,0 +1,178 @@
+import groovy.transform.Memoized
+import whelk.util.Statistics
+
+import static datatool.scripts.mergeworks.Util.asList
+import static datatool.scripts.mergeworks.Util.parseRespStatement
+import static datatool.scripts.mergeworks.Util.getPathSafe
+import static datatool.scripts.mergeworks.Util.Relator
+import static datatool.scripts.mergeworks.Util.bestEncodingLevel
+import static datatool.scripts.mergeworks.WorkToolJob.nameMatch
+
+PrintWriter allStatements = getReportWriter("all-statements.csv")
+PrintWriter notParsed = getReportWriter("not-parsed.txt")
+PrintWriter roleSpecified = getReportWriter("role-specified.tsv")
+PrintWriter agentFoundInCluster = getReportWriter("agent-found-in-cluster.tsv")
+PrintWriter parsedButUnmatched = getReportWriter("parsed-but-unmatched.tsv")
+PrintWriter pseudonyms = getReportWriter("pseudonyms")
+
+Statistics s = new Statistics().printOnShutdown()
+
+def clusters = System.getProperty('clustersDir')
+        .with {new File(it, 'clusters.tsv') }
+        .collect { it.split() as List }
+
+clusters.each { cluster ->
+    s.increment('fetch contribution from respStatement', 'clusters checked')
+    // For each document of the cluster: compare the contributors parsed from its responsibilityStatement with its contribution
+    selectByIds(cluster) { bib ->
+        def data = bib.doc.data
+        def id = bib.doc.shortId
+        def respStatement = getPathSafe(data, ['@graph', 1, 'responsibilityStatement'])
+        def encodingLevel = getPathSafe(data, ['@graph', 0, 'encodingLevel'])
+
+        if (!respStatement)
+            return
+
+        s.increment('fetch contribution from respStatement', 'docs checked')
+        allStatements.println(respStatement)
+
+        def contributionsInRespStmt = parseRespStatement(respStatement)
+        def contribution = getPathSafe(data, ['@graph', 1, 'instanceOf', 'contribution'], [])
+
+        if (contributionsInRespStmt.isEmpty()) {
+            notParsed.println([respStatement, id].join('\t'))
+            return
+        }
+
+        contribution.each { Map c ->
+            asList(c.agent).each { a ->
+                def matchedOnName = contributionsInRespStmt.find { n, r ->
+                    nameMatch(n, loadIfLink(a))
+                }
+
+                if (!matchedOnName)
+                    return
+
+                // Contributor found locally, omit from further search
+                contributionsInRespStmt.remove(matchedOnName.key)
+
+                // Not added: the unspecified-contributor relator, and AUTHOR from the first statement part unless c is the PrimaryContribution
+                def dontAdd = { Relator relator, boolean isFirstStmtPart ->
+                    relator == Relator.UNSPECIFIED_CONTRIBUTOR
+                            || isFirstStmtPart && relator == Relator.AUTHOR
+                                && c.'@type' != 'PrimaryContribution'
+                }
+
+                def rolesInRespStatement = matchedOnName.value
+                        .findResults { dontAdd(it) ? null : it.getV1() }
+
+                if (rolesInRespStatement.isEmpty())
+                    return
+
+                def rolesInContribution = asList(c.role).findAll { it.'@id' != Relator.UNSPECIFIED_CONTRIBUTOR.iri }
+                def roleShort = { it.split('/').last() }
+                def joinRoles = { roles -> roles.collect { r -> r.'@id' ? roleShort(r.'@id') : 'BLANK' }.join('|') }
+                // An EDITOR in the statement with an ADAPTER role in the contribution: relabel that role as EDITOR and drop EDITOR from the statement roles
+                rolesInRespStatement.removeAll { r ->
+                    r == Relator.EDITOR && rolesInContribution.findIndexOf {
+                        it.'@id' == Relator.ADAPTER.iri
+                    }.with {
+                        if (it == -1) {
+                            return false
+                        } else {
+                            rolesInContribution[it]['@id'] = Relator.EDITOR.iri
+                            return true
+                        }
+                    }
+                }
+
+                if (rolesInRespStatement.size() <= rolesInContribution.size())
+                    return
+
+                rolesInRespStatement.each { r ->
+                    def idLink = ['@id': r.iri]
+                    if (!(idLink in rolesInContribution)) {
+                        rolesInContribution << idLink
+                        s.increment('fetch contribution from respStatement', "${roleShort(r.iri)} roles specified")
+                        roleSpecified.println([id, joinRoles(asList(c.role)), joinRoles(rolesInContribution), matchedOnName.key, respStatement].join('\t'))
+                    }
+                }
+            }
+        }
+        // Roles as ['@id': iri] maps, without the unspecified-contributor relator, for comparison against c.role
+        def comparable = {
+            it*.getV1().findResults { Relator r ->
+                r != Relator.UNSPECIFIED_CONTRIBUTOR
+                        ? ['@id': r.iri]
+                        : null
+            }
+        }
+        // Names still left were not matched in this document's contribution; look for them in the documents of the cluster
+        contributionsInRespStmt.each { name, roles ->
+            def roleShort = { it.getV1().iri.split('/').last() }
+            def concat = { it.collect { r -> roleShort(r) }.join('|') }
+
+            def found = false
+
+            for (String otherId : cluster) {
+                def doc = loadDoc(otherId)
+                if (!doc)
+                    continue
+                def otherEncodingLevel = getPathSafe(doc.data, ['@graph', 0, 'encodingLevel'])
+
+                def matched = getPathSafe(doc.data, ['@graph', 1, 'instanceOf', 'contribution'], [])
+                        .find { Map c ->
+                            asList(c.agent).any { a ->
+                                nameMatch(name, loadIfLink(a))
+                                        && comparable(roles).with { r -> !r.isEmpty() && asList(c.role).containsAll(r) }
+                                        && bestEncodingLevel.indexOf(encodingLevel) <= bestEncodingLevel.indexOf(otherEncodingLevel)
+                            }
+                        }
+
+                if (matched) {
+                    def isPseudonym = {
+                        asList(it.agent).any { a ->
+                            loadIfLink(a).description =~ /(?i)pseud/
+                        }
+                    }
+
+                    if (isPseudonym(matched)) {
+                        pseudonyms.println([id, concat(roles), name, otherId].join('\t'))
+                        continue
+                    }
+
+                    roles.each { s.increment('fetch contribution from respStatement', "${roleShort(it)} found in cluster") }
+                    agentFoundInCluster.println([id, concat(roles), name, otherId, respStatement].join('\t'))
+
+                    found = true
+                    break
+                }
+            }
+
+            if (!found)
+                parsedButUnmatched.println([id, concat(roles), name, respStatement].join('\t'))
+        }
+    }
+}
+
+def loadIfLink(Map agent) {
+    agent['@id'] ? loadThing(agent['@id']) : agent
+}
+
+@Memoized
+def loadThing(String id) {
+    def thing = [:]
+    selectByIds([id]) { t ->
+        thing = t.graph[1]
+    }
+    return thing
+}
+
+@Memoized
+def loadDoc(String id) { // memoized per id; null when selectByIds never invokes the closure
+    def doc
+    selectByIds([id]) { d ->
+        doc = d.doc
+    }
+    return doc
+}
diff --git a/whelktool/scripts/analysis/responsibilityStatement.groovy b/whelktool/scripts/analysis/responsibilityStatement.groovy
new file mode 100644
index 0000000000..b9f5aafdc5
--- /dev/null
+++ b/whelktool/scripts/analysis/responsibilityStatement.groovy
@@ -0,0 +1,12 @@
+selectByCollection('bib') { bib ->
+    def (record, thing) = bib.graph
+    if (thing.responsibilityStatement) { // one tab-separated line per @graph[1] entity that has a responsibilityStatement
+        int numContribution = asList(thing.instanceOf?.contribution).size()
+        String title = thing.hasTitle?.mainTitle ?: (thing.hasTitle ?: '') // mainTitle if truthy, else the raw hasTitle, else ''
+        println(String.format("%s\t%3s\t%s\t\t%s", bib.doc.shortId, numContribution, thing.responsibilityStatement, title))
+    }
+}
+
+List asList(Object o) {
+    (o ?: []).with { it instanceof List ? it : [it] }
+}
\ No newline at end of file
diff --git a/whelktool/scripts/analysis/subject-404.groovy b/whelktool/scripts/analysis/subject-404.groovy
new file mode 100644
index 0000000000..e453172dd6
--- /dev/null
+++ b/whelktool/scripts/analysis/subject-404.groovy
@@ -0,0 +1,38 @@
+import whelk.util.DocumentUtil
+import whelk.util.Statistics
+
+class Script {
+    static Statistics s = new Statistics().printOnShutdown()
+}
+
+selectByCollection('bib') { bib ->
+    try {
+        process(bib)
+    }
+    catch(Exception e) {
+        System.err.println(e)
+        e.printStackTrace()
+    }
+
+}
+
+// Tally sameAs links containing 'id.kb.se' on subjects that are not ComplexSubject.
+void process(bib) {
+    def (record, thing) = bib.graph
+    Map work = thing['instanceOf']
+    if (!work || !work['subject']) {
+        return
+    }
+
+    for (Map subject in (work['subject'] as List<Map>)) {
+        if (subject['@type'] == 'ComplexSubject') {
+            continue
+        }
+        // Only the first sameAs entry is inspected
+        def firstSameAs = subject['sameAs'] ? subject['sameAs'][0] : null
+        def link = firstSameAs ? firstSameAs['@id'] : null
+        if (link && link.contains('id.kb.se')) {
+            Script.s.increment('sameAs', link, bib.doc.shortId)
+        }
+    }
+}
\ No newline at end of file
diff --git a/whelktool/scripts/examples/works.groovy b/whelktool/scripts/analysis/works.groovy
similarity index 100%
rename from whelktool/scripts/examples/works.groovy
rename to whelktool/scripts/analysis/works.groovy
diff --git a/whelktool/scripts/analysis/works3.groovy b/whelktool/scripts/analysis/works3.groovy
deleted file mode 100644
index 41b37f8b77..0000000000
--- a/whelktool/scripts/analysis/works3.groovy
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * (When running, redirect STDERR to avoid annoying prints from whelktool)
- */
-
-import java.util.concurrent.ConcurrentHashMap
-
-clusterLog = getReportWriter("clusters.tsv")
-
-visited = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>())
-
-selectByCollection('bib') { bib ->
-    if (!visited.add(bib.doc.shortId))
-        return
-
-    try {
-        def q = buildQuery(bib)
-        if (!q) {
-            return
-        }
-
-        List ids = queryIds(q).collect()
-
-        if (ids.size() > 1) {
-            visited.addAll(ids)
-            clusterLog.println(ids.join('\t'))
-        }
-    }
-    catch (Exception e) {
-        println(e)
-        return
-    }
-}
-
-exit()
-
-List<Map<String, List<String>>> buildQueries(bib) {
-    def title = title(bib)
-
-    if (!title)
-        return null
-
-
-}
-
-Map<String, List<String>> buildQuery(bib) {
-
-
-    Map<String, List<String>> query = [
-            "q"                                : ["*"],
-            "@type"                            : ["*"],
-            "hasTitle.mainTitle"               : [title + "~"],
-    ]
-
-    def author = primaryContributorId(bib)
-    if (author) {
-        query["instanceOf.contribution.agent.@id"] = [author]
-        return query
-    }
-
-    def contributors = contributorStrings(bib)
-    if (contributors) {
-        query["instanceOf.contribution._str"] = contributors.collect{ it + "~" }
-        return query
-    }
-
-    return null
-}
-
-synchronized void exit() {
-    System.exit(0)
-}
-
-private String title(bib) {
-    return getPathSafe(bib.doc.data, ['@graph', 1, 'hasTitle', 0, 'mainTitle'])
-}
-
-private String primaryContributorId(bib) {
-    def primary = getPathSafe(bib.doc.data, ['@graph', 1, 'instanceOf', 'contribution'], []).grep{ it['@type'] == "PrimaryContribution"}
-    return getPathSafe(primary, [0, 'agent', '@id'])
-}
-
-private List contributorStrings(bib) {
-    return getPathSafe(bib.asCard(true), ['@graph', 1, 'instanceOf', 'contribution'], [])['_str'].grep{it}
-}
-
-private String flatTitle(bib) {
-    return flatten(
-            bib.doc.data['@graph'][1]['hasTitle'],
-            ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', ]
-    )
-}
-
-private String flatten(Object o, List order) {
-    if (o instanceof String) {
-        return o
-    }
-    if (o instanceof List) {
-        return o
-                .collect{ flatten(it, order) }
-                .join(' || ')
-    }
-    if (o instanceof Map) {
-        return order
-                .collect{ o.get(it, null) }
-                .grep{ it != null }
-                .collect{ flatten(it, order) }
-                .join(' | ')
-    }
-
-    throw new RuntimeException(String.format("unexpected type: %s for %s", o.class.getName(), o))
-}
-
-private Object getPathSafe(item, path, defaultTo = null) {
-    for (p in path) {
-        if (item[p] != null) {
-            item = item[p]
-        } else {
-            return defaultTo
-        }
-    }
-    return item
-}
diff --git a/whelktool/scripts/cleanups/2021/09/lxl-3303-instance-summary.groovy b/whelktool/scripts/cleanups/2021/09/lxl-3303-instance-summary.groovy
new file mode 100644
index 0000000000..53c6d9e01b
--- /dev/null
+++ b/whelktool/scripts/cleanups/2021/09/lxl-3303-instance-summary.groovy
@@ -0,0 +1,48 @@
+/**
+ * Move summary supplied by some providers from work to instance
+ *
+ * See LXL-3303 for more information
+ */
+
+providers = [
+        '[Barnbokskatalogen]',
+        '[Elib]',
+        '[Publit]',
+        'Provided by publisher'
+]
+
+def where = """
+  collection = 'bib' 
+  AND data#>>'{@graph, 1, instanceOf, @type}' = 'Text'
+  AND data#>>'{@graph, 1, instanceOf, summary}' IS NOT NULL
+  AND deleted = false
+  """
+
+Set shape = ['@type', 'label'] as Set
+
+selectBySqlWhere(where) { bib ->
+    // asList: tolerate a summary stored as a single object instead of a list
+    List summaries = asList(bib.graph[1]['instanceOf']['summary'])
+    // toInstance: summaries shaped exactly {@type, label} whose label mentions one of the providers
+    def (toInstance, toWork) = summaries.split { Map s ->
+        s.keySet() == shape
+                && providers.any { p -> asList(s.label).any { l -> l.contains(p) } }
+    }
+    if (toInstance) {
+        if (toWork) {
+            bib.graph[1]['instanceOf']['summary'] = toWork
+        } else {
+            bib.graph[1]['instanceOf'].remove('summary')
+        }
+        bib.graph[1]['summary'] = asList(bib.graph[1]['summary']) + toInstance
+        bib.scheduleSave()
+    }
+}
+
+private List asList(Object o) { // null -> [], a List -> itself, anything else -> [o]
+    switch (o) {
+        case null: return []
+        case List: return o
+        default: return [o]
+    }
+}
diff --git a/whelktool/scripts/cleanups/2021/09/lxl-3376-elib-cover-designer.groovy b/whelktool/scripts/cleanups/2021/09/lxl-3376-elib-cover-designer.groovy
new file mode 100644
index 0000000000..cf7ab8d140
--- /dev/null
+++ b/whelktool/scripts/cleanups/2021/09/lxl-3376-elib-cover-designer.groovy
@@ -0,0 +1,76 @@
+PrintWriter unhandled = getReportWriter("unhandled.txt")
+
+def where = """
+  collection = 'bib' 
+  AND (data#>>'{@graph, 1, instanceOf, summary}' like '%ormgivare:%[Elib]%' OR data#>>'{@graph, 1, summary}' like '%ormgivare:%[Elib]%')
+  AND deleted = false
+  """
+
+ROLES = [
+        'Formgivare:' : 'https://id.kb.se/relator/designer',
+        'Omslagsformgivare:' : 'https://id.kb.se/relator/coverDesigner'
+]
+
+OTHER = [['@id': 'https://id.kb.se/relator/unspecifiedContributor']]
+
+selectBySqlWhere(where) { bib ->
+    // Null-safe in case the instance has no instanceOf (the where clause also matches on the instance's own summary)
+    def summary = asList(bib.graph[1]['instanceOf']?.summary) + asList(bib.graph[1]['summary'])
+    def nameToRoles = summary
+            .findResults { it['label']}
+            .join(' ')
+            .with { parseDesigners(it) }
+    // A missing contribution list is treated as empty, so the record ends up in the 'unhandled' report
+    List workContribution = bib.graph[1]['instanceOf']?.contribution ?: []
+    if (workContribution.removeAll { !it.agent }) {
+        bib.scheduleSave()
+    }
+
+    def coverDesigners = workContribution.findAll {
+        // NOTE(review): ROLES.values() holds IRI strings; if it.role holds ['@id': ...] maps, `a` can never be true — confirm intent
+        def a = it.role && ROLES.values().containsAll(it.role)
+        def b = nameToRoles.containsKey(name(it.agent)) && (it.role == OTHER || !it.role)
+        a || b
+    }
+
+    if (!coverDesigners) {
+        unhandled.println("${bib.doc.shortId} c:$workContribution d:$nameToRoles")
+        return
+    }
+    // Move the matched contributions from the work to the instance, with roles taken from the summary
+    workContribution.removeAll(coverDesigners)
+    coverDesigners.each { it['role'] = nameToRoles[name(it.agent)].collect { ['@id' : it] } }
+
+    bib.graph[1]['contribution'] = (bib.graph[1]['contribution'] ?: []) + coverDesigners
+    bib.scheduleSave()
+}
+
+private Map parseDesigners(String summary) { // returns a map from name to the list of ROLES IRIs found for it
+    def roleToNames = ROLES.collectEntries { s, id ->
+        def names = summary
+                .findAll(/$s[^\[,]+/)
+                .collect { it.substring(s.size()) }
+                .collect { it.trim() }
+        // a name is the trimmed text after the ROLES label, up to the next '[' or ','
+        [(id) : names]
+    }
+    // invert role -> names into name -> roles
+    def nameToRoles = [:]
+    roleToNames.each { role, names ->
+        names.each { n -> nameToRoles[n] = nameToRoles.getOrDefault(n, []) + [role] }
+    } 
+    
+    return nameToRoles
+}
+
+private String name(Map agent) { // always "<givenName> <familyName>"; a missing part is rendered as the text "null"
+    "${agent.givenName} ${agent.familyName}"
+}
+
+/*
+ * Wrap a value as a List: null becomes an empty list, a List is returned
+ * unchanged, and any other value becomes a one-element list.
+ */
+private List asList(Object o) {
+    return o == null ? [] : (o instanceof List ? o : [o])
+}
\ No newline at end of file
diff --git a/whelktool/scripts/cleanups/2021/09/lxl-3376-language-in-work-title.groovy b/whelktool/scripts/cleanups/2021/09/lxl-3376-language-in-work-title.groovy
new file mode 100644
index 0000000000..0a0b1e2c08
--- /dev/null
+++ b/whelktool/scripts/cleanups/2021/09/lxl-3376-language-in-work-title.groovy
@@ -0,0 +1,63 @@
+import groovy.transform.Memoized
+import whelk.util.DocumentUtil
+
+PrintWriter report = getReportWriter("report.txt")
+
+def ids = new File(System.getProperty('ids'))
+        .readLines()
+        .collect { it.split('\t').collect { it.trim()} }
+        .flatten()
+
+selectByIds(ids) { bib ->
+    // Regex-quoted, lower-cased Swedish names of the work language and the original language.
+    // Missing language links are skipped instead of being looked up as '' (which gave "not found").
+    def langs = [
+            [1, 'instanceOf', 'language', 0, '@id'],
+            [1, 'instanceOf', 'translationOf', 0, 'language', 0, '@id']
+    ].findResults { getPathSafe(bib.graph, it) }
+            .collect { java.util.regex.Pattern.quote(langName(it).toLowerCase()) }
+
+    boolean changed = DocumentUtil.traverse(bib.graph[1].instanceOf) { value, path ->
+        if (path && 'mainTitle' in path && value instanceof String) {
+            for (lang in langs) {
+                // Strip a trailing "(<language>)" or "((<language>))", case-insensitively
+                String r = value.replaceAll(/(?i)\s*\(\(?\s*${lang}\s*\)\)?\s*$/, '')
+                if (value != r) {
+                    report.println("$value -> $r")
+                    return new DocumentUtil.Replace(r)
+                }
+            }
+        }
+        return DocumentUtil.NOP
+    }
+
+    if (changed) bib.scheduleSave()
+}
+
+private Object getPathSafe(item, path, defaultTo = null) { // value found by following path from item, else defaultTo
+    if (!item) {
+        return defaultTo
+    }
+    // each step p is used as key/index into the current item; a null result ends the walk with defaultTo
+    for (p in path) {
+        if (item[p] != null) {
+            item = item[p]
+        } else {
+            return defaultTo
+        }
+    }
+    return item
+}
+
+@Memoized
+private String langName(def id) { // prefLabelByLang.sv of the thing loaded for id, or "NOT FOUND"; memoized per id
+    getPathSafe(loadThing(id), ['prefLabelByLang', 'sv'], "NOT FOUND")
+}
+
+private Map loadThing(def id) {
+    def thing = [:]
+    selectByIds([id]) { t ->
+        thing = t.graph[1]
+    }
+    return thing
+}
\ No newline at end of file
diff --git a/whelktool/scripts/cleanups/2023/05/lxl-4183-elib-cover-designer.groovy b/whelktool/scripts/cleanups/2023/05/lxl-4183-elib-cover-designer.groovy
new file mode 100644
index 0000000000..f9006c7f33
--- /dev/null
+++ b/whelktool/scripts/cleanups/2023/05/lxl-4183-elib-cover-designer.groovy
@@ -0,0 +1,171 @@
+import groovy.transform.Memoized
+
+import java.util.concurrent.ConcurrentHashMap
+
+PrintWriter matchedAndSpecified = getReportWriter("matched.tsv")
+PrintWriter unmatchedSpecifiedAnyway = getReportWriter("mismatched.tsv")
+PrintWriter matchedInOtherWork = getReportWriter("matched-in-other-work.tsv")
+PrintWriter notSpecifiedMovedToInstance = getReportWriter("not-specified-moved-to-instance.txt")
+
+def where = """
+  collection = 'bib'
+  AND data#>>'{@graph, 0, identifiedBy}' LIKE '%Elib%'
+  AND (data#>>'{@graph, 1, instanceOf, summary}' is not null OR data#>>'{@graph, 1, summary}' is not null)
+  AND deleted = false
+  """
+
+ROLES = [
+        'Formgivare:'       : 'https://id.kb.se/relator/bookDesigner',
+        'Omslag:'           : 'https://id.kb.se/relator/coverDesigner',
+        'Omslagsformgivare:': 'https://id.kb.se/relator/coverDesigner',
+]
+
+OTHER = [['@id': 'https://id.kb.se/relator/unspecifiedContributor']]
+
+Map<String, Set<String>> knownNames = new ConcurrentHashMap(['https://id.kb.se/relator/bookDesigner'     : new ConcurrentHashMap().newKeySet(),
+                                                             'https://id.kb.se/relator/coverDesigner': new ConcurrentHashMap().newKeySet()])
+Map<String, Set<String>> knownAgents = new ConcurrentHashMap(['https://id.kb.se/relator/bookDesigner'     : new ConcurrentHashMap().newKeySet(),
+                                                              'https://id.kb.se/relator/coverDesigner': new ConcurrentHashMap().newKeySet()])
+Set<String> handled = new ConcurrentHashMap().newKeySet()
+
+selectBySqlWhere(where) { bib ->
+    def id = bib.doc.shortId
+    def instance = bib.graph[1]
+    def summary = asList(instance['instanceOf']?.summary) + asList(bib.graph[1]['summary'])
+    // Parse name -> roles from the summary labels and record them per name (synchronized list: shared between record closures)
+    def nameToRoles = summary
+            .findResults { it['label'] }
+            .join(' ')
+            .with { parseRoles(it) }
+            .each { name, roles ->
+                knownNames.computeIfAbsent(name, f -> Collections.synchronizedList([])).add(roles)
+            }
+
+    List workContribution = instance['instanceOf']?.contribution
+    if (!workContribution) {
+        return
+    }
+
+    def modified = workContribution.removeAll { !it.agent }
+
+    Set existingRoles = workContribution.collect { asList(it.role)*.'@id' }.grep().flatten()
+    // Replace the unspecified-contributor role where the agent's name matches a name parsed from the summary
+    if (existingRoles.contains('https://id.kb.se/relator/unspecifiedContributor') && nameToRoles) {
+        workContribution.each { c ->
+            if (asList(c.role) == OTHER) {
+                def agentName = name(loadIfLink(c.agent))
+                def roles = nameToRoles[agentName]
+                if (roles) {
+                    c['role'] = roles.collect { ['@id': it] }
+                    matchedAndSpecified.println([id, c.agent, roles].join('\t'))
+                    nameToRoles.remove(agentName)
+                    modified = true
+                }
+            }
+        }
+
+        def other = workContribution.findAll { asList(it.role) == OTHER }
+        // Exactly one unmatched name and one unspecified contribution left: pair them although the names differ
+        if (nameToRoles.size() == 1 && other.size() == 1) {
+            def c = other[0]
+            def name = nameToRoles.keySet()[0]
+            def roles = nameToRoles[name]
+            other[0]['role'] = roles.collect { ['@id': it] }
+            other.clear()
+            unmatchedSpecifiedAnyway.println([id, c.agent, name, roles].join('\t'))
+            modified = true
+        }
+
+        if (other.isEmpty()) {
+            handled.add(id)
+        }
+    }
+    // Record the role lists of agents that carry one of the roles found among knownAgents' keys
+    workContribution.each { c ->
+        def roles = asList(c.role)*.'@id'
+        if (knownAgents.keySet().intersect(roles)) {
+            knownAgents.computeIfAbsent(c.agent, f -> Collections.synchronizedList([])).add(roles)
+        }
+    }
+
+    if (modified) {
+        bib.scheduleSave()
+    }
+}
+
+selectBySqlWhere("collection = 'bib' AND data#>>'{@graph, 0, identifiedBy}' LIKE '%Elib%' AND deleted = false") { bib ->
+    def id = bib.doc.shortId
+    if (id in handled) {
+        return
+    }
+    def instance = bib.graph[1]
+    List workContribution = instance['instanceOf']['contribution']
+    if (!workContribution) {
+        return
+    }
+    // NOTE(review): this removal alone does not schedule a save — confirm that is intended
+    workContribution.removeAll { !it.agent }
+    // Specify the role only when every role list recorded for the agent (or its name) is identical and was recorded more than twice
+    workContribution.each { c ->
+        if (asList(c.role) == OTHER) {
+            def roles = knownAgents[c.agent] ?: knownNames[name(loadIfLink(c.agent))]
+            if (roles) {
+                def countByRole = roles.countBy { it }.sort { -it.value }
+                if (countByRole.size() == 1) {
+                    countByRole.find { it.value > 2 }?.with {
+                        def role = it.key.find()
+                        def count = it.value
+                        c['role'] = [['@id': role]]
+                        matchedInOtherWork.println([id, c.agent, role, count].join('\t'))
+                        bib.scheduleSave()
+                    }
+                }
+            }
+        }
+    }
+    // Contributions whose role is still the unspecified one are moved from the work to the instance
+    workContribution.removeAll { c ->
+        if (asList(c.role) == OTHER) {
+            instance['contribution'] = asList(instance['contribution']) + c
+            notSpecifiedMovedToInstance.println(id)
+            bib.scheduleSave()
+            return true
+        }
+        return false
+    }
+}
+
+private Map parseRoles(String summary) {
+    def roleToNames = ROLES.collectEntries { s, id ->
+        def names = summary
+                .findAll(/$s[^\[,"]+/)
+                .collect { it.substring(s.size()) }
+                .collect { it.trim() }
+
+        [(id): names]
+    }
+
+    def nameToRoles = [:]
+    roleToNames.each { role, names ->
+        names.each { n -> nameToRoles[n] = nameToRoles.getOrDefault(n, []) + [role] }
+    }
+
+    return nameToRoles
+}
+
+private String name(Map agent) {
+    agent.name ?: "${agent.givenName} ${agent.familyName}"
+}
+
+private Map loadIfLink(Map m) {
+    m['@id'] ? loadThing(m['@id']) : m
+}
+
+@Memoized
+private Map loadThing(def id) {
+    def thing = [:]
+    selectByIds([id]) { t ->
+        thing = t.graph[1]
+    }
+    return thing
+}
\ No newline at end of file
diff --git a/whelktool/scripts/examples/contribution-role.groovy b/whelktool/scripts/examples/contribution-role.groovy
new file mode 100644
index 0000000000..32a9015d01
--- /dev/null
+++ b/whelktool/scripts/examples/contribution-role.groovy
@@ -0,0 +1,43 @@
+import whelk.util.DocumentUtil
+import datatool.util.Statistics
+
+Statistics s = new Statistics()
+s.printOnShutdown()
+
+selectByCollection('bib') { bib ->
+    try {
+        DocumentUtil.findKey(bib.doc.data, 'role') { Object value, path ->
+            count(s, value)
+        }
+    }
+    catch(Exception e) {
+        println(e)
+        e.printStackTrace()
+    }
+}
+
+
+private String normalize(String s) { // lower-case, delete the punctuation characters below, then trim
+    Map noise = ',"\'[].:;-()–+!?'.toList().collectEntries { ch -> [ch, ''] }
+    s.toLowerCase().replace(noise).trim()
+}
+
+void count(Statistics s, Object role) { // tallies a 'role' value by its kind; recurses into lists
+    if (role instanceof Map && !role['@id']) { // maps that have an '@id' are not counted
+        count1(s, role, 'code')
+        count1(s, role, 'label')
+    }
+    else if (role instanceof String) {
+        s.increment('string', role.toString())
+    }
+    else if (role instanceof List) {
+        s.increment('list size', role.size())
+        role.each { count(s, it) }
+    }
+}
+
+void count1(Statistics s, Map thing, String prop) {
+    if (thing[prop]) {
+        s.increment(prop, normalize(thing[prop].toString()))
+    }
+}
\ No newline at end of file
diff --git a/whelktool/scripts/examples/works2.groovy b/whelktool/scripts/examples/works2.groovy
deleted file mode 100644
index 8f31ad43f0..0000000000
--- a/whelktool/scripts/examples/works2.groovy
+++ /dev/null
@@ -1,116 +0,0 @@
-/**
- * (When running, redirect STDERR to avoid annoying prints from whelktool)
- */
-
-import java.util.concurrent.ConcurrentHashMap
-
-clusterLog = getReportWriter("clusters.tsv")
-
-visited = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>())
-
-selectByCollection('bib') { bib ->
-    if (!visited.add(bib.doc.shortId))
-        return
-    
-    try {
-        def q = buildQuery(bib)
-        if (!q) {
-            return
-        }
-
-        List ids = queryIds(q).collect()
-
-        if (ids.size() > 1) {
-            visited.addAll(ids)
-            clusterLog.println(ids.join('\t'))
-        }
-    }
-    catch (Exception e) {
-        println(e)
-        return
-    }
-}
-
-exit()
-
-Map<String, List<String>> buildQuery(bib) {
-    def title = title(bib)
-
-    if (!title)
-        return null
-
-    Map<String, List<String>> query = [
-            "q"                                : ["*"],
-            "@type"                            : ["*"],
-            "hasTitle.mainTitle"               : [title + "~"],
-    ]
-
-    def author = primaryContributorId(bib)
-    if (author) {
-        query["instanceOf.contribution.agent.@id"] = [author]
-        return query
-    }
-
-    def contributors = contributorStrings(bib)
-    if (contributors) {
-        query["instanceOf.contribution._str"] = contributors.collect{ it + "~" }
-        return query
-    }
-
-    return null
-}
-
-synchronized void exit() {
-    System.exit(0)
-}
-
-private String title(bib) {
-    return getPathSafe(bib.doc.data, ['@graph', 1, 'hasTitle', 0, 'mainTitle'])
-}
-
-private String primaryContributorId(bib) {
-    def primary = getPathSafe(bib.doc.data, ['@graph', 2, 'contribution'], []).grep{ it['@type'] == "PrimaryContribution"}
-    return getPathSafe(primary, [0, 'agent', '@id'])
-}
-
-private List contributorStrings(bib) {
-    return getPathSafe(bib.asCard(true), ['@graph',2,'contribution'], [])['_str'].grep{it}
-}
-
-private String flatTitle(bib) {
-    return flatten(
-            bib.doc.data['@graph'][1]['hasTitle'],
-            ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', ]
-    )
-}
-
-private String flatten(Object o, List order) {
-    if (o instanceof String) {
-        return o
-    }
-    if (o instanceof List) {
-        return o
-                .collect{ flatten(it, order) }
-                .join(' || ')
-    }
-    if (o instanceof Map) {
-        return order
-                .collect{ o.get(it, null) }
-                .grep{ it != null }
-                .collect{ flatten(it, order) }
-                .join(' | ')
-    }
-
-    throw new RuntimeException(String.format("unexpected type: %s for %s", o.class.getName(), o))
-}
-
-private Object getPathSafe(item, path, defaultTo = null) {
-    for (p in path) {
-        if (item[p] != null) {
-            item = item[p]
-        } else {
-            return defaultTo
-        }
-    }
-    return item
-}
diff --git a/whelktool/src/main/groovy/datatool/WorkTool.groovy b/whelktool/src/main/groovy/datatool/WorkTool.groovy
new file mode 100644
index 0000000000..b734f879c9
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/WorkTool.groovy
@@ -0,0 +1,75 @@
+package datatool
+
+import datatool.scripts.mergeworks.Doc
+import groovy.cli.commons.CliBuilder
+import datatool.scripts.mergeworks.WorkToolJob
+
+/**
+ 1) find clusters
+ $ ENV=local && time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --dry-run scripts/analysis/find-work-clusters.groovy
+
+ 2) merge overlapping clusters, output file is placed in same directory as input
+
+ $ CLUSTERSDIR=reports/local-2021...
+ $ ENV=local && time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -DclustersDir=$CLUSTERSDIR -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --dry-run scripts/analysis/merge-clusters.groovy
+
+ 3) run WorkTool on a cluster file (here with -s: generate an HTML report with title clusters)
+ ENV=local && time java -Xmx4G -Dxl.secret.properties=$HOME/secret.properties-$ENV -cp build/libs/whelktool.jar datatool.WorkTool --dry-run -s reports/1000-swedishFiction.tsv
+
+
+ */
+
+/**
+ * Command line entry point for the merge-works tooling.
+ *
+ * Parses the options, configures a WorkToolJob for the cluster TSV file given
+ * as first positional argument and runs the one action selected by the first
+ * matching action/filter flag (see the if/else chain at the end of main).
+ */
+class WorkTool {
+    /**
+     * Exits with status 0 after printing help (-h), and with status 1 when the
+     * options cannot be parsed, the cluster file argument is missing, or no
+     * action flag was given.
+     *
+     * @param args command line arguments: [options] &lt;CLUSTER TSV&gt;
+     */
+    static void main(String[] args) {
+        def cli = new CliBuilder(usage: 'WorkTool [options] <CLUSTER TSV>')
+        cli.h(longOpt: 'help', 'Print this help message and exit.')
+        cli.I(longOpt: 'skip-index', 'Do not index any changes, only write to storage.')
+        cli.d(longOpt: 'dry-run', 'Do not save any modifications.')
+        cli.a(longOpt: 'allow-loud', 'Do loud modifications.')
+        cli.nt(longOpt:'num-threads', args:1, argName:'N', "Override default number of threads.")
+        cli.v(longOpt: 'verbose', '.')
+        cli.r(longOpt: 'report', args: 1, argName: 'report dir', 'Save reports in this directory')
+
+        cli.m(longOpt: 'merge', 'Merge and extract matching works')
+        cli.s(longOpt: 'show', 'Show. Generate HTML report with title clusters')
+        cli.s2(longOpt: 'showWorks', 'Show. Generate HTML report with works')
+        cli.sh(longOpt: 'showHubs', 'Show. Generate HTML report with title clusters containing different works')
+        cli.f(longOpt: 'swedishFiction', 'Filter: output clusters containing swedish fiction')
+        cli.tr(longOpt: 'anonymousTranslation', 'Filter: remove translations without translator')
+        cli.tc(longOpt: 'title-clusters', 'Filter: output title clusters')
+
+        def options = cli.parse(args)
+        if (!options) {
+            // CliBuilder.parse returns null when the arguments cannot be parsed
+            System.exit 1
+        }
+        if (options.h) {
+            cli.usage()
+            System.exit 0
+        }
+
+        def clustersPath = options.arguments() ? options.arguments()[0] : null
+        if (!clustersPath) {
+            // Without a cluster file there is nothing to work on; show usage
+            // instead of failing inside new File(null)
+            cli.usage()
+            System.exit 1
+        }
+
+        def m = new WorkToolJob(new File(clustersPath))
+        m.skipIndex = options.I
+        m.dryRun = options.d
+        m.loud = options.a
+        m.verbose = options.v
+        // Keep the job's own default report directory unless -r was given
+        m.reportDir = options.r ? new File(options.r) : m.reportDir
+        // -1 is passed on when no thread count override was given
+        m.numThreads = options.nt ? Integer.parseInt(options.nt) : -1
+
+        // Exactly one action is run: the first flag that matches, in this order
+        if (options.m) {
+            m.merge()
+        } else if (options.s) {
+            m.show()
+        } else if (options.s2) {
+            m.showWorks()
+        } else if (options.sh) {
+            m.showHubs()
+        } else if (options.f) {
+            m.swedishFiction()
+        } else if (options.tr) {
+            m.filterClusters({ Doc d -> !d.isAnonymousTranslation() })
+        } else if (options.tc) {
+            m.outputTitleClusters()
+        } else {
+            cli.usage()
+            System.exit 1
+        }
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/WorkTool.md b/whelktool/src/main/groovy/datatool/WorkTool.md
new file mode 100644
index 0000000000..c977784e55
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/WorkTool.md
@@ -0,0 +1,35 @@
+* language Swedish
+* Fiction
+* issuanceType Monograph
+* No hasPart
+* encodingLevel is not marc:PartialPreliminaryLevel or marc:PrepublicationLevel
+  TODO: specify a minimal set of properties that must exist?
+
+
+fiction
+-------
+
+
+
+properties
+----------
+
+* **classification** Always take the sum of all works.
+  * SAB/kssb - Merge codes that are the same or prefixes. Take the longer code. Take the latest SAB version. Example: kssb/8 Hc + kssb/7 Hc.02 = kssb/8 Hc.02
+  * Dewey - Merge equal codes with different editionEnumeration, use the newest.
+* **contentType** Allow missing or `https://id.kb.se/term/rda/Text`
+* **subject** Always take the sum of all works. 
+  * TODO: preserve order?
+* **hasTitle** Take from one random work. 
+  * TODO: Take the most common one? Some other metric of "best"?
+* **genreForm** Take from all works. For each of the following pairs, keep only the right-hand term if both terms occur:
+  * marc/NotFictionNotFurtherSpecified -> marc/FictionNotFurtherSpecified (i.e. actually fiction)
+  * marc/FictionNotFurtherSpecified -> marc/Novel
+  * marc/FictionNotFurtherSpecified -> marc/Poetry
+  * marc/NotFictionNotFurtherSpecified -> marc/Autobiography
+  * marc/NotFictionNotFurtherSpecified -> marc/Biography
+
+Instance properties
+* **editionStatement** Added to comparison if it contains "förk" (förkortad = abbreviated). Then it must be the exact same string.
+* **extent** The number of pages parsed from extent may not differ by more than 30%.
+  * TODO: allow missing extent?
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/DisplayDoc.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/DisplayDoc.groovy
new file mode 100644
index 0000000000..b71ba3ad4c
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/DisplayDoc.groovy
@@ -0,0 +1,158 @@
+package datatool.scripts.mergeworks
+
+import whelk.Document
+import whelk.JsonLd
+
+/**
+ * Presentation helper for a {@link Doc}: produces the display strings (some of
+ * them containing HTML markup) that Html puts into the report tables.
+ */
+class DisplayDoc {
+    Doc doc
+    // Framed, embellished data of the document; computed on first use by getFramed()
+    Map framed
+
+    DisplayDoc(Doc doc) {
+        this.doc = doc
+    }
+
+    // Joins "<@type>: <flatTitle>" for every entry of thing['hasTitle'] with ', '
+    private static String displayTitle(Map thing) {
+        thing['hasTitle'].collect { it['@type'] + ": " + it['flatTitle'] }.join(', ')
+    }
+
+    // Display string for the instance titles of the document
+    String instanceDisplayTitle() {
+        displayTitle(['hasTitle': Util.flatTitles(doc.instanceTitle())])
+    }
+
+    /**
+     * Display text for one row of a report table.
+     *
+     * A handful of field names are special-cased; any other name is looked up
+     * in the work data and rendered with chipString.
+     *
+     * @param field a field name as listed in Html.infoFields or a work property
+     */
+    // TODO...
+    String getDisplayText(String field) {
+        if (field == 'contribution') {
+            return contributorStrings().join("<br>")
+        } else if (field == 'classification') {
+            return classificationStrings().join("<br>")
+        } else if (field == 'instance title') {
+            return doc.instanceTitle() ?: ''
+        } else if (field == 'instance type') {
+            return doc.instanceType() ?: ''
+        } else if (field == 'editionStatement') {
+            return doc.editionStatement() ?: ''
+        } else if (field == 'responsibilityStatement') {
+            return doc.responsibilityStatement() ?: ''
+        } else if (field == 'encodingLevel') {
+            return doc.encodingLevel()
+        } else if (field == 'publication') {
+            return chipString(doc.publication())
+        } else if (field == 'identifiedBy') {
+            return chipString(doc.identifiedBy())
+        } else if (field == 'extent') {
+            return chipString(doc.extent() ?: [])
+        } else if (field == 'reproductionOf') {
+            return reproductionOfLink()
+        } else {
+            return chipString(doc.workData.getOrDefault(field, []))
+        }
+    }
+
+    protected String chipString(def thing) {
+        Util.chipString(thing, doc.whelk)
+    }
+
+    /**
+     * In-page link ("#<short id>") to the first reproductionOf entry.
+     * The short id is the entry's '@id' with the base URI prefix and '#it'
+     * removed; it is empty when there is no entry or the entry has no '@id'.
+     */
+    private String reproductionOfLink() {
+        String base = Document.getBASE_URI().toString()
+        String iri = doc.reproductionOf() ? doc.reproductionOf()[0]['@id'] : null
+        String shortId = ''
+        if (iri) {
+            // Only strip the base when it really is a prefix: substring() on some
+            // other IRI would cut arbitrary characters, or throw if it is shorter
+            shortId = (iri.startsWith(base) ? iri.substring(base.length()) : iri).replace('#it', '')
+        }
+        return "<a href=\"#$shortId\">$shortId</a>"
+    }
+
+    // Wraps `string` in an <abbr> element showing `tooltip` as its title.
+    // Neither argument is HTML-escaped.
+    String tooltip(String string, String tooltip) {
+        """<abbr title="${tooltip}">${string}</abbr>"""
+    }
+
+    // URL built from the base URI, "katalogisering/" and the document's short id
+    String link() {
+        String base = Document.getBASE_URI().toString()
+        String kat = "katalogisering/"
+        String id = doc.document.shortId
+        return base + kat + id
+    }
+
+    // One display string per contribution, read from the framed data
+    private List contributorStrings() {
+        // Instance documents keep the work data under 'instanceOf'
+        List path = doc.instanceData ? ['instanceOf', 'contribution'] : ['contribution']
+        List contribution = Util.getPathSafe(getFramed(), path, [])
+
+        return contribution.collect { Map c ->
+            contributionStr(c)
+        }
+    }
+
+    // "<role>: <agent>", wrapped in <b>..</b> for a PrimaryContribution
+    private String contributionStr(Map contribution) {
+        StringBuilder s = new StringBuilder()
+
+        if (contribution['@type'] == 'PrimaryContribution') {
+            s.append('<b>')
+        }
+
+        // The role prefix (and its ': ' separator) is left out when there is no role text
+        s.append(flatMaybeLinked(contribution['role'], ['code', 'label']).with { it.isEmpty() ? it : it + ': ' })
+        s.append(flatMaybeLinked(contribution['agent'], ['givenName', 'familyName', 'lifeSpan', 'name']))
+
+        if (contribution['@type'] == 'PrimaryContribution') {
+            s.append('</b>')
+        }
+
+        return s.toString()
+    }
+
+    // One "<scheme>: <code>" string per classification, read from the framed data
+    List classificationStrings() {
+        List path = doc.instanceData ? ['instanceOf', 'classification'] : ['classification']
+        List classification = Util.getPathSafe(getFramed(), path, [])
+
+        classification.collect { c ->
+            StringBuilder s = new StringBuilder()
+            s.append(flatMaybeLinked(c['inScheme'], ['code', 'version']).with { it.isEmpty() ? it : it + ': ' })
+            s.append(flatMaybeLinked(c, ['code']))
+            return s.toString()
+        }
+    }
+
+    // Flattens `thing` to text; a thing carrying an '@id' becomes a link to that id.
+    // Lists are flattened element by element and joined with ' | '.
+    private static String flatMaybeLinked(Object thing, List order) {
+        if (!thing)
+            return ''
+
+        if (thing instanceof List) {
+            return thing.collect { flatMaybeLinked(it, order) }.join(' | ')
+        }
+        String s = flatten(thing, order, ', ')
+
+        thing['@id']
+                ? """<a href="${thing['@id']}">$s</a>"""
+                : s
+    }
+
+    /**
+     * Flattens a String, List or Map into one string.
+     *
+     * @param o the value to flatten
+     * @param order the map keys to include, in output order; other keys are ignored
+     * @param mapSeparator separator between the values of a map passed directly as `o`;
+     *        nested values are flattened with the default separator
+     * @throws RuntimeException if a value is neither String, List nor Map
+     */
+    static String flatten(Object o, List order, String mapSeparator = ': ') {
+        if (o instanceof String) {
+            return o
+        }
+        if (o instanceof List) {
+            return o
+                    .collect { flatten(it, order) }
+                    .join(' || ')
+        }
+        if (o instanceof Map) {
+            return order
+                    .findResults { ((Map) o).get(it) }
+                    .collect { flatten(it, order) }
+                    .join(mapSeparator)
+        }
+
+        // Null-safe so that a null value is reported by this exception too
+        throw new RuntimeException(String.format("unexpected type: %s for %s", o?.getClass()?.getName(), o))
+    }
+
+    // Framed version of the embellished document, computed once and then reused
+    Map getFramed() {
+        if (!framed) {
+            if (doc.existsInStorage) {
+                framed = JsonLd.frame(doc.thingIri(), doc.whelk.loadEmbellished(doc.shortId()).data)
+            } else {
+                // Not stored yet: embellish a copy so the document itself stays untouched
+                Document copy = doc.document.clone()
+                doc.whelk.embellish(copy)
+                framed = JsonLd.frame(doc.thingIri(), copy.data)
+            }
+        }
+
+        return framed
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/Doc.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Doc.groovy
new file mode 100644
index 0000000000..8a1373e4e7
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Doc.groovy
@@ -0,0 +1,305 @@
+package datatool.scripts.mergeworks
+
+
+import whelk.Document
+import whelk.Whelk
+
+import static whelk.JsonLd.ID_KEY
+import static datatool.scripts.mergeworks.Util.asList
+import datatool.scripts.mergeworks.Util.Relator
+
+/**
+ * Wrapper around a whelk Document that gives convenient, null-tolerant access
+ * to the instance and work data used when clustering, comparing and merging works.
+ *
+ * The main entity is @graph[1]. If it has 'instanceOf' the document is treated
+ * as an instance with an embedded work; otherwise the main entity is the work
+ * itself and instanceData stays null.
+ */
+class Doc {
+    public static final String SAOGF_SKÖN = 'https://id.kb.se/term/saogf/Sk%C3%B6nlitteratur'
+    public static final List MARC_FICTION = [
+            'https://id.kb.se/marc/FictionNotFurtherSpecified',
+            'https://id.kb.se/marc/Drama',
+            'https://id.kb.se/marc/Essay',
+            'https://id.kb.se/marc/Novel',
+            'https://id.kb.se/marc/HumorSatiresEtc',
+            'https://id.kb.se/marc/Letter',
+            'https://id.kb.se/marc/ShortStory',
+            'https://id.kb.se/marc/MixedForms',
+            'https://id.kb.se/marc/Poetry',
+    ]
+    public static final List MARC_NOT_FICTION = [
+            'https://id.kb.se/marc/NotFictionNotFurtherSpecified',
+            'https://id.kb.se/marc/Biography'
+    ]
+    public static final List DRAMA_GF = [
+            'https://id.kb.se/term/saogf/Dramatik',
+            'https://id.kb.se/marc/Drama'
+    ]
+
+
+    Whelk whelk
+    Document document
+
+    // Not set by this class; read by Html.hubTable
+    Collection<Doc> unlinkedInstances
+
+    // Main entity when it is an instance, otherwise null (see setData)
+    Map instanceData
+    // The work: instanceData['instanceOf'] or the main entity itself
+    Map workData
+
+    // Cache for flatInstanceTitle()
+    List<String> flatInstanceTitle
+
+    // Cache for getView()
+    DisplayDoc display
+
+    // Checksum of the document as it was when this Doc was created
+    String preUpdateChecksum
+
+    boolean existsInStorage = true
+    // Set to true by replaceWorkData and addCloseMatch
+    boolean modified = false
+
+    Doc(Whelk whelk, Document document) {
+        this.whelk = whelk
+        this.document = document
+        this.preUpdateChecksum = document.getChecksum(whelk.getJsonld())
+        setData()
+    }
+
+    // Points instanceData/workData at the right parts of the main entity
+    void setData() {
+        if (mainEntity()['instanceOf']) {
+            instanceData = mainEntity()
+            workData = instanceData['instanceOf']
+        } else {
+            workData = mainEntity()
+        }
+    }
+
+    // Lazily created display helper for this document
+    DisplayDoc getView() {
+        if (!display) {
+            display = new DisplayDoc(this)
+        }
+
+        return display
+    }
+
+    Map record() {
+        document.data['@graph'][0]
+    }
+
+    Map mainEntity() {
+        document.data['@graph'][1]
+    }
+
+    String shortId() {
+        document.shortId
+    }
+
+    String thingIri() {
+        document.getThingIdentifiers().first()
+    }
+
+    // Encoding level of the record, '' when missing
+    String encodingLevel() {
+        return record()['encodingLevel'] ?: ''
+    }
+
+    String workIri() {
+        workData['@id']
+    }
+
+    List<Map> workTitle() {
+        asList(workData['hasTitle'])
+    }
+
+    List<Map> instanceTitle() {
+        asList(instanceData?.hasTitle)
+    }
+
+    // Flat instance title(s), computed on first use
+    List<String> flatInstanceTitle() {
+        if (!flatInstanceTitle) {
+            flatInstanceTitle = Util.getFlatTitle(instanceTitle())
+        }
+
+        return flatInstanceTitle
+    }
+
+    String workType() {
+        workData['@type']
+    }
+
+    String instanceType() {
+        instanceData?.'@type'
+    }
+
+    List<Map> translationOf() {
+        asList(workData['translationOf'])
+    }
+
+    List<Map> contribution() {
+        asList(workData['contribution'])
+    }
+
+    List<Map> classification() {
+        asList(workData['classification'])
+    }
+
+    List<Map> genreForm() {
+        asList(workData['genreForm'])
+    }
+
+    List<Map> publication() {
+        asList(instanceData?.publication)
+    }
+
+    List<Map> identifiedBy() {
+        asList(instanceData?.identifiedBy)
+    }
+
+    List<Map> extent() {
+        asList(instanceData?.extent)
+    }
+
+    List<Map> reproductionOf() {
+        asList(instanceData?.reproductionOf)
+    }
+
+    String editionStatement() {
+        instanceData?.editionStatement
+    }
+
+    String responsibilityStatement() {
+        instanceData?.responsibilityStatement
+    }
+
+    /**
+     * Number of pages parsed from the (first) label of the first extent,
+     * -1 when there is no extent/label or nothing could be parsed.
+     */
+    int numPages() {
+        def first = extent() ? extent()[0] : null
+        // The label may be a single String or a list of Strings. It is normalised
+        // with asList rather than probed with index 0, because indexing a String
+        // yields its first character instead of the whole label.
+        // NOTE(review): relies on asList(null) giving an empty list, as the other accessors here do.
+        def labels = first instanceof Map ? asList(first['label']) : []
+        return numPages(labels ? labels[0].toString() : '')
+    }
+
+    /**
+     * Parses the number of pages from an extent label.
+     *
+     * Every run of digits that is followed (possibly after commas, spaces,
+     * square brackets or further digits) by an 's' or 'p' counts as a page
+     * number; the largest one is returned. The word 'onumrerade' is removed
+     * first so that its 's'/'p'-free prefix does not interfere.
+     *
+     * @return the largest page number found, or -1 if there is none
+     */
+    // TODO: improve parsing https://metadatabyran.kb.se/beskrivning/materialtyper-arbetsfloden/tryckta-monografier/omfang-for-tryckta-monografier
+    static int numPages(String extentLabel) {
+        def l = extentLabel.replace('onumrerade', '')
+        def matcher = l =~ /(\d+)(?=[, \[\]0-9]*[sp])/
+        List<Integer> pages = []
+        while (matcher.find()) {
+            pages << Integer.parseInt(matcher.group(1))
+        }
+        pages ? pages.max() : -1
+    }
+
+    boolean hasGenericTitle() {
+        Util.hasGenericTitle(instanceTitle())
+    }
+
+    boolean isMonograph() {
+        instanceData?.issuanceType == 'Monograph'
+    }
+
+    // True for the instance type Manuscript or a manuscript genre/form link
+    boolean isManuscript() {
+        instanceType() == 'Manuscript' || [['@id': 'https://id.kb.se/term/saogf/Manuskript'], ['@id': 'https://id.kb.se/term/saogf/Handskrifter']].intersect(genreForm())
+    }
+
+    boolean isInSb17Bibliography() {
+        asList(record()['bibliography']).contains(['@id': 'https://libris.kb.se/library/SB17'])
+    }
+
+    // Heuristic: has parts, a kssb classification containing '(s)',
+    // no primary contribution, or a relationship entity with its own contribution
+    boolean isMaybeAggregate() {
+        hasPart()
+                || getView().classificationStrings().any { it.contains('kssb') && it.contains('(s)') }
+                || !contribution().any { it['@type'] == 'PrimaryContribution' }
+                || hasRelationshipWithContribution()
+    }
+
+    /**
+     * True if the work has 'hasPart', or if any instance title of type 'Title'
+     * has more than one part, or a part with more than one partName/partNumber.
+     */
+    boolean hasPart() {
+        if (workData['hasPart']) {
+            return true
+        }
+        // instanceTitle() tolerates a missing instance (work documents have no
+        // instanceData) and a missing or single-valued hasTitle
+        return instanceTitle().findAll { it['@type'] == 'Title' }.any { title ->
+            def parts = asList(title['hasPart'])
+            parts.size() > 1 || parts.any { p -> asList(p.partName).size() > 1 || asList(p.partNumber).size() > 1 }
+        }
+    }
+
+    boolean hasRelationshipWithContribution() {
+        asList(workData['relationship']).any { r ->
+            asList(r['entity']).any { e ->
+                e.containsKey('contribution')
+            }
+        }
+    }
+
+    boolean isFiction() {
+        isMarcFiction() || isSaogfFiction() || isSabFiction()
+    }
+
+    boolean isMarcFiction() {
+        genreForm().any { it['@id'] in MARC_FICTION }
+    }
+
+    boolean isMarcNotFiction() {
+        genreForm().any { it['@id'] in MARC_NOT_FICTION }
+    }
+
+    boolean isSaogfFiction() {
+        genreForm().any { whelk.relations.isImpliedBy(SAOGF_SKÖN, it['@id'] ?: '') }
+    }
+
+    // kssb classification whose code starts with H, uH, ufH or ugH
+    boolean isSabFiction() {
+        classification().any { it.inScheme.toString() =~ /kssb/ && it.code =~ /^(H|uH|ufH|ugH)/ }
+    }
+
+    boolean isNotFiction() {
+        // A lot of fiction has marc/NotFictionNotFurtherSpecified but then classification is usually empty
+        isMarcNotFiction() && (!getView().classificationStrings().isEmpty() && !isSabFiction())
+    }
+
+    boolean isText() {
+        workData['@type'] == 'Text'
+    }
+
+    // A translation that has no translator, editor or adapter among its contributions
+    boolean isAnonymousTranslation() {
+        translationOf() && !hasAnyRole([Relator.TRANSLATOR, Relator.EDITOR, Relator.ADAPTER])
+    }
+
+    // True if any contribution has a role linking to one of the given relators
+    boolean hasAnyRole(List<Relator> relators) {
+        // Built once instead of once per contribution
+        def roleLinks = relators.collect { [(ID_KEY): it.iri] }
+        contribution().any {
+            asList(it['role']).intersect(roleLinks)
+        }
+    }
+
+    boolean isDrama() {
+        isSabDrama() || isGfDrama()
+    }
+
+    boolean isSabDrama() {
+        getView().classificationStrings().any { it.contains(': Hc.02') || it.contains(': Hce.02') }
+    }
+
+    boolean isGfDrama() {
+        asList(genreForm()).any { it['@id'] in DRAMA_GF }
+    }
+
+    boolean isTactile() {
+        asList(workData['contentType']).contains(['@id': 'https://id.kb.se/term/rda/TactileText'])
+                || asList(instanceData?.carrierType).any { it['@id'] in ['https://id.kb.se/marc/Braille', 'https://id.kb.se/marc/TacMaterialType-b'] }
+    }
+
+    boolean isThesis() {
+        genreForm().any { it == ['@id': 'https://id.kb.se/marc/Thesis'] }
+    }
+
+    // True if the edition statement contains "förk" (case-insensitive)
+    boolean hasDistinguishingEdition() {
+        (instanceData?.editionStatement ?: '').toString().toLowerCase().contains("förk")
+    }
+
+    // Adds temporary, underscore-prefixed properties to the work data;
+    // removeComparisonProps() takes them out again
+    void addComparisonProps() {
+        if (hasDistinguishingEdition()) {
+            workData['_editionStatement'] = instanceData['editionStatement']
+        }
+        workData['_numPages'] = numPages()
+    }
+
+    void removeComparisonProps() {
+        workData.remove('_editionStatement')
+        workData.remove('_numPages')
+    }
+
+    // Replaces the content of the work map in place and flags the document as modified
+    void replaceWorkData(Map replacement) {
+        workData.clear()
+        workData.putAll(replacement)
+        modified = true
+    }
+
+    // Adds links to the given works (except this work itself) as closeMatch,
+    // without duplicates, and flags the document as modified
+    void addCloseMatch(List<String> workIds) {
+        def closeMatch = (asList(workData['closeMatch']) + (workIds - workIri()).collect { ['@id': it] }).unique()
+        if (closeMatch) {
+            workData['closeMatch'] = closeMatch
+            modified = true
+        }
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/FieldStatus.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/FieldStatus.groovy
new file mode 100644
index 0000000000..8f41f4f393
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/FieldStatus.groovy
@@ -0,0 +1,7 @@
+package datatool.scripts.mergeworks
+
+// Status of a field when the documents of a cluster are compared.
+// The constants are the keys of the map returned by WorkComparator.compare and,
+// via toString(), the CSS class of the corresponding rows in the Html tables.
+enum FieldStatus {
+    EQUAL,
+    COMPATIBLE,
+    DIFF
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/Html.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Html.groovy
new file mode 100644
index 0000000000..e967151d8b
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Html.groovy
@@ -0,0 +1,111 @@
+package datatool.scripts.mergeworks
+
+import org.apache.commons.codec.digest.DigestUtils
+
+import static datatool.scripts.mergeworks.FieldStatus.COMPATIBLE
+import static datatool.scripts.mergeworks.FieldStatus.DIFF
+import static datatool.scripts.mergeworks.FieldStatus.EQUAL
+
+/**
+ * Builds the HTML of the merge-works reports: page start/end markup and one
+ * comparison table per cluster of documents.
+ */
+class Html {
+    // Stylesheet read from the classpath when the class is initialised
+    private static String CSS = Html.class.getClassLoader()
+            .getResourceAsStream('merge-works/table.css').getText("UTF-8")
+
+    static final String START = """<html><head>
+                    <meta charset="UTF-8">
+                    <style>$CSS</style>
+                    </head><body>"""
+    static final String END = '</body></html>'
+    static final String HORIZONTAL_RULE = "<hr/><br/>\n"
+
+    // Rows shown with class "info" at the bottom of every cluster table
+    static def infoFields = ['reproductionOf', 'instance title', 'instance type', 'editionStatement', 'responsibilityStatement', 'encodingLevel', 'publication', 'identifiedBy', 'extent', 'physicalDetailsNote']
+
+    /**
+     * Table with one column per document of the cluster. Rows are grouped by
+     * the status WorkComparator.compare gives each field (EQUAL, COMPATIBLE,
+     * DIFF), followed by the info fields.
+     */
+    static String clusterTable(Collection<Doc> cluster) {
+        String id = clusterId(cluster.collect { it.shortId() })
+        // The first header cell is both the anchor for, and a link to, this cluster
+        String header = """
+            <tr>
+                <th><a id="${id}" href="#${id}">${id}</a></th>
+                ${cluster.collect { doc -> "<th><a id=\"${doc.shortId()}\" href=\"${doc.view.link()}\">${doc.shortId()}</a></th>" }.join('\n')}
+            </tr>
+            <tr>
+                <td></td>
+                ${cluster.collect { doc -> "<td>${doc.view.instanceDisplayTitle()}</td>" }.join('\n')}                                     
+            </tr>
+           """.stripIndent()
+
+        def statuses = WorkComparator.compare(cluster)
+
+        String info = infoFields.collect(fieldRows(cluster, "info")).join('\n')
+        // A single document has nothing to be equal to, so its rows get no class
+        String equal = statuses.get(EQUAL, []).collect(fieldRows(cluster, cluster.size() > 1 ? EQUAL.toString() : "")).join('\n')
+        String compatible = statuses.get(COMPATIBLE, []).collect(fieldRows(cluster, COMPATIBLE.toString())).join('\n')
+        String diff = statuses.get(DIFF, []).collect(fieldRows(cluster, DIFF.toString())).join('\n')
+
+        return """
+            <table>
+                ${header}
+                ${equal}
+                ${compatible}
+                ${diff}
+                ${info}
+            </table>
+            <br/><br/>
+        """
+    }
+
+    /**
+     * Table with one column per work. Rows are grouped by comparison status,
+     * followed by an "_instances" row linking each work's unlinked instances
+     * (or the work document itself when it has none).
+     */
+    static String hubTable(Collection<Doc> works) {
+        def instanceDocs = works.collect { work -> work.unlinkedInstances ?: work }
+        def clusterId = clusterId(instanceDocs.flatten().collect { Doc d -> d.shortId() })
+
+        // Works without an IRI get an empty header cell
+        String header = """
+            <tr>
+                <th><a id="${clusterId}" href="#${clusterId}">${clusterId}</a></th>
+                ${works.collect { it.workIri()
+                ? "<th><a id=\"${it.shortId()}\" href=\"${it.view.link()}\">${it.shortId()}</a></th>"
+                : "<th></th>" }
+                .join('\n')}
+            </tr>
+           """.stripIndent()
+
+        def link = { Doc d -> "<a id=\"${d.shortId()}\" href=\"${d.view.link()}\">${d.shortId()}</a>" }
+
+        String instances =
+                """
+                    <tr class="info">
+                        <td>_instances</td>
+                        ${instanceDocs.collect { "<td>${it.collect(link).join('<br>')}</td>" }.join('\n')}
+                        </tr>
+                """.stripIndent()
+
+        def statuses = WorkComparator.compare(works)
+
+        String equal = statuses.get(EQUAL, []).collect(fieldRows(works, works.size() > 1 ? EQUAL.toString() : "")).join('\n')
+        String compatible = statuses.get(COMPATIBLE, []).collect(fieldRows(works, COMPATIBLE.toString())).join('\n')
+        String diff = statuses.get(DIFF, []).collect(fieldRows(works, DIFF.toString())).join('\n')
+
+        return """
+            <table>
+                ${header}
+                ${equal}
+                ${compatible}
+                ${diff}
+                ${instances}
+            </table>
+            <br/><br/>
+        """
+    }
+
+    /**
+     * Identifier for a cluster: the first 12 characters of the upper-cased MD5
+     * hex digest of the smallest id, or "" for an empty cluster.
+     */
+    static String clusterId(Collection<String> cluster) {
+        // min() gives the same element as sort().first() without sorting
+        // (and thereby reordering) a list passed in by the caller
+        cluster
+                ? DigestUtils.md5Hex(cluster.min()).toUpperCase().substring(0, 12)
+                : ""
+    }
+
+    // Returns a closure that renders one table row for a field name:
+    // the name followed by one cell per document with its display text
+    private static def fieldRows(Collection<Doc> cluster, String cls) {
+        { field ->
+            """
+            <tr class="${cls}">
+                <td>${field}</td>
+                ${cluster.collect { "<td>${it.view.getDisplayText(field)}</td>" }.join('\n')}   
+            </tr> """.stripIndent()
+        }
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/Util.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Util.groovy
new file mode 100644
index 0000000000..a065b0e1e9
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/Util.groovy
@@ -0,0 +1,341 @@
+package datatool.scripts.mergeworks
+
+import org.apache.commons.lang3.StringUtils
+import whelk.Document
+import whelk.IdGenerator
+import whelk.Whelk
+import whelk.util.DocumentUtil
+import whelk.util.Unicode
+
+class Util {
+    // Title properties handed to DisplayDoc.flatten when flatTitles() builds a flat title
+    static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', 'marc:parallelTitle', 'marc:equalTitle']
+
+    // Title types of interest. NOTE(review): not referenced anywhere else in this class
+    static def titleVariant = ['Title', 'ParallelTitle']
+    // removed 'VariantTitle', 'CoverTitle' since they sometimes contain random generic stuff like "Alibis filmroman", "Kompisböcker för de yngsta"
+
+    /**
+     * Relator (contribution role) constants, each carrying its IRI.
+     *
+     * Note that IMPLICIT_AUTHOR carries the same IRI as AUTHOR.
+     */
+    static enum Relator {
+        TRANSLATOR('https://id.kb.se/relator/translator'),
+        AUTHOR('https://id.kb.se/relator/author'),
+        ILLUSTRATOR('https://id.kb.se/relator/illustrator'),
+        AUTHOR_OF_INTRO('https://id.kb.se/relator/authorOfIntroduction'),
+        ADAPTER('https://id.kb.se/relator/adapter'),
+        COVER_DESIGNER('https://id.kb.se/relator/coverDesigner'),
+        COMPILER('https://id.kb.se/relator/compiler'),
+        AUTHOR_OF_AFTERWORD('https://id.kb.se/relator/authorOfAfterwordColophonEtc'),
+        PHOTOGRAPHER('https://id.kb.se/relator/photographer'),
+        EDITOR('https://id.kb.se/relator/editor'),
+        UNSPECIFIED_CONTRIBUTOR('https://id.kb.se/relator/unspecifiedContributor'),
+        PRIMARY_RIGHTS_HOLDER('https://id.kb.se/relator/primaryRightsHolder'),
+        ABRIDGER('https://id.kb.se/relator/abridger'),
+        IMPLICIT_AUTHOR('https://id.kb.se/relator/author')
+
+        // IRI of the relator
+        String iri
+
+        private Relator(String iri) {
+            this.iri = iri
+        }
+    }
+
+    // Strings that normalize() replaces with a single space.
+    // This must be declared BEFORE IGNORED_SUBTITLES and GENERIC_TITLES: static initializers
+    // run in declaration order and both sets call normalize() while they are being built,
+    // so with the previous ordering normalize() saw noise == null during their initialization.
+    static def noise =
+            [",", '"', "'", "ʹ", "ʼ", '[', ']', ',', '.', '.', ':', ';', '-', '(', ')', ' the ', '-', '–', '+', '!', '?',].collectEntries { [it, ' '] }
+
+    // Normalized non-blank lines of the classpath resource merge-works/ignored-subtitles.txt
+    private static Set<String> IGNORED_SUBTITLES = WorkToolJob.class.getClassLoader()
+            .getResourceAsStream('merge-works/ignored-subtitles.txt')
+            .readLines().grep().collect(Util.&normalize) as Set
+
+    // Normalized non-blank lines of the classpath resource merge-works/generic-titles.txt
+    private static Set<String> GENERIC_TITLES = WorkToolJob.class.getClassLoader()
+            .getResourceAsStream('merge-works/generic-titles.txt')
+            .readLines().grep().collect(Util.&normalize) as Set
+
+
+    /**
+     * Wrap a value in a list unless it already is one.
+     *
+     * @param o any value; values that are false according to Groovy truth give an empty list
+     * @return o itself if it is a List, otherwise a one-element list containing o
+     */
+    static List asList(Object o) {
+        def value = o ?: []
+        return value instanceof List ? value : [value]
+    }
+
+    /**
+     * Split a collection into groups of mutually matching elements.
+     *
+     * Each element joins the first existing group in which every member matches it,
+     * otherwise it starts a new group.
+     *
+     * NOTE: O(n^2)...
+     *
+     * @param collection elements to group
+     * @param matcher two-argument closure called as matcher(groupMember, element)
+     * @return the groups, in order of creation
+     */
+    static <T> Collection<Collection<T>> partition(Collection<T> collection, Closure matcher) {
+        List<List<T>> groups = []
+
+        collection.each { T candidate ->
+            List<T> home = groups.find { List<T> members ->
+                members.every { member -> matcher(member, candidate) }
+            }
+            if (home != null) {
+                home.add(candidate)
+            } else {
+                groups.add([candidate])
+            }
+        }
+        return groups
+    }
+
+    /**
+     * Check whether an element matches every member of a group.
+     *
+     * @param t element to test
+     * @param group members to test against
+     * @param matcher two-argument closure called as matcher(member, t)
+     * @return true if no member fails to match (so also true for an empty group)
+     */
+    static <T> boolean groupMatches(T t, List<T> group, Closure matcher) {
+        return !group.any { member -> !matcher(member, t) }
+    }
+
+    /**
+     * Tell whether any of the titles has a main title that, once normalized, is listed in
+     * GENERIC_TITLES.
+     *
+     * @param hasTitle title maps
+     * @return true if at least one title has a generic main title
+     */
+    static boolean hasGenericTitle(List hasTitle) {
+        for (def title : hasTitle) {
+            def mainTitle = title['mainTitle']
+            if (mainTitle && normalize((String) mainTitle) in GENERIC_TITLES) {
+                return true
+            }
+        }
+        return false
+    }
+
+    /**
+     * Return copies of the given title maps with generic subtitles dropped.
+     *
+     * String values found under 'subtitle' or 'titleRemainder' are checked with
+     * genericSubtitle(): a value that is generic as a whole yields a DocumentUtil.Remove,
+     * a value whose part after the last ':' is generic yields a DocumentUtil.Replace with
+     * a shortened string.
+     *
+     * @param hasTitle title maps
+     * @return one TreeMap per input title
+     */
+    static List dropGenericSubTitles(List hasTitle) {
+        hasTitle.collect {
+            // Shallow copy: nested values are still shared with the input map
+            def copy = new TreeMap(it)
+            if (copy['subtitle'] || copy['titleRemainder']) {
+                DocumentUtil.traverse(copy) { value, path ->
+                    if (('subtitle' in path || 'titleRemainder' in path) && value instanceof String) {
+                        if (genericSubtitle(value)) {
+                            // The whole value is generic
+                            new DocumentUtil.Remove()
+                        } else {
+                            // Only the part after the last ':' is tested here
+                            ((List) value.split(':')).with {
+                                if (it.size() > 1 && genericSubtitle(it.last().trim())) {
+                                    // NOTE(review): the pattern matches from the FIRST ':' to the end of the
+                                    // string, so a value with several colons loses more than its last
+                                    // part -- confirm that this is intended
+                                    new DocumentUtil.Replace(value.replaceFirst(~/\s*:.+$/, ''))
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            copy
+        }
+    }
+
+    /**
+     * Reduce each title to a map holding its normalized flat title under 'flatTitle' and,
+     * when present, its '@type'. Generic subtitles are dropped first.
+     *
+     * @param hasTitle title maps
+     * @return one TreeMap per input title
+     */
+    static List flatTitles(List hasTitle) {
+        dropGenericSubTitles(hasTitle).collect { cleaned ->
+            def flat = new TreeMap<>()
+            flat['flatTitle'] = normalize(DisplayDoc.flatten(cleaned, titleComponents))
+
+            def type = cleaned['@type']
+            if (type) {
+                flat['@type'] = type
+            }
+
+            flat
+        }
+    }
+
+    /**
+     * Tell whether a subtitle, after normalization and after removing a leading "en ",
+     * is listed in IGNORED_SUBTITLES.
+     *
+     * @param s raw subtitle
+     * @return true if the subtitle is to be ignored
+     */
+    private static boolean genericSubtitle(String s) {
+        String normalized = Util.normalize(s)
+        String article = "en "
+        String candidate = normalized.startsWith(article)
+                ? normalized.substring(article.length())
+                : normalized
+        return candidate in IGNORED_SUBTITLES
+    }
+
+    /**
+     * Normalize a string for comparison: pad with spaces, lower-case, replace every
+     * entry of {@code noise} with a space, collapse whitespace, then apply
+     * Unicode.normalizeForSearch and Unicode.asciiFold.
+     *
+     * @param s string to normalize
+     * @return the normalized string
+     */
+    static String normalize(String s) {
+        String lowered = " $s ".toLowerCase()
+        String denoised = lowered.replace(noise)
+        String collapsed = StringUtils.normalizeSpace(denoised)
+        return Unicode.asciiFold(Unicode.normalizeForSearch(collapsed))
+    }
+
+    /**
+     * Follow a path of keys/indexes through nested maps and collections.
+     *
+     * @param item structure to read from
+     * @param path keys and/or indexes to follow, in order
+     * @param defaultTo value returned when a step cannot be followed
+     * @return the value at the end of the path, or defaultTo if a step hits something that
+     *         is neither a Map nor a Collection, or a null value
+     */
+    static Object getPathSafe(item, path, defaultTo = null) {
+        def current = item
+        for (step in path) {
+            boolean traversable = current instanceof Collection || current instanceof Map
+            if (!traversable || current[step] == null) {
+                return defaultTo
+            }
+            current = current[step]
+        }
+        return current
+    }
+
+    /**
+     * Flat, normalized title strings of the entries that have '@type' 'Title'.
+     *
+     * @param hasTitle title maps
+     * @return the 'flatTitle' value of each matching entry
+     */
+    static List<String> getFlatTitle(List hasTitle) {
+        def titlesOnly = flatTitles(hasTitle).grep(isTitle)
+        return titlesOnly.collect { flat -> flat['flatTitle'] }
+    }
+
+    /**
+     * Render a value as a display string.
+     *
+     * Integers are returned as they are. Other values are turned into a chip with
+     * whelk.jsonld.toChip; if the chip has fewer than two entries the original value is
+     * rendered instead. Lists are rendered one sorted entry per line, joined by '<br>'.
+     *
+     * @param thing value to render
+     * @param whelk used for its jsonld.toChip
+     * @return display string
+     */
+    static String chipString(def thing, Whelk whelk) {
+        if (thing instanceof Integer) {
+            return thing
+        }
+
+        def chipped = whelk.jsonld.toChip(thing)
+        def shown = chipped.size() < 2 ? thing : chipped
+
+        return shown instanceof List
+                ? shown.collect { valuesString(it) }.sort().join('<br>')
+                : valuesString(shown)
+    }
+
+    /**
+     * Flatten a nested value to a string: list elements and map values (except the value
+     * of '@type') are rendered recursively and joined with ' • '; anything else is
+     * rendered with toString().
+     */
+    private static String valuesString(def thing) {
+        switch (thing) {
+            case List:
+                return thing.collect { valuesString(it) }.join(' • ')
+            case Map:
+                def withoutType = thing.findAll { k, v -> k != '@type' }
+                return withoutType.values().collect { valuesString(it) }.join(' • ')
+            default:
+                return thing.toString()
+        }
+    }
+
+    // Encoding levels in the order they are tried by bestTitle() and bestOriginalTitle().
+    // The trailing null matches documents whose encodingLevel() is null.
+    // (docs on some of these levels are normally filtered out before we reach here)
+    static List bestEncodingLevel = [
+            'marc:FullLevel',
+            'marc:FullLevelMaterialNotExamined',
+            'marc:MinimalLevel',
+            'marc:LessThanFullLevelMaterialNotExamined',
+            'marc:CoreLevel',
+            'marc:AbbreviatedLevel',
+            'marc:PartialPreliminaryLevel',
+            'marc:PrepublicationLevel',
+            null
+    ]
+
+    /**
+     * Convert an instance title map to the form used on works.
+     *
+     * Part number and part name are appended to 'mainTitle' and only '@type',
+     * 'mainTitle' and 'source' are kept in the result.
+     * Note that 'mainTitle' is updated on the map that is passed in.
+     */
+    static def toWorkTitleForm = { Map title ->
+        // partName/partNumber is usually in hasPart but not always
+        def partName = title['partName']
+        def partNumber = title['partNumber']
+
+        def hasPart = title['hasPart']
+        if (hasPart) {
+            // When hasPart is present its first entry overrides both values, even with null
+            partName = hasPart[0]['partName']
+            partNumber = hasPart[0]['partNumber']
+        }
+
+        // Only the first value is used if there are several
+        partName = asList(partName)[0]
+        partNumber = asList(partNumber)[0]
+
+        if (partNumber && partName) {
+            title['mainTitle'] += ". $partNumber, $partName"
+        } else if (partNumber) {
+            title['mainTitle'] += ". $partNumber"
+        } else if (partName) {
+            title['mainTitle'] += ". $partName"
+        }
+
+        return title.subMap(['@type', 'mainTitle', 'source'])
+    }
+
+    /**
+     * Return the most common title for the best encodingLevel.
+     *
+     * Order of preference: the title of an already linked work, the most common work title
+     * on the best encoding level that has one, the most common instance title (converted
+     * with toWorkTitleForm) on the best encoding level that has one.
+     *
+     * @param docs documents to pick a title from
+     * @return the chosen title(s), or null if none was found
+     */
+    static def bestTitle(Collection<Doc> docs) {
+        // TODO: which title to pick when matched with already existing linked work?
+        def fromLinkedWork = docs.findResult { it.workIri() ? it.workData['hasTitle'] : null }
+        if (fromLinkedWork) {
+            return fromLinkedWork
+        }
+
+        def docsOn = { level -> docs.findAll { it.encodingLevel() == level } }
+
+        def fromWorks = bestEncodingLevel.findResult { level ->
+            mostCommonWorkTitle(docsOn(level)) ?: null
+        }
+        if (fromWorks) {
+            return fromWorks
+        }
+
+        def fromInstances = bestEncodingLevel.findResult { level ->
+            mostCommonInstanceTitle(docsOn(level)) ?: null
+        }
+        return fromInstances ? fromInstances.collect(toWorkTitleForm) : null
+    }
+
+    /**
+     * Return the most common original (translationOf) title on the best encoding level
+     * that has one.
+     *
+     * @param docs documents to pick a title from
+     * @return the chosen title(s), or null if none was found
+     */
+    static def bestOriginalTitle(Collection<Doc> docs) {
+        return bestEncodingLevel.findResult { level ->
+            def onLevel = docs.findAll { it.encodingLevel() == level }
+            mostCommonOriginalTitle(onLevel) ?: null
+        }
+    }
+
+    /**
+     * Most common title among the documents' translationOf entries, looking at the first
+     * 'hasTitle' found for each document and keeping only entries of type 'Title'.
+     */
+    static def mostCommonOriginalTitle(Collection<Doc> docs) {
+        Closure originalTitles = { Doc d ->
+            d.translationOf().findResult { it['hasTitle'] }?.findAll(isTitle)
+        }
+        return mostCommonWorkTitle(docs, originalTitles)
+    }
+
+    /**
+     * Most common title among the documents, after dropping generic subtitles.
+     *
+     * @param docs documents to look at
+     * @param getTitle closure that extracts the titles from a document; defaults to the
+     *                 work titles of type 'Title'
+     * @return the most common title(s), or null when no document has any
+     */
+    static def mostCommonWorkTitle(Collection<Doc> docs, Closure getTitle = { it.workTitle().findAll(isTitle) }) {
+        def candidates = docs.collect(getTitle)
+                .grep()
+                .collect { dropGenericSubTitles(it) }
+
+        return candidates ? mostCommon(candidates) : null
+    }
+
+    /**
+     * Most common instance title among the documents, after dropping generic subtitles.
+     * Each returned title gets a 'source' entry holding the '@id' of the instance of a
+     * document that has that title.
+     *
+     * @param docs documents to look at
+     * @return the most common title(s) with 'source' added, or null when no document has any
+     */
+    static def mostCommonInstanceTitle(Collection<Doc> docs) {
+        def titlesPerDoc = docs.collect { it.instanceTitle().findAll(isTitle) }
+                .collect { dropGenericSubTitles(it) }
+
+        def nonEmpty = titlesPerDoc.grep()
+        if (!nonEmpty) {
+            return null
+        }
+
+        // Later documents win when several documents have equal titles
+        def docByTitles = [titlesPerDoc, docs].transpose().collectEntries()
+        def winner = mostCommon(titlesPerDoc.grep())
+        def sourceDoc = docByTitles[winner]
+
+        return winner.collect { it.plus(['source': [sourceDoc.instanceData.subMap('@id')]]) }
+    }
+
+    /**
+     * Return a representative of the largest group of equal elements.
+     *
+     * @param titles non-empty collection of values to compare with ==
+     * @return the first element of the largest group; among equally large groups the one
+     *         that comes last after sorting by size is used
+     */
+    static def mostCommon(titles) {
+        def groupsBySize = partition(titles, { a, b -> a == b }).sort { it.size() }
+        def largest = groupsBySize.reverse().first()
+        return largest.first()
+    }
+
+    // Predicate: true for entries whose '@type' is exactly 'Title'
+    static def isTitle = { it.'@type' == 'Title' }
+
+    /**
+     * Tell whether a local agent (or plain name) matches an agent or one of its variants.
+     *
+     * @param local agent map, or a name
+     * @param agent agent to match against; its 'hasVariant' entries are considered as well
+     * @return true if the normalized names are non-empty and equal
+     */
+    static boolean nameMatch(Object local, Map agent) {
+        def candidates = [agent] + asList(agent.hasVariant)
+
+        def localName = local instanceof Map ? name(local) : normalize(local)
+        if (!localName) {
+            return false
+        }
+
+        return candidates.any { candidate ->
+            name(candidate) && localName == name(candidate)
+        }
+    }
+
+    /**
+     * Normalized name of an agent.
+     *
+     * @param agent agent map
+     * @return "givenName familyName" normalized when both are present, otherwise the
+     *         normalized 'name', otherwise null
+     */
+    static String name(Map agent) {
+        if (agent.givenName && agent.familyName) {
+            return normalize("${agent.givenName} ${agent.familyName}")
+        }
+        if (agent.name) {
+            return normalize("${agent.name}")
+        }
+        return null
+    }
+
+    /**
+     * Wrap work data in a new document with a freshly generated id.
+     *
+     * The record gets a technical note pointing at the report for the new work.
+     * Note that '@id' is set on the workData map that is passed in.
+     *
+     * @param workData the work (main entity) data
+     * @param reportDir report directory, used to build the report URI
+     * @return the new document
+     */
+    static Document buildWorkDocument(Map workData, File reportDir) {
+        String workId = IdGenerator.generate()
+        def reportUri = "http://xlbuild.libris.kb.se/works/${reportDir.getPath()}/new/${workId}.html"
+
+        def technicalNote = [
+                "@type"  : "TechnicalNote",
+                "hasNote": [[
+                                    "@type": "Note",
+                                    "label": ["Maskinellt utbrutet verk... TODO"]
+                            ]],
+                "uri"    : [reportUri]
+        ]
+        def record = [
+                "@id"          : "TEMPID",
+                "@type"        : "Record",
+                "mainEntity"   : ["@id": "TEMPID#it"],
+                "technicalNote": [technicalNote]
+        ]
+
+        workData['@id'] = "TEMPID#it"
+        Document d = new Document(["@graph": [record, workData]])
+
+        // Swap the temporary id for the real one
+        d.deepReplaceId(Document.BASE_URI.toString() + workId)
+        return d
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkComparator.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkComparator.groovy
new file mode 100644
index 0000000000..64aacb9c21
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkComparator.groovy
@@ -0,0 +1,137 @@
+package datatool.scripts.mergeworks
+
+import datatool.scripts.mergeworks.compare.Classification
+import datatool.scripts.mergeworks.compare.Id
+import datatool.scripts.mergeworks.compare.SameOrEmpty
+import datatool.scripts.mergeworks.compare.Default
+import datatool.scripts.mergeworks.compare.Extent
+import datatool.scripts.mergeworks.compare.FieldHandler
+import datatool.scripts.mergeworks.compare.GenreForm
+import datatool.scripts.mergeworks.compare.StuffSet
+import datatool.scripts.mergeworks.compare.Subject
+import datatool.scripts.mergeworks.compare.TranslationOf
+import datatool.scripts.mergeworks.compare.ValuePicker
+import datatool.scripts.mergeworks.compare.WorkTitle
+import datatool.util.DocumentComparator
+
+import static datatool.scripts.mergeworks.Util.bestTitle
+
+/**
+ * Compares and merges the work data of documents field by field.
+ *
+ * Each field is handled by the FieldHandler registered for it in {@code comparators},
+ * or by {@code DEFAULT} when there is none.
+ */
+class WorkComparator {
+    // Names of the work data fields this comparator looks at
+    Set<String> fields
+    DocumentComparator c = new DocumentComparator()
+
+    // Field specific handlers; fields without an entry use DEFAULT
+    Map<String, FieldHandler> comparators = [
+            'classification'  : new Classification(),
+            'contentType'     : new SameOrEmpty('https://id.kb.se/term/rda/Text'),
+            'genreForm'       : new GenreForm(),
+            'hasTitle'        : new WorkTitle(),
+            'intendedAudience': new SameOrEmpty('https://id.kb.se/marc/Juvenile'),
+            '_numPages'       : new Extent(),
+            'subject'         : new Subject(),
+            'summary'         : new StuffSet(),
+            'translationOf'   : new TranslationOf(),
+            '@id'             : new Id()
+    ]
+
+    // Fields left out by allFields()
+    static Set<String> ignore = ['closeMatch']
+
+    static FieldHandler DEFAULT = new Default()
+
+    /**
+     * @param fields names of the fields to compare and merge; copied
+     */
+    WorkComparator(Set<String> fields) {
+        this.fields = new HashSet<>(fields)
+    }
+
+    /**
+     * @return true if every field is EQUAL or COMPATIBLE between the two documents
+     */
+    boolean sameWork(Doc a, Doc b) {
+        // The constants are qualified explicitly, as everywhere else in this class. They are
+        // not imported, so unqualified use only resolved through a with-closure delegate.
+        fields.every { String field ->
+            FieldStatus status = compare(a, b, field)
+            status == FieldStatus.EQUAL || status == FieldStatus.COMPATIBLE
+        }
+    }
+
+    /**
+     * Compare one field of two documents.
+     *
+     * @return EQUAL if both values are missing or exactly equal, otherwise COMPATIBLE or
+     *         DIFF as decided by the field's handler
+     */
+    FieldStatus compare(Doc a, Doc b, String field) {
+        Object oa = a.workData.get(field)
+        Object ob = b.workData.get(field)
+
+        if (oa == null && ob == null) {
+            return FieldStatus.EQUAL
+        }
+
+        compareExact(oa, ob, field) == FieldStatus.EQUAL
+                ? FieldStatus.EQUAL
+                : compareDiff(a, b, field)
+    }
+
+    /**
+     * Merge the work data of the given documents into one map.
+     *
+     * A handler that is a ValuePicker picks the value from all documents at once, other
+     * handlers merge the values pairwise. Fields with an empty result are left out.
+     * If no 'hasTitle' results from this, Util.bestTitle is used to pick one.
+     */
+    Map merge(Collection<Doc> docs) {
+        Map result = [:]
+
+        fields.each { field ->
+            FieldHandler h = comparators.getOrDefault(field, DEFAULT)
+            def value = h instanceof ValuePicker
+                    ? h.pick(docs)
+                    : mergeField(field, h, docs)
+
+            if (value) {
+                result[field] = value
+            }
+        }
+
+        if (!result['hasTitle']) {
+            // Named differently from the statically imported bestTitle() it is assigned from
+            def title = bestTitle(docs)
+            if (title) {
+                result['hasTitle'] = title
+            }
+        }
+
+        return result
+    }
+
+    // TODO: preserve order? e.g. subject
+    /**
+     * Fold the field's values of all documents into one using the handler's merge(),
+     * starting from the value of the first document.
+     */
+    private Object mergeField(String field, FieldHandler h, Collection<Doc> docs) {
+        Object value = docs.first().workData.get(field)
+        def rest = docs.drop(1)
+        rest.each {
+            value = h.merge(value, it.workData.get(field))
+        }
+        return value
+    }
+
+    /** COMPATIBLE if the field's handler accepts the two values as compatible, else DIFF. */
+    private FieldStatus compareDiff(Doc a, Doc b, String field) {
+        comparators.getOrDefault(field, DEFAULT).isCompatible(a.workData.get(field), b.workData.get(field))
+                ? FieldStatus.COMPATIBLE
+                : FieldStatus.DIFF
+    }
+
+    /** EQUAL if the DocumentComparator considers the two values equal, else DIFF. */
+    private FieldStatus compareExact(Object oa, Object ob, String field) {
+        c.isEqual([(field): oa], [(field): ob]) ? FieldStatus.EQUAL : FieldStatus.DIFF
+    }
+
+    /**
+     * Compare all fields (see allFields) of a cluster.
+     *
+     * @return the field names grouped by their status within the cluster
+     */
+    static Map<FieldStatus, List<String>> compare(Collection<Doc> cluster) {
+        WorkComparator c = new WorkComparator(allFields(cluster))
+
+        Map<FieldStatus, List<String>> result = [:]
+        c.fieldStatuses(cluster).each { f, s -> result.get(s, []) << f }
+        return result
+    }
+
+    /**
+     * @return the union of the work data keys of all documents, minus {@code ignore}
+     */
+    static Set<String> allFields(Collection<Doc> cluster) {
+        Set<String> fields = new HashSet<>()
+        cluster.each { fields.addAll(it.workData.keySet()) }
+        return fields - ignore
+    }
+
+    /**
+     * @return the status within the cluster for each of this comparator's fields
+     */
+    Map<String, FieldStatus> fieldStatuses(Collection<Doc> cluster) {
+        fields.collectEntries { [it, fieldStatus(cluster, it)] }
+    }
+
+    /**
+     * Status of one field within a cluster, based on all pairs of documents.
+     *
+     * @return DIFF as soon as one pair differs, otherwise COMPATIBLE if at least one pair
+     *         was only compatible, otherwise EQUAL
+     */
+    FieldStatus fieldStatus(Collection<Doc> cluster, String field) {
+        boolean anyCompat = false
+        [cluster, cluster].combinations().findResult { List combination ->
+            Doc a = combination.first()
+            Doc b = combination.last()
+
+            // Named 'status' so that it does not shadow the DocumentComparator field 'c'
+            def status = compare(a, b, field)
+            if (status == FieldStatus.COMPATIBLE) {
+                anyCompat = true
+            }
+            status == FieldStatus.DIFF ? status : null
+        } ?: (anyCompat ? FieldStatus.COMPATIBLE : FieldStatus.EQUAL)
+    }
+
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkToolJob.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkToolJob.groovy
new file mode 100644
index 0000000000..8498fbcae8
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkToolJob.groovy
@@ -0,0 +1,415 @@
+package datatool.scripts.mergeworks
+
+
+import whelk.IdGenerator
+import whelk.Whelk
+import whelk.exception.WhelkRuntimeException
+import whelk.meta.WhelkConstants
+import whelk.util.LegacyIntegrationTools
+import whelk.util.Statistics
+
+import java.text.SimpleDateFormat
+import java.util.concurrent.ExecutorService
+import java.util.concurrent.LinkedBlockingDeque
+import java.util.concurrent.ThreadFactory
+import java.util.concurrent.ThreadPoolExecutor
+import java.util.concurrent.TimeUnit
+import java.util.concurrent.atomic.AtomicInteger
+import java.util.function.Function
+
+import static datatool.scripts.mergeworks.Util.buildWorkDocument
+import static datatool.scripts.mergeworks.Util.getPathSafe
+import static datatool.scripts.mergeworks.Util.partition
+
+class WorkToolJob {
+    Whelk whelk
+    Statistics statistics
+    // File with one cluster per line: record ids separated by tabs and/or spaces (see run())
+    File clusters
+
+    // Timestamp taken when the job object is created; part of the report directory name
+    String date = new SimpleDateFormat('yyyyMMdd-HHmmss').format(new Date())
+    String jobId = IdGenerator.generate()
+    File reportDir = new File("reports/merged-works/$date")
+
+    // Values passed on to whelk when documents are stored (see store())
+    String changedIn = "xl"
+    String changedBy = "SEK"
+    String generationProcess = 'https://libris.kb.se/sys/merge-works'
+
+    // When true, merge() writes reports only and stores nothing
+    boolean dryRun = true
+    boolean skipIndex = false
+    // Updates are stored as minor updates unless loud is set (see store())
+    boolean loud = false
+    boolean verbose = false
+    // Size of the worker thread pool; see createExecutorService() for the default
+    int numThreads = -1
+
+    /**
+     * Whether a work is new or an update of a stored one. The status string is used as
+     * report sub directory name and in statistics.
+     */
+    private enum WorkStatus {
+        NEW('new'),
+        UPDATED('updated')
+
+        String status
+
+        private WorkStatus(String status) {
+            this.status = status
+        }
+    }
+
+    /**
+     * @param clusters file with one cluster of record ids per line
+     */
+    WorkToolJob(File clusters) {
+        this.clusters = clusters
+
+        whelk = Whelk.createLoadedSearchWhelk('secret', true)
+        statistics = new Statistics()
+    }
+
+    /**
+     * Predicate selecting the documents considered good enough for merging: text
+     * monographs that are not manuscripts, possible aggregates, tactile, drama, theses or
+     * in the Sb17 bibliography, and whose encoding level is neither partial/preliminary
+     * nor prepublication.
+     */
+    public static Closure qualityMonographs = { Doc doc ->
+        if (!doc.isText() || !doc.isMonograph()) {
+            return false
+        }
+        if (doc.isManuscript() || doc.isMaybeAggregate()) {
+            return false
+        }
+        if (doc.encodingLevel() == 'marc:PartialPreliminaryLevel' || doc.encodingLevel() == 'marc:PrepublicationLevel') {
+            return false
+        }
+        return !doc.isTactile() && !doc.isDrama() && !doc.isThesis() && !doc.isInSb17Bibliography()
+    }
+
+    /**
+     * Print an HTML page with one comparison table per cluster that has more than one
+     * member, based on the last unlinked version of each record.
+     */
+    void show() {
+        println(Html.START)
+        run({ ids ->
+            return {
+                try {
+                    if (ids.size() <= 1) {
+                        return
+                    }
+
+                    Collection<Doc> docs = loadLastUnlinkedVersion(ids)
+                            .each { it.addComparisonProps() }
+                            .sort { a, b -> a.workType() <=> b.workType() }
+                            .sort { it.numPages() }
+
+                    println(Html.clusterTable(docs) + Html.HORIZONTAL_RULE)
+                }
+                catch (NoWorkException e) {
+                    System.err.println(e.getMessage())
+                }
+                catch (Exception e) {
+                    System.err.println(e.getMessage())
+                    e.printStackTrace(System.err)
+                }
+            }
+        })
+        println(Html.END)
+    }
+
+    /**
+     * Print an HTML page showing, for each cluster, the works that would be created
+     * (those not yet in storage) together with the instances they are derived from.
+     */
+    void showWorks() {
+        println(Html.START)
+        run({ ids ->
+            return {
+                try {
+                    def newWorks = uniqueWorks(loadLastUnlinkedVersion(ids)).findAll { !it.existsInStorage }
+                    if (newWorks) {
+                        def workWithInstances = newWorks.collect { [it] + it.unlinkedInstances }
+                        def tables = workWithInstances.collect { Html.clusterTable(it) }
+                        println(tables.join('') + Html.HORIZONTAL_RULE)
+                    }
+                }
+                catch (Exception e) {
+                    System.err.println(e.getMessage())
+                    e.printStackTrace(System.err)
+                }
+            }
+        })
+        println(Html.END)
+    }
+
+    /**
+     * Print an HTML page with a hub table for each cluster that results in more than
+     * one work.
+     */
+    void showHubs() {
+        println(Html.START)
+        run({ ids ->
+            return {
+                try {
+                    def works = uniqueWorks(loadLastUnlinkedVersion(ids))
+                    if (works.size() > 1) {
+                        println(Html.hubTable(works) + Html.HORIZONTAL_RULE)
+                    }
+                }
+                catch (Exception e) {
+                    System.err.println(e.getMessage())
+                    e.printStackTrace(System.err)
+                }
+            }
+        })
+        println(Html.END)
+    }
+
+    /**
+     * Derive works from every cluster, write HTML reports and, unless dryRun is set,
+     * store the works and link their instances to them.
+     *
+     * A report is written for each created or updated work, and finally one report that
+     * lists the clusters that resulted in more than one work.
+     */
+    void merge() {
+        def s = statistics.printOnShutdown()
+        def multiWorkClusters = Collections.synchronizedList([])
+
+        // Create the report directories once, before any work is done. This used to happen
+        // inside every cluster task (redundantly, on all worker threads) and did not happen
+        // at all for an empty cluster file, in which case the multi-work report below had
+        // no directory to be written to.
+        WorkStatus.values().each {
+            new File(reportDir, it.status).mkdirs()
+        }
+
+        run({ cluster ->
+            return {
+                def docs = loadDocs(cluster)
+                def works = uniqueWorks(docs)
+                // Works that have instances to link, i.e. new or changed works
+                def createdOrUpdated = works.findAll { it.unlinkedInstances }
+
+                writeSingleWorkReport(docs, createdOrUpdated, s)
+
+                if (works.size() > 1) {
+                    multiWorkClusters.add(works)
+                }
+
+                if (!dryRun) {
+                    def linkableWorkIris = works.findResults { it.workIri() }
+                    works.each { doc ->
+                        doc.addCloseMatch(linkableWorkIris)
+                        store(doc)
+                        // Replace the local work of each instance with a link to the work
+                        doc.unlinkedInstances?.each {
+                            it.replaceWorkData(['@id': doc.thingIri()])
+                            store(it)
+                        }
+                    }
+                }
+            }
+        })
+
+        writeMultiWorkReport(multiWorkClusters)
+    }
+
+    /**
+     * Store a document: created if it is not in storage yet, updated if it is and has
+     * been modified, otherwise left alone.
+     *
+     * @param doc document to store
+     * @throws WhelkRuntimeException if a new document could not be created
+     */
+    void store(Doc doc) {
+        whelk.setSkipIndex(skipIndex)
+        doc.document.setGenerationDate(new Date())
+        doc.document.setGenerationProcess(generationProcess)
+
+        if (doc.existsInStorage) {
+            if (doc.modified) {
+                // NOTE(review): generationProcess is passed in the position where the create
+                // path below passes changedBy -- confirm that this is intended
+                whelk.storeAtomicUpdate(doc.document, !loud, false, changedIn, generationProcess, doc.preUpdateChecksum)
+            }
+            return
+        }
+
+        def created = whelk.createDocument(doc.document, changedIn, changedBy,
+                LegacyIntegrationTools.determineLegacyCollection(doc.document, whelk.getJsonld()), false)
+        if (!created) {
+            throw new WhelkRuntimeException("Could not store new work: ${doc.shortId()}")
+        }
+    }
+
+    /**
+     * Write the HTML report of a cluster once for each derived work, in the sub directory
+     * matching the work's status, and count the work's instances in the statistics.
+     *
+     * @param titleClusters the documents of the cluster
+     * @param derivedWorks the works created or updated from the cluster
+     * @param s statistics to update
+     */
+    void writeSingleWorkReport(Collection<Doc> titleClusters, Collection<Doc> derivedWorks, Statistics s) {
+        String html = htmlReport(titleClusters, derivedWorks)
+        for (Doc work : derivedWorks) {
+            WorkStatus workStatus = work.existsInStorage ? WorkStatus.UPDATED : WorkStatus.NEW
+            def status = workStatus.status
+            new File(reportDir, "$status/${work.shortId()}.html") << html
+            s.increment("num derivedFrom ($status works)", "${work.unlinkedInstances.size()}", work.shortId())
+        }
+    }
+
+    /**
+     * Append an HTML page with one hub table per cluster to multi-work-clusters.html in
+     * the report directory.
+     *
+     * @param workClusters the works of each cluster that resulted in more than one work
+     */
+    void writeMultiWorkReport(Collection<Collection<Doc>> workClusters) {
+        File out = new File(reportDir, "multi-work-clusters.html")
+        out.append(Html.START)
+        for (works in workClusters) {
+            out.append(Html.hubTable(works) + Html.HORIZONTAL_RULE)
+        }
+        out.append(Html.END)
+    }
+
+    /**
+     * Build an HTML page showing a title cluster followed by the works extracted from it.
+     *
+     * Comparison properties are added to the cluster's documents for the first table and
+     * removed again afterwards.
+     *
+     * @param titleCluster the documents of the cluster
+     * @param works the works derived from the cluster
+     * @return the page markup
+     */
+    String htmlReport(Collection<Doc> titleCluster, Collection<Doc> works) {
+        StringBuilder page = new StringBuilder()
+
+        page.append(Html.START)
+
+        page.append("<h1>Title cluster</h1>")
+        titleCluster.each { it.addComparisonProps() }
+        titleCluster.sort { a, b -> a.workType() <=> b.workType() }
+        titleCluster.sort { it.numPages() }
+        page.append(Html.clusterTable(titleCluster))
+        page.append(Html.HORIZONTAL_RULE)
+
+        for (doc in titleCluster) {
+            doc.removeComparisonProps()
+        }
+
+        page.append("<h1>Extracted works</h1>")
+        def workWithInstances = works.collect { [it] + it.unlinkedInstances }
+        for (table in workWithInstances) {
+            page.append(Html.clusterTable(table))
+        }
+
+        page.append(Html.END)
+
+        return page.toString()
+    }
+
+    /**
+     * Derive the distinct works of a title cluster.
+     *
+     * The documents are partitioned into groups that WorkComparator.sameWork considers
+     * the same work. Within a group, documents with instanceData are treated as local
+     * (unlinked) works and the others as linked works:
+     * - no linked work, one local: the local document is returned as it is
+     * - no linked work, several local: a new work is built from the merged data
+     * - one linked work: it is returned, with merged data if there are local ones too
+     * - several linked works: the group is skipped and a message is printed to stderr
+     *
+     * @param titleCluster documents of the cluster
+     * @return the works found
+     */
+    private Collection<Doc> uniqueWorks(Collection<Doc> titleCluster) {
+        def works = []
+
+        prepareForCompare(titleCluster)
+
+        WorkComparator c = new WorkComparator(WorkComparator.allFields(titleCluster))
+
+        // The comparison properties are only needed while partitioning
+        def workClusters = partition(titleCluster, { Doc a, Doc b -> c.sameWork(a, b) })
+                .each { work -> work.each { doc -> doc.removeComparisonProps() } }
+
+        workClusters.each { Collection<Doc> wc ->
+            // split() puts the documents for which the closure is true first
+            def (local, linked) = wc.split { it.instanceData }
+            if (!linked) {
+                if (local.size() == 1) {
+                    works.add(local.first())
+                } else {
+                    def newWork = new Doc(whelk, buildWorkDocument(c.merge(local), reportDir)).tap {
+                        it.existsInStorage = false
+                        it.unlinkedInstances = local
+                    }
+                    works.add(newWork)
+                }
+            } else if (linked.size() == 1) {
+                def existingWork = linked.first().tap { Doc d ->
+                    if (local) {
+                        d.replaceWorkData(c.merge(linked + local))
+                        d.unlinkedInstances = local
+                    }
+                }
+                works.add(existingWork)
+            } else {
+                System.err.println("Local works ${local.collect { it.shortId() }} match multiple linked works: ${linked.collect { it.shortId() }}. Duplicate linked works?")
+            }
+        }
+
+        return works
+    }
+
+    /**
+     * Print (tab separated ids, one cluster per line) the clusters reduced to linked works
+     * plus local quality monographs whose only language is Swedish, if more than one
+     * document remains, at least one is fiction and none is marked as not fiction.
+     */
+    void swedishFiction() {
+        def onlySwedish = { Doc doc ->
+            def languageIds = Util.asList(doc.workData['language']).collect { it['@id'] }
+            languageIds == ['https://id.kb.se/language/swe']
+        }
+
+        run({ ids ->
+            return {
+                def kept = loadDocs(ids).split { it.instanceData }
+                        .with { local, linked ->
+                            linked + local.findAll(qualityMonographs).findAll(onlySwedish)
+                        }
+
+                if (kept.size() > 1 && kept.any { Doc d -> d.isFiction() } && !kept.any { Doc d -> d.isNotFiction() }) {
+                    println(kept.collect { Doc d -> d.shortId() }.join('\t'))
+                }
+            }
+        })
+    }
+
+    /**
+     * Print each cluster reduced to the documents accepted by the predicate, as tab
+     * separated ids, if more than one document remains.
+     *
+     * @param predicate decides which documents to keep
+     */
+    void filterClusters(Closure<Doc> predicate) {
+        run({ ids ->
+            return {
+                def kept = loadDocs(ids).findAll(predicate)
+                if (kept.size() > 1) {
+                    println(kept.collect { doc -> doc.shortId() }.join('\t'))
+                }
+            }
+        })
+    }
+
+    /**
+     * Split every cluster into title clusters and print those with more than one member
+     * as tab separated ids, one title cluster per line.
+     */
+    void outputTitleClusters() {
+        run({ ids ->
+            return {
+                for (group in titleClusters(loadDocs(ids))) {
+                    if (group.size() > 1) {
+                        println(group.collect { doc -> doc.shortId() }.join('\t'))
+                    }
+                }
+            }
+        })
+    }
+
+    /**
+     * Run a task for every line of the clusters file on the thread pool and wait (at most
+     * one day) for all tasks to finish.
+     *
+     * Every 100th finished task the count is printed to stderr. NoWorkException is
+     * ignored, for other exceptions the stack trace is printed.
+     *
+     * @param f creates the task for a cluster, given the ids on its line
+     */
+    private void run(Function<List<String>, Runnable> f) {
+        ExecutorService pool = createExecutorService()
+
+        AtomicInteger finished = new AtomicInteger()
+        clusters.eachLine { String line ->
+            List<String> ids = Arrays.asList(line.split(/[\t ]+/))
+
+            pool.submit({
+                try {
+                    f.apply(ids).run()
+                    int n = finished.incrementAndGet()
+                    if (n % 100 == 0) {
+                        System.err.println("$n")
+                    }
+                }
+                catch (NoWorkException e) {
+                    //println("No work:" + e.getMessage())
+                }
+                catch (Exception e) {
+                    e.printStackTrace()
+                }
+            })
+        }
+
+        pool.shutdown()
+        pool.awaitTermination(1, TimeUnit.DAYS)
+    }
+
+    /**
+     * Create the thread pool used by run().
+     *
+     * The pool has a fixed size of numThreads, or defaultNumThreads() when numThreads is
+     * not positive. Its queue is bounded to 1.5 times the pool size; when the queue is
+     * full the submitting thread runs the task itself (CallerRunsPolicy), which throttles
+     * the reading of the clusters file. Threads are created in the batch thread group.
+     *
+     * @return the executor service
+     */
+    private def createExecutorService() {
+        // Any positive value is an explicit choice. The previous check (numThreads > 1)
+        // silently replaced numThreads == 1 with the multi-threaded default.
+        int poolSize = numThreads >= 1 ? numThreads : defaultNumThreads()
+        def linkedBlockingDeque = new LinkedBlockingDeque<Runnable>((int) (poolSize * 1.5))
+
+        def executorService = new ThreadPoolExecutor(poolSize, poolSize,
+                1, TimeUnit.DAYS,
+                linkedBlockingDeque, new ThreadPoolExecutor.CallerRunsPolicy())
+
+        executorService.setThreadFactory(new ThreadFactory() {
+            ThreadGroup group = new ThreadGroup(WhelkConstants.BATCH_THREAD_GROUP)
+
+            @Override
+            Thread newThread(Runnable runnable) {
+                return new Thread(group, runnable)
+            }
+        })
+
+        return executorService
+    }
+
+    /** Default pool size: four threads per available processor. */
+    private static int defaultNumThreads() {
+        int processors = Runtime.getRuntime().availableProcessors()
+        return processors * 4
+    }
+
+    /**
+     * Bulk load the documents with the given ids and wrap each one in a Doc.
+     */
+    private Collection<Doc> loadDocs(Collection<String> cluster) {
+        def loaded = whelk.bulkLoad(cluster).values()
+        return loaded.collect { document -> new Doc(whelk, document) }
+    }
+
+    /**
+     * For each id, load the most recent stored version that has no value at its
+     * workIdPath. Ids without such a version are left out.
+     */
+    private Collection<Doc> loadLastUnlinkedVersion(Collection<String> cluster) {
+        cluster.findResults { id ->
+            def newestFirst = whelk.storage.loadAllVersions(id).reverse()
+            newestFirst
+                    .find { version -> getPathSafe(version.data, version.workIdPath) == null }
+                    ?.with { new Doc(whelk, it) }
+        }
+    }
+
+    /**
+     * Replace the documents that link to a work with the linked works themselves, each
+     * loaded once, followed by the documents that have no work IRI.
+     */
+    def loadUniqueLinkedWorks = { Collection<Doc> docs ->
+        def workIris = docs.findResults { it.workIri() }.unique()
+        def linkedWorks = workIris.collect { iri -> new Doc(whelk, whelk.storage.getDocumentByIri(iri)) }
+        def unlinked = docs.findAll { !it.workIri() }
+        linkedWorks + unlinked
+    }
+
+    /**
+     * Split documents into clusters sharing a title.
+     *
+     * Clusters containing a document with a generic title are dropped, linked works are
+     * loaded in place of the documents linking to them, clusters with fewer than two
+     * members are dropped and the rest is sorted by the display title of the first member.
+     */
+    private Collection<Collection<Doc>> titleClusters(Collection<Doc> docs) {
+        def withoutGeneric = partitionByTitle(docs)
+                .findAll { group -> !group.any { doc -> doc.hasGenericTitle() } }
+        def withLinkedWorks = withoutGeneric.collect(loadUniqueLinkedWorks)
+        def multiple = withLinkedWorks.findAll { group -> group.size() > 1 }
+
+        return multiple.sort { a, b -> a.first().view.instanceDisplayTitle() <=> b.first().view.instanceDisplayTitle() }
+    }
+
+    /**
+     * Partition documents so that all members of a group share at least one flat
+     * instance title with each other.
+     */
+    Collection<Collection<Doc>> partitionByTitle(Collection<Doc> docs) {
+        def shareTitle = { Doc a, Doc b ->
+            def common = a.flatInstanceTitle().intersect(b.flatInstanceTitle())
+            !common.isEmpty()
+        }
+        return partition(docs, shareTitle)
+    }
+
+    /**
+     * Add comparison properties to the documents that have instanceData and sort the
+     * documents by number of pages.
+     */
+    private Collection<Doc> prepareForCompare(Collection<Doc> docs) {
+        for (doc in docs) {
+            if (doc.instanceData) {
+                doc.addComparisonProps()
+            }
+        }
+        return docs.sort { it.numPages() }
+    }
+}
+
+/**
+ * Runtime exception that WorkToolJob treats separately from other errors: show() prints
+ * only its message and run() ignores it.
+ */
+class NoWorkException extends RuntimeException {
+    NoWorkException(String msg) {
+        super(msg)
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Classification.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Classification.groovy
new file mode 100644
index 0000000000..bc2d85a0e9
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Classification.groovy
@@ -0,0 +1,85 @@
+package datatool.scripts.mergeworks.compare
+
+class Classification extends StuffSet { // Merges classification lists; compatible SAB (kssb) and Dewey entries are combined into one
+    // Terms that will be merged (values precede keys): when a key meets one of its (transitive) values, the value is kept
+    private static def norm = [
+            'uHc'                                                        : ['Hc,u'],
+            'uHce'                                                       : ['Hce,u'],
+            'Hc'                                                         : ['Hc.01', 'Hc.02', 'Hc.03'],
+            'Hc,u'                                                       : ['Hcf', 'Hcg']
+    ]
+
+    @Override
+    Object merge(Object a, Object b) { // Union of a and b without code-less entries, then pairwise merge; the closure returns null for incompatible pairs
+        return mergeCompatibleElements(super.merge(a, b).findAll { it['code'] }) { c1, c2 ->
+            String code1 = c1['code']
+            String code2 = c2['code']
+            if (!code1 || !code2) {
+                return
+            }
+            code1 = code1.replaceAll(/\s+/, "") // codes are compared with all whitespace removed
+            code2 = code2.replaceAll(/\s+/, "")
+
+            if (isSab(c1) && isSab(c2)) {
+                def code = code1 == code2 || n(code2, code1)
+                        ? code1
+                        : (n(code1, code2) ? code2 : null)
+                if (code) {
+                    def result = [
+                            '@type' : 'Classification',
+                            'code'  : code, // the code selected above, which is code2 when n(code1, code2) holds
+                            inScheme: [
+                                    '@type': 'ConceptScheme',
+                                    'code' : 'kssb'
+                            ]
+                    ]
+                    def version = maxSabVersion(c1, c2)
+                    if (version && version != "-1") { // "-1" is maxSabVersion's placeholder for "neither side has a version"
+                        result['inScheme']['version'] = version
+                    }
+                    return result
+                }
+            } else if (isDewey(c1) && isDewey(c2)) {
+                def code = code1.startsWith(code2.replace("/", "")) // keep the code that extends the other one (slashes in the prefix ignored)
+                        ? code1
+                        : (code2.startsWith(code1.replace("/", "")) ? code2 : null)
+                if (code) {
+                    Map result = [:]
+                    result.putAll(c1)
+                    result.putAll(c2) // on conflicting keys c2 wins; code and editionEnumeration are set explicitly below
+                    result['code'] = code
+                    result['editionEnumeration'] = maxDeweyEdition(c1, c2)
+                    return result
+                }
+            }
+        }
+    }
+
+    boolean isSab(Map c) { // true when the entry's inScheme has code 'kssb'
+        c['inScheme'] && c['inScheme']['code'] == 'kssb'
+    }
+
+    String maxSabVersion(c1, c2) { // the numerically larger inScheme version; "-1" when both are missing
+        def v1 = c1['inScheme']['version'] ?: "-1"
+        def v2 = c2['inScheme']['version'] ?: "-1"
+        Integer.parseInt(v1) > Integer.parseInt(v2) ? v1 : v2
+    }
+
+    boolean isDewey(Map c) { // true for entries typed ClassificationDdc
+        c['@type'] == 'ClassificationDdc'
+    }
+
+    String maxDeweyEdition(c1, c2) { // the editionEnumeration whose digits form the larger number; c2's on a tie
+        def v1 = c1['editionEnumeration']
+        def v2 = c2['editionEnumeration']
+        deweyEdition(v1) > deweyEdition(v2) ? v1 : v2
+    }
+
+    int deweyEdition(String edition) { // digits of the edition string as an int; 0 when missing or digit-free
+        Integer.parseInt((edition ?: "").replaceAll("[^0-9]", "") ?: "0") // fall back after stripping so a digit-free edition cannot reach parseInt as ""
+    }
+
+    boolean n(a, b) { // true when b is reachable from a by following norm's value lists
+        norm[a]?.any { it == b || n(it, b) }
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Default.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Default.groovy
new file mode 100644
index 0000000000..dfacdb001e
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Default.groovy
@@ -0,0 +1,13 @@
+package datatool.scripts.mergeworks.compare
+
+class Default implements FieldHandler { // Handler that never reports two values as compatible
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        return false
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        return a // keeps the first value and ignores b
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Extent.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Extent.groovy
new file mode 100644
index 0000000000..2390e77f5a
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Extent.groovy
@@ -0,0 +1,15 @@
+package datatool.scripts.mergeworks.compare;
+
+class Extent implements FieldHandler { // Handler for extent values; currently accepts every pair
+
+    // TODO: allow one side missing extent (-1)?
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        return true // always compatible; disabled range check kept for reference: a * 0.7 < b && a * 1.3 > b
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        return b; // not part of final work
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/FieldHandler.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/FieldHandler.groovy
new file mode 100644
index 0000000000..17f440bc12
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/FieldHandler.groovy
@@ -0,0 +1,12 @@
+package datatool.scripts.mergeworks.compare
+
+import datatool.scripts.mergeworks.Doc
+
+interface FieldHandler { // Strategy for comparing and combining two values of one field
+    boolean isCompatible(Object a, Object b) // whether a and b may be merged
+    Object merge(Object a, Object b) // the combined value of a and b
+}
+
+interface ValuePicker extends FieldHandler { // FieldHandler that can also choose one value from a collection of docs
+    Object pick(Collection<Doc> values) // the value selected from the given docs
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/GenreForm.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/GenreForm.groovy
new file mode 100644
index 0000000000..5efe34df33
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/GenreForm.groovy
@@ -0,0 +1,47 @@
+package datatool.scripts.mergeworks.compare
+
+import datatool.util.DocumentComparator
+
+//FIXME
+class GenreForm extends StuffSet { // Genre/form lists: compatible only when both or neither are marked "Lättläst"; merged by union plus norm
+    private static final DocumentComparator c = new DocumentComparator()
+
+    // Terms that will be merged (values precede keys): when a key meets one of its (transitive) values, the value is kept
+    private static def norm = [
+            (['@id': 'https://id.kb.se/marc/NotFictionNotFurtherSpecified']): [
+                    ['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified'],
+                    ['@id': 'https://id.kb.se/marc/Autobiography'],
+                    ['@id': 'https://id.kb.se/marc/Biography']
+            ],
+            (['@id': 'https://id.kb.se/marc/FictionNotFurtherSpecified'])   : [
+                    ['@id': 'https://id.kb.se/marc/Poetry'],
+                    ['@id': 'https://id.kb.se/marc/Novel']
+            ],
+    ]
+
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        def lattLast = { // matches the two "Lättläst" term IRIs or an entry labelled 'Lättläst'
+            it['@id'] == 'https://id.kb.se/term/saogf/L%C3%A4ttl%C3%A4st'
+                    || it['@id'] == 'https://id.kb.se/term/barngf/L%C3%A4ttl%C3%A4sta%20b%C3%B6cker'
+                    || it['prefLabel'] == 'Lättläst'
+        }
+
+        a.any(lattLast) == b.any(lattLast) // test every element on both sides; findResult stopped at b's first element because the closure never returns null
+    }
+
+    @Override
+    Object merge(Object a, Object b) { // Union of a and b, then norm-related pairs collapse into the value term
+        return mergeCompatibleElements(super.merge(a, b)) { gf1, gf2 ->
+            if (n(gf1, gf2)) {
+                gf2
+            } else if (n(gf2, gf1)) {
+                gf1
+            } // unrelated pairs fall through, so the closure yields null
+        }
+    }
+
+    boolean n(a, b) { // true when b is reachable from a by following norm's value lists
+        norm[a]?.any { it == b || n(it, b) }
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Id.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Id.groovy
new file mode 100644
index 0000000000..fc3305148b
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Id.groovy
@@ -0,0 +1,22 @@
+package datatool.scripts.mergeworks.compare
+
+import datatool.scripts.mergeworks.Doc
+import org.apache.commons.lang3.NotImplementedException
+
+class Id implements ValuePicker { // Picks a work IRI from the docs; pairwise merge is not supported
+
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        return true
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        throw new NotImplementedException('') // this handler only offers pick()
+    }
+
+    @Override
+    Object pick(Collection<Doc> values) {
+        return values.findResult { it.workIri() } // first non-null work IRI, or null when no doc has one
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/SameOrEmpty.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/SameOrEmpty.groovy
new file mode 100644
index 0000000000..3fcd988d93
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/SameOrEmpty.groovy
@@ -0,0 +1,21 @@
+package datatool.scripts.mergeworks.compare
+
+import static datatool.scripts.mergeworks.Util.asList
+
+class SameOrEmpty implements FieldHandler { // Compatible only when one side is empty and the other equals the configured link
+    Object link // single-element list [['@id': iri]] built by the constructor
+
+    SameOrEmpty(String iri) {
+        this.link = [['@id': iri]]
+    }
+
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        (!a && asList(b) == link) || (!b && asList(a) == link) // NOTE(review): two equal non-empty values are reported incompatible here — confirm callers test equality first
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        return a ?: b // the first truthy side
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/StuffSet.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/StuffSet.groovy
new file mode 100644
index 0000000000..ecf119de9f
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/StuffSet.groovy
@@ -0,0 +1,38 @@
+package datatool.scripts.mergeworks.compare
+
+
+import java.util.function.BiFunction
+
+import static datatool.scripts.mergeworks.Util.asList
+
+class StuffSet implements FieldHandler { // Treats values as sets: always compatible, merged by union
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        true
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        return ((asList(a) as Set) + (asList(b) as Set)).collect() // union without duplicates, returned as a list
+    }
+
+    static Object mergeCompatibleElements(Object o, BiFunction<Object, Object, Object> s) { // Combines elements pairwise with s until no pair merges
+        boolean changed = false
+        List result = []
+        asList(o).each {
+            def merged = null
+            for (int i = 0 ; i < result.size() ; i++) {
+                merged = s.apply(result[i], it) // s yields the merged element, or a falsy value for an incompatible pair
+                if (merged) {
+                    result[i] = merged
+                    changed = true
+                    break
+                }
+            }
+            if (!merged) { // same truthiness test as above, so a falsy non-null result from s cannot silently drop the element
+                result << it
+            }
+        }
+        return changed ? mergeCompatibleElements(result, s) : result // a merged element may match others, so repeat until stable
+    }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Subject.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Subject.groovy
new file mode 100644
index 0000000000..0434d32d98
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Subject.groovy
@@ -0,0 +1,8 @@
+package datatool.scripts.mergeworks.compare
+
+class Subject extends StuffSet { // Subject handler; adds nothing to StuffSet's behaviour
+    @Override
+    Object merge(Object a, Object b) {
+        return super.merge(a, b) // plain delegation to StuffSet.merge
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/TranslationOf.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/TranslationOf.groovy
new file mode 100644
index 0000000000..7bd26ebe7d
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/TranslationOf.groovy
@@ -0,0 +1,46 @@
+package datatool.scripts.mergeworks.compare
+
+import datatool.scripts.mergeworks.Doc
+import datatool.scripts.mergeworks.Util
+import datatool.util.DocumentComparator
+import org.apache.commons.lang3.NotImplementedException
+
+class TranslationOf implements ValuePicker { // Compares translationOf values ignoring @type, and picks one for a group of docs
+    DocumentComparator c = new DocumentComparator()
+
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        // @type is sometimes Work, sometimes Text. Should not matter for comparison
+        // We assume that there are never more than one object in translationOf
+        a = Util.asList(a)[0]
+        b = Util.asList(b)[0]
+        (!a && !b) || (a && b && c.isEqual(noTypeNoTitle(a), noTypeNoTitle(b)) && noTitleOrSameTitle(a, b)) // both empty, or equal apart from @type/hasTitle with non-conflicting titles
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        throw new NotImplementedException('') // this handler only offers pick()
+    }
+
+    @Override
+    Object pick(Collection<Doc> values) {
+        // TODO: which title to pick when matched with already existing linked work?
+        def translationOf = values.first().workData['translationOf'] // taken from the first doc and modified in place below
+        def title = Util.bestOriginalTitle(values)
+        if (title) {
+            Util.asList(translationOf)[0]['hasTitle'] = title // NOTE(review): presumably fails if the first doc has no translationOf — confirm callers guarantee one
+        }
+
+        return translationOf
+    }
+
+    Map noTypeNoTitle(Map m) { // copy of m without the '@type' and 'hasTitle' entries
+        m.findAll { k, v -> !(k in ['@type', 'hasTitle']) }
+    }
+
+    boolean noTitleOrSameTitle(Map a, Map b) { // true when either side lacks hasTitle or their flat titles overlap
+        !a['hasTitle']
+                || !b['hasTitle']
+                || !Util.getFlatTitle(a['hasTitle']).intersect(Util.getFlatTitle(b['hasTitle'])).isEmpty()
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/WorkTitle.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/WorkTitle.groovy
new file mode 100644
index 0000000000..b1608b64aa
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/WorkTitle.groovy
@@ -0,0 +1,23 @@
+package datatool.scripts.mergeworks.compare
+
+import datatool.scripts.mergeworks.Doc
+import datatool.scripts.mergeworks.Util
+import org.apache.commons.lang3.NotImplementedException
+
+class WorkTitle implements ValuePicker { // Work titles are compatible when one is missing or their flat titles overlap
+
+    @Override
+    boolean isCompatible(Object a, Object b) {
+        return !a || !b || !Util.getFlatTitle(a).intersect(Util.getFlatTitle(b)).isEmpty()
+    }
+
+    @Override
+    Object merge(Object a, Object b) {
+        throw new NotImplementedException('') // this handler only offers pick()
+    }
+    
+    @Override
+    Object pick(Collection<Doc> values) {
+        return Util.bestTitle(values) // selection logic lives in Util.bestTitle
+    }
+}
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contribution.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contribution.groovy
new file mode 100644
index 0000000000..7d6aeb2966
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contribution.groovy
@@ -0,0 +1,613 @@
+package datatool.scripts.mergeworks.normalize
+
+import groovy.transform.Memoized
+import org.apache.commons.lang3.StringUtils
+import whelk.Document
+import whelk.JsonLd
+
+import java.util.concurrent.ConcurrentHashMap
+import java.util.regex.Pattern
+
+import groovy.json.JsonBuilder
+import groovy.json.JsonSlurper
+
+import static datatool.scripts.mergeworks.Util.asList
+import static datatool.scripts.mergeworks.Util.name
+import static datatool.scripts.mergeworks.Util.normalize
+import static datatool.scripts.mergeworks.Util.Relator
+import static whelk.JsonLd.ID_KEY
+import static whelk.JsonLd.looksLikeIri
+
+/**
+ Example:
+ $ ENV=qa && time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters="reports/clusters.tsv" -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --dry-run src/main/groovy/datatool/scripts/mergeworks/normalize/fetch-contribution-from-respStatement.groovy
+ */
+
+linkedFoundInCluster = getReportWriter("linked-agent-found-in-cluster.tsv") // one TSV report per kind of change; each println below writes that report's header row
+linkedFoundInCluster.println(['id', 'matched agent', 'agent occurs in (examples)'].join('\t'))
+
+roleAddedFromRespStatement = getReportWriter("role-added-from-respStatement.tsv")
+roleAddedFromRespStatement.println(['id', 'agent name', 'added roles', 'resp statement'].join('\t'))
+
+respStatementLinkedAgentFoundInCluster = getReportWriter("respStatement-linked-agent-found-in-cluster.tsv")
+respStatementLinkedAgentFoundInCluster.println(['id', 'agent name', 'matched agent', 'resp statement roles', 'agent occurs in (examples)', 'resp statement'].join('\t'))
+
+respStatementLocalAgentFoundInCluster = getReportWriter("respStatement-local-agent-found-in-cluster.tsv")
+respStatementLocalAgentFoundInCluster.println(['id', 'agent name', 'resp statement roles', 'agent occurs in (examples)', 'resp statement'].join('\t'))
+
+unmatchedContributionsInRespStatement = getReportWriter("unmatched-contributions-in-resp-statement.tsv")
+unmatchedContributionsInRespStatement.println(['id', 'agent name', 'roles', 'resp statement'].join('\t'))
+
+roleFoundInCluster = getReportWriter("role-found-in-cluster.tsv")
+roleFoundInCluster.println(['id', 'agent', 'added role', 'agent occurs with role in (examples)'].join('\t'))
+
+titleMovedToTranslationOf = getReportWriter("title-moved-to-translationOf.tsv") // the only report written without a header row
+
+originalWorkFoundInCluster = getReportWriter("original-work-found-in-cluster.tsv")
+originalWorkFoundInCluster.println(['id', 'added translationOf', 'translationOf occurs in (examples)'].join('\t'))
+
+def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } } // file named by -Dclusters: one cluster per line, ids separated by tabs
+
+idToCluster = initIdToCluster(clusters) // id -> the other ids in its cluster
+nameToAgents = new ConcurrentHashMap<String, ConcurrentHashMap>() // agent name -> set of agent keys (toString(agent))
+agentToRolesToIds = new ConcurrentHashMap<String, ConcurrentHashMap<Map, ConcurrentHashMap>>() // agent key -> role -> ids of records where the agent has that role
+agentToLifeSpan = new ConcurrentHashMap<String, String>() // agent key -> lifeSpan(...) of the loaded agent; filled for linked agents only
+idToTranslationOf = new ConcurrentHashMap<String, Object>() // record id -> its work's translationOf
+
+// Populate maps: first pass over every clustered record, collecting agents, their names and roles, and translationOf
+selectByIds(clusters.flatten()) { bib ->
+    def id = bib.doc.shortId
+    def work = bib.graph[1].instanceOf
+
+    if (!work || work[ID_KEY]) return // only records with a local (unlinked) work are considered
+
+    work.contribution?.each { Map c ->
+        asList(c.agent).each { Map agent ->
+            def agentStr = toString(agent) // key used for this agent in all lookup maps
+            def loadedAgent = loadIfLink(agent)
+            if (agent.containsKey('@id')) {
+                agentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent))
+            }
+            ([loadedAgent] + asList(loadedAgent.hasVariant)).each { a -> // index the agent under its own name and each hasVariant name
+                String agentName = name(a)
+                if (agentName) {
+                    nameToAgents.computeIfAbsent(agentName, f -> new ConcurrentHashMap().newKeySet()).add(agentStr)
+                }
+            }
+            def roleToIds = agentToRolesToIds.computeIfAbsent(agentStr, f -> new ConcurrentHashMap())
+            asList(c.role).with {
+                if (it.isEmpty()) {
+                    roleToIds.computeIfAbsent(([:]), f -> new ConcurrentHashMap().newKeySet()).add(id) // an empty map stands for "no role"
+                } else {
+                    it.each { r ->
+                        roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id)
+                    }
+                }
+            }
+        }
+    }
+
+    if (work['translationOf']) {
+        idToTranslationOf[id] = work['translationOf']
+    }
+}
+
+agentToNames = initAgentToNames(nameToAgents) // reverse index: agent key -> set of names
+
+selectByIds(clusters.flatten()) { bib -> // second pass: normalise each record's contributions using the maps built above
+    Map thing = bib.graph[1]
+    def id = bib.doc.shortId
+
+    def respStatement = thing.responsibilityStatement
+    def work = thing.instanceOf
+
+    if (!work || work[ID_KEY]) return // only records with a local (unlinked) work are processed
+
+    def contribution = work.contribution
+
+    if (!contribution) return
+
+    // extract names + roles from responsibilityStatement
+    // normalize the names for comparison but also save the original strings for later use
+    def normalizedNameToName = [:]
+    def contributionsInRespStatement = parseRespStatement(respStatement).collectEntries { name, roles ->
+        def normalizedName = normalize(name)
+        normalizedNameToName[normalizedName] = name
+        [normalizedName, roles]
+    }
+
+    // remove useless contributions
+    def modified = contribution.removeAll { !it.agent }
+
+    contribution.each { Map c ->
+        // match local agent against linked ones in same cluster
+        modified |= tryLinkAgent(c, id)
+        // if there are more roles stated in responsibilityStatement other than the existing ones in this contribution, add those
+        modified |= tryAddRolesFromRespStatement(c, contributionsInRespStatement, respStatement, id)
+    }
+
+    // drop "implicit authors", e.g. Astrid Lindgren in "Astrid Lindgren ; illustrerad av Ilon Wikland" (likely to already exist)
+    contributionsInRespStatement.removeAll { _, roles -> roles == [Relator.IMPLICIT_AUTHOR] }
+
+    // agents in responsibilityStatement that are not in contribution? match against linked agents in same cluster
+    modified |= tryAddLinkedAgentContributionsFromRespStatement(contribution, contributionsInRespStatement, respStatement, id)
+
+    // drop unmatched agents that are likely to already exist (agent with same initials exists or contribution with same role exists)
+    def existingNames = contribution.findResults { agentToNames[toString(asList(it.agent).find())] }.flatten()
+    contributionsInRespStatement.removeAll { String name, List<Relator> roles ->
+        existingNames.any { similarName(it, name) }
+                || roles.collect { [(ID_KEY): it.iri] }.intersect(contribution.collect { it.role }.flatten())
+    }
+
+    // match remaining against local agents in same cluster
+    modified |= tryAddLocalAgentContributionsFromRespStatement(contribution, contributionsInRespStatement, respStatement, id)
+    // if still no match, add constructed local Contribution with agent + roles extracted from responsibilityStatement
+    modified |= addRemainingContributionsFromRespStatement(contribution, contributionsInRespStatement, normalizedNameToName, respStatement, id)
+
+    contribution.each { Map c ->
+        // add roles from contributions in same cluster with matching agent
+        modified |= tryAddRole(c, id)
+    }
+
+    // works with translators should have translationOf, add if missing
+    modified |= tryAddMissingTranslationOf(work, contribution, id)
+
+    if (modified) { // save only when at least one step changed the record
+        bib.scheduleSave()
+    }
+}
+
+def initIdToCluster(List<List<String>> clusters) { // Builds id -> set of the other ids in the same cluster
+    def idToCluster = [:]
+    clusters.each { cluster ->
+        cluster.each { id ->
+            idToCluster[id] = cluster as Set - id // the id itself is excluded; an id in several clusters keeps only the last one seen
+        }
+    }
+    return idToCluster
+}
+
+static Map<Object, String> initAgentToNames(Map<String, List<Object>> nameToAgents) { // Inverts name -> agents into agent -> set of names
+    def agentToNames = [:]
+    nameToAgents.each { name, agents ->
+        agents.each {
+            agentToNames.computeIfAbsent(it, f -> [] as Set).add(name) // values are Sets of names, despite the declared Map<Object, String>
+        }
+    }
+    return agentToNames
+}
+
+boolean tryLinkAgent(Map contribution, String id) { // Replaces a local agent by a link when a same-named linked agent occurs in the record's cluster; true if changed
+    def modified = false
+
+    asList(contribution.agent).each { Map agent ->
+        if (!agent.containsKey(ID_KEY)) {
+            // get agent name variants
+            def names = agentToNames[toString(agent)]
+            if (!names) return
+            // get linked agents with matching name
+            def matchingLinkedAgents = nameToAgents.subMap(names).values().flatten().toSet().findAll { a ->
+                JsonLd.looksLikeIri(a) && !yearMismatch(lifeSpan(agent), agentToLifeSpan[a])
+            }
+            for (agentIri in matchingLinkedAgents) {
+                // roles that the linked agent appears as and in which records respectively
+                Map roleToIds = agentToRolesToIds[agentIri]
+                // records in same cluster where the linked agent appears
+                def inClusterWithAgent = roleToIds.findResults { _, ids -> idToCluster[id].intersect(ids) }.flatten() as Set
+                if (inClusterWithAgent) {
+                    // matching linked agent appears in same cluster -> add link
+                    agent.clear()
+                    agent[ID_KEY] = agentIri
+                    // report
+                    def examples = inClusterWithAgent.take(3)
+                    def currentRoles = asList(contribution.role).findResults { roleShort(it[ID_KEY]) }.sort()
+                    linkedFoundInCluster.println([id, idShort(agentIri), examples].join('\t'))
+                    incrementStats('linked agent found in cluster', currentRoles) // NOTE(review): the other incrementStats calls in this script also pass id — confirm the omission is intended
+                    // add this id to "records that the agent appears in" for each role
+                    asList(contribution.role).each { r ->
+                        roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id)
+                    }
+                    return modified = true // leaves only this agent's closure call; any remaining agents are still visited
+                }
+            }
+        }
+    }
+
+    return modified
+}
+
+boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespStatement, String respStatement, String id) { // Adds roles stated for this agent in the responsibilityStatement; true if changed
+    if (contributionsInRespStatement.isEmpty()) return false
+
+    String agent = toString(asList(contribution.agent).find()) // only the contribution's first agent is considered
+
+    // any matching agent (name) in responsibilityStatement?
+    def matching = contributionsInRespStatement.subMap(agentToNames[agent] ?: [])
+    if (!matching) return false
+
+    // matched and will be handled, remove
+    matching.each { name, _ -> contributionsInRespStatement.remove(name) }
+
+    def firstMatch = matching.find() // when several name variants matched, only the first entry's roles are used
+    String name = firstMatch.key
+    List<Relator> rolesInRespStatement = firstMatch.value
+
+    Map roleToIds = agentToRolesToIds[agent]
+    if (!roleToIds) return false
+
+    def currentRoles = asList(contribution.role)
+    def isPrimaryContribution = contribution['@type'] == 'PrimaryContribution' // the type lives under '@type'; checking ID_KEY ('@id') could never match
+    // author role needs to be explicitly stated in responsibilityStatement to be added to "regular" Contribution
+    def rolesOfInterest = rolesInRespStatement.findResults { Relator relator ->
+        relator == Relator.IMPLICIT_AUTHOR && !isPrimaryContribution
+                ? null
+                : [(ID_KEY): relator.iri]
+    }
+    def newRoles = rolesOfInterest - currentRoles
+    if (newRoles) {
+        // add new roles (replace existing unspecifiedContributor)
+        contribution['role'] = noRole(currentRoles) ? newRoles : currentRoles + newRoles
+        // report
+        def newRolesShort = newRoles.findResults { roleShort(it[ID_KEY]) }
+        roleAddedFromRespStatement.println([id, name, newRolesShort, respStatement].join('\t'))
+        incrementStats("roles added from responsibilityStatement", newRolesShort.sort(), id)
+        // add this id to "records that the agent appears in" for each added role
+        newRoles.each { r ->
+            roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id)
+        }
+        return true
+    }
+
+    return false
+}
+
+boolean tryAddLinkedAgentContributionsFromRespStatement(List<Map> contribution, Map contributionsInRespStatement, String respStatement, String id) { // Adds contributions for respStatement names matching a linked agent in the cluster
+    if (contributionsInRespStatement.isEmpty()) return false
+
+    return contributionsInRespStatement.removeAll { String name, List<Relator> roles -> // handled entries are removed; result is true if any was removed
+        // get agents with matching name
+        def agents = nameToAgents[name]
+        if (!agents) return false
+
+        // get only linked agents
+        def linkedAgents = agents.findAll { looksLikeIri(it) }
+
+        for (agentIri in linkedAgents) {
+            Map roleToIds = agentToRolesToIds[agentIri]
+            def inClusterWithAgent = roleToIds.findResults { _, ids -> idToCluster[id].intersect(ids) }.flatten() as Set // cluster records where this agent occurs in any role
+            if (inClusterWithAgent) {
+                def newContribution =
+                        [
+                                '@type': 'Contribution',
+                                'agent': [(ID_KEY): agentIri]
+                        ]
+
+                roles = roles.collect { r -> [(ID_KEY): r.iri] } // from here on roles holds link maps instead of Relator values
+
+                if (roles) {
+                    newContribution['role'] = roles
+                }
+
+                if (!contribution.contains(newContribution)) { // skip only when an identical contribution already exists
+                    contribution.add(newContribution)
+                }
+
+                def rolesShort = roles.collect { r -> roleShort(r[ID_KEY]) }.sort()
+                def examples = inClusterWithAgent.take(3)
+                respStatementLinkedAgentFoundInCluster.println([id, name, idShort(agentIri), rolesShort, examples, respStatement].join('\t'))
+                incrementStats('linked agents from respStatement (found in cluster)', rolesShort, id)
+
+                roles.each { r ->
+                    roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id)
+                }
+
+                return true
+            }
+        }
+
+        return false
+    }
+}
+
+boolean tryAddLocalAgentContributionsFromRespStatement(List<Map> contribution, Map contributionsInRespStatement, String respStatement, String id) { // Adds contributions for respStatement names matching a local agent in the cluster
+    if (contributionsInRespStatement.isEmpty()) return false
+
+    return contributionsInRespStatement.removeAll { String name, List<Relator> roles -> // handled entries are removed; result is true if any was removed
+        def agents = nameToAgents[name]
+        if (!agents) return false
+
+        def localAgents = agents.findAll { !looksLikeIri(it) } // agent keys that are not IRIs, i.e. unlinked agents
+
+        for (localAgent in localAgents) {
+            Map roleToIds = agentToRolesToIds[localAgent]
+            def inClusterWithAgent = roleToIds.findResults { _, ids -> idToCluster[id].intersect(ids) }.flatten() as Set // cluster records where this agent occurs in any role
+            if (inClusterWithAgent) {
+                def newContribution =
+                        [
+                                '@type': 'Contribution',
+                                'agent': toMap(localAgent)
+                        ]
+
+                roles = roles.collect { r -> [(ID_KEY): r.iri] } // from here on roles holds link maps instead of Relator values
+
+                if (roles) {
+                    newContribution['role'] = roles
+                }
+
+                contribution.add(newContribution) // added without the duplicate check that the linked-agent variant performs
+
+                def rolesShort = roles.collect { r -> roleShort(r[ID_KEY]) }
+                def examples = inClusterWithAgent.take(3)
+                respStatementLocalAgentFoundInCluster.println([id, name, rolesShort, examples, respStatement].join('\t'))
+                incrementStats('local agents from respStatement (found in cluster)', rolesShort, id)
+
+                roles.each { r ->
+                    roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id)
+                }
+
+                return true
+            }
+        }
+
+        return false
+    }
+}
+
+boolean addRemainingContributionsFromRespStatement(List<Map> contribution, Map contributionsInRespStatement, Map normalizedNames, String respStatement, String id) { // Creates local Person contributions for leftover translators/editors
+    if (contributionsInRespStatement.isEmpty()) return false
+
+    return contributionsInRespStatement.removeAll { name, roles ->
+        def translatorEditor = roles.findResults { r -> r == Relator.TRANSLATOR || r == Relator.EDITOR ? [(ID_KEY): r.iri] : null } // only these two roles are kept
+
+        if (translatorEditor) {
+            def newContribution =
+                    [
+                            '@type': 'Contribution',
+                            'agent': ['@type': 'Person', 'name': normalizedNames[name]], // normalizedNames maps the normalized key back to the original spelling
+                            'role' : translatorEditor
+                    ]
+
+            contribution.add(newContribution)
+
+            def rolesShort = translatorEditor.collect { roleShort(it[ID_KEY]) }.sort()
+            unmatchedContributionsInRespStatement.println([id, normalizedNames[name], rolesShort, respStatement].join('\t'))
+            incrementStats('unmatched agents in respStatement', rolesShort, id)
+
+            def roleToIds = agentToRolesToIds.computeIfAbsent(toString(newContribution.agent), f -> new ConcurrentHashMap())
+            translatorEditor.each { r ->
+                roleToIds.computeIfAbsent(r, f -> new ConcurrentHashMap().newKeySet()).add(id)
+            }
+
+            return true
+        } // entries with neither role yield null here, so they stay in the map
+    }
+}
+
+
+boolean tryAddRole(Map contribution, String id) { // Adds roles the same agent has in other records of the cluster, subject to the conditions below; true if changed
+    def agent = asList(contribution.agent).find() // only the contribution's first agent is considered
+    def agentStr = toString(agent)
+
+    Map roleToIds = agentToRolesToIds[agentStr]
+    if (!roleToIds) return false
+
+    def adapterEditor = [Relator.EDITOR, Relator.ADAPTER].collect { [(ID_KEY): it.iri] }
+
+    def currentRoles = asList(contribution.role)
+    // find roles in cluster that can be added (certain conditions need to be met)
+    def rolesInCluster = roleToIds.findAll { r, ids ->
+        def inCluster = idToCluster[id]
+        def inClusterWithRole = ids.intersect(idToCluster[id])
+        return inClusterWithRole
+                && !noRole([r])
+                && (inClusterWithRole.size() >= inCluster.size() // the agent has this role in every other record of the cluster
+                || noRole(currentRoles)
+                || r == [(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri]
+                || (r in adapterEditor && currentRoles.intersect(adapterEditor))) // editor/adapter may be added when the other one is already present
+    }.collect { it.key }
+
+    def newRoles = rolesInCluster - currentRoles
+    if (newRoles) {
+        contribution['role'] = noRole(currentRoles) ? newRoles : currentRoles + newRoles // a "no role" placeholder is replaced rather than extended
+        newRoles.each { r ->
+            def shortRole = roleShort(r[ID_KEY])
+            def examples = roleToIds[r].intersect(idToCluster[id]).take(3)
+            def agentShort = agent[ID_KEY] ? idShort(agentStr) : agentToNames[agentStr]?.getAt(0)
+            roleFoundInCluster.println([id, agentShort, shortRole, examples].join('\t'))
+            incrementStats('role found in cluster', shortRole, id)
+            roleToIds[r].add(id) // record that this id now carries the role too
+        }
+        return true
+    }
+
+    return false
+}
+
+/**
+ * Tries to add a missing translationOf to a work that has at least one translator contribution.
+ *
+ * If the work has a title (hasTitle), the title is moved into a new local translationOf work.
+ * Otherwise the translationOf registered (in idToTranslationOf) for other ids with the same
+ * translator in idToCluster[id] is used, picking the most common one if there are several.
+ *
+ * @param work the work to modify in place
+ * @param contribution the contributions of the work
+ * @param id id of the record that the work belongs to
+ * @return true if translationOf was added, otherwise false
+ */
+boolean tryAddMissingTranslationOf(Map work, List<Map> contribution, String id) {
+    def trl = [(ID_KEY): Relator.TRANSLATOR.iri]
+    // String keys (see toString) of the first agent of every contribution with the translator role
+    def translators = contribution.findResults { asList(it.role).contains(trl) ? toString(asList(it.agent).find()) : null }
+
+    if (!translators || work['translationOf']) return false
+
+    def title = work.remove('hasTitle')
+    if (title) {
+        // the title should be in translationOf, construct a new local work and put the title there
+        work['translationOf'] = ['@type': 'Work', 'hasTitle': title]
+        incrementStats('add missing translationOf', "title moved to new translationOf", id)
+        titleMovedToTranslationOf.println([id, work['translationOf']].join('\t'))
+        return true
+    }
+
+    for (String translator : translators) {
+        // Skip agents that are not registered, or not registered as translator, instead of
+        // failing with a null dereference
+        def idsWithTranslator = agentToRolesToIds[translator]?.get(trl)
+        if (!idsWithTranslator) continue
+
+        def inClusterSameTranslator = idsWithTranslator.intersect(idToCluster[id])
+        def origWorks = inClusterSameTranslator.findResults { idToTranslationOf[it] }
+
+        if (origWorks) {
+            // translationOf found on other work in cluster with matching translator, add to this work (pick the most common if several)
+            work['translationOf'] = origWorks.countBy { it }.max { it.value }?.key
+            def examples = inClusterSameTranslator.findAll { idToTranslationOf.containsKey(it) }.take(3)
+            incrementStats('add missing translationOf', 'original work found in cluster (same translator)', id)
+            originalWorkFoundInCluster.println([id, work['translationOf'], examples].join('\t'))
+            return true
+        }
+    }
+
+    return false
+}
+
+/**
+ * Tells whether a list of roles carries no real role: it is empty, holds a single
+ * empty map, or holds only the "unspecified contributor" relator.
+ */
+boolean noRole(List<Map> roles) {
+    if (roles.isEmpty()) {
+        return true
+    }
+    if (roles == [[:]]) {
+        return true
+    }
+    return roles == [[(ID_KEY): Relator.UNSPECIFIED_CONTRIBUTOR.iri]]
+}
+
+/**
+ * Returns the loaded thing when the map has a truthy ID_KEY value, otherwise the map itself.
+ */
+private Map loadIfLink(Map m) {
+    if (m[ID_KEY]) {
+        return loadThing(m[ID_KEY])
+    }
+    return m
+}
+
+/**
+ * Loads the second graph entry (graph[1]) of the document with the given id.
+ * Returns an empty map if the select callback is never invoked. Results are memoized.
+ */
+@Memoized
+private Map loadThing(def id) {
+    def loaded = [:]
+    selectByIds([id]) { doc -> loaded = doc.graph[1] }
+    return loaded
+}
+
+/**
+ * Parses a responsibility statement into a map from name to the roles found for that name.
+ *
+ * The statement is split on ';' and each whitespace-normalized part is parsed separately;
+ * only the first part is flagged as such to the parser. Names without any whitespace
+ * are dropped from the result.
+ *
+ * @param respStatement the statement to parse, may be null or empty
+ * @return name -> roles, empty if nothing could be parsed
+ */
+Map<String, List<Relator>> parseRespStatement(String respStatement) {
+    if (!respStatement) {
+        return [:]
+    }
+
+    def rolesByName = [:]
+    def parts = respStatement.split(';')
+    for (int idx = 0; idx < parts.length; idx++) {
+        def normalizedPart = StringUtils.normalizeSpace(parts[idx])
+        def parsed = parseSwedishFictionContribution(normalizedPart, idx == 0)
+        parsed.each { name, roles ->
+            rolesByName.computeIfAbsent(name, k -> []).addAll(roles)
+        }
+    }
+
+    // Keep only names consisting of more than one word
+    return rolesByName.findAll { name, roles -> name =~ /\s/ }
+}
+
+/**
+ * Parses one part of a responsibility statement (the text between ';' separators) into
+ * a map from person name to the roles stated for that name.
+ *
+ * Roles are recognized by the regular expressions in roleToPattern, either before the
+ * name(s) ("övers. av Anna Larsson") or in parentheses after them ("Anna Larsson (foto)").
+ *
+ * @param contribution the statement part to parse
+ * @param isFirstStmtPart true if this is the first part of the statement; a name without
+ *        any stated role then gets Relator.IMPLICIT_AUTHOR
+ * @return name -> roles; all names in the same matched contribution share the same role list
+ */
+static Map<String, List<Relator>> parseSwedishFictionContribution(String contribution, boolean isFirstStmtPart) {
+    def roleToPattern =
+            [
+                    (Relator.TRANSLATOR)         : ~/(bemynd(\w+|\.)? )?öf?v(\.|ers(\.|\p{L}+)?)( (till|från) \p{L}+)?|(till svenskan?|från \p{L}+)|svensk text/,
+                    (Relator.AUTHOR)             : ~/^(text(e[nr])?|skriven|written)/,
+                    // "bild" or "bilder"; the plural must be matched in full so that "bilder av ..." is
+                    // recognized as a role directly followed by followsRolePattern
+                    (Relator.ILLUSTRATOR)        : ~/\bbild(er)?|ill(\.|ustr(\.|\w+)?)|\bvi(gn|nj)ett(er|ill)?|ritad/,
+                    (Relator.AUTHOR_OF_INTRO)    : ~/förord|inl(edn(\.|ing)|edd)/,
+                    (Relator.COVER_DESIGNER)     : ~/omslag/,
+                    (Relator.AUTHOR_OF_AFTERWORD): ~/efter(ord|skrift)/,
+                    (Relator.PHOTOGRAPHER)       : ~/\bfoto\w*\.?/,
+                    (Relator.EDITOR)             : ~/red(\.(?! av)|aktör(er)?)|\bbearb(\.|\w+)?|återberättad|sammanställ\w*/,
+            ]
+
+    // Any role, case-insensitively
+    def rolePattern = ~/((?iu)${roleToPattern.values().join('|')})/
+    // What separates a leading role from the name(s)
+    def followsRolePattern = ~/(:| a[fv]| by) /
+    def initialPattern = ~/\p{Lu}/
+    def namePattern = ~/\p{Lu}:?\p{Ll}+('\p{Ll})?(,? [Jj](r|unior))?/
+    def betweenNamesPattern = ~/-| |\. ?| ([Dd]e(l| la)?|von|van( de[nr])?|v\.|le|af|du|dos) | [ODdLl]'/
+    def fullNamePattern = ~/(($initialPattern|$namePattern)($betweenNamesPattern)?)*$namePattern/
+    def conjPattern = ~/ (och|&|and) /
+    def roleAfterNamePattern = ~/( ?\(($rolePattern$conjPattern)?$rolePattern\))/
+    // [role(s) + separator] name [conjunction name]* [(role(s))]
+    def fullContributionPattern = ~/(($rolePattern($conjPattern|\/))*$rolePattern$followsRolePattern)?$fullNamePattern($conjPattern$fullNamePattern)*$roleAfterNamePattern?/
+
+    // Make roles lower case so that they can't be mistaken for names
+    contribution = (contribution =~ rolePattern)*.first()
+            .collectEntries { [it, it.toLowerCase()] }
+            .with { contribution.replace(it) }
+
+    def nameToRoles = [:]
+
+    def matched = (contribution =~ fullContributionPattern)*.first()
+
+    matched.each { m ->
+        // Extract roles from the contribution
+        def roles = roleToPattern.findResults { role, pattern -> m =~ /(?iu)$pattern/ ? role : null }
+
+        // Author should be the role if first part of respStatement (before ';') and no role seems to be stated
+        if (roles.isEmpty() && isFirstStmtPart && !(contribution =~ /.+$followsRolePattern/)) {
+            roles << Relator.IMPLICIT_AUTHOR
+        }
+
+        // Extract names from the contribution
+        def names = parseNames(fullNamePattern, conjPattern, m)
+
+        // Assign the roles to each name
+        nameToRoles.putAll(names.collectEntries { [it, roles] })
+    }
+
+    return nameToRoles
+}
+
+/**
+ * Extracts all names matching namePattern from a string, in order of appearance.
+ *
+ * @param namePattern pattern matching a full name
+ * @param conjPattern pattern matching a conjunction between two names
+ * @param s the string to extract names from
+ * @return the names found
+ */
+static List<String> parseNames(Pattern namePattern, Pattern conjPattern, String s) {
+    def names = []
+
+    (s =~ namePattern).each {
+        // The first element of each match is used as the name
+        def name = it.first()
+        // Handle the case of "Jan och Maria Larsson"
+        def previousName = names.isEmpty() ? null : names.last()
+        // If the previous name is a single word directly joined to this name by a conjunction,
+        // the last part of this name is appended to the previous name too
+        if (previousName?.split()?.size() == 1 && s =~ /$previousName$conjPattern$name/) {
+            def nameParts = name.split()
+            if (nameParts.size() > 1) {
+                names[-1] += " ${nameParts.last()}"
+            }
+        }
+        names << name
+    }
+
+    return names
+}
+
+/**
+ * Returns a Whelk handle, obtained from the document item selected for
+ * 'https://id.kb.se/marc'. The result is memoized.
+ *
+ * @throws RuntimeException if no handle could be obtained
+ */
+@Memoized
+def getWhelk() {
+    // A little hack to get a handle to whelk...
+    def handle = null
+    selectByIds(['https://id.kb.se/marc']) { item ->
+        handle = item.whelk
+    }
+    if (handle) {
+        return handle
+    }
+    throw new RuntimeException("Could not get Whelk")
+}
+
+/**
+ * True only when both values are given (non-null, non-empty) and they differ.
+ */
+static boolean yearMismatch(String a, String b) {
+    if (!a || !b) {
+        return false
+    }
+    return a != b
+}
+
+/**
+ * Normalizes the lifeSpan of an agent: everything except digits and '-' is removed
+ * and runs of '-' are collapsed into one. Returns null if the agent has no lifeSpan.
+ */
+static String lifeSpan(Map agent) {
+    def raw = agent.lifeSpan
+    if (raw == null) {
+        return null
+    }
+    def digitsAndDashes = raw.replaceAll(~/[^\-0-9]/, '')
+    return digitsAndDashes.replaceAll(~/-+/, '-')
+}
+
+/**
+ * Returns a string key for an agent: for a linked agent its id with everything up to the
+ * last '/' replaced by Document.BASE_URI, otherwise the agent serialized as JSON.
+ */
+static String toString(Map agent) {
+    def iri = agent[ID_KEY]
+    def normalizedIri = iri?.replaceFirst(".+/", Document.BASE_URI.toString())
+    if (normalizedIri) {
+        return normalizedIri
+    }
+    return new JsonBuilder(agent).toString()
+}
+
+/**
+ * Parses a JSON string (e.g. an agent serialized by toString) back into an object.
+ */
+static toMap(String agent) {
+    def slurper = new JsonSlurper()
+    return slurper.parseText(agent)
+}
+
+/**
+ * Returns the second to last segment of an IRI split on '#' and '/',
+ * e.g. "abc" for ".../abc#it".
+ */
+static String idShort(String iri) {
+    def segments = iri.split("[#/]")
+    def withoutLast = segments.dropRight(1)
+    return withoutLast.last()
+}
+
+/**
+ * Returns the last '/'-separated segment of a role IRI, or 'NO ROLE' when the IRI
+ * is null or that segment is empty.
+ */
+static String roleShort(String iri) {
+    if (iri == null) {
+        return 'NO ROLE'
+    }
+    def lastSegment = iri.split("/").last()
+    return lastSegment ?: 'NO ROLE'
+}
+
+/**
+ * Loose comparison of two names.
+ *
+ * If either name consists of a single part, the names are similar when they share at
+ * least one part. Otherwise they are similar when the initials of one name contain all
+ * the initials of the other.
+ */
+static boolean similarName(String a, String b) {
+    def partsA = nameParts(a)
+    def partsB = nameParts(b)
+
+    if (partsA.size() == 1 || partsB.size() == 1) {
+        // Non-empty intersection -> true
+        return partsA.intersect(partsB)
+    }
+
+    def initialsA = initials(partsA)
+    def initialsB = initials(partsB)
+    return initialsA.containsAll(initialsB) || initialsB.containsAll(initialsA)
+}
+
+/**
+ * Returns the first character of every name part.
+ *
+ * Empty (or null) parts are skipped: splitting a name with leading or adjacent separators
+ * produces empty strings, and indexing those would throw.
+ */
+static List<Character> initials(List nameParts) {
+    nameParts.findAll().collect { it[0] }
+}
+
+/**
+ * Splits a name into parts on whitespace runs and hyphens.
+ */
+static List<String> nameParts(String s) {
+    def parts = s.split(/\s+|-/)
+    return parts.toList()
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contributions-to-instance.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contributions-to-instance.groovy
new file mode 100644
index 0000000000..d3fb3e9bf3
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/contributions-to-instance.groovy
@@ -0,0 +1,96 @@
+import datatool.scripts.mergeworks.Util.Relator
+
+import whelk.Whelk
+import static whelk.JsonLd.ID_KEY
+import static whelk.JsonLd.TYPE_KEY
+
+// Moves contributions with instance-level roles from the work (instanceOf) to the instance,
+// for every record id listed in the tab-separated file given by the 'clusters' system property.
+report = getReportWriter('report.tsv')
+
+// All ids in the clusters file, flattened into one list
+def ids = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }.flatten()
+
+def whelk = getWhelk()
+// Relators whose domain is a subclass of 'Embodiment', as [(ID_KEY): ...] links
+def instanceRolesByDomain = whelk.resourceCache.relators.findResults {
+    if (it.domain) {
+        def domain = whelk.jsonld.toTermKey(it.domain[ID_KEY])
+        if (whelk.jsonld.isSubClassOf(domain, 'Embodiment')) it.subMap([ID_KEY])
+    }
+}
+// Illustrator, author of introduction and author of afterword are treated as instance roles as well
+def instanceRoles = instanceRolesByDomain + [Relator.ILLUSTRATOR, Relator.AUTHOR_OF_INTRO, Relator.AUTHOR_OF_AFTERWORD].collect { [(ID_KEY): it.iri] }
+
+selectByIds(ids) { bib ->
+    Map instance = bib.graph[1]
+    def work = instance.instanceOf
+    def contribution = work?.contribution
+
+    if (!contribution) return
+
+    def ill = [(ID_KEY): Relator.ILLUSTRATOR.iri]
+
+    def modified = false
+
+    // removeAll drops a work contribution only when the closure returns true,
+    // i.e. when all of its roles have been moved to the instance
+    contribution.removeAll { c ->
+        // Primary contributions are never touched
+        if (isPrimaryContribution(c)) return false
+
+        def toInstance = asList(c.role).intersect(instanceRoles)
+        if (toInstance.contains(ill)) {
+            // The illustrator role stays on the work in these cases
+            if (has9pu(c) || isPictureBook(work) || isComics(work, bib.whelk) || isStillImage(work)) {
+                toInstance.remove(ill)
+            }
+        }
+        if (toInstance) {
+            // Add a copy of the contribution, carrying only the moved roles, to the instance
+            instance['contribution'] = asList(instance['contribution']) + c.clone().tap { it['role'] = toInstance }
+            // ...and keep only the remaining roles on the work contribution
+            c['role'] = asList(c.role) - toInstance
+            modified = true
+            report.println([bib.doc.shortId, toInstance.collect { it[ID_KEY].split('/').last() }].join('\t'))
+            incrementStats('moved to instance', toInstance)
+            return c.role.isEmpty()
+        }
+
+        return false
+    }
+
+    // Don't leave an empty contribution list behind on the work
+    if (contribution.isEmpty()) {
+        work.remove('contribution')
+    }
+
+    if (modified) {
+        bib.scheduleSave()
+    }
+}
+
+/**
+ * True if the type of the contribution is 'PrimaryContribution'.
+ */
+boolean isPrimaryContribution(Map contribution) {
+    def type = contribution[TYPE_KEY]
+    return type == 'PrimaryContribution'
+}
+
+/**
+ * True if the contribution has the primary rights holder role.
+ */
+boolean has9pu(Map contribution) {
+    def primaryRightsHolder = [(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri]
+    def roles = asList(contribution.role)
+    return roles.contains(primaryRightsHolder)
+}
+
+/**
+ * True if the content types of the work include the RDA 'StillImage' term.
+ */
+boolean isStillImage(Map work) {
+    def stillImage = [(ID_KEY): 'https://id.kb.se/term/rda/StillImage']
+    def contentTypes = asList(work.contentType)
+    return contentTypes.contains(stillImage)
+}
+
+/**
+ * True if any genre/form of the work is one of the two picture book terms.
+ */
+boolean isPictureBook(Map work) {
+    def picBookTerms = [
+            [(ID_KEY): 'https://id.kb.se/term/barngf/Bilderb%C3%B6cker'],
+            [(ID_KEY): 'https://id.kb.se/term/barngf/Sm%C3%A5barnsbilderb%C3%B6cker']
+    ]
+
+    for (genreForm in asList(work.genreForm)) {
+        if (picBookTerms.contains(genreForm)) {
+            return true
+        }
+    }
+    return false
+}
+
+/**
+ * Tells whether a work is a comic.
+ *
+ * A work counts as a comic if any of its genre/forms is one of the known comics terms or a
+ * term for which whelk.relations.isImpliedBy holds against the saogf 'Tecknade serier'
+ * term, or if any of its classifications has a code starting with 'Hci'.
+ *
+ * The classification check is independent of genreForm, so that works without any
+ * genre/form are still recognized by their classification.
+ *
+ * @param work the work to inspect
+ * @param whelk used for looking up term relations
+ */
+boolean isComics(Map work, Whelk whelk) {
+    def comicsTerms = [
+            'https://id.kb.se/term/saogf/Tecknade%20serier',
+            'https://id.kb.se/term/barngf/Tecknade%20serier',
+            'https://id.kb.se/term/gmgpc/swe/Tecknade%20serier',
+            'https://id.kb.se/marc/ComicOrGraphicNovel',
+            'https://id.kb.se/marc/ComicStrip'
+    ].collect { [(ID_KEY): it] }
+
+    def hasComicsGenreForm = asList(work.genreForm).any {
+        it in comicsTerms
+                || it[ID_KEY] && whelk.relations.isImpliedBy('https://id.kb.se/term/saogf/Tecknade%20serier', it[ID_KEY])
+    }
+
+    return hasComicsGenreForm || asList(work.classification).any { it.code?.startsWith('Hci') }
+}
\ No newline at end of file
diff --git a/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy
new file mode 100644
index 0000000000..d59da20dd0
--- /dev/null
+++ b/whelktool/src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy
@@ -0,0 +1,57 @@
+package datatool.scripts.mergeworks.normalize
+
+import groovy.transform.Memoized
+import whelk.util.DocumentUtil
+
+import static datatool.scripts.mergeworks.Util.getPathSafe
+
+/**
+ Example:
+ $ ENV=qa && time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters="reports/clusters.tsv" -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --dry-run src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy
+ */
+
+// Removes a trailing language name in parentheses, e.g. "Title (svenska)", from the main
+// titles of the work, for every record id listed in the tab-separated 'clusters' file.
+PrintWriter report = getReportWriter("report.txt")
+
+// All ids in the clusters file, flattened into one list
+def ids = new File(System.getProperty('clusters'))
+        .readLines()
+        .collect { it.split('\t').collect { it.trim()} }
+        .flatten()
+
+selectByIds(ids) { bib ->
+    // Lower-cased names (see langName) of the work's language and of the original's language
+    def langs = [
+            [1, 'instanceOf', 'language', 0, '@id'],
+            [1, 'instanceOf', 'translationOf', 0, 'language', 0, '@id']
+    ].collect {
+        langName(getPathSafe(bib.graph, it, '')).toLowerCase()
+    }
+
+    boolean changed = DocumentUtil.traverse(bib.graph[1].instanceOf) { value, path ->
+        if (path && 'mainTitle' in path && value instanceof String) {
+            for (lang in langs) {
+                // The language name is data, not a regex: quote it so that metacharacters in
+                // a name (e.g. parentheses) are matched literally. (?iu) makes the
+                // case-insensitive match Unicode-aware, so that e.g. 'Ä' matches 'ä'.
+                String r = value.replaceAll(/(?iu)\s*\(\(?\s*${java.util.regex.Pattern.quote(lang)}\s*\)\)?\s*$/, '')
+                if (value != r) {
+                    report.println("$value -> $r")
+                    return new DocumentUtil.Replace(r)
+                }
+            }
+        }
+        return DocumentUtil.NOP
+    }
+
+    if (changed) {
+        bib.scheduleSave()
+    }
+}
+
+/**
+ * Looks up the Swedish preferred label (prefLabelByLang.sv) of the thing with the given id,
+ * or "NOT FOUND" if there is none. Results are memoized.
+ */
+@Memoized
+private String langName(def id) {
+    def language = loadThing(id)
+    def swedishLabelPath = ['prefLabelByLang', 'sv']
+    return getPathSafe(language, swedishLabelPath, "NOT FOUND")
+}
+
+/**
+ * Loads the second graph entry (graph[1]) of the document with the given id.
+ * Returns an empty map if the select callback is never invoked.
+ */
+private Map loadThing(def id) {
+    def loaded = [:]
+    selectByIds([id]) { doc -> loaded = doc.graph[1] }
+    return loaded
+}
\ No newline at end of file
diff --git a/whelktool/src/main/java/datatool/util/DocumentComparator.java b/whelktool/src/main/java/datatool/util/DocumentComparator.java
index c7590a84e9..22b818d6c5 100644
--- a/whelktool/src/main/java/datatool/util/DocumentComparator.java
+++ b/whelktool/src/main/java/datatool/util/DocumentComparator.java
@@ -35,9 +35,13 @@ public boolean isEqual(Map<?, ?> a, Map<?, ?> b) {
     }
 
     private boolean isEqual(Object a, Object b, Object key) {
-        if (a == null || b == null || a.getClass() != b.getClass()) {
+        if (a == null || b == null) {
             return false;
         }
+        else if (a.getClass() != b.getClass()) {
+            return (isSingleItemList(a) && isEqual(((List<?>) a).get(0), b, key)
+                    || (isSingleItemList(b) && isEqual(a, ((List<?>) b).get(0), key)));
+        }
         else if (a instanceof Map) {
             return isEqual((Map<?, ?>) a, (Map<?, ?>) b);
         }
@@ -53,6 +57,10 @@ else if (a instanceof List) {
         }
     }
 
+    /**
+     * Tells whether the object is a List holding exactly one element.
+     */
+    private boolean isSingleItemList(Object o) {
+        if (!(o instanceof List)) {
+            return false;
+        }
+        List<?> list = (List<?>) o;
+        return list.size() == 1;
+    }
+
     private boolean isEqualOrdered(List<?> a, List<?> b) {
         if (a.size() != b.size()) {
             return false;
diff --git a/whelktool/src/main/resources/merge-works/generic-titles.txt b/whelktool/src/main/resources/merge-works/generic-titles.txt
new file mode 100644
index 0000000000..afc73c6ba8
--- /dev/null
+++ b/whelktool/src/main/resources/merge-works/generic-titles.txt
@@ -0,0 +1,34 @@
+artiklar
+collected plays
+dagböcker
+dikter
+dramatik
+essäer
+folksagor
+folkvisor
+fragment
+korrespondens
+krönikor
+lyrik
+memoarer
+noveller
+pjäser
+plays
+poems
+poesi
+prosa
+publication
+publications
+rapport
+report
+romaner
+sagor
+samlade dikter
+samlade pjäser
+samlade skrifter
+samlade verk
+skrifter
+skådespel
+sonetter
+tecknade serier
+urval
\ No newline at end of file
diff --git a/whelktool/src/main/resources/merge-works/ignored-subtitles.txt b/whelktool/src/main/resources/merge-works/ignored-subtitles.txt
new file mode 100644
index 0000000000..4dea8de2e6
--- /dev/null
+++ b/whelktool/src/main/resources/merge-works/ignored-subtitles.txt
@@ -0,0 +1,77 @@
+a comedy
+a history
+a novel
+a play
+a romance
+a tale
+aforismer
+berättelse
+berättelse för barn
+berättelse för flickor
+berättelse för pojkar
+berättelse för unga flickor
+berättelser
+berättelser för barn
+bilderbok
+comédie
+contos
+deckare
+deckarroman
+detektivroman
+dikt
+dikter
+drama
+efterlämnade dikter
+ein coq-rouge-thriller
+ein roman
+eine erzählung
+erzählung
+erzählungen
+essays
+essäer
+ett fall för kay scarpetta
+fortælling
+historisk roman
+homandeckare
+jack reacher-thriller
+komedi
+komedi i fyra akter
+krimi
+kriminalroman
+kärlekshistoria
+kärleksroman
+kåserier
+lustspel i en akt
+nouvelles
+novela
+novell
+novelle
+noveller
+pjäs
+polisroman
+povesti
+powieść
+poėma
+reseguide
+resehandbok
+rikosromaani
+romaani
+romaani rikoksesta
+roman
+roman om ett brott
+roman om skivvärlden
+romanas
+romance
+romanzo
+rövarroman
+runoja
+saga
+sagor
+sann historia
+skildringar
+skáldsaga
+spänningsroman
+stories
+thriller
+ungdomsroman
+(Efterlämnade dikter.)
diff --git a/whelktool/src/main/resources/merge-works/table.css b/whelktool/src/main/resources/merge-works/table.css
new file mode 100644
index 0000000000..e6378ea2ee
--- /dev/null
+++ b/whelktool/src/main/resources/merge-works/table.css
@@ -0,0 +1,30 @@
+ /* Tables: single, collapsed grey borders */
+ table {
+   border-collapse: collapse;
+ }
+ table, th, td {
+   border: 1px solid grey;
+ }
+ th {
+   text-align: left;
+ }
+ /* Row background colors, selected by row class */
+ tr.info td {
+   background-color: lightgrey;
+ }
+ tr.DIFF td {
+   background-color: lightpink;
+ }
+ tr.COMPATIBLE td {
+   background-color: greenyellow;
+ }
+ tr.EQUAL td {
+   background-color: lightgreen;
+ }
+ td {
+   vertical-align: top;
+ }
+ hr {
+   border: 4px solid;
+ }
+ /* Highlight the anchor that the URL fragment points at */
+ a:target {
+   background-color: coral;
+ }
\ No newline at end of file
diff --git a/whelktool/src/test/groovy/datatool/scripts/mergeworks/DocSpec.groovy b/whelktool/src/test/groovy/datatool/scripts/mergeworks/DocSpec.groovy
new file mode 100644
index 0000000000..407372c419
--- /dev/null
+++ b/whelktool/src/test/groovy/datatool/scripts/mergeworks/DocSpec.groovy
@@ -0,0 +1,27 @@
+package datatool.scripts.mergeworks
+import spock.lang.Specification
+import whelk.util.Unicode
+
+/**
+ * Spock specification for Doc.
+ */
+class DocSpec extends Specification {
+
+    /**
+     * Data-driven check of Doc.numPages: each row pairs an extent statement with the
+     * page count expected for it. -1 is the expected value for extents without a page
+     * count (empty string, "3 vol.").
+     */
+    def "parse extent"() {
+        expect:
+        Doc.numPages(extent) == pages
+        where:
+        extent                                    | pages
+        ""                                        | -1
+        "114, [1] s."                             | 114
+        "[4], 105, [2] s."                        | 105
+        "21 s., ([4], 21, [5] s.)"                | 21
+        "[108] s., (Ca 110 s.)"                   | 110
+        "80 s., (80, [3] s., [8] pl.-bl. i färg)" | 80
+        "622, [8] s."                             | 622
+        "[2] s., s. 635-919, [7] s."              | 919 // ??
+        "[1], iv, 295 s."                         | 295
+        "3 vol."                                  | -1
+        // The commented-out rows below are not part of the test
+        //"249, (1) s."                             | 249
+        //"[8] s., s. 11-370"                       | 370
+        //[12] s., s. 15-256                        | 256
+        "25 onumrerade sidor"                     | 25
+    }
+}
diff --git a/whelktool/src/test/groovy/datatool/util/DocumentComparatorSpec.groovy b/whelktool/src/test/groovy/datatool/util/DocumentComparatorSpec.groovy
index 436bc07372..7417638205 100644
--- a/whelktool/src/test/groovy/datatool/util/DocumentComparatorSpec.groovy
+++ b/whelktool/src/test/groovy/datatool/util/DocumentComparatorSpec.groovy
@@ -13,6 +13,7 @@ class DocumentComparatorSpec extends Specification {
 
         expect:
         d.isEqual(a, b) == eq
+        d.isEqual(b, a) == eq
 
         where:
         a                    | b                    || eq
@@ -30,6 +31,9 @@ class DocumentComparatorSpec extends Specification {
         ["ordered": [1, 2]]  | ["ordered": [1, 2]]  || true
         ["ordered": [1, 2]]  | ["ordered": [2, 1]]  || false
 
+        // one element list equals element
+        ["x": ["a"]]         | ["x": "a"]           || true
+        ["x": [["n": 2]]]    | ["x": ["n": 2]]      || true
     }
 
     def "isSubset"() {