From 9bc5ccfde901ca92692efbf0771834805314f430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Tue, 17 Mar 2020 12:01:15 +0100 Subject: [PATCH 001/249] Clean up cluster merging script --- .../main/groovy/whelk/util/Statistics.groovy | 2 +- .../src/main/groovy/whelk/util/Unicode.groovy | 10 +- .../test/groovy/whelk/util/UnicodeSpec.groovy | 21 +- whelktool/scripts/analysis/bib-249.groovy | 81 ++++ whelktool/scripts/analysis/bib-976.groovy | 114 ++++++ whelktool/scripts/analysis/broader-gf.groovy | 45 +++ .../scripts/analysis/local-broader.groovy | 61 +++ whelktool/scripts/analysis/mediaterm.groovy | 27 ++ .../scripts/analysis/merge-clusters.groovy | 33 ++ .../analysis/oversattning-without.trl.groovy | 49 +++ whelktool/scripts/analysis/subject-404.groovy | 38 ++ .../{examples => analysis}/works.groovy | 0 .../{examples => analysis}/works2.groovy | 4 +- .../scripts/examples/contribution-role.groovy | 43 +++ .../scripts/examples/merge-clusters.groovy | 32 -- .../src/main/groovy/datatool/WorkTool.groovy | 73 ++++ .../datatool/scripts/mergeworks/Doc.groovy | 283 ++++++++++++++ .../scripts/mergeworks/FieldStatus.groovy | 7 + .../datatool/scripts/mergeworks/Util.groovy | 109 ++++++ .../scripts/mergeworks/WorkComparator.groovy | 117 ++++++ .../scripts/mergeworks/WorkJob.groovy | 350 ++++++++++++++++++ .../mergeworks/compare/Classification.groovy | 63 ++++ .../mergeworks/compare/ContentType.groovy | 21 ++ .../scripts/mergeworks/compare/Default.groovy | 13 + .../mergeworks/compare/FieldHandler.groovy | 6 + .../mergeworks/compare/GenreForm.groovy | 35 ++ .../mergeworks/compare/StuffSet.groovy | 39 ++ .../mergeworks/compare/WorkTitle.groovy | 16 + .../datatool/util/DocumentComparator.java | 10 +- .../merge-works/ignored-subtitles.txt | 70 ++++ .../src/main/resources/merge-works/table.css | 27 ++ .../util/DocumentComparatorSpec.groovy | 4 + 32 files changed, 1763 insertions(+), 40 deletions(-) create mode 100644 whelktool/scripts/analysis/bib-249.groovy create mode 100644 whelktool/scripts/analysis/bib-976.groovy create mode 100644 whelktool/scripts/analysis/broader-gf.groovy create mode 100644 whelktool/scripts/analysis/local-broader.groovy create mode 100644 whelktool/scripts/analysis/mediaterm.groovy create mode 100644 whelktool/scripts/analysis/merge-clusters.groovy create mode 100644 whelktool/scripts/analysis/oversattning-without.trl.groovy create mode 100644 whelktool/scripts/analysis/subject-404.groovy rename whelktool/scripts/{examples => analysis}/works.groovy (100%) rename whelktool/scripts/{examples => analysis}/works2.groovy (91%) create mode 100644 whelktool/scripts/examples/contribution-role.groovy delete mode 100644 whelktool/scripts/examples/merge-clusters.groovy create mode 100644 whelktool/src/main/groovy/datatool/WorkTool.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/Doc.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/FieldStatus.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/Util.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkComparator.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/WorkJob.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Classification.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/ContentType.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/Default.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/FieldHandler.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/GenreForm.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/StuffSet.groovy create mode 100644 whelktool/src/main/groovy/datatool/scripts/mergeworks/compare/WorkTitle.groovy create mode 100644 whelktool/src/main/resources/merge-works/ignored-subtitles.txt create mode 100644 whelktool/src/main/resources/merge-works/table.css diff --git a/whelk-core/src/main/groovy/whelk/util/Statistics.groovy b/whelk-core/src/main/groovy/whelk/util/Statistics.groovy index e99b258421..411f821015 100644 --- a/whelk-core/src/main/groovy/whelk/util/Statistics.groovy +++ b/whelk-core/src/main/groovy/whelk/util/Statistics.groovy @@ -13,7 +13,7 @@ class Statistics { ThreadLocal> context = ThreadLocal.withInitial({ -> null }) int numExamples - + Statistics(int numExamples = 1) { this.numExamples = numExamples } diff --git a/whelk-core/src/main/groovy/whelk/util/Unicode.groovy b/whelk-core/src/main/groovy/whelk/util/Unicode.groovy index c5620c1f5f..ad97201b19 100644 --- a/whelk-core/src/main/groovy/whelk/util/Unicode.groovy +++ b/whelk-core/src/main/groovy/whelk/util/Unicode.groovy @@ -50,6 +50,8 @@ class Unicode { [(it): Normalizer.normalize(it, Normalizer.Form.NFKC)] } + STRIP_UNICODE_CHARS.collectEntries { [(it): ''] } } + + private static final Pattern UNICODE_MARK = Pattern.compile('\\p{M}') static boolean isNormalized(String s) { return Normalizer.isNormalized(s, Normalizer.Form.NFC) && !EXTRA_NORMALIZATION_MAP.keySet().any{ s.contains(it) } @@ -90,11 +92,11 @@ class Unicode { def m = s =~ /[^${w}]*(.*)/ return m.matches() ? m.group(1) : s } - + static String trim(String s) { s.replaceFirst(LEADING_SPACE, '').replaceFirst(TRAILING_SPACE, '') } - + static Optional guessScript(String s) { s = s.replaceAll(~/\p{IsCommon}|\p{IsInherited}|\p{IsUnknown}/, '') @@ -178,4 +180,8 @@ class Unicode { 'Vaii', ].each { add15924scriptCode(it) } } + + static String asciiFold(String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll(UNICODE_MARK, '') + } } diff --git a/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy b/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy index 4e6a59be6e..a71c622052 100644 --- a/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy +++ b/whelk-core/src/test/groovy/whelk/util/UnicodeSpec.groovy @@ -40,7 +40,7 @@ class UnicodeSpec extends Specification { ' _.:;|(Überzetsung)|;:. ' | '(Überzetsung)' ' _.:;| Ü b e r - z e t - s u n g |;:. ' | 'Ü b e r - z e t - s u n g' } - + def "trim"() { expect: Unicode.trim(dirty) == clean @@ -58,7 +58,7 @@ class UnicodeSpec extends Specification { '\r\nkeep leading line breaks' | '\r\nkeep leading line breaks' } - + def "double quotation marks"() { expect: Unicode.isNormalizedDoubleQuotes(dirty) == (dirty == clean) @@ -99,5 +99,22 @@ class UnicodeSpec extends Specification { Optional.of('Armn') | 'Պիպին նավի վրա' Optional.of('Kana') | 'デスノート' Optional.of('Hira') | 'とんとんとんと' + + def "u"() { + given: + String s = "übers" //uU+CC88 + String nfc = "übers" //U+C3BC + expect: + Unicode.isNormalized(s) == false + Unicode.normalize(s) == nfc + } + + def "asciiFold"() { + expect: + Unicode.asciiFold(unicode) == ascii + + where: + unicode | ascii + 'Désidéria' | 'Desideria' } } \ No newline at end of file diff --git a/whelktool/scripts/analysis/bib-249.groovy b/whelktool/scripts/analysis/bib-249.groovy new file mode 100644 index 0000000000..628d5acfcf --- /dev/null +++ b/whelktool/scripts/analysis/bib-249.groovy @@ -0,0 +1,81 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +s = new Statistics().printOnShutdown() + +selectByCollection('bib') { bib -> + try { + process(bib) + } + catch(Exception e) { + System.err.println(e) + e.printStackTrace() + } + +} + +void process(bib) { + def (record, thing, work) = bib.graph + + if(!work) { + return + } + + if(thing['marc:hasBib249']) { + boolean marcTrl = work['marc:languageNote'] == "marc:ItemIsOrIncludesATranslation" + + String hasTitle = hasTitle(thing, work) + + if (hasTitle == "diff") { + println (""" + ${bib.doc.getURI()} + ${thing['marc:hasBib249']} + marc:ItemIsOrIncludesATranslation ${marcTrl} + ${work.hasTitle} + """.stripIndent()) + } + + s.increment('hasTitle', hasTitle) + s.increment('shape', maybeList(thing['marc:hasBib249']) { map -> new TreeSet(map.keySet()) }) + s.increment('marc:ItemIsOrIncludesATranslation', "${marcTrl}") + } +} + +String hasTitle(thing, work) { + if (work.hasTitle) { + isSameTitle(thing, work) ? "match" : "diff" + } + else { + "no" + } +} + +boolean isSameTitle(def thing, def work) { + String t = getPathSafe(thing, ['marc:hasBib249', 'marc:originalTitle'], "TT") + String w = getPathSafe(work, ['hasTitle', 0, 'mainTitle'], "WT") + trim(w.toLowerCase()) == trim(t.toLowerCase()) +} + +Object maybeList(Object o, Closure c) { + o instanceof List + ? o.collect(c) + : c(o) +} + +private Object getPathSafe(item, path, defaultTo = null) { + for (p in path) { + if (item[p] != null) { + item = item[p] + } else { + return defaultTo + } + } + return item +} + +String trim(String s) { + // remove leading and trailing non-"alpha, digit or parentheses" + def w = /\(\)\p{IsAlphabetic}\p{Digit}/ + def m = s =~ /[^${w}]*([${w}- ]*[${w}])[^${w}]*/ + return m.matches() ? m.group(1) : s +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/bib-976.groovy b/whelktool/scripts/analysis/bib-976.groovy new file mode 100644 index 0000000000..5e26dfca23 --- /dev/null +++ b/whelktool/scripts/analysis/bib-976.groovy @@ -0,0 +1,114 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +class Script { + static PrintWriter notIn084 + static PrintWriter in084 + static PrintWriter noCode + static PrintWriter report + static PrintWriter errors +} +Script.notIn084 = getReportWriter("not-in-084.txt") +Script.in084 = getReportWriter("in-084.txt") +Script.noCode = getReportWriter("no-code.txt") +Script.report = getReportWriter("report.txt") +Script.errors = getReportWriter("errors.txt") + +s = new Statistics().printOnShutdown() + +selectByCollection('bib') { bib -> + try { + process(bib) + } + catch(Exception e) { + Script.errors.println("${bib.doc.shortId} $e") + e.printStackTrace(Script.errors) + } + +} + +void process(bib) { + def work = bib.graph[1]['instanceOf'] + + if(!work) { + return + } + + def bib976 = asList(work['marc:hasBib976']) + if(!bib976) { + return + } + + def (code, noCode) = bib976.split { it['marc:bib976-a'] } + def bib81 = sab(work) + + handleWithSabCode(bib, work, bib81, code) + handleWithoutSabCode(bib, work, bib81, noCode) +} + +void handleWithSabCode(bib, work, bib084, bib976) { + + bib976.each { + def (in084, notIn084) = bib976.split { x -> + def code = x['marc:bib976-a'] + bib084.findAll{ it.startsWith((code)) } + } + + in084.each { + s.increment('bib976-a', 'in classification') + } + + notIn084.each { + s.increment('bib976-a', 'not in classification') + s.increment('bib976-a not in classification', it) + } + + if (notIn084) { + Script.notIn084.println(""" + ${bib.doc.getURI()} + bib-976: ${notIn084.collect{ "${it['marc:bib976-a']} (${it['marc:bib976-b']})" }} + classification/kssb: $bib084 + """.stripIndent()) + } + + if (in084) { + Script.in084.println(""" + ${bib.doc.getURI()} + bib-976: ${in084.collect{ "${it['marc:bib976-a']} (${it['marc:bib976-b']})" }} + classification/kssb: $bib084 + """.stripIndent()) + } + + Script.report.println("${bib.doc.shortId} ${handled(in084, notIn084)}") + } +} + +String handled(in084, notIn084) { + if (!in084 && notIn084) { + return "ingen" + } + if (in084 && !notIn084) { + return "alla" + } + return "delvis" +} + +void handleWithoutSabCode(bib, work, bib084, bib976) { + if (bib976) { + def creator = bib.graph[0]['descriptionCreator']['@id'] + s.increment('bib976 without code', creator) + + bib976.each { + def label = it['marc:bib976-b'] + Script.noCode.println("${bib.doc.getURI()} $creator $label") + } + } +} + +List sab(work) { + asList(work['classification']).findAll{ it['inScheme'] ?: '' == 'kssb' }.collect{ it['code'] } +} + +def asList(x) { + (x ?: []).with {it instanceof List ? it : [it] } +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/broader-gf.groovy b/whelktool/scripts/analysis/broader-gf.groovy new file mode 100644 index 0000000000..a0e526466c --- /dev/null +++ b/whelktool/scripts/analysis/broader-gf.groovy @@ -0,0 +1,45 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +s = new Statistics(5).printOnShutdown() + +selectByCollection('bib') { bib -> + def work = getWork(bib) + + if(!work) { + return + } + + if(work['genreForm']) { + List ids = work['genreForm']['@id'] + if (ids.size() > 1) { + [ids, ids].combinations{ a,b -> + if (a != b) { + check(bib.whelk, a, b) + check(bib.whelk, b, a) + } + } + } + } +} + +void check(whelk, String a, String b) { + if (whelk.relations.isImpliedBy(a, b)) { + s.increment(a, b) + s.increment('#broader', a) + } +} + +Map getWork(def bib) { + def (record, thing, work) = bib.graph + if (thing && isInstanceOf(thing, 'Work')) { + return thing + } + else if(thing && thing['instanceOf'] && isInstanceOf(thing['instanceOf'], 'Work')) { + return thing['instanceOf'] + } + else if (work && isInstanceOf(work, 'Work')) { + return work + } + return null +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/local-broader.groovy b/whelktool/scripts/analysis/local-broader.groovy new file mode 100644 index 0000000000..10a87e7bde --- /dev/null +++ b/whelktool/scripts/analysis/local-broader.groovy @@ -0,0 +1,61 @@ +/** + * Find unlinked 'broader' + * + * See LXL-3213 for more information. + */ + + +import groovy.transform.Memoized +import whelk.util.DocumentUtil + +class Script { + static PrintWriter report + static PrintWriter selfRef + static PrintWriter is404 + static PrintWriter error +} +Script.report = getReportWriter("report.txt") +Script.selfRef = getReportWriter("self-ref.txt") +Script.error = getReportWriter("error.txt") +Script.is404 = getReportWriter("404.txt") + +selectByCollection('auth') { auth -> + try { + process(auth) + } + catch(Exception e) { + //Script.error. + println("${auth.doc.shortId} $e") + e.printStackTrace() + } +} + +void process(auth) { + Map thing = auth.graph[1] + String id = thing['@id'] + List broader = thing['broader'] + + if (!broader) { + return + } + + broader.findAll{ !it['@id'] }.each { Map b -> + Script.report.println("$id $b") + } + + broader.findAll{ id == it['@id'] }.each { Map b -> + Script.selfRef.println("$id") + } + broader.findAll{ it['@id'] && is404(it['@id']) }.each { Map b -> + Script.is404.println("$id $b") + } +} + +@Memoized +boolean is404(String id) { + Map thing = null + selectByIds([id]) { auth -> + thing = auth.graph[1] + } + return thing == null +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/mediaterm.groovy b/whelktool/scripts/analysis/mediaterm.groovy new file mode 100644 index 0000000000..4e015e8079 --- /dev/null +++ b/whelktool/scripts/analysis/mediaterm.groovy @@ -0,0 +1,27 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +Statistics s = new Statistics(5) +s.printOnShutdown() + +selectByCollection('bib') { bib -> + try { + DocumentUtil.findKey(bib.doc.data, 'marc:mediaTerm') { String value, path -> + if (value.contains(']')) { + String mediaType = value.substring(0, value.indexOf(']')) + String suffix = value.substring(value.indexOf(']') + 1) + if (!suffix.isBlank()) { + String id = bib.doc.shortId + s.increment('ALL', suffix, id) + s.increment(mediaType, suffix, id) + s.increment('TOTAL', 'TOTAL') + } + } + + } + } + catch(Exception e) { + println(e) + e.printStackTrace() + } +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/merge-clusters.groovy b/whelktool/scripts/analysis/merge-clusters.groovy new file mode 100644 index 0000000000..d212611d8e --- /dev/null +++ b/whelktool/scripts/analysis/merge-clusters.groovy @@ -0,0 +1,33 @@ +import datatool.util.DisjointSets + +String dir = System.getProperty('clustersDir') +mergeClusters( + new File(dir, 'clusters.tsv'), + new File(dir, 'clusters-merged.tsv')) + +void mergeClusters(File input, File output) throws FileNotFoundException { + DisjointSets sets = new DisjointSets<>() + + input.eachLine() { + sets.addSet(Arrays.asList(it.split(/[\t ]+/))) + } + + output.withPrintWriter { p -> + sets.iterateAllSets(new DisjointSets.SetVisitor() { + boolean first = true + @Override + void nextElement(String e) { + if(!first) + p.print('\t') + p.print(e) + first = false + } + + @Override + void closeSet() { + p.println() + first = true + } + }) + } +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/oversattning-without.trl.groovy b/whelktool/scripts/analysis/oversattning-without.trl.groovy new file mode 100644 index 0000000000..18e008e764 --- /dev/null +++ b/whelktool/scripts/analysis/oversattning-without.trl.groovy @@ -0,0 +1,49 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +Statistics s = new Statistics().printOnShutdown() + +selectByCollection('bib') { bib -> + def work = getWork(bib) + + if(!work) { + return + } + + if(work['marc:languageNote'] == "marc:ItemIsOrIncludesATranslation" + && noTranslator(work.contribution ?: []) + && (bib.graph[1].responsibilityStatement ?: "").contains('övers') + ) { + println (""" + ${bib.doc.getURI()} + ${work.contribution} + ${bib.graph[1].responsibilityStatement} + + """.stripIndent()) + s.increment('tot', 'tot') + } +} + +boolean noTranslator(def contribution) { + boolean found = false + DocumentUtil.findKey(contribution, '@id') { value, path -> + if (value == 'https://id.kb.se/relator/translator') { + found = true + } + DocumentUtil.NOP + } + + return !found +} + + +Map getWork(def bib) { + def (record, thing, work) = bib.graph + if (thing && isInstanceOf(thing, 'Work')) { + return thing + } + else if (work && isInstanceOf(work, 'Work')) { + return work + } + return null +} \ No newline at end of file diff --git a/whelktool/scripts/analysis/subject-404.groovy b/whelktool/scripts/analysis/subject-404.groovy new file mode 100644 index 0000000000..e453172dd6 --- /dev/null +++ b/whelktool/scripts/analysis/subject-404.groovy @@ -0,0 +1,38 @@ +import whelk.util.DocumentUtil +import whelk.util.Statistics + +class Script { + static Statistics s = new Statistics().printOnShutdown() +} + +selectByCollection('bib') { bib -> + try { + process(bib) + } + catch(Exception e) { + System.err.println(e) + e.printStackTrace() + } + +} + +void process(bib) { + def (record, thing) = bib.graph + + Map work = thing['instanceOf'] + + if(!work) { + return + } + + if(work['subject']) { + for (Map subject in (work['subject'] as List)) { + if(subject['@type'] != 'ComplexSubject') { + if (subject['sameAs'] && subject['sameAs'][0] && subject['sameAs'][0]['@id'] && subject['sameAs'][0]['@id'].contains('id.kb.se')) { + Script.s.increment('sameAs', subject['sameAs'][0]['@id'], bib.doc.shortId) + } + } + } + + } +} \ No newline at end of file diff --git a/whelktool/scripts/examples/works.groovy b/whelktool/scripts/analysis/works.groovy similarity index 100% rename from whelktool/scripts/examples/works.groovy rename to whelktool/scripts/analysis/works.groovy diff --git a/whelktool/scripts/examples/works2.groovy b/whelktool/scripts/analysis/works2.groovy similarity index 91% rename from whelktool/scripts/examples/works2.groovy rename to whelktool/scripts/analysis/works2.groovy index 8f31ad43f0..cb160f1ec7 100644 --- a/whelktool/scripts/examples/works2.groovy +++ b/whelktool/scripts/analysis/works2.groovy @@ -69,12 +69,12 @@ private String title(bib) { } private String primaryContributorId(bib) { - def primary = getPathSafe(bib.doc.data, ['@graph', 2, 'contribution'], []).grep{ it['@type'] == "PrimaryContribution"} + def primary = getPathSafe(bib.doc.data, ['@graph', 1, 'instanceOf', 'contribution'], []).grep{ it['@type'] == "PrimaryContribution"} return getPathSafe(primary, [0, 'agent', '@id']) } private List contributorStrings(bib) { - return getPathSafe(bib.asCard(true), ['@graph',2,'contribution'], [])['_str'].grep{it} + return getPathSafe(bib.asCard(true), ['@graph', 1, 'instanceOf', 'contribution'], [])['_str'].grep{it} } private String flatTitle(bib) { diff --git a/whelktool/scripts/examples/contribution-role.groovy b/whelktool/scripts/examples/contribution-role.groovy new file mode 100644 index 0000000000..32a9015d01 --- /dev/null +++ b/whelktool/scripts/examples/contribution-role.groovy @@ -0,0 +1,43 @@ +import whelk.util.DocumentUtil +import datatool.util.Statistics + +Statistics s = new Statistics() +s.printOnShutdown() + +selectByCollection('bib') { bib -> + try { + DocumentUtil.findKey(bib.doc.data, 'role') { Object value, path -> + count(s, value) + } + } + catch(Exception e) { + println(e) + e.printStackTrace() + } +} + + +private String normalize(String s) { + def noise = [",", '"', "'", '[', ']', ',', '.', '.', ':', ';', '-', '(', ')', '-', '–', '+', '!', '?'].collectEntries { [it, ''] } + return s.toLowerCase().replace(noise).trim() +} + +void count(Statistics s, Object role) { + if (role instanceof Map && !role['@id']) { + count1(s, role, 'code') + count1(s, role, 'label') + } + else if (role instanceof String) { + s.increment('string', role.toString()) + } + else if (role instanceof List) { + s.increment('list size', role.size()) + role.each { count(s, it) } + } +} + +void count1(Statistics s, Map thing, String prop) { + if (thing[prop]) { + s.increment(prop, normalize(thing[prop].toString())) + } +} \ No newline at end of file diff --git a/whelktool/scripts/examples/merge-clusters.groovy b/whelktool/scripts/examples/merge-clusters.groovy deleted file mode 100644 index c5af74531a..0000000000 --- a/whelktool/scripts/examples/merge-clusters.groovy +++ /dev/null @@ -1,32 +0,0 @@ -import datatool.util.DisjointSets - -String dir = System.getProperty('clustersDir') -mergeClusters( - new File(dir, 'clusters.tsv'), - new File(dir, 'clusters-merged.tsv')) - -void mergeClusters(File input, File output) throws FileNotFoundException { - DisjointSets sets = new DisjointSets<>() - PrintWriter p = new PrintWriter(output) - - input.eachLine() { - sets.addSet(Arrays.asList(it.split(/[\t ]+/))) - } - - sets.iterateAllSets(new DisjointSets.SetVisitor() { - boolean first = true - @Override - void nextElement(String e) { - if(!first) - print('\t') - p.print(e) - first = false - } - - @Override - void closeSet() { - p.println() - first = true - } - }) -} \ No newline at end of file diff --git a/whelktool/src/main/groovy/datatool/WorkTool.groovy b/whelktool/src/main/groovy/datatool/WorkTool.groovy new file mode 100644 index 0000000000..43754611ca --- /dev/null +++ b/whelktool/src/main/groovy/datatool/WorkTool.groovy @@ -0,0 +1,73 @@ +package datatool + +import groovy.cli.commons.CliBuilder +import datatool.scripts.mergeworks.WorkJob + +/** + + ENV=qa && time java -Xmx4G -Dxl.secret.properties=$HOME/secret.properties-$ENV -cp build/libs/whelktool.jar datatool.WorkTool -s reports/1000-fiction.tsv + + + */ + +class WorkTool { + public static void main(String[] args) { + def cli = new CliBuilder(usage:'whelktool [options]