diff --git a/pom.xml b/pom.xml index 549a98c..b5e4680 100644 --- a/pom.xml +++ b/pom.xml @@ -14,6 +14,7 @@ add-language-of-descriptions identify-main-identifier leave-only-one-identifier + relativize-identifiers diff --git a/relativize-identifiers/README.md b/relativize-identifiers/README.md new file mode 100644 index 0000000..51bdb44 --- /dev/null +++ b/relativize-identifiers/README.md @@ -0,0 +1,10 @@ +relativize-identifiers +========================= + +Part of the ehri-ead-preprocessing tools to normalise EAD files before importing into the EHRI database. + +precondition: The EAD file has absolute identifiers, where unitids in each c-level include the full ID of their parent unitid +postcondition: The EAD file has relative identifiers. + +usage: +java -jar relativize-identifiers/target/relativize-identifiers-1.0-SNAPSHOT-jar-with-dependencies.jar path/to/ead-file.xml diff --git a/relativize-identifiers/pom.xml b/relativize-identifiers/pom.xml new file mode 100644 index 0000000..db4d45d --- /dev/null +++ b/relativize-identifiers/pom.xml @@ -0,0 +1,72 @@ + + + 4.0.0 + + ehri-project-preprocess + ead-preprocessing + 1.0 + + ehri-project + relativize-identifiers + 1.0-SNAPSHOT + relativize-identifiers + http://maven.apache.org + + UTF-8 + + + + junit + junit + 4.10 + test + + + + + stax + stax + 1.2.0 + + + stax + stax-api + 1.0.1 + + + org.apache.commons + commons-io + 1.3.2 + + + + + + + + maven-assembly-plugin + + + + eu.ehri.relativize_identifiers.RelativizeIdentifiers + + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + + diff --git a/relativize-identifiers/src/main/java/eu/ehri/relativize_identifiers/RelativizeIdentifiers.java b/relativize-identifiers/src/main/java/eu/ehri/relativize_identifiers/RelativizeIdentifiers.java new file mode 100644 index 0000000..affd99d --- /dev/null +++ b/relativize-identifiers/src/main/java/eu/ehri/relativize_identifiers/RelativizeIdentifiers.java @@ -0,0 +1,90 @@ +package eu.ehri.relativize_identifiers; + +import 
java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import java.util.LinkedList;
+import java.util.Stack;
+import java.util.regex.Pattern;
+import javax.xml.parsers.FactoryConfigurationError;
+import javax.xml.stream.XMLEventFactory;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLEventWriter;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLOutputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.events.Characters;
+import javax.xml.stream.events.XMLEvent;
+
+
+/**
+ * Command-line tool and library entry point that rewrites the unitid of every
+ * EAD component so that it no longer repeats the identifier of the level that
+ * encloses it.
+ */
+public class RelativizeIdentifiers {
+
+    public final static String SUFFIX = "_relid.xml";
+
+    static XMLEventFactory eventFactory = XMLEventFactory.newInstance();
+    static XMLOutputFactory factory = XMLOutputFactory.newInstance();
+
+    /** Local names of EAD component elements: the unnumbered "c" and the numbered "c01", "c02", ... */
+    private static final Pattern COMPONENT_NAME = Pattern.compile("c(\\d\\d)?");
+
+    /** Separators (whitespace, '-', ':', '_', '/') left in front of an ID once the parent ID is cut off. */
+    private static final Pattern LEADING_SEPARATORS = Pattern.compile("^[\\s\\-:_\\/]*");
+
+    /**
+     * Relativizes the EAD file named by the first argument and writes the result
+     * to a sibling file whose name ends in {@link #SUFFIX}.
+     *
+     * @param args args[0] is the path of the EAD file to read
+     * @throws IllegalArgumentException if no input file is given
+     */
+    public static void main(String[] args) throws XMLStreamException, javax.xml.stream.FactoryConfigurationError, IOException {
+        if (args.length < 1) {
+            throw new IllegalArgumentException("usage: RelativizeIdentifiers <ead-file>");
+        }
+        String eadfile = args[0];
+        String outputfile = outputName(eadfile);
+        // Write UTF-8 explicitly instead of the platform default charset that FileWriter would pick.
+        Writer out = new OutputStreamWriter(new FileOutputStream(outputfile), "UTF-8");
+        try {
+            RelativizeIdentifiers.relativizeIdentifiers(eadfile, out);
+        } finally {
+            out.close();
+        }
+    }
+
+    /**
+     * Derives the output file name: a trailing ".xml" (any case) is replaced by
+     * {@link #SUFFIX}; otherwise the suffix is appended. The result therefore
+     * never equals the input name, so the input file is never overwritten.
+     */
+    private static String outputName(String eadfile) {
+        int extStart = eadfile.length() - ".xml".length();
+        if (extStart >= 0 && eadfile.regionMatches(true, extStart, ".xml", 0, ".xml".length())) {
+            return eadfile.substring(0, extStart) + SUFFIX;
+        }
+        return eadfile + SUFFIX;
+    }
+
+    /**
+     * precondition: The EAD file has absolute identifiers, where unitids in each c-level
+     * include the full ID of their parent unitid
+     * postcondition: The EAD file has relative identifiers.
+     *
+     * Every event of the input is copied to the output unchanged, except the text
+     * of a unitid that starts with the ID of the nearest enclosing level: there the
+     * parent ID and any separators following it are removed. IDs that do not start
+     * with the parent ID, or that would become empty, are left as they are.
+     *
+     * @param eadfile the name of the ead file
+     * @param outputWriter receives the rewritten XML; this method does not call close() on it,
+     *                     the caller remains responsible for closing it
+     * @return always null
+     * @throws javax.xml.stream.XMLStreamException
+     * @throws javax.xml.parsers.FactoryConfigurationError
+     * @throws java.io.IOException
+     */
+    public static String relativizeIdentifiers(String eadfile, Writer outputWriter)
+            throws XMLStreamException, FactoryConfigurationError, IOException {
+
+        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
+        // Ask for adjacent character data as a single event, so an ID is not seen in pieces.
+        inputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
+
+        // One entry per open level, innermost first; null until that level's unitid is seen.
+        // The initial entry stands for everything outside the components (e.g. archdesc).
+        LinkedList<String> levelIds = new LinkedList<String>();
+        levelIds.addFirst(null);
+
+        InputStream input = new FileInputStream(eadfile);
+        try {
+            XMLEventWriter writer = factory.createXMLEventWriter(outputWriter);
+            XMLEventReader reader = inputFactory.createXMLEventReader(input);
+            while (reader.hasNext()) {
+                XMLEvent event = reader.nextEvent();
+                writer.add(event);
+                if (event.isStartElement()) {
+                    String name = event.asStartElement().getName().getLocalPart();
+                    if (isComponent(name)) {
+                        levelIds.addFirst(null);
+                    } else if (name.equals("unitid")) {
+                        // Only look ahead: an event that is not text must stay in the
+                        // reader so the main loop copies it like any other event.
+                        XMLEvent next = reader.peek();
+                        if (next != null && next.isCharacters()) {
+                            Characters text = reader.nextEvent().asCharacters();
+                            String thisId = text.getData();
+                            String newId = relativize(thisId, enclosingId(levelIds));
+                            if (newId.equals(thisId)) {
+                                writer.add(text);
+                            } else {
+                                writer.add(eventFactory.createCharacters(newId));
+                            }
+                            if (thisId.trim().length() > 0) {
+                                // Children are compared against the absolute (original) ID.
+                                levelIds.set(0, thisId);
+                            }
+                        }
+                    }
+                } else if (event.isEndElement()) {
+                    String name = event.asEndElement().getName().getLocalPart();
+                    if (isComponent(name) && levelIds.size() > 1) {
+                        levelIds.removeFirst();
+                    }
+                }
+            }
+            writer.close();
+            reader.close();
+        } finally {
+            input.close();
+        }
+        return null;
+    }
+
+    /** Tells whether the local element name denotes an EAD component level. */
+    private static boolean isComponent(String localName) {
+        return COMPONENT_NAME.matcher(localName).matches();
+    }
+
+    /** Returns the ID of the nearest level around the current one that has an ID, or null if none has. */
+    private static String enclosingId(LinkedList<String> levelIds) {
+        boolean currentLevel = true;
+        for (String id : levelIds) {
+            if (currentLevel) {
+                currentLevel = false;
+            } else if (id != null) {
+                return id;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Removes parentId and the separators following it from the front of id.
+     * Returns id unchanged when there is no parent ID, when id does not start
+     * with it, or when nothing would be left.
+     */
+    private static String relativize(String id, String parentId) {
+        if (parentId == null || parentId.length() == 0 || !id.startsWith(parentId)) {
+            return id;
+        }
+        String relative = LEADING_SEPARATORS.matcher(id.substring(parentId.length())).replaceFirst("");
+        return relative.length() == 0 ? id : relative;
+    }
+}
diff --git a/relativize-identifiers/src/test/java/eu/ehri/relativize_identifiers/RelativizeIdentifiersTest.java b/relativize-identifiers/src/test/java/eu/ehri/relativize_identifiers/RelativizeIdentifiersTest.java
new file mode 100644
index 0000000..03ae828
--- /dev/null
+++ b/relativize-identifiers/src/test/java/eu/ehri/relativize_identifiers/RelativizeIdentifiersTest.java
@@ -0,0 +1,88 @@
+package eu.ehri.relativize_identifiers;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.w3c.dom.Document;
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+import java.io.*;
+import java.net.URISyntaxException;
+import java.net.URL;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author Mike Bryant (http://github.com/mikesname)
+ */
+public class RelativizeIdentifiersTest {
+
+    DocumentBuilder builder;
+    XPath xpath;
+
+    @Before
+    public void setUp() throws Exception {
+        builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+        xpath = XPathFactory.newInstance().newXPath();
+    }
+
+    @Test
+    public void testRelativizeIdentifiersWithSpaces() throws Exception {
+        Document outDoc = getOutputDocument("/absoluteids-spaces.xml");
+        assertEquals("1", 
xpath.compile("/ead/archdesc/dsc/c01/did/unitid").evaluate(outDoc));
+        assertEquals("1", xpath.compile("/ead/archdesc/dsc/c01/c02[1]/did/unitid").evaluate(outDoc));
+        assertEquals("1", xpath.compile("/ead/archdesc/dsc/c01/c02[1]/c03/did/unitid").evaluate(outDoc));
+        assertEquals("2", xpath.compile("/ead/archdesc/dsc/c01/c02[2]/did/unitid").evaluate(outDoc));
+        assertEquals("1", xpath.compile("/ead/archdesc/dsc/c01/c02[2]/c03/did/unitid").evaluate(outDoc));
+        assertEquals("2 root 1 2", xpath.compile("/ead/archdesc/dsc/c01/c02[2]/c03[2]/did/unitid").evaluate(outDoc));
+    }
+
+    @Test
+    public void testRelativizeIdentifiersWithHyphens() throws Exception {
+        Document outDoc = getOutputDocument("/absoluteids-hyphens.xml");
+        assertEquals("1", xpath.compile("/ead/archdesc/dsc/c01/did/unitid").evaluate(outDoc));
+        assertEquals("1", xpath.compile("/ead/archdesc/dsc/c01/c02[1]/did/unitid").evaluate(outDoc));
+        assertEquals("1", xpath.compile("/ead/archdesc/dsc/c01/c02[1]/c03/did/unitid").evaluate(outDoc));
+        assertEquals("2", xpath.compile("/ead/archdesc/dsc/c01/c02[2]/did/unitid").evaluate(outDoc));
+        assertEquals("1", xpath.compile("/ead/archdesc/dsc/c01/c02[2]/c03/did/unitid").evaluate(outDoc));
+    }
+
+    @Test
+    public void testRelativizeIdentifiersWithSlashes() throws Exception {
+        Document outDoc = getOutputDocument("/wp2_jmp_ead.xml");
+        assertEquals("COLLECTION.JMP.SHOAH/T", xpath.compile("/ead/archdesc/did/unitid").evaluate(outDoc));
+        assertEquals("2", xpath.compile("/ead/archdesc/dsc/c01[1]/did/unitid").evaluate(outDoc));
+        assertEquals("A", xpath.compile("/ead/archdesc/dsc/c01[1]/c02[1]/did/unitid").evaluate(outDoc));
+        assertEquals("1", xpath.compile("/ead/archdesc/dsc/c01[1]/c02[1]/c03[1]/did/unitid").evaluate(outDoc));
+        assertEquals("a", xpath.compile("/ead/archdesc/dsc/c01[1]/c02[1]/c03[1]/c04[1]/did/unitid").evaluate(outDoc));
+        assertEquals("028", xpath.compile("/ead/archdesc/dsc/c01[1]/c02[1]/c03[1]/c04[1]/c05[1]/did/unitid").evaluate
+                (outDoc));
+    }
+
+    @Test
+    public void testRelativizeIdentifiersAlreadyRelative() throws Exception {
+        Document outDoc = getOutputDocument("/relativeids.xml");
+        assertEquals("c1", xpath.compile("/ead/archdesc/dsc/c01/did/unitid").evaluate(outDoc));
+        assertEquals("c2-1", xpath.compile("/ead/archdesc/dsc/c01/c02[1]/did/unitid").evaluate(outDoc));
+        assertEquals("c3-1", xpath.compile("/ead/archdesc/dsc/c01/c02[1]/c03/did/unitid").evaluate(outDoc));
+        assertEquals("c2-2", xpath.compile("/ead/archdesc/dsc/c01/c02[2]/did/unitid").evaluate(outDoc));
+        assertEquals("c3-1", xpath.compile("/ead/archdesc/dsc/c01/c02[2]/c03/did/unitid").evaluate(outDoc));
+    }
+
+    /** Runs the tool on the named classpath resource and parses what it wrote into a DOM document. */
+    private Document getOutputDocument(String resourceName) throws URISyntaxException, XMLStreamException, IOException, SAXException {
+        URL resource = RelativizeIdentifiersTest.class.getResource(resourceName);
+        String path = new File(resource.toURI()).getAbsolutePath();
+        StringWriter stringWriter = new StringWriter();
+        RelativizeIdentifiers.relativizeIdentifiers(path, stringWriter);
+        // Parse from characters: a round trip through getBytes() would depend on the platform charset.
+        StringReader reader = new StringReader(stringWriter.toString());
+        return builder.parse(new org.xml.sax.InputSource(reader));
+    }
+}
diff --git a/relativize-identifiers/src/test/resources/absoluteids-hyphens.xml b/relativize-identifiers/src/test/resources/absoluteids-hyphens.xml new file mode 100644 index 0000000..4e087d6 --- /dev/null +++ b/relativize-identifiers/src/test/resources/absoluteids-hyphens.xml @@ -0,0 +1,37 @@ + + + + + + + root + + + + + root-1 + + + + root-1-1 + + + + root-1-1-1 + + + + + + root-1-2 + + + + root-1-2-1 + + + + + + + \ No newline at end of file diff --git a/relativize-identifiers/src/test/resources/absoluteids-spaces.xml b/relativize-identifiers/src/test/resources/absoluteids-spaces.xml new file mode 100644 index 0000000..a80b2ef --- /dev/null +++ b/relativize-identifiers/src/test/resources/absoluteids-spaces.xml @@ -0,0 +1,44 @@ + + + + + + + root + + + + + 
root 1 + + + + root 1 1 + + + + root 1 1 1 + + + + + + root 1 2 + + + + root 1 2 1 + + + + + + root 1 2 2 root 1 2 + + + + + + + \ No newline at end of file diff --git a/relativize-identifiers/src/test/resources/relativeids.xml b/relativize-identifiers/src/test/resources/relativeids.xml new file mode 100644 index 0000000..69187c1 --- /dev/null +++ b/relativize-identifiers/src/test/resources/relativeids.xml @@ -0,0 +1,37 @@ + + + + + + + root + + + + + c1 + + + + c2-1 + + + + c3-1 + + + + + + c2-2 + + + + c3-1 + + + + + + + \ No newline at end of file diff --git a/relativize-identifiers/src/test/resources/wp2_jmp_ead.xml b/relativize-identifiers/src/test/resources/wp2_jmp_ead.xml new file mode 100644 index 0000000..a46bf1d --- /dev/null +++ b/relativize-identifiers/src/test/resources/wp2_jmp_ead.xml @@ -0,0 +1,95 @@ + + + + + + COLLECTION.JMP.SHOAH/T + + + Terezín/Theresienstadt + Shoah History Department, Jewish Museum in Prague + + + 13. 12. 2013 + + + + + + Terezín/Theresienstadt + COLLECTION.JMP.SHOAH/T + + +

The archive records from the Terezín ghetto (24 November 1941 – 8 May 1945, and + from the period after the liberation) are preserved only in fragments.

+
+ + + + Documents + COLLECTION.JMP.SHOAH/T/2 + + + + Documents from the period of occupation + COLLECTION.JMP.SHOAH/T/2/A + + + + Preparations for implementing the plan for the Final + Solution to the Jewish Question in the Protectorate of + Bohemia and Moravia. + COLLECTION.JMP.SHOAH/T/2/A/1 + + + + Considerations about setting up a ghetto for Jews + in Protectorate of Bohemia and Moravia + COLLECTION.JMP.SHOAH/T/2/A/1a + + + + Statistics relating to Jewish women aged + 20-45 years + COLLECTION.JMP.SHOAH/T/2/A/1a/028 + + + + Statistics relating to Jewish women + aged 20-45 years in the Protectorate + DOCUMENT.JMP.SHOAH/T/2/A/1a/028 + September 21 1941 + + German + + + 2 Folio; 2 Pages + + + + + + Keywords + Women + + + Places + Praha + Brno + + + + + + + + + +
+
\ No newline at end of file