From 52e01625b450691e875db4fcf93568ce0c3746f8 Mon Sep 17 00:00:00 2001 From: c_ringwald <42188996+datalogism@users.noreply.github.com> Date: Fri, 9 Dec 2022 11:26:12 +0100 Subject: [PATCH 1/2] Add files via upload --- .../mappings/SisterProjectExtractor.scala | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala new file mode 100644 index 000000000..675dc2044 --- /dev/null +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala @@ -0,0 +1,222 @@ +package org.dbpedia.extraction.mappings + +import org.dbpedia.extraction.config.provenance.DBpediaDatasets +import org.dbpedia.extraction.transform.Quad + +import collection.mutable.HashSet +import org.dbpedia.extraction.ontology.datatypes.{Datatype, DimensionDatatype} +import org.dbpedia.extraction.wikiparser._ +import org.dbpedia.extraction.dataparser._ +import org.dbpedia.extraction.util.RichString.wrapString +import org.dbpedia.extraction.ontology.Ontology +import org.dbpedia.extraction.util._ +import org.dbpedia.extraction.config.mappings.InfoboxExtractorConfig + +import scala.collection.mutable.ArrayBuffer +import org.dbpedia.extraction.config.dataparser.DataParserConfig +import org.dbpedia.iri.UriUtils + +import scala.language.reflectiveCalls + +/** + * This extractor extracts all properties from all infoboxes. + * Extracted information is represented using properties in the http://xx.dbpedia.org/property/ + * namespace (where xx is the language code). + * The names of the these properties directly reflect the name of the Wikipedia infobox property. + * Property names are not cleaned or merged. 
+ * Property types are not part of a subsumption hierarchy and there is no consistent ontology for the infobox dataset. + * The infobox extractor performs only a minimal amount of property value clean-up, e.g., by converting a value like “June 2009” to the XML Schema format “2009–06”. + * You should therefore use the infobox dataset only if your application requires complete coverage of all Wikipeda properties and you are prepared to accept relatively noisy data. + */ + +class SisterProjectExtractor( + context : { + def ontology : Ontology + def language : Language + def redirects : Redirects + } + ) + extends PageNodeExtractor +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Configuration + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + private val ontology = context.ontology + + private val language = context.language + + private val wikiCode = language.wikiCode + + + private val ignoreTemplates = InfoboxExtractorConfig.ignoreTemplates + + private val ignoreTemplatesRegex = InfoboxExtractorConfig.ignoreTemplatesRegex + + private val ignoreProperties = InfoboxExtractorConfig.ignoreProperties + + private val rdfLangStrDt = ontology.datatypes("rdf:langString") + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Regexes + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // TODO: i18n + + private val SplitWordsRegex = InfoboxExtractorConfig.SplitWordsRegex + + private val TrailingNumberRegex = InfoboxExtractorConfig.TrailingNumberRegex + + private val splitPropertyNodeRegexInfobox = if (DataParserConfig.splitPropertyNodeRegexInfobox.contains(wikiCode)) + DataParserConfig.splitPropertyNodeRegexInfobox.get(wikiCode).get + else 
DataParserConfig.splitPropertyNodeRegexInfobox.get("en").get + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Parsers + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + private val sameAsProperty = context.ontology.properties("owl:sameAs") + private val regexMap = Map( + "fr"->List("autres projet.*".r), + "en"->List("sister project.*".r), + "de"->List("schwesterprojekte.*".r) + + ) + val currentRegexList = regexMap(wikiCode) + + private val objectParser = new ObjectParser(context, true) + //USELESS ? + private val linkParser = new LinkParser(true) + + private val mapAbrev = Map( + "c" -> "commons", + "wikt" -> "wiktionary", + "n" -> "wikinews", + "voy" -> "wikivoyage", + "q" -> "wikiquote", + "s" -> "wikisource", + "b" -> "wikibooks", + "v" -> "wikiversity", + "wikispecies" -> "species" + ) + private val mapProjects = Map( + "commons" -> "http://commons.dbpedia.org/resource/", + "wiktionary" -> "https://wiktionary.org/wiki/", + "wikinews" -> "https://wikinews.org/wiki/", + "wikivoyage" -> "https://wikivoyage.org/wiki/", + "wikiquote" -> "https://wikiquote.org/wiki/", + "wikisource" -> "https://wikisource.org/wiki/", + "wikibooks" -> "https://wikibooks.org/wiki/", + "wikiversity" -> "https://wikibooks.org/wiki/", + "species" -> "https://species.wikimedia.org/wiki/" + ) + + override val datasets = Set(DBpediaDatasets.SisterProjectLink) + + + override def extract(node : PageNode, subjectUri : String) : Seq[Quad] = + { + if(node.title.namespace != Namespace.Main && !ExtractorUtils.titleContainsCommonsMetadata(node.title)) return Seq.empty + val quads = new ArrayBuffer[Quad]() + + /** Retrieve all templates on the page which are not ignored */ + for { template <- InfoboxExtractor.collectTemplates(node) + resolvedTitle = context.redirects.resolve(template.title).decoded.toLowerCase + if !ignoreTemplates.contains(resolvedTitle) + 
if !ignoreTemplatesRegex.exists(regex => regex.unapplySeq(resolvedTitle).isDefined) + if currentRegexList.exists(regex => regex.findFirstMatchIn(resolvedTitle).isDefined) + } + { + val propertyList = template.children.filterNot(property => ignoreProperties.get(wikiCode).getOrElse(ignoreProperties("en")).contains(property.key.toLowerCase)) + + for(property <- propertyList; if (!property.key.forall(_.isDigit))) { + // TODO clean HTML + + val cleanedPropertyNode = NodeUtil.removeParentheses(property) + + val splitPropertyNodes = NodeUtil.splitPropertyNode(cleanedPropertyNode, splitPropertyNodeRegexInfobox) + + //for(splitNode <- splitPropertyNodes; pr <- extractValue(splitNode); if pr.unit.nonEmpty) + //sh: removed pr.unit.nonEmpty as it kicked out all objectproperty triples from wikilinks, + // didn't test for further side-effects seems to work + for(splitNode <- splitPropertyNodes; pr <- extractValue(splitNode)) + { + val propertyUri = getPropertyUri(property.key) + try + { + + if (mapProjects.contains(property.key)) { + val value = mapProjects(property.key).replace(property.key, language.wikiCode + "." + property.key) + WikiUtil.wikiEncode(pr.value) + quads += new Quad(language, DBpediaDatasets.SisterProjectLink, subjectUri, sameAsProperty, value, splitNode.sourceIri, null) + } + if (mapAbrev.contains(property.key)) { + val keyProj=mapAbrev(property.key) + val value = mapProjects(keyProj).replace(keyProj, language.wikiCode + "." + keyProj) + WikiUtil.wikiEncode(pr.value) + quads += new Quad(language, DBpediaDatasets.SisterProjectLink, subjectUri, sameAsProperty, value, splitNode.sourceIri, null) + } + } + catch + { + case ex : IllegalArgumentException => println(ex) + } + } + } + //} + } + + quads + } + + private def extractValue(node : PropertyNode) : List[ParseResult[String]] = + { + // TODO don't convert to SI units (what happens to {{convert|25|kg}} ?) 
+ // extractUnitValue(node).foreach(result => return List(result)) + // extractDates(node) match + //{ + // case dates if dates.nonEmpty => return dates + // case _ => + //} + // extractSingleCoordinate(node).foreach(result => return List(result)) + //extractNumber(node).foreach(result => return List(result)) + //extractRankNumber(node).foreach(result => return List(result)) + extractLinks(node) match + { + case links if links.nonEmpty => { return links} + case _ => + } + StringParser.parse(node).map(value => ParseResult(value.value, None, Some(rdfLangStrDt))).toList + } + + private def extractLinks(node : PropertyNode) : List[ParseResult[String]] = + { + val splitNodes = NodeUtil.splitPropertyNode(node, """\s*\W+\s*""") + + splitNodes.flatMap(splitNode => objectParser.parse(splitNode)) match + { + // TODO: explain why we check links.size == splitNodes.size + case links if links.size == splitNodes.size => return links + case _ => List.empty + } + + splitNodes.flatMap(splitNode => linkParser.parse(splitNode)) match + { + // TODO: explain why we check links.size == splitNodes.size + case links if links.size == splitNodes.size => links.map(x => UriUtils.cleanLink(x.value)).collect{case Some(link) => ParseResult(link)} + case _ => List.empty + } + } + + private def getPropertyUri(key : String) : String = + { + // convert property key to camelCase + var result = key.toLowerCase(language.locale).trim + result = result.toCamelCase(SplitWordsRegex, language.locale) + + // Rename Properties like LeaderName1, LeaderName2, ... 
to LeaderName + result = TrailingNumberRegex.replaceFirstIn(result, "") + + result = WikiUtil.cleanSpace(result) + + language.propertyUri.append(result) + } + + +} \ No newline at end of file From e10a97a1347972fcf9820e934cd36b92be852cbd Mon Sep 17 00:00:00 2001 From: celian_ringwald Date: Fri, 9 Dec 2022 16:37:55 +0100 Subject: [PATCH 2/2] Add new extractor for SisterProjects --- .../main/resources/datasetdefinitions.json | 6 + .../config/provenance/DBpediaDatasets.scala | 1 + .../mappings/SisterProjectExtractor.scala | 190 ++++++++++++++++++ 3 files changed, 197 insertions(+) mode change 100644 => 100755 core/src/main/resources/datasetdefinitions.json mode change 100644 => 100755 core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala create mode 100644 core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala diff --git a/core/src/main/resources/datasetdefinitions.json b/core/src/main/resources/datasetdefinitions.json old mode 100644 new mode 100755 index 521f57341..4a63b3f4c --- a/core/src/main/resources/datasetdefinitions.json +++ b/core/src/main/resources/datasetdefinitions.json @@ -461,6 +461,12 @@ "traits":"LinkedData, Published", "desc": "Dataset containing links between a language-locale DBpedia instance and DBpedia Commons.", "defaultgraph": "namespace" + }, + "sisterprojects_page_links": { + "name": "Sister Projects Links", + "traits":"LinkedData, Published", + "desc": "Dataset containing links between a language-locale DBpedia instance and Sister projects.", + "defaultgraph": "namespace" } }, "wikidata":{ diff --git a/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala b/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala old mode 100644 new mode 100755 index a040316c5..77ebbff82 --- a/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala +++ 
b/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala @@ -230,6 +230,7 @@ object DBpediaDatasets extends java.io.Serializable val FreebaseLinks: Dataset = datasets("freebase_links") val GeonamesLinks: Dataset = datasets("geonames_links") val CommonsLink: Dataset = datasets("commons_page_links") + val SisterProjectLink: Dataset = datasets("sisterprojects_page_links") /** diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala new file mode 100644 index 000000000..6e253c79c --- /dev/null +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala @@ -0,0 +1,190 @@ +package org.dbpedia.extraction.mappings + +import org.dbpedia.extraction.config.provenance.DBpediaDatasets +import org.dbpedia.extraction.transform.Quad +import org.dbpedia.extraction.wikiparser._ +import org.dbpedia.extraction.dataparser._ +import org.dbpedia.extraction.ontology.Ontology +import org.dbpedia.extraction.util._ +import org.dbpedia.extraction.config.mappings.InfoboxExtractorConfig + +import scala.collection.mutable.ArrayBuffer +import org.dbpedia.extraction.config.dataparser.DataParserConfig +import org.dbpedia.iri.UriUtils + +import scala.language.reflectiveCalls + +/** + * This extractor extracts all properties from all infoboxes. + * Extracted information is represented using properties in the http://xx.dbpedia.org/property/ + * namespace (where xx is the language code). + * The names of the these properties directly reflect the name of the Wikipedia infobox property. + * Property names are not cleaned or merged. + * Property types are not part of a subsumption hierarchy and there is no consistent ontology for the infobox dataset. + * The infobox extractor performs only a minimal amount of property value clean-up, e.g., by converting a value like “June 2009” to the XML Schema format “2009–06”. 
+ * You should therefore use the infobox dataset only if your application requires complete coverage of all Wikipeda properties and you are prepared to accept relatively noisy data. + */ + +class SisterProjectExtractor( + context : { + def ontology : Ontology + def language : Language + def redirects : Redirects + } + ) + extends PageNodeExtractor +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Configuration + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + private val ontology = context.ontology + + private val language = context.language + + private val wikiCode = language.wikiCode + + + private val ignoreTemplates = InfoboxExtractorConfig.ignoreTemplates + + private val ignoreTemplatesRegex = InfoboxExtractorConfig.ignoreTemplatesRegex + + private val ignoreProperties = InfoboxExtractorConfig.ignoreProperties + + private val rdfLangStrDt = ontology.datatypes("rdf:langString") + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Regexes + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + private val splitPropertyNodeRegexInfobox = if (DataParserConfig.splitPropertyNodeRegexInfobox.contains(wikiCode)) + DataParserConfig.splitPropertyNodeRegexInfobox.get(wikiCode).get + else DataParserConfig.splitPropertyNodeRegexInfobox.get("en").get + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Parsers + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + private val sameAsProperty = context.ontology.properties("owl:sameAs") + + /// NEED TO BE EXTENDED TO OTHERS LANGUAGES + private val regexMap = Map( + "fr"->List("autres projet.*".r), + 
"en"->List("sister project.*".r),
+    "de"->List("schwesterprojekte.*".r)
+
+  )
+  private val currentRegexList = regexMap(wikiCode)
+
+  private val objectParser = new ObjectParser(context, true)
+  //USELESS ?
+  private val linkParser = new LinkParser(true)
+
+  private val mapAbrev = Map(
+    "c" -> "commons",
+    "wikt" -> "wiktionary",
+    "n" -> "wikinews",
+    "voy" -> "wikivoyage",
+    "q" -> "wikiquote",
+    "s" -> "wikisource",
+    "b" -> "wikibooks",
+    "v" -> "wikiversity",
+    "wikispecies" -> "species"
+  )
+  private val mapProjects = Map(
+    "commons" -> "http://commons.dbpedia.org/resource/",
+    "wiktionary" -> "https://wiktionary.org/wiki/",
+    "wikinews" -> "https://wikinews.org/wiki/",
+    "wikivoyage" -> "https://wikivoyage.org/wiki/",
+    "wikiquote" -> "https://wikiquote.org/wiki/",
+    "wikisource" -> "https://wikisource.org/wiki/",
+    "wikibooks" -> "https://wikibooks.org/wiki/",
+    "wikiversity" -> "https://wikiversity.org/wiki/", // fix: was wikibooks.org — wikiversity links were emitted as wikibooks links
+    "species" -> "https://species.wikimedia.org/wiki/"
+  )
+
+  override val datasets = Set(DBpediaDatasets.SisterProjectLink)
+
+
+  override def extract(node : PageNode, subjectUri : String) : Seq[Quad] =
+  {
+    if(node.title.namespace != Namespace.Main && !ExtractorUtils.titleContainsCommonsMetadata(node.title)) return Seq.empty
+    val quads = new ArrayBuffer[Quad]()
+
+    /** Retrieve all templates on the page which are not ignored */
+    for { template <- InfoboxExtractor.collectTemplates(node)
+      resolvedTitle = context.redirects.resolve(template.title).decoded.toLowerCase
+      if !ignoreTemplates.contains(resolvedTitle)
+      if !ignoreTemplatesRegex.exists(regex => regex.unapplySeq(resolvedTitle).isDefined)
+      if currentRegexList.exists(regex => regex.findFirstMatchIn(resolvedTitle).isDefined)
+    }
+    {
+      val propertyList = template.children.filterNot(property => ignoreProperties.get(wikiCode).getOrElse(ignoreProperties("en")).contains(property.key.toLowerCase))
+
+      for(property <- propertyList; if !property.key.forall(_.isDigit)) {
+        // TODO clean HTML
+
+        val 
cleanedPropertyNode = NodeUtil.removeParentheses(property)
+
+        val splitPropertyNodes = NodeUtil.splitPropertyNode(cleanedPropertyNode, splitPropertyNodeRegexInfobox)
+
+        for(splitNode <- splitPropertyNodes; pr <- extractValue(splitNode))
+        {
+
+          try
+          {
+
+            if (mapProjects.contains(property.key)) {
+              val value = mapProjects(property.key).replace(property.key, language.wikiCode + "." + property.key) + WikiUtil.wikiEncode(pr.value)
+              quads += new Quad(language, DBpediaDatasets.SisterProjectLink, subjectUri, sameAsProperty, value, splitNode.sourceIri, null)
+            }
+            if (mapAbrev.contains(property.key)) {
+              val keyProj=mapAbrev(property.key)
+              val value = mapProjects(keyProj).replace(keyProj, language.wikiCode + "." + keyProj) + WikiUtil.wikiEncode(pr.value)
+              quads += new Quad(language, DBpediaDatasets.SisterProjectLink, subjectUri, sameAsProperty, value, splitNode.sourceIri, null)
+            }
+          }
+          catch
+          {
+            case ex : IllegalArgumentException => println(ex)
+          }
+        }
+      }
+      //}
+    }
+
+    quads
+  }
+
+  private def extractValue(node : PropertyNode) : List[ParseResult[String]] =
+  {
+
+    extractLinks(node) match
+    {
+      case links if links.nonEmpty => return links
+      case _ =>
+    }
+    StringParser.parse(node).map(value => ParseResult(value.value, None, Some(rdfLangStrDt))).toList
+  }
+
+  private def extractLinks(node : PropertyNode) : List[ParseResult[String]] =
+  {
+    val splitNodes = NodeUtil.splitPropertyNode(node, """\s*\W+\s*""")
+
+    splitNodes.flatMap(splitNode => objectParser.parse(splitNode)) match
+    {
+      // TODO: explain why we check links.size == splitNodes.size
+      case links if links.size == splitNodes.size => return links
+      case _ => // fix: non-exhaustive match threw scala.MatchError; fall through to the LinkParser below
+    }
+
+    splitNodes.flatMap(splitNode => linkParser.parse(splitNode)) match
+    {
+      // TODO: explain why we check links.size == splitNodes.size
+      case links if links.size == splitNodes.size => links.map(x => UriUtils.cleanLink(x.value)).collect{case Some(link) => ParseResult(link)}
+      case _ => List.empty
+    }
+  }
+
+
+
+}
\ No newline at end of file