From 339a568c2abb02754ccb375f7b623bf6eb50c7ca Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Sat, 24 Jan 2026 17:57:43 +0000 Subject: [PATCH 1/3] updated chnages --- .../extraction/mappings/GenderExtractor.scala | 175 +++++++++++------- 1 file changed, 103 insertions(+), 72 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index 32b1221e59..0d58060332 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -1,88 +1,119 @@ package org.dbpedia.extraction.mappings -import org.dbpedia.extraction.config.provenance.DBpediaDatasets -import org.dbpedia.extraction.transform.Quad -import org.dbpedia.extraction.wikiparser._ import org.dbpedia.extraction.config.mappings.GenderExtractorConfig +import org.dbpedia.extraction.config.provenance.DBpediaDatasets import org.dbpedia.extraction.ontology.Ontology +import org.dbpedia.extraction.ontology.datatypes.Datatype +import org.dbpedia.extraction.transform.Quad import org.dbpedia.extraction.util.Language +import org.dbpedia.extraction.wikiparser._ import util.matching.Regex -import org.dbpedia.extraction.ontology.datatypes.Datatype import scala.language.reflectiveCalls - /** - * Extracts the grammatical gender of people using a heuristic. + * Extracts the grammatical gender of people using a pronoun-based heuristic. */ -class GenderExtractor( - context : { - def mappings : Mappings - def ontology : Ontology - def language : Language - def redirects : Redirects - } -) -extends MappingExtractor(context) -{ - private val language = context.language.wikiCode - - private val pronounMap: Map[String, String] = GenderExtractorConfig.pronounsMap(language) - - // FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.FOAF) - private val genderProperty = "http://xmlns.com/foaf/0.1/gender" - // FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.RDF) - private val typeProperty = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" - // FIXME: don't use string constant, use context.ontology (or at least DBpediaNamespace.ONTOLOGY) - private val personUri = "http://dbpedia.org/ontology/Person" +class GenderExtractor( + context: { + def mappings: Mappings + def ontology: Ontology + def language: Language + def redirects: Redirects + } +) extends MappingExtractor(context) { + + /** Language code (en, de, fr, etc.) */ + private val language: String = + context.language.wikiCode + /** Pronoun → gender map (from config) */ + private val pronounMap: Map[String, String] = + GenderExtractorConfig.pronounsMap(language) + + /** Ontology-based properties & classes */ + private val genderProperty = + context.ontology.properties("foaf:gender") + + private val typeProperty = + context.ontology.properties("rdf:type") + + private val personClass = + context.ontology.classes("Person") + + private val langStringDatatype = + new Datatype("rdf:langString") override val datasets = Set(DBpediaDatasets.Genders) - override def extract(node : PageNode, subjectUri : String) : Seq[Quad] = - { - // apply mappings - // FIXME: To find out if it's a person, we extract all mapped properties a second time and throw them away. - // Find a better solution. For example: Make sure that this extractor runs after the - // MappingExtractor. In the MappingExtractor, set the page type as an attriute. - // Even better: in the first extraction pass, extract all types. Use them in the second pass. - val mappingGraph = super.extract(node, subjectUri) - - // if this page is mapped onto Person - if (mappingGraph.exists(q => q.predicate == typeProperty && q.value == personUri)) - { - // get the page text - val wikiText: String = node.toWikiText - - // count gender pronouns - var genderCounts: Map[String, Int] = Map() - for ((pronoun, gender) <- pronounMap) - { - val regex = new Regex("\\W" + pronoun + "\\W") - val count = regex.findAllIn(wikiText).size - val oldCount = genderCounts.getOrElse(gender, 0) - genderCounts = genderCounts.updated(gender, oldCount + count) - } - - // get maximum gender - var maxGender = "" - var maxCount = 0 - var secondCount = 0.0 - for ((gender, count) <- genderCounts) - { - if (count > maxCount) - { - secondCount = maxCount.toDouble - maxCount = count - maxGender = gender - } - } - - // output triple for maximum gender - if (maxGender != "" && maxCount > GenderExtractorConfig.minCount && maxCount/secondCount > GenderExtractorConfig.minDifference) - { - return Seq(new Quad(context.language, DBpediaDatasets.Genders, subjectUri, genderProperty, maxGender, node.sourceIri, new Datatype("rdf:langString"))) - } + override def extract(node: PageNode, subjectUri: String): Seq[Quad] = { + + /** First pass: extract mappings to detect rdf:type */ + val mappingGraph: Seq[Quad] = + super.extract(node, subjectUri) + + /** Check if entity is a dbo:Person */ + val isPerson: Boolean = + mappingGraph.exists(q => + q.predicate == typeProperty.uri && + q.value == personClass.uri + ) + + if (!isPerson) return Seq.empty + + /** Get full wiki text */ + val wikiText: String = + node.toWikiText + + /** Count pronouns by gender */ + var genderCounts: Map[String, Int] = + Map.empty.withDefaultValue(0) + + for ((pronoun, gender) <- pronounMap) { + val regex = + new Regex("(?i)\\b" + Regex.quote(pronoun) + "\\b") + + val count = + regex.findAllIn(wikiText).size + + genderCounts = + genderCounts.updated(gender, genderCounts(gender) + count) } - Seq.empty - } + if (genderCounts.isEmpty) return Seq.empty + + /** Find dominant gender */ + val sorted = + genderCounts.toSeq.sortBy(-_._2) + + val (maxGender, maxCount) = + sorted.head + + val secondCount: Double = + if (sorted.size > 1) sorted(1)._2.toDouble else 0.0 + /** Avoid division-by-zero */ + val differenceOk: Boolean = + secondCount == 0.0 || + (maxCount.toDouble / secondCount) > + GenderExtractorConfig.minDifference + + /** Threshold checks */ + if ( + maxGender.nonEmpty && + maxCount > GenderExtractorConfig.minCount && + differenceOk + ) { + Seq( + new Quad( + context.language, + DBpediaDatasets.Genders, + subjectUri, + genderProperty, + maxGender, + node.sourceIri, + langStringDatatype + ) + ) + } else { + Seq.empty + } + } } From 70fbc63222404a0af22d3d529845d3a9cc554c1a Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Tue, 27 Jan 2026 22:22:02 +0530 Subject: [PATCH 2/3] Fix template text extraction for lang, native name, and Nihongo templates --- .../src/main/resources/templatetransform.json | 14 ++++++++++++- .../TemplateTransformParserTest.scala | 20 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/core/src/main/resources/templatetransform.json b/core/src/main/resources/templatetransform.json index fce2421f03..2d075c2515 100644 --- a/core/src/main/resources/templatetransform.json +++ b/core/src/main/resources/templatetransform.json @@ -30,7 +30,19 @@ }, "Lang":{ "transformer":"textNode", - "replace": "
$(3||)
" + "replace": "$(2||)" + }, + "Native name|native_name":{ + "transformer":"textNode", + "replace": "$(2||)" + }, + "Nihongo2":{ + "transformer":"textNode", + "replace": "$(1||)" + }, + "Nihongo":{ + "transformer":"textNode", + "replace": "$(2||)" }, "Marriage":{ "transformer":"extractChildren", diff --git a/core/src/test/scala/org/dbpedia/extraction/wikiparser/TemplateTransformParserTest.scala b/core/src/test/scala/org/dbpedia/extraction/wikiparser/TemplateTransformParserTest.scala index af95a30f41..dcb923d34d 100644 --- a/core/src/test/scala/org/dbpedia/extraction/wikiparser/TemplateTransformParserTest.scala +++ b/core/src/test/scala/org/dbpedia/extraction/wikiparser/TemplateTransformParserTest.scala @@ -53,6 +53,26 @@ class TemplateTransformParserTest extends FlatSpec with Matchers parse("en", "{{url|https://www.dji.com DJI.com}}") should be (Some("[https://www.dji.com]")) } + it should "extract text from {{lang|nap|Abbrùzzu}}" in + { + parse("en", "{{lang|nap|Abbrùzzu}}") should be (Some("Abbrùzzu")) + } + + it should "extract text from {{native name|nap|Abbrùzze}}" in + { + parse("en", "{{native name|nap|Abbrùzze}}") should be (Some("Abbrùzze")) + } + + it should "extract text from {{Nihongo2|東京都}}" in + { + parse("en", "{{Nihongo2|東京都}}") should be (Some("東京都")) + } + + it should "extract text from {{Nihongo|Tokyo|東京|Tōkyō}}" in + { + parse("en", "{{Nihongo|Tokyo|東京|Tōkyō}}") should be (Some("東京")) + } + private val wikiParser = WikiParser.getInstance() From 10f785844b6b01f155b8f745fef147a42a963e4a Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Tue, 27 Jan 2026 22:45:08 +0530 Subject: [PATCH 3/3] Revert GenderExtractor.scala to upstream version --- .../extraction/mappings/GenderExtractor.scala | 175 +++++++----------- 1 file changed, 72 insertions(+), 103 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index 0d58060332..32b1221e59 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -1,119 +1,88 @@ package org.dbpedia.extraction.mappings -import org.dbpedia.extraction.config.mappings.GenderExtractorConfig import org.dbpedia.extraction.config.provenance.DBpediaDatasets -import org.dbpedia.extraction.ontology.Ontology -import org.dbpedia.extraction.ontology.datatypes.Datatype import org.dbpedia.extraction.transform.Quad -import org.dbpedia.extraction.util.Language import org.dbpedia.extraction.wikiparser._ +import org.dbpedia.extraction.config.mappings.GenderExtractorConfig +import org.dbpedia.extraction.ontology.Ontology +import org.dbpedia.extraction.util.Language import util.matching.Regex +import org.dbpedia.extraction.ontology.datatypes.Datatype import scala.language.reflectiveCalls + /** - * Extracts the grammatical gender of people using a pronoun-based heuristic. + * Extracts the grammatical gender of people using a heuristic. */ -class GenderExtractor( - context: { - def mappings: Mappings - def ontology: Ontology - def language: Language - def redirects: Redirects - } -) extends MappingExtractor(context) { - - /** Language code (en, de, fr, etc.) */ - private val language: String = - context.language.wikiCode - /** Pronoun → gender map (from config) */ - private val pronounMap: Map[String, String] = - GenderExtractorConfig.pronounsMap(language) - - /** Ontology-based properties & classes */ - private val genderProperty = - context.ontology.properties("foaf:gender") - - private val typeProperty = - context.ontology.properties("rdf:type") - - private val personClass = - context.ontology.classes("Person") - - private val langStringDatatype = - new Datatype("rdf:langString") +class GenderExtractor( + context : { + def mappings : Mappings + def ontology : Ontology + def language : Language + def redirects : Redirects + } +) +extends MappingExtractor(context) +{ + private val language = context.language.wikiCode + + private val pronounMap: Map[String, String] = GenderExtractorConfig.pronounsMap(language) + + // FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.FOAF) + private val genderProperty = "http://xmlns.com/foaf/0.1/gender" + // FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.RDF) + private val typeProperty = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" + // FIXME: don't use string constant, use context.ontology (or at least DBpediaNamespace.ONTOLOGY) + private val personUri = "http://dbpedia.org/ontology/Person" override val datasets = Set(DBpediaDatasets.Genders) - override def extract(node: PageNode, subjectUri: String): Seq[Quad] = { - - /** First pass: extract mappings to detect rdf:type */ - val mappingGraph: Seq[Quad] = - super.extract(node, subjectUri) - - /** Check if entity is a dbo:Person */ - val isPerson: Boolean = - mappingGraph.exists(q => - q.predicate == typeProperty.uri && - q.value == personClass.uri - ) - - if (!isPerson) return Seq.empty - - /** Get full wiki text */ - val wikiText: String = - node.toWikiText - - /** Count pronouns by gender */ - var genderCounts: Map[String, Int] = - Map.empty.withDefaultValue(0) - - for ((pronoun, gender) <- pronounMap) { - val regex = - new Regex("(?i)\\b" + Regex.quote(pronoun) + "\\b") - - val count = - regex.findAllIn(wikiText).size - - genderCounts = - genderCounts.updated(gender, genderCounts(gender) + count) + override def extract(node : PageNode, subjectUri : String) : Seq[Quad] = + { + // apply mappings + // FIXME: To find out if it's a person, we extract all mapped properties a second time and throw them away. + // Find a better solution. For example: Make sure that this extractor runs after the + // MappingExtractor. In the MappingExtractor, set the page type as an attriute. + // Even better: in the first extraction pass, extract all types. Use them in the second pass. + val mappingGraph = super.extract(node, subjectUri) + + // if this page is mapped onto Person + if (mappingGraph.exists(q => q.predicate == typeProperty && q.value == personUri)) + { + // get the page text + val wikiText: String = node.toWikiText + + // count gender pronouns + var genderCounts: Map[String, Int] = Map() + for ((pronoun, gender) <- pronounMap) + { + val regex = new Regex("\\W" + pronoun + "\\W") + val count = regex.findAllIn(wikiText).size + val oldCount = genderCounts.getOrElse(gender, 0) + genderCounts = genderCounts.updated(gender, oldCount + count) + } + + // get maximum gender + var maxGender = "" + var maxCount = 0 + var secondCount = 0.0 + for ((gender, count) <- genderCounts) + { + if (count > maxCount) + { + secondCount = maxCount.toDouble + maxCount = count + maxGender = gender + } + } + + // output triple for maximum gender + if (maxGender != "" && maxCount > GenderExtractorConfig.minCount && maxCount/secondCount > GenderExtractorConfig.minDifference) + { + return Seq(new Quad(context.language, DBpediaDatasets.Genders, subjectUri, genderProperty, maxGender, node.sourceIri, new Datatype("rdf:langString"))) + } } - if (genderCounts.isEmpty) return Seq.empty - - /** Find dominant gender */ - val sorted = - genderCounts.toSeq.sortBy(-_._2) - - val (maxGender, maxCount) = - sorted.head - - val secondCount: Double = - if (sorted.size > 1) sorted(1)._2.toDouble else 0.0 - - /** Avoid division-by-zero */ - val differenceOk: Boolean = - secondCount == 0.0 || - (maxCount.toDouble / secondCount) > - GenderExtractorConfig.minDifference - - /** Threshold checks */ - if ( - maxGender.nonEmpty && - maxCount > GenderExtractorConfig.minCount && - differenceOk - ) { - Seq( - new Quad( - context.language, - DBpediaDatasets.Genders, - subjectUri, - genderProperty, - maxGender, - node.sourceIri, - langStringDatatype - ) - ) - } else { - Seq.empty - } + Seq.empty } + }