From d5867cd664577e898a1427f52ca51b17ce8b0e41 Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Sat, 24 Jan 2026 18:09:20 +0000 Subject: [PATCH] updated chnages --- .../extraction/mappings/GenderExtractor.scala | 177 ++++++++++-------- 1 file changed, 104 insertions(+), 73 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index 32b1221e59..d75130b18a 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -1,88 +1,119 @@ package org.dbpedia.extraction.mappings -import org.dbpedia.extraction.config.provenance.DBpediaDatasets -import org.dbpedia.extraction.transform.Quad -import org.dbpedia.extraction.wikiparser._ import org.dbpedia.extraction.config.mappings.GenderExtractorConfig +import org.dbpedia.extraction.config.provenance.DBpediaDatasets import org.dbpedia.extraction.ontology.Ontology +import org.dbpedia.extraction.ontology.datatypes.Datatype +import org.dbpedia.extraction.transform.Quad import org.dbpedia.extraction.util.Language +import org.dbpedia.extraction.wikiparser._ import util.matching.Regex -import org.dbpedia.extraction.ontology.datatypes.Datatype import scala.language.reflectiveCalls - /** - * Extracts the grammatical gender of people using a heuristic. + * Extracts the grammatical gender of people using a pronoun-based heuristic. */ -class GenderExtractor( - context : { - def mappings : Mappings - def ontology : Ontology - def language : Language - def redirects : Redirects - } -) -extends MappingExtractor(context) -{ - private val language = context.language.wikiCode - - private val pronounMap: Map[String, String] = GenderExtractorConfig.pronounsMap(language) - - // FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.FOAF) - private val genderProperty = "http://xmlns.com/foaf/0.1/gender" - // FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.RDF) - private val typeProperty = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" - // FIXME: don't use string constant, use context.ontology (or at least DBpediaNamespace.ONTOLOGY) - private val personUri = "http://dbpedia.org/ontology/Person" +class GenderExtractor( + context: { + def mappings: Mappings + def ontology: Ontology + def language: Language + def redirects: Redirects + } +) extends MappingExtractor(context) { + + /** Language code (en, de, fr, etc.) */ + private val language: String = + context.language.wikiCode + /** Pronoun → gender map (from config) */ + private val pronounMap: Map[String, String] = + GenderExtractorConfig.pronounsMap(language) + + /** Ontology-based properties & classes */ + private val genderProperty = + context.ontology.properties("foaf:gender") + + private val typeProperty = + context.ontology.properties("rdf:type") + + private val personClass = + context.ontology.classes("Person") + + private val langStringDatatype = + new Datatype("rdf:langString") override val datasets = Set(DBpediaDatasets.Genders) - override def extract(node : PageNode, subjectUri : String) : Seq[Quad] = - { - // apply mappings - // FIXME: To find out if it's a person, we extract all mapped properties a second time and throw them away. - // Find a better solution. For example: Make sure that this extractor runs after the - // MappingExtractor. In the MappingExtractor, set the page type as an attriute. - // Even better: in the first extraction pass, extract all types. Use them in the second pass. - val mappingGraph = super.extract(node, subjectUri) - - // if this page is mapped onto Person - if (mappingGraph.exists(q => q.predicate == typeProperty && q.value == personUri)) - { - // get the page text - val wikiText: String = node.toWikiText - - // count gender pronouns - var genderCounts: Map[String, Int] = Map() - for ((pronoun, gender) <- pronounMap) - { - val regex = new Regex("\\W" + pronoun + "\\W") - val count = regex.findAllIn(wikiText).size - val oldCount = genderCounts.getOrElse(gender, 0) - genderCounts = genderCounts.updated(gender, oldCount + count) - } - - // get maximum gender - var maxGender = "" - var maxCount = 0 - var secondCount = 0.0 - for ((gender, count) <- genderCounts) - { - if (count > maxCount) - { - secondCount = maxCount.toDouble - maxCount = count - maxGender = gender - } - } - - // output triple for maximum gender - if (maxGender != "" && maxCount > GenderExtractorConfig.minCount && maxCount/secondCount > GenderExtractorConfig.minDifference) - { - return Seq(new Quad(context.language, DBpediaDatasets.Genders, subjectUri, genderProperty, maxGender, node.sourceIri, new Datatype("rdf:langString"))) - } + override def extract(node: PageNode, subjectUri: String): Seq[Quad] = { + + /** First pass: extract mappings to detect rdf:type */ + val mappingGraph: Seq[Quad] = + super.extract(node, subjectUri) + + /** Check if entity is a dbo:Person */ + val isPerson: Boolean = + mappingGraph.exists(q => + q.predicate == typeProperty.uri && + q.value == personClass.uri + ) + + if (!isPerson) return Seq.empty + + /** Get full wiki text */ + val wikiText: String = + node.toWikiText + + /** Count pronouns by gender */ + var genderCounts: Map[String, Int] = + Map.empty.withDefaultValue(0) + + for ((pronoun, gender) <- pronounMap) { + val regex = + new Regex("(?i)\\b" + Regex.quote(pronoun) + "\\b") + + val count = + regex.findAllIn(wikiText).size + + genderCounts = + genderCounts.updated(gender, genderCounts(gender) + count) } - Seq.empty - } + if (genderCounts.isEmpty) return Seq.empty + + /** Find dominant gender */ + val sorted = + genderCounts.toSeq.sortBy(-_._2) + + val (maxGender, maxCount) = + sorted.head + + val secondCount: Double = + if (sorted.size > 1) sorted(1)._2.toDouble else 0.0 -} + /** Avoid division-by-zero */ + val differenceOk: Boolean = + secondCount == 0.0 || + (maxCount.toDouble / secondCount) > + GenderExtractorConfig.minDifference + + /** Threshold checks */ + if ( + maxGender.nonEmpty && + maxCount > GenderExtractorConfig.minCount && + differenceOk + ) { + Seq( + new Quad( + context.language, + DBpediaDatasets.Genders, + subjectUri, + genderProperty, + maxGender, + node.sourceIri, + langStringDatatype + ) + ) + } else { + Seq.empty + } + } +} \ No newline at end of file