Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,88 +1,121 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.transform.Quad
import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.config.mappings.GenderExtractorConfig
import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.ontology.datatypes.Datatype
import org.dbpedia.extraction.transform.Quad
import org.dbpedia.extraction.util.Language
import org.dbpedia.extraction.wikiparser._
import util.matching.Regex
import org.dbpedia.extraction.ontology.datatypes.Datatype

import scala.language.reflectiveCalls

/**
* Extracts the grammatical gender of people using a heuristic.
* Extracts the grammatical gender of people using a pronoun-based heuristic.
*/
class GenderExtractor(
context : {
def mappings : Mappings
def ontology : Ontology
def language : Language
def redirects : Redirects
}
)
extends MappingExtractor(context)
{
private val language = context.language.wikiCode

private val pronounMap: Map[String, String] = GenderExtractorConfig.pronounsMap(language)

// FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.FOAF)
private val genderProperty = "http://xmlns.com/foaf/0.1/gender"
// FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.RDF)
private val typeProperty = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
// FIXME: don't use string constant, use context.ontology (or at least DBpediaNamespace.ONTOLOGY)
private val personUri = "http://dbpedia.org/ontology/Person"

override val datasets = Set(DBpediaDatasets.Genders)

override def extract(node : PageNode, subjectUri : String) : Seq[Quad] =
{
// apply mappings
// FIXME: To find out if it's a person, we extract all mapped properties a second time and throw them away.
// Find a better solution. For example: Make sure that this extractor runs after the
// MappingExtractor. In the MappingExtractor, set the page type as an attribute.
// Even better: in the first extraction pass, extract all types. Use them in the second pass.
val mappingGraph = super.extract(node, subjectUri)

// if this page is mapped onto Person
if (mappingGraph.exists(q => q.predicate == typeProperty && q.value == personUri))
{
// get the page text
val wikiText: String = node.toWikiText

// count gender pronouns
var genderCounts: Map[String, Int] = Map()
for ((pronoun, gender) <- pronounMap)
{
val regex = new Regex("\\W" + pronoun + "\\W")
val count = regex.findAllIn(wikiText).size
val oldCount = genderCounts.getOrElse(gender, 0)
genderCounts = genderCounts.updated(gender, oldCount + count)
}

// get maximum gender
var maxGender = ""
var maxCount = 0
var secondCount = 0.0
for ((gender, count) <- genderCounts)
{
if (count > maxCount)
{
secondCount = maxCount.toDouble
maxCount = count
maxGender = gender
}
}

// output triple for maximum gender
if (maxGender != "" && maxCount > GenderExtractorConfig.minCount && maxCount/secondCount > GenderExtractorConfig.minDifference)
{
return Seq(new Quad(context.language, DBpediaDatasets.Genders, subjectUri, genderProperty, maxGender, node.sourceIri, new Datatype("rdf:langString")))
}
class GenderExtractor(
context: {
def mappings: Mappings
def ontology: Ontology
def language: Language
def redirects: Redirects
}
) extends MappingExtractor(context) {

/** Language code (en, de, fr, etc.) */
private val language: String =
context.language.wikiCode

/** Pronoun → gender map (from config) */
private val pronounMap: Map[String, String] =
GenderExtractorConfig.pronounsMap(language)

/** Ontology-based properties & classes */
private val genderProperty =
context.ontology.properties("foaf:gender")

private val typeProperty =
context.ontology.properties("rdf:type")

private val personClass =
context.ontology.classes("Person")

// The Extractor contract expects a set of Dataset objects, so the element type is
// left to inference (Set[Dataset]) instead of being declared as Set[String].
override val datasets =
  Set(DBpediaDatasets.Genders)
Comment on lines +44 to +45
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# First, find the DBpediaDatasets class definition
find . -name "*.scala" | xargs grep -l "object DBpediaDatasets" | head -5

Repository: dbpedia/extraction-framework

Length of output: 156


🏁 Script executed:

#!/bin/bash
# Search for the DBpediaDatasets definition and Genders field
rg -n "object DBpediaDatasets" -A 50 --type scala | head -80

Repository: dbpedia/extraction-framework

Length of output: 6912


🏁 Script executed:

#!/bin/bash
# Look at how other extractors define datasets override
rg -n "override val datasets.*Set" --type scala | head -20

Repository: dbpedia/extraction-framework

Length of output: 3230


🏁 Script executed:

#!/bin/bash
# Check the GenderExtractor file to see the full context
cat -n core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala | head -50

Repository: dbpedia/extraction-framework

Length of output: 1928


🏁 Script executed:

#!/bin/bash
# Search for where Genders is defined in DBpediaDatasets
rg -n "Genders" core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala

Repository: dbpedia/extraction-framework

Length of output: 122


🏁 Script executed:

#!/bin/bash
# Check if DBpediaDatasets has property accessors defined
rg -n "def|val" core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala | tail -50

Repository: dbpedia/extraction-framework

Length of output: 3915


🏁 Script executed:

#!/bin/bash
# Look for the parent class/trait that defines datasets field
rg -n "def datasets|val datasets" --type scala --context 3 -B 5

Repository: dbpedia/extraction-framework

Length of output: 50384


🏁 Script executed:

#!/bin/bash
# Find Dataset class definition
fd "Dataset.scala" --type f -x cat {} | head -150

Repository: dbpedia/extraction-framework

Length of output: 7614


🏁 Script executed:

#!/bin/bash
# Look for the Dataset class constructor and properties
rg -n "class Dataset" --type scala -A 20 | head -60

Repository: dbpedia/extraction-framework

Length of output: 6702


🏁 Script executed:

#!/bin/bash
# Check the parent class/trait of extractors to see what datasets field should return
rg -n "class GenderExtractor|class.*Extractor.*extends|trait.*Extractor" --type scala core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala -B 2 -A 10 | head -40

Repository: dbpedia/extraction-framework

Length of output: 456


🏁 Script executed:

#!/bin/bash
# Find the Extractor interface/trait definition
fd "Extractor.scala" --type f core/src/main/scala/org/dbpedia/extraction/mappings/ | xargs cat | head -100

Repository: dbpedia/extraction-framework

Length of output: 3942


🏁 Script executed:

#!/bin/bash
# Find MappingExtractor and its parent class definition
rg -n "class MappingExtractor|trait.*Extractor" --type scala core/src/main/scala/org/dbpedia/extraction/mappings/ | grep -E "(MappingExtractor|trait.*Extractor)" | head -20

Repository: dbpedia/extraction-framework

Length of output: 817


🏁 Script executed:

#!/bin/bash
# Find the base Extractor trait/interface
rg -n "trait Extractor|override val datasets" --type scala core/src/main/scala/org/dbpedia/extraction/mappings/Extractor.scala -A 2 | head -50

Repository: dbpedia/extraction-framework

Length of output: 175


🏁 Script executed:

#!/bin/bash
# Get the full Extractor trait definition
cat -n core/src/main/scala/org/dbpedia/extraction/mappings/Extractor.scala

Repository: dbpedia/extraction-framework

Length of output: 1394


Remove the incorrect type annotation Set[String].

The datasets override declares Set[String] but the parent Extractor trait expects Set[Dataset]. Since DBpediaDatasets.Genders is a Dataset object, remove the explicit type annotation and let Scala infer the correct type:

🔧 Correct fix
-  override val datasets: Set[String] =
-    Set(DBpediaDatasets.Genders)
+  override val datasets =
+    Set(DBpediaDatasets.Genders)

Or keep it concise on one line:

-  override val datasets: Set[String] =
-    Set(DBpediaDatasets.Genders)
+  override val datasets = Set(DBpediaDatasets.Genders)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
override val datasets: Set[String] =
Set(DBpediaDatasets.Genders)
override val datasets =
Set(DBpediaDatasets.Genders)
🧰 Tools
🪛 GitHub Actions: DIEF Long-Running Extraction Tests

[error] 45-45: GenderExtractor.scala: type mismatch: found org.dbpedia.extraction.config.provenance.Dataset, required: String. Set(DBpediaDatasets.Genders) is used where a String is expected.

🪛 GitHub Actions: DIEF Server Component Web API Extraction Tests

[error] 45-45: GenderExtractor.scala: type mismatch; found org.dbpedia.extraction.config.provenance.Dataset, required: String

🪛 GitHub Actions: Snapshot Deploy

[error] 45-45: Type mismatch: found org.dbpedia.extraction.config.provenance.Dataset; required: String. The declared type Set[String] does not match the element type of Set(DBpediaDatasets.Genders), which is a Dataset.

🪛 GitHub Actions: Update Minidump document

[error] 45-45: Type mismatch: found org.dbpedia.extraction.config.provenance.Dataset; required: String. Set(DBpediaDatasets.Genders) is providing a Dataset instead of a String.

🤖 Prompt for AI Agents
In @core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala
around lines 44 - 45, The override of datasets in GenderExtractor uses the wrong
explicit type Set[String]; remove the explicit type annotation so Scala infers
the correct Set[Dataset] expected by the parent Extractor trait (keep the value
as Set(DBpediaDatasets.Genders) or inline as a one-line declaration). Ensure the
member name datasets remains unchanged and that DBpediaDatasets.Genders is used
as the sole element.


override def extract(node: PageNode, subjectUri: String): Seq[Quad] = {

/** First pass: extract mappings to detect rdf:type */
val mappingGraph: Seq[Quad] =
super.extract(node, subjectUri)

/** Check if entity is a dbo:Person.
  * Quad.predicate and Quad.value are plain URI strings, so they are compared
  * directly against the URIs of the ontology objects. */
val isPerson: Boolean =
  mappingGraph.exists(q =>
    q.predicate == typeProperty.uri &&
    q.value == personClass.uri
  )
Comment on lines +54 to +58
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Compilation error: accessing .uri on String fields instead of ontology objects.

The Quad class stores predicate and value as raw URI strings, not as objects with a .uri property. Only the ontology objects (typeProperty, personClass) have the .uri accessor, so .uri must be called on them alone and the Quad fields compared as plain strings.

🔧 Proposed fix
   val isPerson: Boolean =
  mappingGraph.exists(q =>
-   q.predicate.uri == typeProperty.uri &&
-   q.value.uri == personClass.uri
+   q.predicate == typeProperty.uri &&
+   q.value == personClass.uri
  )
🧰 Tools
🪛 GitHub Actions: DIEF Long-Running Extraction Tests

[error] 56-56: GenderExtractor.scala: 'uri' is not a member of String. Accessing 'uri' on a String value is invalid.


[error] 57-57: GenderExtractor.scala: 'uri' is not a member of String. Accessing 'uri' on a String value is invalid.

🪛 GitHub Actions: DIEF Server Component Web API Extraction Tests

[error] 56-56: GenderExtractor.scala: 'uri' is not a member of String (attempting to access 'uri' on a String in q.predicate.uri)


[error] 57-57: GenderExtractor.scala: 'uri' is not a member of String (attempting to access 'uri' on a String in q.value.uri)

🪛 GitHub Actions: Snapshot Deploy

[error] 56-57: Error: 'uri' is not a member of String. Accessing property 'uri' on a String is invalid in this context.


[error] 57-57: Error: 'uri' is not a member of String. Accessing property 'uri' on a String is invalid in this context.

🪛 GitHub Actions: Update Minidump document

[error] 56-57: GenderExtractor.scala: 'uri' is not a member of String. Expected a property on an object with a 'uri' field.


[error] 56-57: GenderExtractor.scala: 'q.predicate.uri' and 'q.value.uri' call 'uri' on Strings, causing compile-time errors.

🤖 Prompt for AI Agents
In @core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala
around lines 54 - 58, The isPerson check calls .uri on q.predicate and q.value,
which are plain strings in a Quad; only the ontology objects expose .uri. In the
mappingGraph.exists lambda, compare the raw Quad string fields to the ontology
URIs (q.predicate == typeProperty.uri and q.value == personClass.uri) so that
.uri is no longer called on q.predicate/q.value.



if (!isPerson) return Seq.empty

/** Get full wiki text */
val wikiText: String =
node.toWikiText

/** Count pronouns by gender.
  * The lookarounds require that the pronoun is not embedded in a longer word, but
  * unlike a consuming \W on each side they also match at the very start or end of
  * the text and do not swallow the delimiter shared by two adjacent pronouns.
  * Matching stays case-sensitive, as before. */
val genderCounts: Map[String, Int] =
  pronounMap.foldLeft(Map.empty[String, Int]) { case (counts, (pronoun, gender)) =>
    val regex = new Regex("(?<!\\w)" + Regex.quote(pronoun) + "(?!\\w)")
    val count = regex.findAllIn(wikiText).size
    // Several pronouns map to the same gender, so accumulate per gender.
    counts.updated(gender, counts.getOrElse(gender, 0) + count)
  }
Comment on lines +71 to 80
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Regex pattern may miss pronouns at text boundaries.

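The pattern "\\W" + Regex.quote(pronoun) + "\\W" requires a non-word character on both sides and consumes it, so it misses pronouns at the very start or end of the text, and it also misses the second of two pronouns separated by a single character (the shared delimiter is already consumed by the first match). Consider using word boundaries instead.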

💡 Suggested fix using word boundaries
       val regex =
-        new Regex("\\W" + Regex.quote(pronoun) + "\\W")
+        new Regex("(?i)\\b" + Regex.quote(pronoun) + "\\b")

This also adds case-insensitivity ((?i)) which may be desirable for pronoun matching.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
for ((pronoun, gender) <- pronounMap) {
val regex =
new Regex("\\W" + Regex.quote(pronoun) + "\\W")
val count =
regex.findAllIn(wikiText).size
genderCounts =
genderCounts.updated(gender, genderCounts(gender) + count)
}
for ((pronoun, gender) <- pronounMap) {
val regex =
new Regex("(?i)\\b" + Regex.quote(pronoun) + "\\b")
val count =
regex.findAllIn(wikiText).size
genderCounts =
genderCounts.updated(gender, genderCounts(gender) + count)
}
🤖 Prompt for AI Agents
In @core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala
around lines 70 - 79, The current regex new Regex("\\W" + Regex.quote(pronoun) +
"\\W") misses pronouns at text boundaries; replace it with a word-boundary
based, case-insensitive pattern such as new Regex("(?i)\\b" +
Regex.quote(pronoun) + "\\b") so matches occur at start/end and ignore case;
update the loop that builds regexes (the block using pronounMap, wikiText, and
genderCounts) to use this new Regex expression.


Seq.empty
}
if (genderCounts.isEmpty) return Seq.empty

/** Find dominant gender */
val sorted =
genderCounts.toSeq.sortBy(-_._2)

val (maxGender, maxCount) =
sorted.head

val secondCount: Double =
if (sorted.size > 1) sorted(1)._2.toDouble else 0.0

/** Avoid division-by-zero */
val differenceOk: Boolean =
secondCount == 0.0 ||
(maxCount.toDouble / secondCount) >
GenderExtractorConfig.minDifference

/** Threshold checks */
if (
maxGender.nonEmpty &&
maxCount > GenderExtractorConfig.minCount &&
differenceOk
) {
Seq(
new Quad(
context.language,
DBpediaDatasets.Genders,
subjectUri,
genderProperty,
maxGender,
node.sourceIri,
new Datatype("rdf:langString")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, let's look at the GenderExtractor.scala file structure around line 113
fd "GenderExtractor.scala" --type f

Repository: dbpedia/extraction-framework

Length of output: 145


🏁 Script executed:

# Check how Datatype is used across the codebase
rg -n "new Datatype\(" --type scala | head -30

Repository: dbpedia/extraction-framework

Length of output: 4719


🏁 Script executed:

# Check for ontology-based datatype usage patterns
rg -n "ontology\.datatypes" --type scala | head -30

Repository: dbpedia/extraction-framework

Length of output: 4547


🏁 Script executed:

# Also check for Datatype instantiation in class fields
rg -n "private.*Datatype|val.*Datatype.*=" --type scala | head -30

Repository: dbpedia/extraction-framework

Length of output: 8102


🏁 Script executed:

cat -n core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala | sed -n '100,125p'

Repository: dbpedia/extraction-framework

Length of output: 620


🏁 Script executed:

# Let's also check if GenderExtractor has access to context.ontology
rg -n "class GenderExtractor|ontology\." core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala -A 5 -B 5

Repository: dbpedia/extraction-framework

Length of output: 1402


🏁 Script executed:

# Check PersondataExtractor more closely since it uses class fields with Datatype
head -50 core/src/main/scala/org/dbpedia/extraction/mappings/PersondataExtractor.scala

Repository: dbpedia/extraction-framework

Length of output: 2613


🏁 Script executed:

# Check CategoryLabelExtractor which also uses Datatype in a field
head -30 core/src/main/scala/org/dbpedia/extraction/mappings/CategoryLabelExtractor.scala

Repository: dbpedia/extraction-framework

Length of output: 1208


🏁 Script executed:

# Check OntologyDatatypes.scala to see how datatypes are loaded
head -60 core/src/main/scala/org/dbpedia/extraction/ontology/OntologyDatatypes.scala

Repository: dbpedia/extraction-framework

Length of output: 3085


🏁 Script executed:

# Check if rdf:langString is available in ontology.datatypes
rg -n "rdf:langString" core/src/main/scala/org/dbpedia/extraction/ontology/OntologyDatatypes.scala

Repository: dbpedia/extraction-framework

Length of output: 161


Cache the Datatype instance as a class field to avoid repeated instantiation on each extraction.

The rdf:langString datatype is created anew on every extraction call. Following the pattern used in PersondataExtractor and CategoryLabelExtractor, move this to a class-level field. The framework predefines this datatype in OntologyDatatypes, so it should be instantiated once at class initialization rather than repeatedly during extraction.

♻️ Suggested improvement

Add as a class field:

private val langStringDatatype = new Datatype("rdf:langString")

Then use langStringDatatype in the Quad constructor at line 113.

🤖 Prompt for AI Agents
In @core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala at
line 113, The Datatype("rdf:langString") is being instantiated on each
extraction; add a private class-level field (e.g. private val langStringDatatype
= new Datatype("rdf:langString"))—following the pattern in
PersondataExtractor/CategoryLabelExtractor and using the framework's
OntologyDatatypes—then replace the inline new Datatype("rdf:langString") used in
the Quad construction inside GenderExtractor with langStringDatatype so the
instance is created once at class initialization.

)
)
} else {
Seq.empty
}
}
}
Loading