Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,88 +1,119 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.transform.Quad
import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.config.mappings.GenderExtractorConfig
import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.ontology.datatypes.Datatype
import org.dbpedia.extraction.transform.Quad
import org.dbpedia.extraction.util.Language
import org.dbpedia.extraction.wikiparser._
import util.matching.Regex
import org.dbpedia.extraction.ontology.datatypes.Datatype
import scala.language.reflectiveCalls

/**
 * Extracts the grammatical gender of people using a pronoun-based heuristic.
 *
 * The page is first run through the inherited mapping extraction. Only pages
 * whose mapped quads contain an rdf:type of the ontology's Person class are
 * considered. For those pages, the pronouns configured for the current language
 * in GenderExtractorConfig.pronounsMap are counted in the page's wiki text, and
 * a single gender quad is emitted when one gender clearly dominates (see
 * GenderExtractorConfig.minCount and GenderExtractorConfig.minDifference).
 *
 * @param context extraction context supplying mappings, ontology, language and redirects
 */
class GenderExtractor(
  context: {
    def mappings: Mappings
    def ontology: Ontology
    def language: Language
    def redirects: Redirects
  }
) extends MappingExtractor(context) {

  /** Wiki language code (en, de, fr, ...), used as key into the pronoun configuration. */
  private val language: String = context.language.wikiCode

  /** Pronoun -> gender map for this language (from config). */
  private val pronounMap: Map[String, String] = GenderExtractorConfig.pronounsMap(language)

  /**
   * One whole-word, case-insensitive pattern per configured pronoun, paired with
   * the gender it votes for. Compiled once here because the patterns depend only
   * on the configuration, not on the page being extracted.
   *
   * (?U) enables Unicode character classes (and implies Unicode case folding), so
   * that \b and (?i) also behave correctly for pronouns written in non-ASCII scripts.
   */
  private val pronounPatterns: Seq[(Regex, String)] =
    pronounMap.toSeq.map { case (pronoun, gender) =>
      (new Regex("(?iU)\\b" + Regex.quote(pronoun) + "\\b"), gender)
    }

  // Ontology-based properties & classes.
  // NOTE(review): these lookups presumably fail at construction time if a key is
  // missing from the loaded ontology -- confirm "foaf:gender" is defined there.
  private val genderProperty = context.ontology.properties("foaf:gender")

  private val typeProperty = context.ontology.properties("rdf:type")

  private val personClass = context.ontology.classes("Person")

  private val langStringDatatype = new Datatype("rdf:langString")

  override val datasets = Set(DBpediaDatasets.Genders)

  /**
   * Emits at most one gender quad for the given page.
   *
   * @param node       the parsed wiki page
   * @param subjectUri URI of the resource described by the page
   * @return a single-element sequence holding the gender quad, or an empty
   *         sequence when the page is not mapped to Person or no gender dominates
   */
  override def extract(node: PageNode, subjectUri: String): Seq[Quad] = {
    // FIXME: to find out whether the page is a person, all mapped properties are
    // extracted a second time and thrown away. Find a better solution, e.g. run
    // this extractor after the MappingExtractor and reuse the types it found.
    val mappingGraph: Seq[Quad] = super.extract(node, subjectUri)

    val isPerson: Boolean =
      mappingGraph.exists(q => q.predicate == typeProperty.uri && q.value == personClass.uri)

    if (isPerson) {
      dominantGender(node.toWikiText).toSeq.map { gender =>
        new Quad(
          context.language,
          DBpediaDatasets.Genders,
          subjectUri,
          genderProperty,
          gender,
          node.sourceIri,
          langStringDatatype
        )
      }
    } else {
      Seq.empty
    }
  }

  /**
   * Counts pronoun occurrences per gender in the given text and picks the winner.
   *
   * @param wikiText wiki markup of the page
   * @return the gender with the most pronoun hits, provided its count exceeds
   *         GenderExtractorConfig.minCount and its ratio to the runner-up exceeds
   *         GenderExtractorConfig.minDifference; None otherwise
   */
  private def dominantGender(wikiText: String): Option[String] = {
    // Explicit type arguments are required: a bare Map.empty would be inferred
    // as Map[Nothing, Nothing], which is not a Map[String, Int].
    val genderCounts: Map[String, Int] =
      pronounPatterns.foldLeft(Map.empty[String, Int]) { case (counts, (pattern, gender)) =>
        val hits = pattern.findAllIn(wikiText).size
        counts.updated(gender, counts.getOrElse(gender, 0) + hits)
      }

    // Highest count first; the second entry (if any) is the runner-up.
    val ranked: Seq[(String, Int)] = genderCounts.toSeq.sortBy { case (_, count) => -count }
    val runnerUpCount: Int = ranked.drop(1).headOption.map(_._2).getOrElse(0)

    ranked.headOption.collect {
      case (gender, count)
          if gender.nonEmpty &&
            count > GenderExtractorConfig.minCount &&
            // With no runner-up hits the winner is unopposed; skip the ratio
            // test rather than dividing by zero.
            (runnerUpCount == 0 ||
              count.toDouble / runnerUpCount > GenderExtractorConfig.minDifference) =>
        gender
    }
  }
}
Loading