Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions core/src/main/resources/datasetdefinitions.json
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,12 @@
"traits":"LinkedData, Published",
"desc": "Dataset containing links between a language-locale DBpedia instance and DBpedia Commons.",
"defaultgraph": "namespace"
},
"sisterprojects_page_links": {
"name": "Sister Projects Links",
"traits":"LinkedData, Published",
"desc": "Dataset containing links between a language-locale DBpedia instance and Sister projects.",
"defaultgraph": "namespace"
}
},
"wikidata":{
Expand Down
1 change: 1 addition & 0 deletions core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ object DBpediaDatasets extends java.io.Serializable
val FreebaseLinks: Dataset = datasets("freebase_links")
val GeonamesLinks: Dataset = datasets("geonames_links")
val CommonsLink: Dataset = datasets("commons_page_links")
val SisterProjectLink: Dataset = datasets("sisterprojects_page_links")


/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.transform.Quad

import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.dataparser._

import org.dbpedia.extraction.util.RichString.wrapString
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.util._
import org.dbpedia.extraction.config.mappings.InfoboxExtractorConfig

import scala.collection.mutable.ArrayBuffer
import org.dbpedia.extraction.config.dataparser.DataParserConfig
import org.dbpedia.iri.UriUtils

import scala.language.reflectiveCalls

/**
* This extractor extracts all properties from all infoboxes.
* Extracted information is represented using properties in the http://xx.dbpedia.org/property/
* namespace (where xx is the language code).
Comment on lines +22 to +23
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* Extracted information is represented using properties in the http://xx.dbpedia.org/property/
* namespace (where xx is the language code).
* Extracted information is represented using properties in the
* <http://xx.dbpedia.org/property/> namespace (where xx is the language code).

* The names of these properties directly reflect the names of the Wikipedia infobox properties.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* The names of the these properties directly reflect the name of the Wikipedia infobox property.
* The names of the these properties directly reflect the names of the Wikipedia infobox properties.

* Property names are not cleaned or merged.
* Property types are not part of a subsumption hierarchy and there is no consistent ontology for the infobox dataset.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* Property types are not part of a subsumption hierarchy and there is no consistent ontology for the infobox dataset.
* Property types are not part of a subsumption hierarchy, and there is no consistent ontology for the infobox dataset.

* The infobox extractor performs only a minimal amount of property value clean-up, e.g., by converting a value like “June 2009” to the XML Schema format “2009-06”.
* You should therefore use the infobox dataset only if your application requires complete coverage of all Wikipedia properties and you are prepared to accept relatively noisy data.
*/
Comment on lines +20 to +29
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Update class documentation to reflect the extractor's actual purpose.

The documentation appears to be copied from InfoboxExtractor and describes infobox property extraction rather than sister project link extraction. It should be updated to explain that this extractor retrieves links to sister projects (Commons, Wiktionary, Wikiquote, etc.) from infobox templates.

🤖 Prompt for AI Agents
In
core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala
around lines 20 to 29, the class Scaladoc currently describes infobox property
extraction (copied from InfoboxExtractor) but this extractor actually extracts
links to sister projects (Commons, Wiktionary, Wikiquote, etc.) from infobox
templates; update the class documentation to accurately describe that the
extractor locates and converts sister-project interwiki/infobox links into
DBpedia properties (naming the target sister projects, expected input
templates/fields, output property namespace, and any value-normalization
performed), replacing the erroneous infobox-focused text with a concise
explanation of purpose and behavior.


class SisterProjectExtractor(
context : {
def ontology : Ontology
def language : Language
def redirects : Redirects
}
)
extends PageNodeExtractor
{
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Configuration
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

private val ontology = context.ontology

private val language = context.language

// Language code (e.g. "en", "fr", "de") used to select language-specific configuration below.
private val wikiCode = language.wikiCode


// Template and property blacklists shared with InfoboxExtractor.
private val ignoreTemplates = InfoboxExtractorConfig.ignoreTemplates

private val ignoreTemplatesRegex = InfoboxExtractorConfig.ignoreTemplatesRegex

private val ignoreProperties = InfoboxExtractorConfig.ignoreProperties

// Datatype used for language-tagged string literals produced by extractValue.
private val rdfLangStrDt = ontology.datatypes("rdf:langString")

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Regexes
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// TODO: i18n

private val SplitWordsRegex = InfoboxExtractorConfig.SplitWordsRegex

private val TrailingNumberRegex = InfoboxExtractorConfig.TrailingNumberRegex


// Language-specific regex used to split infobox property values, falling back to the
// English configuration when no dedicated entry exists for this language.
// (getOrElse replaces the previous contains-check + Option.get anti-pattern; behavior is identical.)
private val splitPropertyNodeRegexInfobox =
  DataParserConfig.splitPropertyNodeRegexInfobox.getOrElse(wikiCode, DataParserConfig.splitPropertyNodeRegexInfobox("en"))
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Parsers
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// NOTE(review): owl:sameAs asserts strict co-reference, but a sister-project page (e.g. a
// Wiktionary entry) is not the same entity as the Wikipedia article. Consider skos:related
// or dedicated per-project properties instead — TODO confirm with ontology maintainers.
private val sameAsProperty = context.ontology.properties("owl:sameAs")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Replace owl:sameAs with an appropriate predicate.

As discussed in the PR comments, owl:sameAs is semantically incorrect for sister project links because it denotes strict co-reference (subject and object identify the same entity). A Wiktionary page is not the same entity as a Wikipedia article.

Consider using skos:related or creating dedicated properties for each sister project type (e.g., dbo:wiktionaryPage, dbo:commonsPage), as suggested by datalogism in the PR discussion.

🤖 Prompt for AI Agents
In
core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala
around line 76, the code currently uses
context.ontology.properties("owl:sameAs") which is semantically incorrect for
sister project links; replace this with an appropriate predicate such as
"skos:related" or, preferably, add and use dedicated properties per project
(e.g., "dbo:wiktionaryPage", "dbo:commonsPage") in the ontology and then
reference them via context.ontology.properties("<predicate>"); update the
variable name to reflect the new predicate (e.g., relatedProperty or
wiktionaryProperty), change any downstream uses to emit the new predicate, and
add/update tests/fixtures and ontology configuration to include the chosen
predicates.



/// NEED TO BE EXTENDED TO OTHER LANGUAGES
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// NEED TO BE EXTENDED TO OTHERS LANGUAGES
/// NEED TO BE EXTENDED TO OTHER LANGUAGES

// Template-name patterns (lower-cased, matched against resolved template titles) that
// identify "sister projects" infoboxes, per language.
// TODO: extend to other languages.
private val regexMap = Map(
  "fr" -> List("autres projet.*".r),
  "en" -> List("sister project.*".r),
  "de" -> List("schwesterprojekte.*".r)
)

// Fall back to an empty list so the extractor is simply a no-op for languages without
// configured patterns, instead of crashing with NoSuchElementException at construction.
private val currentRegexList = regexMap.getOrElse(wikiCode, List.empty)
Comment on lines +80 to +88
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Guard against unsupported language codes.

Line 88 accesses regexMap(wikiCode) directly, which will throw NoSuchElementException if the language code is not in the map. Since only three languages are currently supported (fr, en, de), this extractor will fail for all other languages.

Apply this diff to fail gracefully:

-  private val currentRegexList = regexMap(wikiCode)
+  private val currentRegexList = regexMap.getOrElse(wikiCode, {
+    throw new IllegalArgumentException(s"SisterProjectExtractor does not support language code: $wikiCode. Supported languages: ${regexMap.keys.mkString(", ")}")
+  })

Alternatively, return an empty list to skip processing for unsupported languages:

-  private val currentRegexList = regexMap(wikiCode)
+  private val currentRegexList = regexMap.getOrElse(wikiCode, List.empty)
🤖 Prompt for AI Agents
In
core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala
around lines 80 to 88, the code directly indexes regexMap with wikiCode which
throws NoSuchElementException for unsupported languages; update it to guard
lookup (use Map.get or contains) and either default to an empty List when
wikiCode is not present or handle the missing case by logging/warning and
skipping processing, ensuring currentRegexList is assigned via a safe getOrElse
so the extractor fails gracefully for unsupported languages.


// Primary parser: extracts full wiki-link objects from property values.
private val objectParser = new ObjectParser(context, true)
// Fallback parser used by extractLinks when objectParser does not match every split node.
private val linkParser = new LinkParser(true)
Comment on lines +91 to +92
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Remove outdated comment.

The comment "USELESS ?" is incorrect since linkParser is used in the extractLinks method at line 197.

🤖 Prompt for AI Agents
In
core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala
around lines 91 to 92, remove the outdated "USELESS ?" comment that incorrectly
claims linkParser is useless—linkParser is used by extractLinks at line 197;
delete that comment (or replace it with a short accurate comment describing
linkParser’s purpose) so the code comment no longer misleads.


// Maps interwiki prefix abbreviations (as they appear in template parameter keys)
// to the full sister-project name used as key in mapProjects.
private val mapAbrev = Map(
  "c" -> "commons",
  "wikt" -> "wiktionary",
  "n" -> "wikinews",
  "voy" -> "wikivoyage",
  "q" -> "wikiquote",
  "s" -> "wikisource",
  "b" -> "wikibooks",
  "v" -> "wikiversity",
  "wikispecies" -> "species"
)
// Base URIs per sister project. The project name inside the URI is later replaced with
// "<lang>.<project>" to build a language-specific URL.
private val mapProjects = Map(
  "commons" -> "http://commons.dbpedia.org/resource/",
  "wiktionary" -> "https://wiktionary.org/wiki/",
  "wikinews" -> "https://wikinews.org/wiki/",
  "wikivoyage" -> "https://wikivoyage.org/wiki/",
  "wikiquote" -> "https://wikiquote.org/wiki/",
  "wikisource" -> "https://wikisource.org/wiki/",
  "wikibooks" -> "https://wikibooks.org/wiki/",
  // Fixed: previously pointed to wikibooks.org by mistake.
  "wikiversity" -> "https://wikiversity.org/wiki/",
  "species" -> "https://species.wikimedia.org/wiki/"
)
Comment on lines +105 to +116
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Fix incorrect URL for Wikiversity.

Line 113 maps "wikiversity" to the Wikibooks URL (https://wikibooks.org/wiki/) instead of the correct Wikiversity URL.

Apply this diff:

     "wikisource" -> "https://wikisource.org/wiki/",
     "wikibooks" -> "https://wikibooks.org/wiki/",
-    "wikiversity" -> "https://wikibooks.org/wiki/",
+    "wikiversity" -> "https://wikiversity.org/wiki/",
     "species" -> "https://species.wikimedia.org/wiki/"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
private val mapProjects = Map(
"commons" -> "http://commons.dbpedia.org/resource/",
"wiktionary" -> "https://wiktionary.org/wiki/",
"wikinews" -> "https://wikinews.org/wiki/",
"wikivoyage" -> "https://wikivoyage.org/wiki/",
"wikiquote" -> "https://wikiquote.org/wiki/",
"wikisource" -> "https://wikisource.org/wiki/",
"wikibooks" -> "https://wikibooks.org/wiki/",
"wikiversity" -> "https://wikibooks.org/wiki/",
"species" -> "https://species.wikimedia.org/wiki/"
)
private val mapProjects = Map(
"commons" -> "http://commons.dbpedia.org/resource/",
"wiktionary" -> "https://wiktionary.org/wiki/",
"wikinews" -> "https://wikinews.org/wiki/",
"wikivoyage" -> "https://wikivoyage.org/wiki/",
"wikiquote" -> "https://wikiquote.org/wiki/",
"wikisource" -> "https://wikisource.org/wiki/",
"wikibooks" -> "https://wikibooks.org/wiki/",
"wikiversity" -> "https://wikiversity.org/wiki/",
"species" -> "https://species.wikimedia.org/wiki/"
)
🤖 Prompt for AI Agents
In
core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala
around lines 105 to 116, the mapProjects entry for "wikiversity" incorrectly
points to the Wikibooks URL; update the value for the "wikiversity" key to the
correct Wikiversity base URL "https://wikiversity.org/wiki/". Ensure only that
map entry is changed and the rest of the Map remains intact.


override val datasets = Set(DBpediaDatasets.SisterProjectLink)


override def extract(node : PageNode, subjectUri : String) : Seq[Quad] =
{
if(node.title.namespace != Namespace.Main && !ExtractorUtils.titleContainsCommonsMetadata(node.title)) return Seq.empty
val quads = new ArrayBuffer[Quad]()

/** Retrieve all templates on the page which are not ignored */
for { template <- InfoboxExtractor.collectTemplates(node)
resolvedTitle = context.redirects.resolve(template.title).decoded.toLowerCase
if !ignoreTemplates.contains(resolvedTitle)
if !ignoreTemplatesRegex.exists(regex => regex.unapplySeq(resolvedTitle).isDefined)
if currentRegexList.exists(regex => regex.findFirstMatchIn(resolvedTitle).isDefined)
}
{
val propertyList = template.children.filterNot(property => ignoreProperties.get(wikiCode).getOrElse(ignoreProperties("en")).contains(property.key.toLowerCase))


for(property <- propertyList; if (!property.key.forall(_.isDigit))) {

// TODO clean HTML

val cleanedPropertyNode = NodeUtil.removeParentheses(property)

val splitPropertyNodes = NodeUtil.splitPropertyNode(cleanedPropertyNode, splitPropertyNodeRegexInfobox)


for(splitNode <- splitPropertyNodes; pr <- extractValue(splitNode))
{
val propertyUri = getPropertyUri(property.key)
try
{

if (mapProjects.contains(property.key)) {
val value = mapProjects(property.key).replace(property.key, language.wikiCode + "." + property.key) + WikiUtil.wikiEncode(pr.value)
quads += new Quad(language, DBpediaDatasets.SisterProjectLink, subjectUri, sameAsProperty, value, splitNode.sourceIri, null)
}
if (mapAbrev.contains(property.key)) {
val keyProj=mapAbrev(property.key)
val value = mapProjects(keyProj).replace(keyProj, language.wikiCode + "." + keyProj) + WikiUtil.wikiEncode(pr.value)
quads += new Quad(language, DBpediaDatasets.SisterProjectLink, subjectUri, sameAsProperty, value, splitNode.sourceIri, null)
}
Comment on lines +152 to +160
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Verify and fix URL construction logic.

The URL construction logic appears problematic:

  1. Line 153: mapProjects(property.key).replace(property.key, language.wikiCode + "." + property.key)

    • For "commons" property with URL "http://commons.dbpedia.org/resource/", replacing "commons" with "en.commons" would produce "http://en.commons.dbpedia.org/resource/", which may not be the intended URL structure.
    • For "wiktionary" with URL "https://wiktionary.org/wiki/", the same replace would produce "https://en.wiktionary.org/wiki/", which seems correct but is fragile.
  2. The logic assumes the property key appears exactly once in the URL string at the right position, which is fragile and error-prone.

Consider building the URLs explicitly rather than using string replacement:

if (mapProjects.contains(property.key)) {
  val baseProject = property.key
  val value = baseProject match {
    case "commons" => s"http://commons.dbpedia.org/resource/${WikiUtil.wikiEncode(pr.value)}"
    case _ => s"https://${language.wikiCode}.${baseProject}.org/wiki/${WikiUtil.wikiEncode(pr.value)}"
  }
  quads += new Quad(language, DBpediaDatasets.SisterProjectLink, subjectUri, sameAsProperty, value, splitNode.sourceIri, null)
}
🤖 Prompt for AI Agents
core/src/main/scala/org/dbpedia/extraction/mappings/SisterProjectExtractor.scala
around lines 152 to 160: the current URL construction uses string.replace on
mapProjects(...) which is fragile and can produce incorrect hostnames (e.g.
replacing "commons" produces "en.commons.dbpedia.org"); instead construct the
target URL explicitly based on the project key and language: treat commons (and
any other special-case projects) with their correct fixed base (e.g.
"http://commons.dbpedia.org/resource/") and for standard wiki projects build
"https://{language.wikiCode}.{project}.org/wiki/{WikiUtil.wikiEncode(pr.value)}";
apply the same explicit-construction logic to the mapAbrev branch (use the
mapped keyProj then build the URL), remove the string.replace usage, and ensure
WikiUtil.wikiEncode(pr.value) is appended to the constructed base.

}
catch
{
case ex : IllegalArgumentException => println(ex)
}
}
}
//}
}

quads
}


/**
 * Extracts the value of a property node: prefers parsed links, and falls back to a
 * plain language-tagged string literal when no links were found.
 */
private def extractValue(node: PropertyNode): List[ParseResult[String]] = {
  val parsedLinks = extractLinks(node)
  if (parsedLinks.nonEmpty)
    parsedLinks
  else
    StringParser.parse(node).map(result => ParseResult(result.value, None, Some(rdfLangStrDt))).toList
}

/**
 * Attempts to parse every part of a property value as a link.
 *
 * The value is split into candidate nodes first; a parse is only accepted when *every*
 * split node yields a link, so that partially-parseable values are not silently truncated.
 *
 * Fixes a latent scala.MatchError: the previous first `match` had a single guarded case
 * and no fall-through, so any value where the object parser did not match every split
 * node crashed instead of falling back to the plain link parser.
 */
private def extractLinks(node : PropertyNode) : List[ParseResult[String]] =
{
  val splitNodes = NodeUtil.splitPropertyNode(node, """\s*\W+\s*""")

  // First attempt: full object (wiki-link) parsing.
  val objectLinks = splitNodes.flatMap(splitNode => objectParser.parse(splitNode))
  if (objectLinks.size == splitNodes.size) {
    objectLinks
  } else {
    // Fallback: plain link parsing with URI clean-up; unparseable links are discarded.
    val rawLinks = splitNodes.flatMap(splitNode => linkParser.parse(splitNode))
    if (rawLinks.size == splitNodes.size)
      rawLinks.map(link => UriUtils.cleanLink(link.value)).collect { case Some(cleaned) => ParseResult(cleaned) }
    else
      List.empty
  }
}


/**
 * Builds the DBpedia property URI for a raw infobox property key.
 *
 * The key is lower-cased and camelCased, trailing digits are stripped so properties
 * like LeaderName1, LeaderName2, ... all collapse to LeaderName, and residual
 * whitespace is cleaned before appending to the language's property namespace.
 */
private def getPropertyUri(key : String) : String =
{
  val camelCased = key.toLowerCase(language.locale).trim.toCamelCase(SplitWordsRegex, language.locale)
  val withoutTrailingNumber = TrailingNumberRegex.replaceFirstIn(camelCased, "")
  language.propertyUri.append(WikiUtil.cleanSpace(withoutTrailingNumber))
}


}