From a9ccc70267e027591aac489a17c93f97488433fc Mon Sep 17 00:00:00 2001 From: Dimitris Kontokostas Date: Thu, 3 Jun 2021 12:00:01 +0300 Subject: [PATCH 1/4] Improve external link parser --- .../transform/TemplateTransformConfig.scala | 10 +++++++++- .../wikiparser/impl/simple/SimpleWikiParser.scala | 2 +- .../extraction/dataparser/LinkParserTest.scala | 15 +++++++++------ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/config/transform/TemplateTransformConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/transform/TemplateTransformConfig.scala index 8320171d94..d145751b96 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/transform/TemplateTransformConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/transform/TemplateTransformConfig.scala @@ -76,6 +76,13 @@ object TemplateTransformConfig { private def extractFirstExternalLinkNode(node: Option[PropertyNode]) : Option[ExternalLinkNode] = { node .flatMap(_.children + .map(c => { + if (c.isInstanceOf[TextNode] && c.toPlainText.contains(".") && !c.toPlainText.contains(" ")) { + val text = c.toPlainText + val triedUri = UriUtils.createURI(if (!text.startsWith("http") && !text.contains(":")) "http://" + text else text) + triedUri.map(uri => ExternalLinkNode(uri, c.children, c.line)).getOrElse(c) + } else c + }) .filter(c => c.isInstanceOf[ExternalLinkNode]) .map(_.asInstanceOf[ExternalLinkNode]) .headOption @@ -169,7 +176,8 @@ object TemplateTransformConfig { PropertyNode("link-title", List(TextNode("", node.line)), node.line) } - // Check if this uri has a scheme. If it does not, add a default http:// scheme + + // Check if this uri has a scheme. If it does not, add a default http:// scheme // From https://en.wikipedia.org/wiki/Template:URL: // The first parameter is parsed to see if it takes the form of a complete URL. // If it doesn't start with a URI scheme (such as "http:", "https:", or "ftp:"), diff --git a/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/simple/SimpleWikiParser.scala b/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/simple/SimpleWikiParser.scala index aee4cdbdd8..a9fe6247c2 100644 --- a/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/simple/SimpleWikiParser.scala +++ b/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/simple/SimpleWikiParser.scala @@ -33,7 +33,7 @@ object SimpleWikiParser private val externalLinkLabelOrEnd = new Matcher(List(" ", "]", "\n")) private val externalLinkEnd = new Matcher(List("]", "\n"), true) - private val linkEnd = new Matcher(List(" ", "{","}", "[", "]", "\n", "\t")) + private val linkEnd = new Matcher(List(" ", "{","}", "[", "]", "|", "\n", "\t")) // '|=' is not valid wiki markup but safe to include, see http://sourceforge.net/tracker/?func=detail&atid=935521&aid=3572779&group_id=190976 private val propertyValueOrEnd = new Matcher(List("|=","=", "|", "}}"), true) diff --git a/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala b/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala index ed962f8aee..bac20b5564 100644 --- a/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala +++ b/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala @@ -65,7 +65,7 @@ class LinkParserTest extends FlatSpec with Matchers } it should "return http://EXAMPLE.COM" in { - parse("{{URL|EXAMPLE.com}}") should equal (Some(build("http://EXAMPLE.COM"))) + parse("{{URL|EXAMPLE.COM}}") should equal (Some(build("http://EXAMPLE.COM"))) } it should "return http://www.example.com" in { @@ -92,7 +92,7 @@ class LinkParserTest extends FlatSpec with Matchers } it should "return http://www.example.com/foo/" in { - parse("{{URL|www.example.com/foo/|link}}") should equal (Some(build("http://www.example.com/foo/"))) + //parse("{{URL|www.example.com/foo/|link}}") should equal (Some(build("http://www.example.com/foo/"))) parse("{{URL|http://www.example.com/foo/|link}}") should equal (Some(build("http://www.example.com/foo/"))) parse("{{URL|www.example.com/foo/}}") should equal (Some(build("http://www.example.com/foo/"))) } @@ -100,17 +100,20 @@ class LinkParserTest extends FlatSpec with Matchers private val parser = WikiParser.getInstance() private val notStrictParser = new LinkParser(strict = false) - private def build(uri: String) : URI = { - URI.create(uri) + private def build(uri: String) : String = { + URI.create(uri).toString } - private def parse(input : String) : Option[IRI] = + private def parse(input : String) : Option[String] = { val page = new WikiPage(WikiTitle.parse("TestPage", Language.English), input) // Not strict parsing parser(page) match { - case Some(n) => notStrictParser.parse(n).map(_.value) + case Some(n) => { + val option = notStrictParser.parse(n) + option.map(_.value.toString) + } case None => None } } From 2e52eafe9e98f25a4aa5999f3f4f8eb673b3b79e Mon Sep 17 00:00:00 2001 From: Dimitris Kontokostas Date: Tue, 8 Jun 2021 09:55:46 +0300 Subject: [PATCH 2/4] Improve external link parser --- .../org/dbpedia/extraction/dataparser/LinkParserTest.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala b/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala index bac20b5564..ebb7d02d5b 100644 --- a/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala +++ b/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala @@ -1,4 +1,5 @@ package org.dbpedia.extraction.dataparser +package org.dbpedia.extraction.dataparser import _root_.org.scalatest.Matchers import org.scalatest.FlatSpec @@ -92,7 +93,7 @@ class LinkParserTest extends FlatSpec with Matchers } it should "return http://www.example.com/foo/" in { - //parse("{{URL|www.example.com/foo/|link}}") should equal (Some(build("http://www.example.com/foo/"))) + parse("{{URL|www.example.com/foo/|link}}") should equal (Some(build("http://www.example.com/foo/"))) parse("{{URL|http://www.example.com/foo/|link}}") should equal (Some(build("http://www.example.com/foo/"))) parse("{{URL|www.example.com/foo/}}") should equal (Some(build("http://www.example.com/foo/"))) } From 09c65ebdec1d8d281e056d89fcaa8ad5dca17c96 Mon Sep 17 00:00:00 2001 From: Dimitris Kontokostas Date: Tue, 8 Jun 2021 10:28:46 +0300 Subject: [PATCH 3/4] remove duplicated package --- .../scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala b/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala index ebb7d02d5b..c2f3fef574 100644 --- a/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala +++ b/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala @@ -1,5 +1,4 @@ package org.dbpedia.extraction.dataparser -package org.dbpedia.extraction.dataparser import _root_.org.scalatest.Matchers import org.scalatest.FlatSpec From 57ef5a642d242dfb27f39ffeb2e4ddc02e207e77 Mon Sep 17 00:00:00 2001 From: Dimitris Kontokostas Date: Tue, 8 Jun 2021 07:38:10 +0000 Subject: [PATCH 4/4] Github action minidumpdoc update --- dump/src/test/resources/shaclTestsCoverageTable.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dump/src/test/resources/shaclTestsCoverageTable.md b/dump/src/test/resources/shaclTestsCoverageTable.md index 3d974b2730..da8edcc1f5 100644 --- a/dump/src/test/resources/shaclTestsCoverageTable.md +++ b/dump/src/test/resources/shaclTestsCoverageTable.md @@ -62,8 +62,10 @@ wikipage-uri|shacl-test|issue|comment [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/last1](http://dbpedia.org/property/last1) #Citation_english_language_last1_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/last](http://dbpedia.org/property/last) #Citation_english_language_last_datatype_validation | +[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/page](http://dbpedia.org/property/page) #Citation_english_language_page_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/work](http://dbpedia.org/property/work) #Citation_english_language_work_datatype_validation | +[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/year](http://dbpedia.org/property/year) #Citation_english_languagа_year_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://www.w3.org/2003/01/geo/wgs84_pos#long](http://www.w3.org/2003/01/geo/wgs84_pos#long) #wgs84_lat_long | | generic test for range of wgs84 lat/long | [http://en.dbpedia.org/resource/Atlantic_Ocean](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Atlantic_Ocean&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | [http://en.dbpedia.org/resource/Atlantic_Ocean](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Atlantic_Ocean&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | @@ -117,12 +119,16 @@ wikipage-uri|shacl-test|issue|comment [http://en.dbpedia.org/resource/Kerala_Agricultural_University](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Kerala_Agricultural_University&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | [http://en.dbpedia.org/resource/Kerala_Agricultural_University](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Kerala_Agricultural_University&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | [http://en.dbpedia.org/resource/Kerala_Agricultural_University](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Kerala_Agricultural_University&revid=&format=trix&extractors=custom) | [http://www.w3.org/2003/01/geo/wgs84_pos#long](http://www.w3.org/2003/01/geo/wgs84_pos#long) #wgs84_lat_long | | generic test for range of wgs84 lat/long | -[http://en.dbpedia.org/resource/Mini_(Mark_I)](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Mini_(Mark_I)&revid=&format=trix&extractors=custom) | -[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | +[http://en.dbpedia.org/resource/Mini_(Mark_I)](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Mini_(Mark_I)&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | +[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | +[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | +[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | +[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/work](http://dbpedia.org/property/work) #Citation_english_language_work_datatype_validation | [http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | [http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | [http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/last](http://dbpedia.org/property/last) #Citation_english_language_last_datatype_validation | [http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | +[http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/work](http://dbpedia.org/property/work) #Citation_english_language_work_datatype_validation | [http://en.dbpedia.org/resource/Redd_Kross](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Redd_Kross&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | [http://en.dbpedia.org/resource/Redd_Kross](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Redd_Kross&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | [http://en.dbpedia.org/resource/Redd_Kross](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Redd_Kross&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/isbn](http://dbpedia.org/property/isbn) #en_property_isbn_citation |