From f0013a1a3f870915c6d74e2f4e3926837f1e2a95 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 8 Sep 2024 04:33:47 +0100 Subject: [PATCH 1/9] Add rdfs:label and skos:definition Signed-off-by: Arthit Suriyawongkul --- spec_parser/rdf.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/spec_parser/rdf.py b/spec_parser/rdf.py index 1d34e2b..c60923b 100644 --- a/spec_parser/rdf.py +++ b/spec_parser/rdf.py @@ -59,6 +59,16 @@ def gen_rdf_ontology(model): g.add((node, RDF.type, OWL.Ontology)) g.add((node, OWL.versionIRI, node)) g.add((node, RDFS.label, Literal("System Package Data Exchange (SPDX) Ontology", lang="en"))) + g.add( + ( + node, + SKOS.definition, + Literal( + "This ontology defines the terms and relationships used in the SPDX specification to describe system packages", + lang="en", + ), + ) + ) g.add( ( node, @@ -74,7 +84,16 @@ def gen_rdf_ontology(model): g.add((node, DCTERMS.license, URIRef("https://spdx.org/licenses/Community-Spec-1.0.html"))) g.add((node, DCTERMS.references, URIRef("https://spdx.dev/specifications/"))) g.add((node, DCTERMS.title, Literal("System Package Data Exchange (SPDX) Ontology", lang="en"))) - g.add((node, OMG_ANN.copyright, Literal("Copyright (C) 2024 SPDX Project", lang="en"))) + g.add( + ( + node, + OMG_ANN.copyright, + Literal( + "Copyright (C) 2010-2024, The Linux Foundation and its Contributors, including SPDX Model contributions from OMG and its Contributors.", + lang="en", + ), + ) + ) gen_rdf_classes(model, g) gen_rdf_properties(model, g) @@ -92,8 +111,12 @@ def gen_rdf_classes(model, g): for c in model.classes.values(): node = URIRef(c.iri) g.add((node, RDF.type, OWL.Class)) + if c.name: + g.add((node, RDFS.label, Literal(c.name))) if c.summary: g.add((node, RDFS.comment, Literal(c.summary, lang="en"))) + if c.description: + g.add((node, SKOS.definition, Literal(c.description, lang="en"))) parent = c.metadata.get("SubclassOf") if parent: pns = "" if 
parent.startswith("/") else f"/{c.ns.name}/" @@ -164,8 +187,12 @@ def gen_rdf_properties(model, g): if fqname == "/Core/spdxId": continue node = URIRef(p.iri) + if p.name: + g.add((node, RDFS.label, Literal(p.name))) if p.summary: g.add((node, RDFS.comment, Literal(p.summary, lang="en"))) + if p.description: + g.add((node, SKOS.definition, Literal(p.description, lang="en"))) if p.metadata["Nature"] == "ObjectProperty": g.add((node, RDF.type, OWL.ObjectProperty)) # to add: g.add((node, RDFS.domain, xxx)) @@ -192,22 +219,31 @@ def gen_rdf_vocabularies(model, g): for v in model.vocabularies.values(): node = URIRef(v.iri) g.add((node, RDF.type, OWL.Class)) + if v.name: + g.add((node, RDFS.label, Literal(v.name))) if v.summary: g.add((node, RDFS.comment, Literal(v.summary, lang="en"))) + if v.description: + g.add((node, SKOS.definition, Literal(v.description, lang="en"))) for e, d in v.entries.items(): enode = URIRef(v.iri + "/" + e) g.add((enode, RDF.type, OWL.NamedIndividual)) g.add((enode, RDF.type, node)) g.add((enode, RDFS.label, Literal(e))) g.add((enode, RDFS.comment, Literal(d, lang="en"))) + g.add((enode, SKOS.definition, Literal(d, lang="en"))) def gen_rdf_individuals(model, g): for i in model.individuals.values(): node = URIRef(i.iri) g.add((node, RDF.type, OWL.NamedIndividual)) + if i.name: + g.add((node, RDFS.label, Literal(i.name))) if i.summary: g.add((node, RDFS.comment, Literal(i.summary, lang="en"))) + if i.description: + g.add((node, SKOS.definition, Literal(i.description, lang="en"))) typ = i.metadata["type"] typename = "" if typ.startswith("/") else f"/{i.ns.name}/" typename += typ From 3295d7e5664b89ed58758e84197a25bfefee6f62 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 8 Sep 2024 04:48:54 +0100 Subject: [PATCH 2/9] Use skos:note for description Signed-off-by: Arthit Suriyawongkul --- spec_parser/rdf.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spec_parser/rdf.py b/spec_parser/rdf.py index 
c60923b..0544e0c 100644 --- a/spec_parser/rdf.py +++ b/spec_parser/rdf.py @@ -115,8 +115,9 @@ def gen_rdf_classes(model, g): g.add((node, RDFS.label, Literal(c.name))) if c.summary: g.add((node, RDFS.comment, Literal(c.summary, lang="en"))) + g.add((node, SKOS.definition, Literal(c.summary, lang="en"))) if c.description: - g.add((node, SKOS.definition, Literal(c.description, lang="en"))) + g.add((node, SKOS.note, Literal(c.description, lang="en"))) parent = c.metadata.get("SubclassOf") if parent: pns = "" if parent.startswith("/") else f"/{c.ns.name}/" @@ -191,8 +192,9 @@ def gen_rdf_properties(model, g): g.add((node, RDFS.label, Literal(p.name))) if p.summary: g.add((node, RDFS.comment, Literal(p.summary, lang="en"))) + g.add((node, SKOS.definition, Literal(p.summary, lang="en"))) if p.description: - g.add((node, SKOS.definition, Literal(p.description, lang="en"))) + g.add((node, SKOS.note, Literal(p.description, lang="en"))) if p.metadata["Nature"] == "ObjectProperty": g.add((node, RDF.type, OWL.ObjectProperty)) # to add: g.add((node, RDFS.domain, xxx)) @@ -223,8 +225,9 @@ def gen_rdf_vocabularies(model, g): g.add((node, RDFS.label, Literal(v.name))) if v.summary: g.add((node, RDFS.comment, Literal(v.summary, lang="en"))) + g.add((node, SKOS.definition, Literal(v.summary, lang="en"))) if v.description: - g.add((node, SKOS.definition, Literal(v.description, lang="en"))) + g.add((node, SKOS.note, Literal(v.description, lang="en"))) for e, d in v.entries.items(): enode = URIRef(v.iri + "/" + e) g.add((enode, RDF.type, OWL.NamedIndividual)) @@ -242,8 +245,9 @@ def gen_rdf_individuals(model, g): g.add((node, RDFS.label, Literal(i.name))) if i.summary: g.add((node, RDFS.comment, Literal(i.summary, lang="en"))) + g.add((node, SKOS.definition, Literal(i.summary, lang="en"))) if i.description: - g.add((node, SKOS.definition, Literal(i.description, lang="en"))) + g.add((node, SKOS.note, Literal(i.description, lang="en"))) typ = i.metadata["type"] typename = "" if 
typ.startswith("/") else f"/{i.ns.name}/" typename += typ From 6d91b6e5b2f04889b0e83408706dbfa57e22929c Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 9 Sep 2024 00:05:42 +0100 Subject: [PATCH 3/9] Reformat links + remove code block/code inline markups Signed-off-by: Arthit Suriyawongkul --- spec_parser/rdf.py | 30 ++++++++++++++++-------------- spec_parser/util.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 14 deletions(-) create mode 100644 spec_parser/util.py diff --git a/spec_parser/rdf.py b/spec_parser/rdf.py index 0544e0c..11646f0 100644 --- a/spec_parser/rdf.py +++ b/spec_parser/rdf.py @@ -17,6 +17,8 @@ from rdflib.namespace import DCTERMS, OWL, RDF, RDFS, SH, SKOS, XSD from rdflib.tools.rdf2dot import rdf2dot +from .util import unmarkdown + URI_BASE = "https://spdx.org/rdf/3.0.1/terms/" @@ -114,10 +116,10 @@ def gen_rdf_classes(model, g): if c.name: g.add((node, RDFS.label, Literal(c.name))) if c.summary: - g.add((node, RDFS.comment, Literal(c.summary, lang="en"))) - g.add((node, SKOS.definition, Literal(c.summary, lang="en"))) + g.add((node, RDFS.comment, Literal(unmarkdown(c.summary), lang="en"))) + g.add((node, SKOS.definition, Literal(unmarkdown(c.summary), lang="en"))) if c.description: - g.add((node, SKOS.note, Literal(c.description, lang="en"))) + g.add((node, SKOS.note, Literal(unmarkdown(c.description), lang="en"))) parent = c.metadata.get("SubclassOf") if parent: pns = "" if parent.startswith("/") else f"/{c.ns.name}/" @@ -191,10 +193,10 @@ def gen_rdf_properties(model, g): if p.name: g.add((node, RDFS.label, Literal(p.name))) if p.summary: - g.add((node, RDFS.comment, Literal(p.summary, lang="en"))) - g.add((node, SKOS.definition, Literal(p.summary, lang="en"))) + g.add((node, RDFS.comment, Literal(unmarkdown(p.summary), lang="en"))) + g.add((node, SKOS.definition, Literal(unmarkdown(p.summary), lang="en"))) if p.description: - g.add((node, SKOS.note, Literal(p.description, lang="en"))) + g.add((node, 
SKOS.note, Literal(unmarkdown(p.description), lang="en"))) if p.metadata["Nature"] == "ObjectProperty": g.add((node, RDF.type, OWL.ObjectProperty)) # to add: g.add((node, RDFS.domain, xxx)) @@ -224,17 +226,17 @@ def gen_rdf_vocabularies(model, g): if v.name: g.add((node, RDFS.label, Literal(v.name))) if v.summary: - g.add((node, RDFS.comment, Literal(v.summary, lang="en"))) - g.add((node, SKOS.definition, Literal(v.summary, lang="en"))) + g.add((node, RDFS.comment, Literal(unmarkdown(v.summary), lang="en"))) + g.add((node, SKOS.definition, Literal(unmarkdown(v.summary), lang="en"))) if v.description: - g.add((node, SKOS.note, Literal(v.description, lang="en"))) + g.add((node, SKOS.note, Literal(unmarkdown(v.description), lang="en"))) for e, d in v.entries.items(): enode = URIRef(v.iri + "/" + e) g.add((enode, RDF.type, OWL.NamedIndividual)) g.add((enode, RDF.type, node)) g.add((enode, RDFS.label, Literal(e))) - g.add((enode, RDFS.comment, Literal(d, lang="en"))) - g.add((enode, SKOS.definition, Literal(d, lang="en"))) + g.add((enode, RDFS.comment, Literal(unmarkdown(d), lang="en"))) + g.add((enode, SKOS.definition, Literal(unmarkdown(d), lang="en"))) def gen_rdf_individuals(model, g): @@ -244,10 +246,10 @@ def gen_rdf_individuals(model, g): if i.name: g.add((node, RDFS.label, Literal(i.name))) if i.summary: - g.add((node, RDFS.comment, Literal(i.summary, lang="en"))) - g.add((node, SKOS.definition, Literal(i.summary, lang="en"))) + g.add((node, RDFS.comment, Literal(unmarkdown(i.summary), lang="en"))) + g.add((node, SKOS.definition, Literal(unmarkdown(i.summary), lang="en"))) if i.description: - g.add((node, SKOS.note, Literal(i.description, lang="en"))) + g.add((node, SKOS.note, Literal(unmarkdown(i.description), lang="en"))) typ = i.metadata["type"] typename = "" if typ.startswith("/") else f"/{i.ns.name}/" typename += typ diff --git a/spec_parser/util.py b/spec_parser/util.py new file mode 100644 index 0000000..c30190c --- /dev/null +++ b/spec_parser/util.py @@ 
-0,0 +1,29 @@ +import re +from typing import Pattern, cast + + +def _replace_text_link(match: re.Match) -> str: + # [text](link) -> text + # [text](insite_link) -> text + # [link](link) -> + text = str(match.group(1)) + link = str(match.group(2)) + if text.lower() == link.lower(): + return f"<{link}>" + elif link.startswith("."): + return f"{text}" + else: + return f"{text} <{link}>" + + +_replace_pairs = { + "text_link": {"pat": re.compile(r"\[(.*?)\]\((.*?)\)"), "repl": _replace_text_link}, + "code_block_markup": {"pat": re.compile(r"^```\S*\s*$", re.MULTILINE), "repl": ""}, + "code_inline_markup": {"pat": re.compile(r"`([^`]+)`"), "repl": r"\1"}, +} + + +def unmarkdown(text: str) -> str: + for pair in _replace_pairs.values(): + text = cast(Pattern, pair["pat"]).sub(pair["repl"], text) + return text From c13118ad7651acb6b235d8713dfe17bec1787562 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 9 Sep 2024 00:25:45 +0100 Subject: [PATCH 4/9] Deal with internal link Signed-off-by: Arthit Suriyawongkul --- spec_parser/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec_parser/util.py b/spec_parser/util.py index c30190c..9ea11f1 100644 --- a/spec_parser/util.py +++ b/spec_parser/util.py @@ -10,7 +10,7 @@ def _replace_text_link(match: re.Match) -> str: link = str(match.group(2)) if text.lower() == link.lower(): return f"<{link}>" - elif link.startswith("."): + elif link.startswith(".") or link.endswith(".md"): return f"{text}" else: return f"{text} <{link}>" From a8c8ed0bfc8d179d693cd276e880a9d82bdc57c0 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 10 Sep 2024 12:48:00 +0100 Subject: [PATCH 5/9] Add docstring Signed-off-by: Arthit Suriyawongkul --- spec_parser/util.py | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/spec_parser/util.py b/spec_parser/util.py index 9ea11f1..bda0930 100644 --- a/spec_parser/util.py +++ b/spec_parser/util.py @@ -2,28 +2,42 
@@ from typing import Pattern, cast -def _replace_text_link(match: re.Match) -> str: - # [text](link) -> text - # [text](insite_link) -> text - # [link](link) -> +def _unmarkdown_repl_text_url(match: re.Match) -> str: + # [text](url) -> text + # [text](url) -> text + # [url](url) -> text = str(match.group(1)) - link = str(match.group(2)) - if text.lower() == link.lower(): - return f"<{link}>" - elif link.startswith(".") or link.endswith(".md"): + url = str(match.group(2)) + if text.lower() == url.lower(): + return f"<{url}>" + elif url.startswith(".") or url.endswith(".md"): return f"{text}" else: - return f"{text} <{link}>" + return f"{text} <{url}>" -_replace_pairs = { - "text_link": {"pat": re.compile(r"\[(.*?)\]\((.*?)\)"), "repl": _replace_text_link}, - "code_block_markup": {"pat": re.compile(r"^```\S*\s*$", re.MULTILINE), "repl": ""}, - "code_inline_markup": {"pat": re.compile(r"`([^`]+)`"), "repl": r"\1"}, +# A list of regular expression and replacement string pairs, ordered by the +# sequence in which they should be applied to a Markdown text. +# Note that this assumes that the dict is ordered; +# dict is ordered since CPython 3.6 (unofficial) and all of Python 3.7 (official). +_unmakdown_rules = { + "repl_text_url": {"pat": re.compile(r"\[(.*?)\]\((.*?)\)"), "repl": _unmarkdown_repl_text_url}, + "rm_code_block_markup": {"pat": re.compile(r"^```\S*\s*$", re.MULTILINE), "repl": ""}, + "rm_code_inline_markup": {"pat": re.compile(r"`([^`]+)`"), "repl": r"\1"}, } def unmarkdown(text: str) -> str: - for pair in _replace_pairs.values(): + """ + Convert Markdown text to plain text by applying a series of + regular expression replacements. + + Args: + text (str): The Markdown text to be converted. + + Returns: + str: The plain text result. 
+ """ + for pair in _unmakdown_rules.values(): text = cast(Pattern, pair["pat"]).sub(pair["repl"], text) return text From efb659757f525eb24810623e362663a665135327 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 10 Sep 2024 13:00:14 +0100 Subject: [PATCH 6/9] Use array of tuples instead of dict Signed-off-by: Arthit Suriyawongkul --- spec_parser/util.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/spec_parser/util.py b/spec_parser/util.py index bda0930..cd97b82 100644 --- a/spec_parser/util.py +++ b/spec_parser/util.py @@ -1,11 +1,19 @@ +from __future__ import annotations + import re -from typing import Pattern, cast +from typing import Callable, Pattern, Tuple, Union + +ReplaceTuple = Tuple[Pattern[str], Union[str, Callable[[re.Match], str]]] def _unmarkdown_repl_text_url(match: re.Match) -> str: - # [text](url) -> text - # [text](url) -> text - # [url](url) -> + """ + Replacement function for Markdown links. + + [text](url) -> text + [text](url) -> text + [url](url) -> + """ text = str(match.group(1)) url = str(match.group(2)) if text.lower() == url.lower(): @@ -16,15 +24,11 @@ def _unmarkdown_repl_text_url(match: re.Match) -> str: return f"{text} <{url}>" -# A list of regular expression and replacement string pairs, ordered by the -# sequence in which they should be applied to a Markdown text. -# Note that this assumes that the dict is ordered; -# dict is ordered since CPython 3.6 (unofficial) and all of Python 3.7 (official). 
-_unmakdown_rules = { - "repl_text_url": {"pat": re.compile(r"\[(.*?)\]\((.*?)\)"), "repl": _unmarkdown_repl_text_url}, - "rm_code_block_markup": {"pat": re.compile(r"^```\S*\s*$", re.MULTILINE), "repl": ""}, - "rm_code_inline_markup": {"pat": re.compile(r"`([^`]+)`"), "repl": r"\1"}, -} +_unmakdown_rules: list[ReplaceTuple] = [ + (re.compile(r"\[(.*?)\]\((.*?)\)"), _unmarkdown_repl_text_url), # [text](url) replacements + (re.compile(r"^```\S*\s*$", re.MULTILINE), ""), # remove code block markup + (re.compile(r"`([^`]+)`"), r"\1"), # remove code inline markup +] def unmarkdown(text: str) -> str: @@ -38,6 +42,6 @@ def unmarkdown(text: str) -> str: Returns: str: The plain text result. """ - for pair in _unmakdown_rules.values(): - text = cast(Pattern, pair["pat"]).sub(pair["repl"], text) + for pattern, replacement in _unmakdown_rules: + text = pattern.sub(replacement, text) return text From d56a1dedf6165277e5b2b1eaf53fc6b7360b9280 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 10 Sep 2024 13:09:09 +0100 Subject: [PATCH 7/9] Add license line Signed-off-by: Arthit Suriyawongkul --- spec_parser/util.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/spec_parser/util.py b/spec_parser/util.py index cd97b82..7ffb9f1 100644 --- a/spec_parser/util.py +++ b/spec_parser/util.py @@ -1,3 +1,15 @@ +# SPDX-License-Identifier: Apache-2.0 + +""" +This module provides utility functions. + +Functions: + unmarkdown(text: str) -> str: Convert Markdown text to plain text. + +Types: + ReplaceTuple: A tuple containing a compiled regex pattern and a replacement string or function. +""" + from __future__ import annotations import re @@ -10,9 +22,9 @@ def _unmarkdown_repl_text_url(match: re.Match) -> str: """ Replacement function for Markdown links. 
- [text](url) -> text - [text](url) -> text - [url](url) -> + [text](url) -> text + [text](file.md) -> text + [url](url) -> """ text = str(match.group(1)) url = str(match.group(2)) From 8890d5a653947d6f71567c4fcf915d18e285c15e Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 10 Sep 2024 13:19:42 +0100 Subject: [PATCH 8/9] Remove newline after ``` Signed-off-by: Arthit Suriyawongkul --- spec_parser/util.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spec_parser/util.py b/spec_parser/util.py index 7ffb9f1..67db5cc 100644 --- a/spec_parser/util.py +++ b/spec_parser/util.py @@ -15,16 +15,16 @@ import re from typing import Callable, Pattern, Tuple, Union -ReplaceTuple = Tuple[Pattern[str], Union[str, Callable[[re.Match], str]]] +ReplaceTuple = Tuple[Pattern, Union[str, Callable[[re.Match], str]]] def _unmarkdown_repl_text_url(match: re.Match) -> str: """ Replacement function for Markdown links. - [text](url) -> text - [text](file.md) -> text - [url](url) -> + [text](url) -> text + [text](../file.md) -> text + [url](url) -> """ text = str(match.group(1)) url = str(match.group(2)) @@ -38,7 +38,7 @@ def _unmarkdown_repl_text_url(match: re.Match) -> str: _unmakdown_rules: list[ReplaceTuple] = [ (re.compile(r"\[(.*?)\]\((.*?)\)"), _unmarkdown_repl_text_url), # [text](url) replacements - (re.compile(r"^```\S*\s*$", re.MULTILINE), ""), # remove code block markup + (re.compile(r"^```\S*\s*\n?", re.MULTILINE), ""), # remove code block markup (re.compile(r"`([^`]+)`"), r"\1"), # remove code inline markup ] From fb3040d3bde3a90a16ecbb3fe4ae81ea7e0c1d43 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 10 Sep 2024 14:34:41 +0100 Subject: [PATCH 9/9] Rules comments Signed-off-by: Arthit Suriyawongkul --- spec_parser/util.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spec_parser/util.py b/spec_parser/util.py index 67db5cc..6d332d2 100644 --- a/spec_parser/util.py +++ b/spec_parser/util.py 
@@ -36,10 +36,15 @@ def _unmarkdown_repl_text_url(match: re.Match) -> str: return f"{text} <{url}>" +# A list of (regular expression, replacement string/function) pairs, listed +# in the order in which they are applied to a Markdown text. _unmakdown_rules: list[ReplaceTuple] = [ - (re.compile(r"\[(.*?)\]\((.*?)\)"), _unmarkdown_repl_text_url), # [text](url) replacements - (re.compile(r"^```\S*\s*\n?", re.MULTILINE), ""), # remove code block markup - (re.compile(r"`([^`]+)`"), r"\1"), # remove code inline markup + # [text](url) replacements + (re.compile(r"\[(.*?)\]\((.*?)\)"), _unmarkdown_repl_text_url), + # remove code block markup + (re.compile(r"^```\S*\s*\n?", re.MULTILINE), ""), + # remove inline code markup + (re.compile(r"`([^`]+)`"), r"\1"), ]