diff --git a/package.json b/package.json index 47b9e478..db8a9cb1 100644 --- a/package.json +++ b/package.json @@ -34,6 +34,7 @@ "js-yaml": "^4.1.0", "node-fetch": "^3.3.0", "nunjucks": "^3.2.4", + "papaparse": "^5.4.1", "semver": "^7.5.3", "shelljs": "^0.8.5", "tslib": "^2.5.0" diff --git a/schemas/src/digital-objects/2d-ftu.yaml b/schemas/src/digital-objects/2d-ftu.yaml new file mode 100644 index 00000000..f53b0d45 --- /dev/null +++ b/schemas/src/digital-objects/2d-ftu.yaml @@ -0,0 +1,153 @@ +id: https://purl.humanatlas.io/specs/2d-ftu +name: ftu-2d +prefixes: + ccf: http://purl.org/ccf/ + dcterms: http://purl.org/dc/terms/ + pav: http://purl.org/pav/ + rdfs: http://www.w3.org/2000/01/rdf-schema# + rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# + xsd: http://www.w3.org/2001/XMLSchema# + obo: http://purl.obolibrary.org/obo/ + linkml: https://w3id.org/linkml/ + UBERON: http://purl.obolibrary.org/obo/UBERON_ + CL: http://purl.obolibrary.org/obo/CL_ + +default_prefix: ccf +default_range: string + +imports: + - linkml:types + - ../shared/metadata-base + +settings: + uberon: "UBERON" + fma: "FMA" + +classes: + Named: + mixin: true + slots: + - id + - label + - class_type + Instance: + mixin: true + slots: + - typeOf + annotations: + owl: Individual + + FtuIllustration: + class_uri: ccf:FtuIllustration + mixins: + - Named + - Instance + slots: + - located_in + - image_file + - illustration_node + annotations: + owl: NamedIndividual + + FtuIllustrationFile: + class_uri: ccf:FtuIllustrationFile + mixins: + - Named + - Instance + slots: + - file_url + - file_format + annotations: + owl: NamedIndividual + + FtuIllustrationNode: + class_uri: ccf:FtuIllustrationNode + mixins: + - Named + - Instance + slots: + - node_name + - part_of_illustration + annotations: + owl: NamedIndividual + + AnatomicalStructure: + slots: + - id + slot_usage: + id: + structured_pattern: + syntax: "({uberon}|{fma}):\\d+" + interpolated: true + partial_match: false + + Container: + tree_root: true 
+ attributes: + iri: + range: uriorcurie + metadata: + range: DatasetMetadata + data: + multivalued: true + inlined_as_list: true + range: FtuIllustration + annotations: + owl.template: |- + AnnotationAssertion( dct:title {{iri}} "{{metadata.title}}" ) + AnnotationAssertion( dct:description {{iri}} "{{metadata.description}}" ) + {% for c in metadata.creators %} + AnnotationAssertion( dct:creator {{iri}} "{{c.fullName}} ({{c.orcid}})" ) + {% endfor %} + AnnotationAssertion( schema:version {{iri}} "{{metadata.version}}" ) + AnnotationAssertion( schema:dateCreated {{iri}} "{{metadata.creation_date}}" ) + AnnotationAssertion( dct:license {{iri}} "{{metadata.license}}" ) + AnnotationAssertion( dct:publisher {{iri}} "{{metadata.publisher}}" ) + AnnotationAssertion( rdfs:seeAlso {{iri}} "{{metadata.see_also}}" ) + +slots: + id: + identifier: true + range: string + label: + slot_uri: rdfs:label + annotations: + owl: AnnotationAssertion + class_type: + designates_type: true + typeOf: + multivalued: true + range: Named + slot_uri: rdf:type + annotations: + owl: ClassAssertion + located_in: + slot_uri: ccf:ccf_located_in + range: AnatomicalStructure + annotations: + owl: AnnotationAssertion + image_file: + range: FtuIllustrationFile + multivalued: true + inlined_as_list: true + annotations: + owl: AnnotationAssertion + file_url: + range: uri + annotations: + owl: AnnotationAssertion + file_format: + annotations: + owl: AnnotationAssertion + illustration_node: + range: FtuIllustrationNode + multivalued: true + inlined_as_list: true + annotations: + owl: AnnotationAssertion + node_name: + annotations: + owl: AnnotationAssertion + part_of_illustration: + annotations: + owl: AnnotationAssertion diff --git a/schemas/src/metadata/2d-ftu-metadata.yaml b/schemas/src/metadata/2d-ftu-metadata.yaml new file mode 100644 index 00000000..16ea2495 --- /dev/null +++ b/schemas/src/metadata/2d-ftu-metadata.yaml @@ -0,0 +1,60 @@ +id: https://purl.humanatlas.io/specs/2d-ftu-metadata +name: 
ftu-2d-metadata +prefixes: + ccf: http://purl.org/ccf/ + pav: http://purl.org/pav/ + dcat: http://www.w3.org/ns/dcat# + dct: http://purl.org/dc/terms/ + foaf: http://xmlns.com/foaf/0.1/ + schema: http://schema.org/ + rdfs: http://www.w3.org/2000/01/rdf-schema# + xsd: http://www.w3.org/2001/XMLSchema# + linkml: https://w3id.org/linkml/ + +default_prefix: dcat +default_range: string + +imports: + - linkml:types + - ../shared/metadata-base + +classes: + Container: + tree_root: true + class_uri: dcat:Dataset + slots: + - iri + - title + - description + - creators + - project_leads + - reviewers + - externalReviewers + - version + - creation_date + - license + - publisher + - funders + - hubmapId + - doi + - citation + - citationOverall + - datatable + - distributions + slot_usage: + iri: + required: true + title: + required: true + description: + required: true + creators: + required: true + version: + required: true + creation_date: + required: true + license: + required: true + distributions: + required: true diff --git a/schemas/src/shared/metadata-base.yaml b/schemas/src/shared/metadata-base.yaml index 827efaf2..a4858c4e 100644 --- a/schemas/src/shared/metadata-base.yaml +++ b/schemas/src/shared/metadata-base.yaml @@ -39,6 +39,17 @@ classes: - downloadUrl - accessUrl - mediaType + DatasetMetadata: + class_uri: dcat:Dataset + slots: + - title + - description + - creators + - version + - creation_date + - license + - publisher + - see_also slots: iri: diff --git a/src/enrichment/enrich-2d-ftu.js b/src/enrichment/enrich-2d-ftu.js new file mode 100644 index 00000000..d47eb865 --- /dev/null +++ b/src/enrichment/enrich-2d-ftu.js @@ -0,0 +1,116 @@ +import fs from 'fs'; +import { resolve } from 'path'; +import { error, header, info, more } from '../utils/logging.js'; +import { convert, filter, merge, query } from '../utils/robot.js'; +import { throwOnError } from '../utils/sh-exec.js'; +import { + cleanTemporaryFiles, + convertNormalizedMetadataToRdf, + 
convertNormalizedDataToOwl, + logOutput +} from './utils.js'; + +export function enrich2dFtuMetadata(context) { + const { selectedDigitalObject: obj } = context; + const normalizedPath = resolve(obj.path, 'normalized/normalized-metadata.yaml'); + const enrichedPath = resolve(obj.path, 'enriched/enriched-metadata.ttl'); + convertNormalizedMetadataToRdf(context, normalizedPath, enrichedPath); +} + +export function enrich2dFtuData(context) { + try { + const { selectedDigitalObject: obj } = context; + + // Convert normalized data to graph data (.ttl) + const normalizedPath = resolve(obj.path, 'normalized/normalized.yaml'); + const baseInputPath = resolve(obj.path, 'enriched/base-input.ttl'); + convertNormalizedDataToOwl(context, normalizedPath, baseInputPath); + logOutput(baseInputPath); + + const inputPaths = []; // variable to hold input files for merging + + const enrichedWithOntologyPath = resolve(obj.path, 'enriched/enriched-with-ontology.owl'); + + inputPaths.push(baseInputPath); // Start the merge list with the base input + + info('Getting concept details from reference ontologies...'); + const uberonEntitiesPath = collectEntities(context, 'uberon', baseInputPath); + if (!isFileEmpty(uberonEntitiesPath)) { + info('Extracting UBERON.'); + const uberonExtractPath = filterClasses(context, 'uberon', uberonEntitiesPath); + logOutput(uberonExtractPath); + inputPaths.push(uberonExtractPath); + } + + const fmaEntitiesPath = collectEntities(context, 'fma', baseInputPath); + if (!isFileEmpty(fmaEntitiesPath)) { + info('Extracting FMA.'); + const fmaExtractPath = filterClasses(context, 'fma', fmaEntitiesPath); + logOutput(fmaExtractPath); + inputPaths.push(fmaExtractPath); + } + + const clEntitiesPath = collectEntities(context, 'cl', baseInputPath); + if (!isFileEmpty(clEntitiesPath)) { + info('Extracting CL.'); + const clExtractPath = filterClasses(context, 'cl', clEntitiesPath); + logOutput(clExtractPath); + inputPaths.push(clExtractPath); + } + + const 
pclEntitiesPath = collectEntities(context, 'pcl', baseInputPath); + if (!isFileEmpty(pclEntitiesPath)) { + info('Extracting PCL.'); + const pclExtractPath = filterClasses(context, 'pcl', pclEntitiesPath); + logOutput(pclExtractPath); + inputPaths.push(pclExtractPath); + } + + info('Merging files:'); + for (const inputPath of inputPaths) { + more(` -> ${inputPath}`); + } + merge(inputPaths, enrichedWithOntologyPath); + logOutput(enrichedWithOntologyPath); + + const enrichedPath = resolve(obj.path, 'enriched/enriched.ttl'); + + info(`Creating 2d-ftu: ${enrichedPath}`); + convert(enrichedWithOntologyPath, enrichedPath, 'ttl'); + + } catch (e) { + error(e); + } finally { + // Clean up + info('Cleaning up temporary files...'); + cleanTemporaryFiles(context); + more("Done."); + } +} + +function isFileEmpty(path) { + return fs.statSync(path).size === 0; +} + +function collectEntities(context, ontologyName, inputPath) { + const { selectedDigitalObject: obj, processorHome } = context; + + const queryPath = resolve(processorHome, `src/utils/get-${ontologyName}-terms.sparql`); + const outputPath = resolve(obj.path, `enriched/${ontologyName}-terms.csv`); + + query(inputPath, queryPath, outputPath); + throwOnError(`sed -i '1d' ${outputPath}`, 'Collect entities failed.'); + + return outputPath; +} + +function filterClasses(context, ontologyName, classTermFile) { + const { selectedDigitalObject: obj, processorHome } = context; + + const ontologyPath = resolve(processorHome, `mirrors/${ontologyName}.owl`); + const outputPath = resolve(obj.path, `enriched/${ontologyName}-filter.owl`); + + filter(ontologyPath, classTermFile, ['rdfs:label', 'http://www.geneontology.org/formats/oboInOwl#id', 'http://purl.obolibrary.org/obo/IAO_0000115'], outputPath); + + return outputPath; +} diff --git a/src/enrichment/enrich.js b/src/enrichment/enrich.js index 9de1606e..0234f991 100644 --- a/src/enrichment/enrich.js +++ b/src/enrichment/enrich.js @@ -2,6 +2,7 @@ import { resolve } from 'path'; import 
sh from 'shelljs'; import { enrichAsctbMetadata, enrichAsctbData } from './enrich-asct-b.js'; import { enrichRefOrganMetadata, enrichRefOrganData } from './enrich-ref-organ.js'; +import { enrich2dFtuMetadata, enrich2dFtuData } from './enrich-2d-ftu.js'; import { enrichCollectionMetadata, enrichCollectionData } from './enrich-collection.js'; import { header } from '../utils/logging.js'; @@ -18,6 +19,10 @@ export function enrich(context) { enrichRefOrganMetadata(context); enrichRefOrganData(context); break; + case '2d-ftu': + enrich2dFtuMetadata(context); + enrich2dFtuData(context); + break; case 'collection': enrichCollectionMetadata(context); enrichCollectionData(context); diff --git a/src/normalization/normalize-2d-ftu.js b/src/normalization/normalize-2d-ftu.js new file mode 100644 index 00000000..1f723716 --- /dev/null +++ b/src/normalization/normalize-2d-ftu.js @@ -0,0 +1,117 @@ +import { readFileSync, writeFileSync } from 'fs'; +import { dump } from 'js-yaml'; +import { resolve } from 'path'; +import sh from 'shelljs'; +import Papa from 'papaparse'; +import { info, more, warning } from '../utils/logging.js'; +import { + readMetadata, + readLocalData, + writeNormalizedMetadata, + writeNormalizedData, + getMetadataIri, + getDataDistributions +} from './utils.js'; + +export function normalize2dFtuMetadata(context) { + const rawMetadata = readMetadata(context); + const normalizedMetadata = normalizeMetadata(context, rawMetadata); + writeNormalizedMetadata(context, normalizedMetadata); +} + +function normalizeMetadata(context, metadata) { + const normalizedMetadata = { + iri: getMetadataIri(context), + ...metadata, + datatable: normalizeDatatable(context, metadata.datatable), + distributions: getDataDistributions(context) + }; + delete normalizedMetadata.type; + delete normalizedMetadata.name; + return normalizedMetadata; +} + +function normalizeDatatable(context, datatable) { + const { type, name, version } = context.selectedDigitalObject; + return 
datatable.map(item => `https://cdn.humanatlas.io/digital-objects/${type}/${name}/${version}/${item}`); +} + +export async function normalize2dFtuData(context) { + const rawData = await getRawData(context); + const rawMetadata = readMetadata(context); + const normalizedData = normalizeData(context, rawMetadata, rawData); + writeNormalizedData(context, normalizedData); +} + +async function getRawData(context) { + const crosswalk = readLocalData(context, "crosswalk.csv", + (csvData) => Papa.parse( + csvData.toString(), { + header: true, + skipEmptyLines: true + })); + return crosswalk.data; +} + +function normalizeData(context, metadata, data) { + const { iri, name } = context.selectedDigitalObject; + const illustrationName = name.replace(/-/g, " "); + const illustrationRepresentation = data[0]['tissue_mapped_to']; + return [{ + id: `${iri}#illustration`, + label: `An illustration of ${illustrationName}`, + class_type: 'FtuIllustration', + typeOf: [ 'FtuIllustration', illustrationRepresentation ], + located_in: data[0]['organ_mapped_to'], + image_file: normalizeIllustrationImage(context, metadata), + illustration_node: normalizeIllustrationNode(context, data) + }]; +} + +function normalizeIllustrationImage(context, metadata) { + const { iri, type, name, version } = context.selectedDigitalObject; + const datatable = metadata['datatable']; + return datatable.filter(item => item.split('.').pop() !== "csv") + .map(item => { + const fileType = item.split('.').pop(); + const fileTypeName = fileType.toUpperCase(); + const illustrationName = name.replace(/-/g, " "); + return { + id: `${iri}#${fileType}`, + label: `${fileTypeName} image of ${illustrationName}`, + class_type: 'FtuIllustrationFile', + typeOf: [ 'FtuIllustrationFile' ], + file_url: `https://cdn.humanatlas.io/digital-objects/${type}/${name}/${version}/${item}`, + file_format: getMimeType(fileType) + } + }); +} + +function getMimeType(fileType) { + if (fileType === 'svg') { + return 'image/svg+xml'; + } else if 
(fileType === 'ai') { + return 'application/pdf'; + } else if (fileType === 'png') { + return 'image/png'; + } else { + return 'image/other'; + } +} + +function normalizeIllustrationNode(context, data) { + const { iri } = context.selectedDigitalObject; + return data.map(item => { + const nodeId = item['node_id']; + const nodeLabel = item['node_id'].replace(/_/g, " ").toLowerCase(); + const nodeRepresentation = item['node_mapped_to']; + return { + id: `${iri}#${nodeId}`, + label: `An illustration node of ${nodeLabel}`, + class_type: 'FtuIllustrationNode', + typeOf: [ 'FtuIllustrationNode', nodeRepresentation ], + node_name: nodeId, + part_of_illustration: `${iri}#illustration` + } + }); +} diff --git a/src/normalization/normalize.js b/src/normalization/normalize.js index b036fbe2..af83789c 100644 --- a/src/normalization/normalize.js +++ b/src/normalization/normalize.js @@ -3,6 +3,7 @@ import sh from 'shelljs'; import { normalizeAsctbMetadata, normalizeAsctbData } from './normalize-asct-b.js'; import { normalizeCollectionMetadata, normalizeCollectionData } from './normalize-collection.js'; import { normalizeRefOrganMetadata, normalizeRefOrganData } from './normalize-ref-organ.js'; +import { normalize2dFtuMetadata, normalize2dFtuData } from './normalize-2d-ftu.js'; import { validateNormalizedMetadata, validateNormalizedData } from '../utils/validation.js'; import { header } from '../utils/logging.js'; @@ -19,6 +20,10 @@ export async function normalize(context) { normalizeRefOrganMetadata(context); await normalizeRefOrganData(context); break; + case '2d-ftu': + normalize2dFtuMetadata(context); + await normalize2dFtuData(context); + break; case 'collection': normalizeCollectionMetadata(context); normalizeCollectionData(context); diff --git a/src/normalization/utils.js b/src/normalization/utils.js index c4579235..98ac4952 100644 --- a/src/normalization/utils.js +++ b/src/normalization/utils.js @@ -11,6 +11,11 @@ export function readMetadata(context) { } } +export 
function readLocalData(context, fileName, parse) { + const { path } = context.selectedDigitalObject; + return parse(readFileSync(resolve(path, `raw/${fileName}`))); +} + export function writeNormalizedMetadata(context, metadata) { const { path } = context.selectedDigitalObject; const normalizedPath = resolve(path, 'normalized/normalized-metadata.yaml'); diff --git a/src/utils/get-cl-terms.sparql b/src/utils/get-cl-terms.sparql index fd99c04a..9d3951df 100644 --- a/src/utils/get-cl-terms.sparql +++ b/src/utils/get-cl-terms.sparql @@ -13,10 +13,4 @@ WHERE { FILTER(STRSTARTS(STR(?o),"http://purl.obolibrary.org/obo/CL_")) } } - UNION - { - SELECT ?entity WHERE { - BIND("http://purl.obolibrary.org/obo/CL_0000000" AS ?entity) - } - } } \ No newline at end of file diff --git a/src/utils/get-fma-terms.sparql b/src/utils/get-fma-terms.sparql index 2fb8e180..12f37bcc 100644 --- a/src/utils/get-fma-terms.sparql +++ b/src/utils/get-fma-terms.sparql @@ -13,10 +13,4 @@ WHERE { FILTER(STRSTARTS(STR(?o),"http://purl.org/sig/ont/fma/fma")) } } - UNION - { - SELECT ?entity WHERE { - BIND("http://purl.org/sig/ont/fma/fma62955" AS ?entity) - } - } } \ No newline at end of file diff --git a/src/utils/get-hgnc-terms.sparql b/src/utils/get-hgnc-terms.sparql index ece336aa..b41a663a 100644 --- a/src/utils/get-hgnc-terms.sparql +++ b/src/utils/get-hgnc-terms.sparql @@ -13,10 +13,4 @@ WHERE { FILTER(STRSTARTS(STR(?o),"http://identifiers.org/hgnc/")) } } - UNION - { - SELECT ?entity WHERE { - BIND("http://purl.bioontology.org/ontology/HGNC/gene" AS ?entity) - } - } } \ No newline at end of file diff --git a/src/utils/get-lmha-terms.sparql b/src/utils/get-lmha-terms.sparql index 23b783ef..70e0fd42 100644 --- a/src/utils/get-lmha-terms.sparql +++ b/src/utils/get-lmha-terms.sparql @@ -13,10 +13,4 @@ WHERE { FILTER(STRSTARTS(STR(?o),"http://purl.obolibrary.org/obo/LMHA_")) } } - UNION - { - SELECT ?entity WHERE { - BIND("http://purl.obolibrary.org/obo/LMHA_00135" AS ?entity) 
- } - } } \ No newline at end of file diff --git a/src/utils/get-pcl-terms.sparql b/src/utils/get-pcl-terms.sparql index f4c78672..ce915bea 100644 --- a/src/utils/get-pcl-terms.sparql +++ b/src/utils/get-pcl-terms.sparql @@ -13,10 +13,4 @@ WHERE { FILTER(STRSTARTS(STR(?o),"http://purl.obolibrary.org/obo/PCL_")) } } - UNION - { - SELECT ?entity WHERE { - BIND("http://purl.obolibrary.org/obo/CL_0000000" AS ?entity) - } - } } \ No newline at end of file diff --git a/src/utils/get-uberon-terms.sparql b/src/utils/get-uberon-terms.sparql index 1ac5ea74..010dbc24 100644 --- a/src/utils/get-uberon-terms.sparql +++ b/src/utils/get-uberon-terms.sparql @@ -13,10 +13,4 @@ WHERE { FILTER(STRSTARTS(STR(?o),"http://purl.obolibrary.org/obo/UBERON_")) } } - UNION - { - SELECT ?entity WHERE { - BIND("http://purl.obolibrary.org/obo/UBERON_0001062" AS ?entity) - } - } } \ No newline at end of file diff --git a/src/utils/robot.js b/src/utils/robot.js index 9d227d75..bd47ce0a 100644 --- a/src/utils/robot.js +++ b/src/utils/robot.js @@ -22,6 +22,17 @@ export function extract(input, upperTerm, lowerTerms, output, outputFormat="owl" ); } +export function filter(input, anyTerms, annotationTerms=['rdfs:label'], output, outputFormat="owl") { + const termArguments = annotationTerms.map(term => `--term ${term}`).join(" "); + throwOnError( + `robot filter -i ${input} \ + --include-terms ${anyTerms} \ + ${termArguments} \ + convert --format ${outputFormat} -o ${output}`, + 'Class(es) extraction failed. See errors above.' + ); +} + export function merge(inputs, output, outputFormat="owl") { // Convert the inputs to OWL/XML format to avoid blank node collisions const owlInputs = [];