From cc911a37e5fbe2b2dd8c6d29b850d1fefa6a8d2b Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Mon, 16 Mar 2026 16:59:21 -0400 Subject: [PATCH] implement inside out croissant for review datasets #12124 --- .../export/croissant/CroissantExportUtil.java | 742 +++++++++--------- .../harvard/iq/dataverse/api/ReviewsIT.java | 18 + .../export/CroissantExporterSlimTest.java | 80 ++ .../review/expected/review-croissantSlim.json | 52 ++ .../croissant/review/in/dataCiteXml.xml | 31 + .../review/in/datasetFileDetails.json | 1 + .../croissant/review/in/datasetJson.json | 140 ++++ .../croissant/review/in/datasetORE.json | 68 ++ .../review/in/datasetSchemaDotOrg.json | 44 ++ 9 files changed, 819 insertions(+), 357 deletions(-) create mode 100644 src/test/resources/croissant/review/expected/review-croissantSlim.json create mode 100644 src/test/resources/croissant/review/in/dataCiteXml.xml create mode 100644 src/test/resources/croissant/review/in/datasetFileDetails.json create mode 100644 src/test/resources/croissant/review/in/datasetJson.json create mode 100644 src/test/resources/croissant/review/in/datasetORE.json create mode 100644 src/test/resources/croissant/review/in/datasetSchemaDotOrg.json diff --git a/src/main/java/edu/harvard/iq/dataverse/export/croissant/CroissantExportUtil.java b/src/main/java/edu/harvard/iq/dataverse/export/croissant/CroissantExportUtil.java index e9ba39eeba5..e3b1fd22079 100644 --- a/src/main/java/edu/harvard/iq/dataverse/export/croissant/CroissantExportUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/export/croissant/CroissantExportUtil.java @@ -1,5 +1,6 @@ package edu.harvard.iq.dataverse.export.croissant; +import edu.harvard.iq.dataverse.dataset.DatasetType; import io.gdcc.spi.export.ExportDataProvider; import io.gdcc.spi.export.ExportException; import jakarta.json.Json; @@ -87,390 +88,417 @@ public static void exportDataset( job.add("conformsTo", "http://mlcommons.org/croissant/1.1"); JsonObject datasetJson = 
dataProvider.getDatasetJson(); + String datasetType = datasetJson.getString("datasetType", null); JsonObject datasetORE = dataProvider.getDatasetORE(); JsonObject describes = datasetORE.getJsonObject("ore:describes"); - job.add("name", StringEscapeUtils.escapeHtml4(describes.getString("title"))); - job.add("url", describes.getJsonString("@id")); JsonObject datasetSchemaDotOrg = dataProvider.getDatasetSchemaDotOrg(); - // We don't escape DatasetSchemaDotOrg fields like creator, description, etc. because - // they are already escaped. - job.add("creator", datasetSchemaDotOrg.getJsonArray("creator")); - job.add("description", datasetSchemaDotOrg.getJsonString("description")); - job.add("keywords", datasetSchemaDotOrg.getJsonArray("keywords")); - job.add("license", datasetSchemaDotOrg.getString("license")); - String datePublished = datasetSchemaDotOrg.getString("datePublished", null); - if (datePublished != null) { - job.add("datePublished", datasetSchemaDotOrg.getString("datePublished")); - } - job.add("dateModified", datasetSchemaDotOrg.getString("dateModified")); - job.add( - "includedInDataCatalog", - datasetSchemaDotOrg.getJsonObject("includedInDataCatalog")); - job.add("publisher", datasetSchemaDotOrg.getJsonObject("publisher")); - - /** - * For "version", we are knowingly sending "1.0" rather than "1.0.0", even though - * MAJOR.MINOR.PATCH is recommended by the Croissant spec. We are aware that the - * Croissant validator throws a warning for anything other than MAJOR.MINOR.PATCH. See - * the README for a detailed explanation and the following issues: - * https://github.com/mlcommons/croissant/issues/609 - * https://github.com/mlcommons/croissant/issues/643 - */ - job.add("version", describes.getString("schema:version")); - /** - * We have been told that it's fine and appropriate to put the citation to the dataset - * itself into "citeAs". 
However, the spec says "citeAs" is "A citation for a - * publication that describes the dataset" so we have asked for clarification here: - * https://github.com/mlcommons/croissant/issues/638 - */ - job.add("citeAs", getBibtex(datasetORE, datasetJson, datasetSchemaDotOrg)); - - JsonArray funder = datasetSchemaDotOrg.getJsonArray("funder"); - if (funder != null) { - job.add("funder", funder); - } - - JsonArray spatialCoverage = datasetSchemaDotOrg.getJsonArray("spatialCoverage"); - if (spatialCoverage != null) { - job.add("spatialCoverage", spatialCoverage); - } - - JsonArray oreFiles = describes.getJsonArray("ore:aggregates"); - - // Create a map so that later we can use the storageIdentifier to lookup - // the position of the file in the array of files in the datasetORE format. - // We don't use checksum because it's possible for a dataset to have the - // same checksum for multiple files. - Map storageIdentifierToPositionInOre = new HashMap<>(); - for (int i = 0; i < oreFiles.size(); i++) { - JsonObject aggregate = oreFiles.getJsonObject(i); - String storageIdentifier = aggregate.getString("dvcore:storageIdentifier", null); - if (storageIdentifier != null) { - storageIdentifierToPositionInOre.put(storageIdentifier, i); - } - } - - JsonArrayBuilder distribution = Json.createArrayBuilder(); - JsonArrayBuilder recordSet = Json.createArrayBuilder(); - JsonArray datasetFileDetails = dataProvider.getDatasetFileDetails(); - for (JsonValue jsonValue : datasetFileDetails) { - - JsonObjectBuilder recordSetContent = Json.createObjectBuilder(); - recordSetContent.add("@type", "cr:RecordSet"); - JsonObject fileDetails = jsonValue.asJsonObject(); + if (datasetType != null && DatasetType.DATASET_TYPE_REVIEW.equals(datasetType)) { /** - * When there is an originalFileName, it means that the file has gone through ingest - * and that multiple files formats are available: original, tab-separated, and - * RData. 
Currently we are only showing the original file but we we could create - * additional cr:FileObject entries for tab-separated and RData as suggested in - * https://github.com/mlcommons/croissant/issues/641 . Should we? Is there interest - * in this? And would we duplicate all the cr:RecordSet entries (columns) with each - * additional format? Probably not as it would be the same. + * Review datasets are "inside out" in the sense that we don't present the normal + * metadata fields via Croissant. Instead, we attempt to represent the item being + * reviewed. The number of fields we can expose is limited, but we can expose + * itemReviewedUrl, for example. */ - String filename = - StringEscapeUtils.escapeHtml4( - fileDetails.getString("originalFileName", null)); - if (filename == null) { - filename = StringEscapeUtils.escapeHtml4(fileDetails.getString("filename")); - } - String fileFormat = null; - // Use the original file format, if available, since that's where the - // contentUrl will point. - String originalFileFormat = fileDetails.getString("originalFileFormat", null); - if (originalFileFormat != null) { - if ("text/tsv".equals(originalFileFormat)) { - // "text/tsv" is an internal format used by Dataverse while - // "text/tab-separated-values" is the official IANA format - // that we present to the outside world - // See https://github.com/IQSS/dataverse/issues/11505 and - // https://www.iana.org/assignments/media-types/media-types.xhtml - fileFormat = "text/tab-separated-values"; - } else { - fileFormat = originalFileFormat; - } - } - if (fileFormat == null) { - fileFormat = fileDetails.getString("contentType"); - } - JsonNumber fileSize = fileDetails.getJsonNumber("originalFileSize"); - if (fileSize == null) { - fileSize = fileDetails.getJsonNumber("filesize"); + job.add( + "url", + describes + .getJsonObject("review:itemReviewed") + .getJsonString("review:itemReviewedUrl")); + } else { + // This is something other than a review dataset. 
Display all the normal Croissant + // fields. + job.add("name", StringEscapeUtils.escapeHtml4(describes.getString("title"))); + job.add("url", describes.getJsonString("@id")); + + // We don't escape DatasetSchemaDotOrg fields like creator, description, etc. + // because + // they are already escaped. + job.add("creator", datasetSchemaDotOrg.getJsonArray("creator")); + job.add("description", datasetSchemaDotOrg.getJsonString("description")); + job.add("keywords", datasetSchemaDotOrg.getJsonArray("keywords")); + job.add("license", datasetSchemaDotOrg.getString("license")); + String datePublished = datasetSchemaDotOrg.getString("datePublished", null); + if (datePublished != null) { + job.add("datePublished", datasetSchemaDotOrg.getString("datePublished")); } + job.add("dateModified", datasetSchemaDotOrg.getString("dateModified")); + job.add( + "includedInDataCatalog", + datasetSchemaDotOrg.getJsonObject("includedInDataCatalog")); + job.add("publisher", datasetSchemaDotOrg.getJsonObject("publisher")); /** - * We make contentSize a String ( https://schema.org/Text ) rather than a number - * (JsonNumber) to pass the Croissant validator and comply with the spec. We don't - * include a unit because the spec says "Defaults to bytes if a unit is not - * specified." + * For "version", we are knowingly sending "1.0" rather than "1.0.0", even though + * MAJOR.MINOR.PATCH is recommended by the Croissant spec. We are aware that the + * Croissant validator throws a warning for anything other than MAJOR.MINOR.PATCH. 
+ * See the README for a detailed explanation and the following issues: + * https://github.com/mlcommons/croissant/issues/609 + * https://github.com/mlcommons/croissant/issues/643 */ - String fileSizeInBytes = fileSize.toString(); - JsonObject checksum = fileDetails.getJsonObject("checksum"); - // Out of the box the checksum type will be md5 - String checksumType = checksum.getString("type").toLowerCase(); - String checksumValue = checksum.getString("value"); - String storageIdentifier = fileDetails.getString("storageIdentifier"); - int positionInOre = storageIdentifierToPositionInOre.get(storageIdentifier); - String contentUrl = - oreFiles.getJsonObject(positionInOre).getString("schema:sameAs"); - String description = - StringEscapeUtils.escapeHtml4(fileDetails.getString("description", "")); + job.add("version", describes.getString("schema:version")); + /** - * See https://github.com/mlcommons/croissant/issues/639 for discussion with the - * Croissant spec leads on what to put in - * - * @id (path/to/file.txt). - *

It's suboptimal that the directoryLabel isn't already included in - * dataProvider.getDatasetFileDetails(). If it gets added as part of the - * following issue, we can get it from there: - * https://github.com/IQSS/dataverse/issues/10523 + * We have been told that it's fine and appropriate to put the citation to the + * dataset itself into "citeAs". However, the spec says "citeAs" is "A citation for + * a publication that describes the dataset" so we have asked for clarification + * here: https://github.com/mlcommons/croissant/issues/638 */ - String fileId = filename; - // We don't escape directory label because many characters aren't allowed anyway - String directoryLabel = - oreFiles.getJsonObject(positionInOre) - .getString("dvcore:directoryLabel", null); - if (directoryLabel != null) { - fileId = directoryLabel + "/" + filename; + job.add("citeAs", getBibtex(datasetORE, datasetJson, datasetSchemaDotOrg)); + + JsonArray funder = datasetSchemaDotOrg.getJsonArray("funder"); + if (funder != null) { + job.add("funder", funder); } - distribution.add( - Json.createObjectBuilder() - .add("@type", "cr:FileObject") - .add("@id", fileId) - .add("name", filename) - .add("encodingFormat", fileFormat) - .add(checksumType, checksumValue) - .add("contentSize", fileSizeInBytes) - .add("description", description) - .add("contentUrl", contentUrl)); - boolean fileRestricted = fileDetails.getBoolean("restricted"); - if (fileRestricted) { - // Don't add the recordSet items for restricted files. - // Go on to the next file. 
- continue; + JsonArray spatialCoverage = datasetSchemaDotOrg.getJsonArray("spatialCoverage"); + if (spatialCoverage != null) { + job.add("spatialCoverage", spatialCoverage); } - int fileIndex = 0; - JsonArray dataTables = fileDetails.getJsonArray("dataTables"); - if (dataTables == null) { - dataTables = JsonArray.EMPTY_JSON_ARRAY; + + JsonArray oreFiles = describes.getJsonArray("ore:aggregates"); + + // Create a map so that later we can use the storageIdentifier to lookup + // the position of the file in the array of files in the datasetORE format. + // We don't use checksum because it's possible for a dataset to have the + // same checksum for multiple files. + Map storageIdentifierToPositionInOre = new HashMap<>(); + for (int i = 0; i < oreFiles.size(); i++) { + JsonObject aggregate = oreFiles.getJsonObject(i); + String storageIdentifier = + aggregate.getString("dvcore:storageIdentifier", null); + if (storageIdentifier != null) { + storageIdentifierToPositionInOre.put(storageIdentifier, i); + } } - for (JsonValue dataTableValue : dataTables) { - JsonObject dataTableObject = dataTableValue.asJsonObject(); - // Unused - int varQuantity = dataTableObject.getInt("varQuantity"); - // Unused - int caseQuantity = dataTableObject.getInt("caseQuantity"); - recordSetContent.add( - "cr:annotation", - Json.createObjectBuilder() - .add("@type", "cr:Field") - .add("name", fileId.toString() + "/count") - .add("value", caseQuantity) - .add("dataType", "http://www.wikidata.org/entity/Q4049983")); - JsonArray dataVariables = dataTableObject.getJsonArray("dataVariables"); - JsonArrayBuilder fieldSetArray = Json.createArrayBuilder(); - for (JsonValue dataVariableValue : dataVariables) { - JsonObjectBuilder fieldSetObject = Json.createObjectBuilder(); - fieldSetObject.add("@type", "cr:RecordSet"); - JsonObject dataVariableObject = dataVariableValue.asJsonObject(); - // TODO: should this be an integer? 
- Integer variableId = dataVariableObject.getInt("id"); - String variableName = - StringEscapeUtils.escapeHtml4(dataVariableObject.getString("name")); - String variableDescription = - StringEscapeUtils.escapeHtml4( - dataVariableObject.getString("label", "")); - String variableFormatType = - dataVariableObject.getString("variableFormatType"); - String variableIntervalType = - dataVariableObject.getString("variableIntervalType"); - JsonObject variableSummaryStatistics = - dataVariableObject.getJsonObject("summaryStatistics"); - String dataType = null; - /** - * There are only two variableFormatType types on the Dataverse side: - * CHARACTER and NUMERIC. (See VariableType in DataVariable.java.) - */ - switch (variableFormatType) { - case "CHARACTER": - dataType = "sc:Text"; - break; - case "NUMERIC": - dataType = getNumericType(variableIntervalType); - break; - default: - break; - } - JsonArrayBuilder annotationsBuilder = Json.createArrayBuilder(); - if (variableSummaryStatistics != null) { - // Same order as upstream: MEAN, MEDN, MODE, MIN, MAX, STDEV, VALD, INVD - annotationsBuilder - .add( - Json.createObjectBuilder() - // We're aware that an @id of - // "data/stata13-auto.dta/price/mean" - // looks nice but won't validate if there's - // whitespace in the filename. 
- // We've asked for guidance here: - // https://github.com/mlcommons/croissant/issues/639#issuecomment-3792179493 - .add( - "@id", - fileId.toString() - + "/" - + variableName - // The spec gives "mean" as an - // example but we'll use - // ArithmeticMean from - // https://rdf-vocabulary.ddialliance.org/ddi-cv/SummaryStatisticType/2.1.2/SummaryStatisticType.html - + "/ArithmeticMean") - .add( - "value", - variableSummaryStatistics.getString( - "mean")) - .add("dataType", "ddi-stats:7975ed0")) - .add( - Json.createObjectBuilder() - .add( - "@id", - fileId.toString() - + "/" - + variableName - + "/Median") - .add( - "value", - variableSummaryStatistics.getString( - "medn")) - .add("dataType", "ddi-stats:66851a3") - .add("equivalentProperty", "sc:median")) - .add( - Json.createObjectBuilder() - .add( - "@id", - fileId.toString() - + "/" - + variableName - + "/Mode") - .add( - "value", - variableSummaryStatistics.getString( - "mode")) - .add("dataType", "ddi-stats:650be61")) - .add( - Json.createObjectBuilder() - .add( - "@id", - fileId.toString() - + "/" - + variableName - + "/Minimum") - .add( - "value", - variableSummaryStatistics.getString( - "min")) - .add("dataType", "ddi-stats:a1d0ec6") - .add("equivalentProperty", "sc:minValue")) - .add( - Json.createObjectBuilder() - .add( - "@id", - fileId.toString() - + "/" - + variableName - + "/Maximum") - .add( - "value", - variableSummaryStatistics.getString( - "max")) - .add("dataType", "ddi-stats:8321e79") - .add("equivalentProperty", "sc:maxValue")) - .add( - Json.createObjectBuilder() - .add( - "@id", - fileId.toString() - + "/" - + variableName - + "/StandardDeviation") - .add( - "value", - variableSummaryStatistics.getString( - "stdev")) - .add("dataType", "ddi-stats:690ab50")) - .add( - Json.createObjectBuilder() - .add( - "@id", - fileId.toString() - + "/" - + variableName - + "/ValidCases") - .add( - "value", - variableSummaryStatistics.getString( - "vald")) - .add("dataType", "ddi-stats:c646dd8")) - .add( - 
Json.createObjectBuilder() - .add( - "@id", - fileId.toString() - + "/" - + variableName - + "/InvalidCases") - .add( - "value", - variableSummaryStatistics.getString( - "invd")) - .add("dataType", "ddi-stats:6459c62")); + + JsonArrayBuilder distribution = Json.createArrayBuilder(); + JsonArrayBuilder recordSet = Json.createArrayBuilder(); + JsonArray datasetFileDetails = dataProvider.getDatasetFileDetails(); + for (JsonValue jsonValue : datasetFileDetails) { + + JsonObjectBuilder recordSetContent = Json.createObjectBuilder(); + recordSetContent.add("@type", "cr:RecordSet"); + JsonObject fileDetails = jsonValue.asJsonObject(); + /** + * When there is an originalFileName, it means that the file has gone through + * ingest and that multiple files formats are available: original, + * tab-separated, and RData. Currently we are only showing the original file but + * we we could create additional cr:FileObject entries for tab-separated and + * RData as suggested in https://github.com/mlcommons/croissant/issues/641 . + * Should we? Is there interest in this? And would we duplicate all the + * cr:RecordSet entries (columns) with each additional format? Probably not as + * it would be the same. + */ + String filename = + StringEscapeUtils.escapeHtml4( + fileDetails.getString("originalFileName", null)); + if (filename == null) { + filename = StringEscapeUtils.escapeHtml4(fileDetails.getString("filename")); + } + String fileFormat = null; + // Use the original file format, if available, since that's where the + // contentUrl will point. 
+ String originalFileFormat = fileDetails.getString("originalFileFormat", null); + if (originalFileFormat != null) { + if ("text/tsv".equals(originalFileFormat)) { + // "text/tsv" is an internal format used by Dataverse while + // "text/tab-separated-values" is the official IANA format + // that we present to the outside world + // See https://github.com/IQSS/dataverse/issues/11505 and + // https://www.iana.org/assignments/media-types/media-types.xhtml + fileFormat = "text/tab-separated-values"; + } else { + fileFormat = originalFileFormat; } - JsonObjectBuilder fieldBuilder = + } + if (fileFormat == null) { + fileFormat = fileDetails.getString("contentType"); + } + JsonNumber fileSize = fileDetails.getJsonNumber("originalFileSize"); + if (fileSize == null) { + fileSize = fileDetails.getJsonNumber("filesize"); + } + + /** + * We make contentSize a String ( https://schema.org/Text ) rather than a number + * (JsonNumber) to pass the Croissant validator and comply with the spec. We + * don't include a unit because the spec says "Defaults to bytes if a unit is + * not specified." + */ + String fileSizeInBytes = fileSize.toString(); + JsonObject checksum = fileDetails.getJsonObject("checksum"); + // Out of the box the checksum type will be md5 + String checksumType = checksum.getString("type").toLowerCase(); + String checksumValue = checksum.getString("value"); + String storageIdentifier = fileDetails.getString("storageIdentifier"); + int positionInOre = storageIdentifierToPositionInOre.get(storageIdentifier); + String contentUrl = + oreFiles.getJsonObject(positionInOre).getString("schema:sameAs"); + String description = + StringEscapeUtils.escapeHtml4(fileDetails.getString("description", "")); + /** + * See https://github.com/mlcommons/croissant/issues/639 for discussion with the + * Croissant spec leads on what to put in + * + * @id (path/to/file.txt). + *

It's suboptimal that the directoryLabel isn't already included in + * dataProvider.getDatasetFileDetails(). If it gets added as part of the + * following issue, we can get it from there: + * https://github.com/IQSS/dataverse/issues/10523 + */ + String fileId = filename; + // We don't escape directory label because many characters aren't allowed anyway + String directoryLabel = + oreFiles.getJsonObject(positionInOre) + .getString("dvcore:directoryLabel", null); + if (directoryLabel != null) { + fileId = directoryLabel + "/" + filename; + } + + distribution.add( + Json.createObjectBuilder() + .add("@type", "cr:FileObject") + .add("@id", fileId) + .add("name", filename) + .add("encodingFormat", fileFormat) + .add(checksumType, checksumValue) + .add("contentSize", fileSizeInBytes) + .add("description", description) + .add("contentUrl", contentUrl)); + boolean fileRestricted = fileDetails.getBoolean("restricted"); + if (fileRestricted) { + // Don't add the recordSet items for restricted files. + // Go on to the next file. 
+ continue; + } + int fileIndex = 0; + JsonArray dataTables = fileDetails.getJsonArray("dataTables"); + if (dataTables == null) { + dataTables = JsonArray.EMPTY_JSON_ARRAY; + } + for (JsonValue dataTableValue : dataTables) { + JsonObject dataTableObject = dataTableValue.asJsonObject(); + // Unused + int varQuantity = dataTableObject.getInt("varQuantity"); + // Unused + int caseQuantity = dataTableObject.getInt("caseQuantity"); + recordSetContent.add( + "cr:annotation", Json.createObjectBuilder() .add("@type", "cr:Field") - .add("name", variableName) - .add("description", variableDescription) - .add("dataType", dataType) + .add("name", fileId.toString() + "/count") + .add("value", caseQuantity) + .add( + "dataType", + "http://www.wikidata.org/entity/Q4049983")); + JsonArray dataVariables = dataTableObject.getJsonArray("dataVariables"); + JsonArrayBuilder fieldSetArray = Json.createArrayBuilder(); + for (JsonValue dataVariableValue : dataVariables) { + JsonObjectBuilder fieldSetObject = Json.createObjectBuilder(); + fieldSetObject.add("@type", "cr:RecordSet"); + JsonObject dataVariableObject = dataVariableValue.asJsonObject(); + // TODO: should this be an integer? + Integer variableId = dataVariableObject.getInt("id"); + String variableName = + StringEscapeUtils.escapeHtml4( + dataVariableObject.getString("name")); + String variableDescription = + StringEscapeUtils.escapeHtml4( + dataVariableObject.getString("label", "")); + String variableFormatType = + dataVariableObject.getString("variableFormatType"); + String variableIntervalType = + dataVariableObject.getString("variableIntervalType"); + JsonObject variableSummaryStatistics = + dataVariableObject.getJsonObject("summaryStatistics"); + String dataType = null; + /** + * There are only two variableFormatType types on the Dataverse side: + * CHARACTER and NUMERIC. (See VariableType in DataVariable.java.) 
+ */ + switch (variableFormatType) { + case "CHARACTER": + dataType = "sc:Text"; + break; + case "NUMERIC": + dataType = getNumericType(variableIntervalType); + break; + default: + break; + } + JsonArrayBuilder annotationsBuilder = Json.createArrayBuilder(); + if (variableSummaryStatistics != null) { + // Same order as upstream: MEAN, MEDN, MODE, MIN, MAX, STDEV, VALD, + // INVD + annotationsBuilder .add( - "source", Json.createObjectBuilder() - .add("@id", variableId.toString()) + // We're aware that an @id of + // "data/stata13-auto.dta/price/mean" + // looks nice but won't validate if there's + // whitespace in the filename. + // We've asked for guidance here: + // https://github.com/mlcommons/croissant/issues/639#issuecomment-3792179493 .add( - "fileObject", - Json.createObjectBuilder() - .add("@id", fileId)) + "@id", + fileId.toString() + + "/" + + variableName + // The spec gives "mean" as + // an + // example but we'll use + // ArithmeticMean from + // https://rdf-vocabulary.ddialliance.org/ddi-cv/SummaryStatisticType/2.1.2/SummaryStatisticType.html + + "/ArithmeticMean") .add( - "extract", - Json.createObjectBuilder() - .add( - "column", - variableName))); - JsonArray annotations = annotationsBuilder.build(); - if (!annotations.isEmpty()) { - fieldBuilder.add("annotation", annotations); + "value", + variableSummaryStatistics.getString( + "mean")) + .add("dataType", "ddi-stats:7975ed0")) + .add( + Json.createObjectBuilder() + .add( + "@id", + fileId.toString() + + "/" + + variableName + + "/Median") + .add( + "value", + variableSummaryStatistics.getString( + "medn")) + .add("dataType", "ddi-stats:66851a3") + .add("equivalentProperty", "sc:median")) + .add( + Json.createObjectBuilder() + .add( + "@id", + fileId.toString() + + "/" + + variableName + + "/Mode") + .add( + "value", + variableSummaryStatistics.getString( + "mode")) + .add("dataType", "ddi-stats:650be61")) + .add( + Json.createObjectBuilder() + .add( + "@id", + fileId.toString() + + "/" + + 
variableName + + "/Minimum") + .add( + "value", + variableSummaryStatistics.getString( + "min")) + .add("dataType", "ddi-stats:a1d0ec6") + .add("equivalentProperty", "sc:minValue")) + .add( + Json.createObjectBuilder() + .add( + "@id", + fileId.toString() + + "/" + + variableName + + "/Maximum") + .add( + "value", + variableSummaryStatistics.getString( + "max")) + .add("dataType", "ddi-stats:8321e79") + .add("equivalentProperty", "sc:maxValue")) + .add( + Json.createObjectBuilder() + .add( + "@id", + fileId.toString() + + "/" + + variableName + + "/StandardDeviation") + .add( + "value", + variableSummaryStatistics.getString( + "stdev")) + .add("dataType", "ddi-stats:690ab50")) + .add( + Json.createObjectBuilder() + .add( + "@id", + fileId.toString() + + "/" + + variableName + + "/ValidCases") + .add( + "value", + variableSummaryStatistics.getString( + "vald")) + .add("dataType", "ddi-stats:c646dd8")) + .add( + Json.createObjectBuilder() + .add( + "@id", + fileId.toString() + + "/" + + variableName + + "/InvalidCases") + .add( + "value", + variableSummaryStatistics.getString( + "invd")) + .add("dataType", "ddi-stats:6459c62")); + } + JsonObjectBuilder fieldBuilder = + Json.createObjectBuilder() + .add("@type", "cr:Field") + .add("name", variableName) + .add("description", variableDescription) + .add("dataType", dataType) + .add( + "source", + Json.createObjectBuilder() + .add("@id", variableId.toString()) + .add( + "fileObject", + Json.createObjectBuilder() + .add("@id", fileId)) + .add( + "extract", + Json.createObjectBuilder() + .add( + "column", + variableName))); + JsonArray annotations = annotationsBuilder.build(); + if (!annotations.isEmpty()) { + fieldBuilder.add("annotation", annotations); + } + fieldSetArray.add(fieldBuilder); } - fieldSetArray.add(fieldBuilder); + recordSetContent.add("field", fieldSetArray); + recordSet.add(recordSetContent); + fileIndex++; } - recordSetContent.add("field", fieldSetArray); - recordSet.add(recordSetContent); - fileIndex++; 
} - } - JsonArray citation = datasetSchemaDotOrg.getJsonArray("citation"); - if (citation != null) { - job.add("citation", citation); - } - JsonArray temporalCoverage = datasetSchemaDotOrg.getJsonArray("temporalCoverage"); - if (temporalCoverage != null) { - job.add("temporalCoverage", temporalCoverage); - } - JsonArray distributionArray = distribution.build(); - if (!slim && !distributionArray.isEmpty()) { - job.add("distribution", distributionArray); - } - JsonArray recordSetArray = recordSet.build(); - if (!slim && !recordSetArray.isEmpty()) { - job.add("recordSet", recordSetArray); + JsonArray citation = datasetSchemaDotOrg.getJsonArray("citation"); + if (citation != null) { + job.add("citation", citation); + } + JsonArray temporalCoverage = datasetSchemaDotOrg.getJsonArray("temporalCoverage"); + if (temporalCoverage != null) { + job.add("temporalCoverage", temporalCoverage); + } + JsonArray distributionArray = distribution.build(); + if (!slim && !distributionArray.isEmpty()) { + job.add("distribution", distributionArray); + } + JsonArray recordSetArray = recordSet.build(); + if (!slim && !recordSetArray.isEmpty()) { + job.add("recordSet", recordSetArray); + } } // TODO: Do we need DataCite XML? 
diff --git a/src/test/java/edu/harvard/iq/dataverse/api/ReviewsIT.java b/src/test/java/edu/harvard/iq/dataverse/api/ReviewsIT.java index 2fc74c4b3f1..d2b857a80fe 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/ReviewsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/ReviewsIT.java @@ -11,6 +11,7 @@ import static jakarta.ws.rs.core.Response.Status.CREATED; import static jakarta.ws.rs.core.Response.Status.OK; import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.CoreMatchers.nullValue; import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.IOException; @@ -231,6 +232,23 @@ public void testCreateReview() { Integer reviewId = UtilIT.getDatasetIdFromResponse(createReview); String reviewPid = JsonPath.from(createReview.getBody().asString()).getString("data.persistentId"); + UtilIT.publishDataverseViaNativeApi(dataverseAlias, apiToken).then().statusCode(OK.getStatusCode()); + UtilIT.publishDatasetViaNativeApi(reviewPid, "major", apiToken).then().statusCode(OK.getStatusCode()); + + /** + * Review datasets are "inside out" in the sense that we don't present the + * normal metadata fields via Croissant. Instead, we attempt to represent the + * item being reviewed. The number of fields we can expose is limited, but we + * can expose "itemReviewedUrl" as "url" for example. 
+ */ + Response insideOutCroissant = UtilIT.exportDataset(reviewPid, "croissantSlim"); + insideOutCroissant.prettyPrint(); + insideOutCroissant.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("@type", is("sc:Dataset")) + .body("name", nullValue()) + .body("url", is("https://datacommons.org/tools/statvar#sv=Percent_Person_Children_WithAsthma")); + } /** diff --git a/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterSlimTest.java b/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterSlimTest.java index fcbc9611818..3163800a9ba 100644 --- a/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterSlimTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterSlimTest.java @@ -42,6 +42,8 @@ public class CroissantExporterSlimTest { static ExportDataProvider dataProviderJunk; static OutputStream outputStreamDraft; static ExportDataProvider dataProviderDraft; + static OutputStream outputStreamReview; + static ExportDataProvider dataProviderReview; @BeforeAll public static void setUp() { @@ -430,6 +432,70 @@ public String getDataCiteXml() { } } }; + + outputStreamReview = new ByteArrayOutputStream(); + dataProviderReview = + new ExportDataProvider() { + @Override + public JsonObject getDatasetJson() { + String pathToJsonFile = + "src/test/resources/croissant/review/in/datasetJson.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetORE() { + String pathToJsonFile = + "src/test/resources/croissant/review/in/datasetORE.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonArray getDatasetFileDetails() { + String pathToJsonFile = + 
"src/test/resources/croissant/review/in/datasetFileDetails.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readArray(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetSchemaDotOrg() { + String pathToJsonFile = + "src/test/resources/croissant/review/in/datasetSchemaDotOrg.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public String getDataCiteXml() { + try { + return Files.readString( + Paths.get( + "src/test/resources/croissant/review/in/dataCiteXml.xml"), + StandardCharsets.UTF_8); + } catch (IOException ex) { + return null; + } + } + }; } @Test @@ -544,6 +610,20 @@ public void testExportDatasetDraft() throws Exception { assertEquals(prettyPrint(expected), prettyPrint(outputStreamDraft.toString())); } + @Test + public void testExportDatasetReview() throws Exception { + exporter.exportDataset(dataProviderReview, outputStreamReview); + String actual = outputStreamReview.toString(); + writeCroissantFile(actual, "review"); + String expected = + Files.readString( + Paths.get( + "src/test/resources/croissant/review/expected/review-croissantSlim.json"), + StandardCharsets.UTF_8); + JSONAssert.assertEquals(expected, actual, true); + assertEquals(prettyPrint(expected), prettyPrint(outputStreamReview.toString())); + } + private void writeCroissantFile(String actual, String name) throws IOException { Path dir = Files.createDirectories(Paths.get("src/test/resources/croissant/" + name + "/out")); diff --git a/src/test/resources/croissant/review/expected/review-croissantSlim.json b/src/test/resources/croissant/review/expected/review-croissantSlim.json new file mode 100644 index 00000000000..740ad833c36 --- /dev/null +++ b/src/test/resources/croissant/review/expected/review-croissantSlim.json @@ -0,0 
+1,52 @@ +{ + "@context": { + "@language": "en", + "@vocab": "http://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "ddi-stats": "http://rdf-vocabulary.ddialliance.org/cv/SummaryStatisticType/2.1.2/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "http://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.1", + "url": "https://datacommons.org/tools/statvar#sv=Percent_Person_Children_WithAsthma" +} \ No newline at end of file diff --git a/src/test/resources/croissant/review/in/dataCiteXml.xml b/src/test/resources/croissant/review/in/dataCiteXml.xml new file mode 100644 index 00000000000..40fbd8c18d9 --- /dev/null +++ b/src/test/resources/croissant/review/in/dataCiteXml.xml @@ -0,0 +1,31 @@ + + + 10.5072/FK2/ZIKBUC + + + Wazowski, Mike + Mike + Wazowski + + + + Review of Percent of Children That Have Asthma + + Root + 2026 + + Medicine, Health and Life Sciences + + + 2026-03-13 + + Review + 1.0 + + + Creative Commons CC0 1.0 Universal Public Domain 
Dedication. + + + This is a review of a dataset. + + diff --git a/src/test/resources/croissant/review/in/datasetFileDetails.json b/src/test/resources/croissant/review/in/datasetFileDetails.json new file mode 100644 index 00000000000..fe51488c706 --- /dev/null +++ b/src/test/resources/croissant/review/in/datasetFileDetails.json @@ -0,0 +1 @@ +[] diff --git a/src/test/resources/croissant/review/in/datasetJson.json b/src/test/resources/croissant/review/in/datasetJson.json new file mode 100644 index 00000000000..858af3cc4f4 --- /dev/null +++ b/src/test/resources/croissant/review/in/datasetJson.json @@ -0,0 +1,140 @@ +{ + "id": 3, + "identifier": "FK2/ZIKBUC", + "persistentUrl": "https://doi.org/10.5072/FK2/ZIKBUC", + "protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "publicationDate": "2026-03-13", + "storageIdentifier": "local://10.5072/FK2/ZIKBUC", + "datasetType": "review", + "locks": [], + "datasetVersion": { + "id": 1, + "datasetId": 3, + "datasetPersistentId": "doi:10.5072/FK2/ZIKBUC", + "datasetType": "review", + "storageIdentifier": "local://10.5072/FK2/ZIKBUC", + "versionNumber": 1, + "internalVersionNumber": 4, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "lastUpdateTime": "2026-03-13T20:30:58Z", + "releaseTime": "2026-03-13T20:30:58Z", + "createTime": "2026-03-13T20:30:56Z", + "publicationDate": "2026-03-13", + "citationDate": "2026-03-13", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + 
"value": "Review of Percent of Children That Have Asthma" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Wazowski, Mike" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "mwazowski@mailinator.com" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "This is a review of a dataset." + } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Medicine, Health and Life Sciences" + ] + } + ] + }, + "review": { + "displayName": "Review Metadata", + "name": "review", + "fields": [ + { + "typeName": "itemReviewed", + "multiple": false, + "typeClass": "compound", + "value": { + "itemReviewedUrl": { + "typeName": "itemReviewedUrl", + "multiple": false, + "typeClass": "primitive", + "value": "https://datacommons.org/tools/statvar#sv=Percent_Person_Children_WithAsthma" + }, + "itemReviewedType": { + "typeName": "itemReviewedType", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "Dataset" + }, + "itemReviewedCitation": { + "typeName": "itemReviewedCitation", + "multiple": false, + "typeClass": "primitive", + "value": "\"Statistical Variable Explorer - Data Commons.\" Datacommons.org, 2026, datacommons.org/tools/statvar#sv=Percent_Person_Children_WithAsthma. Accessed 9 Mar. 2026." 
+ } + } + } + ] + } + }, + "files": [], + "citation": "Wazowski, Mike, 2026, \"Review of Percent of Children That Have Asthma\", https://doi.org/10.5072/FK2/ZIKBUC, Root, V1" + } +} diff --git a/src/test/resources/croissant/review/in/datasetORE.json b/src/test/resources/croissant/review/in/datasetORE.json new file mode 100644 index 00000000000..8c24a72baf6 --- /dev/null +++ b/src/test/resources/croissant/review/in/datasetORE.json @@ -0,0 +1,68 @@ +{ + "dcterms:modified": "2026-03-13", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.9", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "http://localhost:8080/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/ZIKBUC", + "ore:describes": { + "review:itemReviewed": { + "review:itemReviewedUrl": "https://datacommons.org/tools/statvar#sv=Percent_Person_Children_WithAsthma", + "review:itemReviewedType": "Dataset", + "review:itemReviewedCitation": "\"Statistical Variable Explorer - Data Commons.\" Datacommons.org, 2026, datacommons.org/tools/statvar#sv=Percent_Person_Children_WithAsthma. Accessed 9 Mar. 2026." + }, + "author": { + "citation:authorName": "Wazowski, Mike" + }, + "citation:dsDescription": { + "citation:dsDescriptionValue": "This is a review of a dataset." 
+ }, + "citation:datasetContact": { + "citation:datasetContactEmail": "mwazowski@mailinator.com" + }, + "title": "Review of Percent of Children That Have Asthma", + "subject": "Medicine, Health and Life Sciences", + "@id": "https://doi.org/10.5072/FK2/ZIKBUC", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "1.0", + "schema:name": "Review of Percent of Children That Have Asthma", + "schema:dateModified": "2026-03-13 20:30:58.165", + "schema:datePublished": "2026-03-13", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "dv586b3d63", + "@id": "http://localhost:8080/dataverse/dv586b3d63", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "http://localhost:8080/dataverse/root", + "schema:description": "The root dataverse." + } + }, + "ore:aggregates": [], + "schema:hasPart": [] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "review": "http://localhost:8080/schema/review#", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/review/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/review/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..69f95186b2c --- /dev/null +++ b/src/test/resources/croissant/review/in/datasetSchemaDotOrg.json @@ -0,0 +1,44 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/ZIKBUC", + "identifier": "https://doi.org/10.5072/FK2/ZIKBUC", + "name": "Review of Percent of Children That 
Have Asthma", + "creator": [ + { + "@type": "Person", + "givenName": "Mike", + "familyName": "Wazowski", + "name": "Wazowski, Mike" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Mike", + "familyName": "Wazowski", + "name": "Wazowski, Mike" + } + ], + "datePublished": "2026-03-13", + "dateModified": "2026-03-13", + "version": "1", + "description": "This is a review of a dataset.", + "keywords": [ + "Medicine, Health and Life Sciences" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + } +}