From dc3bba9658fcccd6eefb24dd60d735bf5ede3b27 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Thu, 26 Mar 2026 13:51:03 +0100 Subject: [PATCH 01/74] Single-cell data transformation draft documents --- .../about-sc-hdf5-transformations.md | 83 +++ .../doc-odm-user-guide/api-reference.md | 258 +++++++++ .../configuration-reference.md | 513 ++++++++++++++++++ .../how-to-sc-hdf5-transformations.md | 392 +++++++++++++ .../transformation-process-reference.md | 214 ++++++++ 5 files changed, 1460 insertions(+) create mode 100644 docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md create mode 100644 docs/user-guide/doc-odm-user-guide/api-reference.md create mode 100644 docs/user-guide/doc-odm-user-guide/configuration-reference.md create mode 100644 docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md create mode 100644 docs/user-guide/doc-odm-user-guide/transformation-process-reference.md diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md new file mode 100644 index 0000000..c150a6a --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -0,0 +1,83 @@ +# About Single-Cell HDF5 Transformations in ODM + +> **Related documentation:** For step-by-step guidance on running the transformation, see the [How-to Guides](how-to-sc-hdf5-transformations.md). For the full configuration schema, see the [Configuration Reference](configuration-reference.md). For the API endpoints, see the [API Reference](api-reference.md). For the internal processing pipeline, see the [Transformation Process Reference](transformation-process-reference.md). + +Single-cell datasets are commonly distributed as HDF5-based files — most often H5AD (the AnnData format) or the 10x Genomics H5 format. 
While these formats are rich and expressive, they are not directly ingestible into ODM in a way that supports consistent indexing, cross-dataset search, and the entity relationships that ODM relies on for downstream discovery. + +The single-cell HDF5 transformation exists to bridge this gap. Rather than requiring users to hand-craft intermediate files or manually restructure their data, the transformation automates the entire end-to-end process: reading the source file, extracting and optionally curating the relevant metadata, and uploading the results as structured ODM objects. + +## The ODM entity model for single-cell data + +Understanding the transformation requires familiarity with how ODM represents single-cell experiments. ODM organizes data around a hierarchy of entities: + +- **Sample, Library, and Preparation groups** (collectively referred to as SLP) represent the biological and experimental context of the data. A Sample describes a biological specimen; a Library describes the sequencing library prepared from it; a Preparation describes a preparation step. These entities already exist in ODM for most studies, or can be created by the transformation itself. + +- **A Cell Group** represents the collection of individual cells from an experiment, together with their metadata. Each Cell Group must be linked to exactly one parent SLP entity type (Sample, Library, or Preparation), and may be linked to one or more groups of that type. This linkage is what allows ODM to associate cell-level observations with the correct experimental context. + +- **An Expression Group** represents the gene-by-cell expression matrix, compressed for efficient retrieval, together with computed dataset statistics. An Expression Group is always linked to a Cell Group. + +The transformation creates the Cell Group and Expression Group and links them into the existing (or newly created) SLP structure. 
This is why the configuration requires specifying how the resulting Cell Group should be connected to its parent — the linking step is fundamental to how ODM organizes and queries the data. + +## What the transformation reads from the source file + +The transformation extracts three distinct types of data from a single HDF5 source file: + +**Cell metadata** comes primarily from the `obs` slot of an H5AD file (or its equivalent in a 10x H5 file). This includes per-cell annotations such as barcodes, cluster assignments, quality control metrics, and any experimental annotations attached to individual cells. Multidimensional representations (such as PCA or UMAP coordinates stored in `obsm`) and pairwise cell annotations (from `obsp`) can also be extracted. + +**Feature metadata** comes from `var` (and optionally `varm` and `varp`). This includes per-gene annotations such as gene identifiers and names. The transformation can automatically map Ensembl or NCBI gene IDs to gene names for supported species, avoiding the need to pre-process gene annotation separately. + +**The expression matrix** is the count or normalized values matrix (`X`). The transformation validates its dimensions against the extracted cell and feature metadata, then writes it in a Brotli-compressed format optimized for ODM ingestion. + +## The role of metadata curation + +Raw single-cell datasets frequently contain metadata that needs adjustment before it is useful in a cross-study context. Column names may differ between studies, values may be inconsistently coded, fields may be missing, or the biosample identifiers in the cell metadata may not match the naming conventions used by ODM's SLP entities. + +The transformation addresses this through a set of configurable column operations applied during extraction. These include renaming columns, dropping irrelevant fields, filling missing values with defaults, and replacing specific values with standardized equivalents. 
Attribute names are also mapped to ODM standard names where applicable; non-standard names are automatically converted to camelCase to satisfy the ODM API requirements. + +This curation happens in-pipeline, which means the source file is never modified. The curated output exists only as intermediate files in a temporary directory and, ultimately, as the uploaded ODM objects. + +## Biosample metadata and the aggregation model + +A particularly important feature of the transformation is its ability to derive Sample, Library, or Preparation-level metadata directly from the cell metadata. In many single-cell datasets, attributes such as tissue type, disease condition, or donor information are stored as cell-level annotations (one value per cell), even though they logically belong at the biosample level. + +The transformation can aggregate these cell-level attributes to the biosample level by grouping cells by a designated biosample identifier column. Only attributes that are constant across all cells belonging to the same biosample are considered eligible for export to SLP metadata. This ensures that the resulting biosample records are coherent and that no per-cell variation is incorrectly collapsed into a biosample-level value. + +Attributes exported to biosample metadata are automatically removed from the cell metadata, preventing duplication. If a biosample attribute should remain in the cell metadata for other reasons, it must be explicitly retained by omitting it from `cell_metadata.columns_to_drop`. + +## The linking resolution rules + +When the transformation uploads a Cell Group, it must link it to a parent SLP entity. The transformation resolves this target using a defined priority order, so that users do not always need to specify the target explicitly: + +1. If the configuration creates new SLP groups, the Cell Group is linked to those newly created groups after they are uploaded. +2. 
If `cell_metadata.linking_group` explicitly names a target (a sample, library, or preparation accession), that target is used directly. +3. If no explicit target is given, the transformation auto-discovers the appropriate SLP groups for the study from ODM, checking first for Library groups, then Preparation groups, then Sample groups. The first entity type that has at least one group associated with the study is used, and all accessions of that type are linked. + +This priority order reflects the typical ODM study structure: Library groups are usually the most specific and appropriate parent for a Cell Group. If a study only has Sample-level grouping, the transformation falls back gracefully. + +## Dry run mode + +Before committing any data to ODM, users can run the transformation in dry run mode. In this mode, the transformation performs all extraction, curation, and validation steps — including resolving the linking target and validating that all cell batch identifiers match existing SLP objects — but uploads nothing. Logs are printed but not saved as attachments. + +Dry run mode is particularly useful for exploring which biosample-level attributes are available in a dataset before committing to a curation strategy. When `biosample_metadata` is configured without any `columns_to_export` entries, the dry run will log which columns are uniform per biosample and therefore eligible for export — without creating any files or objects. + +The recommended practice is to iterate on the configuration using repeated dry runs until all warnings are resolved before submitting a full transformation run. + +## The API layer: configurations, images, and jobs + +The transformation is triggered and managed through the ODM Processors Controller API. 
This API models the workflow as three separate concerns, each of which can be managed independently: + +**Transformation configurations** are stored JSON documents that describe how a source file should be processed: the input file format, which metadata to extract, and any curation rules to apply. Configurations are created, retrieved, and updated independently of any particular run. This separation means you can refine a configuration through many dry-run iterations without losing the history of changes, and reuse the same configuration across multiple runs or files. + +**Transformation images** are versioned, containerized environments that execute the processing logic for a given file format. The image used for single-cell HDF5 files is called `hdf5-cells`. Specifying an image version (e.g. `"latest"` or a specific release tag) allows reproducibility and controlled upgrades when new versions are released. + +**Transformation jobs** are the actual execution records. A job binds a configuration and an image to one or more input file accessions, runs the processing pipeline, and produces a log. Each job is independent: you can re-run with a different configuration or image without affecting previous jobs or their results. + +This design allows the configuration to evolve (through repeated dry-run iterations) while keeping the job history clean and auditable. + +## Supported input formats + +The transformation supports the following HDF5-based input formats: + +- **H5AD (AnnData)** — the native format of the AnnData Python library, widely used in the single-cell ecosystem. +- **10x Genomics H5** — converted internally to H5AD before processing, so the extraction logic is unified regardless of the source format. +- **Legacy 10x Genomics H5 (v<3)** — supported provided the file contains a single genome. Multi-genome legacy files are not supported. 
diff --git a/docs/user-guide/doc-odm-user-guide/api-reference.md b/docs/user-guide/doc-odm-user-guide/api-reference.md new file mode 100644 index 0000000..35e43ae --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/api-reference.md @@ -0,0 +1,258 @@ +# API Reference: Single-Cell HDF5 Transformation (Processors Controller) + +> **Related documentation:** For conceptual background on configurations, images, and jobs, see [About Single-Cell HDF5 Transformations in ODM](about-sc-hdf5-transformations.md). For step-by-step usage of these endpoints, see the [How-to Guides](how-to-sc-hdf5-transformations.md). For the configuration `data` object schema, see the [Configuration Reference](configuration-reference.md). + +This reference describes all endpoints in the ODM Processors Controller API used to manage and execute single-cell HDF5 transformations. Endpoints are grouped into three resources: Transformation Configurations, Transformation Images, and Transformation Jobs. + +--- + +## Quick Reference + +| Operation | Method | Endpoint | +|---|---|---| +| List configurations | `GET` | `/api/v1/transformations/configurations` | +| Get a configuration | `GET` | `/api/v1/transformations/configurations/{id}` | +| Create a configuration | `POST` | `/api/v1/transformations/configurations` | +| Update a configuration | `PUT` | `/api/v1/transformations/configurations/{id}` | +| List images | `GET` | `/api/v1/transformations/images` | +| Submit a job | `POST` | `/api/v1/transformations/jobs` | +| Get job status | `GET` | `/api/v1/transformations/jobs/{id}` | +| Retrieve job logs | `POST` | `/api/v1/transformations/jobs/{id}/logs` | + +--- + +## Transformation Configurations + +A transformation configuration is a stored JSON document that defines how a source file should be processed. It contains a human-readable name and description alongside the `data` object, which is the full processing specification passed to the transformation image. 
+ +Configurations are independent of any particular run. The same configuration can be reused across multiple jobs and updated iteratively without affecting previous job results. + +### List configurations + +``` +GET /api/v1/transformations/configurations +``` + +Returns an array of configuration objects. Each entry includes: + +| Field | Type | Description | +|---|---|---| +| `id` | integer | Unique identifier for the configuration | +| `name` | string | Human-readable name | +| `description` | string | Human-readable description | + +Use this endpoint to discover existing configurations before deciding to create a new one or reuse an existing one. + +### Get a configuration + +``` +GET /api/v1/transformations/configurations/{id} +``` + +Returns the full configuration object, including the `data` field with all processing rules. Use this to inspect an existing configuration before deciding to update or reuse it. + +**Path parameters:** + +| Parameter | Type | Description | +|---|---|---| +| `id` | integer | ID of the configuration to retrieve | + +### Create a configuration + +``` +POST /api/v1/transformations/configurations +``` + +Creates a new transformation configuration and returns its assigned `id`. + +**Request body:** + +| Field | Type | Required | Description | +|---|---|---|---| +| `name` | string | Yes | Human-readable name for this configuration | +| `description` | string | Yes | Human-readable description | +| `data` | object | Yes | The processing specification. See the [Configuration Reference](configuration-reference.md) for the full schema. 
| + +**Example request body:** + +```json +{ + "name": "minimal_config", + "description": "Minimal transformation config for H5AD files", + "data": { + "file_type": "h5ad", + "biosample_metadata": null, + "cell_metadata": { + "metadata_keys": { + "obs": "metadata" + } + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata" + } + }, + "cell_expression": { + "data_class": "Single-cell transcriptomics" + } + } +} +``` + +**Response:** The response object includes the `id` assigned to the new configuration. This `id` is required when submitting a job. + +### Update a configuration + +``` +PUT /api/v1/transformations/configurations/{id} +``` + +Fully replaces the configuration at the given `id` with the provided content. The request body follows the same structure as `POST`. Use this after reviewing dry-run logs to apply adjustments before resubmitting a dry-run or running the full transformation. + +**Path parameters:** + +| Parameter | Type | Description | +|---|---|---| +| `id` | integer | ID of the configuration to update | + +**Request body:** Same structure as `POST /api/v1/transformations/configurations`. + +--- + +## Transformation Images + +A transformation image is a versioned, containerized processing environment that executes the transformation logic for a specific input format. Images are managed separately from configurations, enabling version-controlled upgrades. + +### List images + +``` +GET /api/v1/transformations/images +``` + +Returns an array of available image objects. + +**Response fields per image:** + +| Field | Description | +|---|---| +| `name` | Identifier used when referencing the image in a job (e.g. `"hdf5-cells"`) | +| `description` | Human-readable description of the image's purpose | +| `input_formats` | File formats accepted as input | +| `output_formats` | File formats produced as output | +| `version` | Version tag (e.g. 
`"latest"` or a specific release tag such as `"0.0.7"`) | + +Use this endpoint to confirm image availability and identify the version to specify when submitting a job. + +--- + +## Transformation Jobs + +A transformation job binds a configuration and an image to one or more input file accessions and executes the processing pipeline. Each job produces an execution log and, when not in dry-run mode, creates or updates ODM objects. + +### Submit a job + +``` +POST /api/v1/transformations/jobs +``` + +Creates and submits a new transformation job. The response includes the `id` of the created job, which is required for status and log queries. + +**Request body:** + +| Field | Type | Required | Description | +|---|---|---|---| +| `configuration_id` | integer | Yes | ID of the transformation configuration to use | +| `dry_run` | boolean | Yes | `true` to simulate the run without writing data to ODM; `false` for a full run | +| `image_reference` | object | Yes | Specifies the image to use. Contains `name` (string) and `version` (string). | +| `input_accessions` | array of strings | Yes | ODM accessions of the input files to process | +| `volume_size` | integer | Yes | Scratch volume size in GB allocated for the job | + +**`image_reference` fields:** + +| Field | Type | Description | +|---|---|---| +| `name` | string | Image name. Use `"hdf5-cells"` for single-cell HDF5 transformations. | +| `version` | string | Version tag. Use `"latest"` or a specific release tag (e.g. `"0.0.7"`). | + +**`volume_size` guidelines:** + +| Input format | Recommended `volume_size` | +|---|---| +| H5AD | ≥ 1.4 × size of the original attachment (GB) | +| 10x H5 | ≥ 4 × size of the original attachment (GB) | + +H5 files require significantly more scratch space due to the internal conversion to H5AD format. 
+ +**Example request body (dry run):** + +```json +{ + "configuration_id": 42, + "dry_run": true, + "image_reference": { + "name": "hdf5-cells", + "version": "latest" + }, + "input_accessions": ["GSF020408"], + "volume_size": 30 +} +``` + +**Example request body (full run):** + +```json +{ + "configuration_id": 42, + "dry_run": false, + "image_reference": { + "name": "hdf5-cells", + "version": "latest" + }, + "input_accessions": ["GSF020408"], + "volume_size": 30 +} +``` + +### Get job status + +``` +GET /api/v1/transformations/jobs/{id} +``` + +Returns the job object, including the current `status.state`. Repeat this request until the state reaches a terminal value before retrieving logs or proceeding to the next step. + +**Path parameters:** + +| Parameter | Type | Description | +|---|---|---| +| `id` | integer | ID of the job to query | + +**`status.state` values:** + +| State | Meaning | +|---|---| +| `RUNNING` | Job is in progress | +| `COMPLETED` | Job finished successfully | +| `FAILED` | Job encountered an error | + +### Retrieve job logs + +``` +POST /api/v1/transformations/jobs/{id}/logs +``` + +Returns the log records for the specified job. Logs include: + +- Configuration validation messages. +- Input file structure report (keys, data types, shapes, attribute names). +- Warnings and errors encountered during metadata extraction and curation. +- Linking validation results (dry-run only). +- Accessions of ODM objects created or updated (full run only). + +**Path parameters:** + +| Parameter | Type | Description | +|---|---|---| +| `id` | integer | ID of the job whose logs to retrieve | + +After a dry run, review the logs carefully before updating the configuration or proceeding to a full run. After a full run, the logs are the primary source of information about which ODM accessions were created. 
diff --git a/docs/user-guide/doc-odm-user-guide/configuration-reference.md b/docs/user-guide/doc-odm-user-guide/configuration-reference.md new file mode 100644 index 0000000..46fc11e --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/configuration-reference.md @@ -0,0 +1,513 @@ +# Configuration Reference: Single-Cell HDF5 Transformation + +> **Related documentation:** For conceptual background, see [About Single-Cell HDF5 Transformations in ODM](about-sc-hdf5-transformations.md). For step-by-step usage, see the [How-to Guides](how-to-sc-hdf5-transformations.md). For the API endpoints used to store and submit configurations, see the [API Reference](api-reference.md). For the internal processing pipeline, see the [Transformation Process Reference](transformation-process-reference.md). + +This reference describes all parameters accepted by the single-cell HDF5 transformation configuration file. Parameters are organized by top-level section. Required parameters are marked **mandatory**; all others are optional. + +The configuration is validated at the start of every run. If `file_type` is missing or invalid, the pipeline raises an error immediately. For all other sections, all validation errors are collected and reported together at the end of validation, so the complete set of issues is visible in a single run. + +Unrecognized keys are ignored with a warning logged. + +--- + +## Top-level parameters + +### `file_type` + +| | | +|---|---| +| **Type** | `string` | +| **Required** | Yes | +| **Accepted values** | `"h5ad"`, `"h5"` | + +Specifies the format of the input HDF5 file. Must be provided; the pipeline cannot proceed without a valid file type. + +### `dry_run` + +| | | +|---|---| +| **Type** | `boolean` | +| **Default** | `false` | + +When `true`, the transformation performs all extraction, validation, and linking resolution steps but does not upload any data to ODM and does not save logs as an attachment. Expression matrix compression is also skipped. 
Use to validate configuration before committing data. + +### `save_logs` + +| | | +|---|---| +| **Type** | `boolean` | +| **Default** | `true` | + +When `false`, transformation logs are not saved as an attachment in ODM after the run completes. Has no effect when `dry_run` is `true`. + +--- + +## `biosample_metadata` + +Settings for extracting, transforming, and exporting cell-level metadata to Sample, Library, or Preparation entities. The entire section is optional. If present, the following parameters apply. + +### `metadata_keys` + +| | | +|---|---| +| **Type** | `dict[string, string]` | +| **Required** | Yes, if `biosample_metadata` is present | + +Maps HDF5 group keys to metadata types. Use `"obs": "metadata"` to read standard cell metadata as the source for biosample-level aggregation. + +Example: +```json +{ + "obs": "metadata" +} +``` + +### `biosample_column_name` + +| | | +|---|---| +| **Type** | `string` | +| **Required** | Yes, if `biosample_metadata` is present | + +The name of the column in the cell metadata that identifies which biosample each cell belongs to (for example, a sample ID, library ID, or preparation ID column). Rows are grouped by this column for biosample-level aggregation. + +--- + +### `biosample_metadata.sample` + +Settings for exporting metadata to the Sample entity. Optional. + +#### `create_new_group` + +| | | +|---|---| +| **Type** | `boolean` | + +When `true`, the transformation creates a new Sample group in ODM and links it to the study. When omitted or `false`, the transformation updates existing Sample group objects instead of creating new ones. + +#### `template_id` + +| | | +|---|---| +| **Type** | `string` | + +The template ID to apply when creating a new Sample group. If not specified, the study's default template is applied. + +#### `columns_to_export` + +| | | +|---|---| +| **Type** | `list[string]` | + +List of cell metadata column names to include in the exported Sample metadata. 
Only columns that are constant per biosample (as identified by `biosample_column_name`) are eligible. Exported columns are automatically dropped from the cell metadata. + +#### `columns_renaming_map` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Maps source column names to new names in the exported metadata. + +Example: +```json +{ + "tissue_type": "tissueType" +} +``` + +#### `columns_to_fill_missing_values` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Specifies default values to use for missing entries in the listed columns. + +Example: +```json +{ + "disease": "unknown" +} +``` + +#### `columns_to_curate_values` + +| | | +|---|---| +| **Type** | `dict[string, dict[string, string]]` | + +Maps specific values in a column to replacement values. + +Example: +```json +{ + "tissue": { + "PBMCs": "peripheral blood mononuclear cells" + } +} +``` + +--- + +### `biosample_metadata.library` + +Settings for exporting metadata to the Library entity. Optional. Accepts the same parameters as `biosample_metadata.sample`, plus: + +#### `linking_group` + +| | | +|---|---| +| **Type** | `string` | + +Accession of an existing Sample group to link the new Library group to. If not specified, the pipeline uses: (1) a Sample group created in the same run, or (2) pre-fetched Sample group accessions for the study. + +--- + +### `biosample_metadata.preparation` + +Settings for exporting metadata to the Preparation entity. Optional. Accepts the same parameters as `biosample_metadata.library`, including `linking_group`. + +> **Constraint:** Only one of `library` or `preparation` may have `columns_to_export` set in the same configuration. + +--- + +## `cell_metadata` + +Settings for extracting and transforming cell-level metadata. The entire section is optional. If absent, no Cell Group is created. 
+ +### `metadata_keys` + +| | | +|---|---| +| **Type** | `dict[string, string]` | +| **Required** | Yes, if `cell_metadata` is present | + +Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. + +Accepted key-value pairs for H5AD files: + +| Key | Value | Description | +|-----|-------|-------------| +| `obs` | `metadata` | Standard cell annotations | +| `obsm` | `embedding` | Multidimensional cell data (PCA, UMAP, etc.) | +| `obsp` | `pairwise` | Pairwise cell annotations (e.g., cell–cell distances) | + +For H5 files, specify metadata using the same H5AD key names (`obs`, `obsm`, `obsp`). The transformation maps these to the correct internal structure regardless of source format. + +Example: +```json +{ + "obs": "metadata", + "obsm": "embedding", + "obsp": "pairwise" +} +``` + +### `linking_group` + +| | | +|---|---| +| **Type** | `dict[string, string \| list[string] \| null]` | + +Specifies the parent SLP entity to which the Cell Group will be linked. Must contain exactly one key: `sample`, `library`, or `preparation`. The value is either a list of group accessions, a single accession string, or an empty value. + +If an empty value is provided (`[]`, `""`, or `null`), the pipeline resolves all available group accessions of the specified entity type for the study. + +If `linking_group` is absent and no new SLP groups are being created, auto-discovery applies: Library → Preparation → Sample, using the first entity type with at least one associated group. + +Examples: +```json +{ "library": "GSF017080" } +``` +```json +{ "preparation": [] } +``` + +### `columns_to_drop` + +| | | +|---|---| +| **Type** | `list[string]` | + +Column names to remove from the cell metadata before processing. + +Example: +```json +["taxon", "organism_id"] +``` + +### `columns_renaming_map` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Maps source column names to new names. 
+ +Example: +```json +{ + "sample": "batch", + "pctmt": "percentMito" +} +``` + +### `columns_to_fill_missing_values` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Default values for missing entries in the specified columns. + +Example: +```json +{ + "batch": "unknown" +} +``` + +### `columns_to_curate_values` + +| | | +|---|---| +| **Type** | `dict[string, dict[string, string]]` | + +Replacement values for specific entries in specified columns. + +Example: +```json +{ + "sample": { + "LGVXCTRL1": "lung_healthy_1" + } +} +``` + +### `set_column_value` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Sets a constant value for all rows in the specified columns. Can be used to add a new attribute column or overwrite an existing one. + +Example: +```json +{ + "sample_id": "lung_1" +} +``` + +### `columns_to_preserve_name` + +| | | +|---|---| +| **Type** | `list[string]` | + +Column names to exempt from the internal attribute name standardization step. Use for columns whose names contain characters that would otherwise be altered (for example, Leiden cluster columns with decimal suffixes such as `cluster_leiden_0.5`). + +Example: +```json +["cluster_leiden_0.5"] +``` + +### `add_qc_metrics` + +| | | +|---|---| +| **Type** | `boolean` | +| **Default** | `true` | + +When `true`, QC metrics are calculated and added to the cell metadata if not already present. QC metrics include number of counts, number of genes, and mitochondrial and ribosomal gene presence. When `false`, or when `dry_run` is `true`, QC calculation is skipped. + +--- + +## `feature_metadata` + +Settings for extracting and transforming feature (gene)-level metadata. The entire section is optional. + +### `metadata_keys` + +| | | +|---|---| +| **Type** | `dict[string, string]` | +| **Required** | Yes, if `feature_metadata` is present | + +Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. 
+ +Accepted key-value pairs for H5AD files: + +| Key | Value | Description | +|-----|-------|-------------| +| `var` | `metadata` | Standard feature annotations | +| `varm` | `embedding` | Multidimensional feature data | +| `varp` | `pairwise` | Pairwise feature annotations | + +For H5 files, specify metadata using the same H5AD key names (`var`, `varm`, `varp`). The transformation maps these to the correct internal structure regardless of source format. + +Example: +```json +{ + "var": "metadata", + "varm": "embedding" +} +``` + +### `columns_to_drop` + +| | | +|---|---| +| **Type** | `list[string]` | + +Column names to remove from the feature metadata. + +### `columns_renaming_map` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Maps source column names to new names. + +### `columns_to_fill_missing_values` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Default values for missing entries in the specified columns. + +### `columns_to_curate_values` + +| | | +|---|---| +| **Type** | `dict[string, dict[string, string]]` | + +Replacement values for specific entries in specified columns. + +### `set_column_value` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Sets a constant value for all rows in the specified columns. + +### `columns_to_preserve_name` + +| | | +|---|---| +| **Type** | `list[string]` | + +Column names to exempt from the internal attribute name standardization step. + +### `map_gene_ids_to_names` + +| | | +|---|---| +| **Type** | `boolean` | +| **Default** | `true` | + +When `true`, the transformation attempts to map gene IDs to gene names if gene names are absent and the standard `geneId` column is present. The pipeline infers the ID source (Ensembl or NCBI) and the species automatically. When `false`, gene ID mapping is skipped. Set to `false` for proteomics or other omics data that do not use gene IDs as identifiers. + +> The gene ID column must use the standard name `geneId` for mapping to be performed. 
+ +**Supported organisms and annotation releases (hdf5-cells v0.0.4):** + +| Organism | Genome version | Ensembl release | NCBI release | +|----------|---------------|-----------------|--------------| +| *Homo sapiens* | GRCh38.p14 | 115 | GCF_000001405.40-RS_2025_08 | +| *Mus musculus* | GRCm39 | 115 | GCF_000001635.27-RS_2024_02 | +| *Rattus norvegicus* | GRCr8 | 115 | GCF_036323735.1-RS_2024_02 | +| *Sus scrofa* | Sscrofa11.1 | 115 | 106 | + +--- + +## `cell_expression` + +Settings for extracting and uploading the cell expression matrix. The entire section is optional. If absent, no Expression Group is created. + +### `data_class` + +| | | +|---|---| +| **Type** | `string` | +| **Required** | Yes, if `cell_expression` is present | + +The data class label for the expression data. + +Example: +```json +"Single-cell transcriptomics" +``` + +### `compression_level` + +| | | +|---|---| +| **Type** | `integer` (0–9) | +| **Default** | `4` | + +Controls the Brotli compression level for the output expression file. Higher values produce smaller files at the cost of longer compression time. + +### `chunk_size` + +| | | +|---|---| +| **Type** | `integer` | + +Number of features processed per chunk during expression data export. If not specified, the value is calculated automatically from available container memory. + +### `max_buffer_size` + +| | | +|---|---| +| **Type** | `integer` | +| **Default** | `50` | + +Controls how much data is held in memory before being flushed to disk during expression writing. + +### `number_format` + +| | | +|---|---| +| **Type** | `string` | + +Controls the numeric precision of values in the output file. Accepts either a printf-style format string (e.g. `"%.7g"`, `"%d"`) or a NumPy dtype string (e.g. `"float32"`, `"int64"`). If not set, the format is inferred from the data. + +### `columns_to_drop` + +| | | +|---|---| +| **Type** | `list[string]` | + +Column names to remove from the expression metadata. 
+ +### `columns_renaming_map` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Maps source column names to new names in the expression metadata. + +### `set_column_value` + +| | | +|---|---| +| **Type** | `dict[string, string]` | + +Sets a constant value for all rows in the specified expression metadata columns. + +### `source_file_metadata` + +| | | +|---|---| +| **Type** | `boolean` | +| **Default** | `true` | + +When `true`, metadata from the source HDF5 attachment is read and included in the expression metadata (subject to `columns_to_drop`, `columns_renaming_map`, and `set_column_value`). When `false`, source file metadata extraction is skipped. + +In all cases, the following statistics are always computed and appended to the expression metadata regardless of this flag: total number of cells, total number of features, sparsity (%), number of non-zero values, source file accession, and source file name. diff --git a/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md new file mode 100644 index 0000000..b3818e5 --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md @@ -0,0 +1,392 @@ +# How-to Guides: Single-Cell HDF5 Transformations in ODM + +These guides show how to accomplish specific tasks using the single-cell HDF5 transformation. Each guide assumes you have a valid input file (H5AD or 10x H5) already registered in ODM as an attachment, and access to the ODM API. + +For a conceptual overview of the entities involved and how the transformation works, see [About Single-Cell HDF5 Transformations in ODM](about-sc-hdf5-transformations.md). For the full list of configuration parameters, see the [Configuration Reference](configuration-reference.md). For the API endpoint specifications, see the [API Reference](api-reference.md). 
For details on what the pipeline does internally at each stage, see the [Transformation Process Reference](transformation-process-reference.md). + +--- + +## How to run a transformation via the ODM API + +This guide covers the end-to-end steps to ingest single-cell data into ODM using the Processors Controller API. The process involves three phases — creating a configuration, running a dry run to validate it, and submitting the full run — which are covered by the six steps below. + +### Step 1: Create a transformation configuration + +Create a configuration document that describes how to process your file. The `data` field contains the processing specification; the `name` and `description` are for your own reference. + +``` +POST /api/v1/transformations/configurations +``` + +```json +{ + "name": "my_study_config", + "description": "Cell and expression ingestion for study XYZ", + "data": { + "file_type": "h5ad", + "cell_metadata": { + "metadata_keys": { + "obs": "metadata", + "obsm": "embedding" + }, + "columns_to_drop": ["taxon"], + "columns_renaming_map": { + "sample": "batch" + } + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata" + } + }, + "cell_expression": { + "data_class": "Single-cell transcriptomics" + } + } +} +``` + +The response includes the `id` assigned to the new configuration. Save this — it is required for every subsequent step. + +For a full description of the `data` object, see the [Configuration Reference](configuration-reference.md). + +### Step 2: Identify the transformation image + +``` +GET /api/v1/transformations/images +``` + +Confirm that the `hdf5-cells` image is available. Note the version you want to use (typically `"latest"`, or a specific release tag for reproducibility).
+ +### Step 3: Submit a dry-run job + +``` +POST /api/v1/transformations/jobs +``` + +```json +{ + "configuration_id": <configuration_id>, + "dry_run": true, + "image_reference": { + "name": "hdf5-cells", + "version": "latest" + }, + "input_accessions": ["<attachment_accession>"], + "volume_size": 30 +} +``` + +The response includes the `id` of the created job. As a guideline for setting `volume_size`: +- For H5AD input files, allocate approximately **1.4× the size of the original attachment** (in GB). +- For 10x H5 input files, allocate at least **4× the size of the original attachment** (in GB). + +### Step 4: Monitor the dry-run job + +``` +GET /api/v1/transformations/jobs/{job_id} +``` + +Repeat until `status.state` reaches a terminal value: `COMPLETED` or `FAILED`. + +### Step 5: Review the logs + +``` +POST /api/v1/transformations/jobs/{job_id}/logs +``` + +Review the logs for warnings and errors. Pay particular attention to: +- Configuration validation messages. +- The file structure report (which metadata keys are present in your file). +- Linking validation results (whether all cell `batch` values map to existing SLP objects). +- Any columns flagged for automatic renaming or data type coercion. + +If issues are found, update the configuration and repeat from Step 3. See [How to iterate on a configuration using dry runs](#how-to-iterate-on-a-configuration-using-dry-runs) for the recommended cycle. + +### Step 6: Submit the full run + +Once the dry run completes without issues, submit the same job with `dry_run` set to `false`: + +``` +POST /api/v1/transformations/jobs +``` + +```json +{ + "configuration_id": <configuration_id>, + "dry_run": false, + "image_reference": { + "name": "hdf5-cells", + "version": "latest" + }, + "input_accessions": ["<attachment_accession>"], + "volume_size": 30 +} +``` + +Monitor and retrieve logs the same way as the dry run (Steps 4–5). When the job completes, the logs contain the ODM accessions assigned to each object that was created or updated.
+ +--- + +## How to iterate on a configuration using dry runs + +This guide describes the recommended iterative cycle for refining a transformation configuration before committing to a full run. Use this when the initial dry run reveals warnings or errors that require attention. + +The cycle follows this pattern: + +``` +Create configuration → Submit dry-run job → Review logs + ↑ | + └──── Update configuration ←──────────────┘ + (if issues found) +``` + +After reviewing the logs from a dry-run job, update the existing configuration using: + +``` +PUT /api/v1/transformations/configurations/{config_id} +``` + +The request body follows the same structure as the original `POST`. The configuration at the given `id` is fully replaced with the new content. + +Then resubmit the dry-run job with the same `configuration_id`. Because the configuration is updated in place, you can reuse the same `configuration_id` across all iterations without creating a new configuration for each attempt. + +Repeat until the dry run completes without errors or warnings that require action. Then submit the full run. + +--- + +## How to ingest cell and expression data from an H5AD file + +Use this when the study already has Sample, Library, or Preparation groups in ODM and you only need to add the single-cell layer. Configure at least `cell_metadata`, `feature_metadata`, and `cell_expression` in your configuration's `data` field. + +```json +{ + "file_type": "h5ad", + "cell_metadata": { + "metadata_keys": { + "obs": "metadata", + "obsm": "embedding" + }, + "columns_to_drop": ["taxon", "organism_id"], + "columns_renaming_map": { + "sample": "batch", + "pctmt": "percentMito" + } + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata" + } + }, + "cell_expression": { + "data_class": "Single-cell transcriptomics" + } +} +``` + +The transformation resolves the linking target automatically (Library → Preparation → Sample). 
To link to a specific group, set `cell_metadata.linking_group` explicitly: + +```json +"cell_metadata": { + "linking_group": { + "library": "GSF017080" + } +} +``` + +To link to all preparation groups in the study without specifying their accessions individually, set an empty value: + +```json +"cell_metadata": { + "linking_group": { + "preparation": [] + } +} +``` + +--- + +## How to create Sample, Library, or Preparation groups from your H5AD file + +Use this when your study does not yet have SLP groups in ODM, or when you want to derive biosample-level attributes from the cell metadata. + +Identify the column in your cell metadata that acts as a biosample identifier. Set this as `biosample_column_name`. Under the relevant entity (`sample`, `library`, or `preparation`), set `create_new_group: true` and list the columns to export under `columns_to_export`. + +```json +{ + "file_type": "h5ad", + "biosample_metadata": { + "metadata_keys": { + "obs": "metadata" + }, + "biosample_column_name": "sample_id", + "sample": { + "create_new_group": true, + "columns_to_export": ["tissue", "disease", "donor_id"], + "columns_renaming_map": { + "tissue": "tissueType" + } + } + }, + "cell_metadata": { + "metadata_keys": { + "obs": "metadata", + "obsm": "embedding" + } + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata" + } + }, + "cell_expression": { + "data_class": "Single-cell transcriptomics" + } +} +``` + +The transformation aggregates cells by `biosample_column_name` and exports only attributes that are constant per biosample. Exported columns are automatically removed from the cell metadata. + +Only one of `library` or `preparation` may have `columns_to_export` set in the same configuration. 
+ +To create a Library group without exporting attributes (a placeholder group used only for linking), set `create_new_group: true` and omit `columns_to_export`: + +```json +"library": { + "create_new_group": true +} +``` + +--- + +## How to update existing biosample metadata + +Use this when SLP groups already exist in ODM but are missing attributes that are present in your HDF5 file. + +Configure `biosample_metadata` with `columns_to_export` for the target entity, but do not set `create_new_group: true`. + +```json +{ + "file_type": "h5ad", + "biosample_metadata": { + "metadata_keys": { + "obs": "metadata" + }, + "biosample_column_name": "library_id", + "library": { + "columns_to_export": ["sequencing_platform", "library_strategy"] + } + } +} +``` + +The transformation matches extracted rows to existing ODM objects on the entity ID column and updates only attributes that do not already exist. If any extracted ID does not match an existing ODM object, the transformation raises an error. Run a dry run first to catch ID mismatches before committing data. + +--- + +## How to discover which biosample attributes are available in your file + +Use this to identify which cell-level metadata columns are uniform per biosample (and therefore eligible for export), without uploading anything. + +Submit a dry-run job with `biosample_metadata` configured but without any `columns_to_export` entries: + +```json +{ + "file_type": "h5ad", + "biosample_metadata": { + "metadata_keys": { + "obs": "metadata" + }, + "biosample_column_name": "sample_id", + "sample": {} + } +} +``` + +The transformation logs the number of unique biosamples found and the list of columns that are constant across all cells per biosample. Use the logged list to plan your `columns_to_export` configuration before running a full ingestion. + +--- + +## How to process a 10x Genomics H5 file + +The only required change compared to an H5AD configuration is setting `file_type` to `"h5"`. 
Use the same H5AD key names (`obs`, `var`) in `metadata_keys` — the transformation converts the 10x H5 format to H5AD internally and applies unified processing. + +```json +{ + "file_type": "h5", + "cell_metadata": { + "metadata_keys": { + "obs": "metadata" + } + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata" + } + }, + "cell_expression": { + "data_class": "Single-cell transcriptomics" + } +} +``` + +When sizing the `volume_size` for a job using an H5 file, allocate at least **4× the size of the original attachment** (in GB). H5 files require significantly more scratch space due to the conversion step. + +Legacy 10x H5 files (v<3) are supported only if the file contains a single genome. Pre-process multi-genome files to extract the genome of interest before running the transformation. + +--- + +## How to configure metadata curation + +These operations are available in `cell_metadata`, `feature_metadata`, and per-entity settings within `biosample_metadata`. They are applied in a fixed order (given at the end of this section), not in the order in which they are presented below. + +**To rename a column:** + +```json +"columns_renaming_map": { + "sample": "batch", + "pctmt": "percentMito" +} +``` + +**To drop columns:** + +```json +"columns_to_drop": ["taxon", "organism_id"] +``` + +**To fill missing values:** + +```json +"columns_to_fill_missing_values": { + "batch": "unknown" +} +``` + +**To replace specific values:** + +```json +"columns_to_curate_values": { + "sample": { + "LGVXCTRL1": "lung_healthy_1" + } +} +``` + +**To set a constant value for all rows:** + +```json +"set_column_value": { + "sample_id": "lung_1" +} +``` + +**To prevent a column from being automatically renamed:** + +```json +"columns_to_preserve_name": ["cluster_leiden_0.5"] +``` + +Operations are applied in order: drop → rename → fill missing values → curate values → set constant values. Attribute name standardization (mapping to ODM standard names and converting others to camelCase) runs after all explicit column operations.
Columns listed in `columns_to_preserve_name` are exempt from this standardization step. + +For full parameter specifications, see the [Configuration Reference](configuration-reference.md). diff --git a/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md b/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md new file mode 100644 index 0000000..a9db775 --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md @@ -0,0 +1,214 @@ +# Transformation Process Reference: Single-Cell HDF5 Transformation + +> **Related documentation:** For conceptual background, see [About Single-Cell HDF5 Transformations in ODM](about-sc-hdf5-transformations.md). For configuration parameter definitions, see the [Configuration Reference](configuration-reference.md). + +This reference describes the internal processing stages of the single-cell HDF5 transformation pipeline. It is intended for users who need to understand what the pipeline does at each stage — for example, to interpret logs, diagnose errors, or reason about the order of operations. It is not a guide to running the transformation; see the [How-to Guides](how-to-sc-hdf5-transformations.md) for that purpose. + +--- + +## Stage 1: Initial setup and file preparation + +### 1.1 Configuration loading and validation + +The pipeline reads the transformation configuration file and validates all fields. + +**Top-level key validation** is performed first, checking the presence and data types of: `file_type`, `save_logs`, `biosample_metadata`, `cell_metadata`, `feature_metadata`, and `cell_expression`. If `file_type` is missing or contains an unsupported value (`"h5ad"` and `"h5"` are the only accepted values), the pipeline raises an error immediately and does not proceed. + +For all remaining sections, validation errors are accumulated and reported together at the end of the validation stage, so that all issues in the configuration are surfaced in a single run. 
+ +**Per-section validation** covers: + +- Presence of required keys within each optional section. +- Data type correctness for every key in the section. +- Key-value correctness for `metadata_keys` entries. +- `biosample_metadata`: ensures that `library` and `preparation` are not both configured for simultaneous update. +- `cell_expression`: validates `number_format` as either a printf-format string or a NumPy dtype string; the resolved dtype is stored back into the configuration for downstream use. + +Unrecognized keys at any level are logged as warnings and ignored. + +### 1.2 Attachment and study metadata retrieval + +The pipeline retrieves the accession and metadata of the input HDF5 attachment from ODM. From this, it determines: + +- The name to assign to the processed data objects. +- The study accession that the resulting Cell Group and Expression Group will be associated with. + +### 1.3 Linking group determination + +Before any file processing begins, the pipeline resolves the parent SLP entity (Sample, Library, or Preparation group) to which the Cell Group will be linked. The resolution follows these rules in order: + +- **New SLP group creation deferred:** If `biosample_metadata` is present and any of `sample`, `library`, or `preparation` has `create_new_group: true`, linking resolution is deferred until after those new groups are created and uploaded (Stage 4). The cell group is then linked to the newly created groups. + +- **Explicit `linking_group` in `cell_metadata`:** If `cell_metadata.linking_group` is set, the specified entity type and accession(s) are used directly. An empty value (e.g. `[]`) resolves to all available group accessions of the specified entity type for the study. + +- **Auto-discovery:** If neither of the above applies, the pipeline fetches all SLP groups associated with the study from ODM and selects the first entity type that has at least one group, checking in the order: **Library → Preparation → Sample**. 
All accessions of the selected type are used for linking. + +If no SLP group can be found and no new group is being created, the pipeline raises an error. + +### 1.4 Temporary directory and file preparation + +A temporary directory is created to store all intermediate files produced during the run. The input HDF5 file is copied into this directory. If the input is of type `"h5"` (10x Genomics H5), it is converted to H5AD format and stored alongside the original copy, so that subsequent stages can stream from the H5AD representation uniformly regardless of source format. + +### 1.5 File structure inspection + +The pipeline opens the H5AD file and inspects its structure, logging: + +- Top-level keys (groups). +- Data types and shapes. +- Attribute names. + +This output is written to the transformation logs and is useful for verifying which metadata keys (such as `obs`, `var`, `obsm`) are present in the file before extraction begins. + +--- + +## Stage 2: Metadata extraction + +The configuration is checked to determine whether processing for `biosample_metadata`, `cell_metadata`, and/or `feature_metadata` is required. Each configured section is processed independently according to the steps below. + +### 2.1 Configuration and input validation + +For each metadata section, the pipeline reads parameters (data type, input/output files, file type, metadata keys, column operations) and validates the presence of required keys and supported file types. + +### 2.2 Biosample metadata (`biosample_metadata` config) + +When `biosample_metadata` is present in the configuration, the pipeline can export Sample, Library, or Preparation-level attributes derived from cell-level metadata. + +#### Configuration and input validation + +Parameters are read from `biosample_metadata`: `metadata_keys`, `biosample_column_name`, and per-entity settings under `sample`, `library`, and/or `preparation`. 
These include parameters for identifying exportable attributes (`columns_to_export`), metadata curation (`columns_renaming_map`, `columns_to_fill_missing_values`, `columns_to_curate_values`), and group creation and linking (`create_new_group`, `template_id`, `linking_group`). + +Only one of `library` or `preparation` may have `columns_to_export` set. Attributes exported to biosample metadata are automatically removed from the cell metadata in the subsequent processing step. Biosample attributes that do not need to be exported but also should not remain in cell metadata must be listed in `cell_metadata.columns_to_drop`. + +#### File reading and metadata extraction + +The pipeline opens the H5AD file, reads the metadata from the group indicated by `metadata_keys`, and groups the resulting table by `biosample_column_name`. Only attributes that are constant per biosample and listed in `columns_to_export` are processed for export. + +For each entity type with `columns_to_export` configured, columns are filtered and optionally curated; the entity ID column(s) (e.g. Sample Source ID, Library ID, Preparation ID) are set from the configuration, and the result is written to a TSV file in the temporary directory. + +A special scenario — exporting a placeholder group containing only ID column(s) — can be configured by setting `create_new_group: true` and omitting `columns_to_export` (when dry-run mode is disabled). + +#### Discovery mode + +When `biosample_metadata` is present, no entity has `columns_to_export` defined, and `dry_run` is enabled, the pipeline runs in discovery-only mode. It identifies which columns are uniform per value of `biosample_column_name`, logs the number of unique biosamples and the list of those attributes, and returns without writing any TSV. No ODM objects are created or modified. 
+ +#### Existing biosample metadata update + +When `columns_to_export` is configured for an entity but `create_new_group` is not set, the pipeline prepares an update to existing ODM metadata objects rather than creating new groups. + +It fetches the current metadata for the entity type, then runs a matching procedure joining the extracted metadata to the existing metadata on the entity ID column (e.g. Sample Source ID, Library ID, Preparation ID). Only attributes that do not already exist in the ODM metadata are retained; columns with the same name are skipped. If any extracted ID does not match an existing ODM object, an error is raised listing the unmatched IDs. The matching result is written as a TSV file for use in Stage 4. + +### 2.3 File reading and metadata extraction + +For cell and feature metadata sections, the pipeline opens the H5AD file and reads the groups specified in `metadata_keys`: + +- Standard metadata (value `"metadata"`) is loaded into a DataFrame. +- **Embeddings** (`"embedding"`): Multidimensional arrays are serialized as comma-separated strings and added as columns. +- **Pairwise** (`"pairwise"`): The row mean of each pairwise matrix is calculated and added as a column. + +### 2.4 Index handling and sanity checks + +- If the DataFrame is empty and has neither columns nor an index, an error is raised. +- If the index is unnamed, it is assigned the default name `_index`. +- If the index name collides with an existing column name, it is renamed to avoid the conflict. +- The index is extracted and appended as a column to ensure barcode or feature ID information is preserved for downstream validation. + +> **Note:** If the cell barcode is stored in the index and the index has no name, the extracted column will be named `_index`. To use a different name, rename it using `columns_renaming_map` in the configuration. 
+ +### 2.5 Column operations + +The following transformations are applied in the order listed, when specified in the configuration: + +1. **Drop columns** (`columns_to_drop`) +2. **Rename columns** (`columns_renaming_map`) +3. **Fill missing values** (`columns_to_fill_missing_values`) +4. **Curate values** (`columns_to_curate_values`) +5. **Coerce data types** +6. **Set constant values** (`set_column_value`) + +After all explicit column operations, **attribute name standardization** is applied: column names are mapped to ODM standard attribute names where a mapping exists; non-standard names are converted to camelCase. Columns listed in `columns_to_preserve_name` are exempt from this step. + +Data type validation is then performed on the resulting DataFrame. + +**Cell metadata additional steps:** + +- **Required column validation:** + - `barcode`: Unique cell identifiers. Duplicate or missing values cause an error. + - `batch`: Sample/library/preparation identifiers for linking. Missing values cause an error. +- **QC metric calculation** (if `add_qc_metrics` is `true` and `dry_run` is `false`): Number of counts, number of genes, percentage mitochondrial expression, and percentage ribosomal expression are computed and added if not already present. + +**Feature metadata additional steps:** + +- **Gene ID mapping** (if `map_gene_ids_to_names` is `true`): If gene names are absent and the standard `geneId` column is present, the pipeline infers the ID source (Ensembl or NCBI) and the species. If both can be determined, a new column with the mapped gene names is added. Supported organisms and annotation releases are listed in the [Configuration Reference](configuration-reference.md#map_gene_ids_to_names). + +### 2.6 Storing data + +The processed metadata DataFrame is written to the temporary directory as a TSV file. 
+ +--- + +## Stage 3: Cell expression extraction + +### 3.1 Configuration and input validation + +The pipeline reads expression parameters: `data_class`, `compression_level`, `chunk_size`, `max_buffer_size`, and `number_format`. Parameters not specified in the configuration are either inferred from the data or set to sensible defaults. + +### 3.2 Expression matrix reading and validation + +The cell expression matrix is read from the HDF5 file. The pipeline validates that the matrix shape matches the number of cells and features as determined by the extracted metadata. + +### 3.3 Expression data writing + +The expression data, enriched with feature metadata according to the configuration, is written to a Brotli-compressed file (`.br`) in the temporary directory. + +### 3.4 Expression metadata reading and writing + +Expression metadata from the source attachment is read and transformed according to `columns_to_drop`, `columns_renaming_map`, and `set_column_value`, unless `source_file_metadata` is `false`. + +The following statistics are always computed and appended to the metadata regardless of the `source_file_metadata` flag: + +1. Total number of cells +2. Total number of features +3. Sparsity (%) +4. Number of non-zero values + +Source file accession and source file name are also included. The generated metadata file is written to the temporary directory. + +--- + +## Stage 4: Final steps and upload + +### 4.1 Dry run halt and validation + +If `dry_run` is `true`, the pipeline halts at this point. Expression matrix compression is skipped. Logs are reported but not saved as attachments. + +When `dry_run` is enabled and the cell linking group has been resolved, the pipeline performs a best-effort linking validation: + +- **Biosample coverage:** Unique values in the cell metadata `batch` column are compared against the ID values of the resolved SLP entity. Unmatched values are logged as warnings. 
+- **Duplicate IDs:** If the same ID value appears in more than one SLP object, a warning is logged (cells could multi-map to multiple entities). +- **Group accession coverage:** Group accessions that contain no biosample objects matching any cell `batch` value are logged as warnings. + +Validation mismatches are reported as warnings and do not abort the dry run. Use them to correct the configuration before submitting a full run. + +### 4.2 Upload to ODM + +Upload proceeds in a fixed order. Deviating from this order would break the linking chain. + +**4.2.1 SLP groups** + +If `biosample_metadata` is configured with at least one entity: + +- **New groups** (for entities with `create_new_group: true`): The corresponding TSV is uploaded as a new group via the entity-specific API endpoint, with `template_id` applied if specified. The new group is then linked to its parent — Sample groups are linked to the study accession; Library and Preparation groups are linked to a Sample group, resolved in this order: (1) `linking_group.sample` in the entity's config, (2) a Sample group created in the same run, (3) pre-fetched Sample group accessions for the study. The new group's accession is stored for use in the cell group linking step (library/preparation takes priority over sample). + +- **Existing groups** (for entities with `create_new_group` not set): For each row in the update TSV produced in Stage 2.2, the pipeline calls the ODM PATCH API endpoint for that entity's accession with the new attribute values. + +**4.2.2 Cell Group upload** + +The transformed cell metadata TSV is uploaded as a new Cell Group, which is linked to the parent SLP entity determined in Stage 1.3 (or resolved in Stage 4.2.1 if new SLP groups were created). + +**4.2.3 Expression Group upload** + +The Brotli-compressed expression file and its metadata file are uploaded to create a new Expression Group, which is linked to the newly created Cell Group. 
+ +**4.2.4 Log upload** + +Transformation logs are uploaded as an attachment together with their metadata. This step is skipped if `save_logs` is `false`. From 8a6f027cef88141af04879122c77631df3d9c493 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Thu, 26 Mar 2026 17:04:15 +0100 Subject: [PATCH 02/74] Add different configuration reference options --- .../doc-odm-user-guide/config-ref-option-A.md | 157 ++++++++++ .../doc-odm-user-guide/config-ref-option-B.md | 152 ++++++++++ .../doc-odm-user-guide/config-ref-option-C.md | 286 ++++++++++++++++++ 3 files changed, 595 insertions(+) create mode 100644 docs/user-guide/doc-odm-user-guide/config-ref-option-A.md create mode 100644 docs/user-guide/doc-odm-user-guide/config-ref-option-B.md create mode 100644 docs/user-guide/doc-odm-user-guide/config-ref-option-C.md diff --git a/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md b/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md new file mode 100644 index 0000000..a1a3a48 --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md @@ -0,0 +1,157 @@ +# Configuration Reference: Single-Cell HDF5 Transformation + +> **Related documentation:** [About SC HDF5 Transformations](about-sc-hdf5-transformations.md) · [How-to Guides](how-to-sc-hdf5-transformations.md) · [API Reference](api-reference.md) · [Transformation Process Reference](transformation-process-reference.md) + +The configuration is validated at the start of every run. If `file_type` is missing or invalid, the pipeline raises an error immediately. All other validation errors are collected and reported together. Unrecognised keys are ignored with a warning. + +--- + +## Top-level parameters + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `file_type` | `string` | **Yes** | — | Format of the input file. Accepted values: `"h5ad"`, `"h5"`. 
| +| `dry_run` | `boolean` | No | `false` | Runs all extraction, validation, and linking steps without uploading data or saving logs. Compression is also skipped. | +| `save_logs` | `boolean` | No | `true` | When `false`, logs are not saved as an attachment after the run. Has no effect when `dry_run` is `true`. | + +--- + +## `biosample_metadata` + +Settings for extracting, transforming, and exporting cell-level metadata to Sample, Library, or Preparation entities. The entire section is optional. + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `metadata_keys` | `dict[string, string]` | Yes | — | Maps HDF5 group keys to metadata types. Use `"obs": "metadata"` to read standard cell metadata. | +| `biosample_column_name` | `string` | Yes | — | Column identifying which biosample each cell belongs to. Rows are grouped by this column for aggregation. | + +**`metadata_keys` example:** +```json +{ "obs": "metadata" } +``` + +### `biosample_metadata.sample` + +Settings for exporting metadata to the Sample entity. Optional. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `create_new_group` | `boolean` | `false` | When `true`, creates a new Sample group in ODM and links it to the study. | +| `template_id` | `string` | — | Template ID for the new Sample group. Falls back to the study default if omitted. | +| `columns_to_export` | `list[string]` | — | Cell metadata columns to include in the exported Sample metadata. Only columns constant per biosample are eligible; exported columns are dropped from cell metadata. | +| `columns_renaming_map` | `dict[string, string]` | — | Maps source column names to new names in the exported metadata. | +| `columns_to_fill_missing_values` | `dict[string, string]` | — | Default values for missing entries in specified columns. 
| +| `columns_to_curate_values` | `dict[string, dict[string, string]]` | — | Maps specific values in a column to replacement values. | + +**Examples:** +```json +{ "columns_renaming_map": { "tissue_type": "tissueType" } } +{ "columns_to_fill_missing_values": { "disease": "unknown" } } +{ "columns_to_curate_values": { "tissue": { "PBMCs": "peripheral blood mononuclear cells" } } } +``` + +### `biosample_metadata.library` + +Accepts the same parameters as `biosample_metadata.sample`, plus: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `linking_group` | `string` | — | Accession of an existing Sample group to link the new Library group to. If omitted, the pipeline uses a Sample group from the same run or pre-fetched accessions. | + +### `biosample_metadata.preparation` + +Accepts the same parameters as `biosample_metadata.library`, including `linking_group`. + +> **Constraint:** Only one of `library` or `preparation` may have `columns_to_export` set in the same configuration. + +--- + +## `cell_metadata` + +Settings for extracting and transforming cell-level metadata. Optional. If absent, no Cell Group is created. + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `metadata_keys` | `dict[string, string]` | Yes | — | Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. | +| `linking_group` | `dict[string, string \| list[string] \| null]` | No | — | Specifies the parent SLP entity (sample/library/preparation) to link the Cell Group to. Empty value triggers auto-discovery of all available accessions. | +| `columns_to_drop` | `list[string]` | No | — | Column names to remove before processing. | +| `columns_renaming_map` | `dict[string, string]` | No | — | Maps source column names to new names. | +| `columns_to_fill_missing_values` | `dict[string, string]` | No | — | Default values for missing entries. 
| +| `columns_to_curate_values` | `dict[string, dict[string, string]]` | No | — | Replacement values for specific entries in specified columns. | +| `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows. Can add new columns or overwrite existing ones. | +| `columns_to_preserve_name` | `list[string]` | No | — | Columns to exempt from internal name standardisation (e.g. Leiden cluster columns with decimal suffixes). | +| `add_qc_metrics` | `boolean` | No | `true` | When `true`, adds QC metrics (counts, genes, mitochondrial/ribosomal presence) if not already present. Skipped when `dry_run` is `true`. | + +**`metadata_keys` accepted values (H5AD):** + +| Key | Value | Description | +|-----|-------|-------------| +| `obs` | `metadata` | Standard cell annotations | +| `obsm` | `embedding` | Multidimensional cell data (PCA, UMAP, etc.) | +| `obsp` | `pairwise` | Pairwise cell annotations | + +For H5 files, use the same H5AD key names — the transformation maps them to the correct internal structure. + +**Examples:** +```json +{ "metadata_keys": { "obs": "metadata", "obsm": "embedding" } } +{ "linking_group": { "library": "GSF017080" } } +{ "columns_to_drop": ["taxon", "organism_id"] } +{ "columns_renaming_map": { "sample": "batch", "pctmt": "percentMito" } } +{ "set_column_value": { "sample_id": "lung_1" } } +{ "columns_to_preserve_name": ["cluster_leiden_0.5"] } +``` + +--- + +## `feature_metadata` + +Settings for extracting and transforming feature (gene)-level metadata. Optional. + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `metadata_keys` | `dict[string, string]` | Yes | — | Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. | +| `columns_to_drop` | `list[string]` | No | — | Column names to remove from feature metadata. 
| +| `columns_renaming_map` | `dict[string, string]` | No | — | Maps source column names to new names. | +| `columns_to_fill_missing_values` | `dict[string, string]` | No | — | Default values for missing entries. | +| `columns_to_curate_values` | `dict[string, dict[string, string]]` | No | — | Replacement values for specific entries. | +| `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows. | +| `columns_to_preserve_name` | `list[string]` | No | — | Columns to exempt from internal name standardisation. | +| `map_gene_ids_to_names` | `boolean` | No | `true` | When `true`, maps gene IDs to gene names if names are absent and `geneId` column is present. Set to `false` for proteomics or non-gene-ID data. | + +**`metadata_keys` accepted values (H5AD):** + +| Key | Value | Description | +|-----|-------|-------------| +| `var` | `metadata` | Standard feature annotations | +| `varm` | `embedding` | Multidimensional feature data | +| `varp` | `pairwise` | Pairwise feature annotations | + +> The gene ID column must be named `geneId` for mapping to be performed. + +**Supported organisms (`map_gene_ids_to_names`) — hdf5-cells v0.0.4:** + +| Organism | Genome version | Ensembl release | NCBI release | +|----------|---------------|-----------------|--------------| +| *Homo sapiens* | GRCh38.p14 | 115 | GCF_000001405.40-RS_2025_08 | +| *Mus musculus* | GRCm39 | 115 | GCF_000001635.27-RS_2024_02 | +| *Rattus norvegicus* | GRCr8 | 115 | GCF_036323735.1-RS_2024_02 | +| *Sus scrofa* | Sscrofa11.1 | 115 | 106 | + +--- + +## `cell_expression` + +Settings for extracting and uploading the cell expression matrix. Optional. If absent, no Expression Group is created. + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `data_class` | `string` | **Yes** | — | Data class label for the expression data (e.g. `"Single-cell transcriptomics"`). 
| +| `compression_level` | `integer` (0–9) | No | `4` | Brotli compression level. Higher values produce smaller files at the cost of longer compression time. | +| `chunk_size` | `integer` | No | auto | Number of features processed per chunk. Calculated automatically from available memory if omitted. | +| `max_buffer_size` | `integer` | No | `50` | Amount of data held in memory before being flushed to disk during writing. | +| `number_format` | `string` | No | inferred | Numeric precision of output values. Accepts printf-style (`"%.7g"`, `"%d"`) or NumPy dtype (`"float32"`, `"int64"`). | +| `columns_to_drop` | `list[string]` | No | — | Column names to remove from expression metadata. | +| `columns_renaming_map` | `dict[string, string]` | No | — | Maps source column names to new names. | +| `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows in specified columns. | +| `source_file_metadata` | `boolean` | No | `true` | When `true`, metadata from the source HDF5 attachment is read and included in expression metadata. Summary statistics (cell count, feature count, sparsity, etc.) are always appended regardless of this flag. | diff --git a/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md b/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md new file mode 100644 index 0000000..bdfd44e --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md @@ -0,0 +1,152 @@ +# Configuration Reference: Single-Cell HDF5 Transformation + +> **Related documentation:** [About SC HDF5 Transformations](about-sc-hdf5-transformations.md) · [How-to Guides](how-to-sc-hdf5-transformations.md) · [API Reference](api-reference.md) · [Transformation Process Reference](transformation-process-reference.md) + +The configuration file controls how the transformation reads, processes, and indexes cell and feature metadata from HDF5 files. Parameters are organised by top-level section. 
The configuration is validated at run start; `file_type` errors fail immediately, all others are reported together at the end of validation. Unrecognised keys are ignored with a warning. + +--- + +## Top-level parameters + +**`file_type`** *(string, mandatory)* — Format of the input HDF5 file. Accepted values: `"h5ad"`, `"h5"`. + +**`dry_run`** *(boolean, optional, default: `false`)* — When `true`, performs all extraction, validation, and linking resolution without uploading data or saving logs. Compression is also skipped. Use to validate configuration before committing data. + +**`save_logs`** *(boolean, optional, default: `true`)* — When `false`, logs are not saved as an attachment after the run. Has no effect when `dry_run` is `true`. + +--- + +## `biosample_metadata` + +Settings for extracting, transforming, and exporting cell-level metadata to Sample, Library, or Preparation entities. The entire section is optional. + +**`metadata_keys`** *(dict[string, string], mandatory if section is present)* — Maps HDF5 group keys to metadata types. Use `"obs": "metadata"` to read standard cell metadata as the source for biosample-level aggregation. Example: +```json +{ "obs": "metadata" } +``` + +**`biosample_column_name`** *(string, mandatory if section is present)* — Column identifying which biosample each cell belongs to (e.g. a sample ID or library ID column). Rows are grouped by this column for aggregation. + +### sample + +Settings for exporting metadata to the Sample entity. Optional. + +**`create_new_group`** *(boolean, optional)* — When `true`, creates a new Sample group in ODM and links it to the study. When omitted or `false`, existing Sample group objects are updated instead. + +**`template_id`** *(string, optional)* — Template ID for the new Sample group. Falls back to the study default if omitted. + +**`columns_to_export`** *(list[string], optional)* — Cell metadata columns to include in the exported Sample metadata. 
Only columns constant per biosample are eligible; exported columns are automatically dropped from cell metadata. + +**`columns_renaming_map`** *(dict[string, string], optional)* — Maps source column names to new names. Example: `{ "tissue_type": "tissueType" }` + +**`columns_to_fill_missing_values`** *(dict[string, string], optional)* — Default values for missing entries. Example: `{ "disease": "unknown" }` + +**`columns_to_curate_values`** *(dict[string, dict[string, string]], optional)* — Replacement values for specific entries in specified columns. Example: +```json +{ "tissue": { "PBMCs": "peripheral blood mononuclear cells" } } +``` + +### library + +Accepts the same parameters as `sample`, plus: + +**`linking_group`** *(string, optional)* — Accession of an existing Sample group to link the new Library group to. If omitted, the pipeline uses a Sample group created in the same run, or pre-fetched accessions for the study. + +### preparation + +Accepts the same parameters as `library`, including `linking_group`. + +> **Constraint:** Only one of `library` or `preparation` may have `columns_to_export` set in the same configuration. + +--- + +## `cell_metadata` + +Settings for extracting and transforming cell-level metadata. Optional. If absent, no Cell Group is created. + +**`metadata_keys`** *(dict[string, string], mandatory if section is present)* — Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. Accepted values for H5AD files: + +- `"obs": "metadata"` — Standard cell annotations +- `"obsm": "embedding"` — Multidimensional cell data (PCA, UMAP, etc.) +- `"obsp": "pairwise"` — Pairwise cell annotations (e.g. cell–cell distances) + +For H5 files, use the same H5AD key names — the transformation maps them to the correct internal structure regardless of source format. 
Example: +```json +{ "obs": "metadata", "obsm": "embedding", "obsp": "pairwise" } +``` + +**`linking_group`** *(dict[string, string | list[string] | null], optional)* — Specifies the parent SLP entity to link the Cell Group to. Must contain exactly one key: `sample`, `library`, or `preparation`. An empty value (`[]`, `""`, or `null`) triggers auto-discovery of all available accessions of that type. If the parameter is absent entirely and no new SLP groups are being created, auto-discovery applies in this order: Library → Preparation → Sample. Examples: +```json +{ "library": "GSF017080" } +{ "preparation": [] } +``` + +**`columns_to_drop`** *(list[string], optional)* — Column names to remove before processing. Example: `["taxon", "organism_id"]` + +**`columns_renaming_map`** *(dict[string, string], optional)* — Maps source column names to new names. Example: `{ "sample": "batch", "pctmt": "percentMito" }` + +**`columns_to_fill_missing_values`** *(dict[string, string], optional)* — Default values for missing entries. Example: `{ "batch": "unknown" }` + +**`columns_to_curate_values`** *(dict[string, dict[string, string]], optional)* — Replacement values for specific entries. Example: `{ "sample": { "LGVXCTRL1": "lung_healthy_1" } }` + +**`set_column_value`** *(dict[string, string], optional)* — Sets a constant value for all rows. Can add new attribute columns or overwrite existing ones. Example: `{ "sample_id": "lung_1" }` + +**`columns_to_preserve_name`** *(list[string], optional)* — Columns to exempt from internal name standardisation. Use for columns whose names contain characters that would otherwise be altered (e.g. Leiden cluster columns with decimal suffixes such as `cluster_leiden_0.5`). + +**`add_qc_metrics`** *(boolean, optional, default: `true`)* — When `true`, QC metrics are calculated and added to cell metadata if not already present (counts, genes, mitochondrial and ribosomal gene presence). Skipped when `dry_run` is `true`. 
+ +--- + +## `feature_metadata` + +Settings for extracting and transforming feature (gene)-level metadata. Optional. + +**`metadata_keys`** *(dict[string, string], mandatory if section is present)* — Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. Accepted values for H5AD files: + +- `"var": "metadata"` — Standard feature annotations +- `"varm": "embedding"` — Multidimensional feature data +- `"varp": "pairwise"` — Pairwise feature annotations + +For H5 files, use the same H5AD key names. Example: `{ "var": "metadata", "varm": "embedding" }` + +**`columns_to_drop`** *(list[string], optional)* — Column names to remove from feature metadata. + +**`columns_renaming_map`** *(dict[string, string], optional)* — Maps source column names to new names. + +**`columns_to_fill_missing_values`** *(dict[string, string], optional)* — Default values for missing entries. + +**`columns_to_curate_values`** *(dict[string, dict[string, string]], optional)* — Replacement values for specific entries. + +**`set_column_value`** *(dict[string, string], optional)* — Sets a constant value for all rows. + +**`columns_to_preserve_name`** *(list[string], optional)* — Columns to exempt from internal name standardisation. + +**`map_gene_ids_to_names`** *(boolean, optional, default: `true`)* — When `true`, attempts to map gene IDs to gene names if names are absent and the `geneId` column is present. The pipeline infers the ID source (Ensembl or NCBI) and species automatically. Set to `false` for proteomics or other omics data that do not use gene IDs as identifiers. + +> The gene ID column must be named `geneId` for mapping to be performed. + +Supported organisms (hdf5-cells v0.0.4): *Homo sapiens* (GRCh38.p14, Ensembl 115), *Mus musculus* (GRCm39, Ensembl 115), *Rattus norvegicus* (GRCr8, Ensembl 115), *Sus scrofa* (Sscrofa11.1, Ensembl 115). + +--- + +## `cell_expression` + +Settings for extracting and uploading the cell expression matrix. Optional. 
If absent, no Expression Group is created. + +**`data_class`** *(string, mandatory if section is present)* — Data class label for the expression data. Example: `"Single-cell transcriptomics"` + +**`compression_level`** *(integer 0–9, optional, default: `4`)* — Brotli compression level for the output file. Higher values produce smaller files at the cost of longer compression time. + +**`chunk_size`** *(integer, optional)* — Number of features processed per chunk during export. Calculated automatically from available container memory if omitted. + +**`max_buffer_size`** *(integer, optional, default: `50`)* — Amount of data held in memory before being flushed to disk during expression writing. + +**`number_format`** *(string, optional)* — Numeric precision of values in the output file. Accepts printf-style (`"%.7g"`, `"%d"`) or NumPy dtype (`"float32"`, `"int64"`). Inferred from the data if omitted. + +**`columns_to_drop`** *(list[string], optional)* — Column names to remove from expression metadata. + +**`columns_renaming_map`** *(dict[string, string], optional)* — Maps source column names to new names in expression metadata. + +**`set_column_value`** *(dict[string, string], optional)* — Sets a constant value for all rows in specified expression metadata columns. + +**`source_file_metadata`** *(boolean, optional, default: `true`)* — When `true`, metadata from the source HDF5 attachment is read and included in expression metadata (subject to `columns_to_drop`, `columns_renaming_map`, and `set_column_value`). When `false`, source file metadata extraction is skipped. Regardless of this flag, summary statistics are always appended (total cells, total features, sparsity %, non-zero values, source file accession and name). 
diff --git a/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md b/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md new file mode 100644 index 0000000..0b13597 --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md @@ -0,0 +1,286 @@ +# Configuration Reference: Single-Cell HDF5 Transformation + +> **Related documentation:** [About SC HDF5 Transformations](about-sc-hdf5-transformations.md) · [How-to Guides](how-to-sc-hdf5-transformations.md) · [API Reference](api-reference.md) · [Transformation Process Reference](transformation-process-reference.md) + +--- + +## Quick reference + +| Section | Parameter | Type | Required | Default | +|---------|-----------|------|----------|---------| +| *(top-level)* | `file_type` | string | **Yes** | — | +| *(top-level)* | `dry_run` | boolean | No | `false` | +| *(top-level)* | `save_logs` | boolean | No | `true` | +| `biosample_metadata` | `metadata_keys` | dict[str, str] | Yes* | — | +| `biosample_metadata` | `biosample_column_name` | string | Yes* | — | +| `biosample_metadata.sample` | `create_new_group` | boolean | No | `false` | +| `biosample_metadata.sample` | `template_id` | string | No | — | +| `biosample_metadata.sample` | `columns_to_export` | list[string] | No | — | +| `biosample_metadata.sample` | `columns_renaming_map` | dict[str, str] | No | — | +| `biosample_metadata.sample` | `columns_to_fill_missing_values` | dict[str, str] | No | — | +| `biosample_metadata.sample` | `columns_to_curate_values` | dict[str, dict[str, str]] | No | — | +| `biosample_metadata.library` | *(same as sample)* + `linking_group` | string | No | — | +| `biosample_metadata.preparation` | *(same as library)* | — | — | — | +| `cell_metadata` | `metadata_keys` | dict[str, str] | Yes* | — | +| `cell_metadata` | `linking_group` | dict[str, …] | No | auto | +| `cell_metadata` | `columns_to_drop` | list[string] | No | — | +| `cell_metadata` | `columns_renaming_map` | dict[str, str] | No | — | +| `cell_metadata` | 
`columns_to_fill_missing_values` | dict[str, str] | No | — | +| `cell_metadata` | `columns_to_curate_values` | dict[str, dict[str, str]] | No | — | +| `cell_metadata` | `set_column_value` | dict[str, str] | No | — | +| `cell_metadata` | `columns_to_preserve_name` | list[string] | No | — | +| `cell_metadata` | `add_qc_metrics` | boolean | No | `true` | +| `feature_metadata` | `metadata_keys` | dict[str, str] | Yes* | — | +| `feature_metadata` | `columns_to_drop` | list[string] | No | — | +| `feature_metadata` | `columns_renaming_map` | dict[str, str] | No | — | +| `feature_metadata` | `columns_to_fill_missing_values` | dict[str, str] | No | — | +| `feature_metadata` | `columns_to_curate_values` | dict[str, dict[str, str]] | No | — | +| `feature_metadata` | `set_column_value` | dict[str, str] | No | — | +| `feature_metadata` | `columns_to_preserve_name` | list[string] | No | — | +| `feature_metadata` | `map_gene_ids_to_names` | boolean | No | `true` | +| `cell_expression` | `data_class` | string | **Yes*** | — | +| `cell_expression` | `compression_level` | integer (0–9) | No | `4` | +| `cell_expression` | `chunk_size` | integer | No | auto | +| `cell_expression` | `max_buffer_size` | integer | No | `50` | +| `cell_expression` | `number_format` | string | No | inferred | +| `cell_expression` | `columns_to_drop` | list[string] | No | — | +| `cell_expression` | `columns_renaming_map` | dict[str, str] | No | — | +| `cell_expression` | `set_column_value` | dict[str, str] | No | — | +| `cell_expression` | `source_file_metadata` | boolean | No | `true` | + +`Yes*` = required only if the parent section is present. + +--- + +## Parameter details + +### Top-level + +#### `file_type` +Format of the input HDF5 file. Must be provided; the pipeline cannot proceed without a valid file type. +- Accepted values: `"h5ad"`, `"h5"` + +#### `dry_run` +Runs all extraction, validation, and linking resolution steps without uploading data or saving logs. Compression is also skipped. 
Use to validate configuration before committing data. + +#### `save_logs` +When `false`, logs are not saved as an attachment after the run. Has no effect when `dry_run` is `true`. + +--- + +### `biosample_metadata` + +Settings for extracting, transforming, and exporting cell-level metadata to Sample, Library, or Preparation entities. The entire section is optional. + +#### `metadata_keys` +Maps HDF5 group keys to metadata types. Use `"obs": "metadata"` to read standard cell metadata as the source for biosample-level aggregation. + +```json +{ "obs": "metadata" } +``` + +#### `biosample_column_name` +Column identifying which biosample each cell belongs to (e.g. a sample ID or library ID column). Rows are grouped by this column for aggregation. + +--- + +#### `biosample_metadata.sample` + +#### `create_new_group` +When `true`, creates a new Sample group in ODM and links it to the study. When `false` or omitted, existing Sample group objects are updated instead. + +#### `template_id` +Template ID for the new Sample group. Falls back to the study default if omitted. + +#### `columns_to_export` +Cell metadata columns to include in the exported Sample metadata. Only columns that are constant per biosample are eligible; exported columns are automatically dropped from cell metadata. + +#### `columns_renaming_map` +Maps source column names to new names. +```json +{ "tissue_type": "tissueType" } +``` + +#### `columns_to_fill_missing_values` +Default values for missing entries. +```json +{ "disease": "unknown" } +``` + +#### `columns_to_curate_values` +Replacement values for specific entries in specified columns. +```json +{ "tissue": { "PBMCs": "peripheral blood mononuclear cells" } } +``` + +--- + +#### `biosample_metadata.library` + +Accepts all parameters from `biosample_metadata.sample`, plus: + +#### `linking_group` +Accession of an existing Sample group to link the new Library group to. 
If omitted, the pipeline uses a Sample group created in the same run, or pre-fetched accessions for the study. + +--- + +#### `biosample_metadata.preparation` + +Accepts all parameters from `biosample_metadata.library`, including `linking_group`. + +> **Constraint:** Only one of `library` or `preparation` may have `columns_to_export` set in the same configuration. + +--- + +### `cell_metadata` + +Settings for extracting and transforming cell-level metadata. Optional. If absent, no Cell Group is created. + +#### `metadata_keys` +Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. + +| Key | Value | Description | +|-----|-------|-------------| +| `obs` | `metadata` | Standard cell annotations | +| `obsm` | `embedding` | Multidimensional cell data (PCA, UMAP, etc.) | +| `obsp` | `pairwise` | Pairwise cell annotations | + +For H5 files, use the same H5AD key names — the transformation maps them to the correct internal structure. + +```json +{ "obs": "metadata", "obsm": "embedding", "obsp": "pairwise" } +``` + +#### `linking_group` +Specifies the parent SLP entity (sample/library/preparation) to link the Cell Group to. Must contain exactly one key. An empty value triggers auto-discovery of all available accessions of that entity type. If absent entirely and no new SLP groups are being created, auto-discovery applies: Library → Preparation → Sample. + +```json +{ "library": "GSF017080" } +{ "preparation": [] } +``` + +#### `columns_to_drop` +Column names to remove before processing. +```json +["taxon", "organism_id"] +``` + +#### `columns_renaming_map` +Maps source column names to new names. +```json +{ "sample": "batch", "pctmt": "percentMito" } +``` + +#### `columns_to_fill_missing_values` +Default values for missing entries. +```json +{ "batch": "unknown" } +``` + +#### `columns_to_curate_values` +Replacement values for specific entries in specified columns. 
+```json +{ "sample": { "LGVXCTRL1": "lung_healthy_1" } } +``` + +#### `set_column_value` +Sets a constant value for all rows. Can add new attribute columns or overwrite existing ones. +```json +{ "sample_id": "lung_1" } +``` + +#### `columns_to_preserve_name` +Columns to exempt from internal name standardisation. Use for column names that contain characters that would otherwise be altered (e.g. Leiden cluster columns: `cluster_leiden_0.5`). +```json +["cluster_leiden_0.5"] +``` + +#### `add_qc_metrics` +When `true`, QC metrics are calculated and added to cell metadata if not already present: number of counts, number of genes, mitochondrial and ribosomal gene presence. Skipped when `dry_run` is `true`. + +--- + +### `feature_metadata` + +Settings for extracting and transforming feature (gene)-level metadata. Optional. + +#### `metadata_keys` +Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. + +| Key | Value | Description | +|-----|-------|-------------| +| `var` | `metadata` | Standard feature annotations | +| `varm` | `embedding` | Multidimensional feature data | +| `varp` | `pairwise` | Pairwise feature annotations | + +For H5 files, use the same H5AD key names. + +#### `columns_to_drop` +Column names to remove from feature metadata. + +#### `columns_renaming_map` +Maps source column names to new names. + +#### `columns_to_fill_missing_values` +Default values for missing entries. + +#### `columns_to_curate_values` +Replacement values for specific entries. + +#### `set_column_value` +Sets a constant value for all rows. + +#### `columns_to_preserve_name` +Columns to exempt from internal name standardisation. + +#### `map_gene_ids_to_names` +When `true`, attempts to map gene IDs to gene names if names are absent and the `geneId` column is present. The pipeline infers the ID source (Ensembl or NCBI) and species automatically. Set to `false` for proteomics or other omics data that do not use gene IDs as identifiers. 
+ +> The gene ID column must be named `geneId` for mapping to be performed. + +Supported organisms (hdf5-cells v0.0.4): + +| Organism | Genome version | Ensembl release | NCBI release | +|----------|---------------|-----------------|--------------| +| *Homo sapiens* | GRCh38.p14 | 115 | GCF_000001405.40-RS_2025_08 | +| *Mus musculus* | GRCm39 | 115 | GCF_000001635.27-RS_2024_02 | +| *Rattus norvegicus* | GRCr8 | 115 | GCF_036323735.1-RS_2024_02 | +| *Sus scrofa* | Sscrofa11.1 | 115 | 106 | + +--- + +### `cell_expression` + +Settings for extracting and uploading the cell expression matrix. Optional. If absent, no Expression Group is created. + +#### `data_class` +Data class label for the expression data. +```json +"Single-cell transcriptomics" +``` + +#### `compression_level` +Brotli compression level for the output file. Higher values produce smaller files at the cost of longer compression time. Range: 0–9. + +#### `chunk_size` +Number of features processed per chunk during export. Calculated automatically from available container memory if omitted. + +#### `max_buffer_size` +Amount of data held in memory before being flushed to disk during expression writing. + +#### `number_format` +Numeric precision of values in the output file. Accepts printf-style (`"%.7g"`, `"%d"`) or NumPy dtype (`"float32"`, `"int64"`). Inferred from the data if omitted. + +#### `columns_to_drop` +Column names to remove from expression metadata. + +#### `columns_renaming_map` +Maps source column names to new names in expression metadata. + +#### `set_column_value` +Sets a constant value for all rows in specified expression metadata columns. + +#### `source_file_metadata` +When `true`, metadata from the source HDF5 attachment is read and included in expression metadata (subject to `columns_to_drop`, `columns_renaming_map`, and `set_column_value`). When `false`, source file metadata extraction is skipped. 
+ +Summary statistics are always appended regardless of this flag: total cells, total features, sparsity (%), non-zero values, source file accession, and source file name. From 0d0825f1d9cfb1be9e86315531ccce5134f571ac Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Fri, 27 Mar 2026 12:36:56 +0100 Subject: [PATCH 03/74] Refactor: minor text edit --- .../doc-odm-user-guide/transformation-process-reference.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md b/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md index a9db775..dde6916 100644 --- a/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md +++ b/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md @@ -134,7 +134,7 @@ Data type validation is then performed on the resulting DataFrame. - **Required column validation:** - `barcode`: Unique cell identifiers. Duplicate or missing values cause an error. - `batch`: Sample/library/preparation identifiers for linking. Missing values cause an error. -- **QC metric calculation** (if `add_qc_metrics` is `true` and `dry_run` is `false`): Number of counts, number of genes, percentage mitochondrial expression, and percentage ribosomal expression are computed and added if not already present. +- **QC metric calculation** (if `add_qc_metrics` is not `false` and `dry_run` is `false`): Number of counts, number of genes, percentage mitochondrial expression, and percentage ribosomal expression are computed and added if not already present. 
**Feature metadata additional steps:** From b8b90e8a262eee44961c84a3bac344caa07b48e4 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Fri, 27 Mar 2026 12:44:18 +0100 Subject: [PATCH 04/74] Fix: Remove dry_run from configuration parameters across the files --- .../doc-odm-user-guide/config-ref-option-A.md | 5 ++--- .../doc-odm-user-guide/config-ref-option-B.md | 6 ++---- .../doc-odm-user-guide/config-ref-option-C.md | 8 ++------ .../doc-odm-user-guide/configuration-reference.md | 12 ++---------- .../how-to-sc-hdf5-transformations.md | 2 +- .../transformation-process-reference.md | 6 +++--- 6 files changed, 12 insertions(+), 27 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md b/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md index a1a3a48..d223bc8 100644 --- a/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md +++ b/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md @@ -11,8 +11,7 @@ The configuration is validated at the start of every run. If `file_type` is miss | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| | `file_type` | `string` | **Yes** | — | Format of the input file. Accepted values: `"h5ad"`, `"h5"`. | -| `dry_run` | `boolean` | No | `false` | Runs all extraction, validation, and linking steps without uploading data or saving logs. Compression is also skipped. | -| `save_logs` | `boolean` | No | `true` | When `false`, logs are not saved as an attachment after the run. Has no effect when `dry_run` is `true`. | +| `save_logs` | `boolean` | No | `true` | When `false`, logs are not saved as an attachment after the run. Has no effect when environment variable `dry_run` is `true`. | --- @@ -80,7 +79,7 @@ Settings for extracting and transforming cell-level metadata. Optional. If absen | `columns_to_curate_values` | `dict[string, dict[string, string]]` | No | — | Replacement values for specific entries in specified columns. 
| | `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows. Can add new columns or overwrite existing ones. | | `columns_to_preserve_name` | `list[string]` | No | — | Columns to exempt from internal name standardisation (e.g. Leiden cluster columns with decimal suffixes). | -| `add_qc_metrics` | `boolean` | No | `true` | When `true`, adds QC metrics (counts, genes, mitochondrial/ribosomal presence) if not already present. Skipped when `dry_run` is `true`. | +| `add_qc_metrics` | `boolean` | No | `true` | When `true`, adds QC metrics (counts, genes, mitochondrial/ribosomal presence) if not already present. Skipped when environment variable `dry_run` is `true`. | **`metadata_keys` accepted values (H5AD):** diff --git a/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md b/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md index bdfd44e..bec7278 100644 --- a/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md +++ b/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md @@ -10,9 +10,7 @@ The configuration file controls how the transformation reads, processes, and ind **`file_type`** *(string, mandatory)* — Format of the input HDF5 file. Accepted values: `"h5ad"`, `"h5"`. -**`dry_run`** *(boolean, optional, default: `false`)* — When `true`, performs all extraction, validation, and linking resolution without uploading data or saving logs. Compression is also skipped. Use to validate configuration before committing data. - -**`save_logs`** *(boolean, optional, default: `true`)* — When `false`, logs are not saved as an attachment after the run. Has no effect when `dry_run` is `true`. +**`save_logs`** *(boolean, optional, default: `true`)* — When `false`, logs are not saved as an attachment after the run. Has no effect when environment variable `dry_run` is `true`. 
--- @@ -93,7 +91,7 @@ For H5 files, use the same H5AD key names — the transformation maps them to th **`columns_to_preserve_name`** *(list[string], optional)* — Columns to exempt from internal name standardisation. Use for columns whose names contain characters that would otherwise be altered (e.g. Leiden cluster columns with decimal suffixes such as `cluster_leiden_0.5`). -**`add_qc_metrics`** *(boolean, optional, default: `true`)* — When `true`, QC metrics are calculated and added to cell metadata if not already present (counts, genes, mitochondrial and ribosomal gene presence). Skipped when `dry_run` is `true`. +**`add_qc_metrics`** *(boolean, optional, default: `true`)* — When `true`, QC metrics are calculated and added to cell metadata if not already present (counts, genes, mitochondrial and ribosomal gene presence). Skipped when environment variable `dry_run` is `true`. --- diff --git a/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md b/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md index 0b13597..9af45b9 100644 --- a/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md +++ b/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md @@ -9,7 +9,6 @@ | Section | Parameter | Type | Required | Default | |---------|-----------|------|----------|---------| | *(top-level)* | `file_type` | string | **Yes** | — | -| *(top-level)* | `dry_run` | boolean | No | `false` | | *(top-level)* | `save_logs` | boolean | No | `true` | | `biosample_metadata` | `metadata_keys` | dict[str, str] | Yes* | — | | `biosample_metadata` | `biosample_column_name` | string | Yes* | — | @@ -60,11 +59,8 @@ Format of the input HDF5 file. Must be provided; the pipeline cannot proceed without a valid file type. - Accepted values: `"h5ad"`, `"h5"` -#### `dry_run` -Runs all extraction, validation, and linking resolution steps without uploading data or saving logs. Compression is also skipped. Use to validate configuration before committing data. 
- #### `save_logs` -When `false`, logs are not saved as an attachment after the run. Has no effect when `dry_run` is `true`. +When `false`, logs are not saved as an attachment after the run. Has no effect when environment variable `dry_run` is `true`. --- @@ -196,7 +192,7 @@ Columns to exempt from internal name standardisation. Use for column names that ``` #### `add_qc_metrics` -When `true`, QC metrics are calculated and added to cell metadata if not already present: number of counts, number of genes, mitochondrial and ribosomal gene presence. Skipped when `dry_run` is `true`. +When `true`, QC metrics are calculated and added to cell metadata if not already present: number of counts, number of genes, mitochondrial and ribosomal gene presence. Skipped when environment variable `dry_run` is `true`. --- diff --git a/docs/user-guide/doc-odm-user-guide/configuration-reference.md b/docs/user-guide/doc-odm-user-guide/configuration-reference.md index 46fc11e..e9e9991 100644 --- a/docs/user-guide/doc-odm-user-guide/configuration-reference.md +++ b/docs/user-guide/doc-odm-user-guide/configuration-reference.md @@ -22,14 +22,6 @@ Unrecognized keys are ignored with a warning logged. Specifies the format of the input HDF5 file. Must be provided; the pipeline cannot proceed without a valid file type. -### `dry_run` - -| | | -|---|---| -| **Type** | `boolean` | -| **Default** | `false` | - -When `true`, the transformation performs all extraction, validation, and linking resolution steps but does not upload any data to ODM and does not save logs as an attachment. Expression matrix compression is also skipped. Use to validate configuration before committing data. ### `save_logs` @@ -38,7 +30,7 @@ When `true`, the transformation performs all extraction, validation, and linking | **Type** | `boolean` | | **Default** | `true` | -When `false`, transformation logs are not saved as an attachment in ODM after the run completes. Has no effect when `dry_run` is `true`. 
+When `false`, transformation logs are not saved as an attachment in ODM after the run completes. Has no effect when environment variable `dry_run` is `true`. --- @@ -320,7 +312,7 @@ Example: | **Type** | `boolean` | | **Default** | `true` | -When `true`, QC metrics are calculated and added to the cell metadata if not already present. QC metrics include number of counts, number of genes, and mitochondrial and ribosomal gene presence. When `false`, or when `dry_run` is `true`, QC calculation is skipped. +When `true`, QC metrics are calculated and added to the cell metadata if not already present. QC metrics include number of counts, number of genes, and mitochondrial and ribosomal gene presence. When `false`, or when environment variable `dry_run` is `true`, QC calculation is skipped. --- diff --git a/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md index b3818e5..3edf166 100644 --- a/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md @@ -105,7 +105,7 @@ If issues are found, update the configuration and repeat from Step 3. 
See [How t ### Step 6: Submit the full run -Once the dry run completes without issues, submit the same job with `dry_run` set to `false`: +Once the dry run completes without issues, submit the same job with environment variable `dry_run` set to `false`: ``` POST /api/v1/transformations/jobs diff --git a/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md b/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md index dde6916..9d3c75d 100644 --- a/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md +++ b/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md @@ -89,7 +89,7 @@ A special scenario — exporting a placeholder group containing only ID column(s #### Discovery mode -When `biosample_metadata` is present, no entity has `columns_to_export` defined, and `dry_run` is enabled, the pipeline runs in discovery-only mode. It identifies which columns are uniform per value of `biosample_column_name`, logs the number of unique biosamples and the list of those attributes, and returns without writing any TSV. No ODM objects are created or modified. +When `biosample_metadata` is present, no entity has `columns_to_export` defined, and environment variable `dry_run` is enabled, the pipeline runs in discovery-only mode. It identifies which columns are uniform per value of `biosample_column_name`, logs the number of unique biosamples and the list of those attributes, and returns without writing any TSV. No ODM objects are created or modified. #### Existing biosample metadata update @@ -134,7 +134,7 @@ Data type validation is then performed on the resulting DataFrame. - **Required column validation:** - `barcode`: Unique cell identifiers. Duplicate or missing values cause an error. - `batch`: Sample/library/preparation identifiers for linking. Missing values cause an error. 
-- **QC metric calculation** (if `add_qc_metrics` is not `false` and `dry_run` is `false`): Number of counts, number of genes, percentage mitochondrial expression, and percentage ribosomal expression are computed and added if not already present. +- **QC metric calculation** (if `add_qc_metrics` is not `false` and environment variable `dry_run` is `false`): Number of counts, number of genes, percentage mitochondrial expression, and percentage ribosomal expression are computed and added if not already present. **Feature metadata additional steps:** @@ -179,7 +179,7 @@ Source file accession and source file name are also included. The generated meta ### 4.1 Dry run halt and validation -If `dry_run` is `true`, the pipeline halts at this point. Expression matrix compression is skipped. Logs are reported but not saved as attachments. +If environment variable `dry_run` is `true`, the pipeline halts at this point. Expression matrix compression is skipped. Logs are reported but not saved as attachments. 
When `dry_run` is enabled and the cell linking group has been resolved, the pipeline performs a best-effort linking validation: From c2e695d9dc80a5c9f026429c33ced496f6e85a0b Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:39:44 +0200 Subject: [PATCH 05/74] Text edit Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index c150a6a..e40117d 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -1,4 +1,4 @@ -# About Single-Cell HDF5 Transformations in ODM +# Single-Cell HDF5 Transformations Overview > **Related documentation:** For step-by-step guidance on running the transformation, see the [How-to Guides](how-to-sc-hdf5-transformations.md). For the full configuration schema, see the [Configuration Reference](configuration-reference.md). For the API endpoints, see the [API Reference](api-reference.md). For the internal processing pipeline, see the [Transformation Process Reference](transformation-process-reference.md). 
From 1d87df4c97ae1dcce7270c9490992ba42965bb69 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:41:03 +0200 Subject: [PATCH 06/74] Text edit Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index e40117d..1c54c65 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -4,7 +4,7 @@ Single-cell datasets are commonly distributed as HDF5-based files — most often H5AD (the AnnData format) or the 10x Genomics H5 format. While these formats are rich and expressive, they are not directly ingestible into ODM in a way that supports consistent indexing, cross-dataset search, and the entity relationships that ODM relies on for downstream discovery. -The single-cell HDF5 transformation exists to bridge this gap. Rather than requiring users to hand-craft intermediate files or manually restructure their data, the transformation automates the entire end-to-end process: reading the source file, extracting and optionally curating the relevant metadata, and uploading the results as structured ODM objects. +The result is feature-level indexed data that is ready for downstream analysis and cross-study discovery without manual file preparation. 
## The ODM entity model for single-cell data From 939a6d15609e2103b8bb23612e15807240e11d71 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:42:04 +0200 Subject: [PATCH 07/74] Modify text describing goal of transformation Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 1c54c65..82ecc28 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -2,7 +2,7 @@ > **Related documentation:** For step-by-step guidance on running the transformation, see the [How-to Guides](how-to-sc-hdf5-transformations.md). For the full configuration schema, see the [Configuration Reference](configuration-reference.md). For the API endpoints, see the [API Reference](api-reference.md). For the internal processing pipeline, see the [Transformation Process Reference](transformation-process-reference.md). -Single-cell datasets are commonly distributed as HDF5-based files — most often H5AD (the AnnData format) or the 10x Genomics H5 format. While these formats are rich and expressive, they are not directly ingestible into ODM in a way that supports consistent indexing, cross-dataset search, and the entity relationships that ODM relies on for downstream discovery. +This transformation converts a single-cell HDF5 file into the ODM-compatible output files. It extracts expression data and related cell metadata, and can optionally harmonize metadata and create or update biosample objects in ODM. The output files are then imported and linked automatically. The result is feature-level indexed data that is ready for downstream analysis and cross-study discovery without manual file preparation. 
From 235ea233f28c4be89f03a151b9704803099abacd Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:42:23 +0200 Subject: [PATCH 08/74] Text edit Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 82ecc28..24ad086 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -79,5 +79,5 @@ This design allows the configuration to evolve (through iterations of the iterat The transformation supports the following HDF5-based input formats: - **H5AD (AnnData)** — the native format of the AnnData Python library, widely used in the single-cell ecosystem. -- **10x Genomics H5** — converted internally to H5AD before processing, so the extraction logic is unified regardless of the source format. +- **10x Genomics H5** — converted internally to H5AD before processing, so the same extraction workflow is used regardless of the input format. - **Legacy 10x Genomics H5 (v<3)** — supported provided the file contains a single genome. Multi-genome legacy files are not supported. 
From 5062e5b487f24d9e4f1339d4b3565bc1c5a3eb29 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:42:48 +0200 Subject: [PATCH 09/74] Text edit Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 24ad086..48d71de 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -80,4 +80,4 @@ The transformation supports the following HDF5-based input formats: - **H5AD (AnnData)** — the native format of the AnnData Python library, widely used in the single-cell ecosystem. - **10x Genomics H5** — converted internally to H5AD before processing, so the same extraction workflow is used regardless of the input format. -- **Legacy 10x Genomics H5 (v<3)** — supported provided the file contains a single genome. Multi-genome legacy files are not supported. +- **Legacy 10x Genomics H5 (v<3)** — supported only for files containing a single genome. Multi-genome legacy files are not supported. 
From 6ff62593607e8d3f8cea001cc90b26608dd2d67c Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:43:41 +0200 Subject: [PATCH 10/74] Include reference to single cell data user guide Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 48d71de..083788f 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -1,6 +1,6 @@ # Single-Cell HDF5 Transformations Overview -> **Related documentation:** For step-by-step guidance on running the transformation, see the [How-to Guides](how-to-sc-hdf5-transformations.md). For the full configuration schema, see the [Configuration Reference](configuration-reference.md). For the API endpoints, see the [API Reference](api-reference.md). For the internal processing pipeline, see the [Transformation Process Reference](transformation-process-reference.md). +> **Related documentation:** For step-by-step guidance on running the transformation, see the [How-to Guides](how-to-sc-hdf5-transformations.md). For the full configuration schema, see the [Configuration Reference](configuration-reference.md). For the API endpoints, see the [API Reference](api-reference.md). For the internal processing pipeline, see the [Transformation Process Reference](transformation-process-reference.md). For more information about single-cell data support in ODM, see [Working with Single Cell Data](single-cell.md). This transformation converts a single-cell HDF5 file into the ODM-compatible output files. It extracts expression data and related cell metadata, and can optionally harmonize metadata and create or update biosample objects in ODM.
The output files are then imported and linked automatically. From 76271be4ac2b2853be6a9b5db88579420d5d67a0 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:45:18 +0200 Subject: [PATCH 11/74] Reference linking group determination Co-authored-by: Maria Borodaenko --- .../about-sc-hdf5-transformations.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 083788f..9b434db 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -44,15 +44,14 @@ The transformation can aggregate these cell-level attributes to the biosample le Attributes exported to biosample metadata are automatically removed from the cell metadata, preventing duplication. If a biosample attribute should remain in the cell metadata for other reasons, it must be explicitly retained by omitting it from `cell_metadata.columns_to_drop`. -## The linking resolution rules +## Linking created objects -When the transformation uploads a Cell Group, it must link it to a parent SLP entity. The transformation resolves this target using a defined priority order, so that users do not always need to specify the target explicitly: +When the transformation uploads a Cell Group, it links it to a parent Sample, Library, or Preparation entity (SLP). -1. If the configuration creates new SLP groups, the Cell Group is linked to those newly created groups after they are uploaded. -2. If `cell_metadata.linking_group` explicitly names a target (a sample, library, or preparation accession), that target is used directly. -3. If no explicit target is given, the transformation auto-discovers the appropriate SLP groups for the study from ODM, checking first for Library groups, then Preparation groups, then Sample groups. 
The first entity type that has at least one group associated with the study is used, and all accessions of that type are linked. +This is usually handled automatically. If the transformation creates new SLP objects, the Cell Group is linked to them. Otherwise, the transformation identifies the most appropriate existing SLP target in ODM. Users can override the automatic behavior by specifying the target explicitly in the configuration. +For details, see [Linking group determination](transformation-process-reference.md#13-linking-group-determination). -This priority order reflects the typical ODM study structure: Library groups are usually the most specific and appropriate parent for a Cell Group. If a study only has Sample-level grouping, the transformation falls back gracefully. +The Expression Group created by the transformation is linked to the corresponding Cell Group. ## Dry run mode From 76bdaf16997ee032afd45c12afd7d1dcfeac1985 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:45:54 +0200 Subject: [PATCH 12/74] Rename section to reference specific API endpoints Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 9b434db..732c84d 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -61,7 +61,7 @@ Dry run mode is particularly useful for exploring which biosample-level attribut The recommended practice is to iterate on the configuration using repeated dry runs until all warnings are resolved before submitting a full transformation run.
-## The API layer: configurations, images, and jobs +## Processors Controller API: configurations, images, and jobs The transformation is triggered and managed through the ODM Processors Controller API. This API models the workflow as three separate concerns, each of which can be managed independently: From 5b4f6a1c896bd8bb04f18341ad7c7528aaa2a298 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:48:19 +0200 Subject: [PATCH 13/74] Rephrase metadata and expression extraction description Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 732c84d..9d5f8f2 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -20,14 +20,13 @@ The transformation creates the Cell Group and Expression Group and links them in ## What the transformation reads from the source file -The transformation extracts three distinct types of data from a single HDF5 source file: +The transformation extracts three types of data from an HDF5 source file: -**Cell metadata** comes primarily from the `obs` slot of an H5AD file (or its equivalent in a 10x H5 file). This includes per-cell annotations such as barcodes, cluster assignments, quality control metrics, and any experimental annotations attached to individual cells. Multidimensional representations (such as PCA or UMAP coordinates stored in `obsm`) and pairwise cell annotations (from `obsp`) can also be extracted. +**Cell metadata** — extracted primarily from `obs` in an H5AD input file, or the equivalent structure in a 10x H5 input file. This includes per-cell annotations such as barcodes, cluster assignments, quality control metrics, and any other experimental annotations.
Multidimensional representations stored in `obsm` (such as PCA or UMAP coordinates) and pairwise cell annotations from `obsp` can also be extracted. -**Feature metadata** comes from `var` (and optionally `varm` and `varp`). This includes per-gene annotations such as gene identifiers and names. The transformation can automatically map Ensembl or NCBI gene IDs to gene names for supported species, avoiding the need to pre-process gene annotation separately. - -**The expression matrix** is the count or normalized values matrix (`X`). The transformation validates its dimensions against the extracted cell and feature metadata, then writes it in a Brotli-compressed format optimised for ODM ingestion. +**Feature metadata** — extracted from `var`, and optionally from `varm` and `varp`. This includes per-gene annotations such as gene identifiers and gene names. For supported species, the transformation can also map Ensembl or NCBI gene identifiers to gene names automatically. +**The expression matrix** — extracted from `X`, which contains count or normalized expression values. The transformation validates the matrix dimensions against the extracted cell and feature metadata, then writes the matrix in a Brotli-compressed format optimized for ODM ingestion. ## The role of metadata curation Raw single-cell datasets frequently contain metadata that needs adjustment before it is useful in a cross-study context. Column names may differ between studies, values may be inconsistently coded, fields may be missing, or the biosample identifiers in the cell metadata may not match the naming conventions used by ODM's SLP entities. 
From 66dee86b0169c79a4886a6ac76f629b036d3a026 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 08:55:30 +0200 Subject: [PATCH 14/74] Modify SLP update description Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 9d5f8f2..e2465d4 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -37,11 +37,11 @@ This curation happens in-pipeline, which means the source file is never modified ## Biosample metadata and the aggregation model -A particularly important feature of the transformation is its ability to derive Sample, Library, or Preparation-level metadata directly from the cell metadata. In many single-cell datasets, attributes such as tissue type, disease condition, or donor information are stored as cell-level annotations (one value per cell), even though they logically belong at the biosample level. +Some single-cell datasets store tissue, disease, or other biosample-level attributes in cell metadata, repeating the same values for every cell. The transformation can aggregate these attributes into the related biosample objects in ODM: Sample, Library, or Preparation (SLP) objects. -The transformation can aggregate these cell-level attributes to the biosample level by grouping cells by a designated biosample identifier column. Only attributes that are constant across all cells belonging to the same biosample are considered eligible for export to SLP metadata. This ensures that the resulting biosample records are coherent and that no per-cell variation is incorrectly collapsed into a biosample-level value. +Aggregation is performed by grouping cells using a designated biosample identifier.
Only attributes that are consistent across all cells in the same biosample can be assigned to related biosample objects. -Attributes exported to biosample metadata are automatically removed from the cell metadata, preventing duplication. If a biosample attribute should remain in the cell metadata for other reasons, it must be explicitly retained by omitting it from `cell_metadata.columns_to_drop`. +Attributes assigned to biosample objects are automatically removed from the cell metadata. This reduces duplication and improves the overall structure of the imported data. If an attribute must remain in the cell metadata, it must be explicitly retained by omitting it from `cell_metadata.columns_to_drop`. ## Linking created objects From 489f0f8d622bb57a3f483c84bfa9b58d7bc81127 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Mon, 30 Mar 2026 08:59:13 +0200 Subject: [PATCH 15/74] Update SLP update behaviour --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index e2465d4..f45c749 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -41,7 +41,7 @@ Some single-cell datasets store tissue, disease, or other biosample-level attrib Aggregation is performed by grouping cells using a designated biosample identifier. Only attributes that are consistent across all cells in the same biosample can be assigned to related biosample objects. -Attributes assigned to biosample objects are automatically removed from the cell metadata. This reduces duplication and improves the overall structure of the imported data. If an attribute must remain in the cell metadata, it must be explicitly retained by omitting it from `cell_metadata.columns_to_drop`. 
+Attributes assigned to biosample objects are automatically removed from the cell metadata. This reduces duplication and improves the overall structure of the imported data. ## Linking created objects From 94effba8fa6b6e32c54988f63083ce227d23938b Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 09:02:40 +0200 Subject: [PATCH 16/74] Update metadata curation description Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index f45c749..2c90209 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -29,11 +29,11 @@ The transformation extracts three types of data from a HDF5 source file: **The expression matrix** — extracted from `X`, which contains count or normalized expression values. The transformation validates the matrix dimensions against the extracted cell and feature metadata, then writes the matrix in a Brotli-compressed format optimized for ODM ingestion. ## The role of metadata curation -Raw single-cell datasets frequently contain metadata that needs adjustment before it is useful in a cross-study context. Column names may differ between studies, values may be inconsistently coded, fields may be missing, or the biosample identifiers in the cell metadata may not match the naming conventions used by ODM's SLP entities. +Metadata curation is optional, but strongly recommended. It standardizes cell metadata so that it can be imported, linked, and indexed correctly in ODM. Certain fields must use the expected names and data types to ensure consistent linking and indexing. The transformation handles this for the user during processing. 
-The transformation addresses this through a set of configurable column operations applied during extraction. These include renaming columns, dropping irrelevant fields, filling missing values with defaults, and replacing specific values with standardized equivalents. Attribute names are also mapped to ODM standard names where applicable; non-standard names are automatically converted to camelCase to satisfy the ODM API requirements. +Curation also harmonizes metadata across datasets. This is essential for cross-study search and downstream analysis, because equivalent annotations must be represented consistently. This can include renaming attributes, replacing values with standardized terms, assigning default values, or dropping unnecessary columns. -This curation happens in-pipeline, which means the source file is never modified. The curated output exists only as intermediate files in a temporary directory and, ultimately, as the uploaded ODM objects. +Curation is applied only to the data produced by the transformation for import into ODM. The source file is not modified. 
## Biosample metadata and the aggregation model From ab066af09190dc98ca954889e0fb9eb8e1686b66 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Date: Mon, 30 Mar 2026 10:11:12 +0200 Subject: [PATCH 17/74] Update description of dry-run mode Co-authored-by: Maria Borodaenko --- .../doc-odm-user-guide/about-sc-hdf5-transformations.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 2c90209..6a53faa 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -54,11 +54,13 @@ The created Expression Group created by the transformation is linked to the corr ## Dry run mode -Before committing any data to ODM, users can run the transformation in dry run mode. In this mode, the transformation performs all extraction, curation, and validation steps — including resolving the linking target and validating that all cell batch identifiers match existing SLP objects — but uploads nothing. Logs are printed but not saved as attachments. +Dry run mode lets users validate the transformation setup before running a full import. In this mode, the transformation performs the initial processing steps, including reading the input, extracting metadata, applying curation, and running validation checks. It skips the most time-consuming output-generation steps, such as creating the expression matrix, and does not upload data to ODM. -Dry run mode is particularly useful for exploring which biosample-level attributes are available in a dataset before committing to a curation strategy. When `biosample_metadata` is configured without any `columns_to_export` entries, the dry run will log which columns are uniform per biosample and therefore eligible for export — without creating any files or objects. 
+Dry run mode is useful for checking that the configuration works as expected and that the required inputs, metadata mappings, and linkage settings are resolved correctly before a full run. -The recommended practice is to iterate on the configuration using repeated dry runs until all warnings are resolved before submitting a full transformation run. +When `biosample_metadata` is configured without any `columns_to_export` entries, dry run mode can also be used to inspect which attributes are uniform within each biosample and therefore eligible for assignment to biosample objects. + +The recommended approach is to iterate on the configuration using dry runs until warnings are resolved, and then run the full transformation. For details, see [How to iterate on a configuration using dry runs](how-to-sc-hdf5-transformations.md#how-to-iterate-on-a-configuration-using-dry-runs). ## Processors Controller API: configurations, images, and jobs From 214220d6a6ad7f83c86b532ae5f69f2d01c0f85d Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Mon, 30 Mar 2026 11:17:09 +0200 Subject: [PATCH 18/74] Update Processors controller API description --- .../about-sc-hdf5-transformations.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index 2c90209..8ee2296 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -62,15 +62,18 @@ The recommended practice is to iterate on the configuration using repeated dry r ## Processors Controller API: configurations, images, and jobs -The transformation is triggered and managed through the ODM Processors Controller API. This API models the workflow as three separate concerns, each of which can be managed independently: +The transformation is managed through the ODM Processors Controller API. 
It is based on three related components: configurations, images, and jobs. -**Transformation configurations** are stored JSON documents that describe how a source file should be processed: the input file format, which metadata to extract, and any curation rules to apply. Configurations are created, retrieved, and updated independently of any particular run. This separation means you can refine a configuration through many dry-run iterations without losing the history of changes, and reuse the same configuration across multiple runs or files. +**Transformation configurations** are JSON documents that define how input files should be processed, including the input format, metadata extraction, and curation rules. Configurations can be created, retrieved, and updated independently of any particular run. The same configuration can be reused across multiple files with the same structure. -**Transformation images** are versioned, containerized environments that execute the processing logic for a given file format. The image used for single-cell HDF5 files is called `hdf5-cells`. Specifying an image version (e.g. `"latest"` or a specific release tag) allows reproducibility and controlled upgrades when new versions are released. +**Transformation images** are versioned container images that run the processing logic. Available image versions can be queried through the API. The image used for single-cell HDF5 files is `hdf5-cells`. When starting a job, users can specify either `latest` or a specific release tag. -**Transformation jobs** are the actual execution records. A job binds a configuration and an image to one or more input file accessions, runs the processing pipeline, and produces a log. Each job is independent: you can re-run with a different configuration or image without affecting previous jobs or their results. +**Transformation jobs** are the execution records. 
A job combines a configuration, an image, and one or more input files, runs the transformation, and produces the output and logs. Jobs are independent, so the same input can be run again with a different configuration or image when needed. -This design allows the configuration to evolve (through iterations of the iterative dry-run cycle) while keeping the job history clean and auditable. +## Transformation logs + +Each transformation job produces a log that records the processing steps, warnings, detected issues, and created outputs. The log also includes provenance information, such as the source file name and accession, and the accessions of the created objects. +As part of the transformation, the log is uploaded to ODM and stored with the study as an attachment alongside the other generated files. This provides a persistent record of the transformation output. Logs are also available through the API for a limited time. By default, this retention period is two weeks. ## Supported input formats From 7d72262d426b390e0b395e35c3fd1a11c09396d0 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Mon, 30 Mar 2026 13:02:53 +0200 Subject: [PATCH 19/74] Create attribute mapping reference --- .../about-sc-hdf5-transformations.md | 7 +-- .../doc-odm-user-guide/attribute-mapping.md | 47 +++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 docs/user-guide/doc-odm-user-guide/attribute-mapping.md diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index e7d457a..f467dae 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -27,11 +27,12 @@ The transformation extracts three types of data from a HDF5 source file: **Feature metadata** — extracted from `var`, and optionally from `varm` and `varp`. 
This includes per-gene annotations such as gene identifiers and gene names. For supported species, the transformation can also map Ensembl or NCBI gene identifiers to gene names automatically. **The expression matrix** — extracted from `X`, which contains count or normalized expression values. The transformation validates the matrix dimensions against the extracted cell and feature metadata, then writes the matrix in a Brotli-compressed format optimized for ODM ingestion. + ## The role of metadata curation -Metadata curation is optional, but strongly recommended. It standardizes cell metadata so that it can be imported, linked, and indexed correctly in ODM. Certain fields must use the expected names and data types to ensure consistent linking and indexing. The transformation handles this for the user during processing. +Metadata curation is optional, but strongly recommended. It standardizes cell metadata so that it can be imported, linked, and indexed correctly in ODM. Certain fields must use the expected names and data types to ensure consistent linking and indexing. The transformation handles this for the user during processing. -Curation also harmonizes metadata across datasets. This is essential for cross-study search and downstream analysis, because equivalent annotations must be represented consistently. This can include renaming attributes, replacing values with standardized terms, assigning default values, or dropping unnecessary columns. +As part of curation, the transformation performs automatic attribute mapping: commonly used attribute names from tools such as Seurat, Scanpy, or Cell Ranger are recognized and renamed to the canonical ODM API names without any configuration. Automatic attribute mapping helps harmonize metadata across datasets, which is essential for cross-study search and downstream analysis. 
Attributes that do not match any known name are retained and their names are automatically converted to camelCase for consistency with the ODM naming convention. For the full list of recognized names, see the [Attribute Mapping Reference](attribute-mapping.md). Curation is applied only to the data produced by the transformation for import into ODM. The source file is not modified. @@ -81,6 +82,6 @@ As part of the transformation, the log is uploaded to ODM and stored with the st The transformation supports the following HDF5-based input formats: -- **H5AD (AnnData)** — the native format of the AnnData Python library, widely used in the single-cell ecosystem. +- **H5AD (AnnData)** — the native format of the AnnData Python library, widely used for single-cell data processing. - **10x Genomics H5** — converted internally to H5AD before processing, so the same extraction workflow is used regardless of the input format. - **Legacy 10x Genomics H5 (v<3)** — supported only for files containing a single genome. Multi-genome legacy files are not supported. diff --git a/docs/user-guide/doc-odm-user-guide/attribute-mapping.md b/docs/user-guide/doc-odm-user-guide/attribute-mapping.md new file mode 100644 index 0000000..ccbcb19 --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/attribute-mapping.md @@ -0,0 +1,47 @@ +# Attribute Mapping Reference + +During metadata curation, the transformation automatically maps commonly used attribute names found in source HDF5 files to the canonical ODM API names. This makes it possible to ingest data from a wide variety of tools and workflows — such as Seurat, Scanpy, or Cell Ranger — without requiring manual renaming of attributes before import. + +Mapping is applied separately to cell metadata and feature metadata. When an attribute in the source file matches one of the known alternative names listed below, it is renamed to the corresponding ODM API display name. Attributes that do not match any known name are converted to camelCase. 
+ +## Cell metadata attributes + +The table below lists the canonical ODM API name for each attribute alongside the alternative source names that are automatically recognized. + +| ODM API display name | Alternative names | +|---|---| +| cellID | — | +| barcode | — | +| batch | `sample_id`, `sample`, `run_id` | +| cellType | `cell_type`, `celltype`, `ident`, `labels` | +| cluster | `cluster_louvain`, `cluster_leiden`, `seurat_clusters` | +| nCounts | `n_counts`, `umi_count`, `nCount_RNA`, `total_umi`, `n_umi`, `n_reads`, `nUMI`, `UMI_count` | +| percentMito | `percent_mito`, `percent_mt`, `percent.mt`, `pct_mt`, `pct_mito`, `pct_counts_mito`, `percent.mito`, `percent.mito.raw`, `mito_ratio`, `pct_counts_mt` | +| umap | `X_umap`, `UMAP` | +| pca | `X_pca`, `PCA` | +| tsne | `X_tsne`, `tSNE` | +| pcaHarmony | `pca_harmony`, `X_harmony`, `harmony_embedding`, `X_pca_harmony` | +| nGenes | `n_genes`, `n_genes_by_counts`, `nGene`, `n_features`, `nFeature_RNA`, `genes_detected`, `detected_genes`, `gene_count`, `Total_Genes_Detected` | +| mitoCounts | `mito_counts`, `total_counts_mt`, `total_counts_mito`, `subsets_mt_sum`, `mt_sum`, `MT_sum` | +| riboCounts | `ribo_counts`, `total_counts_ribo`, `total_counts_rb`, `subsets_ribo_sum`, `rb_counts`, `rb_sum` | +| percentRibo | `percent_ribo`, `percent_rb`, `percent.rb`, `pct_counts_ribo`, `ribo_ratio`, `pct_ribo`, `pct_counts_rb`, `pct_counts_rrna` | +| percentHemoglobin | `percent_hb`, `pct_hb`, `hemoglobin_fraction`, `prop_hb`, `percent_hemoglobin` | +| doubletStatus | `doublet_status`, `is_doublet`, `predicted_doublet`, `multiplet_status` | +| doubletScore | `doublet_score`, `scrublet_score`, `doublet_probability`, `multiplet_score`, `doublet_stat` | +| sScore | `S_score`, `s.score`, `S.Score`, `s_phase_score`, `S_phase_probability` | +| g2mScore | `G2M_score`, `g2m.score`, `G2M.Score`, `g2m_phase_score`, `G2M_phase_probability` | +| CellCycle | `phase`, `cell_cycle_phase`, `cc_phase`, `cycle_stage` | +| ambientFraction | 
`ambient_fraction`, `decontX_score`, `rho`, `contamination_fraction`, `ambient_rna_percent`, `soup_fraction`, `soup_frac` | + +## Feature metadata attributes + +The table below lists the canonical ODM API name for each feature attribute alongside the alternative source names that are automatically recognized. + +| ODM API display name | Alternative names | +|---|---| +| geneId | `gene_id` (index), `gene_ids`, `ensembl_id`, `feature_id`, `stable_id`, `ENSEMBL` | +| gene | `symbol`, `symbols`, `gene_symbol`, `gene_symbols`, `feature_name`, `display_name`, `name`, `gene_name` | +| totalCounts | `total_counts`, `gene_total`, `sum_counts`, `count_sum`, `total_umis` | +| nCellsByCounts | `n_cells_by_counts`, `n_cells`, `num_cells`, `n_obs`, `num_cells_expressed` | +| meanCounts | `mean_counts`, `avg_exp`, `obs_mean`, `means` | +| pctDropoutByCounts | `pct_dropout_by_counts`, `pct_dropout`, `percent_dropout`, `dropout_rate` | From 56b7ca5039daa25ec5bff5b7a831f6c1edd7b5cc Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Mon, 30 Mar 2026 13:06:17 +0200 Subject: [PATCH 20/74] Select desired configuration reference option, remove unwanted files --- .../doc-odm-user-guide/config-ref-option-A.md | 156 ------ .../doc-odm-user-guide/config-ref-option-B.md | 150 ------ .../doc-odm-user-guide/config-ref-option-C.md | 282 ---------- .../configuration-reference.md | 507 +++--------------- 4 files changed, 79 insertions(+), 1016 deletions(-) delete mode 100644 docs/user-guide/doc-odm-user-guide/config-ref-option-A.md delete mode 100644 docs/user-guide/doc-odm-user-guide/config-ref-option-B.md delete mode 100644 docs/user-guide/doc-odm-user-guide/config-ref-option-C.md diff --git a/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md b/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md deleted file mode 100644 index d223bc8..0000000 --- a/docs/user-guide/doc-odm-user-guide/config-ref-option-A.md +++ /dev/null @@ -1,156 +0,0 @@ -# Configuration Reference: 
Single-Cell HDF5 Transformation - -> **Related documentation:** [About SC HDF5 Transformations](about-sc-hdf5-transformations.md) · [How-to Guides](how-to-sc-hdf5-transformations.md) · [API Reference](api-reference.md) · [Transformation Process Reference](transformation-process-reference.md) - -The configuration is validated at the start of every run. If `file_type` is missing or invalid, the pipeline raises an error immediately. All other validation errors are collected and reported together. Unrecognised keys are ignored with a warning. - ---- - -## Top-level parameters - -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `file_type` | `string` | **Yes** | — | Format of the input file. Accepted values: `"h5ad"`, `"h5"`. | -| `save_logs` | `boolean` | No | `true` | When `false`, logs are not saved as an attachment after the run. Has no effect when environment variable `dry_run` is `true`. | - ---- - -## `biosample_metadata` - -Settings for extracting, transforming, and exporting cell-level metadata to Sample, Library, or Preparation entities. The entire section is optional. - -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `metadata_keys` | `dict[string, string]` | Yes | — | Maps HDF5 group keys to metadata types. Use `"obs": "metadata"` to read standard cell metadata. | -| `biosample_column_name` | `string` | Yes | — | Column identifying which biosample each cell belongs to. Rows are grouped by this column for aggregation. | - -**`metadata_keys` example:** -```json -{ "obs": "metadata" } -``` - -### `biosample_metadata.sample` - -Settings for exporting metadata to the Sample entity. Optional. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `create_new_group` | `boolean` | `false` | When `true`, creates a new Sample group in ODM and links it to the study. 
| -| `template_id` | `string` | — | Template ID for the new Sample group. Falls back to the study default if omitted. | -| `columns_to_export` | `list[string]` | — | Cell metadata columns to include in the exported Sample metadata. Only columns constant per biosample are eligible; exported columns are dropped from cell metadata. | -| `columns_renaming_map` | `dict[string, string]` | — | Maps source column names to new names in the exported metadata. | -| `columns_to_fill_missing_values` | `dict[string, string]` | — | Default values for missing entries in specified columns. | -| `columns_to_curate_values` | `dict[string, dict[string, string]]` | — | Maps specific values in a column to replacement values. | - -**Examples:** -```json -{ "columns_renaming_map": { "tissue_type": "tissueType" } } -{ "columns_to_fill_missing_values": { "disease": "unknown" } } -{ "columns_to_curate_values": { "tissue": { "PBMCs": "peripheral blood mononuclear cells" } } } -``` - -### `biosample_metadata.library` - -Accepts the same parameters as `biosample_metadata.sample`, plus: - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `linking_group` | `string` | — | Accession of an existing Sample group to link the new Library group to. If omitted, the pipeline uses a Sample group from the same run or pre-fetched accessions. | - -### `biosample_metadata.preparation` - -Accepts the same parameters as `biosample_metadata.library`, including `linking_group`. - -> **Constraint:** Only one of `library` or `preparation` may have `columns_to_export` set in the same configuration. - ---- - -## `cell_metadata` - -Settings for extracting and transforming cell-level metadata. Optional. If absent, no Cell Group is created. - -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `metadata_keys` | `dict[string, string]` | Yes | — | Maps HDF5 group keys to metadata types. 
At least one key with value `"metadata"` is required. | -| `linking_group` | `dict[string, string \| list[string] \| null]` | No | — | Specifies the parent SLP entity (sample/library/preparation) to link the Cell Group to. Empty value triggers auto-discovery of all available accessions. | -| `columns_to_drop` | `list[string]` | No | — | Column names to remove before processing. | -| `columns_renaming_map` | `dict[string, string]` | No | — | Maps source column names to new names. | -| `columns_to_fill_missing_values` | `dict[string, string]` | No | — | Default values for missing entries. | -| `columns_to_curate_values` | `dict[string, dict[string, string]]` | No | — | Replacement values for specific entries in specified columns. | -| `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows. Can add new columns or overwrite existing ones. | -| `columns_to_preserve_name` | `list[string]` | No | — | Columns to exempt from internal name standardisation (e.g. Leiden cluster columns with decimal suffixes). | -| `add_qc_metrics` | `boolean` | No | `true` | When `true`, adds QC metrics (counts, genes, mitochondrial/ribosomal presence) if not already present. Skipped when environment variable `dry_run` is `true`. | - -**`metadata_keys` accepted values (H5AD):** - -| Key | Value | Description | -|-----|-------|-------------| -| `obs` | `metadata` | Standard cell annotations | -| `obsm` | `embedding` | Multidimensional cell data (PCA, UMAP, etc.) | -| `obsp` | `pairwise` | Pairwise cell annotations | - -For H5 files, use the same H5AD key names — the transformation maps them to the correct internal structure. 
- -**Examples:** -```json -{ "metadata_keys": { "obs": "metadata", "obsm": "embedding" } } -{ "linking_group": { "library": "GSF017080" } } -{ "columns_to_drop": ["taxon", "organism_id"] } -{ "columns_renaming_map": { "sample": "batch", "pctmt": "percentMito" } } -{ "set_column_value": { "sample_id": "lung_1" } } -{ "columns_to_preserve_name": ["cluster_leiden_0.5"] } -``` - ---- - -## `feature_metadata` - -Settings for extracting and transforming feature (gene)-level metadata. Optional. - -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `metadata_keys` | `dict[string, string]` | Yes | — | Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. | -| `columns_to_drop` | `list[string]` | No | — | Column names to remove from feature metadata. | -| `columns_renaming_map` | `dict[string, string]` | No | — | Maps source column names to new names. | -| `columns_to_fill_missing_values` | `dict[string, string]` | No | — | Default values for missing entries. | -| `columns_to_curate_values` | `dict[string, dict[string, string]]` | No | — | Replacement values for specific entries. | -| `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows. | -| `columns_to_preserve_name` | `list[string]` | No | — | Columns to exempt from internal name standardisation. | -| `map_gene_ids_to_names` | `boolean` | No | `true` | When `true`, maps gene IDs to gene names if names are absent and `geneId` column is present. Set to `false` for proteomics or non-gene-ID data. | - -**`metadata_keys` accepted values (H5AD):** - -| Key | Value | Description | -|-----|-------|-------------| -| `var` | `metadata` | Standard feature annotations | -| `varm` | `embedding` | Multidimensional feature data | -| `varp` | `pairwise` | Pairwise feature annotations | - -> The gene ID column must be named `geneId` for mapping to be performed. 
- -**Supported organisms (`map_gene_ids_to_names`) — hdf5-cells v0.0.4:** - -| Organism | Genome version | Ensembl release | NCBI release | -|----------|---------------|-----------------|--------------| -| *Homo sapiens* | GRCh38.p14 | 115 | GCF_000001405.40-RS_2025_08 | -| *Mus musculus* | GRCm39 | 115 | GCF_000001635.27-RS_2024_02 | -| *Rattus norvegicus* | GRCr8 | 115 | GCF_036323735.1-RS_2024_02 | -| *Sus scrofa* | Sscrofa11.1 | 115 | 106 | - ---- - -## `cell_expression` - -Settings for extracting and uploading the cell expression matrix. Optional. If absent, no Expression Group is created. - -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `data_class` | `string` | **Yes** | — | Data class label for the expression data (e.g. `"Single-cell transcriptomics"`). | -| `compression_level` | `integer` (0–9) | No | `4` | Brotli compression level. Higher values produce smaller files at the cost of longer compression time. | -| `chunk_size` | `integer` | No | auto | Number of features processed per chunk. Calculated automatically from available memory if omitted. | -| `max_buffer_size` | `integer` | No | `50` | Amount of data held in memory before being flushed to disk during writing. | -| `number_format` | `string` | No | inferred | Numeric precision of output values. Accepts printf-style (`"%.7g"`, `"%d"`) or NumPy dtype (`"float32"`, `"int64"`). | -| `columns_to_drop` | `list[string]` | No | — | Column names to remove from expression metadata. | -| `columns_renaming_map` | `dict[string, string]` | No | — | Maps source column names to new names. | -| `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows in specified columns. | -| `source_file_metadata` | `boolean` | No | `true` | When `true`, metadata from the source HDF5 attachment is read and included in expression metadata. Summary statistics (cell count, feature count, sparsity, etc.) 
are always appended regardless of this flag. | diff --git a/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md b/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md deleted file mode 100644 index bec7278..0000000 --- a/docs/user-guide/doc-odm-user-guide/config-ref-option-B.md +++ /dev/null @@ -1,150 +0,0 @@ -# Configuration Reference: Single-Cell HDF5 Transformation - -> **Related documentation:** [About SC HDF5 Transformations](about-sc-hdf5-transformations.md) · [How-to Guides](how-to-sc-hdf5-transformations.md) · [API Reference](api-reference.md) · [Transformation Process Reference](transformation-process-reference.md) - -The configuration file controls how the transformation reads, processes, and indexes cell and feature metadata from HDF5 files. Parameters are organised by top-level section. The configuration is validated at run start; `file_type` errors fail immediately, all others are reported together at the end of validation. Unrecognised keys are ignored with a warning. - ---- - -## Top-level parameters - -**`file_type`** *(string, mandatory)* — Format of the input HDF5 file. Accepted values: `"h5ad"`, `"h5"`. - -**`save_logs`** *(boolean, optional, default: `true`)* — When `false`, logs are not saved as an attachment after the run. Has no effect when environment variable `dry_run` is `true`. - ---- - -## `biosample_metadata` - -Settings for extracting, transforming, and exporting cell-level metadata to Sample, Library, or Preparation entities. The entire section is optional. - -**`metadata_keys`** *(dict[string, string], mandatory if section is present)* — Maps HDF5 group keys to metadata types. Use `"obs": "metadata"` to read standard cell metadata as the source for biosample-level aggregation. Example: -```json -{ "obs": "metadata" } -``` - -**`biosample_column_name`** *(string, mandatory if section is present)* — Column identifying which biosample each cell belongs to (e.g. a sample ID or library ID column). 
Rows are grouped by this column for aggregation. - -### sample - -Settings for exporting metadata to the Sample entity. Optional. - -**`create_new_group`** *(boolean, optional)* — When `true`, creates a new Sample group in ODM and links it to the study. When omitted or `false`, existing Sample group objects are updated instead. - -**`template_id`** *(string, optional)* — Template ID for the new Sample group. Falls back to the study default if omitted. - -**`columns_to_export`** *(list[string], optional)* — Cell metadata columns to include in the exported Sample metadata. Only columns constant per biosample are eligible; exported columns are automatically dropped from cell metadata. - -**`columns_renaming_map`** *(dict[string, string], optional)* — Maps source column names to new names. Example: `{ "tissue_type": "tissueType" }` - -**`columns_to_fill_missing_values`** *(dict[string, string], optional)* — Default values for missing entries. Example: `{ "disease": "unknown" }` - -**`columns_to_curate_values`** *(dict[string, dict[string, string]], optional)* — Replacement values for specific entries in specified columns. Example: -```json -{ "tissue": { "PBMCs": "peripheral blood mononuclear cells" } } -``` - -### library - -Accepts the same parameters as `sample`, plus: - -**`linking_group`** *(string, optional)* — Accession of an existing Sample group to link the new Library group to. If omitted, the pipeline uses a Sample group created in the same run, or pre-fetched accessions for the study. - -### preparation - -Accepts the same parameters as `library`, including `linking_group`. - -> **Constraint:** Only one of `library` or `preparation` may have `columns_to_export` set in the same configuration. - ---- - -## `cell_metadata` - -Settings for extracting and transforming cell-level metadata. Optional. If absent, no Cell Group is created. - -**`metadata_keys`** *(dict[string, string], mandatory if section is present)* — Maps HDF5 group keys to metadata types. 
At least one key with value `"metadata"` is required. Accepted values for H5AD files: - -- `"obs": "metadata"` — Standard cell annotations -- `"obsm": "embedding"` — Multidimensional cell data (PCA, UMAP, etc.) -- `"obsp": "pairwise"` — Pairwise cell annotations (e.g. cell–cell distances) - -For H5 files, use the same H5AD key names — the transformation maps them to the correct internal structure regardless of source format. Example: -```json -{ "obs": "metadata", "obsm": "embedding", "obsp": "pairwise" } -``` - -**`linking_group`** *(dict[string, string | list[string] | null], optional)* — Specifies the parent SLP entity to link the Cell Group to. Must contain exactly one key: `sample`, `library`, or `preparation`. An empty value (`[]`, `""`, or `null`) triggers auto-discovery of all available accessions of that type. If the parameter is absent entirely and no new SLP groups are being created, auto-discovery applies in this order: Library → Preparation → Sample. Examples: -```json -{ "library": "GSF017080" } -{ "preparation": [] } -``` - -**`columns_to_drop`** *(list[string], optional)* — Column names to remove before processing. Example: `["taxon", "organism_id"]` - -**`columns_renaming_map`** *(dict[string, string], optional)* — Maps source column names to new names. Example: `{ "sample": "batch", "pctmt": "percentMito" }` - -**`columns_to_fill_missing_values`** *(dict[string, string], optional)* — Default values for missing entries. Example: `{ "batch": "unknown" }` - -**`columns_to_curate_values`** *(dict[string, dict[string, string]], optional)* — Replacement values for specific entries. Example: `{ "sample": { "LGVXCTRL1": "lung_healthy_1" } }` - -**`set_column_value`** *(dict[string, string], optional)* — Sets a constant value for all rows. Can add new attribute columns or overwrite existing ones. Example: `{ "sample_id": "lung_1" }` - -**`columns_to_preserve_name`** *(list[string], optional)* — Columns to exempt from internal name standardisation. 
Use for columns whose names contain characters that would otherwise be altered (e.g. Leiden cluster columns with decimal suffixes such as `cluster_leiden_0.5`). - -**`add_qc_metrics`** *(boolean, optional, default: `true`)* — When `true`, QC metrics are calculated and added to cell metadata if not already present (counts, genes, mitochondrial and ribosomal gene presence). Skipped when environment variable `dry_run` is `true`. - ---- - -## `feature_metadata` - -Settings for extracting and transforming feature (gene)-level metadata. Optional. - -**`metadata_keys`** *(dict[string, string], mandatory if section is present)* — Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. Accepted values for H5AD files: - -- `"var": "metadata"` — Standard feature annotations -- `"varm": "embedding"` — Multidimensional feature data -- `"varp": "pairwise"` — Pairwise feature annotations - -For H5 files, use the same H5AD key names. Example: `{ "var": "metadata", "varm": "embedding" }` - -**`columns_to_drop`** *(list[string], optional)* — Column names to remove from feature metadata. - -**`columns_renaming_map`** *(dict[string, string], optional)* — Maps source column names to new names. - -**`columns_to_fill_missing_values`** *(dict[string, string], optional)* — Default values for missing entries. - -**`columns_to_curate_values`** *(dict[string, dict[string, string]], optional)* — Replacement values for specific entries. - -**`set_column_value`** *(dict[string, string], optional)* — Sets a constant value for all rows. - -**`columns_to_preserve_name`** *(list[string], optional)* — Columns to exempt from internal name standardisation. - -**`map_gene_ids_to_names`** *(boolean, optional, default: `true`)* — When `true`, attempts to map gene IDs to gene names if names are absent and the `geneId` column is present. The pipeline infers the ID source (Ensembl or NCBI) and species automatically. 
Set to `false` for proteomics or other omics data that do not use gene IDs as identifiers. - -> The gene ID column must be named `geneId` for mapping to be performed. - -Supported organisms (hdf5-cells v0.0.4): *Homo sapiens* (GRCh38.p14, Ensembl 115), *Mus musculus* (GRCm39, Ensembl 115), *Rattus norvegicus* (GRCr8, Ensembl 115), *Sus scrofa* (Sscrofa11.1, Ensembl 115). - ---- - -## `cell_expression` - -Settings for extracting and uploading the cell expression matrix. Optional. If absent, no Expression Group is created. - -**`data_class`** *(string, mandatory if section is present)* — Data class label for the expression data. Example: `"Single-cell transcriptomics"` - -**`compression_level`** *(integer 0–9, optional, default: `4`)* — Brotli compression level for the output file. Higher values produce smaller files at the cost of longer compression time. - -**`chunk_size`** *(integer, optional)* — Number of features processed per chunk during export. Calculated automatically from available container memory if omitted. - -**`max_buffer_size`** *(integer, optional, default: `50`)* — Amount of data held in memory before being flushed to disk during expression writing. - -**`number_format`** *(string, optional)* — Numeric precision of values in the output file. Accepts printf-style (`"%.7g"`, `"%d"`) or NumPy dtype (`"float32"`, `"int64"`). Inferred from the data if omitted. - -**`columns_to_drop`** *(list[string], optional)* — Column names to remove from expression metadata. - -**`columns_renaming_map`** *(dict[string, string], optional)* — Maps source column names to new names in expression metadata. - -**`set_column_value`** *(dict[string, string], optional)* — Sets a constant value for all rows in specified expression metadata columns. 
- -**`source_file_metadata`** *(boolean, optional, default: `true`)* — When `true`, metadata from the source HDF5 attachment is read and included in expression metadata (subject to `columns_to_drop`, `columns_renaming_map`, and `set_column_value`). When `false`, source file metadata extraction is skipped. In all cases, summary statistics are always appended (total cells, total features, sparsity %, non-zero values, source file accession and name). diff --git a/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md b/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md deleted file mode 100644 index 9af45b9..0000000 --- a/docs/user-guide/doc-odm-user-guide/config-ref-option-C.md +++ /dev/null @@ -1,282 +0,0 @@ -# Configuration Reference: Single-Cell HDF5 Transformation - -> **Related documentation:** [About SC HDF5 Transformations](about-sc-hdf5-transformations.md) · [How-to Guides](how-to-sc-hdf5-transformations.md) · [API Reference](api-reference.md) · [Transformation Process Reference](transformation-process-reference.md) - ---- - -## Quick reference - -| Section | Parameter | Type | Required | Default | -|---------|-----------|------|----------|---------| -| *(top-level)* | `file_type` | string | **Yes** | — | -| *(top-level)* | `save_logs` | boolean | No | `true` | -| `biosample_metadata` | `metadata_keys` | dict[str, str] | Yes* | — | -| `biosample_metadata` | `biosample_column_name` | string | Yes* | — | -| `biosample_metadata.sample` | `create_new_group` | boolean | No | `false` | -| `biosample_metadata.sample` | `template_id` | string | No | — | -| `biosample_metadata.sample` | `columns_to_export` | list[string] | No | — | -| `biosample_metadata.sample` | `columns_renaming_map` | dict[str, str] | No | — | -| `biosample_metadata.sample` | `columns_to_fill_missing_values` | dict[str, str] | No | — | -| `biosample_metadata.sample` | `columns_to_curate_values` | dict[str, dict[str, str]] | No | — | -| `biosample_metadata.library` | *(same as sample)* + 
`linking_group` | string | No | — | -| `biosample_metadata.preparation` | *(same as library)* | — | — | — | -| `cell_metadata` | `metadata_keys` | dict[str, str] | Yes* | — | -| `cell_metadata` | `linking_group` | dict[str, …] | No | auto | -| `cell_metadata` | `columns_to_drop` | list[string] | No | — | -| `cell_metadata` | `columns_renaming_map` | dict[str, str] | No | — | -| `cell_metadata` | `columns_to_fill_missing_values` | dict[str, str] | No | — | -| `cell_metadata` | `columns_to_curate_values` | dict[str, dict[str, str]] | No | — | -| `cell_metadata` | `set_column_value` | dict[str, str] | No | — | -| `cell_metadata` | `columns_to_preserve_name` | list[string] | No | — | -| `cell_metadata` | `add_qc_metrics` | boolean | No | `true` | -| `feature_metadata` | `metadata_keys` | dict[str, str] | Yes* | — | -| `feature_metadata` | `columns_to_drop` | list[string] | No | — | -| `feature_metadata` | `columns_renaming_map` | dict[str, str] | No | — | -| `feature_metadata` | `columns_to_fill_missing_values` | dict[str, str] | No | — | -| `feature_metadata` | `columns_to_curate_values` | dict[str, dict[str, str]] | No | — | -| `feature_metadata` | `set_column_value` | dict[str, str] | No | — | -| `feature_metadata` | `columns_to_preserve_name` | list[string] | No | — | -| `feature_metadata` | `map_gene_ids_to_names` | boolean | No | `true` | -| `cell_expression` | `data_class` | string | **Yes*** | — | -| `cell_expression` | `compression_level` | integer (0–9) | No | `4` | -| `cell_expression` | `chunk_size` | integer | No | auto | -| `cell_expression` | `max_buffer_size` | integer | No | `50` | -| `cell_expression` | `number_format` | string | No | inferred | -| `cell_expression` | `columns_to_drop` | list[string] | No | — | -| `cell_expression` | `columns_renaming_map` | dict[str, str] | No | — | -| `cell_expression` | `set_column_value` | dict[str, str] | No | — | -| `cell_expression` | `source_file_metadata` | boolean | No | `true` | - -*Yes* = required only if 
the parent section is present. - ---- - -## Parameter details - -### Top-level - -#### `file_type` -Format of the input HDF5 file. Must be provided; the pipeline cannot proceed without a valid file type. -- Accepted values: `"h5ad"`, `"h5"` - -#### `save_logs` -When `false`, logs are not saved as an attachment after the run. Has no effect when environment variable `dry_run` is `true`. - ---- - -### `biosample_metadata` - -Settings for extracting, transforming, and exporting cell-level metadata to Sample, Library, or Preparation entities. The entire section is optional. - -#### `metadata_keys` -Maps HDF5 group keys to metadata types. Use `"obs": "metadata"` to read standard cell metadata as the source for biosample-level aggregation. - -```json -{ "obs": "metadata" } -``` - -#### `biosample_column_name` -Column identifying which biosample each cell belongs to (e.g. a sample ID or library ID column). Rows are grouped by this column for aggregation. - ---- - -#### `biosample_metadata.sample` - -#### `create_new_group` -When `true`, creates a new Sample group in ODM and links it to the study. When `false` or omitted, existing Sample group objects are updated instead. - -#### `template_id` -Template ID for the new Sample group. Falls back to the study default if omitted. - -#### `columns_to_export` -Cell metadata columns to include in the exported Sample metadata. Only columns that are constant per biosample are eligible; exported columns are automatically dropped from cell metadata. - -#### `columns_renaming_map` -Maps source column names to new names. -```json -{ "tissue_type": "tissueType" } -``` - -#### `columns_to_fill_missing_values` -Default values for missing entries. -```json -{ "disease": "unknown" } -``` - -#### `columns_to_curate_values` -Replacement values for specific entries in specified columns. 
-```json -{ "tissue": { "PBMCs": "peripheral blood mononuclear cells" } } -``` - ---- - -#### `biosample_metadata.library` - -Accepts all parameters from `biosample_metadata.sample`, plus: - -#### `linking_group` -Accession of an existing Sample group to link the new Library group to. If omitted, the pipeline uses a Sample group created in the same run, or pre-fetched accessions for the study. - ---- - -#### `biosample_metadata.preparation` - -Accepts all parameters from `biosample_metadata.library`, including `linking_group`. - -> **Constraint:** Only one of `library` or `preparation` may have `columns_to_export` set in the same configuration. - ---- - -### `cell_metadata` - -Settings for extracting and transforming cell-level metadata. Optional. If absent, no Cell Group is created. - -#### `metadata_keys` -Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. - -| Key | Value | Description | -|-----|-------|-------------| -| `obs` | `metadata` | Standard cell annotations | -| `obsm` | `embedding` | Multidimensional cell data (PCA, UMAP, etc.) | -| `obsp` | `pairwise` | Pairwise cell annotations | - -For H5 files, use the same H5AD key names — the transformation maps them to the correct internal structure. - -```json -{ "obs": "metadata", "obsm": "embedding", "obsp": "pairwise" } -``` - -#### `linking_group` -Specifies the parent SLP entity (sample/library/preparation) to link the Cell Group to. Must contain exactly one key. An empty value triggers auto-discovery of all available accessions of that entity type. If absent entirely and no new SLP groups are being created, auto-discovery applies: Library → Preparation → Sample. - -```json -{ "library": "GSF017080" } -{ "preparation": [] } -``` - -#### `columns_to_drop` -Column names to remove before processing. -```json -["taxon", "organism_id"] -``` - -#### `columns_renaming_map` -Maps source column names to new names. 
-```json -{ "sample": "batch", "pctmt": "percentMito" } -``` - -#### `columns_to_fill_missing_values` -Default values for missing entries. -```json -{ "batch": "unknown" } -``` - -#### `columns_to_curate_values` -Replacement values for specific entries in specified columns. -```json -{ "sample": { "LGVXCTRL1": "lung_healthy_1" } } -``` - -#### `set_column_value` -Sets a constant value for all rows. Can add new attribute columns or overwrite existing ones. -```json -{ "sample_id": "lung_1" } -``` - -#### `columns_to_preserve_name` -Columns to exempt from internal name standardisation. Use for column names that contain characters that would otherwise be altered (e.g. Leiden cluster columns: `cluster_leiden_0.5`). -```json -["cluster_leiden_0.5"] -``` - -#### `add_qc_metrics` -When `true`, QC metrics are calculated and added to cell metadata if not already present: number of counts, number of genes, mitochondrial and ribosomal gene presence. Skipped when environment variable `dry_run` is `true`. - ---- - -### `feature_metadata` - -Settings for extracting and transforming feature (gene)-level metadata. Optional. - -#### `metadata_keys` -Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. - -| Key | Value | Description | -|-----|-------|-------------| -| `var` | `metadata` | Standard feature annotations | -| `varm` | `embedding` | Multidimensional feature data | -| `varp` | `pairwise` | Pairwise feature annotations | - -For H5 files, use the same H5AD key names. - -#### `columns_to_drop` -Column names to remove from feature metadata. - -#### `columns_renaming_map` -Maps source column names to new names. - -#### `columns_to_fill_missing_values` -Default values for missing entries. - -#### `columns_to_curate_values` -Replacement values for specific entries. - -#### `set_column_value` -Sets a constant value for all rows. - -#### `columns_to_preserve_name` -Columns to exempt from internal name standardisation. 
- -#### `map_gene_ids_to_names` -When `true`, attempts to map gene IDs to gene names if names are absent and the `geneId` column is present. The pipeline infers the ID source (Ensembl or NCBI) and species automatically. Set to `false` for proteomics or other omics data that do not use gene IDs as identifiers. - -> The gene ID column must be named `geneId` for mapping to be performed. - -Supported organisms (hdf5-cells v0.0.4): - -| Organism | Genome version | Ensembl release | NCBI release | -|----------|---------------|-----------------|--------------| -| *Homo sapiens* | GRCh38.p14 | 115 | GCF_000001405.40-RS_2025_08 | -| *Mus musculus* | GRCm39 | 115 | GCF_000001635.27-RS_2024_02 | -| *Rattus norvegicus* | GRCr8 | 115 | GCF_036323735.1-RS_2024_02 | -| *Sus scrofa* | Sscrofa11.1 | 115 | 106 | - ---- - -### `cell_expression` - -Settings for extracting and uploading the cell expression matrix. Optional. If absent, no Expression Group is created. - -#### `data_class` -Data class label for the expression data. -```json -"Single-cell transcriptomics" -``` - -#### `compression_level` -Brotli compression level for the output file. Higher values produce smaller files at the cost of longer compression time. Range: 0–9. - -#### `chunk_size` -Number of features processed per chunk during export. Calculated automatically from available container memory if omitted. - -#### `max_buffer_size` -Amount of data held in memory before being flushed to disk during expression writing. - -#### `number_format` -Numeric precision of values in the output file. Accepts printf-style (`"%.7g"`, `"%d"`) or NumPy dtype (`"float32"`, `"int64"`). Inferred from the data if omitted. - -#### `columns_to_drop` -Column names to remove from expression metadata. - -#### `columns_renaming_map` -Maps source column names to new names in expression metadata. - -#### `set_column_value` -Sets a constant value for all rows in specified expression metadata columns. 
- -#### `source_file_metadata` -When `true`, metadata from the source HDF5 attachment is read and included in expression metadata (subject to `columns_to_drop`, `columns_renaming_map`, and `set_column_value`). When `false`, source file metadata extraction is skipped. - -Summary statistics are always appended regardless of this flag: total cells, total features, sparsity (%), non-zero values, source file accession, and source file name. diff --git a/docs/user-guide/doc-odm-user-guide/configuration-reference.md b/docs/user-guide/doc-odm-user-guide/configuration-reference.md index e9e9991..d223bc8 100644 --- a/docs/user-guide/doc-odm-user-guide/configuration-reference.md +++ b/docs/user-guide/doc-odm-user-guide/configuration-reference.md @@ -1,164 +1,65 @@ # Configuration Reference: Single-Cell HDF5 Transformation -> **Related documentation:** For conceptual background, see [About Single-Cell HDF5 Transformations in ODM](about-sc-hdf5-transformations.md). For step-by-step usage, see the [How-to Guides](how-to-sc-hdf5-transformations.md). For the API endpoints used to store and submit configurations, see the [API Reference](api-reference.md). For the internal processing pipeline, see the [Transformation Process Reference](transformation-process-reference.md). +> **Related documentation:** [About SC HDF5 Transformations](about-sc-hdf5-transformations.md) · [How-to Guides](how-to-sc-hdf5-transformations.md) · [API Reference](api-reference.md) · [Transformation Process Reference](transformation-process-reference.md) -This reference describes all parameters accepted by the single-cell HDF5 transformation configuration file. Parameters are organized by top-level section. Required parameters are marked **mandatory**; all others are optional. - -The configuration is validated at the start of every run. If `file_type` is missing or invalid, the pipeline raises an error immediately. 
For all other sections, all validation errors are collected and reported together at the end of validation, so the complete set of issues is visible in a single run. - -Unrecognized keys are ignored with a warning logged. +The configuration is validated at the start of every run. If `file_type` is missing or invalid, the pipeline raises an error immediately. All other validation errors are collected and reported together. Unrecognised keys are ignored with a warning. --- ## Top-level parameters -### `file_type` - -| | | -|---|---| -| **Type** | `string` | -| **Required** | Yes | -| **Accepted values** | `"h5ad"`, `"h5"` | - -Specifies the format of the input HDF5 file. Must be provided; the pipeline cannot proceed without a valid file type. - - -### `save_logs` - -| | | -|---|---| -| **Type** | `boolean` | -| **Default** | `true` | - -When `false`, transformation logs are not saved as an attachment in ODM after the run completes. Has no effect when environment variable `dry_run` is `true`. +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `file_type` | `string` | **Yes** | — | Format of the input file. Accepted values: `"h5ad"`, `"h5"`. | +| `save_logs` | `boolean` | No | `true` | When `false`, logs are not saved as an attachment after the run. Has no effect when environment variable `dry_run` is `true`. | --- ## `biosample_metadata` -Settings for extracting, transforming, and exporting cell-level metadata to Sample, Library, or Preparation entities. The entire section is optional. If present, the following parameters apply. - -### `metadata_keys` - -| | | -|---|---| -| **Type** | `dict[string, string]` | -| **Required** | Yes, if `biosample_metadata` is present | +Settings for extracting, transforming, and exporting cell-level metadata to Sample, Library, or Preparation entities. The entire section is optional. -Maps HDF5 group keys to metadata types. 
Use `"obs": "metadata"` to read standard cell metadata as the source for biosample-level aggregation. +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `metadata_keys` | `dict[string, string]` | Yes, if `biosample_metadata` is present | — | Maps HDF5 group keys to metadata types. Use `"obs": "metadata"` to read standard cell metadata. | +| `biosample_column_name` | `string` | Yes, if `biosample_metadata` is present | — | Column identifying which biosample each cell belongs to. Rows are grouped by this column for aggregation. | -Example: +**`metadata_keys` example:** ```json -{ - "obs": "metadata" -} +{ "obs": "metadata" } ``` -### `biosample_column_name` - -| | | -|---|---| -| **Type** | `string` | -| **Required** | Yes, if `biosample_metadata` is present | - -The name of the column in the cell metadata that identifies which biosample each cell belongs to (for example, a sample ID, library ID, or preparation ID column). Rows are grouped by this column for biosample-level aggregation. - ---- - ### `biosample_metadata.sample` Settings for exporting metadata to the Sample entity. Optional. -#### `create_new_group` - -| | | -|---|---| -| **Type** | `boolean` | - -When `true`, the transformation creates a new Sample group in ODM and links it to the study. When omitted or `false`, the transformation updates existing Sample group objects instead of creating new ones. - -#### `template_id` - -| | | -|---|---| -| **Type** | `string` | - -The template ID to apply when creating a new Sample group. If not specified, the study's default template is applied. +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `create_new_group` | `boolean` | `false` | When `true`, creates a new Sample group in ODM and links it to the study. When omitted or `false`, existing Sample group objects are updated instead. | +| `template_id` | `string` | — | Template ID for the new Sample group. Falls back to the study default if omitted. 
| +| `columns_to_export` | `list[string]` | — | Cell metadata columns to include in the exported Sample metadata. Only columns constant per biosample are eligible; exported columns are dropped from cell metadata. | +| `columns_renaming_map` | `dict[string, string]` | — | Maps source column names to new names in the exported metadata. | +| `columns_to_fill_missing_values` | `dict[string, string]` | — | Default values for missing entries in specified columns. | +| `columns_to_curate_values` | `dict[string, dict[string, string]]` | — | Maps specific values in a column to replacement values. | -#### `columns_to_export` - -| | | -|---|---| -| **Type** | `list[string]` | - -List of cell metadata column names to include in the exported Sample metadata. Only columns that are constant per biosample (as identified by `biosample_column_name`) are eligible. Exported columns are automatically dropped from the cell metadata. - -#### `columns_renaming_map` - -| | | -|---|---| -| **Type** | `dict[string, string]` | - -Maps source column names to new names in the exported metadata. - -Example: +**Examples:** ```json -{ - "tissue_type": "tissueType" -} +{ "columns_renaming_map": { "tissue_type": "tissueType" } } +{ "columns_to_fill_missing_values": { "disease": "unknown" } } +{ "columns_to_curate_values": { "tissue": { "PBMCs": "peripheral blood mononuclear cells" } } } ``` -#### `columns_to_fill_missing_values` - -| | | -|---|---| -| **Type** | `dict[string, string]` | - -Specifies default values to use for missing entries in the listed columns. - -Example: -```json -{ - "disease": "unknown" -} -``` - -#### `columns_to_curate_values` - -| | | -|---|---| -| **Type** | `dict[string, dict[string, string]]` | - -Maps specific values in a column to replacement values. - -Example: -```json -{ - "tissue": { - "PBMCs": "peripheral blood mononuclear cells" - } -} -``` - ---- - ### `biosample_metadata.library` -Settings for exporting metadata to the Library entity. Optional. 
Accepts the same parameters as `biosample_metadata.sample`, plus: - -#### `linking_group` - -| | | -|---|---| -| **Type** | `string` | +Accepts the same parameters as `biosample_metadata.sample`, plus: -Accession of an existing Sample group to link the new Library group to. If not specified, the pipeline uses: (1) a Sample group created in the same run, or (2) pre-fetched Sample group accessions for the study. - ---- +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `linking_group` | `string` | — | Accession of an existing Sample group to link the new Library group to. If omitted, the pipeline uses a Sample group from the same run or pre-fetched accessions. | ### `biosample_metadata.preparation` -Settings for exporting metadata to the Preparation entity. Optional. Accepts the same parameters as `biosample_metadata.library`, including `linking_group`. +Accepts the same parameters as `biosample_metadata.library`, including `linking_group`. > **Constraint:** Only one of `library` or `preparation` may have `columns_to_export` set in the same configuration. @@ -166,170 +67,58 @@ Settings for exporting metadata to the Preparation entity. Optional. Accepts the ## `cell_metadata` -Settings for extracting and transforming cell-level metadata. The entire section is optional. If absent, no Cell Group is created. +Settings for extracting and transforming cell-level metadata. Optional. If absent, no Cell Group is created. -### `metadata_keys` +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `metadata_keys` | `dict[string, string]` | Yes, if `cell_metadata` is present | — | Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. | +| `linking_group` | `dict[string, string \| list[string] \| null]` | No | — | Specifies the parent SLP entity (sample/library/preparation) to link the Cell Group to. Empty value triggers auto-discovery of all available accessions. 
| +| `columns_to_drop` | `list[string]` | No | — | Column names to remove before processing. | +| `columns_renaming_map` | `dict[string, string]` | No | — | Maps source column names to new names. | +| `columns_to_fill_missing_values` | `dict[string, string]` | No | — | Default values for missing entries. | +| `columns_to_curate_values` | `dict[string, dict[string, string]]` | No | — | Replacement values for specific entries in specified columns. | +| `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows. Can add new columns or overwrite existing ones. | +| `columns_to_preserve_name` | `list[string]` | No | — | Columns to exempt from internal name standardisation (e.g. Leiden cluster columns with decimal suffixes). | +| `add_qc_metrics` | `boolean` | No | `true` | When `true`, adds QC metrics (counts, genes, mitochondrial/ribosomal presence) if not already present. Skipped when environment variable `dry_run` is `true`. | -| | | -|---|---| -| **Type** | `dict[string, string]` | -| **Required** | Yes, if `cell_metadata` is present | - -Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. - -Accepted key-value pairs for H5AD files: +**`metadata_keys` accepted values (H5AD):** | Key | Value | Description | |-----|-------|-------------| | `obs` | `metadata` | Standard cell annotations | | `obsm` | `embedding` | Multidimensional cell data (PCA, UMAP, etc.) | -| `obsp` | `pairwise` | Pairwise cell annotations (e.g., cell–cell distances) | - -For H5 files, specify metadata using the same H5AD key names (`obs`, `obsm`, `obsp`). The transformation maps these to the correct internal structure regardless of source format. - -Example: -```json -{ - "obs": "metadata", - "obsm": "embedding", - "obsp": "pairwise" -} -``` - -### `linking_group` - -| | | -|---|---| -| **Type** | `dict[string, string \| list[string] \| null]` | - -Specifies the parent SLP entity to which the Cell Group will be linked. 
Must contain exactly one key: `sample`, `library`, or `preparation`. The value is either a list of group accessions, a single accession string, or an empty value. - -If an empty value is provided (`[]`, `""`, or `null`), the pipeline resolves all available group accessions of the specified entity type for the study. - -If `linking_group` is absent and no new SLP groups are being created, auto-discovery applies: Library → Preparation → Sample, using the first entity type with at least one associated group. - -Examples: -```json -{ "library": "GSF017080" } -``` -```json -{ "preparation": [] } -``` - -### `columns_to_drop` - -| | | -|---|---| -| **Type** | `list[string]` | - -Column names to remove from the cell metadata before processing. - -Example: -```json -["taxon", "organism_id"] -``` - -### `columns_renaming_map` - -| | | -|---|---| -| **Type** | `dict[string, string]` | +| `obsp` | `pairwise` | Pairwise cell annotations | -Maps source column names to new names. +For H5 files, use the same H5AD key names — the transformation maps them to the correct internal structure. -Example: +**Examples:** ```json -{ - "sample": "batch", - "pctmt": "percentMito" -} +{ "metadata_keys": { "obs": "metadata", "obsm": "embedding" } } +{ "linking_group": { "library": "GSF017080" } } +{ "columns_to_drop": ["taxon", "organism_id"] } +{ "columns_renaming_map": { "sample": "batch", "pctmt": "percentMito" } } +{ "set_column_value": { "sample_id": "lung_1" } } +{ "columns_to_preserve_name": ["cluster_leiden_0.5"] } ``` -### `columns_to_fill_missing_values` - -| | | -|---|---| -| **Type** | `dict[string, string]` | - -Default values for missing entries in the specified columns. - -Example: -```json -{ - "batch": "unknown" -} -``` - -### `columns_to_curate_values` - -| | | -|---|---| -| **Type** | `dict[string, dict[string, string]]` | - -Replacement values for specific entries in specified columns. 
- -Example: -```json -{ - "sample": { - "LGVXCTRL1": "lung_healthy_1" - } -} -``` - -### `set_column_value` - -| | | -|---|---| -| **Type** | `dict[string, string]` | - -Sets a constant value for all rows in the specified columns. Can be used to add a new attribute column or overwrite an existing one. - -Example: -```json -{ - "sample_id": "lung_1" -} -``` - -### `columns_to_preserve_name` - -| | | -|---|---| -| **Type** | `list[string]` | - -Column names to exempt from the internal attribute name standardization step. Use for columns whose names contain characters that would otherwise be altered (for example, Leiden cluster columns with decimal suffixes such as `cluster_leiden_0.5`). - -Example: -```json -["cluster_leiden_0.5"] -``` - -### `add_qc_metrics` - -| | | -|---|---| -| **Type** | `boolean` | -| **Default** | `true` | - -When `true`, QC metrics are calculated and added to the cell metadata if not already present. QC metrics include number of counts, number of genes, and mitochondrial and ribosomal gene presence. When `false`, or when environment variable `dry_run` is `true`, QC calculation is skipped. - --- ## `feature_metadata` -Settings for extracting and transforming feature (gene)-level metadata. The entire section is optional. - -### `metadata_keys` +Settings for extracting and transforming feature (gene)-level metadata. Optional. -| | | -|---|---| -| **Type** | `dict[string, string]` | -| **Required** | Yes, if `feature_metadata` is present | +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `metadata_keys` | `dict[string, string]` | Yes, if `feature_metadata` is present | — | Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. | +| `columns_to_drop` | `list[string]` | No | — | Column names to remove from feature metadata. | +| `columns_renaming_map` | `dict[string, string]` | No | — | Maps source column names to new names. 
| +| `columns_to_fill_missing_values` | `dict[string, string]` | No | — | Default values for missing entries. | +| `columns_to_curate_values` | `dict[string, dict[string, string]]` | No | — | Replacement values for specific entries. | +| `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows. | +| `columns_to_preserve_name` | `list[string]` | No | — | Columns to exempt from internal name standardisation. | +| `map_gene_ids_to_names` | `boolean` | No | `true` | When `true`, maps gene IDs to gene names if names are absent and `geneId` column is present. Set to `false` for proteomics or non-gene-ID data. | -Maps HDF5 group keys to metadata types. At least one key with value `"metadata"` is required. - -Accepted key-value pairs for H5AD files: +**`metadata_keys` accepted values (H5AD):** | Key | Value | Description | |-----|-------|-------------| @@ -337,76 +126,9 @@ Accepted key-value pairs for H5AD files: | `varm` | `embedding` | Multidimensional feature data | | `varp` | `pairwise` | Pairwise feature annotations | -For H5 files, specify metadata using the same H5AD key names (`var`, `varm`, `varp`). The transformation maps these to the correct internal structure regardless of source format. - -Example: -```json -{ - "var": "metadata", - "varm": "embedding" -} -``` - -### `columns_to_drop` - -| | | -|---|---| -| **Type** | `list[string]` | - -Column names to remove from the feature metadata. - -### `columns_renaming_map` - -| | | -|---|---| -| **Type** | `dict[string, string]` | - -Maps source column names to new names. - -### `columns_to_fill_missing_values` - -| | | -|---|---| -| **Type** | `dict[string, string]` | - -Default values for missing entries in the specified columns. - -### `columns_to_curate_values` - -| | | -|---|---| -| **Type** | `dict[string, dict[string, string]]` | - -Replacement values for specific entries in specified columns. 
- -### `set_column_value` - -| | | -|---|---| -| **Type** | `dict[string, string]` | - -Sets a constant value for all rows in the specified columns. +> The gene ID column must be named `geneId` for mapping to be performed. -### `columns_to_preserve_name` - -| | | -|---|---| -| **Type** | `list[string]` | - -Column names to exempt from the internal attribute name standardization step. - -### `map_gene_ids_to_names` - -| | | -|---|---| -| **Type** | `boolean` | -| **Default** | `true` | - -When `true`, the transformation attempts to map gene IDs to gene names if gene names are absent and the standard `geneId` column is present. The pipeline infers the ID source (Ensembl or NCBI) and the species automatically. When `false`, gene ID mapping is skipped. Set to `false` for proteomics or other omics data that do not use gene IDs as identifiers. - -> The gene ID column must use the standard name `geneId` for mapping to be performed. - -**Supported organisms and annotation releases (hdf5-cells v0.0.4):** +**Supported organisms (`map_gene_ids_to_names`) — hdf5-cells v0.0.4:** | Organism | Genome version | Ensembl release | NCBI release | |----------|---------------|-----------------|--------------| @@ -419,87 +141,16 @@ When `true`, the transformation attempts to map gene IDs to gene names if gene n ## `cell_expression` -Settings for extracting and uploading the cell expression matrix. The entire section is optional. If absent, no Expression Group is created. - -### `data_class` - -| | | -|---|---| -| **Type** | `string` | -| **Required** | Yes, if `cell_expression` is present | - -The data class label for the expression data. - -Example: -```json -"Single-cell transcriptomics" -``` - -### `compression_level` - -| | | -|---|---| -| **Type** | `integer` (0–9) | -| **Default** | `4` | - -Controls the Brotli compression level for the output expression file. Higher values produce smaller files at the cost of longer compression time. 
- -### `chunk_size` - -| | | -|---|---| -| **Type** | `integer` | - -Number of features processed per chunk during expression data export. If not specified, the value is calculated automatically from available container memory. - -### `max_buffer_size` - -| | | -|---|---| -| **Type** | `integer` | -| **Default** | `50` | - -Controls how much data is held in memory before being flushed to disk during expression writing. - -### `number_format` - -| | | -|---|---| -| **Type** | `string` | - -Controls the numeric precision of values in the output file. Accepts either a printf-style format string (e.g. `"%.7g"`, `"%d"`) or a NumPy dtype string (e.g. `"float32"`, `"int64"`). If not set, the format is inferred from the data. - -### `columns_to_drop` - -| | | -|---|---| -| **Type** | `list[string]` | - -Column names to remove from the expression metadata. - -### `columns_renaming_map` - -| | | -|---|---| -| **Type** | `dict[string, string]` | - -Maps source column names to new names in the expression metadata. - -### `set_column_value` - -| | | -|---|---| -| **Type** | `dict[string, string]` | - -Sets a constant value for all rows in the specified expression metadata columns. - -### `source_file_metadata` - -| | | -|---|---| -| **Type** | `boolean` | -| **Default** | `true` | - -When `true`, metadata from the source HDF5 attachment is read and included in the expression metadata (subject to `columns_to_drop`, `columns_renaming_map`, and `set_column_value`). When `false`, source file metadata extraction is skipped. - -In all cases, the following statistics are always computed and appended to the expression metadata regardless of this flag: total number of cells, total number of features, sparsity (%), number of non-zero values, source file accession, and source file name. +Settings for extracting and uploading the cell expression matrix. Optional. If absent, no Expression Group is created. 
+ +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `data_class` | `string` | **Yes** | — | Data class label for the expression data (e.g. `"Single-cell transcriptomics"`). | +| `compression_level` | `integer` (0–9) | No | `4` | Brotli compression level. Higher values produce smaller files at the cost of longer compression time. | +| `chunk_size` | `integer` | No | auto | Number of features processed per chunk. Calculated automatically from available memory if omitted. | +| `max_buffer_size` | `integer` | No | `50` | Amount of data held in memory before being flushed to disk during writing. | +| `number_format` | `string` | No | inferred | Numeric precision of output values. Accepts printf-style (`"%.7g"`, `"%d"`) or NumPy dtype (`"float32"`, `"int64"`). | +| `columns_to_drop` | `list[string]` | No | — | Column names to remove from expression metadata. | +| `columns_renaming_map` | `dict[string, string]` | No | — | Maps source column names to new names. | +| `set_column_value` | `dict[string, string]` | No | — | Sets a constant value for all rows in specified columns. | +| `source_file_metadata` | `boolean` | No | `true` | When `true`, metadata from the source HDF5 attachment is read and included in expression metadata. Summary statistics (cell count, feature count, sparsity, etc.) are always appended regardless of this flag. 
| From 95738f04ec6b088dab62cf327e2fe646d3c9cacb Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Mon, 30 Mar 2026 13:22:18 +0200 Subject: [PATCH 21/74] Add gene ID - gene mapping in attribute mapping reference --- .../about-sc-hdf5-transformations.md | 2 +- .../doc-odm-user-guide/attribute-mapping.md | 15 +++++++++++++++ .../doc-odm-user-guide/configuration-reference.md | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md index f467dae..721d3b9 100644 --- a/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md +++ b/docs/user-guide/doc-odm-user-guide/about-sc-hdf5-transformations.md @@ -24,7 +24,7 @@ The transformation extracts three types of data from a HDF5 source file: **Cell metadata** — extracted primarily from the `obs` in H5AD input file, or the equivalent structure in 10x H5 input. This includes per-cell annotations such as barcodes, cluster assignments, quality control metrics, and any other experimental annotations. Multidimensional representations stored in `obsm` (such as PCA or UMAP coordinates) and pairwise cell annotations from `obsp` can also be extracted. -**Feature metadata** — extracted from `var`, and optionally from `varm` and `varp`. This includes per-gene annotations such as gene identifiers and gene names. For supported species, the transformation can also map Ensembl or NCBI gene identifiers to gene names automatically. +**Feature metadata** — extracted from `var`, and optionally from `varm` and `varp`. This includes per-gene annotations such as gene identifiers and gene names. For supported species, the transformation can also map Ensembl or NCBI gene identifiers to gene names automatically (see [Gene ID to name mapping](attribute-mapping.md#gene-id-to-name-mapping)). 
**The expression matrix** — extracted from `X`, which contains count or normalized expression values. The transformation validates the matrix dimensions against the extracted cell and feature metadata, then writes the matrix in a Brotli-compressed format optimized for ODM ingestion. diff --git a/docs/user-guide/doc-odm-user-guide/attribute-mapping.md b/docs/user-guide/doc-odm-user-guide/attribute-mapping.md index ccbcb19..fb522d0 100644 --- a/docs/user-guide/doc-odm-user-guide/attribute-mapping.md +++ b/docs/user-guide/doc-odm-user-guide/attribute-mapping.md @@ -45,3 +45,18 @@ The table below lists the canonical ODM API name for each feature attribute alon | nCellsByCounts | `n_cells_by_counts`, `n_cells`, `num_cells`, `n_obs`, `num_cells_expressed` | | meanCounts | `mean_counts`, `avg_exp`, `obs_mean`, `means` | | pctDropoutByCounts | `pct_dropout_by_counts`, `pct_dropout`, `percent_dropout`, `dropout_rate` | + +### Gene ID to name mapping + +When feature metadata contains a `geneId` column but no gene name column, the transformation can automatically resolve gene names from a built-in reference. This is controlled by the `map_gene_ids_to_names` parameter in the `feature_metadata` configuration block, which is enabled by default. Set it to `false` for proteomics or other non-gene-ID data where this behaviour is not appropriate. + +The mapping is performed using Ensembl and NCBI reference data. Both Ensembl gene IDs (e.g. `ENSG...`) and NCBI gene IDs are supported. The following organisms are supported in `hdf5-cells`: + +| Organism | Genome version | Ensembl release | NCBI release | +|----------|----------------|-----------------|--------------| +| *Homo sapiens* | GRCh38.p14 | 115 | GCF_000001405.40-RS_2025_08 | +| *Mus musculus* | GRCm39 | 115 | GCF_000001635.27-RS_2024_02 | +| *Rattus norvegicus* | GRCr8 | 115 | GCF_036323735.1-RS_2024_02 | +| *Sus scrofa* | Sscrofa11.1 | 115 | 106 | + +> The gene ID column must be named `geneId` for mapping to be performed. 
If the column has a different name in the source file, ensure it is covered by the feature metadata attribute mapping above so that it is renamed to `geneId` before this step runs. \ No newline at end of file diff --git a/docs/user-guide/doc-odm-user-guide/configuration-reference.md b/docs/user-guide/doc-odm-user-guide/configuration-reference.md index d223bc8..c68398f 100644 --- a/docs/user-guide/doc-odm-user-guide/configuration-reference.md +++ b/docs/user-guide/doc-odm-user-guide/configuration-reference.md @@ -147,7 +147,7 @@ Settings for extracting and uploading the cell expression matrix. Optional. If a |-----------|------|----------|---------|-------------| | `data_class` | `string` | **Yes** | — | Data class label for the expression data (e.g. `"Single-cell transcriptomics"`). | | `compression_level` | `integer` (0–9) | No | `4` | Brotli compression level. Higher values produce smaller files at the cost of longer compression time. | -| `chunk_size` | `integer` | No | auto | Number of features processed per chunk. Calculated automatically from available memory if omitted. | +| `chunk_size` | `integer` | No | inferred | Number of features processed per chunk. Calculated automatically from available memory if omitted. | | `max_buffer_size` | `integer` | No | `50` | Amount of data held in memory before being flushed to disk during writing. | | `number_format` | `string` | No | inferred | Numeric precision of output values. Accepts printf-style (`"%.7g"`, `"%d"`) or NumPy dtype (`"float32"`, `"int64"`). | | `columns_to_drop` | `list[string]` | No | — | Column names to remove from expression metadata. 
| From db33768d596cc249185c4c7ebb9db20f7f263fd6 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Mon, 30 Mar 2026 13:23:04 +0200 Subject: [PATCH 22/74] Remove supported species from gene mapping from config schema --- .../doc-odm-user-guide/configuration-reference.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/configuration-reference.md b/docs/user-guide/doc-odm-user-guide/configuration-reference.md index c68398f..5a0d9f2 100644 --- a/docs/user-guide/doc-odm-user-guide/configuration-reference.md +++ b/docs/user-guide/doc-odm-user-guide/configuration-reference.md @@ -126,17 +126,6 @@ Settings for extracting and transforming feature (gene)-level metadata. Optional | `varm` | `embedding` | Multidimensional feature data | | `varp` | `pairwise` | Pairwise feature annotations | -> The gene ID column must be named `geneId` for mapping to be performed. - -**Supported organisms (`map_gene_ids_to_names`) — hdf5-cells v0.0.4:** - -| Organism | Genome version | Ensembl release | NCBI release | -|----------|---------------|-----------------|--------------| -| *Homo sapiens* | GRCh38.p14 | 115 | GCF_000001405.40-RS_2025_08 | -| *Mus musculus* | GRCm39 | 115 | GCF_000001635.27-RS_2024_02 | -| *Rattus norvegicus* | GRCr8 | 115 | GCF_036323735.1-RS_2024_02 | -| *Sus scrofa* | Sscrofa11.1 | 115 | 106 | - --- ## `cell_expression` From beba1767efc9e988949420124fb472f3fc06c9f2 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Mon, 30 Mar 2026 13:45:29 +0200 Subject: [PATCH 23/74] Update how to guide --- .../how-to-sc-hdf5-transformations.md | 43 +++++++++---------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md b/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md index 3edf166..32f5225 100644 --- a/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md +++ 
b/docs/user-guide/doc-odm-user-guide/how-to-sc-hdf5-transformations.md @@ -1,6 +1,6 @@ # How-to Guides: Single-Cell HDF5 Transformations in ODM -These guides show how to accomplish specific tasks using the single-cell HDF5 transformation. Each guide assumes you have a valid input file (H5AD or 10x H5) already registered in ODM as an attachment, and access to the ODM API. +These guides show how to accomplish specific tasks using the single-cell HDF5 transformation. Each guide assumes you have a valid input file (H5AD or 10x H5) already attached to a study in ODM. For a conceptual overview of the entities involved and how the transformation works, see [About Single-Cell HDF5 Transformations in ODM](about-sc-hdf5-transformations.md). For the full list of configuration parameters, see the [Configuration Reference](configuration-reference.md). For the API endpoint specifications, see the [API Reference](api-reference.md). For details on what the pipeline does internally at each stage, see the [Transformation Process Reference](transformation-process-reference.md). @@ -87,7 +87,7 @@ The response includes the `id` of the created job. As a guideline for setting `v GET /api/v1/transformations/jobs/{job_id} ``` -Repeat until `status.state` reaches a terminal value: `COMPLETED` or `FAILED`. +Repeat until `status.state` reaches a terminal value: `DONE` or `FAILED`. ### Step 5: Review the logs @@ -124,7 +124,7 @@ POST /api/v1/transformations/jobs } ``` -Monitor and retrieve logs the same way as the dry run (Steps 4–5). When the job completes, the logs contain the ODM accessions assigned to each object that was created or updated. +Monitor and retrieve logs the same way as the dry run (Steps 4–5). When the job completes, the logs contain the ODM accessions assigned to each object that was created or updated. The logs will also be uploaded as an attachment to the same study.
--- @@ -157,7 +157,7 @@ Repeat until the dry run completes without errors or warnings that require actio ## How to ingest cell and expression data from an H5AD file -Use this when the study already has Sample, Library, or Preparation groups in ODM and you only need to add the single-cell layer. Configure at least `cell_metadata`, `feature_metadata`, and `cell_expression` in your configuration's `data` field. +Use this when the study already has Sample, Library, or Preparation groups in ODM and you only need to add the single-cell layer. Configure `cell_metadata`, `feature_metadata`, and `cell_expression` in your configuration's `data` field. ```json { @@ -189,7 +189,7 @@ The transformation resolves the linking target automatically (Library → Prepar ```json "cell_metadata": { "linking_group": { - "library": "GSF017080" + "library": "GSFXXXXXX" } } ``` @@ -206,7 +206,7 @@ To link to all preparation groups in the study without specifying their accessio --- -## How to create Sample, Library, or Preparation groups from your H5AD file +## How to create or update Sample, Library, or Preparation groups from your H5AD file Use this when your study does not yet have SLP groups in ODM, or when you want to derive biosample-level attributes from the cell metadata. @@ -222,10 +222,7 @@ Identify the column in your cell metadata that acts as a biosample identifier. S "biosample_column_name": "sample_id", "sample": { "create_new_group": true, - "columns_to_export": ["tissue", "disease", "donor_id"], - "columns_renaming_map": { - "tissue": "tissueType" - } + "columns_to_export": ["tissue", "disease", "donor_id"] } }, "cell_metadata": { @@ -340,26 +337,18 @@ Legacy 10x H5 files (v<3) are supported only if the file contains a single genom These operations are available in `cell_metadata`, `feature_metadata`, and per-entity settings within `biosample_metadata`. They are applied in the order listed. 
-**To rename a column:** - -```json -"columns_renaming_map": { - "sample": "batch", - "pctmt": "percentMito" -} -``` - **To drop columns:** ```json "columns_to_drop": ["taxon", "organism_id"] ``` -**To fill missing values:** +**To rename a column:** ```json -"columns_to_fill_missing_values": { - "batch": "unknown" +"columns_renaming_map": { + "sample": "batch", + "pctmt": "percentMito" } ``` @@ -373,6 +362,14 @@ These operations are available in `cell_metadata`, `feature_metadata`, and per-e } ``` +**To fill missing values:** + +```json +"columns_to_fill_missing_values": { + "batch": "unknown" +} +``` + **To set a constant value for all rows:** ```json @@ -387,6 +384,6 @@ These operations are available in `cell_metadata`, `feature_metadata`, and per-e "columns_to_preserve_name": ["cluster_leiden_0.5"] ``` -Operations are applied in order: drop → rename → fill missing values → curate values → set constant values. Attribute name standardization (mapping to ODM standard names and converting others to camelCase) runs after all explicit column operations. Columns listed in `columns_to_preserve_name` are exempt from this standardization step. +Operations are applied in order: drop → rename → curate values → fill missing values → set constant values. Attribute name standardization (mapping to ODM standard names and converting others to camelCase) runs after all explicit column operations. Columns listed in `columns_to_preserve_name` are exempt from this standardization step. For full parameter specifications, see the [Configuration Reference](configuration-reference.md). 
From 82977f49f14be54237375f639403b6d10332e946 Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Mon, 30 Mar 2026 13:55:14 +0200 Subject: [PATCH 24/74] Update transformation reference --- .../transformation-process-reference.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md b/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md index 9d3c75d..3d118c7 100644 --- a/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md +++ b/docs/user-guide/doc-odm-user-guide/transformation-process-reference.md @@ -120,12 +120,11 @@ The following transformations are applied in the order listed, when specified in 1. **Drop columns** (`columns_to_drop`) 2. **Rename columns** (`columns_renaming_map`) -3. **Fill missing values** (`columns_to_fill_missing_values`) -4. **Curate values** (`columns_to_curate_values`) -5. **Coerce data types** -6. **Set constant values** (`set_column_value`) +3. **Curate values** (`columns_to_curate_values`) +4. **Fill missing values** (`columns_to_fill_missing_values`) +5. **Set constant values** (`set_column_value`) -After all explicit column operations, **attribute name standardization** is applied: column names are mapped to ODM standard attribute names where a mapping exists; non-standard names are converted to camelCase. Columns listed in `columns_to_preserve_name` are exempt from this step. +After all explicit column operations, **attribute name standardization** is applied: column names are mapped to ODM standard attribute names where a mapping exists; non-standard names are converted to camelCase. Columns listed in `columns_to_preserve_name` are exempt from this step. For the full list of recognized column names, see the [Attribute Mapping Reference](attribute-mapping.md). Data type validation is then performed on the resulting DataFrame. 
@@ -138,7 +137,7 @@ Data type validation is then performed on the resulting DataFrame. **Feature metadata additional steps:** -- **Gene ID mapping** (if `map_gene_ids_to_names` is `true`): If gene names are absent and the standard `geneId` column is present, the pipeline infers the ID source (Ensembl or NCBI) and the species. If both can be determined, a new column with the mapped gene names is added. Supported organisms and annotation releases are listed in the [Configuration Reference](configuration-reference.md#map_gene_ids_to_names). +- **Gene ID mapping** (if `map_gene_ids_to_names` is `true`): If gene names are absent and the standard `geneId` column is present, the pipeline infers the ID source (Ensembl or NCBI) and the species. If both can be determined, a new column with the mapped gene names is added. Supported organisms and annotation releases are listed in [Gene ID to name mapping](attribute-mapping.md#gene-id-to-name-mapping). ### 2.6 Storing data @@ -170,8 +169,10 @@ The following statistics are always computed and appended to the metadata regard 2. Total number of features 3. Sparsity (%) 4. Number of non-zero values +5. Source file accession +6. Source file name -Source file accession and source file name are also included. The generated metadata file is written to the temporary directory. +The generated metadata file is written to the temporary directory.
--- From 729b41c26e8dbe91abe59388adfe1e17988a03fb Mon Sep 17 00:00:00 2001 From: Isabel Gomez Redondo Date: Tue, 31 Mar 2026 18:08:06 +0200 Subject: [PATCH 25/74] Add quick tutorial and configurations for public datasets --- .../doc-odm-user-guide/extras/GSE156793.json | 111 ++++++++++ .../doc-odm-user-guide/extras/GSE165045.json | 49 +++++ .../extras/aggregated_config_1.json | 84 ++++++++ .../extras/aggregated_config_2.json | 108 ++++++++++ .../extras/aggregated_config_3.json | 117 +++++++++++ .../extras/dataset-import-commands.md | 193 ++++++++++++++++++ .../public-dataset-configurations-mapping.md | 25 +++ .../doc-odm-user-guide/quick-tutorial.md | 79 +++++++ 8 files changed, 766 insertions(+) create mode 100644 docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/GSE156793.json create mode 100644 docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/GSE165045.json create mode 100644 docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_1.json create mode 100644 docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_2.json create mode 100644 docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_3.json create mode 100644 docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/dataset-import-commands.md create mode 100644 docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/public-dataset-configurations-mapping.md create mode 100644 docs/user-guide/doc-odm-user-guide/quick-tutorial.md diff --git a/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/GSE156793.json b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/GSE156793.json new file mode 100644 index 0000000..79f66f1 --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/GSE156793.json @@ -0,0 +1,111 @@ +{ + "name": "GSE156793.json", + "description": "Config to transform GSE156793 dataset", + "data": { + "file_type": "h5ad", + "biosample_metadata": { + 
"metadata_keys": { + "obs": "metadata" + }, + "biosample_column_name": "RT_group", + "sample": { + "create_new_group": false, + "template_id": null, + "linking_group": null, + "columns_to_export": [ + "Fetus_id", + "Development_day" + ], + "columns_renaming_map": { + "Fetus_id": "Donor ID", + "Development_day": "Donor Age" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null + }, + "library": { + "create_new_group": false, + "template_id": null, + "linking_group": null, + "columns_to_export": [ + "Assay" + ], + "columns_renaming_map": { + "Assay": "Assay Type" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null + } + }, + "cell_metadata": { + "metadata_keys": { + "obs": "metadata", + "obsm": "embedding", + "obsp": "pairwise" + }, + "linking_group": null, + "columns_to_drop": [ + "batch", + "Organ", + "Sex", + "Batch", + "Experiment_batch" + ], + "columns_renaming_map": { + "_index": "barcode", + "RT_group": "batch", + "Main_cluster_name": "cluster", + "Organ_cell_lineage": "cell_type" + }, + "columns_to_curate_values": { + "matched_mca_cell_name": { + "nan": "" + }, + "bca_cluster_info": { + "nan": "" + }, + "matched_bca_cell_name": { + "nan": "" + }, + "X_umap": { + "nan,nan": "" + } + }, + "columns_to_fill_missing_values": { + "batch": "unknown" + }, + "columns_to_preserve_name": [ + "X_umap" + ], + "add_qc_metrics": true + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata", + "varm": "embedding", + "varp": "pairwise" + }, + "columns_to_drop": null, + "columns_renaming_map": { + "_index": "geneId", + "gene_short_name": "gene" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null, + "set_column_value": null, + "columns_to_preserve_name": null, + "map_gene_ids_to_names": true + }, + "cell_expression": { + "data_class": "Single-cell transcriptomics", + "compression_level": null, + "chunk_size": null, + "max_buffer_size": null, + "number_format": null, + 
"columns_to_drop": null, + "columns_renaming_map": null, + "set_column_value": null, + "source_file_metadata": null + } + } +} diff --git a/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/GSE165045.json b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/GSE165045.json new file mode 100644 index 0000000..b650ab3 --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/GSE165045.json @@ -0,0 +1,49 @@ +{ + "name": "GSE165045.json", + "description": "Config to transform GSE165045 dataset", + "data": { + "file_type": "h5ad", + "biosample_metadata": null, + "cell_metadata": { + "metadata_keys": { + "obs": "metadata" + }, + "linking_group": null, + "columns_to_drop": null, + "columns_renaming_map": { + "sample": "batch", + "_index": "barcode" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null, + "set_column_value": null, + "columns_to_preserve_name": null, + "add_qc_metrics": true + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata" + }, + "columns_to_drop": null, + "columns_renaming_map": { + "_index": "gene" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null, + "set_column_value": null, + "columns_to_preserve_name": null, + "map_gene_ids_to_names": true + }, + "cell_expression": { + "compression_level": null, + "chunk_size": null, + "max_buffer_size": null, + "data_class": "Single-cell transcriptomics", + "number_format": null, + "columns_to_drop": null, + "columns_renaming_map": null, + "set_column_value": null, + "source_file_metadata": null + } + } +} diff --git a/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_1.json b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_1.json new file mode 100644 index 0000000..c631ca8 --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_1.json @@ -0,0 +1,84 @@ +{ + "name": "aggregated_config_1.json", + 
"description": "Aggregated config 1 to transform several public datasets", + "data": { + "file_type": "h5ad", + "biosample_metadata": null, + "cell_metadata": { + "metadata_keys": { + "obs": "metadata", + "obsm": "embedding", + "obsp": "pairwise" + }, + "linking_group": null, + "columns_to_drop": [ + "barcode", + "Species", + "sex", + "age", + "disease", + "biosample_id", + "lvef", + "biosample_id" + ], + "columns_renaming_map": { + "index": "barcode", + "_index": "barcode", + "donor_id": "batch", + "sample": "batch", + "sample_id": "batch", + "Sample_Name": "batch", + "biological.individual": "batch", + "GSM_ID": "gsm_id", + "cell_type_leiden0.6": "cell_type", + "SubCluster": "cluster", + "cellbender_ncount": "n_counts", + "cellbender_ngenes": "n_genes", + "cellranger_percent_mito": "percent_mito", + "cellbender_entropy": "entropy", + "cellranger_doublet_scores": "doublet_scores" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null, + "set_column_value": null, + "columns_to_preserve_name": null, + "add_qc_metrics": true + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata", + "varm": "embedding", + "varp": "pairwise" + }, + "columns_to_drop": [ + "feature_biotype", + "feature_types", + "genome" + ], + "columns_renaming_map": { + "_index": "gene", + "index": "gene", + "GENE": "gene", + "var_index": "geneId", + "feature_is_filtered": "is_filtered" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null, + "set_column_value": null, + "columns_to_preserve_name": null, + "map_gene_ids_to_names": true + }, + "cell_expression": { + "compression_level": null, + "chunk_size": null, + "max_buffer_size": null, + "data_class": "Single-cell transcriptomics", + "number_format": null, + "columns_to_drop": null, + "columns_renaming_map": null, + "set_column_value": null, + "source_file_metadata": null + } + } +} + diff --git 
a/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_2.json b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_2.json new file mode 100644 index 0000000..6aaca9f --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_2.json @@ -0,0 +1,108 @@ +{ + "name": "aggregated_config_2.json", + "description": "Aggregated config 2 to transform several public datasets", + "data": { + "file_type": "h5ad", + "biosample_metadata": { + "metadata_keys": { + "obs": "metadata" + }, + "biosample_column_name": "sample", + "sample": { + "create_new_group": false, + "template_id": null, + "linking_group": null, + "columns_to_export": [ + "sex_ontology_term_id", + "development_stage_ontology_term_id", + "ethnicity_ontology_term_id", + "HbA1c", + "insulin_content", + "glucose_SI" + ], + "columns_renaming_map": { + "sex_ontology_term_id": "Donor Sex Term ID", + "development_stage_ontology_term_id": "Developmental Stage Term ID", + "ethnicity_ontology_term_id": "Donor Ethnicity Term ID", + "HbA1c": "Hemoglobin A1c (HbA1c) Concentration Value", + "insulin_content": "Fasting Insulin Concentration Value", + "glucose_SI": "Fasting Glucose Concentration Value" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null + }, + "library": { + "create_new_group": false, + "template_id": null, + "linking_group": null, + "columns_to_export": [ + "assay_ontology_term_id" + ], + "columns_renaming_map": { + "assay_ontology_term_id": "Assay Type Term ID" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null + } + }, + "cell_metadata": { + "metadata_keys": { + "obs": "metadata", + "obsm": "embedding" + }, + "linking_group": null, + "columns_to_drop": [ + "id", + "BMI", + "organism_ontolology_term_id", + "disease_ontology_term_id", + "is_primary_data", + "tissue_ontology_term_id" + ], + "columns_renaming_map": { + "_index": "barcode", + "sample": "batch", + 
"louvain_anno_broad": "louvain", + "louvain_anno_fine": "louvain_fine", + "cell_type_ontology_term_id": "cell_type", + "mt_frac": "percent_mito" + + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null, + "set_column_value": null, + "columns_to_preserve_name": null, + "add_qc_metrics": true + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata" + }, + "columns_to_drop": [ + "feature_biotype" + ], + "columns_renaming_map": { + "ensembl_ID": "geneId", + "human_ensembl_ID": "human_ensembl_id", + "feature_is_filtered": "is_filtered", + "filtered_mapped_human_ensembl_ID": "filtered_mapped_human_ensembl_id" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null, + "set_column_value": null, + "columns_to_preserve_name": null, + "map_gene_ids_to_names": true + }, + "cell_expression": { + "compression_level": null, + "chunk_size": null, + "max_buffer_size": null, + "data_class": "Single-cell transcriptomics", + "number_format": null, + "columns_to_drop": null, + "columns_renaming_map": null, + "set_column_value": null, + "source_file_metadata": null + } + } +} \ No newline at end of file diff --git a/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_3.json b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_3.json new file mode 100644 index 0000000..c208aab --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/aggregated_config_3.json @@ -0,0 +1,117 @@ +{ + "name": "aggregated_config_3.json", + "description": "Aggregated config 3 to transform several public datasets", + "data": { + "file_type": "h5ad", + "biosample_metadata": { + "metadata_keys": { + "obs": "metadata" + }, + "biosample_column_name": "sample_id", + "sample": { + "create_new_group": false, + "template_id": null, + "linking_group": null, + "columns_to_export": [ + "Condition", + "self_reported_ethnicity_ontology_term_id", + "tissue_type" + ], + 
"columns_renaming_map": { + "Condition": "Condition Group", + "self_reported_ethnicity_ontology_term_id": "Donor Ethnicity Term ID", + "tissue_type": "Cell Source" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": { + "Sample Source ID": { + "AM031": "Liver-32", + "AM042": "Liver-13", + "AM048": "Liver-14", + "AM061": "Liver-18", + "AM062": "Liver-33", + "AM072": "Liver-34" + } + } + }, + "library": { + "create_new_group": null, + "template_id": null, + "linking_group": null, + "columns_to_export": null, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null + } + }, + "cell_metadata": { + "metadata_keys": { + "obs": "metadata", + "obsm": "embedding", + "obsp": "pairwise" + }, + "linking_group": null, + "columns_to_drop": [ + "barcode", + "Sex", + "Age", + "batch", + "organism_ontology_term_id", + "donor_id", + "development_stage_ontology_term_id", + "sex_ontology_term_id", + "disease_ontology_term_id", + "tissue_ontology_term_id" + ], + "columns_renaming_map": { + "_index": "barcode", + "sample_id": "batch", + "log10GenesPerUMI_injured": "log10_genes_per_umi_injured", + "CellType_injured": "cell_type_injured", + "log10GenesPerUMI_healthy": "log10_genes_per_umi_healthy", + "CellType_healthy": "cell_type_healthy", + "cell_type_ontology_term_id": "cell_type" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": { + "batch": { + "AM031": "Liver-32", + "AM042": "Liver-13", + "AM048": "Liver-14", + "AM061": "Liver-18", + "AM062": "Liver-33", + "AM072": "Liver-34" + } + }, + "set_column_value": null, + "columns_to_preserve_name": null, + "add_qc_metrics": true + }, + "feature_metadata": { + "metadata_keys": { + "var": "metadata", + "varm": "embedding", + "varp": "pairwise" + }, + "columns_to_drop": null, + "columns_renaming_map": { + "_index": "gene" + }, + "columns_to_fill_missing_values": null, + "columns_to_curate_values": null, + "set_column_value": null, + "columns_to_preserve_name": null, + 
"map_gene_ids_to_names": true + }, + "cell_expression": { + "compression_level": null, + "chunk_size": null, + "max_buffer_size": null, + "data_class": "Single-cell transcriptomics", + "number_format": "float32", + "columns_to_drop": null, + "columns_renaming_map": null, + "set_column_value": null, + "source_file_metadata": null + } + } +} \ No newline at end of file diff --git a/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/dataset-import-commands.md b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/dataset-import-commands.md new file mode 100644 index 0000000..9550a99 --- /dev/null +++ b/docs/user-guide/doc-odm-user-guide/doc-odm-user-guide/extras/dataset-import-commands.md @@ -0,0 +1,193 @@ +# Curated Public Datasets: Import Commands + +The commands below load each curated single-cell dataset into an ODM instance using the `odm-import-data` CLI. Each command uploads study and sample/library metadata alongside the H5AD attachment, ready for transformation. + +Replace ``, ``, and `