diff --git a/conf/base.config b/conf/base.config index 9d0500d2..3c351c0f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -63,6 +63,15 @@ process { memory = { 10.GB * task.attempt, 'memory' } time = { 16.h * task.attempt, 'time' } } + withName: 'PRIDEPY_FETCH_SDRF' { + errorStrategy = 'retry' + maxRetries = 3 + } + withName: 'PRIDEPY_DOWNLOAD_FILE' { + errorStrategy = 'retry' + maxRetries = 3 + maxForks = 5 + } withLabel: process_gpu { ext.use_gpu = { workflow.profile.contains('gpu') } accelerator = { workflow.profile.contains('gpu') ? 1 : null } diff --git a/conf/modules.config b/conf/modules.config index 0975a406..ccb5aa6b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -315,6 +315,27 @@ process { process { + withName: 'SDRF_PIPELINES_PARSE_SDRF' { + publishDir = [ + path: {"${params.outdir}/sdrf"}, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ] + } + + withName: 'PRIDEPY_FETCH_SDRF' { + publishDir = [ + path: {"${params.outdir}/sdrf"}, + mode: params.publish_dir_mode, + pattern: '*.sdrf.tsv' + ] + } + + withName: 'PRIDEPY_DOWNLOAD_FILE' { + ext.args = '-p ftp' + publishDir = [enabled: false] + } + withName: 'TDF2MZML' { publishDir = [ enabled: false diff --git a/conf/test_sdrf.config b/conf/test_sdrf.config new file mode 100644 index 00000000..0b8a98ac --- /dev/null +++ b/conf/test_sdrf.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running SDRF/PRIDE input tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a minimal test using an SDRF + sheet as --input. The pipeline fetches the RAW files from PRIDE and converts the + SDRF to a samplesheet + search presets before running the standard workflow. 
+ + Use as follows: + nextflow run nf-core/mhcquant -profile test_sdrf, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '6.GB', + time: '2.h' + ] +} + +params { + config_profile_name = 'SDRF test profile' + config_profile_description = 'Minimal SDRF input test dataset to check pipeline function' + + // Input data + input = params.pipelines_testdata_base_path + 'mhcquant/testdata/PXD009752.sdrf.tsv' + fasta = params.pipelines_testdata_base_path + 'mhcquant/testdata/UP000005640_9606_500prot.fasta' + + // Batch spectra during Comet search to fit within CI memory limits + spectrum_batch_size = 20000 +} diff --git a/docs/usage.md b/docs/usage.md index 228d1fb4..993f0b5b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -4,6 +4,21 @@ > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ +## Input modes + +The `--input` parameter accepts three formats: + +| Mode | Example | Description | +| ------------------- | ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Samplesheet TSV** | `--input samplesheet.tsv` | A local TSV file listing your MS runs (see [Samplesheet input](#samplesheet-input)). | +| **SDRF file** | `--input experiment.sdrf.tsv` | A local [SDRF-Proteomics](https://github.com/bigbio/proteomics-sample-metadata) file following the [immunopeptidomics template](https://github.com/bigbio/proteomics-sample-metadata/tree/master/templates). Raw files are fetched from PRIDE, search settings and sample metadata are parsed from the SDRF. Requires `--fasta`. 
| **PRIDE accession** | `--input PXD009752` | A PRIDE project accession. The project must include an SDRF file following the [immunopeptidomics template](https://github.com/bigbio/proteomics-sample-metadata/tree/master/templates); both the SDRF and raw files are fetched from PRIDE. Requires `--fasta`. | + +For the SDRF and PRIDE accession modes, the pipeline uses [sdrf-pipelines](https://github.com/bigbio/sdrf-pipelines) to translate the SDRF into an mhcquant samplesheet and a search-preset table, then downloads the raw files with [pridepy](https://github.com/PRIDE-Archive/pridepy). The generated samplesheet and presets are published under `/sdrf/` for transparency. + +> [!NOTE] +> SDRF files must follow the immunopeptidomics template from [bigbio/proteomics-sample-metadata](https://github.com/bigbio/proteomics-sample-metadata/tree/master/templates), and PRIDE accessions must point to a project that contains such an SDRF file — otherwise sample metadata and search parameters cannot be derived. When providing a local `.sdrf.tsv`, the PRIDE accession is inferred from the filename (e.g. `PXD009752.sdrf.tsv`); if your SDRF is named differently, pass the accession via `--input PXD...` instead. + ## Samplesheet input You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a tab-separated file with at least four columns, and a header row as shown in the examples below. 
diff --git a/modules/local/openmsthirdparty/percolatoradapter/main.nf b/modules/local/openmsthirdparty/percolatoradapter/main.nf index 59105393..a3f20178 100644 --- a/modules/local/openmsthirdparty/percolatoradapter/main.nf +++ b/modules/local/openmsthirdparty/percolatoradapter/main.nf @@ -14,7 +14,7 @@ process OPENMS_PERCOLATORADAPTER { tuple val(meta), path("*.idXML") , emit: idxml tuple val(meta), path("*_percolator_feature_weights.tsv"), emit: feature_weights, optional: true tuple val("${task.process}"), val('PercolatorAdapter'), eval("PercolatorAdapter 2>&1 | grep -E '^Version(.*)' | sed 's/Version: //g' | cut -d ' ' -f 1"), topic: versions - tuple val("${task.process}"), val('percolator'), eval("percolator -h 2>&1 | grep -E '^Percolator version(.*)' | sed 's/Percolator version //g'"), topic: versions + tuple val("${task.process}"), val('percolator'), eval("percolator -h 2>&1 | grep -E '^Percolator version(.*)' | sed 's/Percolator version //g' | cut -d',' -f1"), topic: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/pridepy/download_file/environment.yml b/modules/local/pridepy/download_file/environment.yml new file mode 100644 index 00000000..9a956eb8 --- /dev/null +++ b/modules/local/pridepy/download_file/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::pridepy=0.0.12 diff --git a/modules/local/pridepy/download_file/main.nf b/modules/local/pridepy/download_file/main.nf new file mode 100644 index 00000000..9548f382 --- /dev/null +++ b/modules/local/pridepy/download_file/main.nf @@ -0,0 +1,37 @@ +process PRIDEPY_DOWNLOAD_FILE { + label 'process_single' + tag "${file_name}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pridepy:0.0.12--pyhdfd78af_0' : + 'quay.io/biocontainers/pridepy:0.0.12--pyhdfd78af_0' }" + + input: + tuple val(meta), val(file_name), val(pride_accession) + + output: + tuple val(meta), path("${file_name}"), emit: downloaded_file + tuple val("${task.process}"), val('pridepy'), eval("pip show pridepy 2>/dev/null | grep Version | cut -d' ' -f2"), topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + pridepy download-file-by-name \\ + -a "${pride_accession}" \\ + -f "${file_name}" \\ + -o . \\ + ${args} + + # pridepy exits 0 even on download failure — validate file is non-empty + [ -s "${file_name}" ] || { echo "ERROR: Downloaded file ${file_name} is empty"; exit 1; } + """ + + stub: + """ + touch "${file_name}" + """ +} diff --git a/modules/local/pridepy/download_file/meta.yml b/modules/local/pridepy/download_file/meta.yml new file mode 100644 index 00000000..56749bf7 --- /dev/null +++ b/modules/local/pridepy/download_file/meta.yml @@ -0,0 +1,54 @@ +name: pridepy_download_file +description: Download a single file from the PRIDE Archive by name using pridepy. +keywords: + - pride + - download + - proteomics + - mass spectrometry +tools: + - pridepy: + description: | + Python client library and command line tool for the PRIDE Archive REST API. + Supports downloading files from PRIDE datasets by accession and file name. + homepage: https://github.com/PRIDE-Utilities/pridepy + documentation: https://github.com/PRIDE-Utilities/pridepy + licence: ["Apache-2.0"] +input: + - - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test', sample:'sample1', condition:'A', search_preset:'default' ] + - file_name: + type: string + description: Name of the file to download from the PRIDE Archive dataset. + - pride_accession: + type: string + description: PRIDE Archive accession number (e.g. PXD000001) identifying the dataset. 
+output: + downloaded_file: + - - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test', sample:'sample1', condition:'A', search_preset:'default' ] + - "${file_name}": + type: file + description: The file downloaded from the PRIDE Archive. + pattern: "*" +topics: + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - pridepy: + type: string + description: The tool name + - "pip show pridepy 2>/dev/null | grep Version | cut -d' ' -f2": + type: eval + description: The expression to obtain the version of pridepy + +authors: + - "@jonasscheid" +maintainers: + - "@jonasscheid" diff --git a/modules/local/pridepy/fetch_sdrf/environment.yml b/modules/local/pridepy/fetch_sdrf/environment.yml new file mode 100644 index 00000000..9a956eb8 --- /dev/null +++ b/modules/local/pridepy/fetch_sdrf/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::pridepy=0.0.12 diff --git a/modules/local/pridepy/fetch_sdrf/main.nf b/modules/local/pridepy/fetch_sdrf/main.nf new file mode 100644 index 00000000..2165650a --- /dev/null +++ b/modules/local/pridepy/fetch_sdrf/main.nf @@ -0,0 +1,31 @@ +process PRIDEPY_FETCH_SDRF { + label 'process_single' + tag "$pride_id" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pridepy:0.0.12--pyhdfd78af_0' : + 'quay.io/biocontainers/pridepy:0.0.12--pyhdfd78af_0' }" + + input: + val pride_id + + output: + path "*.sdrf.tsv" , emit: sdrf + tuple val("${task.process}"), val('pridepy'), eval("pip show pridepy 2>/dev/null | grep Version | cut -d' ' -f2"), topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + pridepy stream-files-metadata -a "${pride_id}" -o files_metadata.json + sdrf_name=\$(python3 -c "import json,sys; sdrfs=[f['fileName'] for f in json.load(open('files_metadata.json')) if f['fileName'].endswith('.sdrf.tsv')]; print(sdrfs[0]) if sdrfs else sys.exit('ERROR: No SDRF file found for ${pride_id}')") + pridepy download-file-by-name -a "${pride_id}" -f "\$sdrf_name" -o . -p ftp + """ + + stub: + """ + touch ${pride_id}.sdrf.tsv + """ +} diff --git a/modules/local/pridepy/fetch_sdrf/meta.yml b/modules/local/pridepy/fetch_sdrf/meta.yml new file mode 100644 index 00000000..b1e0e7be --- /dev/null +++ b/modules/local/pridepy/fetch_sdrf/meta.yml @@ -0,0 +1,41 @@ +name: pridepy_fetch_sdrf +description: Fetch an SDRF file from the PRIDE Archive for a given project accession. +keywords: + - pride + - sdrf + - proteomics + - download +tools: + - pridepy: + description: | + Python package to access PRIDE Archive data programmatically, + including downloading files and metadata for public proteomics datasets. + homepage: https://github.com/PRIDE-Archive/pridepy + documentation: https://github.com/PRIDE-Archive/pridepy + licence: ["Apache-2.0"] +input: + - - pride_id: + type: string + description: PRIDE Archive project accession (e.g. 
PXD009752) +output: + sdrf: + - - "*.sdrf.tsv": + type: file + description: SDRF (Sample and Data Relationship Format) file describing the experimental design of the PRIDE project + pattern: "*.sdrf.tsv" +topics: + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - pridepy: + type: string + description: The tool name + - "pip show pridepy 2>/dev/null | grep Version | cut -d' ' -f2": + type: eval + description: The expression to obtain the version of the tool + +authors: + - "@jonasscheid" +maintainers: + - "@jonasscheid" diff --git a/modules/local/sdrf_pipelines/parse_sdrf/environment.yml b/modules/local/sdrf_pipelines/parse_sdrf/environment.yml new file mode 100644 index 00000000..725912b3 --- /dev/null +++ b/modules/local/sdrf_pipelines/parse_sdrf/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::sdrf-pipelines=0.1.2 diff --git a/modules/local/sdrf_pipelines/parse_sdrf/main.nf b/modules/local/sdrf_pipelines/parse_sdrf/main.nf new file mode 100644 index 00000000..4f48ef2c --- /dev/null +++ b/modules/local/sdrf_pipelines/parse_sdrf/main.nf @@ -0,0 +1,34 @@ +process SDRF_PIPELINES_PARSE_SDRF { + label 'process_single' + tag "${sdrf.baseName}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.1.2--pyhdfd78af_0' : + 'quay.io/biocontainers/sdrf-pipelines:0.1.2--pyhdfd78af_0' }" + + input: + path sdrf + + output: + path "samplesheet.tsv" , emit: samplesheet + path "search_presets.tsv" , emit: search_presets + tuple val("${task.process}"), val('sdrf-pipelines'), eval("parse_sdrf --version | cut -d ' ' -f 2"), topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + parse_sdrf convert-mhcquant \\ + -s ${sdrf} \\ + -os samplesheet.tsv \\ + -op search_presets.tsv + """ + + stub: + """ + touch samplesheet.tsv + touch search_presets.tsv + """ +} diff --git a/modules/local/sdrf_pipelines/parse_sdrf/meta.yml b/modules/local/sdrf_pipelines/parse_sdrf/meta.yml new file mode 100644 index 00000000..86de3306 --- /dev/null +++ b/modules/local/sdrf_pipelines/parse_sdrf/meta.yml @@ -0,0 +1,57 @@ +name: sdrf_pipelines_parse_sdrf +description: Converts an SDRF (Sample and Data Relationship Format) file into an + mhcquant-compatible samplesheet and search presets TSV files using sdrf-pipelines. +keywords: + - sdrf + - samplesheet + - proteomics + - immunopeptidomics + - mhcquant +tools: + - "sdrf-pipelines": + description: "A set of tools to validate and convert SDRF files for proteomics + pipelines." + homepage: "https://github.com/bigbio/sdrf-pipelines" + documentation: "https://github.com/bigbio/sdrf-pipelines" + tool_dev_url: "https://github.com/bigbio/sdrf-pipelines" + doi: "10.1021/acs.jproteome.1c00505" + licence: ["Apache-2.0"] + identifier: "" + +input: + - - sdrf: + type: file + description: SDRF file describing the experimental design and sample metadata. + pattern: "*.{tsv,sdrf.tsv}" + ontologies: [] + +output: + samplesheet: + - - "samplesheet.tsv": + type: file + description: mhcquant-compatible samplesheet derived from the SDRF file. 
+ pattern: "samplesheet.tsv" + ontologies: [] + search_presets: + - - "search_presets.tsv": + type: file + description: Search parameter presets derived from the SDRF file, one row + per unique search configuration. + pattern: "search_presets.tsv" + ontologies: [] +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - sdrf-pipelines: + type: string + description: The name of the tool + - "parse_sdrf --version | cut -d ' ' -f 2": + type: eval + description: The expression to obtain the version of the tool + +authors: + - "@jonasscheid" +maintainers: + - "@jonasscheid" diff --git a/nextflow.config b/nextflow.config index 4931c017..46484ee2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -230,6 +230,7 @@ profiles { singularity.runOptions = '--nv' } test { includeConfig 'conf/test.config' } + test_sdrf { includeConfig 'conf/test_sdrf.config' } test_mokapot { includeConfig 'conf/test_mokapot.config' } test_percolator { includeConfig 'conf/test_percolator.config' } test_ionannotator { includeConfig 'conf/test_ionannotator.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index cfba6adc..4e4d89c7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -14,13 +14,9 @@ "properties": { "input": { "type": "string", - "description": "Input raw / mzML files listed in a tsv file (see help for details)", - "help_text": "Use this to specify a sample sheet table including your input raw or mzml files as well as their meta information such as SampleID and Condition. 
For example:\n\n| ID | Sample | Condition | ReplicateFileName |\n| -----|:------------:| ----------:|------------------------------------------:|\n| 1 | MM15_Melanom | A | data/MM15_Melanom_W_1_A_standard.raw |\n| 2 | MM15_Melanom | B | data/MM15_Melanom_W_1_B_standard.raw |\n| 3 | MM17_Melanom | B | data/MM17_Melanom_W_1_B_standard.raw |\n\n```bash\n--input 'path/samples.tsv'\n```", - "format": "file-path", - "exists": true, - "schema": "assets/schema_input.json", - "mimetype": "text/csv", - "pattern": "^\\S+\\.tsv$", + "pattern": "^(PXD\\d{6,}|\\S+\\.sdrf\\.tsv|\\S+\\.tsv)$", + "description": "Input: samplesheet TSV, SDRF file (.sdrf.tsv), or PRIDE accession (PXD...)", + "help_text": "Accepts three input modes:\n\n1. **Samplesheet TSV** (default): Tab-separated file with columns ID, Sample, Condition, ReplicateFileName.\n2. **SDRF file** (`.sdrf.tsv`): Files are downloaded from PRIDE automatically. Requires `--fasta`.\n3. **PRIDE accession** (e.g., `PXD009752`): SDRF and raw files are fetched from PRIDE. 
Requires `--fasta`.", "fa_icon": "fas fa-file" }, "outdir": { diff --git a/subworkflows/local/sdrf_to_samplesheet/main.nf b/subworkflows/local/sdrf_to_samplesheet/main.nf new file mode 100644 index 00000000..7d060ef0 --- /dev/null +++ b/subworkflows/local/sdrf_to_samplesheet/main.nf @@ -0,0 +1,81 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SDRF_TO_SAMPLESHEET: Convert SDRF to mhcquant samplesheet + download files from PRIDE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { SDRF_PIPELINES_PARSE_SDRF } from '../../../modules/local/sdrf_pipelines/parse_sdrf/main' +include { PRIDEPY_FETCH_SDRF } from '../../../modules/local/pridepy/fetch_sdrf/main' +include { PRIDEPY_DOWNLOAD_FILE } from '../../../modules/local/pridepy/download_file/main' + +workflow SDRF_TO_SAMPLESHEET { + + take: + sdrf // path: local SDRF file, or null + pride_id // val: PRIDE accession, or null + + main: + + // If pride_id given but no local SDRF, fetch from PRIDE + if (pride_id && !sdrf) { + PRIDEPY_FETCH_SDRF(pride_id) + ch_sdrf = PRIDEPY_FETCH_SDRF.out.sdrf + } else { + ch_sdrf = channel.fromPath(sdrf, checkIfExists: true) + } + + // Convert SDRF to mhcquant samplesheet + search presets + SDRF_PIPELINES_PARSE_SDRF(ch_sdrf) + + // Resolve PRIDE accession for file downloads + def resolved_accession = pride_id ?: inferPrideAccession(sdrf) + + // Parse samplesheet to get file names for downloading + ch_samplesheet_rows = SDRF_PIPELINES_PARSE_SDRF.out.samplesheet + .splitCsv(header: true, sep: '\t') + .map { row -> + def meta = [ + id: row.ID as int, + sample: row.Sample.toString(), + condition: row.Condition.toString(), + search_preset: row.SearchPreset + ] + [meta, row.ReplicateFileName] + } + + // Download each file from PRIDE + ch_to_download = ch_samplesheet_rows + .map { meta, filename -> [meta, filename, resolved_accession] } + + PRIDEPY_DOWNLOAD_FILE(ch_to_download) + + // Write a 
validated samplesheet with local file paths + ch_samplesheet_file = PRIDEPY_DOWNLOAD_FILE.out.downloaded_file + .map { meta, downloaded_file -> + [meta.id, meta.sample, meta.condition, downloaded_file, meta.search_preset].join('\t') + } + .collectFile(name: 'sdrf_samplesheet.tsv', seed: ['ID', 'Sample', 'Condition', 'ReplicateFileName', 'SearchPreset'].join('\t'), newLine: true) + + emit: + samplesheet = ch_samplesheet_file // path: samplesheet.tsv with local file paths + search_presets = SDRF_PIPELINES_PARSE_SDRF.out.search_presets // path: search_presets.tsv +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +def inferPrideAccession(sdrf_path) { + def name = file(sdrf_path).name + def matcher = (name =~ /PXD\d{6,}/) + if (matcher.find()) { + return matcher.group() + } + error """\ + Could not infer PRIDE accession from SDRF filename: ${name} + Please provide input as a PRIDE accession (e.g., --input PXD009752) + or use an SDRF file named with the PXD accession (e.g., PXD009752.sdrf.tsv) + """.stripIndent() +} diff --git a/subworkflows/local/utils_nfcore_mhcquant_pipeline/main.nf b/subworkflows/local/utils_nfcore_mhcquant_pipeline/main.nf index 42809878..125e62fa 100644 --- a/subworkflows/local/utils_nfcore_mhcquant_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_mhcquant_pipeline/main.nf @@ -17,6 +17,7 @@ include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' include { imNotification } from '../../nf-core/utils_nfcore_pipeline' include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' +include { SDRF_TO_SAMPLESHEET } from '../sdrf_to_samplesheet' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -92,49 +93,80 @@ workflow 
PIPELINE_INITIALISATION { ) // - // Parse search parameter presets TSV into a map + // Detect input type and build samplesheet channels // - def presetsList = samplesheetToList(params.search_presets, "${projectDir}/assets/schema_search_presets.json") - def presetsMap = presetsList.collectEntries { row -> - def meta = (row instanceof List) ? row[0] : row - // nf-schema parses whitespace-only TSV fields as empty list []; convert to empty string - ['fixed_mods', 'variable_mods'].each { key -> - def val = meta[key] - if (val == null || (val instanceof List && val.size() == 0) || val == '') { - meta[key] = '' - } + def inputType = detectInputType(params.input) + + if (inputType == 'sdrf' || inputType == 'pride_id') { + // + // SDRF / PRIDE input mode: fetch SDRF, convert, download files + // + def sdrf_path = (inputType == 'sdrf') ? params.input : null + def pride_id = (inputType == 'pride_id') ? params.input : null + + if (inputType == 'sdrf' && !pride_id) { + def matcher = (file(params.input).name =~ /PXD\d{6,}/) + pride_id = matcher.find() ? matcher.group() : null } - [(meta.preset_name): meta] + + SDRF_TO_SAMPLESHEET(sdrf_path, pride_id) + + ch_samplesheet_file = SDRF_TO_SAMPLESHEET.out.samplesheet + ch_presets_file = SDRF_TO_SAMPLESHEET.out.search_presets + + } else { + // + // Standard samplesheet input mode + // + ch_samplesheet_file = channel.value(params.input) + ch_presets_file = channel.fromPath(params.search_presets, checkIfExists: true) } // - // Create channel from input file provided through params.input + // Build presets map (shared) // + ch_presets_map = ch_presets_file + .map { presets_file -> + samplesheetToList(presets_file.toString(), "${projectDir}/assets/schema_search_presets.json") + .collectEntries { item -> + // samplesheetToList wraps all-meta rows in a list + def row = (item instanceof List) ? 
item[0] : item + // nf-schema parses empty TSV cells as [] instead of ''; normalize for string operations + ['fixed_mods', 'variable_mods'].each { key -> if (!row[key]) row[key] = '' } + [(row.preset_name): row] + } + } - channel - .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) + // + // Parse samplesheet with nf-schema validation, enrich, resolve search params (shared) + // + ch_samplesheet_file + .flatMap { samplesheet_path -> + samplesheetToList(samplesheet_path.toString(), "${projectDir}/assets/schema_input.json") + } .map { meta, file, fasta -> def m = meta + [sample: meta.sample.toString(), condition: meta.condition.toString()] - [m.subMap('sample','condition'), m, file, fasta] + [m.subMap('sample', 'condition'), m, file, fasta] } .tap { ch_input } .groupTuple() - // get number of files per sample-condition - .map { group_meta, metas, files, fastas -> [ group_meta, files.size()] } - .combine( ch_input, by:0 ) + .map { group_meta, metas, files, fastas -> [group_meta, files.size()] } + .combine(ch_input, by: 0) .map { group_meta, group_count, meta, file, fasta -> - def enrichedMeta = meta + ['group_count':group_count, 'spectra':file.baseName.tokenize('.')[0], 'ext':getCustomExtension(file)] - def resolved = resolveSearchParams(enrichedMeta, presetsMap) - [resolved, file, fasta] + def enrichedMeta = meta + [group_count: group_count, spectra: file.baseName.tokenize('.')[0], ext: getCustomExtension(file)] + [enrichedMeta, file, fasta] } .set { ch_samplesheet_raw } - ch_samplesheet = ch_samplesheet_raw.map { meta, file, fasta -> [ meta, file ]} + ch_samplesheet = ch_samplesheet_raw + .combine(ch_presets_map) + .map { meta, file, fasta, presetsMap -> + [resolveSearchParams(meta, presetsMap), file] + } // - // Create channel from the reference_database through params.fasta or from the samplesheet fasta files + // Create channel from the reference_database through params.fasta // - if (params.fasta) { 
channel.fromPath(params.fasta, checkIfExists: true) .map { fasta -> [[id:fasta.getBaseName()], fasta] } @@ -151,10 +183,9 @@ workflow PIPELINE_INITIALISATION { To use the samplesheet FASTA files instead, remove the --fasta parameter. """.stripIndent() } - } else { - // Check if the FASTA files were provided in the samplesheet - ch_fasta = ch_samplesheet_raw.map { meta, file, fasta -> [ groupKey([id: "${meta.sample}_${meta.condition}"], meta.group_count), fasta] } + // Fasta from samplesheet column + ch_fasta = ch_samplesheet_raw.map { meta, file, fasta -> [groupKey([id: "${meta.sample}_${meta.condition}"], meta.group_count), fasta] } ch_fasta .map { meta, fasta -> fasta } .flatten() @@ -166,7 +197,6 @@ workflow PIPELINE_INITIALISATION { 2. Include a 'Fasta' column in your samplesheet '''.stripIndent() } - // Group FASTA files by sample and condition and keep only the first FASTA file per sample-condition ch_fasta .groupTuple() .map { group_meta, fastas -> [group_meta, fastas.first()] } @@ -232,6 +262,20 @@ workflow PIPELINE_COMPLETION { ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// +// Detect input type from the --input parameter value +// +def detectInputType(input) { + def inputStr = input.toString() + if (inputStr =~ /^PXD\d{6,}$/) { + return 'pride_id' + } + if (inputStr.endsWith('.sdrf.tsv')) { + return 'sdrf' + } + return 'samplesheet' +} + // // Resolve a search parameter with priority: CLI params > samplesheet preset > nextflow.config default // diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap index 489747bf..a6669ef5 100644 --- a/tests/default.nf.test.snap +++ b/tests/default.nf.test.snap @@ -26,7 +26,7 @@ }, "OPENMS_PERCOLATORADAPTER": { "PercolatorAdapter": "3.5.0-pre-exported-20251212", - "percolator": "3.07.1, Build Date Mar 13 2025 17:19:27" + "percolator": "3.07.1" }, "OPENMS_PSMFEATUREEXTRACTOR": { "openms": "3.5.0" @@ -384,6 +384,6 @@ "nf-test": "0.9.3", "nextflow": "25.10.3" }, 
- "timestamp": "2026-02-12T11:44:22.343699485" + "timestamp": "2026-04-12T20:25:20.873410846" } } \ No newline at end of file diff --git a/tests/ionannotator.nf.test.snap b/tests/ionannotator.nf.test.snap index 694b9d9b..77dd99cd 100644 --- a/tests/ionannotator.nf.test.snap +++ b/tests/ionannotator.nf.test.snap @@ -27,7 +27,7 @@ }, "OPENMS_PERCOLATORADAPTER": { "PercolatorAdapter": "3.5.0-pre-exported-20251212", - "percolator": "3.07.1, Build Date Mar 13 2025 17:19:27" + "percolator": "3.07.1" }, "OPENMS_PSMFEATUREEXTRACTOR": { "openms": "3.5.0" @@ -405,6 +405,6 @@ "nf-test": "0.9.3", "nextflow": "25.10.3" }, - "timestamp": "2026-02-12T13:24:09.318954712" + "timestamp": "2026-04-12T20:31:52.006375101" } } \ No newline at end of file diff --git a/tests/sdrf.nf.test b/tests/sdrf.nf.test new file mode 100644 index 00000000..6c2d121f --- /dev/null +++ b/tests/sdrf.nf.test @@ -0,0 +1,54 @@ +nextflow_pipeline { + + name "Test pipeline with SDRF input" + script "../main.nf" + tag "pipeline" + profile "test_sdrf" + + test("-profile test_sdrf") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Extract peptidoform column from all TSV files + def peptidoform_data = [] + new File(params.outdir).eachFileRecurse { file -> + if (file.name.endsWith('.tsv')) { + def lines = file.readLines() + if (lines.size() > 0) { + def header = lines[0].split('\t') + def peptidoformIndex = header.findIndexOf { it == 'peptidoform' } + if (peptidoformIndex >= 0) { + def peptidoforms = lines.drop(1).collect { line -> + def fields = line.split('\t') + fields.size() > peptidoformIndex ? 
fields[peptidoformIndex] : '' + }.findAll { it != '' }.sort() + peptidoform_data.add([file.name, peptidoforms]) + } + } + } + } + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_mhcquant_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path, + // Peptidoform data from TSV files + peptidoform_data + ).match() } + ) + } + } +} diff --git a/tests/sdrf.nf.test.snap b/tests/sdrf.nf.test.snap new file mode 100644 index 00000000..560fff3d --- /dev/null +++ b/tests/sdrf.nf.test.snap @@ -0,0 +1,288 @@ +{ + "-profile test_sdrf": { + "content": [ + { + "MS2RESCORE": { + "MS2Rescore": "3.1.5" + }, + "OPENMSTHIRDPARTY_COMETADAPTER": { + "Comet": "2024.01 rev. 1", + "CometAdapter": "3.5.0" + }, + "OPENMS_DECOYDATABASE": { + "openms": "3.5.0" + }, + "OPENMS_IDFILTER_Q_VALUE": { + "openms": "3.5.0" + }, + "OPENMS_IDMASSACCURACY": { + "openms": "3.4.1" + }, + "OPENMS_IDMERGER": { + "openms": "3.5.0" + }, + "OPENMS_PEPTIDEINDEXER": { + "openms": "3.5.0" + }, + "OPENMS_PERCOLATORADAPTER": { + "PercolatorAdapter": "3.5.0-pre-exported-20251212", + "percolator": "3.07.1" + }, + "OPENMS_PSMFEATUREEXTRACTOR": { + "openms": "3.5.0" + }, + "OPENMS_TEXTEXPORTER": { + "openms": "3.5.0" + }, + "PRIDEPY_DOWNLOAD_FILE": { + "pridepy": "0.0.12" + }, + "PYOPENMS_CHROMATOGRAMEXTRACTOR": { + "pyopenms": "3.4.1" + }, + "SDRF_PIPELINES_PARSE_SDRF": { + "sdrf-pipelines": "0.1.2" + }, + "SUMMARIZE_RESULTS": { + "pyopenms": "3.4.1" + }, + "THERMORAWFILEPARSER": { + "thermorawfileparser": "1.4.5" + }, + "Workflow": { + "nf-core/mhcquant": "v3.2.0dev" + } + }, + [ + "intermediate_results", + "intermediate_results/comet", + "intermediate_results/comet/IP_lung_1T_060317_1_pin.tsv", + 
"intermediate_results/rescoring", + "intermediate_results/rescoring/lc2_1_ms2rescore.idXML", + "intermediate_results/rescoring/lc2_1_pout.idXML", + "intermediate_results/rescoring/lc2_1_pout_filtered.idXML", + "intermediate_results/rescoring/lc2_1_psm.idXML", + "lc2_1.tsv", + "multiqc", + "multiqc/ms2rescore", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_chromatogram.txt", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_general_stats.txt", + "multiqc/multiqc_data/multiqc_histogram_mz.txt", + "multiqc/multiqc_data/multiqc_histogram_rt.txt", + "multiqc/multiqc_data/multiqc_histogram_scores.txt", + "multiqc/multiqc_data/multiqc_length_dist.txt", + "multiqc/multiqc_data/multiqc_mass_error.txt", + "multiqc/multiqc_data/multiqc_percolator_barplot.txt", + "multiqc/multiqc_data/multiqc_scores_xcorr.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/percolator_plot.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/chromatogram-cnt.pdf", + "multiqc/multiqc_plots/pdf/chromatogram-log.pdf", + "multiqc/multiqc_plots/pdf/general_stats.pdf", + "multiqc/multiqc_plots/pdf/histogram_mz.pdf", + "multiqc/multiqc_plots/pdf/histogram_rt.pdf", + "multiqc/multiqc_plots/pdf/histogram_scores.pdf", + "multiqc/multiqc_plots/pdf/length_dist.pdf", + "multiqc/multiqc_plots/pdf/mass_error.pdf", + "multiqc/multiqc_plots/pdf/percolator_plot-cnt.pdf", + "multiqc/multiqc_plots/pdf/percolator_plot-pct.pdf", + "multiqc/multiqc_plots/pdf/scores_xcorr.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/chromatogram-cnt.png", + "multiqc/multiqc_plots/png/chromatogram-log.png", + "multiqc/multiqc_plots/png/general_stats.png", + 
"multiqc/multiqc_plots/png/histogram_mz.png", + "multiqc/multiqc_plots/png/histogram_rt.png", + "multiqc/multiqc_plots/png/histogram_scores.png", + "multiqc/multiqc_plots/png/length_dist.png", + "multiqc/multiqc_plots/png/mass_error.png", + "multiqc/multiqc_plots/png/percolator_plot-cnt.png", + "multiqc/multiqc_plots/png/percolator_plot-pct.png", + "multiqc/multiqc_plots/png/scores_xcorr.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/chromatogram-cnt.svg", + "multiqc/multiqc_plots/svg/chromatogram-log.svg", + "multiqc/multiqc_plots/svg/general_stats.svg", + "multiqc/multiqc_plots/svg/histogram_mz.svg", + "multiqc/multiqc_plots/svg/histogram_rt.svg", + "multiqc/multiqc_plots/svg/histogram_scores.svg", + "multiqc/multiqc_plots/svg/length_dist.svg", + "multiqc/multiqc_plots/svg/mass_error.svg", + "multiqc/multiqc_plots/svg/percolator_plot-cnt.svg", + "multiqc/multiqc_plots/svg/percolator_plot-pct.svg", + "multiqc/multiqc_plots/svg/scores_xcorr.svg", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_mhcquant_software_mqc_versions.yml", + "sdrf", + "sdrf/samplesheet.tsv", + "sdrf/search_presets.tsv" + ], + [ + "samplesheet.tsv:md5,4d7d2f1ca8d3def75b05ae8c01c481d8", + "search_presets.tsv:md5,3ba3b5c24f71025bfc1799ea87cd0a12" + ], + [ + [ + "lc2_1.tsv", + [ + "AEDGLKHEY", + "AEDKENYKKF", + "AEDKENYKKFY", + "AEGTLSKKL", + "AEKELHEKF", + "ATAQFKINKK", + "ATEQPLTAK", + "ATEQPLTAKK", + "ATFPGMWER", + "ATNKITIIFK", + "ATQTSVVVK", + "AVATALGLK", + "AVIQVSQIVAR", + "AVQEFGLAR", + "AVQEFGLARFK", + "AVSEGTKAVTK", + "AVTDQTVSK", + "AVTKYTSSK", + "AVVELVTVK", + "DAYPEIEKF", + "DYIDTIWKI", + "EEDPNTHILY", + "EEIEILLRY", + "EELQKIYKTY", + "EERVINEEY", + "EETPVVLQL", + "EEVPKRKW", + "EILKWYLNK", + "FPAGKVPAF", + "GEASRLAHY", + "GEWASGGVRSF", + "GGHSGSSYLNTVQK", + "GIAGSLTNK", + "GSLGFTVTK", + "GSQAGGSQTLK", + "GSSPEQVVRPK", + "GSYNKVFLAK", + "GTFLEGVAK", + "GTIHAGQPVK", + "GTNVNMPVSK", + "GTVDKKMVEK", + "GTVTPPPRLVK", + "HFDPEVVQI", + 
"HPDTGISSKAM", + "HPFIVKLHY", + "HPFLSGAETF", + "HPVEELLDSY", + "HTASPTGMMK", + "HYDAVEAEL", + "IVADHVASY", + "IYLPYLHEW", + "KAFNQGKIFK", + "KEIFLRELI", + "KEIPN(Deamidated)FPTL", + "KPITTGGVTY", + "KSFDTSLIRK", + "KVLHFFNVK", + "KYLADLPTL", + "KYVKVFHKF", + "LPAKDIQTNVY", + "LPALLEKNAM", + "LPPGSVISY", + "LPSSEVVKF", + "LVFPSEIVGK", + "LYDLVTEKM", + "MAIEAQQKF", + "MPVDPNEPTY", + "MPVGMTHGL", + "N(Deamidated)M(Oxidation)TSCHRPICRKEG", + "NAIEDTIFY", + "NESLFGKKY", + "NEYESRSLW", + "NPFEKGDLY", + "NVLDIMVTK", + "NYFHLAVAF", + "NYVPMTPGTFDF", + "PAPPPPPPP", + "PFLPQLQT", + "QELIGKKEY", + "QPFREAIAL", + "QPWEEIKTSY", + "RILFFNTPK", + "RTLQQMLLK", + "RVLDVTKKK", + "RYGPQFTL", + "RYIRDAHTF", + "RYLADLPTL", + "SADRVVAF", + "SAQSFFENK", + "SAVDPVQMK", + "SEADVAQQF", + "SEGTKAVTK", + "SEIAQKQKL", + "SELAEDKENY", + "SEN(Deamidated)ELKKAY", + "SENELKKAY", + "SFSPKTYSF", + "SGSSYLNTVQK", + "SIMEGPLSK", + "SLQDKQ(Deamidated)KGAK", + "SQIEKFQEK", + "SSADGSQPPK", + "SSIQGQWPK", + "SSLYIILKK", + "STDERAYQR", + "STEKIYIRK", + "STLAVTSQK", + "STMGYMMAK", + "STMGYMMAKK", + "SVAELRSQK", + "SVIVQPFSK", + "SVPREPIDRK", + "SVSQPVAQK", + "SYGSVFKAI", + "SYIAAISARF", + "SYQRAFNEF", + "TAADTAVYY", + "TEHYDIPKVSW", + "TENKERKSF", + "TFMDRGFVF", + "TPSSDVLVF", + "TPTGAISQY", + "TVFENLINK", + "TVSETFMSK", + "TYIDKSTQL", + "TYNKVLHFF", + "VPPVFVVSY", + "VPSPAQIMY", + "VPVEEQEEF", + "VSEGTKAVTK", + "VTQSEIAQK", + "VTQSEIAQKQK", + "VVQEPGQVFK", + "VWSDVTPLTF", + "VYEGPELNHAF", + "VYIKHPVSL", + "VYNKVHITL", + "YIKHPVSL", + "YYDVAKQLL", + "YYISPRLTF" + ] + ] + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-04-22T13:14:10.634291398" + } +} \ No newline at end of file diff --git a/tests/search_presets.nf.test.snap b/tests/search_presets.nf.test.snap index 72595e89..4410fbaf 100644 --- a/tests/search_presets.nf.test.snap +++ b/tests/search_presets.nf.test.snap @@ -26,7 +26,7 @@ }, "OPENMS_PERCOLATORADAPTER": { "PercolatorAdapter": 
"3.5.0-pre-exported-20251212", - "percolator": "3.07.1, Build Date Mar 13 2025 17:19:27" + "percolator": "3.07.1" }, "OPENMS_PSMFEATUREEXTRACTOR": { "openms": "3.5.0" diff --git a/tests/speclib.nf.test.snap b/tests/speclib.nf.test.snap index 7788ab52..2c4e1612 100644 --- a/tests/speclib.nf.test.snap +++ b/tests/speclib.nf.test.snap @@ -36,7 +36,7 @@ }, "OPENMS_PERCOLATORADAPTER": { "PercolatorAdapter": "3.5.0-pre-exported-20251212", - "percolator": "3.07.1, Build Date Mar 13 2025 17:19:27" + "percolator": "3.07.1" }, "OPENMS_PSMFEATUREEXTRACTOR": { "openms": "3.5.0" diff --git a/tests/test_single_quant.nf.test.snap b/tests/test_single_quant.nf.test.snap index 841f5149..12b2533a 100644 --- a/tests/test_single_quant.nf.test.snap +++ b/tests/test_single_quant.nf.test.snap @@ -26,7 +26,7 @@ }, "OPENMS_PERCOLATORADAPTER": { "PercolatorAdapter": "3.5.0-pre-exported-20251212", - "percolator": "3.07.1, Build Date Mar 13 2025 17:19:27" + "percolator": "3.07.1" }, "OPENMS_PSMFEATUREEXTRACTOR": { "openms": "3.5.0"