diff --git a/CITATIONS.md b/CITATIONS.md index d1fde135..922c51be 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -16,6 +16,18 @@ ## Pipeline tools +- [SeqKit2](https://pubmed.ncbi.nlm.nih.gov/38898985/) + + > Shen W, Sipos B, Zhao L. SeqKit2: A Swiss army knife for sequence and alignment processing. Imeta. 2024 Apr 5;3(3):e191. doi: 10.1002/imt2.191. PMID: 38898985; PMCID: PMC11183193. + +- [Flexiplex](https://pubmed.ncbi.nlm.nih.gov/38379414/) + + > Cheng O, Ling MH, Wang C, Wu S, Ritchie ME, Göke J, Amin N, Davidson NM. Flexiplex: a versatile demultiplexer and search tool for omics data. Bioinformatics. 2024 Mar 4;40(3):btae102. doi: 10.1093/bioinformatics/btae102. PMID: 38379414; PMCID: PMC10914444. + +- [Flexiformatter](https://github.com/ljwharbers/flexiformatter) + + > Luuk Harbers. (2025). ljwharbers/flexiformatter: 1.0.6 (1.0.6). Zenodo. https://doi.org/10.5281/zenodo.18098066 + - [BLAZE](https://pubmed.ncbi.nlm.nih.gov/37024980/) > You Y, Prawer YDJ, De Paoli-Iseppi R, Hunt CPJ, Parish CL, Shim H, Clark MB. Identification of cell barcodes from long-read single-cell RNA-seq with BLAZE. Genome Biol. 2023 Apr 6;24(1):66. doi: 10.1186/s13059-023-02907-y. PMID: 37024980; PMCID: PMC10077662. @@ -40,9 +52,9 @@ > De Coster W, D'Hert S, Schultz DT, Cruts M, Van Broeckhoven C. NanoPack: visualizing and processing long-read sequencing data. Bioinformatics 2018 Aug 1; 34(15):2666-9 doi:10.1093/bioinformatics/bty149. PubMed PMID: 29547981; PubMed Central PMCID: PMC6061794. -- [Nanofilt](https://pubmed.ncbi.nlm.nih.gov/29547981/) +- [Chopper](https://pubmed.ncbi.nlm.nih.gov/37171891/) - > De Coster W, D'Hert S, Schultz DT, Cruts M, Van Broeckhoven C. NanoPack: visualizing and processing long-read sequencing data. Bioinformatics 2018 Aug 1; 34(15):2666-9 doi:10.1093/bioinformatics/bty149. PubMed PMID: 29547981; PubMed Central PMCID: PMC6061794. + > De Coster W, Rademakers R. NanoPack2: population-scale evaluation of long-read sequencing data. Bioinformatics. 
2023 May 4;39(5):btad311. doi: 10.1093/bioinformatics/btad311. PMID: 37171891; PMCID: PMC10196664. - [NanoPlot](https://pubmed.ncbi.nlm.nih.gov/29547981/) diff --git a/assets/schema_input.json b/assets/schema_input.json index 33400466..16ef5e59 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -23,6 +23,12 @@ }, "cell_count": { "type": "integer" + }, + "type": { + "type": "string", + "enum": ["dna", "cdna"], + "default": "cdna", + "errorMessage": "Type must be either 'dna' or 'cdna'. Default is 'cdna'." } }, "required": ["sample", "fastq", "cell_count"] diff --git a/assets/whitelist/3M-3pgex-may-2023_TRU.txt.zip b/assets/whitelist/3M-3pgex-may-2023_TRU.txt.gz similarity index 56% rename from assets/whitelist/3M-3pgex-may-2023_TRU.txt.zip rename to assets/whitelist/3M-3pgex-may-2023_TRU.txt.gz index 66f6ad26..94187ead 100644 Binary files a/assets/whitelist/3M-3pgex-may-2023_TRU.txt.zip and b/assets/whitelist/3M-3pgex-may-2023_TRU.txt.gz differ diff --git a/assets/whitelist/3M-5pgex-jan-2023.txt.zip b/assets/whitelist/3M-5pgex-jan-2023.txt.gz similarity index 56% rename from assets/whitelist/3M-5pgex-jan-2023.txt.zip rename to assets/whitelist/3M-5pgex-jan-2023.txt.gz index 4e74da34..2976464a 100644 Binary files a/assets/whitelist/3M-5pgex-jan-2023.txt.zip and b/assets/whitelist/3M-5pgex-jan-2023.txt.gz differ diff --git a/assets/whitelist/3M-february-2018.zip b/assets/whitelist/3M-february-2018.txt.gz similarity index 62% rename from assets/whitelist/3M-february-2018.zip rename to assets/whitelist/3M-february-2018.txt.gz index fbd3ae31..ecaff14d 100644 Binary files a/assets/whitelist/3M-february-2018.zip and b/assets/whitelist/3M-february-2018.txt.gz differ diff --git a/assets/whitelist/737K-august-2016.txt.gz b/assets/whitelist/737K-august-2016.txt.gz new file mode 100644 index 00000000..42de71f2 Binary files /dev/null and b/assets/whitelist/737K-august-2016.txt.gz differ diff --git a/assets/whitelist/737K-august-2016.txt.zip 
b/assets/whitelist/737K-august-2016.txt.zip deleted file mode 100644 index c6e33321..00000000 Binary files a/assets/whitelist/737K-august-2016.txt.zip and /dev/null differ diff --git a/assets/whitelist/cellranger_arc_atac.737K-arc-v1.txt.gz b/assets/whitelist/cellranger_arc_atac.737K-arc-v1.txt.gz new file mode 100644 index 00000000..4fcb9a01 Binary files /dev/null and b/assets/whitelist/cellranger_arc_atac.737K-arc-v1.txt.gz differ diff --git a/assets/whitelist/cellranger_arc_rna.737K-arc-v1.txt.gz b/assets/whitelist/cellranger_arc_rna.737K-arc-v1.txt.gz new file mode 100644 index 00000000..7da0fda0 Binary files /dev/null and b/assets/whitelist/cellranger_arc_rna.737K-arc-v1.txt.gz differ diff --git a/bin/generate_read_counts.sh b/bin/generate_read_counts.sh index 3b6af982..863bafa5 100755 --- a/bin/generate_read_counts.sh +++ b/bin/generate_read_counts.sh @@ -1,19 +1,17 @@ - get_fastqc_counts() { fastqc_file=$1 - counts=$(unzip -p ${fastqc_file} $(basename ${fastqc_file} .zip)/fastqc_data.txt | \ + counts=$(unzip -p "${fastqc_file}" "$(basename "${fastqc_file}" .zip)/fastqc_data.txt" | \ grep 'Total Sequences' | \ cut -f2 -d$'\t') - echo $counts - + echo "$counts" } get_nanoplot_counts() { nanoplot_file=$1 - counts=$(grep 'Number of reads' $nanoplot_file | awk '{print $NF}' | cut -f1 -d'.' | sed 's/,//g') - echo $counts + counts=$(grep 'Number of reads' "$nanoplot_file" | awk '{print $NF}' | cut -f1 -d'.' | sed 's/,//g') + echo "$counts" } output="" @@ -22,27 +20,23 @@ input="" while [[ $# -gt 0 ]] do flag=$1 - case "${flag}" in --input) input=$2; shift;; --output) output=$2; shift;; - *) echo "Unknown option $1 ${reset}" && exit 1 + *) echo "Unknown option $1" && exit 1 esac shift done -header="" -data="" - header="sample,base_fastq_counts,trimmed_read_counts,extracted_read_counts,corrected_read_counts" -echo "$header" > $output +echo "$header" > "$output" -for sample_name in $(for file in $(readlink -f $input)/*.tsv; do basename $file; done | cut -f1 -d'.' 
| sort -u) -do - ############### - # INPUT_FILES # - ############### +# Collect all sample names from both barcode file types +sample_names=$(find "$input" -type f -name "*.corrected_bc_umi.tsv" -o -name "*_known_barcodes.txt" | \ + sed -E 's|.*/||' | sed -E 's/_known_barcodes\.txt$//; s/\.corrected_bc_umi\.tsv$//' | sort -u) +for sample_name in $sample_names +do raw_fastqc="${sample_name}.raw_fastqc.zip" raw_nanoplot="${sample_name}.raw_NanoStats.txt" @@ -52,18 +46,18 @@ do extract_fastqc="${sample_name}.extracted_fastqc.zip" extract_nanoplot="${sample_name}.extracted_NanoStats.txt" - correct_csv="${sample_name}.corrected_bc_umi.tsv" - data="$(basename $sample_name)" + corrected_tsv="${sample_name}.corrected_bc_umi.tsv" + known_barcodes="${sample_name}_known_barcodes.txt" + + data="$(basename "$sample_name")" #################### # RAW FASTQ COUNTS # #################### - if [[ -s "$raw_fastqc" ]] - then + if [[ -s "$raw_fastqc" ]]; then fastqc_counts=$(get_fastqc_counts "$raw_fastqc") data="$data,$fastqc_counts" - elif [[ -s "$raw_nanoplot" ]] - then + elif [[ -s "$raw_nanoplot" ]]; then nanoplot_counts=$(get_nanoplot_counts "$raw_nanoplot") data="$data,$nanoplot_counts" else @@ -73,12 +67,10 @@ do ############### # TRIM COUNTS # ############### - if [[ -s "$trim_fastqc" ]] - then + if [[ -s "$trim_fastqc" ]]; then trim_counts=$(get_fastqc_counts "$trim_fastqc") data="$data,$trim_counts" - elif [[ -s "$trim_nanoplot" ]] - then + elif [[ -s "$trim_nanoplot" ]]; then nanoplot_counts=$(get_nanoplot_counts "$trim_nanoplot") data="$data,$nanoplot_counts" else @@ -88,12 +80,10 @@ do ##################### # PREEXTRACT COUNTS # ##################### - if [[ -s "$extract_fastqc" ]] - then + if [[ -s "$extract_fastqc" ]]; then extract_counts=$(get_fastqc_counts "$extract_fastqc") data="$data,$extract_counts" - elif [[ -s "$extract_nanoplot" ]] - then + elif [[ -s "$extract_nanoplot" ]]; then nanoplot_counts=$(get_nanoplot_counts "$extract_nanoplot") 
data="$data,$nanoplot_counts" else @@ -103,12 +93,15 @@ do ################## # CORRECT COUNTS # ################## - if [[ -s $correct_csv ]] - then - correct_counts=$(cut -f6 $correct_csv | awk '{if ($0 != "") {print $0}}' | wc -l) + if [[ -s "$known_barcodes" ]]; then + correct_sum=$(awk -F'\t' '{if ($2 != "") sum += $2} END {print sum}' "$known_barcodes") + data="$data,$correct_sum" + elif [[ -s "$corrected_tsv" ]]; then + correct_counts=$(cut -f6 "$corrected_tsv" | awk '{if ($0 != "") print $0}' | wc -l) data="$data,$correct_counts" else data="$data," fi - echo "$data" >> $output + + echo "$data" >> "$output" done diff --git a/conf/modules.config b/conf/modules.config index 2146507d..66d19e5e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,9 +35,9 @@ process { if (!params.skip_qc && !params.skip_fastqc) { process { withName: '.*:FASTQC_NANOPLOT_PRE_TRIM:FASTQC' { - ext.prefix = { "${meta.id}.raw" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.raw" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/fastqc/pre_trim" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/fastqc/pre_trim" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -45,9 +45,9 @@ if (!params.skip_qc && !params.skip_fastqc) { if (!params.skip_trimming) { withName: '.*:FASTQC_NANOPLOT_POST_TRIM:FASTQC' { - ext.prefix = { "${meta.id}.trimmed" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.trimmed" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/fastqc/post_trim" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/fastqc/post_trim" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -55,9 +55,9 @@ if (!params.skip_qc && !params.skip_fastqc) { } withName: '.*:FASTQC_NANOPLOT_POST_EXTRACT:FASTQC' { - ext.prefix = { "${meta.id}.extracted" } + ext.prefix = { "${meta.id}${meta.type ? 
".${meta.type}" : ""}.extracted" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/fastqc/post_extract" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/fastqc/post_extract" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -69,9 +69,9 @@ if (!params.skip_qc && !params.skip_fastqc) { if (!params.skip_qc && !params.skip_nanoplot) { process { withName: '.*:FASTQC_NANOPLOT_PRE_TRIM:NANOPLOT' { - ext.prefix = { "${meta.id}.raw" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.raw" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/nanoplot/pre_trim/" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/nanoplot/pre_trim/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -79,9 +79,9 @@ if (!params.skip_qc && !params.skip_nanoplot) { if (!params.skip_trimming) { withName: '.*:FASTQC_NANOPLOT_POST_TRIM:NANOPLOT' { - ext.prefix = { "${meta.id}.trimmed" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.trimmed" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/nanoplot/post_trim/" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/nanoplot/post_trim/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -89,9 +89,9 @@ if (!params.skip_qc && !params.skip_nanoplot) { } withName: '.*:FASTQC_NANOPLOT_POST_EXTRACT:NANOPLOT' { - ext.prefix = { "${meta.id}.extracted" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.extracted" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/nanoplot/post_extract/" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/nanoplot/post_extract/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] @@ -102,9 +102,18 @@ if (!params.skip_qc && !params.skip_nanoplot) { // NANOCOMP if (!params.skip_qc && !params.skip_fastq_nanocomp) { process { - withName: '.*:NANOCOMP_FASTQ' { + withName: '.*:NANOCOMP_FASTQ_CDNA' { + ext.prefix = { "cdna_" } publishDir = [ - path: { "${params.outdir}/batch_qcs/nanocomp/fastq" }, + path: { "${params.outdir}/batch_qcs/nanocomp/fastq/cdna" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: '.*:NANOCOMP_FASTQ_DNA' { + ext.prefix = { "dna_" } + publishDir = [ + path: { "${params.outdir}/batch_qcs/nanocomp/fastq/dna" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -117,9 +126,9 @@ if (!params.skip_qc && !params.skip_fastq_nanocomp) { if (!params.skip_qc && !params.skip_toulligqc) { process { withName: '.*:FASTQC_NANOPLOT_PRE_TRIM:TOULLIGQC' { - ext.prefix = { "${meta.id}.raw_" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.raw_" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/toulligqc/pre_trim/" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/toulligqc/pre_trim/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -127,9 +136,9 @@ if (!params.skip_qc && !params.skip_toulligqc) { if (!params.skip_trimming) { withName: '.*:FASTQC_NANOPLOT_POST_TRIM:TOULLIGQC' { - ext.prefix = { "${meta.id}.trimmed_" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.trimmed_" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/toulligqc/post_trim/" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/toulligqc/post_trim/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] @@ -137,9 +146,9 @@ if (!params.skip_qc && !params.skip_toulligqc) { } withName: '.*:FASTQC_NANOPLOT_POST_EXTRACT:TOULLIGQC' { - ext.prefix = { "${meta.id}.extracted_" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.extracted_" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/toulligqc/post_extract/" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/toulligqc/post_extract/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -152,9 +161,20 @@ if (!params.skip_qc){ process { withName:'.*:BAM_SORT_STATS_SAMTOOLS_CORRECTED:BAM_STATS_SAMTOOLS:.*' { - ext.prefix = { "${meta.id}.corrected" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.corrected" } + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/samtools/corrected" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + + process { + withName:'.*:BAM_SORT_STATS_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' { + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.dna" } publishDir = [ - path: { "${params.outdir}/${meta.id}/qc/samtools/corrected" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/qc/samtools/corrected" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -246,6 +266,19 @@ process { } } +// SEQKIT SPLIT2 + +process { + withName: SEQKIT_SPLIT2 { + if (params.split_amount > 0){ + ext.args = "--by-size ${params.split_amount}" + } + publishDir = [ + enabled: false + ] + } +} + if (params.split_amount > 0) { process { withName: '.*:SPLIT_FILE_BC_FASTQ' { @@ -265,7 +298,7 @@ if (params.split_amount > 0) { process { withName: '.*:CAT_CAT_PREEXTRACT' { - ext.prefix = { "${meta.id}_filtered.fastq" } + ext.prefix = { "${meta.id}${meta.type ? 
".${meta.type}" : ""}_filtered.fastq" } publishDir = [ enabled: false ] @@ -274,7 +307,7 @@ if (params.split_amount > 0) { process { withName: '.*:CAT_CAT_BARCODE' { - ext.prefix = { "${meta.id}.corrected_bc_umi.tsv" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.corrected_bc_umi.tsv" } publishDir = [ enabled: false ] @@ -312,6 +345,22 @@ if (!params.skip_trimming) { } } + // CHOPPER + process { + withName: '.*:CHOPPER' { + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}_trimmed" } + ext.args2 = { + [ + params.min_length ? "--minlength ${params.min_length}" : "", + params.min_q_score ? "--quality ${params.min_q_score}" : "" + ].join(' ').trim() + } + publishDir = [ + enabled: false + ] + } + } + // NANOFILT if ( !params.skip_trimming ){ process { @@ -335,16 +384,159 @@ if (!params.skip_trimming) { // BARCODE DETECTION // /////////////////////// +// FLEXIPLEX + +// Flexiplex discovery +if ( params.split_amount > 0 ) { + process { + withName: '.*DEMULTIPLEX_FLEXIPLEX_DNA:FLEXIPLEX_DISCOVERY' { + ext.args = { + [ + params.custom_flexiplex_barcode_dna ?: params.barcode_format == "10X_multiome" ? + "-x 'ACCGAGATCTACAC' -b '????????????????' -x 'CGCGTCTGTCGTCGGCAGCGTCAGATGTGTATAAGAGACAG' -f 8" : + "", + ].join(' ').trim() + } + publishDir = [ + enabled: false + ] + } + } +} else { + process { + withName: '.*DEMULTIPLEX_FLEXIPLEX_DNA:FLEXIPLEX_DISCOVERY' { + ext.args = { + [ + params.custom_flexiplex_barcode_dna ?: params.barcode_format == "10X_multiome" ? + "-x 'ACCGAGATCTACAC' -b '????????????????' -x 'CGCGTCTGTCGTCGGCAGCGTCAGATGTGTATAAGAGACAG' -f 8" : + "", + ].join(' ').trim() + } + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/flexiplex/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + } +} + + +// Flexiplex assign +process { + withName: '.*DEMULTIPLEX_FLEXIPLEX_DNA:FLEXIPLEX_ASSIGN' { + ext.args = { + [ + params.custom_flexiplex_barcode_dna ?: params.barcode_format == "10X_multiome" ? + "-x 'ACCGAGATCTACAC' -b '????????????????' -x 'CGCGTCTGTCGTCGGCAGCGTCAGATGTGTATAAGAGACAG' -f 8" : + "", + ].join(' ').trim() + } + publishDir = [ + enabled: false + ] + } +} + +// Flexiplex discovery +if ( params.split_amount > 0 ) { + process { + withName: '.*DEMULTIPLEX_FLEXIPLEX_CDNA:FLEXIPLEX_DISCOVERY' { + ext.args = { + [ + params.custom_flexiplex_barcode_cdna ?: params.barcode_format == "10X_multiome" ? + "-d 10x3v3" : + params.barcode_format == "10x5v3" ? + "-x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ?????????? -x TTTCTTATATGGG -f 8 -e 2" : + "-d 10x${params.barcode_format.minus('10X_')}", + ].join(' ').trim() + } + publishDir = [ + enabled: false + ] + } + } +} else { + process { + withName: '.*DEMULTIPLEX_FLEXIPLEX_CDNA:FLEXIPLEX_DISCOVERY' { + ext.args = { + [ + params.custom_flexiplex_barcode_cdna ?: params.barcode_format == "10X_multiome" ? + "-d 10x3v3" : + params.barcode_format == "10x5v3" ? + "-x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ?????????? -x TTTCTTATATGGG -f 8 -e 2" : + "-d 10x${params.barcode_format.minus('10X_')}", + ].join(' ').trim() + } + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/flexiplex/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} + +// Flexiplex assign +process { + withName: '.*DEMULTIPLEX_FLEXIPLEX_CDNA:FLEXIPLEX_ASSIGN' { + ext.args = { + [ + params.custom_flexiplex_barcode_cdna ?: params.barcode_format == "10X_multiome" ? + "-d 10x3v3" : + params.barcode_format == "10x5v3" ? + "-x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ?????????? 
-x TTTCTTATATGGG -f 8 -e 2" : + "-d 10x${params.barcode_format.minus('10X_')}", + ].join(' ').trim() + } + publishDir = [ + enabled: false + ] + } +} + +// Merge flexiplex barcode files +process { + withName: '.*MERGE_BARCODES' { + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/flexiplex/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } +} + +// Flexiplex filter +process { + withName: '.*:FLEXIPLEX_FILTER' { + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}${meta.part ? "_part_${meta.part}" : ""}" } + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/flexiplex/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } +} + +// Flexiformatter +process { + withName: '.*:FLEXIFORMATTER' { + publishDir = [ + enabled: false + ] + } +} + // BLAZE process { withName: '.*:BLAZE' { ext.args = { [ - "--kit-version ${params.barcode_format.minus('10X_')}" + params.barcode_format == "10X_multiome" ? "--kit-version 3v3" : "--kit-version ${params.barcode_format.minus('10X_')}" ].join(' ').trim() } publishDir = [ - path: { "${params.outdir}/${meta.id}/blaze/" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/blaze/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -354,7 +546,8 @@ process { // PREEXTRACT_FASTQ process { withName: '.*:PREEXTRACT_FASTQ' { - ext.prefix = { params.split_amount <= 0 ? "${meta.id}" : "${reads}".toString().replace('.fastq', '') } + ext.prefix = { params.split_amount <= 0 ? "${meta.id}${meta.type ? ".${meta.type}" : ""}" : "${reads}".toString().replace('.fastq', '') } + ext.args = params.barcode_format == "10X_multiome" ? 
"-f 10X_3v3" : "-f ${params.barcode_format}" publishDir = [ enabled: false ] @@ -364,7 +557,7 @@ process { // CORRECT_BARCODES process { withName: '.*:CORRECT_BARCODES' { - ext.prefix = { params.split_amount <= 0 ? "${meta.id}" : "${bc_info}".toString().replace('.extracted.putative_bc_umi.tsv', '') } + ext.prefix = { params.split_amount <= 0 ? "${meta.id}${meta.type ? ".${meta.type}" : ""}" : "${bc_info}".toString().replace('.extracted.putative_bc_umi.tsv', '') } publishDir = [ enabled: false ] @@ -375,7 +568,7 @@ process { process { withName: '.*PROCESS_LONGREAD_SCRNA_GENOME.*:TAG_BARCODES' { publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/bam/barcode_tagged" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/bam/barcode_tagged" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -384,7 +577,7 @@ process { process { withName: '.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:TAG_BARCODES' { publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/bam/barcode_tagged" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/bam/barcode_tagged" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -393,9 +586,9 @@ process { // SAMTOOLS_INDEX process { - withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:SAMTOOLS_INDEX_TAGGED' { + withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:SAMTOOLS_INDEX' { publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/bam/barcode_tagged" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/bam/barcode_tagged" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] @@ -405,7 +598,7 @@ process { process { withName:'.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:SAMTOOLS_INDEX_TAGGED' { publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/bam/barcode_tagged" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/bam/barcode_tagged" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -415,7 +608,7 @@ process { // SAMTOOLS_FLAGSTAT process { withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:SAMTOOLS_FLAGSTAT_TAGGED' { - ext.prefix = { "${meta.id}.genome.tagged" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.genome.tagged" } publishDir = [ enabled: false ] @@ -423,16 +616,17 @@ process { } process { withName:'.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:SAMTOOLS_FLAGSTAT_TAGGED' { - ext.prefix = { "${meta.id}.transcriptome.tagged" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcriptome.tagged" } publishDir = [ enabled: false ] } } -///////////////////// -// ALIGN_LONGREADS // -///////////////////// + +////////////////////////// +// ALIGN_LONGREADS_CNDA // +////////////////////////// // MINIMAP2_INDEX if (!params.skip_save_minimap2_index) { @@ -470,6 +664,21 @@ if (!params.skip_save_minimap2_index) { ] } } + process { + withName:'.*ALIGN_DEDUPLICATE_DNA:MINIMAP2_INDEX' { + ext.args = { + [ + "-ax map-ont", + params.save_transcript_secondary_alignment == false ? "--secondary=no " : "--secondary=yes " + ].join(' ').trim() + } + publishDir = [ + path: { "${params.outdir}/references/dna/minimap_index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } } // MINIMAP @@ -505,11 +714,25 @@ process { } } +process { + withName:'.*ALIGN_DEDUPLICATE_DNA:MINIMAP2_ALIGN' { + ext.args = { + [ + "-ax map-ont", + params.save_transcript_secondary_alignment == false ? 
"--secondary=no " : "--secondary=yes ", + ].join(' ').trim() + } + publishDir = [ + enabled: false + ] + } +} + // SAMTOOLS_VIEW process { withName:'.*:SAMTOOLS_FILTER_MAPPED' { ext.args = "-b -F 4" - ext.prefix = { "${meta.id}.mapped_only" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.mapped_only" } publishDir = [ enabled: false ] @@ -519,7 +742,7 @@ process { process { withName:'.*:SAMTOOLS_FILTER_DEDUP' { ext.args = "-b -F 0x0400" - ext.prefix = { "${meta.id}.dedup_filtered" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.dedup_filtered" } publishDir = [ enabled: false ] @@ -529,9 +752,9 @@ process { // SAMTOOLS_SORT process { withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' { - ext.prefix = { "${meta.id}.genome.sorted" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.genome.sorted" } publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/bam/original" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/bam/original" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -539,9 +762,9 @@ process { } process { withName:'.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' { - ext.prefix = { "${meta.id}.transcript.sorted" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcript.sorted" } publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/bam/original" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/bam/original" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -550,9 +773,9 @@ process { process { withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:BAM_SORT_STATS_SAMTOOLS_FILTERED:SAMTOOLS_SORT' { - ext.prefix = { "${meta.id}.genome_mapped_only.sorted" } + ext.prefix = { "${meta.id}${meta.type ? 
".${meta.type}" : ""}.genome_mapped_only.sorted" } publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/bam/mapped_only" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/bam/mapped_only" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -561,9 +784,9 @@ process { process { withName:'.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:BAM_SORT_STATS_SAMTOOLS_FILTERED:SAMTOOLS_SORT' { - ext.prefix = { "${meta.id}.transcript_mapped_only.sorted" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcript_mapped_only.sorted" } publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/bam/mapped_only" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/bam/mapped_only" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -574,7 +797,7 @@ process { process { withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' { publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/bam/original" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/bam/original" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -584,7 +807,7 @@ process { process { withName:'.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' { publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/bam/original" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/bam/original" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] @@ -594,7 +817,7 @@ process { process { withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS_FILTERED:SAMTOOLS_INDEX' { publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/bam/mapped_only" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/bam/mapped_only" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -604,7 +827,7 @@ process { process { withName:'.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS_FILTERED:SAMTOOLS_INDEX' { publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/bam/mapped_only" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/bam/mapped_only" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -616,9 +839,9 @@ if (!params.skip_qc) { // SAMTOOLS FLAGSTAT/STAT/IDXSTAT process { withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' { - ext.prefix = { "${meta.id}.genome.minimap" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.genome.minimap" } publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/qc/samtools/minimap" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/qc/samtools/minimap" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -627,9 +850,9 @@ if (!params.skip_qc) { process { withName:'.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' { - ext.prefix = { "${meta.id}.transcriptome.minimap" } + ext.prefix = { "${meta.id}${meta.type ? 
".${meta.type}" : ""}.transcriptome.minimap" } publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/qc/samtools/minimap" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/qc/samtools/minimap" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -638,9 +861,9 @@ if (!params.skip_qc) { process { withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS_FILTERED:BAM_STATS_SAMTOOLS:.*' { - ext.prefix = { "${meta.id}.genome.mapped_only" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.genome.mapped_only" } publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/qc/samtools/mapped_only" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/qc/samtools/mapped_only" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -649,9 +872,9 @@ if (!params.skip_qc) { process { withName:'.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:ALIGN_LONGREADS:BAM_SORT_STATS_SAMTOOLS_FILTERED:BAM_STATS_SAMTOOLS:.*' { - ext.prefix = { "${meta.id}.transcriptome.mapped_only" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcriptome.mapped_only" } publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/qc/samtools/mapped_only" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/qc/samtools/mapped_only" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -663,7 +886,7 @@ if (!params.skip_qc) { process { withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:RSEQC_READDISTRIBUTION' { publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/qc/rseqc" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/qc/rseqc" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] @@ -673,7 +896,7 @@ if (!params.skip_qc) { process { withName:'.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:RSEQC_READDISTRIBUTION' { publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/qc/rseqc" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/qc/rseqc" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -685,9 +908,9 @@ if (!params.skip_qc) { if (!params.skip_bam_nanocomp) { process { withName: '.*PROCESS_LONGREAD_SCRNA_GENOME.*:ALIGN_LONGREADS:NANOCOMP' { - ext.prefix = { "${meta.id}.genome" } + ext.prefix = { "genome_" } publishDir = [ - path: { "${params.outdir}/batch_qcs/genome/nanocomp/bam" }, + path: { "${params.outdir}/batch_qcs/nanocomp/bam/cdna/genome" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -696,17 +919,83 @@ if (!params.skip_qc) { process { withName: '.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:ALIGN_LONGREADS:NANOCOMP' { - ext.prefix = { "${meta.id}.transcriptome" } + ext.prefix = { "transcriptome_" } publishDir = [ - path: { "${params.outdir}/batch_qcs/transcriptome/nanocomp/bam" }, + path: { "${params.outdir}/batch_qcs/nanocomp/bam/cdna/transcriptome" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } } + process { + withName: '.*ALIGN_DEDUPLICATE_DNA:NANOCOMP' { + ext.prefix = { "dna_" } + publishDir = [ + path: { "${params.outdir}/batch_qcs/nanocomp/bam/dna" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } +} + +///////////////////// +// ALIGN DEDUP DNA // +///////////////////// + +process {} + +process { + withName: '.*ALIGN_DEDUPLICATE_DNA:PICARD_MARKDUPLICATES' { + ext.prefix = { "${meta.id}${meta.type ? 
".${meta.type}" : ""}.dna.dedup" } + ext.args = { + [ + "--BARCODE_TAG 'CB'", + "--TAG_DUPLICATE_SET_MEMBERS true", + "--TAGGING_POLICY 'All'" + ].join(' ').trim() + } + publishDir = [ + enabled: false + ] + } +} + +if (!params.skip_dedup) { + process { + withName: '.*ALIGN_DEDUPLICATE_DNA:BAM_SORT_STATS_SAMTOOLS' { + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/bam/dedup" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} else { + process { + withName: '.*ALIGN_DEDUPLICATE_DNA:BAM_SORT_STATS_SAMTOOLS' { + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/bam/mapped_only" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} + +process { + withName: '.*ALIGN_DEDUPLICATE_DNA:NANOCOMP' { + ext.prefix = { "dna_" } + publishDir = [ + path: { "${params.outdir}/batch_qcs/nanocomp/bam/dna" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } } + //////////////////// // BAM PROCESSING // //////////////////// @@ -717,7 +1006,7 @@ if (!params.skip_qc) { process { withName:'.*:BAM_SORT_STATS_SAMTOOLS_CORRECTED:SAMTOOLS_SORT' { publishDir = [ - path: { "${params.outdir}/${meta.id}/bam/corrected" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/bam/corrected" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -727,9 +1016,31 @@ process { if (!params.skip_dedup) { process { withName:'.*:BAM_SORT_STATS_SAMTOOLS_MERGED:SAMTOOLS_SORT' { - ext.prefix = { "${meta.id}.merged.sorted" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.merged.sorted" } + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/bam/dedup" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + } +} + +if (!params.skip_dedup) { + process { + withName: '.*ALIGN_DEDUPLICATE_DNA:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' { + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/bam/dedup" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} else { + process { + withName: '.*ALIGN_DEDUPLICATE_DNA:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' { publishDir = [ - path: { "${params.outdir}/${meta.id}/bam/dedup" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/bam/mapped_only" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -737,19 +1048,39 @@ if (!params.skip_dedup) { } } + // SAMTOOLS_INDEX process { withName:'.*:BAM_SORT_STATS_SAMTOOLS_CORRECTED:SAMTOOLS_INDEX' { publishDir = [ - path: { "${params.outdir}/${meta.id}/bam/corrected" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/bam/corrected" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } } - - +if (!params.skip_dedup) { + process { + withName: '.*ALIGN_DEDUPLICATE_DNA:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/bam/dedup" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} else { + process { + withName: '.*ALIGN_DEDUPLICATE_DNA:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/${meta.id}/${meta.type}/bam/mapped_only" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} ///////////////////////////// // UMI DEDUPLICATION SPLIT // @@ -757,7 +1088,7 @@ process { process { withName: '.*:GROUP_TRANSCRIPTS' { - ext.prefix = { "${meta.id}.sorted" } + ext.prefix = { "${meta.id}${meta.type ? 
".${meta.type}" : ""}.sorted" } publishDir = [ enabled: false ] @@ -790,10 +1121,11 @@ process { process { withName: '.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:UMITOOLS_DEDUP' { - ext.prefix = { "${meta.id}.transcriptome.umi_dedup" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcriptome.umi_dedup" } ext.args = { [ - '--per-cell' + '--per-cell', + params.demux_tool_cdna == "flexiplex" ? "--extract-umi-method tag --umi-tag UR --cell-tag CB" : "" ].join(' ').trim() } publishDir = [ @@ -804,7 +1136,7 @@ process { process { withName: '.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:PICARD_MARKDUPLICATES' { - ext.prefix = { "${meta.id}.transcriptome.dedup" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcriptome.dedup" } ext.args = { [ "--BARCODE_TAG 'CB'", @@ -820,7 +1152,7 @@ process { process { withName: '.*:PROCESS_LONGREAD_SCRNA_TRANSCRIPT:DEDUP_UMIS:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' { - ext.prefix = { "${meta.id}.sorted" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.sorted" } publishDir = [ enabled: false ] @@ -829,7 +1161,7 @@ process { process { withName: '.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*:SAMTOOLS_INDEX_DEDUP' { - ext.prefix = { "${meta.id}.transcriptome.dedup.sorted" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcriptome.dedup.sorted" } publishDir = [ enabled: false ] @@ -838,9 +1170,9 @@ process { process { withName: '.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*DEDUP_UMIS:SAMTOOLS_MERGE' { - ext.prefix = { "${meta.id}.transcriptome.dedup" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcriptome.dedup" } publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/bam/dedup" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/bam/dedup" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] @@ -849,9 +1181,9 @@ process { process { withName: '.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*DEDUP_UMIS:SAMTOOLS_INDEX_MERGED' { - ext.prefix = { "${meta.id}.transcriptome.dedup" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcriptome.dedup" } publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/bam/dedup" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/bam/dedup" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -860,9 +1192,9 @@ process { process { withName: '.*PROCESS_LONGREAD_SCRNA_TRANSCRIPT.*DEDUP_UMIS:BAM_STATS_SAMTOOLS.*' { - ext.prefix = { "${meta.id}.transcriptome.${params.dedup_tool}_dedup" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.transcriptome.${params.dedup_tool}_dedup" } publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/qc/samtools/dedup" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/qc/samtools/dedup" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -872,10 +1204,11 @@ process { if (!params.skip_dedup){ process { withName: '.*PROCESS_LONGREAD_SCRNA_GENOME.*:UMITOOLS_DEDUP' { - ext.prefix = { "${meta.id}.genome.umi_dedup" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.genome.umi_dedup" } ext.args = { [ - '--per-cell' + '--per-cell', + params.demux_tool_cdna == "flexiplex" ? "--extract-umi-method tag --umi-tag UR --cell-tag CB" : "" ].join(' ').trim() } publishDir = [ @@ -886,7 +1219,7 @@ if (!params.skip_dedup){ process { withName: '.*PROCESS_LONGREAD_SCRNA_GENOME.*:PICARD_MARKDUPLICATES' { - ext.prefix = { "${meta.id}.genome.dedup" } + ext.prefix = { "${meta.id}${meta.type ? 
".${meta.type}" : ""}.genome.dedup" } ext.args = { [ "--BARCODE_TAG 'CB'", @@ -902,7 +1235,7 @@ if (!params.skip_dedup){ process { withName:'.*PROCESS_LONGREAD_SCRNA_GENOME.*:SAMTOOLS_INDEX_DEDUP' { - ext.prefix = { "${meta.id}.genome.dedup.sorted" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.genome.dedup.sorted" } publishDir = [ enabled: false ] @@ -911,7 +1244,8 @@ if (!params.skip_dedup){ process { withName: '.*:PROCESS_LONGREAD_SCRNA_GENOME:DEDUP_UMIS:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' { - ext.prefix = { "${meta.id}.sorted" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.sorted" } + ext.args = "--write-index" publishDir = [ enabled: false ] @@ -920,9 +1254,9 @@ if (!params.skip_dedup){ process { withName: '.*PROCESS_LONGREAD_SCRNA_GENOME.*DEDUP_UMIS:SAMTOOLS_MERGE' { - ext.prefix = { "${meta.id}.genome.dedup" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.genome.dedup" } publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/bam/dedup" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/bam/dedup" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -931,9 +1265,9 @@ if (!params.skip_dedup){ process { withName: '.*PROCESS_LONGREAD_SCRNA_GENOME.*DEDUP_UMIS:SAMTOOLS_INDEX_MERGED' { - ext.prefix = { "${meta.id}.genome.dedup" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.genome.dedup" } publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/bam/dedup" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/bam/dedup" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -942,9 +1276,9 @@ if (!params.skip_dedup){ process { withName: '.*PROCESS_LONGREAD_SCRNA_GENOME.*DEDUP_UMIS:BAM_STATS_SAMTOOLS.*' { - ext.prefix = { "${meta.id}.genome.${params.dedup_tool}_dedup" } + ext.prefix = { "${meta.id}${meta.type ? 
".${meta.type}" : ""}.genome.${params.dedup_tool}_dedup" } publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/qc/samtools/dedup" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/qc/samtools/dedup" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -952,35 +1286,7 @@ if (!params.skip_dedup){ } } - -///////////////////////////// -// QUANTIFY SCRNA ISOQUANT // -///////////////////////////// - -process { - withName: '.*:QUANTIFY_SCRNA_ISOQUANT:SPLIT_FASTA' { - publishDir = [ - enabled: false - ] - } -} - -process { - withName: '.*:QUANTIFY_SCRNA_ISOQUANT:SAMTOOLS_FAIDX_SPLIT' { - publishDir = [ - enabled: false - ] - } -} - -process { - withName: '.*:QUANTIFY_SCRNA_ISOQUANT:SPLIT_GTF' { - publishDir = [ - enabled: false - ] - } -} - +// BAMTOOLS_SPLIT - used in dedup_umis and other workflows process { withName: '.*:BAMTOOLS_SPLIT' { ext.args = { @@ -994,17 +1300,13 @@ process { } } -process { - withName: '.*:SAMTOOLS_INDEX_SPLIT' { - publishDir = [ - enabled: false - ] - } -} +///////////////////////////// +// QUANTIFY SCRNA ISOQUANT // +///////////////////////////// process { withName: '.*:ISOQUANT' { - ext.prefix = { "${meta.id}.${meta.chr}" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}" } ext.args = { [ "--complete_genedb", @@ -1017,29 +1319,7 @@ process { ].join(' ').trim() } publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/isoquant/output/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } -} - -process { - withName: '.*:MERGE_MTX_GENE' { - ext.prefix = { "${meta.id}.gene" } - publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/isoquant/feature_bc_mtx" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } -} - -process { - withName: '.*:MERGE_MTX_TRANSCRIPT' { - ext.prefix = { "${meta.id}.transcript" } - publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/isoquant/feature_bc_mtx" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/isoquant/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -1050,7 +1330,7 @@ if (!params.skip_qc && !params.skip_seurat) { process { withName: '.*QUANTIFY_SCRNA_ISOQUANT:QC_SCRNA_GENE:SEURAT' { publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/qc/seurat_isoquant/gene" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/qc/seurat_isoquant/gene" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -1060,7 +1340,7 @@ if (!params.skip_qc && !params.skip_seurat) { process { withName: '.*QUANTIFY_SCRNA_ISOQUANT:QC_SCRNA_TRANSCRIPT:SEURAT' { publishDir = [ - path: { "${params.outdir}/${meta.id}/genome/qc/seurat_isoquant/transcript" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/genome/qc/seurat_isoquant/transcript" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -1094,10 +1374,11 @@ process { withName:'.*:QUANTIFY_SCRNA_OARFISH:SAMTOOLS_SORT' { ext.args = { [ - "-t CB" + "-t CB", + "--write-index" ].join(' ').trim() } - ext.prefix = { "${meta.id}.bc_sort" } + ext.prefix = { "${meta.id}${meta.type ? ".${meta.type}" : ""}.bc_sort" } publishDir = [ enabled: false ] @@ -1112,7 +1393,7 @@ process { ].join(' ').trim() } publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/oarfish" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/oarfish" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] @@ -1123,7 +1404,7 @@ if (!params.skip_qc && !params.skip_seurat) { process { withName: '.*QUANTIFY_SCRNA_OARFISH:QC_SCRNA:SEURAT' { publishDir = [ - path: { "${params.outdir}/${meta.id}/transcriptome/qc/seurat_oarfish/" }, + path: { "${params.outdir}/${meta.id}/${meta.type}/transcriptome/qc/seurat_oarfish/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] diff --git a/conf/test_dna.config b/conf/test_dna.config new file mode 100644 index 00000000..1ac51b8c --- /dev/null +++ b/conf/test_dna.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running DNA tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/scnanoseq -profile test_dna, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test DNA profile' + config_profile_description = 'Minimal test dataset to check DNA pipeline function' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/scnanoseq/samplesheet/samplesheet_test_dna.csv' + + // Genome references + genome_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/scnanoseq/reference/chr21.fa" + gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/scnanoseq/reference/chr21.gtf" + + // Barcode options + barcode_format = "10X_multiome" + + // Demultiplexing tool - DNA only supports flexiplex + demux_tool_dna = "flexiplex" + +} diff --git a/conf/test_mixed.config b/conf/test_mixed.config new file mode 100644 index 00000000..8096fe8e --- /dev/null +++ b/conf/test_mixed.config @@ -0,0 +1,43 @@ +/* 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running mixed DNA/cDNA tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test + with both DNA and cDNA samples (multiome). + + Use as follows: + nextflow run nf-core/scnanoseq -profile test_mixed, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test Mixed (Multiome) profile' + config_profile_description = 'Minimal test dataset to check both DNA and cDNA pipeline function' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/scnanoseq/samplesheet/samplesheet_test_mixed.csv' + + // Genome references + genome_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/scnanoseq/reference/chr21.fa" + gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/scnanoseq/reference/chr21.gtf" + + // Barcode options + barcode_format = "10X_multiome" + + // Demultiplexing tool - both use flexiplex for multiome + demux_tool_cdna = "flexiplex" + demux_tool_dna = "flexiplex" + + // Analysis options + quantifier = "isoquant" + +} diff --git a/docs/output.md b/docs/output.md index 01eac16f..790717a1 100644 --- a/docs/output.md +++ b/docs/output.md @@ -6,48 +6,54 @@ This document describes the output produced by the pipeline. Most of the plots a The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. +TODO: Should here be added which output is cDNA/DNA specific? +TODO: Go over entire output section and remove/add flexiplex etc where needed. 
+ ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - [Preprocessing](#preprocessing) - - [Nanofilt](#nanofilt) - Read Quality Filtering and Trimming + - [Chopper](#chopper) - Read Quality Filtering and Trimming - [Barcode Calling](#barcode-calling) + - [Flexiplex](#flexiplex) - Barcode caller - [BLAZE](#blaze) - Barcode caller - [Alignment](#alignment) - [Minimap2](#minimap2) - Long read alignment - [Alignment Post-processing](#alignment-post-processing) - [Samtools](#samtools) - Sort and index alignments and make alignment qc - - [Barcode Tagging](#barcode-tagging) - Barcode tagging with quality metrics and barcode information + - [Barcode Tagging Blaze](#barcode-tagging-blaze) - Barcode tagging with quality metrics and barcode information + - [Barcode Tagging Flexiplex](#barcode-tagging-flexiplex) - Moving Barcode and/or UMI tag from read name to bam tags - [UMI-tools Dedup](#umi-tools-dedup) - UMI-based Read deduplication - [Picard MarkDuplicates](#picard-markduplicates) - Read deduplication -- [Feature-Barcode Quantification](#feature-barcode-quantification) +- [Feature-Barcode Quantification](#feature-barcode-quantification)\* - [IsoQuant](#isoquant) - Feature-barcode quantification (gene and transcript level) - [oarfish](#oarfish) - Feature-barcode quantification (transcript-level only) - [Seurat](#seurat) - Feature-barcode matrix QC -- [Other steps](#other-steps) +- [Other steps](#other-steps)\* - [UCSC](#ucsc) - Annotation BED file - [Quality Control](#quality-control) - [FastQC](#fastqc) - FASTQ QC - [NanoComp](#nanocomp) - Long Read FASTQ QC - [NanoPlot](#nanoplot) - Long Read FASTQ QC - [ToulligQC](#toulligqc) - Long Read FASTQ QC - - [RSeQC](#rseqc) - Various RNA-seq QC metrics + - [RSeQC](#rseqc) - Various RNA-seq QC metrics\* - [Read Counts](#read-counts) - Read Counts QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -
[Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +\* Indicates RNA only output + +\*\* Indicates DNA only output + ## Preprocessing -### Nanofilt +### Chopper
Output files -- `/` - - `fastq/` - - `trimmed_nanofilt/` - - `*_filtered.fastq.gz`: The post-trimmed fastq. By default this will be mostly quality trimmed. +`*_trimmed.fastq.gz`: The post-trimmed fastq. By default this will be mostly quality trimmed.
@@ -55,6 +61,20 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d ## Barcode Calling +### Flexiplex + +
+Output files + +- `/` + - `flexiplex/` + - `*.barcodes_counts.txt` : This is a file containing each barcode and the counts of how many reads support it. + - `*.known_barcodes` : This file is a list of all "true" barcodes and the counts associated to it in the sample. Can be used as whitelist for downstream tools. + +
+ +[Flexiplex](https://github.com/DavidsonGroup/flexiplex/) is a fast, multithreaded, and user-configurable demultiplexer. Given a set of reads as either FASTQ or FASTA, it will demultiplex and/or identify a sequence of interest, reporting matching reads and read-barcode assignment. Flexiplex works in two modes: (i) when one or more sequences of interest are known, such as barcodes, and (ii) discovery mode—when only the sequence which flanks the region of interest is known. + ### BLAZE
@@ -148,7 +168,7 @@ The knee plot (an example is listed above) that is provided by BLAZE shows all b [Samtools](https://www.htslib.org/) is a suite of programs for reading, writing, editing, indexing, and viewing files that are in SAM, BAM, or CRAM format -### Barcode Tagging +### Barcode Tagging Blaze
Output files @@ -177,6 +197,32 @@ UMI quality tag = "UY" Note that barcodes are corrected with the custom script, `correct_barcodes.py`. +### Barcode Tagging Flexiplex + +
+Output files + +- `/` + - `genome/` + - `bam/` + - `barcode_tagged/` + - `*.tagged.bam` : The genome aligned bam containing tagged barcode and UMI metadata. + - `transcriptome/` + - `bam/` + - `barcode_tagged/` + - `*.tagged.bam` : The transcriptome aligned bam containing tagged barcode and UMI metadata. + +
+ +Barcode tagging is a custom Python package specifically created to move barcode and/or UMI tags that were added to the read name by flexiplex to the BAM tags. This is useful for custom downstream analysis (e.g.: subsetting BAMs based on cell barcodes). Specifically, the following tags are added: + +``` +barcode tag = "CB" +UMI tag = "UR" +``` + +Flexiplex barcodes are already corrected during the initial Flexiplex run and are thus not post-corrected. + ### UMI-tools Dedup
diff --git a/docs/usage.md b/docs/usage.md index 5cbe1d49..17b068af 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,7 +6,7 @@ ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' @@ -17,20 +17,25 @@ You will need to create a samplesheet with information about the samples you wou The example `samplesheet.csv` below contains a single FASTQ file per biological replicate with sample specific cell counts. ```csv title="samplesheet.csv" -sample,fastq,cell_count -CONTROL_REP1,AEG588A1_S1.fastq.gz,5000 -CONTROL_REP2,AEG588A2_S1.fastq.gz,6000 -CONTROL_REP3,AEG588A3_S1.fastq.gz,5000 -TREATMENT_REP1,AEG588A4_S1.fastq.gz,5500 -TREATMENT_REP2,AEG588A5_S1.fastq.gz,6000 -TREATMENT_REP3,AEG588A6_S1.fastq.gz,5000 +sample,fastq,cell_count,type +CONTROL_REP1,AEG588A1_S1.fastq.gz,5000,cdna +CONTROL_REP2,AEG588A2_S1.fastq.gz,6000,cdna +TREATMENT_REP1,AEG588A4_S1.fastq.gz,5500,cdna +TREATMENT_REP2,AEG588A5_S1.fastq.gz,6000,cdna +CONTROL_REP1,AEG588A1_S1.fastq.gz,5000,dna +CONTROL_REP2,AEG588A2_S1.fastq.gz,6000,dna +TREATMENT_REP1,AEG588A4_S1.fastq.gz,5500,dna +TREATMENT_REP2,AEG588A5_S1.fastq.gz,6000,dna ``` | Column | Description | | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Custom sample name. 
This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | | `fastq` | Full path to FastQ file for Oxford Nanopore. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `cell_count` | Expected number of cells/nuclei. This value is used by the barcode calling tool (BLAZE) as a baseline when determining an acceptable number of detected barcodes. | +| `cell_count` | Expected number of cells/nuclei. This value is used by the barcode calling tool (BLAZE and/or Flexiplex) as a baseline when determining an acceptable number of detected barcodes. | +| `type` | An optional column specifiying whether the sample is DNA or cDNA. If omitted, the default `cdna` is used. | + +Note: DNA samples are only compatible with `flexiplex` demultiplexing. An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -39,14 +44,14 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. 
Below is an example for the same sample sequenced across replicates 1 and 4 (`REP1` and `REP4` respectively): ```csv title="samplesheet.csv" -sample,fastq,cell_count -CONTROL_REP1,AEG588A1_S1.fastq.gz,5000 -CONTROL_REP1,AEG588A1_S2.fastq.gz,5000 -CONTROL_REP2,AEG588A2_S1.fastq.gz,2000 -CONTROL_REP3,AEG588A3_S1.fastq.gz,7500 -CONTROL_REP4,AEG588A4_S1.fastq.gz,9000 -CONTROL_REP4,AEG588A4_S2.fastq.gz,9000 -CONTROL_REP4,AEG588A4_S3.fastq.gz,9000 +sample,fastq,cell_count,type +CONTROL_REP1,AEG588A1_S1.fastq.gz,5000,cdna +CONTROL_REP1,AEG588A1_S2.fastq.gz,5000,cdna +CONTROL_REP2,AEG588A2_S1.fastq.gz,2000,cdna +CONTROL_REP3,AEG588A3_S1.fastq.gz,7500,cdna +CONTROL_REP4,AEG588A4_S1.fastq.gz,9000,cdna +CONTROL_REP4,AEG588A4_S2.fastq.gz,9000,cdna +CONTROL_REP4,AEG588A4_S3.fastq.gz,9000,cdna ``` ## Running the pipeline @@ -61,13 +66,17 @@ nextflow run nf-core/scnanoseq \ --transcript_fasta /path/to/transcriptome.fa \ --gtf /path/to/file.gtf \ --quantifier "isoquant,oarfish" \ + --demux_tool_cdna flexiplex \ + --demux_tool_dna flexiplex \ --barcode_format 10X_3v3 \ -profile ``` -Please note that while the above command specifies both transcriptome and genome fasta files, only one is needed for the pipeline and is dependent on which quantifier you wish to use. +Please note that while the above command specifies both transcriptome and genome fasta files, only one is needed for the pipeline and is dependent on which quantifier you wish to use. Furthermore, if you have any DNA samples, the `genome_fasta` is required. Additionally, for the `quantifier` parameter in the above command, we've listed the quantifiers as a comma-delimited string. It is possible to only use one quantifier, and can be accomplished by just providing the name of the quantifying tool you wish to run as a single value, i.e. providing `oarfish` if you only wish to run `oarfish`. 
+The pipeline supports barcode identification and extraction through both `flexiplex` and `blaze` and can be set through `demux_tool_dna` (only works with `flexiplex` for now) and `demux_tool_cdna` parameters. The barcode format can be specified through the `barcode_format` parameter. When working with completely custom barcode structures, you can additionally specify these with `custom_flexiplex_barcode_dna` and `custom_flexiplex_barcode_cdna` parameters. Note: ensure that you are using `flexiplex` as the barcode calling tool. This can be a string formatted as follows `"-x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ?????????? -x TTTCTTATATGGG -f 8 -e 2"`, for more information check the documentation: https://davidsongroup.github.io/flexiplex/ + Note that the pipeline will create the following files in your working directory: ```bash diff --git a/modules.json b/modules.json index f4236cf7..3f682839 100644 --- a/modules.json +++ b/modules.json @@ -20,6 +20,12 @@ "git_sha": "0997b47c93c06b49aa7b3fefda87e728312cf2ca", "installed_by": ["modules"] }, + "chopper": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"], + "patch": "modules/nf-core/chopper/chopper.diff" + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", @@ -120,6 +126,12 @@ "installed_by": ["modules"], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, + "seqkit/split2": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"], + "patch": "modules/nf-core/seqkit/split2/seqkit-split2.diff" + }, "toulligqc": { "branch": "master", "git_sha": "061a322293b3487e53f044304710e54cbf657717", @@ -129,7 +141,8 @@ "umitools/dedup": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/umitools/dedup/umitools-dedup.diff" }, 
"unzipfiles": { "branch": "master", diff --git a/modules/local/flexiformatter/environment.yaml b/modules/local/flexiformatter/environment.yaml new file mode 100644 index 00000000..ca8d9a4c --- /dev/null +++ b/modules/local/flexiformatter/environment.yaml @@ -0,0 +1,5 @@ +name: flexiformatter +channels: + - bioconda +dependencies: + - bioconda::flexiformatter=1.0.5 diff --git a/modules/local/flexiformatter/main.nf b/modules/local/flexiformatter/main.nf new file mode 100644 index 00000000..6e4f328c --- /dev/null +++ b/modules/local/flexiformatter/main.nf @@ -0,0 +1,55 @@ +process FLEXIFORMATTER { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yaml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/flexiformatter%3A1.0.5--pyhdfd78af_0': + 'biocontainers/flexiformatter:1.0.5--pyhdfd78af_0' }" + + input: + tuple val(meta), path(bam) + val bam_index_extension + + output: + tuple val(meta), path("*_tagged.bam") , emit: bam + tuple val(meta), path("*_tagged.bam.bai") , optional: true, emit: bai + tuple val(meta), path("*_tagged.bam.csi") , optional: true, emit: csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_tagged" + def bam_index = bam_index_extension ?
"${prefix}.bam##idx##${prefix}.bam.${bam_index_extension} --write-index" : "${prefix}.bam" + def bam_output = " | samtools sort -@ ${task.cpus} -o ${bam_index} ${args2}" + + """ + flexiformatter \\ + ${bam} \\ + ${args} \\ + ${bam_output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flexiformatter: \$(echo \$(flexiformatter --version) |& sed 's/flexi_formatter version: //') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_tagged.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flexiformatter: \$(flexiformatter --version |& sed 's/flexi_formatter version: //') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/flexiplex/assign/environment.yaml b/modules/local/flexiplex/assign/environment.yaml new file mode 100644 index 00000000..f37f464c --- /dev/null +++ b/modules/local/flexiplex/assign/environment.yaml @@ -0,0 +1,5 @@ +name: flexiplex +channels: + - bioconda +dependencies: + - bioconda::flexiplex=1.02.5 diff --git a/modules/local/flexiplex/assign/main.nf b/modules/local/flexiplex/assign/main.nf new file mode 100644 index 00000000..84669155 --- /dev/null +++ b/modules/local/flexiplex/assign/main.nf @@ -0,0 +1,51 @@ +process FLEXIPLEX_ASSIGN { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yaml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/flexiplex:1.02.5--py39h2de1943_0': + 'biocontainers/flexiplex:1.02.5--py39h2de1943_0' }" + + input: + tuple val(meta), path(reads), path(barcodes) + + output: + tuple val(meta), path("*flexiplex.fastq.gz") , emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}${meta.part ? "_part_${meta.part}" : ''}" + """ + # Run in assignment mode + + zcat ${reads} | flexiplex \\ + ${args} \\ + -k ${barcodes} \\ + -p ${task.cpus} \\ + | gzip -c > ${prefix}.flexiplex.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flexiplex: \$(flexiplex --help |& sed '1!d ; s/FLEXIPLEX //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.flexiplex.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flexiplex: \$(flexiplex --help |& sed '1!d ; s/FLEXIPLEX //') + END_VERSIONS + """ +} diff --git a/modules/local/flexiplex/discovery/environment.yaml b/modules/local/flexiplex/discovery/environment.yaml new file mode 100644 index 00000000..f37f464c --- /dev/null +++ b/modules/local/flexiplex/discovery/environment.yaml @@ -0,0 +1,5 @@ +name: flexiplex +channels: + - bioconda +dependencies: + - bioconda::flexiplex=1.02.5 diff --git a/modules/local/flexiplex/discovery/main.nf b/modules/local/flexiplex/discovery/main.nf new file mode 100644 index 00000000..6e645ffc --- /dev/null +++ b/modules/local/flexiplex/discovery/main.nf @@ -0,0 +1,49 @@ +process FLEXIPLEX_DISCOVERY { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yaml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/flexiplex:1.02.5--py39h2de1943_0': + 'biocontainers/flexiplex:1.02.5--py39h2de1943_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*barcodes_counts.txt") , emit: barcode_counts + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}${meta.part ? "_part_${meta.part}" : ''}" + """ + # Run in discovery mode + zcat ${reads} | flexiplex \\ + ${args} \\ + -n ${prefix} \\ + -p ${task.cpus} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flexiplex: \$(flexiplex --help |& sed '1!d ; s/FLEXIPLEX //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch flexiplex_barcodes_counts.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flexiplex: \$(flexiplex --help |& sed '1!d ; s/FLEXIPLEX //') + END_VERSIONS + """ +} diff --git a/modules/local/flexiplex/filter/environment.yaml b/modules/local/flexiplex/filter/environment.yaml new file mode 100644 index 00000000..f37f464c --- /dev/null +++ b/modules/local/flexiplex/filter/environment.yaml @@ -0,0 +1,5 @@ +name: flexiplex +channels: + - bioconda +dependencies: + - bioconda::flexiplex=1.02.5 diff --git a/modules/local/flexiplex/filter/main.nf b/modules/local/flexiplex/filter/main.nf new file mode 100644 index 00000000..e6130135 --- /dev/null +++ b/modules/local/flexiplex/filter/main.nf @@ -0,0 +1,49 @@ +process FLEXIPLEX_FILTER { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yaml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/flexiplex:1.02.5--py39h2de1943_0': + 'biocontainers/flexiplex:1.02.5--py39h2de1943_0' }" + + input: + tuple val(meta), path(barcodes) + path(whitelist) + + output: + tuple val(meta), path("*known_barcodes.txt") , emit: barcodes + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}${meta.part ? "_part_${meta.part}" : ''}" + """ + flexiplex-filter \\ + ${barcodes} \\ + --whitelist ${whitelist} \\ + --outfile ${prefix}_known_barcodes.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flexiplex: \$(flexiplex --help |& sed '1!d ; s/FLEXIPLEX //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}_known_barcodes.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flexiplex: \$(flexiplex --help |& sed '1!d ; s/FLEXIPLEX //') + END_VERSIONS + """ +} diff --git a/modules/local/isoquant.nf b/modules/local/isoquant.nf index 916bd4e7..6513f3af 100644 --- a/modules/local/isoquant.nf +++ b/modules/local/isoquant.nf @@ -1,41 +1,41 @@ process ISOQUANT { tag "$meta.id" - label 'process_medium' + label 'process_high' - conda "bioconda::isoquant=3.6.1" + conda "bioconda::isoquant=3.11.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/isoquant:3.6.1--hdfd78af_0' : - 'biocontainers/isoquant:3.6.1--hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/isoquant:3.11.1--hdfd78af_0' : + 'biocontainers/isoquant:3.11.1--hdfd78af_0' }" input: tuple val(meta), path(bam), path(bai), path(fasta), path(fai), path(gtf) val group_category output: - tuple val(meta), path("*/*/*.read_assignments.tsv.gz"), emit: read_assignments - tuple val(meta), path("*/*/*.corrected_reads.bed.gz"), emit: corrected_reads - tuple val(meta), path("*/*/*.transcript_tpm.tsv"), emit: transcript_tpm - tuple val(meta), path("*/*/*.transcript_counts.tsv"), emit: transcript_counts - tuple val(meta), path("*/*/*.gene_tpm.tsv"), emit: gene_tpm - tuple val(meta), path("*/*/*.gene_counts.tsv"), emit: gene_counts - tuple val(meta), path("*/isoquant.log"), emit: log - tuple val(meta), path("*/*/*.exon_counts.tsv"), emit: exon_counts, optional: true - tuple val(meta), path("*/*/*.intron_counts.tsv"), emit: intron_counts, optional: true - tuple val(meta), path("*/*/*.novel_vs_known.SQANTI-like.tsv"), emit: sqanti_output, optional: true - tuple val(meta), path("*/*/*.gene_grouped_tpm.tsv"), emit: grouped_gene_tpm, optional: true - tuple val(meta), path("*/*/*.gene_grouped_counts.tsv"), emit: grouped_gene_counts, optional: true - tuple val(meta), path("*/*/*.transcript_grouped_tpm.tsv"), emit: grouped_transcript_tpm, optional: true - tuple val(meta), path("*/*/*.transcript_grouped_counts.tsv"), emit: grouped_transcript_counts, optional: true - tuple val(meta), path("*/*/*.exon_grouped_counts.tsv"), emit: grouped_exon_counts, optional: true - tuple val(meta), path("*/*/*.intron_grouped_counts.tsv"), emit: grouped_intron_counts, optional: true - tuple val(meta), path("*/*/*.transcript_models.gtf"), emit: transcript_models, optional: true - tuple val(meta), path("*/*/*.transcript_model_reads.tsv.gz"), emit: transcript_model_reads, optional: true - tuple val(meta), 
path("*/*/*.transcript_model_tpm.tsv"), emit: transcript_model_tpm, optional: true - tuple val(meta), path("*/*/*.transcript_model_counts.tsv"), emit: transcript_model_counts, optional: true - tuple val(meta), path("*/*/*.extended_annotation.gtf"), emit: extended_gtf, optional: true - tuple val(meta), path("*/*/*.transcript_model_grouped_counts.tsv"), emit: grouped_transcript_model_counts, optional: true - tuple val(meta), path("*/*/*.transcript_model_grouped_tpm.tsv"), emit: grouped_transcript_model_tpm, optional: true - path "versions.yml", emit: versions + tuple val(meta), path("*/*/*.read_assignments.tsv.gz"), emit: read_assignments, optional: true + tuple val(meta), path("*/*/*.corrected_reads.bed.gz"), emit: corrected_reads, optional: true + tuple val(meta), path("*/*/*.transcript_tpm.tsv"), emit: transcript_tpm + tuple val(meta), path("*/*/*.transcript_counts.tsv"), emit: transcript_counts + tuple val(meta), path("*/*/*.gene_tpm.tsv"), emit: gene_tpm + tuple val(meta), path("*/*/*.gene_counts.tsv"), emit: gene_counts + tuple val(meta), path("*/isoquant.log"), emit: log + tuple val(meta), path("*/*/*.exon_counts.tsv"), emit: exon_counts, optional: true + tuple val(meta), path("*/*/*.intron_counts.tsv"), emit: intron_counts, optional: true + tuple val(meta), path("*/*/*.novel_vs_known.SQANTI-like.tsv"), emit: sqanti_output, optional: true + tuple val(meta), path("*/*/*.gene_grouped*_tpm.tsv"), emit: grouped_gene_tpm + tuple val(meta), path("*/*/*.gene_grouped*_counts.tsv"), emit: grouped_gene_counts + tuple val(meta), path("*/*/*.transcript_grouped*_tpm.tsv"), emit: grouped_transcript_tpm + tuple val(meta), path("*/*/*.transcript_grouped*_counts.tsv"), emit: grouped_transcript_counts + tuple val(meta), path("*/*/*.exon_grouped*_counts.tsv"), emit: grouped_exon_counts, optional: true + tuple val(meta), path("*/*/*.intron_grouped*_counts.tsv"), emit: grouped_intron_counts, optional: true + tuple val(meta), path("*/*/*.transcript_models.gtf"), emit: 
transcript_models + tuple val(meta), path("*/*/*.transcript_model_reads.tsv.gz"), emit: transcript_model_reads, optional: true + tuple val(meta), path("*/*/*.transcript_model_tpm.tsv"), emit: transcript_model_tpm, optional: true + tuple val(meta), path("*/*/*.transcript_model_counts.tsv"), emit: transcript_model_counts, optional: true + tuple val(meta), path("*/*/*.extended_annotation.gtf"), emit: extended_gtf + tuple val(meta), path("*/*/*.transcript_model_grouped*_counts.tsv"), emit: grouped_transcript_model_counts, optional: true + tuple val(meta), path("*/*/*.transcript_model_grouped*_tpm.tsv"), emit: grouped_transcript_model_tpm, optional: true + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/mergebarcodecounts/main.nf b/modules/local/mergebarcodecounts/main.nf new file mode 100644 index 00000000..d00cf3c0 --- /dev/null +++ b/modules/local/mergebarcodecounts/main.nf @@ -0,0 +1,46 @@ +process MERGEBARCODECOUNTS { + tag "$meta.id" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:22.04': + 'biocontainers/ubuntu:22.04' }" + + input: + tuple val(meta), path(barcode_counts) + + output: + tuple val(meta), path("*_barcode_counts.txt"), emit: barcode_counts + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + + awk -F'\t' '{counts[\$1]+=\$2} END {for (b in counts) print b "\t" counts[b]}' ${barcode_counts} \ + | sort -k2,2nr -T . 
> ${prefix}_barcode_counts.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mawk: \$(awk -W version | head -n1 | sed 's/mawk //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo $args + + touch merged_barcode_counts.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mawk: \$(awk -W version | head -n1 | sed 's/mawk //') + END_VERSIONS + """ +} diff --git a/modules/local/mergebarcodecounts/meta.yml b/modules/local/mergebarcodecounts/meta.yml new file mode 100644 index 00000000..54ab1323 --- /dev/null +++ b/modules/local/mergebarcodecounts/meta.yml @@ -0,0 +1,49 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "mergebarcodecounts" +description: Merging of barcode counts from flexiplex +keywords: + - barcode + - genomics +tools: + - "mergebarcodecounts": + description: "Merging of barcode counts from flexiplex" + homepage: "" + documentation: "" + tool_dev_url: "" + doi: "" + licence: null + identifier: null + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - barcode_counts: + type: file + description: Barcode count files + +output: + barcode_counts: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - "barcode_counts": + type: file + description: Merged barcode count file + pattern: "*merged_barcode_counts.txt" + versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML + +authors: + - "@ljwharbers" +maintainers: + - "@ljwharbers" diff --git a/modules/local/mergebarcodecounts/tests/main.nf.test b/modules/local/mergebarcodecounts/tests/main.nf.test new file mode 100644 index 00000000..0434be41 --- /dev/null +++ b/modules/local/mergebarcodecounts/tests/main.nf.test @@ -0,0 +1,73 @@ +// TODO nf-core: Once you have added the required tests, please run the following command to build this file: +// nf-core modules test mergebarcodecounts +nextflow_process { + + name "Test Process MERGEBARCODECOUNTS" + script "../main.nf" + process "MERGEBARCODECOUNTS" + + tag "modules" + tag "modules_" + tag "mergebarcodecounts" + + // TODO nf-core: Change the test name preferably indicating the test-data and file-format used + test("sarscov2 - bam") { + + // TODO nf-core: If you are created a test for a chained module + // (the module requires running more than one process to generate the required output) + // add the 'setup' method here. + // You can find more information about how to use a 'setup' method in the docs (https://nf-co.re/docs/contributing/modules#steps-for-creating-nf-test-for-chained-modules). + + when { + process { + """ + // TODO nf-core: define inputs of the process here. Example: + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + //TODO nf-core: Add all required assertions to verify the test output. 
+ // See https://nf-co.re/docs/contributing/tutorials/nf-test_assertions for more information and examples. + ) + } + + } + + // TODO nf-core: Change the test name preferably indicating the test-data and file-format used but keep the " - stub" suffix. + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + // TODO nf-core: define inputs of the process here. Example: + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + //TODO nf-core: Add all required assertions to verify the test output. + ) + } + + } + +} diff --git a/modules/local/preextract_fastq.nf b/modules/local/preextract_fastq.nf index e7b70ea7..458432b8 100644 --- a/modules/local/preextract_fastq.nf +++ b/modules/local/preextract_fastq.nf @@ -10,7 +10,6 @@ process PREEXTRACT_FASTQ { input: tuple val(meta), path(reads), path(bc_list) - val bc_format output: tuple val(meta), path("*.putative_bc_umi.tsv") , emit: barcode_info @@ -29,7 +28,6 @@ process PREEXTRACT_FASTQ { -i ${reads} \\ -b ${bc_list} \\ -o ${prefix}.extracted \\ - -f ${bc_format} \\ -t ${task.cpus} \\ ${args} diff --git a/modules/nf-core/chopper/chopper.diff b/modules/nf-core/chopper/chopper.diff new file mode 100644 index 00000000..b7a633f4 --- /dev/null +++ b/modules/nf-core/chopper/chopper.diff @@ -0,0 +1,21 @@ +Changes in component 'nf-core/chopper' +'modules/nf-core/chopper/meta.yml' is unchanged +Changes in 'chopper/main.nf': +--- modules/nf-core/chopper/main.nf ++++ modules/nf-core/chopper/main.nf +@@ -4,8 +4,8 @@ + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+- 'https://depot.galaxyproject.org/singularity/chopper:0.9.0--hdcf5f25_0': +- 'biocontainers/chopper:0.9.0--hdcf5f25_0' }" ++ 'https://depot.galaxyproject.org/singularity/chopper:0.10.0--hcdda2d0_0': ++ 'biocontainers/chopper:0.10.0--hcdda2d0_0' }" + + input: + tuple val(meta), path(fastq) + +'modules/nf-core/chopper/environment.yml' is unchanged +'modules/nf-core/chopper/tests/main.nf.test' is unchanged +'modules/nf-core/chopper/tests/main.nf.test.snap' is unchanged +************************************************************ diff --git a/modules/nf-core/chopper/environment.yml b/modules/nf-core/chopper/environment.yml new file mode 100644 index 00000000..ad734838 --- /dev/null +++ b/modules/nf-core/chopper/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::chopper=0.9.0 diff --git a/modules/nf-core/chopper/main.nf b/modules/nf-core/chopper/main.nf new file mode 100644 index 00000000..65b8ab0e --- /dev/null +++ b/modules/nf-core/chopper/main.nf @@ -0,0 +1,56 @@ +process CHOPPER { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/chopper:0.10.0--hcdda2d0_0': + 'biocontainers/chopper:0.10.0--hcdda2d0_0' }" + + input: + tuple val(meta), path(fastq) + path fasta + + output: + tuple val(meta), path("*.fastq.gz") , emit: fastq + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def fasta_filtering = fasta ? 
"--contam ${fasta}" : "" + + if ("$fastq" == "${prefix}.fastq.gz") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + zcat \\ + $args \\ + $fastq | \\ + chopper \\ + --threads $task.cpus \\ + $fasta_filtering \\ + $args2 | \\ + gzip \\ + $args3 > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + chopper: \$(chopper --version 2>&1 | cut -d ' ' -f 2) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo | gzip > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + chopper: \$(chopper --version 2>&1 | cut -d ' ' -f 2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/chopper/meta.yml b/modules/nf-core/chopper/meta.yml new file mode 100644 index 00000000..096ed167 --- /dev/null +++ b/modules/nf-core/chopper/meta.yml @@ -0,0 +1,70 @@ +name: "chopper" +description: Filter and trim long read data. +keywords: + - filter + - trimming + - fastq + - nanopore + - qc +tools: + - "zcat": + description: "zcat uncompresses either a list of files on the command line or + its standard input and writes the uncompressed data on standard output." + documentation: "https://linux.die.net/man/1/zcat" + args_id: "$args" + identifier: "" + - "chopper": + description: "A rust command line for filtering and trimming long reads." + homepage: "https://github.com/wdecoster/chopper" + documentation: "https://github.com/wdecoster/chopper" + tool_dev_url: "https://github.com/wdecoster/chopper" + doi: "10.1093/bioinformatics/bty149" + licence: ["MIT"] + args_id: "$args2" + identifier: "" + - "gzip": + description: "Gzip reduces the size of the named files using Lempel-Ziv coding + (LZ77)." + documentation: "https://linux.die.net/man/1/gzip" + args_id: "$args3" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fastq: + type: file + description: FastQ with reads from long read sequencing e.g. PacBio or ONT + pattern: "*.{fastq.gz}" + ontologies: [] + - fasta: + type: file + description: An optional reference fasta file against which to remove reads that + align to it. + pattern: "*.fasta" + ontologies: [] +output: + fastq: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fastq.gz": + type: file + description: Filtered and trimmed FastQ file + pattern: "*.{fastq.gz}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@FynnFreyer" +maintainers: + - "@FynnFreyer" diff --git a/modules/nf-core/chopper/tests/main.nf.test b/modules/nf-core/chopper/tests/main.nf.test new file mode 100644 index 00000000..e611fa9f --- /dev/null +++ b/modules/nf-core/chopper/tests/main.nf.test @@ -0,0 +1,107 @@ +nextflow_process { + + name "Test Process CHOPPER" + script "../main.nf" + process "CHOPPER" + tag "chopper" + tag "modules" + tag "modules_nfcore" + + test("test with lambda reference") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id:'test_out' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + """ + } + } + + then { + + def fastq_content = path(process.out.fastq.get(0).get(1)).linesGzip + + assertAll( + { assert process.success }, + // original pytest checks + { assert process.out.fastq.get(0).get(1) ==~ ".*/test_out.fastq.gz" }, + { assert !fastq_content.contains("@a52a642e-88d0-4584-babd-414ea84db484 runid=71c83ae0021f873e29b130c6562a4c27185f93b8 read=2768 ch=489 
start_time=2021-08-11T12:07:39Z flow_cell_id=FAQ57606 protocol_group_id=210811_47CoV_SA sample_id=CS5 barcode=barcode04 barcode_alias=barcode04")}, + // additional nf-test checks + // Order of reads is not deterministic, so only assess whether the number of reads is correct + { assert snapshot( + fastq_content.size(), + process.out.versions + ).match() } + ) + } + } + + test("test without lambda reference") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id:'test_out' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + + def fastq_content = path(process.out.fastq.get(0).get(1)).linesGzip + + assertAll( + { assert process.success }, + // original pytest checks + { assert process.out.fastq.get(0).get(1) ==~ ".*/test_out.fastq.gz" }, + { assert fastq_content.contains("@2109d790-67ec-4fd1-8931-6c7e61908ff3 runid=97ca62ca093ff43533aa34c38a10b1d6325e7e7b read=52274 ch=243 start_time=2021-02-05T23:27:30Z flow_cell_id=FAP51364 protocol_group_id=data sample_id=RN20097 barcode=barcode01 barcode_alias=barcode01")}, + // additional nf-test checks + // Order of reads is not deterministic, so only assess whether the number of reads is correct + { assert snapshot( + fastq_content.size(), + process.out.versions + ).match() } + ) + } + } + + test("test-chopper-stub") { + options '-stub' + + when { + process { + """ + input[0] = [ + [id:'test_out' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out, + process.out.versions + ).match() } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/chopper/tests/main.nf.test.snap b/modules/nf-core/chopper/tests/main.nf.test.snap new file mode 100644 index 
00000000..60522256 --- /dev/null +++ b/modules/nf-core/chopper/tests/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "test without lambda reference": { + "content": [ + 400, + [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-31T15:14:36.37897815" + }, + "test with lambda reference": { + "content": [ + 15984, + [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-31T15:14:31.324993049" + }, + "test-chopper-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_out" + }, + "test_out.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ], + "fastq": [ + [ + { + "id": "test_out" + }, + "test_out.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ] + }, + [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-31T15:29:08.715579423" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/split2/environment.yml b/modules/nf-core/seqkit/split2/environment.yml new file mode 100644 index 00000000..b26fb1eb --- /dev/null +++ b/modules/nf-core/seqkit/split2/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqkit=2.9.0 diff --git a/modules/nf-core/seqkit/split2/main.nf b/modules/nf-core/seqkit/split2/main.nf new file mode 100644 index 00000000..a5be659c --- /dev/null +++ b/modules/nf-core/seqkit/split2/main.nf @@ -0,0 +1,78 @@ +process SEQKIT_SPLIT2 { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ 
workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqkit:2.9.0--h9ee0642_0' : + 'biocontainers/seqkit:2.9.0--h9ee0642_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("**/*.{fastq,fq}{,.gz}") , emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + seqkit \\ + split2 \\ + $args \\ + --threads $task.cpus \\ + $reads \\ + --out-dir ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + seqkit \\ + split2 \\ + $args \\ + --threads $task.cpus \\ + --read1 ${reads[0]} \\ + --read2 ${reads[1]} \\ + --out-dir ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + mkdir -p ${prefix} + echo "" | gzip > ${prefix}/${reads[0]} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + mkdir -p ${prefix} + echo "" | gzip > ${prefix}/${reads[0]} + echo "" | gzip > ${prefix}/${reads[1]} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/seqkit/split2/meta.yml b/modules/nf-core/seqkit/split2/meta.yml new file mode 100644 index 00000000..bbb820f6 --- /dev/null +++ b/modules/nf-core/seqkit/split2/meta.yml @@ -0,0 +1,51 @@ +name: seqkit_split2 +description: Split single or paired-end fastq.gz files +keywords: + - split + - fastq 
+ - seqkit +tools: + - seqkit: + description: | + Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, written by Wei Shen. + homepage: https://github.com/shenwei356/seqkit + documentation: https://bioinf.shenwei.me/seqkit/ + doi: 10.1371/journal.pone.0163962 + licence: ["MIT"] + identifier: biotools:seqkit +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: FastQ files + pattern: "*.{fq.gz/fastq.gz}" + ontologies: [] +output: + reads: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "**/*.gz": + type: file + description: Split fastq files + pattern: "*.{fq.gz/fastq.gz}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@FriederikeHanssen" + - "@heuermh" +maintainers: + - "@FriederikeHanssen" + - "@heuermh" diff --git a/modules/nf-core/seqkit/split2/seqkit-split2.diff b/modules/nf-core/seqkit/split2/seqkit-split2.diff new file mode 100644 index 00000000..452c6808 --- /dev/null +++ b/modules/nf-core/seqkit/split2/seqkit-split2.diff @@ -0,0 +1,24 @@ +Changes in component 'nf-core/seqkit/split2' +'modules/nf-core/seqkit/split2/meta.yml' is unchanged +Changes in 'seqkit/split2/main.nf': +--- modules/nf-core/seqkit/split2/main.nf ++++ modules/nf-core/seqkit/split2/main.nf +@@ -11,8 +11,8 @@ + tuple val(meta), path(reads) + + output: +- tuple val(meta), path("**/*.gz"), emit: reads +- path "versions.yml" , emit: versions ++ tuple val(meta), path("**/*.{fastq,fq}{,.gz}") , emit: reads ++ path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + +'modules/nf-core/seqkit/split2/environment.yml' is unchanged +'modules/nf-core/seqkit/split2/tests/length.config' is unchanged 
+'modules/nf-core/seqkit/split2/tests/main.nf.test' is unchanged +'modules/nf-core/seqkit/split2/tests/part.config' is unchanged +'modules/nf-core/seqkit/split2/tests/main.nf.test.snap' is unchanged +'modules/nf-core/seqkit/split2/tests/size.config' is unchanged +************************************************************ diff --git a/modules/nf-core/seqkit/split2/tests/length.config b/modules/nf-core/seqkit/split2/tests/length.config new file mode 100644 index 00000000..64d8a9aa --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/length.config @@ -0,0 +1,5 @@ +process { + withName: SEQKIT_SPLIT2 { + ext.args = '--by-length 8K' + } +} diff --git a/modules/nf-core/seqkit/split2/tests/main.nf.test b/modules/nf-core/seqkit/split2/tests/main.nf.test new file mode 100644 index 00000000..ea48154b --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/main.nf.test @@ -0,0 +1,222 @@ +nextflow_process { + + name "Test Process SEQKIT_SPLIT2" + script "../main.nf" + process "SEQKIT_SPLIT2" + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/split2" + + test("single_end - length") { + + config "./length.config" + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("paired_end - length") { + + config "./length.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + 
test("single_end - length - stub") { + + options "-stub" + config "./length.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - part") { + + config "./part.config" + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("paired_end - part") { + + config "./part.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - part - stub") { + + options "-stub" + config "./part.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - size") { + + config "./size.config" + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', 
checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("paired_end - size") { + + config "./size.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - size - stub") { + + options "-stub" + config "./size.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/split2/tests/main.nf.test.snap b/modules/nf-core/seqkit/split2/tests/main.nf.test.snap new file mode 100644 index 00000000..2ed2c165 --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/main.nf.test.snap @@ -0,0 +1,371 @@ +{ + "paired_end - size": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + "test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411", + "test_2.part_001.fastq.gz:md5,201ee95b559240e27830970b78a547c8", + "test_2.part_002.fastq.gz:md5,35ff29a76f34b2507a37287352324650" + ] + ] + ], + "1": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + 
"test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411", + "test_2.part_001.fastq.gz:md5,201ee95b559240e27830970b78a547c8", + "test_2.part_002.fastq.gz:md5,35ff29a76f34b2507a37287352324650" + ] + ] + ], + "versions": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T16:15:20.968231" + }, + "single_end - size": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + "test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411" + ] + ] + ], + "1": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ecc4b1841cd94704bba742ea4dcd48b0", + "test_1.part_002.fastq.gz:md5,b3de467f2b6ab0d14e1f6ce14932a411" + ] + ] + ], + "versions": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T16:15:16.296246" + }, + "single_end - part": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52" + ] + ] + ], + "1": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52" + ] + ] + ], + "versions": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T16:15:02.927768" + }, + 
"single_end - length": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d" + ] + ] + ], + "1": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d" + ] + ] + ], + "versions": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T16:14:49.713576" + }, + "single_end - size - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T16:15:25.267006" + }, + "single_end - part - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T16:15:11.624135" + }, + "single_end - length - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + 
"test_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T16:14:58.47365" + }, + "paired_end - length": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d", + "test_2.part_001.fastq.gz:md5,77b9076f82a762711582584342bde5a1", + "test_2.part_002.fastq.gz:md5,33bb6e3edc759baa7ba6580da36def48" + ] + ] + ], + "1": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,ffc87eb34d91d23ec9095bd8609a6a70", + "test_1.part_002.fastq.gz:md5,8d71a0abe239e05e5c57c4d27c799a1d", + "test_2.part_001.fastq.gz:md5,77b9076f82a762711582584342bde5a1", + "test_2.part_002.fastq.gz:md5,33bb6e3edc759baa7ba6580da36def48" + ] + ] + ], + "versions": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T16:14:54.230006" + }, + "paired_end - part": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52", + "test_2.part_001.fastq.gz:md5,13d14d69744bd93c8c51873b529bf714", + "test_2.part_002.fastq.gz:md5,5f5a78d78f312164a1159c62d65c15f1", + "test_2.part_003.fastq.gz:md5,48bcbceb485b73bcf1f198e252b016d8" + ] + ] + ], + "1": [ + 
"versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.part_001.fastq.gz:md5,b3c4d28be7ea9b960fbf1cb452adb53c", + "test_1.part_002.fastq.gz:md5,c134c70c5b70c6b3c65979448b38917e", + "test_1.part_003.fastq.gz:md5,822a40283637e2715e77d1ed0ed5bd52", + "test_2.part_001.fastq.gz:md5,13d14d69744bd93c8c51873b529bf714", + "test_2.part_002.fastq.gz:md5,5f5a78d78f312164a1159c62d65c15f1", + "test_2.part_003.fastq.gz:md5,48bcbceb485b73bcf1f198e252b016d8" + ] + ] + ], + "versions": [ + "versions.yml:md5,f5d878e7143d05609ce5bb5974126990" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T16:15:07.43572" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/split2/tests/part.config b/modules/nf-core/seqkit/split2/tests/part.config new file mode 100644 index 00000000..2436bfbd --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/part.config @@ -0,0 +1,5 @@ +process { + withName: SEQKIT_SPLIT2 { + ext.args = '--by-part 3' + } +} diff --git a/modules/nf-core/seqkit/split2/tests/size.config b/modules/nf-core/seqkit/split2/tests/size.config new file mode 100644 index 00000000..42a153d7 --- /dev/null +++ b/modules/nf-core/seqkit/split2/tests/size.config @@ -0,0 +1,5 @@ +process { + withName: SEQKIT_SPLIT2 { + ext.args = '--by-size 50' + } +} diff --git a/modules/nf-core/umitools/dedup/main.nf b/modules/nf-core/umitools/dedup/main.nf index 1e2a2aae..d897ef07 100644 --- a/modules/nf-core/umitools/dedup/main.nf +++ b/modules/nf-core/umitools/dedup/main.nf @@ -25,7 +25,6 @@ process UMITOOLS_DEDUP { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - def paired = meta.single_end ? "" : "--paired" stats = get_output_stats ? "--output-stats ${prefix}" : "" if ("$bam" == "${prefix}.bam") error "Input and output names are the same, set prefix in module configuration to disambiguate!" 
@@ -37,7 +36,6 @@ process UMITOOLS_DEDUP { -S ${prefix}.bam \\ -L ${prefix}.log \\ $stats \\ - $paired \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/umitools/dedup/umitools-dedup.diff b/modules/nf-core/umitools/dedup/umitools-dedup.diff new file mode 100644 index 00000000..4126f47d --- /dev/null +++ b/modules/nf-core/umitools/dedup/umitools-dedup.diff @@ -0,0 +1,28 @@ +Changes in component 'nf-core/umitools/dedup' +'modules/nf-core/umitools/dedup/meta.yml' is unchanged +Changes in 'umitools/dedup/main.nf': +--- modules/nf-core/umitools/dedup/main.nf ++++ modules/nf-core/umitools/dedup/main.nf +@@ -25,7 +25,6 @@ + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" +- def paired = meta.single_end ? "" : "--paired" + stats = get_output_stats ? "--output-stats ${prefix}" : "" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + +@@ -37,7 +36,6 @@ + -S ${prefix}.bam \\ + -L ${prefix}.log \\ + $stats \\ +- $paired \\ + $args + + cat <<-END_VERSIONS > versions.yml + +'modules/nf-core/umitools/dedup/environment.yml' is unchanged +'modules/nf-core/umitools/dedup/tests/tags.yml' is unchanged +'modules/nf-core/umitools/dedup/tests/main.nf.test' is unchanged +'modules/nf-core/umitools/dedup/tests/main.nf.test.snap' is unchanged +'modules/nf-core/umitools/dedup/tests/nextflow.config' is unchanged +************************************************************ diff --git a/nextflow.config b/nextflow.config index 6812dc60..cf051b2f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -30,8 +30,13 @@ params { skip_trimming = false // Cell barcode options - whitelist = null + cdna_whitelist = null + dna_whitelist = null barcode_format = null + custom_flexiplex_barcode_dna = null + custom_flexiplex_barcode_cdna = null + demux_tool_cdna = 'flexiplex' // Options: flexiplex, blaze + demux_tool_dna = 'flexiplex' // Options: flexiplex dedup_tool = 
'umitools' // Library strandness option @@ -201,8 +206,10 @@ profiles { ] } } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_mixed { includeConfig 'conf/test_mixed.config' } + test_dna { includeConfig 'conf/test_dna.config' } + test_full { includeConfig 'conf/test_full.config' } } // Load nf-core custom profiles from different Institutions diff --git a/nextflow_schema.json b/nextflow_schema.json index b7f89adf..d0fb3819 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -151,22 +151,47 @@ "description": "Options related to the barcode and umis.", "default": "", "properties": { - "whitelist": { + "cdna_whitelist": { "type": "string", - "description": "User-provided file containing a list of cellular barcodes. Using this parameter will override the default whitelists provided by the pipeline and use the user-provided one instead.", + "description": "User-provided file containing a list of cellular barcodes for cDNA samples. Using this parameter will override the default cDNA whitelists provided by the pipeline and use the user-provided one instead.", + "format": "file-path", + "fa_icon": "far fa-file-alt" + }, + "dna_whitelist": { + "type": "string", + "description": "User-provided file containing a list of cellular barcodes for DNA samples. Using this parameter will override the default DNA whitelists provided by the pipeline and use the user-provided one instead.", "format": "file-path", "fa_icon": "far fa-file-alt" }, "barcode_format": { "type": "string", "description": "Specify the format for the barcode+umi. 
This parameter also defines a default barcode whitelist for the pipeline to use for barcode calling, this can be overridden with the 'whitelist' parameter.", - "enum": ["10X_3v3", "10X_3v4", "10X_5v2", "10X_5v3"], + "enum": ["10X_3v3", "10X_3v4", "10X_5v2", "10X_5v3", "10X_multiome"], "fa_icon": "fas fa-barcode" }, + "custom_flexiplex_barcode_dna": { + "type": "string", + "description": "Custom DNA flexiplex barcode format to use. Example: '-x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ???????????? -x TTTTTTTTT -f 8 -e 2'. For information on structure: https://davidsongroup.github.io/flexiplex/#usage" + }, + "custom_flexiplex_barcode_cdna": { + "type": "string", + "description": "Custom cDNA flexiplex barcode format to use. Example: '-x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ???????????? -x TTTTTTTTT -f 8 -e 2'. For information on structure: https://davidsongroup.github.io/flexiplex/#usage" + }, + "demux_tool_dna": { + "type": "string", + "default": "flexiplex", + "description": "Demultiplexing tool to use for DNA (options: flexiplex)" + }, + "demux_tool_cdna": { + "type": "string", + "default": "flexiplex", + "description": "Demultiplexing tool to use for DNA (options: flexiplex or blaze)" + }, "dedup_tool": { "type": "string", "description": "Specify which tool to be used for deduplication (Options: picard, umitools)", - "enum": ["umitools", "picard"] + "enum": ["umitools", "picard"], + "default": "umitools" } }, "fa_icon": "fas fa-microscope", @@ -220,7 +245,6 @@ "pattern": "^(oarfish|isoquant)(,(oarfish|isoquant))*$" } }, - "required": ["quantifier"], "description": "Options related to post-mapping analysis" }, "process_skipping_options": { diff --git a/subworkflows/local/align_deduplicate_dna.nf b/subworkflows/local/align_deduplicate_dna.nf new file mode 100644 index 00000000..d8160911 --- /dev/null +++ b/subworkflows/local/align_deduplicate_dna.nf @@ -0,0 +1,150 @@ +// +// Performs alignment and deduplication for DNA samples +// + +// MODULES 
+include { MINIMAP2_INDEX } from '../../modules/nf-core/minimap2/index' +include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align' +include { PICARD_MARKDUPLICATES } from '../../modules/nf-core/picard/markduplicates' +include { BAM_SORT_STATS_SAMTOOLS } from '../../subworkflows/nf-core/bam_sort_stats_samtools' +include { FLEXIFORMATTER } from '../../modules/local/flexiformatter' +include { NANOCOMP } from '../../modules/nf-core/nanocomp/main' + +workflow ALIGN_DEDUPLICATE_DNA { + take: + fasta // channel: [ val(meta), path(fasta) ] + fai // channel: [ val(meta), path(fai) ] + fastq // channel: [ val(meta), path(fastq) ] + + skip_save_minimap2_index // bool: Skip saving the minimap2 index + skip_qc // bool: Skip qc steps + skip_bam_nanocomp // bool: Skip Nanocomp + skip_dedup // bool: Skip deduplication + + main: + ch_versions = Channel.empty() + + // Minimap results + minimap_bam = Channel.empty() + minimap_bai = Channel.empty() + + // Deduplicated bam file + dedup_bam = Channel.empty() + dedup_bai = Channel.empty() + + // SAMtool stats after dedup + stats = Channel.empty() + flagstat = Channel.empty() + idxstats = Channel.empty() + + // NanoComp results + nanocomp_bam_html = Channel.empty() + nanocomp_bam_txt = Channel.empty() + + // + // MINIMAP2_INDEX + // + if (skip_save_minimap2_index) { + MINIMAP2_INDEX ( fasta ) + ch_minimap_ref = MINIMAP2_INDEX.out.index + ch_versions = ch_versions.mix(MINIMAP2_INDEX.out.versions) + } else { + ch_minimap_ref = fasta + } + + // + // MINIMAP2_ALIGN + // + + MINIMAP2_ALIGN ( + fastq, + ch_minimap_ref, + true, + "bai", + "", + "" + ) + + ch_versions = ch_versions.mix(MINIMAP2_ALIGN.out.versions) + + // + // MODULE: Run FLEXIFORMATTER + // + FLEXIFORMATTER ( + MINIMAP2_ALIGN.out.bam, + 'bai' + ) + + ch_versions = ch_versions.mix(FLEXIFORMATTER.out.versions) + + FLEXIFORMATTER.out.bam + .set { ch_tagged_bam } + + // + // MODULE: MarkDuplicates + // + final_bam = ch_tagged_bam + if( !skip_dedup ) { + // + // MODULE: 
Picard Mark Duplicates + // + PICARD_MARKDUPLICATES ( + MINIMAP2_ALIGN.out.bam, + fasta, + fai + ) + final_bam = PICARD_MARKDUPLICATES.out.bam + ch_versions = ch_versions.mix(PICARD_MARKDUPLICATES.out.versions) + } + + + // + // SUBWORKFLOW: BAM_SORT_STATS_SAMTOOLS + // The subworkflow is called in both the minimap2 bams and filtered (mapped only) version + // TODO: No reason that this is again sorting and indexing. + // Change to STATS_SAMTOOLS + BAM_SORT_STATS_SAMTOOLS ( final_bam, fasta ) + ch_versions = ch_versions.mix(BAM_SORT_STATS_SAMTOOLS.out.versions) + + // + // MODULE: NanoComp for BAM files (unfiltered for QC purposes) + // + ch_nanocomp_bam_html = Channel.empty() + ch_nanocomp_bam_txt = Channel.empty() + + if (!skip_qc && !skip_bam_nanocomp) { + + NANOCOMP ( + BAM_SORT_STATS_SAMTOOLS.out.bam + .collect{it[1]} + .map{ + [ [ 'id': 'nanocomp_bam.' ] , it ] + } + ) + + ch_nanocomp_bam_html = NANOCOMP.out.report_html + ch_nanocomp_bam_txt = NANOCOMP.out.stats_txt + ch_versions = ch_versions.mix( NANOCOMP.out.versions ) + } + + emit: + // Versions + versions = ch_versions + + // Minimap results + minimap_bam = MINIMAP2_ALIGN.out.bam + minimap_bai = MINIMAP2_ALIGN.out.index + + // Deduplicated bam file + dedup_bam = BAM_SORT_STATS_SAMTOOLS.out.bam + dedup_bai = BAM_SORT_STATS_SAMTOOLS.out.bai + + // SAMtool stats after dedup + stats = BAM_SORT_STATS_SAMTOOLS.out.stats + flagstat = BAM_SORT_STATS_SAMTOOLS.out.flagstat + idxstats = BAM_SORT_STATS_SAMTOOLS.out.idxstats + + // NanoComp results + nanocomp_bam_html = ch_nanocomp_bam_html + nanocomp_bam_txt = ch_nanocomp_bam_txt +} diff --git a/subworkflows/local/align_longreads.nf b/subworkflows/local/align_longreads.nf index e54da516..81903b78 100644 --- a/subworkflows/local/align_longreads.nf +++ b/subworkflows/local/align_longreads.nf @@ -10,9 +10,8 @@ include { BAM_SORT_STATS_SAMTOOLS as BAM_SORT_STATS_SAMTOOLS_FILTERED } from '.. 
include { MINIMAP2_INDEX } from '../../modules/nf-core/minimap2/index' include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align' include { SAMTOOLS_VIEW as SAMTOOLS_FILTER_MAPPED } from '../../modules/nf-core/samtools/view' - -include { RSEQC_READDISTRIBUTION } from '../../modules/nf-core/rseqc/readdistribution/main' -include { NANOCOMP } from '../../modules/nf-core/nanocomp/main' +include { RSEQC_READDISTRIBUTION } from '../../modules/nf-core/rseqc/readdistribution/main' +include { NANOCOMP } from '../../modules/nf-core/nanocomp/main' workflow ALIGN_LONGREADS { @@ -59,6 +58,8 @@ workflow ALIGN_LONGREADS { // // SUBWORKFLOW: BAM_SORT_STATS_SAMTOOLS // The subworkflow is called in both the minimap2 bams and filtered (mapped only) version + // TODO: No reason that this is again sorting and indexing. + // Change to STATS_SAMTOOLS BAM_SORT_STATS_SAMTOOLS ( MINIMAP2_ALIGN.out.bam, fasta ) ch_versions = ch_versions.mix(BAM_SORT_STATS_SAMTOOLS.out.versions) diff --git a/subworkflows/local/dedup_umis.nf b/subworkflows/local/dedup_umis.nf index ebc9e726..7ef2a4da 100644 --- a/subworkflows/local/dedup_umis.nf +++ b/subworkflows/local/dedup_umis.nf @@ -48,19 +48,14 @@ workflow DEDUP_UMIS { BAMTOOLS_SPLIT ( in_bam ) ch_versions = ch_versions.mix(BAMTOOLS_SPLIT.out.versions.first()) ch_split_bam = BAMTOOLS_SPLIT.out.bam - .map{ + .flatMap{ meta, bam -> - [bam] - } - .flatten() - .map{ - bam -> - def bam_basename = bam.toString().split('/')[-1] - def split_bam_basename = bam_basename.split(/\./) - def new_meta = [ - 'id': split_bam_basename.take(split_bam_basename.size()-1).join("."), - ] - [ new_meta, bam ] + def bamList = bam instanceof List ? 
bam : [bam] + bamList.collect { b -> + def bam_basename = b.toString().split('/')[-1] + def split_bam_basename = bam_basename.split(/\./) + [ meta + [ 'id': split_bam_basename.take(split_bam_basename.size()-1).join(".") ], b ] + } } } else { @@ -151,7 +146,7 @@ workflow DEDUP_UMIS { meta, bam -> def bam_basename = bam.toString().split('/')[-1] def split_bam_basename = bam_basename.split(/\./) - def new_meta = [ 'id': split_bam_basename[0] ] + def new_meta = meta + [ 'id': split_bam_basename[0] ] [ new_meta, bam ] } .groupTuple(), diff --git a/subworkflows/local/demultiplex_blaze.nf b/subworkflows/local/demultiplex_blaze.nf new file mode 100644 index 00000000..b60476e2 --- /dev/null +++ b/subworkflows/local/demultiplex_blaze.nf @@ -0,0 +1,134 @@ +// +// Performs demultiplexing using BLAZE +// + +// MODULES +include { BLAZE } from '../../modules/local/blaze' +include { PREEXTRACT_FASTQ } from '../../modules/local/preextract_fastq' +include { CORRECT_BARCODES } from '../../modules/local/correct_barcodes' +include { SPLIT_FILE as SPLIT_FILE_BC_FASTQ } from "../../modules/local/split_file" +include { SPLIT_FILE as SPLIT_FILE_BC_CSV } from "../../modules/local/split_file" +include { CAT_CAT as CAT_CAT_PREEXTRACT } from "../../modules/nf-core/cat/cat/main" +include { CAT_CAT as CAT_CAT_BARCODE } from "../../modules/nf-core/cat/cat/main" +include { PIGZ_COMPRESS } from '../../modules/nf-core/pigz/compress/main' +include { PIGZ_UNCOMPRESS as PIGZ_UNCOMPRESS_BC } from "../../modules/nf-core/pigz/uncompress/main" +include { PIGZ_UNCOMPRESS as PIGZ_UNCOMPRESS_FASTQ } from "../../modules/nf-core/pigz/uncompress/main" + +workflow DEMULTIPLEX_BLAZE { + take: + ch_trimmed_reads_combined // channel: [ val(meta), path(trimmed_reads_combined) ] + whitelist // channel: [ val(meta), path(whitelist) ] + + main: + ch_versions = Channel.empty() + ch_extracted_fastq = Channel.empty() + ch_corrected_bc_info = Channel.empty() + + // + // MODULE: Uncompress fastq.gz + // + + // + // MODULE: 
Generate whitelist + // + + PIGZ_UNCOMPRESS_FASTQ ( ch_trimmed_reads_combined ) + + ch_versions = ch_versions.mix(PIGZ_UNCOMPRESS_FASTQ.out.versions) + ch_trimmed_reads_combined_fastq = PIGZ_UNCOMPRESS_FASTQ.out.file + + // + // MODULE: Unzip whitelist + // + + // Unzip the whitelist if needed + if (whitelist.extension == "gz"){ + + PIGZ_UNCOMPRESS_BC ( [[:], whitelist] ) + + ch_whitelist = + PIGZ_UNCOMPRESS_BC.out.file + .map { + meta, whitelist -> + [whitelist] + } + + ch_versions = ch_versions.mix(PIGZ_UNCOMPRESS_BC.out.versions) + } else { + ch_whitelist = whitelist + } + + BLAZE ( ch_trimmed_reads_combined_fastq, ch_whitelist ) + + ch_putative_bc = BLAZE.out.putative_bc + ch_gt_whitelist = BLAZE.out.whitelist + ch_whitelist_bc_count = BLAZE.out.bc_count + ch_versions = ch_versions.mix(BLAZE.out.versions) + + ch_split_bc_fastqs = ch_trimmed_reads_combined_fastq + ch_split_bc = ch_putative_bc + if (params.split_amount > 0) { + SPLIT_FILE_BC_FASTQ( ch_trimmed_reads_combined_fastq, '.fastq', params.split_amount * 4 ) + + SPLIT_FILE_BC_FASTQ.out.split_files + .transpose() + .set { ch_split_bc_fastqs } + + ch_versions = ch_versions.mix(SPLIT_FILE_BC_FASTQ.out.versions) + + SPLIT_FILE_BC_CSV ( ch_putative_bc, '.csv', (params.split_amount ) ) + SPLIT_FILE_BC_CSV.out.split_files + .transpose() + .set { ch_split_bc } + } + + // + // MODULE: Extract barcodes + // + + PREEXTRACT_FASTQ( ch_split_bc_fastqs.join(ch_split_bc) ) + ch_barcode_info = PREEXTRACT_FASTQ.out.barcode_info + ch_preextract_fastq = PREEXTRACT_FASTQ.out.extracted_fastq + + // + // MODULE: Correct Barcodes + // + + CORRECT_BARCODES ( + ch_barcode_info + .combine ( ch_gt_whitelist, by: 0) + .combine ( ch_whitelist_bc_count, by: 0 ) + ) + ch_corrected_bc_file = CORRECT_BARCODES.out.corrected_bc_info + ch_versions = ch_versions.mix(CORRECT_BARCODES.out.versions) + + ch_extracted_fastq = ch_preextract_fastq + ch_corrected_bc_info = ch_corrected_bc_file + + if (params.split_amount > 0){ + // + // MODULE: Cat 
Preextract + // + CAT_CAT_PREEXTRACT(ch_preextract_fastq.groupTuple()) + ch_cat_preextract_fastq = CAT_CAT_PREEXTRACT.out.file_out + + // + // MODULE: Cat barcode file + // + CAT_CAT_BARCODE (ch_corrected_bc_file.groupTuple()) + ch_corrected_bc_info = CAT_CAT_BARCODE.out.file_out + + // + // MODULE: Zip the reads + // + PIGZ_COMPRESS (ch_cat_preextract_fastq ) + ch_extracted_fastq = PIGZ_COMPRESS.out.archive + ch_versions = ch_versions.mix(PIGZ_COMPRESS.out.versions) + } + emit: + // Versions + versions = ch_versions + + extracted_fastq = ch_extracted_fastq + corrected_bc_info = ch_corrected_bc_info +} diff --git a/subworkflows/local/demultiplex_flexiplex.nf b/subworkflows/local/demultiplex_flexiplex.nf new file mode 100644 index 00000000..e17eafc3 --- /dev/null +++ b/subworkflows/local/demultiplex_flexiplex.nf @@ -0,0 +1,149 @@ +// +// Creates gtfs to that add introns as features +// + +include { PIGZ_COMPRESS } from '../../modules/nf-core/pigz/compress/main' +include { PIGZ_UNCOMPRESS } from '../../modules/nf-core/pigz/uncompress/main' +include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main' +include { FLEXIPLEX_DISCOVERY } from '../../modules/local/flexiplex/discovery/main' +include { FLEXIPLEX_FILTER } from '../../modules/local/flexiplex/filter/main' +include { FLEXIPLEX_ASSIGN } from '../../modules/local/flexiplex/assign/main' +include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq/main' +include { MERGEBARCODECOUNTS as MERGE_BARCODES } from '../../modules/local/mergebarcodecounts/main' + +workflow DEMULTIPLEX_FLEXIPLEX { + take: + reads + whitelist + + main: + ch_versions = Channel.empty() + ch_flexiplex_fastq = Channel.empty() + ch_flexiplex_barcodes = Channel.empty() + + // Unzip the whitelist if needed + if (whitelist.extension == "gz"){ + + PIGZ_UNCOMPRESS ( [[:], whitelist] ) + + ch_whitelist = + PIGZ_UNCOMPRESS.out.file + .map { + meta, whitelist -> + [whitelist] + } + + ch_versions = ch_versions.mix(PIGZ_UNCOMPRESS.out.versions) 
+ } else { + ch_whitelist = whitelist + } + + + flexiplex_input = reads + if (params.split_amount > 0) { + // + // MODULE: SPLIT2: Split reads into parts + // + SEQKIT_SPLIT2 ( + reads + ) + + ch_versions = SEQKIT_SPLIT2.out.versions + + // Transpose channel and add part to metadata + flexiplex_input = SEQKIT_SPLIT2.out.reads + .map { meta, reads -> + newmeta = [splitcount: reads.size()] + [meta + newmeta, reads] } + .transpose() + .map { meta , reads -> + part = (reads =~ /.*part_(\d+)\.fastq(?:\.gz)?$/)[0][1] + newmeta = [part: part] + [meta + newmeta, reads] } + } + + // + // MODULE: Run flexiplex + // + + FLEXIPLEX_DISCOVERY ( + flexiplex_input + ) + + ch_versions = ch_versions.mix(FLEXIPLEX_DISCOVERY.out.versions) + + + // + // Merge barcode counts if split + // + ch_barcodes = FLEXIPLEX_DISCOVERY.out.barcode_counts + if (params.split_amount > 0 ) { + + ch_flexiplex_barcodes = FLEXIPLEX_DISCOVERY.out.barcode_counts + .map { meta, barcode_counts -> + key = groupKey(meta.subMap('id', 'single_end', 'cell_counts', 'type'), meta.splitcount) + [key, barcode_counts] } + .groupTuple() + + MERGE_BARCODES ( + ch_flexiplex_barcodes + ) + + ch_versions = ch_versions.mix(MERGE_BARCODES.out.versions) + ch_barcodes = MERGE_BARCODES.out.barcode_counts + } + + // + // MODULE: Filter flexiplex + // + + FLEXIPLEX_FILTER ( + ch_barcodes, + ch_whitelist + ) + + ch_versions = ch_versions.mix(FLEXIPLEX_FILTER.out.versions) + ch_corrected_bc_info = FLEXIPLEX_FILTER.out.barcodes + + // Merge the reads and barcodes channels + flexiplex_input_barcodes = flexiplex_input + .combine(ch_corrected_bc_info) + .map { meta, reads, meta2, barcodes -> { + meta.id == meta2.id ? 
[meta, reads, barcodes] : null } + } + + // + // MODULE: Assign flexiplex + // + + FLEXIPLEX_ASSIGN ( + flexiplex_input_barcodes, + ) + + ch_versions = ch_versions.mix(FLEXIPLEX_ASSIGN.out.versions) + + ch_flexiplex_fastq = FLEXIPLEX_ASSIGN.out.reads + if (params.split_amount > 0) { + // + // MODULE: cat fastq + // + + ch_grouped_flexiplex_fastq = FLEXIPLEX_ASSIGN.out.reads + .map { meta, reads -> + key = groupKey(meta.subMap('id', 'single_end', 'cell_counts', 'type'), meta.splitcount) + [key, reads] } + .groupTuple() + + CAT_FASTQ ( + ch_grouped_flexiplex_fastq + ) + + ch_flexiplex_fastq = CAT_FASTQ.out.reads + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) + } + + emit: + flexiplex_fastq = ch_flexiplex_fastq + flexiplex_barcodes = ch_corrected_bc_info + versions = ch_versions +} diff --git a/subworkflows/local/process_longread_scrna.nf b/subworkflows/local/process_longread_scrna.nf index f879364d..cb4c90f6 100644 --- a/subworkflows/local/process_longread_scrna.nf +++ b/subworkflows/local/process_longread_scrna.nf @@ -9,14 +9,14 @@ include { QUANTIFY_SCRNA_OARFISH } from '../../subworkflows/local/quantify_scrn include { DEDUP_UMIS } from '../../subworkflows/local/dedup_umis' // MODULES +include { TAG_BARCODES } from '../../modules/local/tag_barcodes' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index' include { PICARD_MARKDUPLICATES } from '../../modules/nf-core/picard/markduplicates' include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_TAGGED } from '../../modules/nf-core/samtools/flagstat' include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_DEDUP } from '../../modules/nf-core/samtools/flagstat' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_TAGGED } from '../../modules/nf-core/samtools/index' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_DEDUP } from '../../modules/nf-core/samtools/index' include { SAMTOOLS_VIEW as SAMTOOLS_FILTER_DEDUP } from '../../modules/nf-core/samtools/view' - -include { TAG_BARCODES } from '../../modules/local/tag_barcodes' 
+include { FLEXIFORMATTER } from '../../modules/local/flexiformatter' workflow PROCESS_LONGREAD_SCRNA { @@ -62,26 +62,43 @@ workflow PROCESS_LONGREAD_SCRNA { // // MODULE: Tag Barcodes // + ch_tagged_bam = Channel.empty() + ch_tagged_bai = Channel.empty() - TAG_BARCODES ( - ALIGN_LONGREADS.out.sorted_bam - .join( ALIGN_LONGREADS.out.sorted_bai, by: 0 ) - .join( read_bc_info, by: 0) - ) - ch_versions = ch_versions.mix(TAG_BARCODES.out.versions) + if (params.demux_tool_cdna == "flexiplex") { + FLEXIFORMATTER ( + ALIGN_LONGREADS.out.sorted_bam, + "bai" + ) - // - // MODULE: Index Tagged Bam - // - SAMTOOLS_INDEX_TAGGED ( TAG_BARCODES.out.tagged_bam ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX_TAGGED.out.versions) + ch_versions = ch_versions.mix(FLEXIFORMATTER.out.versions) + ch_tagged_bam = FLEXIFORMATTER.out.bam + ch_tagged_bai = FLEXIFORMATTER.out.bai + + + } else if (params.demux_tool_cdna == "blaze") { + TAG_BARCODES ( + ALIGN_LONGREADS.out.sorted_bam + .join( ALIGN_LONGREADS.out.sorted_bai, by: 0 ) + .join( read_bc_info, by: 0) + ) + ch_versions = ch_versions.mix(TAG_BARCODES.out.versions) + + // + // MODULE: Index Tagged Bam + // + SAMTOOLS_INDEX ( TAG_BARCODES.out.tagged_bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions) + + ch_tagged_bam = TAG_BARCODES.out.tagged_bam + ch_tagged_bai = SAMTOOLS_INDEX.out.bai + } // // MODULE: Flagstat Tagged Bam // SAMTOOLS_FLAGSTAT_TAGGED ( - TAG_BARCODES.out.tagged_bam - .join( SAMTOOLS_INDEX_TAGGED.out.bai, by: [0]) + ch_tagged_bam.join(ch_tagged_bai) ) ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT_TAGGED.out.versions) @@ -95,8 +112,8 @@ workflow PROCESS_LONGREAD_SCRNA { fasta, fai, gtf, - TAG_BARCODES.out.tagged_bam, - SAMTOOLS_INDEX_TAGGED.out.bai, + ch_tagged_bam, + ch_tagged_bai, true, // Used to split the bam genome_aligned, dedup_tool, @@ -109,12 +126,12 @@ workflow PROCESS_LONGREAD_SCRNA { ch_versions = DEDUP_UMIS.out.versions } else { - ch_bam = TAG_BARCODES.out.tagged_bam - ch_bai = 
SAMTOOLS_INDEX_TAGGED.out.bai + ch_bam = ch_tagged_bam + ch_bai = ch_tagged_bai ch_flagstat = SAMTOOLS_FLAGSTAT_TAGGED.out.flagstat .map{ meta, flagstat -> - id = ['id': meta.id] + id = meta [id, flagstat] } @@ -170,8 +187,8 @@ workflow PROCESS_LONGREAD_SCRNA { minimap_nanocomp_bam_txt = ALIGN_LONGREADS.out.nanocomp_bam_txt // Barcode tagging results + qc's - bc_tagged_bam = TAG_BARCODES.out.tagged_bam - bc_tagged_bai = SAMTOOLS_INDEX_TAGGED.out.bai + bc_tagged_bam = ch_tagged_bam + bc_tagged_bai = ch_tagged_bai bc_tagged_flagstat = SAMTOOLS_FLAGSTAT_TAGGED.out.flagstat // Deduplication results diff --git a/subworkflows/local/quantify_scrna_isoquant.nf b/subworkflows/local/quantify_scrna_isoquant.nf index ccc89360..af96b04e 100644 --- a/subworkflows/local/quantify_scrna_isoquant.nf +++ b/subworkflows/local/quantify_scrna_isoquant.nf @@ -2,16 +2,9 @@ // Performs feature quantification for long read single-cell rna data // -include { BAMTOOLS_SPLIT } from '../../modules/nf-core/bamtools/split/main' include { ISOQUANT } from '../../modules/local/isoquant' -include { MERGE_MTX as MERGE_MTX_GENE } from '../../modules/local/merge_mtx' -include { MERGE_MTX as MERGE_MTX_TRANSCRIPT } from '../../modules/local/merge_mtx' include { QC_SCRNA as QC_SCRNA_GENE } from '../../subworkflows/local/qc_scrna' include { QC_SCRNA as QC_SCRNA_TRANSCRIPT } from '../../subworkflows/local/qc_scrna' -include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX_SPLIT } from '../../modules/nf-core/samtools/faidx/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_SPLIT } from '../../modules/nf-core/samtools/index/main' -include { SPLIT_GTF } from '../../modules/local/split_gtf' -include { SPLIT_FASTA } from '../../modules/local/split_fasta' workflow QUANTIFY_SCRNA_ISOQUANT { take: @@ -27,123 +20,45 @@ workflow QUANTIFY_SCRNA_ISOQUANT { main: ch_versions = Channel.empty() - // - // MODULE: Split the FASTA - // - SPLIT_FASTA( in_fasta ) - ch_versions = ch_versions.mix(SPLIT_FASTA.out.versions) - ch_split_fasta = 
SPLIT_FASTA.out.split_fasta - .flatten() - .map{ - fasta -> - def fasta_basename = fasta.toString().split('/')[-1] - def new_meta = [ 'chr': fasta_basename.split(/\./)[0] ] - [ new_meta, fasta ] - } - - SAMTOOLS_FAIDX_SPLIT( ch_split_fasta, [ [:], "$projectDir/assets/dummy_file.txt" ]) - ch_split_fai = SAMTOOLS_FAIDX_SPLIT.out.fai - ch_versions = ch_versions.mix(SAMTOOLS_FAIDX_SPLIT.out.versions) - - // - // MODULE: Split the GTF - // - SPLIT_GTF( in_gtf ) - ch_split_gtf = SPLIT_GTF.out.split_gtf - .flatten() - .map{ - gtf -> - def gtf_basename = gtf.toString().split('/')[-1] - def new_meta = ['chr': gtf_basename.split(/\./)[0]] - [ new_meta, gtf ] - } - ch_versions = ch_versions.mix(SPLIT_GTF.out.versions) - - // - // MODULE: Bamtools split - // - BAMTOOLS_SPLIT ( in_bam ) - ch_versions = ch_versions.mix(BAMTOOLS_SPLIT.out.versions.first()) - - ch_split_bam = BAMTOOLS_SPLIT.out.bam - .map { - meta, bam -> - return [ bam ] - } - .flatten() - .map { bam -> - def bam_basename = bam.toString().split('/')[-1] - - def chrom = bam_basename.split(/\./)[1].replace("REF_", "") - - def split_bam_basename = bam_basename.split(/\./) - def new_id = split_bam_basename.take(split_bam_basename.size()-2).join(".") - - def new_meta = ['id': new_id, 'chr': chrom] - return [ new_meta, bam ] - } - - // - // MODULE: Samtools Index - // - SAMTOOLS_INDEX_SPLIT( ch_split_bam ) - ch_split_bai = SAMTOOLS_INDEX_SPLIT.out.bai - ch_versions = ch_versions.mix(SAMTOOLS_INDEX_SPLIT.out.versions.first()) - // Prepare isoquant input channel - // bam and bai files need to be joined with split fasta, fai and gtf files - isoquant_input = ch_split_bam - .join(ch_split_bai, by: [0]) - .map { meta, bam, bai -> - def chrom = [ 'chr': meta.chr ] - [ chrom, meta, bam, bai ] - } - .combine(ch_split_fasta, by: [0]) - .combine(ch_split_fai, by: [0]) - .combine(ch_split_gtf, by: [0]) - .map { chrom, meta, bam, bai, fasta, fai, gtf -> + // Run IsoQuant once per sample on full BAM + full reference files. 
+ ch_fasta = in_fasta.map { meta, fasta -> fasta } + ch_fai = in_fai.map { meta, fai -> fai } + ch_gtf = in_gtf.map { meta, gtf -> gtf } + + isoquant_input = in_bam + .join(in_bai, by: [0]) + .combine(ch_fasta) + .combine(ch_fai) + .combine(ch_gtf) + .map { meta, bam, bai, fasta, fai, gtf -> [ meta, bam, bai, fasta, fai, gtf ] } // // MODULE: Isoquant // + ISOQUANT ( isoquant_input, 'tag:CB' ) ch_versions = ch_versions.mix(ISOQUANT.out.versions) - // - // MODULE: Merge Matrix - // - ch_split_gene_mtx = ISOQUANT.out.grouped_gene_counts + // Use IsoQuant grouped outputs directly (single run per sample, no split/merge). + ch_merged_gene_mtx = ISOQUANT.out.grouped_gene_counts .map { meta, gene_mtx -> - def new_meta = [ 'id': meta.id ] + def new_meta = meta return [ new_meta, gene_mtx ] } - .groupTuple() - MERGE_MTX_GENE ( - ch_split_gene_mtx - ) - ch_merged_gene_mtx = MERGE_MTX_GENE.out.merged_mtx - ch_versions = ch_versions.mix(MERGE_MTX_GENE.out.versions) - - ch_split_transcript_mtx = ISOQUANT.out.grouped_transcript_counts + ch_merged_transcript_mtx = ISOQUANT.out.grouped_transcript_counts .map { meta, transcript_mtx -> - def new_meta = [ 'id': meta.id ] + def new_meta = meta return [ new_meta, transcript_mtx ] } - .groupTuple() - - MERGE_MTX_TRANSCRIPT ( - ch_split_transcript_mtx - ) - ch_merged_transcript_mtx = MERGE_MTX_TRANSCRIPT.out.merged_mtx - ch_versions = ch_versions.mix(MERGE_MTX_TRANSCRIPT.out.versions) ch_gene_qc_stats = Channel.empty() ch_transcript_qc_stats = Channel.empty() diff --git a/subworkflows/local/quantify_scrna_oarfish.nf b/subworkflows/local/quantify_scrna_oarfish.nf index 681a7207..5be5aa47 100644 --- a/subworkflows/local/quantify_scrna_oarfish.nf +++ b/subworkflows/local/quantify_scrna_oarfish.nf @@ -38,7 +38,7 @@ workflow QUANTIFY_SCRNA_OARFISH { .join(OARFISH.out.mtx, by: [0]) .map{ meta,features,barcodes,mtx -> - new_meta = [ 'id' : meta.id ] + new_meta = meta [ new_meta, [ features, barcodes, mtx ]] }, in_flagstat, diff --git 
a/subworkflows/local/utils_nfcore_scnanoseq_pipeline/main.nf b/subworkflows/local/utils_nfcore_scnanoseq_pipeline/main.nf index cc0b9ecd..aab57401 100644 --- a/subworkflows/local/utils_nfcore_scnanoseq_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_scnanoseq_pipeline/main.nf @@ -75,8 +75,8 @@ workflow PIPELINE_INITIALISATION { Channel .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) .map{ - meta, fastq, cell_count_val -> - return [ meta.id, meta + [ single_end:true, cell_count: cell_count_val ], [ fastq ] ] + meta, fastq, cell_count_val, type -> + return [ [meta.id, type], meta + [ single_end:true, cell_count: cell_count_val, type: type ], [ fastq ] ] } .groupTuple() .map { samplesheet -> @@ -152,12 +152,20 @@ workflow PIPELINE_COMPLETION { def validateInputParameters() { genomeExistsError() - if ((params.quantifier.equals('isoquant') || params.quantifier.equals('both')) && !params.genome_fasta) { + // Only require a quantifier when cDNA input is present. + def has_cdna_input = samplesheetToList(params.input, "${projectDir}/assets/schema_input.json") + .any { row -> row[3].toString().equalsIgnoreCase('cdna') } + + if (has_cdna_input && !params.quantifier) { + error("Input contains cDNA reads but --quantifier was not provided. 
Please set --quantifier to one or more of: isoquant,oarfish") + } + + if (params.quantifier && (params.quantifier.equals('isoquant') || params.quantifier.equals('both')) && !params.genome_fasta) { def error_string = "In order to quantify with isoquant, a genome fasta must be provided" error(error_string) } - if ((params.quantifier.equals('oarfish') || params.quantifier.equals('both')) && !params.transcript_fasta) { + if (params.quantifier && (params.quantifier.equals('oarfish') || params.quantifier.equals('both')) && !params.transcript_fasta) { def error_string = "In order to quantify with oarfish, a transcript fasta must be provided" error(error_string) } diff --git a/workflows/scnanoseq.nf b/workflows/scnanoseq.nf index 6b793c02..6b18044a 100644 --- a/workflows/scnanoseq.nf +++ b/workflows/scnanoseq.nf @@ -5,22 +5,36 @@ */ // Whitelist -if (params.whitelist) { - blaze_whitelist = params.whitelist +if (params.cdna_whitelist) { + cdna_whitelist = file(params.cdna_whitelist) } else { if (params.barcode_format.equals("10X_3v3")) { - blaze_whitelist = file("$baseDir/assets/whitelist/3M-february-2018.zip") + cdna_whitelist = file("$baseDir/assets/whitelist/3M-february-2018.txt.gz") } else if (params.barcode_format.equals("10X_5v2")) { - blaze_whitelist = file("$baseDir/assets/whitelist/737K-august-2016.txt.zip") + cdna_whitelist = file("$baseDir/assets/whitelist/737K-august-2016.txt.gz") } else if (params.barcode_format.equals("10X_3v4")) { - blaze_whitelist = file("$baseDir/assets/whitelist/3M-3pgex-may-2023_TRU.txt.zip") + cdna_whitelist = file("$baseDir/assets/whitelist/3M-3pgex-may-2023_TRU.txt.gz") } else if (params.barcode_format.equals("10X_5v3")) { - blaze_whitelist = file("$baseDir/assets/whitelist/3M-5pgex-jan-2023.txt.zip") + cdna_whitelist = file("$baseDir/assets/whitelist/3M-5pgex-jan-2023.txt.gz") } + else if (params.barcode_format.equals("10X_multiome")) { + cdna_whitelist = file("$baseDir/assets/whitelist/cellranger_arc_rna.737K-arc-v1.txt.gz") + } else 
{ + cdna_whitelist = [] + } +} + +if (params.dna_whitelist) { + dna_whitelist = file(params.dna_whitelist) +} +else if (params.barcode_format.equals("10X_multiome")) { + dna_whitelist = file("$baseDir/assets/whitelist/cellranger_arc_atac.737K-arc-v1.txt.gz") +} else { + dna_whitelist = [] } // Quantifiers @@ -31,7 +45,7 @@ TRANSCRIPT_QUANT_OPTS = [ 'oarfish' ] genome_quants = [] transcript_quants = [] -for (quantifier in params.quantifier.split(',')) { +for (quantifier in (params.quantifier ? params.quantifier.split(',') : [])) { if (quantifier in GENOME_QUANT_OPTS) { genome_quants.add(quantifier) } @@ -65,9 +79,6 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? f include { NANOFILT } from "../modules/local/nanofilt" include { SPLIT_FILE } from "../modules/local/split_file" -include { SPLIT_FILE as SPLIT_FILE_BC_FASTQ } from "../modules/local/split_file" -include { SPLIT_FILE as SPLIT_FILE_BC_CSV } from "../modules/local/split_file" -include { BLAZE } from "../modules/local/blaze" include { PREEXTRACT_FASTQ } from "../modules/local/preextract_fastq.nf" include { READ_COUNTS } from "../modules/local/read_counts.nf" include { CORRECT_BARCODES } from "../modules/local/correct_barcodes" @@ -77,10 +88,14 @@ include { UCSC_GENEPREDTOBED } from "../modules/local/ucsc_genepr // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // + include { PREPARE_REFERENCE_FILES } from "../subworkflows/local/prepare_reference_files" +include { DEMULTIPLEX_FLEXIPLEX as DEMULTIPLEX_FLEXIPLEX_CDNA } from "../subworkflows/local/demultiplex_flexiplex" +include { DEMULTIPLEX_FLEXIPLEX as DEMULTIPLEX_FLEXIPLEX_DNA } from "../subworkflows/local/demultiplex_flexiplex" +include { DEMULTIPLEX_BLAZE } from "../subworkflows/local/demultiplex_blaze" include { PROCESS_LONGREAD_SCRNA as PROCESS_LONGREAD_SCRNA_GENOME } from "../subworkflows/local/process_longread_scrna" include { PROCESS_LONGREAD_SCRNA as PROCESS_LONGREAD_SCRNA_TRANSCRIPT } from 
"../subworkflows/local/process_longread_scrna" - +include { ALIGN_DEDUPLICATE_DNA } from "../subworkflows/local/align_deduplicate_dna" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -89,22 +104,21 @@ include { PROCESS_LONGREAD_SCRNA as PROCESS_LONGREAD_SCRNA_TRANSCRIPT } from ".. // // MODULE: Installed directly from nf-core/modules // -include { PIGZ_UNCOMPRESS as GUNZIP_FASTQ } from "../modules/nf-core/pigz/uncompress/main" -include { PIGZ_UNCOMPRESS as GUNZIP_WHITELIST } from "../modules/nf-core/pigz/uncompress/main" -include { PIGZ_COMPRESS } from "../modules/nf-core/pigz/compress/main" -include { NANOCOMP as NANOCOMP_FASTQ } from "../modules/nf-core/nanocomp/main" + +include { NANOCOMP as NANOCOMP_FASTQ_CDNA } from "../modules/nf-core/nanocomp/main" +include { NANOCOMP as NANOCOMP_FASTQ_DNA } from "../modules/nf-core/nanocomp/main" +include { CHOPPER } from "../modules/nf-core/chopper/main" include { MULTIQC as MULTIQC_RAWQC } from "../modules/nf-core/multiqc/main" include { MULTIQC as MULTIQC_FINALQC } from "../modules/nf-core/multiqc/main" include { CUSTOM_DUMPSOFTWAREVERSIONS } from "../modules/nf-core/custom/dumpsoftwareversions/main" include { CAT_CAT } from "../modules/nf-core/cat/cat/main" -include { CAT_CAT as CAT_CAT_PREEXTRACT } from "../modules/nf-core/cat/cat/main" -include { CAT_CAT as CAT_CAT_BARCODE } from "../modules/nf-core/cat/cat/main" include { CAT_FASTQ } from "../modules/nf-core/cat/fastq/main" include { paramsSummaryMap } from "plugin/nf-schema" /* * SUBWORKFLOW: Consisting entirely of nf-core/subworkflows */ + include { QCFASTQ_NANOPLOT_FASTQC as FASTQC_NANOPLOT_PRE_TRIM } from "../subworkflows/nf-core/qcfastq_nanoplot_fastqc" include { QCFASTQ_NANOPLOT_FASTQC as FASTQC_NANOPLOT_POST_TRIM } from "../subworkflows/nf-core/qcfastq_nanoplot_fastqc" include { QCFASTQ_NANOPLOT_FASTQC as FASTQC_NANOPLOT_POST_EXTRACT } from 
"../subworkflows/nf-core/qcfastq_nanoplot_fastqc" @@ -112,7 +126,6 @@ include { paramsSummaryMultiqc } from ". include { softwareVersionsToYAML } from "../subworkflows/nf-core/utils_nfcore_pipeline" include { methodsDescriptionText } from "../subworkflows/local/utils_nfcore_scnanoseq_pipeline" - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -131,8 +144,9 @@ workflow SCNANOSEQ { // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // + ch_samplesheet - .branch{ + .branch { meta, fastq -> single: fastq.size() == 1 return [ meta, fastq.flatten() ] @@ -144,6 +158,7 @@ workflow SCNANOSEQ { // // MODULE: Combine fastqs from the same sample // + CAT_FASTQ ( ch_fastqs.multiple ) .reads .mix ( ch_fastqs.single ) @@ -176,18 +191,29 @@ workflow SCNANOSEQ { ch_nanocomp_fastq_txt = Channel.empty() if (!params.skip_qc && !params.skip_fastq_nanocomp) { - NANOCOMP_FASTQ ( + NANOCOMP_FASTQ_CDNA ( + ch_cat_fastq + .filter{ meta, fastq -> meta.type == 'cdna' } + .collect{it[1]} + .map{ + [ [ 'id': 'cdna_fastq.', 'type': 'cdna' ] , it ] + } + ) + + NANOCOMP_FASTQ_DNA ( ch_cat_fastq + .filter{ meta, fastq -> meta.type == 'dna' } .collect{it[1]} .map{ - [ [ 'id': 'nanocomp_fastq.' 
] , it ] + [ [ 'id': 'dna_fastq.', 'type': 'dna' ] , it ] } ) - ch_nanocomp_fastq_html = NANOCOMP_FASTQ.out.report_html - ch_nanocomp_fastq_txt = NANOCOMP_FASTQ.out.stats_txt + ch_nanocomp_fastq_html = NANOCOMP_FASTQ_CDNA.out.report_html.mix( NANOCOMP_FASTQ_DNA.out.report_html ) + ch_nanocomp_fastq_txt = NANOCOMP_FASTQ_CDNA.out.stats_txt.mix( NANOCOMP_FASTQ_DNA.out.stats_txt ) - ch_versions = ch_versions.mix( NANOCOMP_FASTQ.out.versions ) + ch_versions = ch_versions.mix( NANOCOMP_FASTQ_CDNA.out.versions ) + ch_versions = ch_versions.mix( NANOCOMP_FASTQ_DNA.out.versions ) } @@ -226,13 +252,6 @@ workflow SCNANOSEQ { ch_versions = ch_versions.mix(UCSC_GENEPREDTOBED.out.versions) } - // - // MODULE: Unzip fastq - // - GUNZIP_FASTQ( ch_cat_fastq ) - ch_unzipped_fastqs = GUNZIP_FASTQ.out.file - ch_versions = ch_versions.mix( GUNZIP_FASTQ.out.versions ) - // // MODULE: Trim and filter reads // @@ -240,37 +259,15 @@ workflow SCNANOSEQ { ch_trimmed_reads_combined = Channel.empty() if (!params.skip_trimming){ + // - // MODULE: Split fastq + // MODULE: Chopper // - ch_fastqs = ch_unzipped_fastqs - if (params.split_amount > 0) { - SPLIT_FILE( ch_unzipped_fastqs, '.fastq', params.split_amount ) + CHOPPER ( ch_cat_fastq, [] ) - // Temporarily change the meta object so that the id is present on the - // fastq to prevent duplicated names - SPLIT_FILE.out.split_files - .transpose() - .set { ch_fastqs } - - ch_versions = ch_versions.mix(SPLIT_FILE.out.versions) - } - - ch_trimmed_reads = ch_fastqs - if (!params.skip_trimming) { - - NANOFILT ( ch_fastqs ) - ch_trimmed_reads = NANOFILT.out.reads - ch_versions = ch_versions.mix(NANOFILT.out.versions) - } - - // If the fastqs were split, combine them together - ch_trimmed_reads_combined = ch_trimmed_reads - if (params.split_amount > 0){ - CAT_CAT(ch_trimmed_reads.groupTuple()) - ch_trimmed_reads_combined = CAT_CAT.out.file_out - } + versions = CHOPPER.out.versions + ch_trimmed_reads_combined = CHOPPER.out.fastq // // SUBWORKFLOW: Fastq 
QC with Nanoplot and FastQC - post-trim QC @@ -289,103 +286,78 @@ workflow SCNANOSEQ { ch_versions = ch_versions.mix(FASTQC_NANOPLOT_POST_TRIM.out.fastqc_version.first().ifEmpty(null)) } } else { - ch_trimmed_reads_combined = ch_unzipped_fastqs + ch_trimmed_reads_combined = ch_cat_fastq } - // - // MODULE: Unzip whitelist - // - - // NOTE: Blaze does not support '.gzip' - ch_blaze_whitelist = blaze_whitelist - - if (blaze_whitelist.endsWith('.gz')){ - - GUNZIP_WHITELIST ( [[:], blaze_whitelist ]) - ch_blaze_whitelist = - GUNZIP_WHITELIST.out.file - .map { - meta, whitelist -> - [whitelist] - } - - ch_versions = ch_versions.mix(GUNZIP_WHITELIST.out.versions) - } + // Branch channel to dna and cdna + ch_trimmed_reads_combined = ch_trimmed_reads_combined + .branch { + meta, fastq -> + dna: meta.type == 'dna' + return [ meta, fastq ] + cdna: meta.type == 'cdna' + return [ meta, fastq ] + } // - // MODULE: Generate whitelist + // SUBWORKFLOW: Demultiplex reads using FLEXIPLEX for DNA // - BLAZE ( ch_trimmed_reads_combined, ch_blaze_whitelist ) - - ch_putative_bc = BLAZE.out.putative_bc - ch_gt_whitelist = BLAZE.out.whitelist - ch_whitelist_bc_count = BLAZE.out.bc_count - ch_versions = ch_versions.mix(BLAZE.out.versions) - - ch_split_bc_fastqs = ch_trimmed_reads_combined - ch_split_bc = ch_putative_bc - if (params.split_amount > 0) { - SPLIT_FILE_BC_FASTQ( ch_trimmed_reads_combined, '.fastq', params.split_amount ) - - SPLIT_FILE_BC_FASTQ.out.split_files - .transpose() - .set { ch_split_bc_fastqs } - - ch_versions = ch_versions.mix(SPLIT_FILE_BC_FASTQ.out.versions) + ch_extracted_fastq_dna = Channel.empty() + ch_corrected_bc_info_dna = Channel.empty() + if (params.demux_tool_dna == "flexiplex") { + DEMULTIPLEX_FLEXIPLEX_DNA ( + ch_trimmed_reads_combined.dna, + dna_whitelist + ) - SPLIT_FILE_BC_CSV ( ch_putative_bc, '.csv', (params.split_amount / 4) ) - SPLIT_FILE_BC_CSV.out.split_files - .transpose() - .set { ch_split_bc } + ch_versions = 
ch_versions.mix(DEMULTIPLEX_FLEXIPLEX_DNA.out.versions) + ch_extracted_fastq_dna = DEMULTIPLEX_FLEXIPLEX_DNA.out.flexiplex_fastq + ch_corrected_bc_info_dna = DEMULTIPLEX_FLEXIPLEX_DNA.out.flexiplex_barcodes + } else if (params.demux_tool_dna == "blaze") { + error "Blaze demultiplexing is not currently supported for DNA reads. Please use flexiplex." } + ch_extracted_fastq_cdna = Channel.empty() + ch_corrected_bc_info_cdna = Channel.empty() + if (params.demux_tool_cdna == "flexiplex") { - // - // MODULE: Extract barcodes - // + // + // SUBWORKFLOW: Demultiplex reads using FLEXIPLEX for cDNA + // - PREEXTRACT_FASTQ( ch_split_bc_fastqs.join(ch_split_bc), params.barcode_format ) - ch_barcode_info = PREEXTRACT_FASTQ.out.barcode_info - ch_preextract_fastq = PREEXTRACT_FASTQ.out.extracted_fastq + DEMULTIPLEX_FLEXIPLEX_CDNA ( + ch_trimmed_reads_combined.cdna, + cdna_whitelist + ) - // - // MODULE: Correct Barcodes - // + ch_versions = ch_versions.mix(DEMULTIPLEX_FLEXIPLEX_CDNA.out.versions) - CORRECT_BARCODES ( - ch_barcode_info - .combine ( ch_gt_whitelist, by: 0) - .combine ( ch_whitelist_bc_count, by: 0 ) - ) - ch_corrected_bc_file = CORRECT_BARCODES.out.corrected_bc_info - ch_versions = ch_versions.mix(CORRECT_BARCODES.out.versions) + ch_extracted_fastq_cdna = DEMULTIPLEX_FLEXIPLEX_CDNA.out.flexiplex_fastq + ch_corrected_bc_info_cdna = DEMULTIPLEX_FLEXIPLEX_CDNA.out.flexiplex_barcodes - ch_extracted_fastq = ch_preextract_fastq - ch_corrected_bc_info = ch_corrected_bc_file + } else if (params.demux_tool_cdna == "blaze") { - if (params.split_amount > 0){ // - // MODULE: Cat Preextract + // SUBWORKFLOW: Demultiplex reads using BLAZE for cDNA // - CAT_CAT_PREEXTRACT(ch_preextract_fastq.groupTuple()) - ch_cat_preextract_fastq = CAT_CAT_PREEXTRACT.out.file_out - // - // MODULE: Cat barcode file - // - CAT_CAT_BARCODE (ch_corrected_bc_file.groupTuple()) - ch_corrected_bc_info = CAT_CAT_BARCODE.out.file_out + DEMULTIPLEX_BLAZE ( + ch_trimmed_reads_combined.cdna, + 
cdna_whitelist + ) - // - // MODULE: Zip the reads - // - PIGZ_COMPRESS (ch_cat_preextract_fastq ) - ch_extracted_fastq = PIGZ_COMPRESS.out.archive - ch_versions = ch_versions.mix(PIGZ_COMPRESS.out.versions) + ch_versions = ch_versions.mix(DEMULTIPLEX_BLAZE.out.versions) + + ch_extracted_fastq_cdna = DEMULTIPLEX_BLAZE.out.extracted_fastq + ch_corrected_bc_info_cdna = DEMULTIPLEX_BLAZE.out.corrected_bc_info } + // Recombine channels for QC modules + ch_extracted_fastq = ch_extracted_fastq_cdna.mix(ch_extracted_fastq_dna) + ch_corrected_bc_info = ch_corrected_bc_info_cdna.mix(ch_corrected_bc_info_dna) + // // SUBWORKFLOW: Fastq QC with Nanoplot and FastQC - post-extract QC // @@ -430,7 +402,7 @@ workflow SCNANOSEQ { } // - // SUBWORKFLOW: Align Long Read Data + // SUBWORKFLOW: Align Long Read cDNA data // ch_multiqc_finalqc_files = Channel.empty() @@ -440,9 +412,9 @@ workflow SCNANOSEQ { genome_fasta, genome_fai, gtf, - ch_extracted_fastq, + ch_extracted_fastq_cdna, ch_rseqc_bed, - ch_corrected_bc_info, + ch_corrected_bc_info_cdna, genome_quants, params.dedup_tool, true, // Used to indicate the bam is genome aligned @@ -499,9 +471,9 @@ workflow SCNANOSEQ { transcript_fasta, transcript_fai, gtf, - ch_extracted_fastq, + ch_extracted_fastq_cdna, ch_rseqc_bed, - ch_corrected_bc_info, + ch_corrected_bc_info_cdna, transcript_quants, params.dedup_tool, false, // Indicates this is NOT genome aligned @@ -544,6 +516,29 @@ workflow SCNANOSEQ { ) } + // + // SUBWORKFLOW: Align and deduplicate DNA samples + // + + ALIGN_DEDUPLICATE_DNA ( + genome_fasta, + genome_fai, + ch_extracted_fastq_dna, + params.skip_save_minimap2_index, + params.skip_qc, + params.skip_bam_nanocomp, + params.skip_dedup + ) + + ch_versions = ch_versions.mix(ALIGN_DEDUPLICATE_DNA.out.versions) + + ch_multiqc_finalqc_files = ch_multiqc_finalqc_files.mix( + ALIGN_DEDUPLICATE_DNA.out.flagstat.collect{it[1]}.ifEmpty([]) + ) + ch_multiqc_finalqc_files = ch_multiqc_finalqc_files.mix( + 
ALIGN_DEDUPLICATE_DNA.out.nanocomp_bam_txt.collect{it[1]}.ifEmpty([]) + ) + // // SOFTWARE_VERSIONS //