Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
035a0c7
dev: VEP chunk and VEP cache beegfs
migrau Apr 3, 2025
8ef2919
fix: use standard cache for ENSEMBLVEP_VEP
migrau May 8, 2025
40bb507
perf: improve VEP performance by converting input format
migrau May 14, 2025
bb21b25
fix: panel_postprocessing_annotation.py
migrau May 14, 2025
7c73d3b
fix: arguments safe_transform_context
migrau May 16, 2025
276152d
perf: chunking panel_custom_processing.py
migrau May 20, 2025
7bc3a16
perf: CREATECAPTUREDPANELS containers edited. create_panel_versions.p…
migrau May 22, 2025
346665d
fix: python3 container for CREATECAPTUREDPANELS
migrau Jun 4, 2025
08d8fad
fix: remove container option CREATECAPTUREDPANELS. fix conda versions…
migrau Jun 4, 2025
5c8ff55
fix: typo CREATECAPTUREDPANELS
migrau Jun 4, 2025
891ec85
fix: wave true only for CREATECAPTUREDPANELS
migrau Jun 4, 2025
e1fd6af
fix: syntax config module CREATECAPTUREDPANELS
migrau Jun 5, 2025
ca0ae01
fix: new way to specify wave for a single process
migrau Jun 5, 2025
5560c25
fix: toString added for wave
migrau Jun 5, 2025
c0c3e97
fix: wave label added
migrau Jun 5, 2025
24efcf6
fix: wave true for everything
migrau Jun 5, 2025
7734938
fix: wave false except CREATECAPTUREDPANELS
migrau Jun 5, 2025
b625332
fix: comma...
migrau Jun 5, 2025
8110a34
fix: wave removed. New container created
migrau Jun 5, 2025
e718e41
fix: Removed wave from nextflow.config
migrau Jun 6, 2025
9fd0ed7
fix: adjust memory requeriments
migrau Jun 30, 2025
abc85ed
perf: added new profile, nanoseq
migrau Jun 30, 2025
3e0b4b5
fix: naming withLabel config review
migrau Jul 1, 2025
61ec864
fix: nanoseq config resourceLimits
migrau Jul 1, 2025
0188172
fix: correct withName *
migrau Jul 1, 2025
b0e422a
fix: SITESFROMPOSITIONS memory test
migrau Jul 1, 2025
63dcea7
fix SITESFROMPOSITIONS
migrau Jul 1, 2025
7c2f56b
fix: SITESFROMPOSITIONS
migrau Jul 1, 2025
6e53f23
fix: fix profile
migrau Jul 1, 2025
e9d1b3b
fix: SITESFROMPOSITIONS config
migrau Jul 1, 2025
1dffd94
fix: POSTPROCESSVEPPANEL. Time
migrau Jul 2, 2025
24b170a
fix: RESOURCE LIMITS added
migrau Jul 3, 2025
d243ebc
fix: typo
migrau Jul 3, 2025
945c129
fix: update base.config
migrau Jul 3, 2025
198ff20
fix: adjust nanoconfig
migrau Jul 3, 2025
0cfd80f
Merge branch 'dev' into dev-chunk-optimization-POSTPROCESSVEPPANEL
migrau Nov 14, 2025
6c64f4d
fix: parallelization optional. Include sort for bedtools merge
migrau Nov 14, 2025
b2f12fd
fix: gene omega error: "No flagged entries found; skipping plots and …
migrau Nov 16, 2025
d4ed3c2
fix: Add debug logging and ensure failing_consensus file is always cr…
migrau Nov 18, 2025
4be3b45
feat: Add chunking support for SITESFROMPOSITIONS with genomic sorting
migrau Nov 19, 2025
e52cb76
feat: add parallel_processing_parameters section to schema for chunki…
migrau Nov 19, 2025
92580ce
update dnds genes list
FerriolCalvet Nov 21, 2025
2f9ea7a
Merge branch 'dev' into dev-chunk-only-parallel
FerriolCalvet Feb 27, 2026
a7eb77e
minor update
FerriolCalvet Feb 27, 2026
1a9e500
update sites from positions
FerriolCalvet Feb 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions bin/create_consensus_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
#####
# Filter failing columns only for rows that pass the compliance threshold
compliance_df_passing = compliance_df.filter(passing_rows)

print(f"DEBUG: Total positions passing compliance threshold: {compliance_df_passing.height}")
print(f"DEBUG: Number of samples: {compliance_df_passing.width}")

# Invert all boolean values (True → False, False → True)
failing_mask = pl.DataFrame([
Expand All @@ -67,6 +70,7 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
"Failed": True
})

print(f"DEBUG: Total failing entries found: {len(failing_columns_counts)}")

if failing_columns_counts:
failing_columns_counts_df = pl.DataFrame(failing_columns_counts)
Expand All @@ -76,6 +80,12 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
.rename({"count": "FAILING_COUNT"})
)
failure_counts_filtered.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
print(f"DEBUG: Created failing_consensus.{version}.tsv with {failure_counts_filtered.height} samples")
else:
# Create empty file with header for consistency
empty_df = pl.DataFrame({"SAMPLE_ID": [], "FAILING_COUNT": []}, schema={"SAMPLE_ID": pl.Utf8, "FAILING_COUNT": pl.Int64})
empty_df.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
print(f"DEBUG: No failures detected - created empty failing_consensus.{version}.tsv")


@click.command()
Expand Down
56 changes: 35 additions & 21 deletions bin/create_panel_versions.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
#!/usr/bin/env python
#!/usr/bin/env python3

"""
create_panel_versions.py

import click
import pandas as pd
import os
Generates multiple VEP annotation panel subsets based on the 'IMPACT' column
using the high-performance Polars library.

Usage:
python create_panel_versions.py --compact-annot-panel-path <input_tsv> --output <output_prefix>
"""

# TODO: check pandas version 2.0.3
# -- Auxiliary functions -- #
import polars as pl
import click
import sys

panel_impact_dict = {
PANEL_IMPACT_DICT = {

"protein_affecting": ["nonsense", "missense",
"essential_splice",
Expand Down Expand Up @@ -68,25 +74,33 @@

}

# -- Main function -- #

def create_panel_versions(compact_annot_panel_path, output_path):
def create_panel_versions(input_path: str, output_prefix: str) -> None:
"""
Generates panel subsets from a VEP-annotated file using Polars.

\b
INPUT_PATH: Path to the annotated TSV file.
OUTPUT_PREFIX: Prefix for the output files (e.g., 'output/panel').
"""
try:
df = pl.read_csv(input_path, separator="\t")
except Exception as e:
click.echo(f"Error reading input file: {e}", err=True)
sys.exit(1)

# Load VEP annotated panel, already compacted to have one variant per site
## requires column named IMPACT with consequence type
compact_annot_panel_df = pd.read_csv(compact_annot_panel_path, sep = "\t")
if "IMPACT" not in df.columns:
click.echo("ERROR: 'IMPACT' column not found in input file.", err=True)
sys.exit(1)

# Create panel versions
for version in panel_impact_dict:
for version_name, impact_values in PANEL_IMPACT_DICT.items():
filtered = df.filter(pl.col("IMPACT").is_in(impact_values))
filtered.write_csv(f"{output_prefix}.{version_name}.tsv", separator="\t")

panel_version = compact_annot_panel_df.loc[compact_annot_panel_df["IMPACT"].isin(panel_impact_dict[version])]
panel_version.to_csv(f"{output_path}.{version}.tsv",
sep = "\t", index = False)
# Write the full file as a version
df.write_csv(f"{output_prefix}.all.tsv", separator="\t")

# Store complete panel (better change this way of using this version in nextflow)
version = "all"
compact_annot_panel_df.to_csv(f"{output_path}.{version}.tsv",
sep = "\t", index = False)
click.echo("Panel versions generated successfully.")


@click.command()
Expand Down
2 changes: 1 addition & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ process {
// TODO nf-core: Check the defaults for all processes
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 15.min * task.attempt }
time = { 30.min * task.attempt }



Expand Down
3 changes: 3 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,9 @@ process {
: null
}

withName: SITESFROMPOSITIONS {
ext.chunk_size = params.panel_sites_chunk_size ?: 0
}
withLabel : deepcsa_core {
container = "docker.io/bbglab/deepcsa-core:0.1.0"
}
Expand Down
14 changes: 14 additions & 0 deletions conf/results_outputs.config
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,20 @@ process {
}

withName: POSTPROCESSVEPPANEL {
publishDir = [
enabled : false
]
}

withName: 'SITESFROMPOSITIONS' {
publishDir = [
path: { "${params.outdir}/regions/allsites" },
mode: params.publish_dir_mode,
pattern: 'captured_positions.sites4VEP.full.tsv',
]
}

withName: 'SORTPANELRICH|SORTPANELRICHALL' {
publishDir = [
path: { "${params.outdir}/regions/annotations" },
mode: params.publish_dir_mode,
Expand Down
11 changes: 6 additions & 5 deletions conf/tools/panels.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@ process {
withName: 'BBGTOOLS:DEEPCSA:CREATEPANELS:VCFANNOTATEPANEL:ENSEMBLVEP_VEP' {
ext.args = "${params.vep_params_panel} --tab"
publishDir = [
[
mode: params.publish_dir_mode,
path: { "${params.outdir}/regions/panelannotation" },
pattern: "*{gz}",
]
enabled : false
// [
// mode: params.publish_dir_mode,
// path: { "${params.outdir}/regions/panelannotation" },
// pattern: "*{gz}",
// ]
]
}

Expand Down
13 changes: 7 additions & 6 deletions modules/local/createpanels/captured/main.nf
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
process CREATECAPTUREDPANELS {
tag "$meta.id"
label 'process_single'
label 'process_medium_high_memory'

container "community.wave.seqera.io/library/bedtools_pybedtools_pandas_pip_pruned:78080da05d53636d"


conda "python=3.10.17 bioconda::pybedtools=0.12.0 conda-forge::polars=1.30.0 conda-forge::click=8.2.1 conda-forge::gcc_linux-64=15.1.0 conda-forge::gxx_linux-64=15.1.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'docker://bbglab/deepcsa_bed:latest' :
'bbglab/deepcsa_bed:latest' }"

input:
tuple val(meta), path(compact_captured_panel_annotation)

Expand Down Expand Up @@ -36,7 +36,8 @@ process CREATECAPTUREDPANELS {
bedtools merge \\
-i <(
tail -n +2 \$captured_panel | \\
awk -F'\\t' '{print \$1, \$2-1, \$2}' OFS='\\t' | uniq
awk -F'\\t' '{print \$1, \$2-1, \$2}' OFS='\\t' | \\
sort -k1,1 -k2,2n | uniq
) > \${captured_panel%.tsv}.bed;
done

Expand Down
25 changes: 21 additions & 4 deletions modules/local/sitesfrompositions/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ process SITESFROMPOSITIONS {
tuple val(meta), path(depths)

output:
tuple val(meta), path("*.sites4VEP.tsv") , emit: annotated_panel_reg
path "versions.yml" , topic: versions
tuple val(meta), path("*.sites4VEP.chunk*.tsv") , emit: annotated_panel_reg
path "versions.yml" , topic: versions


script:
def assembly = task.ext.assembly ?: "hg38"
def chunk_size = task.ext.chunk_size ?: 0

// TODO
// see if there is a better way to filter out chromosomes
Expand All @@ -34,7 +35,23 @@ process SITESFROMPOSITIONS {

rm captured_positions.tsv

awk '{print "chr"\$0}' captured_positions.sites4VEP.tmp.tsv > captured_positions.sites4VEP.tsv
awk '{print "chr"\$0}' captured_positions.sites4VEP.tmp.tsv > captured_positions.sites4VEP.full.tsv

# Chunk the sites file if chunk_size is set
if [ ${chunk_size} -gt 0 ]; then
echo "[SITESFROMPOSITIONS] Chunking sites file with chunk_size=${chunk_size}"

# Split file into chunks (excluding header)
cat captured_positions.sites4VEP.full.tsv | split -l ${chunk_size} --additional-suffix=.tsv -d - captured_positions.sites4VEP.chunk

n_chunks=\$(ls captured_positions.sites4VEP.chunk*.tsv | wc -l)
echo "[SITESFROMPOSITIONS] Created \${n_chunks} chunks"

else
echo "[SITESFROMPOSITIONS] No chunking (chunk_size=0), processing as single file"
cp captured_positions.sites4VEP.full.tsv captured_positions.sites4VEP.chunk1.tsv
fi

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
Expand All @@ -43,7 +60,7 @@ process SITESFROMPOSITIONS {

stub:
"""
touch captured_positions.sites4VEP.tsv;
touch captured_positions.sites4VEP.chunk1.tsv;

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
37 changes: 37 additions & 0 deletions modules/local/sortpanel/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Sorts a merged panel TSV by genomic coordinate (chromosome, then position)
// so downstream consumers receive deterministically ordered rows.
// Input : tuple(meta, panel)  — panel is a TSV whose first line is a header.
// Output: tuple(meta, "<basename>.sorted.tsv") on channel `sorted`,
//         plus a versions.yml entry on the `versions` topic.
process SORT_MERGED_PANEL {

    tag "${meta.id}"
    // Runs inside the shared deepcsa_core container (configured via withLabel).
    label 'deepcsa_core'


    input:
    tuple val(meta), path(panel)

    output:
    tuple val(meta), path("*.sorted.tsv") , emit: sorted
    path "versions.yml" , topic: versions

    script:
    // Sort by chromosome (field 1) and position (field 2). Assumes header in first line.
    // Using version sort for chromosome (handles chr1 chr2 chr10) after stripping 'chr' if present.
    // Pipeline: keep the header aside, strip any leading 'chr' from column 1,
    // version-sort by contig then numerically by position, then re-add 'chr'.
    // NOTE(review): the final awk re-adds the 'chr' prefix unconditionally, so
    // rows whose contig never had 'chr' gain it here — confirm all input panels
    // use chr-prefixed contigs.
    // NOTE(review): sort order for -k1,1V can depend on the locale; presumably
    // the container pins LC_ALL=C — verify, or ordering may differ across hosts.
    """
    echo "[SORT_MERGED_PANEL] Sorting panel for ${meta.id}"
    head -n 1 ${panel} > sorted.tmp
    tail -n +2 ${panel} | awk 'BEGIN{OFS="\\t"} {sub(/^chr/,"",\$1); print}' | sort -k1,1V -k2,2n | awk 'BEGIN{OFS="\\t"} {print "chr"\$0}' >> sorted.tmp
    mv sorted.tmp ${panel.getBaseName()}.sorted.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bash: \$(bash --version | head -n 1 | sed 's/^.*version //; s/ .*//')
    END_VERSIONS
    """

    stub:
    // Stub for -stub-run: emits an empty sorted file and the versions entry.
    """
    touch ${panel.getBaseName()}.sorted.tsv
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bash: \$(bash --version | head -n 1 | sed 's/^.*version //; s/ .*//')
    END_VERSIONS
    """
}
12 changes: 10 additions & 2 deletions modules/nf-core/ensemblvep/veppanel/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ params {
min_muts_per_sample = 0
selected_genes = ''
panel_with_canonical = true
panel_sites_chunk_size = 0 // 0 means no chunking (default), set to positive integer to enable chunking

germline_threshold = 0.3
mutation_depth_threshold = 100
Expand Down
18 changes: 18 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,21 @@
}
}
},
"parallel_processing_parameters": {
"title": "Parallel processing and chunking options",
"type": "object",
"fa_icon": "fas fa-tasks",
"description": "Parameters to control parallel processing, chunking, and memory management during panel creation and annotation.",
"properties": {
"panel_sites_chunk_size": {
"type": "integer",
"description": "Number of sites per chunk for parallel VEP annotation (0 = no chunking)",
"default": 0,
"fa_icon": "fas fa-cut",
"help_text": "When set to a positive integer, splits the sites file into chunks for parallel processing through VEP annotation. Set to 0 to disable chunking (process as single file). Recommended values: 100000-500000 for large datasets."
}
}
},
"filtering_parameters": {
"title": "Profile computation options",
"type": "object",
Expand Down Expand Up @@ -1115,6 +1130,9 @@
{
"$ref": "#/$defs/profile_computation_config"
},
{
"$ref": "#/$defs/parallel_processing_parameters"
},
{
"$ref": "#/$defs/filtering_parameters"
},
Expand Down
Loading