Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
035a0c7
dev: VEP chunk and VEP cache beegfs
migrau Apr 3, 2025
8ef2919
fix: use standard cache for ENSEMBLVEP_VEP
migrau May 8, 2025
40bb507
perf: improve VEP performance by converting input format
migrau May 14, 2025
bb21b25
fix: panel_postprocessing_annotation.py
migrau May 14, 2025
7c73d3b
fix: arguments safe_transform_context
migrau May 16, 2025
276152d
perf: chunking panel_custom_processing.py
migrau May 20, 2025
7bc3a16
perf: CREATECAPTUREDPANELS containers edited. create_panel_versions.p…
migrau May 22, 2025
346665d
fix: python3 container for CREATECAPTUREDPANELS
migrau Jun 4, 2025
08d8fad
fix: remove container option CREATECAPTUREDPANELS. fix conda versions…
migrau Jun 4, 2025
5c8ff55
fix: typo CREATECAPTUREDPANELS
migrau Jun 4, 2025
891ec85
fix: wave true only for CREATECAPTUREDPANELS
migrau Jun 4, 2025
e1fd6af
fix: syntax config module CREATECAPTUREDPANELS
migrau Jun 5, 2025
ca0ae01
fix: new way to specify wave for a single process
migrau Jun 5, 2025
5560c25
fix: toString added for wave
migrau Jun 5, 2025
c0c3e97
fix: wave label added
migrau Jun 5, 2025
24efcf6
fix: wave true for everything
migrau Jun 5, 2025
7734938
fix: wave false except CREATECAPTUREDPANELS
migrau Jun 5, 2025
b625332
fix: comma...
migrau Jun 5, 2025
8110a34
fix: wave removed. New container created
migrau Jun 5, 2025
e718e41
fix: Removed wave from nextflow.config
migrau Jun 6, 2025
9fd0ed7
fix: adjust memory requeriments
migrau Jun 30, 2025
abc85ed
perf: added new profile, nanoseq
migrau Jun 30, 2025
3e0b4b5
fix: naming withLabel config review
migrau Jul 1, 2025
61ec864
fix: nanoseq config resourceLimits
migrau Jul 1, 2025
0188172
fix: correct withName *
migrau Jul 1, 2025
b0e422a
fix: SITESFROMPOSITIONS memory test
migrau Jul 1, 2025
63dcea7
fix SITESFROMPOSITIONS
migrau Jul 1, 2025
7c2f56b
fix: SITESFROMPOSITIONS
migrau Jul 1, 2025
6e53f23
fix: fix profile
migrau Jul 1, 2025
e9d1b3b
fix: SITESFROMPOSITIONS config
migrau Jul 1, 2025
1dffd94
fix: POSTPROCESSVEPPANEL. Time
migrau Jul 2, 2025
24b170a
fix: RESOURCE LIMITS added
migrau Jul 3, 2025
d243ebc
fix: typo
migrau Jul 3, 2025
945c129
fix: update base.config
migrau Jul 3, 2025
198ff20
fix: adjust nanoconfig
migrau Jul 3, 2025
0cfd80f
Merge branch 'dev' into dev-chunk-optimization-POSTPROCESSVEPPANEL
migrau Nov 14, 2025
6c64f4d
fix: parallelization optional. Include sort for bedtools merge
migrau Nov 14, 2025
b2f12fd
fix: gene omega error: "No flagged entries found; skipping plots and …
migrau Nov 16, 2025
d4ed3c2
fix: Add debug logging and ensure failing_consensus file is always cr…
migrau Nov 18, 2025
4be3b45
feat: Add chunking support for SITESFROMPOSITIONS with genomic sorting
migrau Nov 19, 2025
e52cb76
feat: add parallel_processing_parameters section to schema for chunki…
migrau Nov 19, 2025
92580ce
update dnds genes list
FerriolCalvet Nov 21, 2025
2f9ea7a
Merge branch 'dev' into dev-chunk-only-parallel
FerriolCalvet Feb 27, 2026
a7eb77e
minor update
FerriolCalvet Feb 27, 2026
1a9e500
update sites from positions
FerriolCalvet Feb 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions bin/create_consensus_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
#####
# Filter failing columns only for rows that pass the compliance threshold
compliance_df_passing = compliance_df.filter(passing_rows)

print(f"DEBUG: Total positions passing compliance threshold: {compliance_df_passing.height}")
print(f"DEBUG: Number of samples: {compliance_df_passing.width}")

# Invert all boolean values (True → False, False → True)
failing_mask = pl.DataFrame([
Expand All @@ -67,6 +70,7 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
"Failed": True
})

print(f"DEBUG: Total failing entries found: {len(failing_columns_counts)}")

if failing_columns_counts:
failing_columns_counts_df = pl.DataFrame(failing_columns_counts)
Expand All @@ -76,6 +80,12 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
.rename({"count": "FAILING_COUNT"})
)
failure_counts_filtered.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
print(f"DEBUG: Created failing_consensus.{version}.tsv with {failure_counts_filtered.height} samples")
else:
# Create empty file with header for consistency
empty_df = pl.DataFrame({"SAMPLE_ID": [], "FAILING_COUNT": []}, schema={"SAMPLE_ID": pl.Utf8, "FAILING_COUNT": pl.Int64})
empty_df.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
print(f"DEBUG: No failures detected - created empty failing_consensus.{version}.tsv")


@click.command()
Expand Down
56 changes: 35 additions & 21 deletions bin/create_panel_versions.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
#!/usr/bin/env python
#!/usr/bin/env python3

"""
create_panel_versions.py

import click
import pandas as pd
import os
Generates multiple VEP annotation panel subsets based on the 'IMPACT' column
using the high-performance Polars library.

Usage:
python create_panel_versions.py --compact-annot-panel-path <input_tsv> --output <output_prefix>
"""

# TODO: check pandas version 2.0.3
# -- Auxiliary functions -- #
import polars as pl
import click
import sys

panel_impact_dict = {
PANEL_IMPACT_DICT = {

"protein_affecting": ["nonsense", "missense",
"essential_splice",
Expand Down Expand Up @@ -68,25 +74,33 @@

}

# -- Main function -- #

def create_panel_versions(compact_annot_panel_path, output_path):
def create_panel_versions(input_path: str, output_prefix: str) -> None:
"""
Generates panel subsets from a VEP-annotated file using Polars.

\b
INPUT_PATH: Path to the annotated TSV file.
OUTPUT_PREFIX: Prefix for the output files (e.g., 'output/panel').
"""
try:
df = pl.read_csv(input_path, separator="\t")
except Exception as e:
click.echo(f"Error reading input file: {e}", err=True)
sys.exit(1)

# Load VEP annotated panel, already compacted to have one variant per site
## requires column named IMPACT with consequence type
compact_annot_panel_df = pd.read_csv(compact_annot_panel_path, sep = "\t")
if "IMPACT" not in df.columns:
click.echo("ERROR: 'IMPACT' column not found in input file.", err=True)
sys.exit(1)

# Create panel versions
for version in panel_impact_dict:
for version_name, impact_values in PANEL_IMPACT_DICT.items():
filtered = df.filter(pl.col("IMPACT").is_in(impact_values))
filtered.write_csv(f"{output_prefix}.{version_name}.tsv", separator="\t")

panel_version = compact_annot_panel_df.loc[compact_annot_panel_df["IMPACT"].isin(panel_impact_dict[version])]
panel_version.to_csv(f"{output_path}.{version}.tsv",
sep = "\t", index = False)
# Write the full file as a version
df.write_csv(f"{output_prefix}.all.tsv", separator="\t")

# Store complete panel (better change this way of using this version in nextflow)
version = "all"
compact_annot_panel_df.to_csv(f"{output_path}.{version}.tsv",
sep = "\t", index = False)
click.echo("Panel versions generated successfully.")


@click.command()
Expand Down
2 changes: 1 addition & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ process {
// TODO nf-core: Check the defaults for all processes
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 15.min * task.attempt }
time = { 30.min * task.attempt }



Expand Down
3 changes: 3 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,9 @@ process {
: null
}

withName: SITESFROMPOSITIONS {
ext.chunk_size = params.panel_sites_chunk_size ?: 0
}
withLabel : deepcsa_core {
container = "docker.io/bbglab/deepcsa-core:0.1.0"
}
Expand Down
14 changes: 14 additions & 0 deletions conf/results_outputs.config
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,20 @@ process {
}

withName: POSTPROCESSVEPPANEL {
publishDir = [
enabled : false
]
}

withName: 'SITESFROMPOSITIONS' {
publishDir = [
path: { "${params.outdir}/regions/allsites" },
mode: params.publish_dir_mode,
pattern: 'captured_positions.sites4VEP.full.tsv',
]
}

withName: 'SORTPANELRICH|SORTPANELRICHALL' {
publishDir = [
path: { "${params.outdir}/regions/annotations" },
mode: params.publish_dir_mode,
Expand Down
11 changes: 6 additions & 5 deletions conf/tools/panels.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@ process {
withName: 'BBGTOOLS:DEEPCSA:CREATEPANELS:VCFANNOTATEPANEL:ENSEMBLVEP_VEP' {
ext.args = "${params.vep_params_panel} --tab"
publishDir = [
[
mode: params.publish_dir_mode,
path: { "${params.outdir}/regions/panelannotation" },
pattern: "*{gz}",
]
enabled : false
// [
// mode: params.publish_dir_mode,
// path: { "${params.outdir}/regions/panelannotation" },
// pattern: "*{gz}",
// ]
]
}

Expand Down
13 changes: 7 additions & 6 deletions modules/local/createpanels/captured/main.nf
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
process CREATECAPTUREDPANELS {
tag "$meta.id"
label 'process_single'
label 'process_medium_high_memory'

container "community.wave.seqera.io/library/bedtools_pybedtools_pandas_pip_pruned:78080da05d53636d"


conda "python=3.10.17 bioconda::pybedtools=0.12.0 conda-forge::polars=1.30.0 conda-forge::click=8.2.1 conda-forge::gcc_linux-64=15.1.0 conda-forge::gxx_linux-64=15.1.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'docker://bbglab/deepcsa_bed:latest' :
'bbglab/deepcsa_bed:latest' }"

input:
tuple val(meta), path(compact_captured_panel_annotation)

Expand Down Expand Up @@ -36,7 +36,8 @@ process CREATECAPTUREDPANELS {
bedtools merge \\
-i <(
tail -n +2 \$captured_panel | \\
awk -F'\\t' '{print \$1, \$2-1, \$2}' OFS='\\t' | uniq
awk -F'\\t' '{print \$1, \$2-1, \$2}' OFS='\\t' | \\
sort -k1,1 -k2,2n | uniq
) > \${captured_panel%.tsv}.bed;
done

Expand Down
25 changes: 21 additions & 4 deletions modules/local/sitesfrompositions/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ process SITESFROMPOSITIONS {
tuple val(meta), path(depths)

output:
tuple val(meta), path("*.sites4VEP.tsv") , emit: annotated_panel_reg
path "versions.yml" , topic: versions
tuple val(meta), path("*.sites4VEP.chunk*.tsv") , emit: annotated_panel_reg
path "versions.yml" , topic: versions


script:
def assembly = task.ext.assembly ?: "hg38"
def chunk_size = task.ext.chunk_size ?: 0

// TODO
// see if there is a better way to filter out chromosomes
Expand All @@ -34,7 +35,23 @@ process SITESFROMPOSITIONS {

rm captured_positions.tsv

awk '{print "chr"\$0}' captured_positions.sites4VEP.tmp.tsv > captured_positions.sites4VEP.tsv
awk '{print "chr"\$0}' captured_positions.sites4VEP.tmp.tsv > captured_positions.sites4VEP.full.tsv

# Chunk the sites file if chunk_size is set
if [ ${chunk_size} -gt 0 ]; then
echo "[SITESFROMPOSITIONS] Chunking sites file with chunk_size=${chunk_size}"

# Split file into chunks (excluding header)
cat captured_positions.sites4VEP.full.tsv | split -l ${chunk_size} --additional-suffix=.tsv -d - captured_positions.sites4VEP.chunk

n_chunks=\$(ls captured_positions.sites4VEP.chunk*.tsv | wc -l)
echo "[SITESFROMPOSITIONS] Created \${n_chunks} chunks"

else
echo "[SITESFROMPOSITIONS] No chunking (chunk_size=0), processing as single file"
cp captured_positions.sites4VEP.full.tsv captured_positions.sites4VEP.chunk1.tsv
fi

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
Expand All @@ -43,7 +60,7 @@ process SITESFROMPOSITIONS {

stub:
"""
touch captured_positions.sites4VEP.tsv;
touch captured_positions.sites4VEP.chunk1.tsv;

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
37 changes: 37 additions & 0 deletions modules/local/sortpanel/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Sorts a merged panel TSV by genomic coordinate (chromosome, then position)
// so downstream consumers receive deterministically ordered rows.
// Input : tuple(meta, panel)  — panel is a TSV whose first line is a header.
// Output: tuple(meta, "<basename>.sorted.tsv") on channel `sorted`,
//         plus a versions.yml entry on the `versions` topic.
process SORT_MERGED_PANEL {

    tag "${meta.id}"
    // Runs inside the shared deepcsa_core container (configured via withLabel).
    label 'deepcsa_core'


    input:
    tuple val(meta), path(panel)

    output:
    tuple val(meta), path("*.sorted.tsv") , emit: sorted
    path "versions.yml" , topic: versions

    script:
    // Sort by chromosome (field 1) and position (field 2). Assumes header in first line.
    // Using version sort for chromosome (handles chr1 chr2 chr10) after stripping 'chr' if present.
    // Pipeline: keep the header aside, strip any leading 'chr' from column 1,
    // version-sort by contig then numerically by position, then re-add 'chr'.
    // NOTE(review): the final awk re-adds the 'chr' prefix unconditionally, so
    // rows whose contig never had 'chr' gain it here — confirm all input panels
    // use chr-prefixed contigs.
    // NOTE(review): sort order for -k1,1V can depend on the locale; presumably
    // the container pins LC_ALL=C — verify, or ordering may differ across hosts.
    """
    echo "[SORT_MERGED_PANEL] Sorting panel for ${meta.id}"
    head -n 1 ${panel} > sorted.tmp
    tail -n +2 ${panel} | awk 'BEGIN{OFS="\\t"} {sub(/^chr/,"",\$1); print}' | sort -k1,1V -k2,2n | awk 'BEGIN{OFS="\\t"} {print "chr"\$0}' >> sorted.tmp
    mv sorted.tmp ${panel.getBaseName()}.sorted.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bash: \$(bash --version | head -n 1 | sed 's/^.*version //; s/ .*//')
    END_VERSIONS
    """

    stub:
    // Stub for -stub-run: emits an empty sorted file and the versions entry.
    """
    touch ${panel.getBaseName()}.sorted.tsv
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bash: \$(bash --version | head -n 1 | sed 's/^.*version //; s/ .*//')
    END_VERSIONS
    """
}
12 changes: 10 additions & 2 deletions modules/nf-core/ensemblvep/veppanel/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ params {
min_muts_per_sample = 0
selected_genes = ''
panel_with_canonical = true
panel_sites_chunk_size = 0 // 0 means no chunking (default), set to positive integer to enable chunking

germline_threshold = 0.3
mutation_depth_threshold = 100
Expand Down
18 changes: 18 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,21 @@
}
}
},
"parallel_processing_parameters": {
"title": "Parallel processing and chunking options",
"type": "object",
"fa_icon": "fas fa-tasks",
"description": "Parameters to control parallel processing, chunking, and memory management during panel creation and annotation.",
"properties": {
"panel_sites_chunk_size": {
"type": "integer",
"description": "Number of sites per chunk for parallel VEP annotation (0 = no chunking)",
"default": 0,
"fa_icon": "fas fa-cut",
"help_text": "When set to a positive integer, splits the sites file into chunks for parallel processing through VEP annotation. Set to 0 to disable chunking (process as single file). Recommended values: 100000-500000 for large datasets."
}
}
},
"filtering_parameters": {
"title": "Profile computation options",
"type": "object",
Expand Down Expand Up @@ -1115,6 +1130,9 @@
{
"$ref": "#/$defs/profile_computation_config"
},
{
"$ref": "#/$defs/parallel_processing_parameters"
},
{
"$ref": "#/$defs/filtering_parameters"
},
Expand Down
Loading