From ada77ebd98a02fcc6a9defdd1c1564013775a538 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Mon, 23 Feb 2026 14:49:47 +0100 Subject: [PATCH 01/19] documentation: add docstrings (#229) - also simplified plot_coverage_per_gene --- bin/panels_computedna2protein.py | 95 +++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 14 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 4a0437c5..b2a57771 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -1,8 +1,36 @@ #!/usr/bin/env python -import time +""" +Panels Compute DNA to Protein - MAF processing to protein positions and +coverage computation. + +This script processes the MAF file to retrieve the transcript-gene pairs, +then retrieves exon, and CDS coordinates and maps DNA positions to protein positions. +It also computes the coverage of each position and generates plots of coverage +per gene. + +Arguments: +---------------------- +--mutations-file : str + Path to the MAF file containing mutations. +--consensus-file : str + Path to the consensus panel file. +--depths-file : str + Path to the file containing depth information for all samples. + +Authors +---------------------- +Author : Stefano Pellegrini (@St3451) +Email : stefano.pellegrini@irbbarcelona.org + +Contributors +---------------------- +- Ferriol Calvet - @FerriolCalvet (ferriol.calvet@irbbarcelona.org) +- Marta Huertas - @m-huertasp (marta.huertas@irbbarcelona.org) +""" -# Third-party imports +import time +import logging import requests import numpy as np import pandas as pd @@ -10,14 +38,35 @@ import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages +# Logging +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s - %(message)s", level=logging.DEBUG, datefmt="%m/%d/%Y %I:%M:%S %p" +) +LOG = logging.getLogger("DNA2protein") + ##### # Define functions ##### def get_transcript_gene_from_maf(path_maf, consensus_file): + """ + Process MAF file to retrieve gene-transcript pairs for the genes in the consensus panel. + + Parameters + ---------- + path_maf : str + Path to the MAF file containing mutations. + consensus_file : str + Path to the consensus panel file. + + Returns + ------- + gene_transcript_pairs : pandas.DataFrame + DataFrame with gene-transcript pairs. + """ maf_df = pd.read_table(path_maf) genes_in_consensus = pd.read_table(consensus_file)["GENE"].unique() - # Final filter + # Filter MAF (keep only positions with canonical protein position) maf_df_f = maf_df[ (maf_df["TYPE"].isin(["SNV", "INSERTION", "DELETION"])) & (maf_df["canonical_SYMBOL"].isin(genes_in_consensus)) @@ -27,27 +76,45 @@ def get_transcript_gene_from_maf(path_maf, consensus_file): gene_transcript_pairs = maf_df_f[["canonical_SYMBOL", "canonical_Feature"]].drop_duplicates().reset_index(drop=True) gene_transcript_pairs.columns = ["Gene", "Ens_transcript_ID"] + LOG.info(f"Retrieved {len(gene_transcript_pairs)} gene-transcript pairs from MAF file.") return gene_transcript_pairs def plot_coverage_per_gene(depths_df): + """ + Wrapper function to plot coverage per gene for DNA, protein and exon levels. - dna_coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) - dna_coverage_summary = dna_coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - plot_single_coverage(dna_coverage_summary, "DNA") - - prot_coverage = depths_df.drop_duplicates(subset=['GENE', 'PROT_POS', 'COVERED']) - prot_coverage_summary = prot_coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - plot_single_coverage(prot_coverage_summary, "Protein") + Parameters + ---------- + depths_df : pandas.DataFrame + DataFrame containing depth and coverage information for DNA, protein and exon positions. + In this case, it will be the "exons_depth" created in `get_dna2prot_depth` function. + """ + coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) + coverage_summary = coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - exon_coverage = depths_df.drop_duplicates(subset=['GENE', 'EXON_RANK', 'COVERED']) - exon_coverage_summary = exon_coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - plot_single_coverage(exon_coverage_summary, "Exon") + for prefix in ["DNA", "Protein", "Exon"]: + LOG.info(f"Plotting coverage for {prefix}...") + plot_single_coverage(coverage_summary, prefix) def plot_single_coverage(coverage_summary, prefix, batch_size = 5): + """ + Plot coverage for a single prefix (DNA, protein or exon) and save the results in a PDF file. + + Parameters + ---------- + coverage_summary : pandas.DataFrame + DataFrame containing the coverage summary for each gene and coverage status. + prefix : str + The prefix to plot (DNA, Protein or Exon). + batch_size : int, optional + The number of genes to include in each batch when plotting (default is 5). + """ + # Generate copy of the coverage summary to avoid modifying the original DataFrame + coverage_summary_cp = coverage_summary.copy() - coverage_pivot = coverage_summary.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) + coverage_pivot = coverage_summary_cp.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) coverage_pivot = coverage_pivot.sort_values(1, ascending=False).reset_index() coverage_pivot = coverage_pivot.set_index('GENE') genes_list = coverage_pivot.index.tolist() From d538127f72b79242157f298b1514bb086ef7d62b Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 10:27:17 +0100 Subject: [PATCH 02/19] refactor: get gff through ftp instead of api calls (#229) - Removed functions to get the api calls - New functions to obtain the gff and filter it - Modified functions to process the data - Added docstrings --- bin/panels_computedna2protein.py | 215 ++++++++++++++++++++++--------- 1 file changed, 151 insertions(+), 64 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index b2a57771..ada6d51c 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -29,12 +29,16 @@ - Marta Huertas - @m-huertasp (marta.huertas@irbbarcelona.org) """ +import click import time import logging import requests import numpy as np import pandas as pd -import click +import polars as pl +import io +import gzip +import re import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages @@ -183,66 +187,118 @@ def plot_single_coverage(coverage_summary, prefix, batch_size = 5): # Depth # ===== +# Generator to filter lines before they reach a dataframe +def get_gff_to_generator(release: int = 111): + """ + Get GFF file from ensembl FTP and filter it on the fly to keep only exon and CDS lines. + Parameters + ------------ + release : int + The release number of the Ensembl GFF file to retrieve (default is 111). -def get_tr_lookup(transcript_id, max_iter = 20): + Returns + ------------ + generator + A generator that yields lines from the GFF file that correspond to exon and CDS features. """ - Get exons coord + url = f"https://ftp.ensembl.org/pub/release-{release}/gff3/homo_sapiens/Homo_sapiens.GRCh38.{release}.gff3.gz" + + # Open request + response = requests.get(url, stream=True) + # decompress the stream on the fly + with gzip.GzipFile(fileobj=response.raw) as gz: + for line in gz: + # Decode bytes to string + line_str = line.decode('utf-8') + + # Skip comments + if line_str.startswith('#'): + continue + + # Only "yield" lines that are exon or CDS + # This drastically reduces the data sent to the DataFrame + parts = line_str.split('\t') + if parts[2] in ["exon", "CDS"]: + yield line_str + +def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> pd.DataFrame: """ + Transforms the yields from get_gff_to_generator into a filtered DataFrame. The reading and filtering is done with polars + to improve efficiency. - server = "https://rest.ensembl.org" - ext = f"/lookup/id/{transcript_id}?expand=1" - - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - - iter_count = 0 - while not r.ok and iter_count < max_iter: - print("Retrying lookup... (attempt {}/{})".format(iter_count+1, max_iter)) - time.sleep(5) - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - iter_count += 1 - - return r.json() - + Parameters + ------------ + gene_n_transcript : pd.DataFrame + A DataFrame containing gene and transcript information. + release : int + The Ensembl release number to use for GFF file retrieval (default is 111). -def get_cds_coord(transcript_id, len_cds_with_utr, max_iter = 20): + Returns + ------------ + pd.DataFrame + A filtered DataFrame of GFF lines for the specified genes and release. """ - Get CDS coordinates + # Join the generator into a single buffer for Polars to read + filtered_buffer = io.StringIO("".join(get_gff_to_generator(release=release))) + + # Read generator with polars, generate transcript_id column and filter by the genes in the panel + df = ( + pl.read_csv( + filtered_buffer, + separator="\t", + has_header=False, + new_columns=["chr", "source", "feature", "start", "end", "score", "strand", "phase", "attributes"], + schema_overrides={"start": pl.Int64, "end": pl.Int64, "chr": pl.Utf8, "feature": pl.Utf8, "attributes": pl.Utf8} + ) + .with_columns([ + pl.col("attributes").str.extract(r"transcript:(ENST\d+)", 1).alias("transcript_id") + ]) + .filter( + [pl.col("transcript_id").is_in(gene_n_transcript["Ens_transcript_ID"].to_list())] + ) + ) + + # Transform to pandas for easier manipulation later on + return df.to_pandas() + +def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list[int]: """ - server = "https://rest.ensembl.org" - ext = f"/map/cds/{transcript_id}/1..{len_cds_with_utr}?" - - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - - iter_count = 0 - while not r.ok and iter_count < max_iter: - print("Retrying CDS map... (attempt {}/{})".format(iter_count+1, max_iter)) - time.sleep(5) - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - iter_count += 1 - - return r.json()["mappings"] + Parses the coordinates of an exon row from the GFF DataFrame and returns the exon ID and its coordinates in a list format. + Parameters + ---------- + exon : pandas.Series + A row from the GFF DataFrame corresponding to an exon feature. -def parse_cds_coord(exon): + Returns + ------- + exon_id : str + The ID of the exon extracted from the attributes column. + exons_coord : list + A list containing the chromosome, start, end, and strand information of the exon. + """ - strand = exon["strand"] + strand = 1 if exon.strand == "+" else -1 if strand == 1: - start = exon["start"] - end = exon["end"] + start = exon.start + end = exon.end else: - start = exon["end"] - end = exon["start"] + start = exon.end + end = exon.start - if "id" in exon: - exon_id = exon["id"] - chrom = f'chr{exon["seq_region_name"]}' + # Extract exon_id using regex + match = re.search(r"exon_id=([^;]+)", exon.attributes) + if match: + # Extract exon_id using regex + exon_id = match.group(1) + chrom = f'chr{exon.chr}' return exon_id, [chrom, start, end, strand] else: - chrom = exon["seq_region_name"] + chrom = exon.chr return [chrom, start, end, strand] @@ -295,38 +351,69 @@ def get_prot_coverage(dna_prot_df, gene, filter_masked_depth=True): return gene_dna_prot_df -def get_exon_coord_wrapper(gene_n_transcript): +def get_exon_coord_wrapper_new(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Wrapper function to retrieve exon and CDS coordinates for the genes and transcripts in the panel. - # Init df for coordinates - coord_df = gene_n_transcript + Parameters + ---------- + gene_n_transcript : pandas.DataFrame + A DataFrame containing gene and transcript information for the panel. + gff_df : pandas.DataFrame + A DataFrame containing the GFF data filtered for exon and CDS features. - # Get coord + Returns + ------- + final_coord_df : pandas.DataFrame + A DataFrame containing the CDS coordinates for each gene-transcript pair, along with protein position mapping. + final_exons_df : pandas.DataFrame + A DataFrame containing the exon coordinates (including UTRs) for each gene-transcript pair. + """ coord_df_lst = [] exons_coord_df_lst = [] - for gene, transcript in coord_df.values: - print("Processing gene:", gene) - coord_lst = [] - # Get the coord of exons with CDS and UTR as well as the length with UTR and exons ID - exons_lookup = get_tr_lookup(transcript) # We will use this to get Exons ID - for i, exon in enumerate(exons_lookup["Exon"]): - exon_id, exons_coord = parse_cds_coord(exon) - exons_coord_df_lst.append([f"{gene}--exon_{i+1}_{transcript}_{exon_id}"] + exons_coord) + for gene, transcript in gene_n_transcript.values: + print(f"Processing gene: {gene}") + + # Get all features for this transcript once to avoid double filtering + transcript_data = gff_df[gff_df["transcript_id"] == transcript] + if transcript_data.empty: + continue - # Get the CDS coordinates of the exons removing UTRs to map to protein positions - for i, exon in enumerate(get_cds_coord(transcript, exons_lookup["length"])): - coord_lst.append((parse_cds_coord(exon) + [i])) + # Determine strand from the first available entry + strand = transcript_data["strand"].iloc[0] + is_positive = (strand == "+") - gene_coord_df = pd.DataFrame(coord_lst, columns = ["Chr", "Start", "End", "Strand", "Exon_rank"]) - gene_coord_df["Gene"] = gene - gene_coord_df["Ens_transcript_ID"] = transcript - coord_df_lst.append(gene_coord_df) + # --- Handle exons (with UTRs) --- + exons_lookup = transcript_data[transcript_data["feature"] == "exon"] + # Sort based on strand: '+' is low-to-high, '-' is high-to-low + exons_lookup = exons_lookup.sort_values("start", ascending=is_positive) - coord_df = pd.concat(coord_df_lst) - exons_coord_df = pd.DataFrame(exons_coord_df_lst, columns = ["ID", "Chr", "Start", "End", "Strand"]) + for i, exon in enumerate(exons_lookup.itertuples(index=False)): + exon_id, exons_coord = parse_cds_coord(exon) + exons_coord_df_lst.append([f"{gene}--exon_{i+1}_{transcript}_{exon_id}"] + exons_coord) - return coord_df, exons_coord_df + # --- Handle CDS --- + cds_lookup = transcript_data[transcript_data["feature"] == "CDS"] + cds_lookup = cds_lookup.sort_values("start", ascending=is_positive) + coord_lst = [] + for i, exon in enumerate(cds_lookup.itertuples(index=False)): + exons_coord = parse_cds_coord(exon) + # Include the biological rank (i) + coord_lst.append(exons_coord + [i]) + + if coord_lst: + gene_coord_df = pd.DataFrame(coord_lst, columns=["Chr", "Start", "End", "Strand", "Exon_rank"]) + gene_coord_df["Gene"] = gene + gene_coord_df["Ens_transcript_ID"] = transcript + coord_df_lst.append(gene_coord_df) + + # Combine results + final_coord_df = pd.concat(coord_df_lst) if coord_df_lst else pd.DataFrame() + final_exons_df = pd.DataFrame(exons_coord_df_lst, columns=["ID", "Chr", "Start", "End", "Strand"]) + + return final_coord_df, final_exons_df def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): From 683d50f7dac0003ddbe9018bc674f3053992c6ac Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 10:57:21 +0100 Subject: [PATCH 03/19] documentation: add docstrings to rest of functions (#229) - Also modified the order - Removed unused functions --- bin/panels_computedna2protein.py | 483 ++++++++++++++++--------------- 1 file changed, 255 insertions(+), 228 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index ada6d51c..41793d2b 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -30,7 +30,6 @@ """ import click -import time import logging import requests import numpy as np @@ -48,9 +47,9 @@ ) LOG = logging.getLogger("DNA2protein") -##### -# Define functions -##### + +# Data parsing and processing functions +# ---------------------------------------------------------- def get_transcript_gene_from_maf(path_maf, consensus_file): """ Process MAF file to retrieve gene-transcript pairs for the genes in the consensus panel. @@ -83,110 +82,6 @@ def get_transcript_gene_from_maf(path_maf, consensus_file): LOG.info(f"Retrieved {len(gene_transcript_pairs)} gene-transcript pairs from MAF file.") return gene_transcript_pairs - -def plot_coverage_per_gene(depths_df): - """ - Wrapper function to plot coverage per gene for DNA, protein and exon levels. - - Parameters - ---------- - depths_df : pandas.DataFrame - DataFrame containing depth and coverage information for DNA, protein and exon positions. - In this case, it will be the "exons_depth" created in `get_dna2prot_depth` function. - """ - coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) - coverage_summary = coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - - for prefix in ["DNA", "Protein", "Exon"]: - LOG.info(f"Plotting coverage for {prefix}...") - plot_single_coverage(coverage_summary, prefix) - - -def plot_single_coverage(coverage_summary, prefix, batch_size = 5): - """ - Plot coverage for a single prefix (DNA, protein or exon) and save the results in a PDF file. - - Parameters - ---------- - coverage_summary : pandas.DataFrame - DataFrame containing the coverage summary for each gene and coverage status. - prefix : str - The prefix to plot (DNA, Protein or Exon). - batch_size : int, optional - The number of genes to include in each batch when plotting (default is 5). - """ - # Generate copy of the coverage summary to avoid modifying the original DataFrame - coverage_summary_cp = coverage_summary.copy() - - coverage_pivot = coverage_summary_cp.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) - coverage_pivot = coverage_pivot.sort_values(1, ascending=False).reset_index() - coverage_pivot = coverage_pivot.set_index('GENE') - genes_list = coverage_pivot.index.tolist() - coverage_perc = coverage_pivot.div(coverage_pivot.sum(axis=1), axis=0) * 100 - coverage_pivot.to_csv(f"coverage_count_{prefix}.tsv", header=True, index=True, sep='\t') - coverage_perc.to_csv(f"coverage_perc_{prefix}.tsv", header=True, index=True, sep='\t') - - if len(genes_list) < batch_size: - - fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - - covered_col = True if True in coverage_pivot.columns else (1 if 1 in coverage_pivot.columns else coverage_pivot.columns[-1]) - - coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff'], - ) - - axes[0].set_title(f'{prefix} : covered vs. non-covered') - axes[0].set_ylabel(f'Number of {prefix}') - axes[0].set_xlabel('Gene') - axes[0].legend(title='Covered', loc='upper right') - axes[0].tick_params(axis='x', rotation=45) - - coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') - axes[1].set_title(f'{prefix} : percentage of covered') - axes[1].set_ylabel('Percentage (%)') - axes[1].set_xlabel('Gene') - axes[1].set_ylim(0, 100) - axes[1].tick_params(axis='x', rotation=45) - - plt.tight_layout() - plt.savefig(f"coverage_per_{prefix}.pdf", dpi=300) - plt.show() - - - else: - with PdfPages(f"coverage_per_{prefix}_batches.pdf") as pdf: - # split into batches N genes - for i in range(0, len(genes_list), batch_size): - batch_genes = genes_list[i:i+batch_size] - batch_coverage_pivot = coverage_pivot.loc[batch_genes] - batch_coverage_perc = coverage_perc.loc[batch_genes] - - fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - - covered_col = True if True in batch_coverage_pivot.columns else (1 if 1 in batch_coverage_pivot.columns else batch_coverage_pivot.columns[-1]) - - batch_coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff']) - axes[0].set_title(f'{prefix} : covered vs. non-covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') - axes[0].set_ylabel(f'Number of {prefix}') - axes[0].set_xlabel('Gene') - axes[0].legend(title='Covered', loc='upper right') - axes[0].tick_params(axis='x', rotation=45) - - batch_coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') - axes[1].set_title(f'{prefix} : percentage of covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') - axes[1].set_ylabel('Percentage (%)') - axes[1].set_xlabel('Gene') - axes[1].set_ylim(0, 100) - axes[1].tick_params(axis='x', rotation=45) - - plt.tight_layout() - pdf.savefig(dpi=300) - plt.show() - plt.close() - - -# Depth -# ===== # Generator to filter lines before they reach a dataframe def get_gff_to_generator(release: int = 111): """ @@ -262,7 +157,7 @@ def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> p # Transform to pandas for easier manipulation later on return df.to_pandas() -def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list[int]: +def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list: """ Parses the coordinates of an exon row from the GFF DataFrame and returns the exon ID and its coordinates in a list format. @@ -302,56 +197,7 @@ def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list[int]: return [chrom, start, end, strand] - -# Get Exon coord to protein pos -# ----------------------------- - -def get_dna_exon_pos(exon_range, strand): - - if strand == -1: - return np.arange(exon_range[1], exon_range[0] + 1)[::-1] - else: - return np.arange(exon_range[0], exon_range[1] + 1) - - -def get_exon_ix(i, exon_range, strand): - - len_exon = len(get_dna_exon_pos(exon_range, strand)) - - return np.repeat(i, len_exon) - - -def get_dna_map_to_protein(coord_df): - - strand = coord_df["Strand"].unique()[0] - - exons_range = coord_df[["Start", "End"]].values - exons = np.concatenate([get_dna_exon_pos(exon, strand) for exon in exons_range]) - exons_ix = np.concatenate([get_exon_ix(i, exon, strand) for i, exon in enumerate(exons_range)]) - prot_pos = np.arange(len(exons)) // 3 + 1 - - df = pd.DataFrame({"GENE" : coord_df["Gene"].unique()[0], - "CHROM" : f'chr{coord_df["Chr"].unique()[0]}', - "DNA_POS" : exons, - "PROT_POS" : prot_pos, - "REVERSE_STRAND" : strand, - "EXON_RANK" : exons_ix, - "TRANSCRIPT_ID" : coord_df["Ens_transcript_ID"].unique()[0]}) - - return df - - -def get_prot_coverage(dna_prot_df, gene, filter_masked_depth=True): - - gene_dna_prot_df = dna_prot_df[dna_prot_df["GENE"] == gene] - gene_dna_prot_df = gene_dna_prot_df.dropna(subset=["PROT_POS"])[["PROT_POS", "COVERED", "DEPTH"]].reset_index(drop=True) - gene_dna_prot_df = gene_dna_prot_df.groupby("PROT_POS").sum().reset_index() - gene_dna_prot_df.COVERED = (gene_dna_prot_df.COVERED > 0).astype(int) - - return gene_dna_prot_df - - -def get_exon_coord_wrapper_new(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: +def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ Wrapper function to retrieve exon and CDS coordinates for the genes and transcripts in the panel. @@ -415,11 +261,137 @@ def get_exon_coord_wrapper_new(gene_n_transcript: pd.DataFrame, gff_df: pd.DataF return final_coord_df, final_exons_df +# Coordinate parsing and DNA-to-protein mapping functions +# ---------------------------------------------------------- +def get_dna_exon_pos(exon_range: list, strand: int) -> list: + """ + Get the DNA positions of an exon given its range and strand. + Parameters + ------------ + exon_range : list + A list containing the start and end positions of the exon. + strand : int + The strand of the exon (1 for positive strand, -1 for negative strand). + + Returns + ------------ + list + A list of DNA positions corresponding to the exon, ordered according to the strand. + """ + + if strand == -1: + return np.arange(exon_range[1], exon_range[0] + 1)[::-1] + else: + return np.arange(exon_range[0], exon_range[1] + 1) + + +def get_exon_ix(i: int, exon_range: list, strand: int) -> list: + """ + Get the exon index for each DNA position in the exon, ordered according to the strand. + + Parameters + ------------ + i : int + The exon index (starting from 0). + exon_range : list + A list containing the start and end positions of the exon. + strand : int + The strand of the exon (1 for positive strand, -1 for negative strand). + + Returns + ------------ + list + A list of exon indices corresponding to each DNA position in the exon, + ordered according to the strand. + """ + len_exon = len(get_dna_exon_pos(exon_range, strand)) + + return np.repeat(i, len_exon) + -def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): +def get_dna_map_to_protein(coord_df: pd.DataFrame) -> pd.DataFrame: + """ + Get a mapping of DNA positions to protein positions for a given gene-transcript pair. + + Parameters + ------------ + coord_df : pandas.DataFrame + A DataFrame containing the coordinates of the CDS for a specific gene-transcript pair. + + Returns + ------------ + pandas.DataFrame + A DataFrame containing the mapping of DNA positions to protein positions. + """ + strand = coord_df["Strand"].unique()[0] + + exons_range = coord_df[["Start", "End"]].values + exons = np.concatenate([get_dna_exon_pos(exon, strand) for exon in exons_range]) + exons_ix = np.concatenate([get_exon_ix(i, exon, strand) for i, exon in enumerate(exons_range)]) + prot_pos = np.arange(len(exons)) // 3 + 1 + + df = pd.DataFrame({"GENE" : coord_df["Gene"].unique()[0], + "CHROM" : f'chr{coord_df["Chr"].unique()[0]}', + "DNA_POS" : exons, + "PROT_POS" : prot_pos, + "REVERSE_STRAND" : strand, + "EXON_RANK" : exons_ix, + "TRANSCRIPT_ID" : coord_df["Ens_transcript_ID"].unique()[0]}) + + return df + +def find_exon(x_coord: dict, exon_coord_df: pd.DataFrame) -> str | np.nan: + """ + Find the exon ID for a given DNA position. + Parameters + ------------ + x_coord : dict + A dictionary containing the DNA position, chromosome, and strand information. + exon_coord_df : pandas.DataFrame + A DataFrame containing the coordinates of all exons. + + Returns + ------------ + str or np.nan + The ID of the exon that contains the given DNA position, or np.nan if no match is found. + """ + + dna_pos, chrom, strand = x_coord["DNA_POS"], x_coord["CHROM"], x_coord["REVERSE_STRAND"] + + if strand == -1: + matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] <= dna_pos) & (dna_pos <= exon_coord_df['Start'])] + + else: + matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] >= dna_pos) & (dna_pos >= exon_coord_df['Start'])] + + return matches['ID'].values[0] if not matches.empty else np.nan + + +# Depth computation and coverage functions +# ---------------------------------------------------------- +def dna2prot_depth(gene_list: list, coord_df: pd.DataFrame, dna_sites: pd.DataFrame, depth_df: pd.DataFrame) -> pd.DataFrame: """ Get a DNA to protein mapping of all positions in the provided list of genes - Add as well coverage info & DNA to GENE annotation + Add coverage info & DNA to GENE annotation + + Parameters + ------------ + gene_list : list + A list of gene names for which to compute the DNA to protein mapping and coverage information. + coord_df : pandas.DataFrame + A DataFrame containing the coordinates of the CDS + for the genes in the panel. + dna_sites : pandas.DataFrame + A DataFrame containing the DNA positions that are part of the consensus panel, + along with their coverage information. + depth_df : pandas.DataFrame + A DataFrame containing the depth information for all positions in the genome. + + Returns + ------------ + pandas.DataFrame + A DataFrame containing the mapping of DNA positions to protein positions for the specified genes, + along with coverage and depth information for each position. """ # Map DNA to protein pos, get exons index to protein pos, etc @@ -429,10 +401,6 @@ def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): dna_prot_df_lst.append(get_dna_map_to_protein(gene_coord_df)) dna_prot_df = pd.concat(dna_prot_df_lst) - # dna_prot_df - # contains all the protein positions of the genes in the panel - # mapped to its corresponding genomic position - # Merge CDS position with availble sites (not masked) and depth info # and any other site that was included in the panel (splicing sites out of the CDS) dna_prot_df = dna_sites.merge(dna_prot_df, on=["GENE", "CHROM", "DNA_POS"], how="outer") @@ -449,14 +417,29 @@ def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): return dna_prot_df -def get_dna2prot_depth(gene_n_transcript_info, depth_file, consensus_file): +def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str) -> tuple[pd.DataFrame, pd.DataFrame]: """ - This function outputs: - dna_prot_df: + Function to get the DNA to protein mapping for all positions in the provided list of genes, + along with coverage and depth information for each position, and the definition of all exons of + the genes/transcripts in the panel including UTR regions. - exons_coord_df: - df containing the definition of all exons of the genes/transcripts in the panel - including UTR regions + Parameters + ---------- + gene_n_transcript_info : pandas.DataFrame + A DataFrame containing gene and transcript information for the panel. + depth_file : str + Path to the file containing depth information for all positions in the genome. + consensus_file : str + Path to the consensus panel file containing DNA positions and coverage information. + + Returns + ---------- + dna_prot_df : pandas.DataFrame + A DataFrame containing the mapping of DNA positions to protein positions for the specified genes, + along with coverage and depth information for each position. + exons_coord_df_final : pandas.DataFrame + A DataFrame containing the coordinates of all exons (including UTRs) for the genes and transcripts + in the panel, formatted for BED files. """ consensus_df = pd.read_table(consensus_file) @@ -482,26 +465,28 @@ def get_dna2prot_depth(gene_n_transcript_info, depth_file, consensus_file): return dna_prot_df, exons_coord_df_final +def get_exon_depth_saturation(gene_depth: pd.DataFrame, gene_mut: pd.DataFrame, dna: bool = False) -> pd.DataFrame: + """ + Compute the depth and saturation of each exon in a gene. -# Utils function to retrieve exon ID from coordinate - -def find_exon(x_coord, exon_coord_df): - - dna_pos, chrom, strand = x_coord["DNA_POS"], x_coord["CHROM"], x_coord["REVERSE_STRAND"] - - if strand == -1: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] <= dna_pos) & (dna_pos <= exon_coord_df['Start'])] - - else: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] >= dna_pos) & (dna_pos >= exon_coord_df['Start'])] - - return matches['ID'].values[0] if not matches.empty else np.nan - - - - -def get_exon_depth_saturation(gene_depth, gene_mut, dna=False): - + Parameters + ---------- + gene_depth : pandas.DataFrame + A DataFrame containing the depth and coverage information for each DNA position in the gene, along with the corresponding + protein position and exon rank. + gene_mut : pandas.DataFrame + A DataFrame containing the mutation information for the gene, including the DNA positions of the mutations (if dna=True) + or the protein positions of the mutations (if dna=False). + dna : bool, optional + A boolean indicating whether the input data is at the DNA level (True) or at the protein level (False). This affects how + saturation is calculated (default is False). + + Returns + ------- + exon_depth : pandas.DataFrame + A DataFrame containing the average depth, coverage, number of mutated positions, and saturation for each exon in the gene, + along with the starting protein position of each exon. + """ # Exon average depth gene_depth = gene_depth.copy() exon_depth = gene_depth.groupby("EXON_RANK").apply( @@ -533,62 +518,104 @@ def get_exon_depth_saturation(gene_depth, gene_mut, dna=False): return exon_depth -def get_exon_mid_prot_pos(exon_info, prot_len): +# Plots +# ---------------------------------------------------------- +def plot_coverage_per_gene(depths_df: pd.DataFrame) -> None: + """ + Wrapper function to plot coverage per gene for DNA, protein and exon levels. - lst_end_pos = [] - for i in range(len(exon_info)): - exon = exon_info.iloc[i] - start_pos = int(exon.START_PROT_POS) - end_pos = int(exon_info.iloc[i+1].START_PROT_POS) if i < len(exon_info) -1 else prot_len - lst_end_pos.append(end_pos) - exon_info["END_PROT_POS"] = lst_end_pos - exon_info["MID_PROT_POS"] = (exon_info["START_PROT_POS"] + exon_info["END_PROT_POS"]) / 2 + Parameters + ---------- + depths_df : pandas.DataFrame + DataFrame containing depth and coverage information for DNA, protein and exon positions. + In this case, it will be the "exons_depth" created in `get_dna2prot_depth` function. + """ + coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) + coverage_summary = coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - return exon_info + for prefix in ["DNA", "Protein", "Exon"]: + LOG.info(f"Plotting coverage for {prefix}...") + plot_single_coverage(coverage_summary, prefix) -def get_res_coverage(dna_prot_df): +def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size: int = 5): + """ + Plot coverage for a single prefix (DNA, protein or exon) and save the results in a PDF file. - res_depth = dna_prot_df.dropna(subset=["PROT_POS"])[["PROT_POS", "COVERED", "DEPTH"]].reset_index(drop=True) - res_depth = res_depth.groupby("PROT_POS").mean().reset_index() + Parameters + ---------- + coverage_summary : pandas.DataFrame + DataFrame containing the coverage summary for each gene and coverage status. + prefix : str + The prefix to plot (DNA, Protein or Exon). + batch_size : int, optional + The number of genes to include in each batch when plotting (default is 5). + """ + # Generate copy of the coverage summary to avoid modifying the original DataFrame + coverage_summary_cp = coverage_summary.copy() - return res_depth + coverage_pivot = coverage_summary_cp.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) + coverage_pivot = coverage_pivot.sort_values(1, ascending=False).reset_index() + coverage_pivot = coverage_pivot.set_index('GENE') + genes_list = coverage_pivot.index.tolist() + coverage_perc = coverage_pivot.div(coverage_pivot.sum(axis=1), axis=0) * 100 + coverage_pivot.to_csv(f"coverage_count_{prefix}.tsv", header=True, index=True, sep='\t') + coverage_perc.to_csv(f"coverage_perc_{prefix}.tsv", header=True, index=True, sep='\t') + if len(genes_list) < batch_size: -def add_consecutive_numbers(nums, max_n): + fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - result = [] - for i in range(len(nums)): - result.append(nums[i]) - # Check if the current number is the start of a consecutive sequence - if i < len(nums) - 1 and nums[i] + 1 != nums[i + 1]: - result.append(nums[i] + 1) + covered_col = True if True in coverage_pivot.columns else (1 if 1 in coverage_pivot.columns else coverage_pivot.columns[-1]) - # Add the last consecutive number after the final element - result.append(nums[-1] + 1) + coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff'], + ) - return result + axes[0].set_title(f'{prefix} : covered vs. non-covered') + axes[0].set_ylabel(f'Number of {prefix}') + axes[0].set_xlabel('Gene') + axes[0].legend(title='Covered', loc='upper right') + axes[0].tick_params(axis='x', rotation=45) + coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') + axes[1].set_title(f'{prefix} : percentage of covered') + axes[1].set_ylabel('Percentage (%)') + axes[1].set_xlabel('Gene') + axes[1].set_ylim(0, 100) + axes[1].tick_params(axis='x', rotation=45) -def where_plus(condition): - """ - Util function to extend the color of mpl filling to the next position. - """ + plt.tight_layout() + plt.savefig(f"coverage_per_{prefix}.pdf", dpi=300) + else: + with PdfPages(f"coverage_per_{prefix}_batches.pdf") as pdf: + # split into batches N genes + for i in range(0, len(genes_list), batch_size): + batch_genes = genes_list[i:i+batch_size] + batch_coverage_pivot = coverage_pivot.loc[batch_genes] + batch_coverage_perc = coverage_perc.loc[batch_genes] - ix = np.where(condition)[0] + fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - if len(ix) > 0: - ix = add_consecutive_numbers(ix, max_n=len(condition)) - if len(condition) in ix: - ix.remove(len(condition)) - boolean_vector = np.zeros(len(condition), dtype=bool) - boolean_vector[ix] = True + covered_col = True if True in batch_coverage_pivot.columns else (1 if 1 in batch_coverage_pivot.columns else batch_coverage_pivot.columns[-1]) - return pd.Series(boolean_vector) + batch_coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff']) + axes[0].set_title(f'{prefix} : covered vs. non-covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') + axes[0].set_ylabel(f'Number of {prefix}') + axes[0].set_xlabel('Gene') + axes[0].legend(title='Covered', loc='upper right') + axes[0].tick_params(axis='x', rotation=45) - else: - return condition + batch_coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') + axes[1].set_title(f'{prefix} : percentage of covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') + axes[1].set_ylabel('Percentage (%)') + axes[1].set_xlabel('Gene') + axes[1].set_ylim(0, 100) + axes[1].tick_params(axis='x', rotation=45) + plt.tight_layout() + pdf.savefig(dpi=300) + plt.show() + plt.close() @click.command() @click.option('--mutations-file', type=click.Path(exists=True), help='Mutations file') From e9eb510696c441a1ee027968046662c1d21726e4 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 11:16:46 +0100 Subject: [PATCH 04/19] refactor: separate parse_cds_coord into exon and cds parsers (#229) --- bin/panels_computedna2protein.py | 73 +++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 41793d2b..517ff2d6 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -100,7 +100,22 @@ def get_gff_to_generator(release: int = 111): url = f"https://ftp.ensembl.org/pub/release-{release}/gff3/homo_sapiens/Homo_sapiens.GRCh38.{release}.gff3.gz" # Open request - response = requests.get(url, stream=True) + try: + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + except requests.exceptions.Timeout: + error = RuntimeError(f"Request timed out while fetching Ensembl GFF3 (release {release}) from: {url}") + LOG.error(error) + raise error + except requests.exceptions.HTTPError as e: + error = RuntimeError(f"HTTP error {response.status_code} while fetching Ensembl GFF3 (release {release}) from: {url}") + LOG.error(error) + raise error + except requests.exceptions.RequestException as e: + error = RuntimeError(f"Failed to fetch Ensembl GFF3 (release {release}) from: {url}") + LOG.error(error) + raise error + # decompress the stream on the fly with gzip.GzipFile(fileobj=response.raw) as gz: for line in gz: @@ -157,7 +172,16 @@ def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> p # Transform to pandas for easier manipulation later on return df.to_pandas() -def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list: +def _parse_strand_coords(exon) -> tuple[str, int, int, int]: + """Extract chromosome, strand-ordered start/end, and strand integer from a GFF row.""" + strand = 1 if exon.strand == "+" else -1 + if strand == 1: + start, end = exon.start, exon.end + else: + start, end = exon.end, exon.start + return exon.chr, start, end, strand + +def parse_exon_coord(exon: pd.Series) -> tuple[str, list]: """ Parses the coordinates of an exon row from the GFF DataFrame and returns the exon ID and its coordinates in a list format. @@ -173,29 +197,35 @@ def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list: exons_coord : list A list containing the chromosome, start, end, and strand information of the exon. """ - - strand = 1 if exon.strand == "+" else -1 - - if strand == 1: - start = exon.start - end = exon.end - else: - start = exon.end - end = exon.start + chrom, start, end, strand = _parse_strand_coords(exon) # Extract exon_id using regex match = re.search(r"exon_id=([^;]+)", exon.attributes) - if match: - # Extract exon_id using regex - exon_id = match.group(1) - chrom = f'chr{exon.chr}' + + # Extract exon_id using regex + exon_id = match.group(1) if match else "" + chrom = f'chr{exon.chr}' - return exon_id, [chrom, start, end, strand] + return exon_id, [chrom, start, end, strand] - else: - chrom = exon.chr +def parse_cds_coord(exon: pd.Series) -> list: + """ + Parses the coordinates of a CDS row from the GFF DataFrame and returns the coordinates in a list format. + + Parameters + ---------- + exon : pandas.Series + A row from the GFF DataFrame corresponding to an exon feature. + + Returns + ------- + coord : list + A list containing the chromosome, start, end, and strand information of the CDS. + """ + chrom, start, end, strand = _parse_strand_coords(exon) - return [chrom, start, end, strand] + # Return the CDS ID and coordinates + return [chrom, start, end, strand] def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ @@ -236,7 +266,7 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame exons_lookup = exons_lookup.sort_values("start", ascending=is_positive) for i, exon in enumerate(exons_lookup.itertuples(index=False)): - exon_id, exons_coord = parse_cds_coord(exon) + exon_id, exons_coord = parse_exon_coord(exon) exons_coord_df_lst.append([f"{gene}--exon_{i+1}_{transcript}_{exon_id}"] + exons_coord) # --- Handle CDS --- @@ -444,6 +474,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co consensus_df = pd.read_table(consensus_file) depth_df = pd.read_table(depth_file) + gff_df = gff_to_filtered_df(gene_n_transcript_info) consensus_df = consensus_df.merge(depth_df[["CHROM", "POS", "CONTEXT"]], on = ["CHROM", "POS"], how = 'left') consensus_df = consensus_df.rename(columns={"POS" : "DNA_POS"}) @@ -452,7 +483,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co depth_df["DEPTH"] = depth_df.drop(columns=["CHROM", "POS", "CONTEXT"]).mean(1) depth_df = depth_df[["CHROM", "POS", "DEPTH"]].rename(columns = {"POS" : "DNA_POS"}) - coord_df, exons_coord_df = get_exon_coord_wrapper(gene_n_transcript_info) + coord_df, exons_coord_df = get_exon_coord_wrapper(gene_n_transcript_info, gff_df) gene_list = list(coord_df["Gene"].unique()) dna_prot_df = dna2prot_depth(gene_list, coord_df, consensus_df, depth_df) From bca9ab1ffb754360923776a599f1e1953e2aecfc Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 12:38:50 +0100 Subject: [PATCH 05/19] refactor: vectorized find_exon (#229) - Improved efficiency of find_exon by applying vectorized dataframe operations - Fix errors in docstrings and parameter definition - Add log information and remove info from matplotlib --- bin/panels_computedna2protein.py | 160 +++++++++++++------------------ 1 file changed, 67 insertions(+), 93 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 517ff2d6..11d4aaf8 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -45,6 +45,8 @@ logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s - %(message)s", level=logging.DEBUG, datefmt="%m/%d/%Y %I:%M:%S %p" ) +logging.getLogger('matplotlib').setLevel(logging.WARNING) # Avoid too much logging + LOG = logging.getLogger("DNA2protein") @@ -62,7 +64,7 @@ def get_transcript_gene_from_maf(path_maf, consensus_file): Path to the consensus panel file. Returns - ------- + ------------ gene_transcript_pairs : pandas.DataFrame DataFrame with gene-transcript pairs. """ @@ -169,6 +171,7 @@ def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> p ) ) + LOG.info(f"Filtered GFF data to {df.shape[0]} rows for the specified genes and release.") # Transform to pandas for easier manipulation later on return df.to_pandas() @@ -186,25 +189,32 @@ def parse_exon_coord(exon: pd.Series) -> tuple[str, list]: Parses the coordinates of an exon row from the GFF DataFrame and returns the exon ID and its coordinates in a list format. Parameters - ---------- + ------------ exon : pandas.Series A row from the GFF DataFrame corresponding to an exon feature. Returns - ------- + ------------ exon_id : str The ID of the exon extracted from the attributes column. exons_coord : list A list containing the chromosome, start, end, and strand information of the exon. """ chrom, start, end, strand = _parse_strand_coords(exon) + + # Add "chr" prefix to chromosome + chrom = f'chr{exon.chr}' # Extract exon_id using regex match = re.search(r"exon_id=([^;]+)", exon.attributes) # Extract exon_id using regex - exon_id = match.group(1) if match else "" - chrom = f'chr{exon.chr}' + if match: + exon_id = match.group(1) + + else: + exon_id = "" + LOG.warning(f"Could not extract exon_id from attributes: {exon.transcript_id} - {exon.attributes}") return exon_id, [chrom, start, end, strand] @@ -213,12 +223,12 @@ def parse_cds_coord(exon: pd.Series) -> list: Parses the coordinates of a CDS row from the GFF DataFrame and returns the coordinates in a list format. Parameters - ---------- + ------------ exon : pandas.Series A row from the GFF DataFrame corresponding to an exon feature. Returns - ------- + ------------ coord : list A list containing the chromosome, start, end, and strand information of the CDS. """ @@ -232,14 +242,14 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame Wrapper function to retrieve exon and CDS coordinates for the genes and transcripts in the panel. Parameters - ---------- + ------------ gene_n_transcript : pandas.DataFrame A DataFrame containing gene and transcript information for the panel. gff_df : pandas.DataFrame A DataFrame containing the GFF data filtered for exon and CDS features. Returns - ------- + ------------ final_coord_df : pandas.DataFrame A DataFrame containing the CDS coordinates for each gene-transcript pair, along with protein position mapping. final_exons_df : pandas.DataFrame @@ -249,7 +259,6 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame exons_coord_df_lst = [] for gene, transcript in gene_n_transcript.values: - print(f"Processing gene: {gene}") # Get all features for this transcript once to avoid double filtering transcript_data = gff_df[gff_df["transcript_id"] == transcript] @@ -289,11 +298,12 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame final_coord_df = pd.concat(coord_df_lst) if coord_df_lst else pd.DataFrame() final_exons_df = pd.DataFrame(exons_coord_df_lst, columns=["ID", "Chr", "Start", "End", "Strand"]) + LOG.info(f"Retrieved coordinates for {len(final_coord_df['Gene'].unique())} genes and {len(final_exons_df['ID'].unique())} exons.") return final_coord_df, final_exons_df # Coordinate parsing and DNA-to-protein mapping functions # ---------------------------------------------------------- -def get_dna_exon_pos(exon_range: list, strand: int) -> list: +def get_dna_exon_pos(exon_range: np.ndarray, strand: int) -> list: """ Get the DNA positions of an exon given its range and strand. Parameters @@ -315,7 +325,7 @@ def get_dna_exon_pos(exon_range: list, strand: int) -> list: return np.arange(exon_range[0], exon_range[1] + 1) -def get_exon_ix(i: int, exon_range: list, strand: int) -> list: +def get_exon_ix(i: int, exon_range: np.ndarray, strand: int): """ Get the exon index for each DNA position in the exon, ordered according to the strand. @@ -323,8 +333,8 @@ def get_exon_ix(i: int, exon_range: list, strand: int) -> list: ------------ i : int The exon index (starting from 0). - exon_range : list - A list containing the start and end positions of the exon. + exon_range : np.ndarray + A numpy array containing the start and end positions of the exon. strand : int The strand of the exon (1 for positive strand, -1 for negative strand). @@ -370,31 +380,48 @@ def get_dna_map_to_protein(coord_df: pd.DataFrame) -> pd.DataFrame: return df -def find_exon(x_coord: dict, exon_coord_df: pd.DataFrame) -> str | np.nan: +def find_exon(dna_prot_df: pd.DataFrame, exon_coord_df: pd.DataFrame): """ - Find the exon ID for a given DNA position. + For each DNA position in *dna_prot_df* the function finds the matching exon + ID from *exon_coord_df* by merging on chromosome and filtering by positional + interval. This avoids a Python-level loop and is orders of magnitude faster + than the ``apply`` approach for large panels. + Parameters ------------ - x_coord : dict - A dictionary containing the DNA position, chromosome, and strand information. - exon_coord_df : pandas.DataFrame - A DataFrame containing the coordinates of all exons. + dna_prot_df : pd.DataFrame + DataFrame with at least ``CHROM`` and ``DNA_POS`` columns. + exon_coord_df : pd.DataFrame + DataFrame with ``Chr``, ``Start``, ``End``, and ``ID`` columns. Returns ------------ - str or np.nan - The ID of the exon that contains the given DNA position, or np.nan if no match is found. + np.ndarray + Array of exon IDs aligned to *dna_prot_df*\'s row order; + ``np.nan`` where no matching exon is found. """ + # Normalise strand-swapped coordinates so lo <= hi in all cases + exon_df = exon_coord_df[["Chr", "Start", "End", "ID"]].copy() + exon_df["lo"] = exon_df[["Start", "End"]].min(axis=1) + exon_df["hi"] = exon_df[["Start", "End"]].max(axis=1) + + # Cross-merge on chromosome; each row of dna_prot_df is paired only with + # exons on the same chromosome, then filtered to those whose interval + # contains the DNA position. + merged = dna_prot_df[["CHROM", "DNA_POS"]].merge( + exon_df[["Chr", "lo", "hi", "ID"]], + left_on="CHROM", right_on="Chr", + how="left" + ) + in_exon = (merged["DNA_POS"] >= merged["lo"]) & (merged["DNA_POS"] <= merged["hi"]) + # Keep the first matching exon per position + matched = merged[in_exon].drop_duplicates(subset=["CHROM", "DNA_POS"]) - dna_pos, chrom, strand = x_coord["DNA_POS"], x_coord["CHROM"], x_coord["REVERSE_STRAND"] - - if strand == -1: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] <= dna_pos) & (dna_pos <= exon_coord_df['Start'])] - - else: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] >= dna_pos) & (dna_pos >= exon_coord_df['Start'])] - - return matches['ID'].values[0] if not matches.empty else np.nan + # Re-align to the original DataFrame row order + result = dna_prot_df[["CHROM", "DNA_POS"]].merge( + matched[["CHROM", "DNA_POS", "ID"]], on=["CHROM", "DNA_POS"], how="left" + ) + return result["ID"].values # Depth computation and coverage functions @@ -454,7 +481,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co the genes/transcripts in the panel including UTR regions. Parameters - ---------- + ------------ gene_n_transcript_info : pandas.DataFrame A DataFrame containing gene and transcript information for the panel. depth_file : str @@ -463,7 +490,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co Path to the consensus panel file containing DNA positions and coverage information. Returns - ---------- + ------------ dna_prot_df : pandas.DataFrame A DataFrame containing the mapping of DNA positions to protein positions for the specified genes, along with coverage and depth information for each position. @@ -479,15 +506,15 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co consensus_df = consensus_df.merge(depth_df[["CHROM", "POS", "CONTEXT"]], on = ["CHROM", "POS"], how = 'left') consensus_df = consensus_df.rename(columns={"POS" : "DNA_POS"}) - if "DEPTH" not in depth_df: + if "DEPTH" not in depth_df.columns: depth_df["DEPTH"] = depth_df.drop(columns=["CHROM", "POS", "CONTEXT"]).mean(1) - depth_df = depth_df[["CHROM", "POS", "DEPTH"]].rename(columns = {"POS" : "DNA_POS"}) + depth_df = depth_df[["CHROM", "POS", "DEPTH"]] coord_df, exons_coord_df = get_exon_coord_wrapper(gene_n_transcript_info, gff_df) gene_list = list(coord_df["Gene"].unique()) - dna_prot_df = dna2prot_depth(gene_list, coord_df, consensus_df, depth_df) + dna_prot_df = dna2prot_depth(gene_list=gene_list, coord_df=coord_df, dna_sites=consensus_df, depth_df=depth_df) - dna_prot_df["EXON_ID"] = dna_prot_df.apply(lambda x: find_exon(x, exons_coord_df), axis=1) + dna_prot_df["EXON_ID"] = find_exon(dna_prot_df, exons_coord_df) # fix coordinates order in exons_coord_df for BED exons_coord_df_final = exons_coord_df.copy() @@ -496,58 +523,6 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co return dna_prot_df, exons_coord_df_final -def get_exon_depth_saturation(gene_depth: pd.DataFrame, gene_mut: pd.DataFrame, dna: bool = False) -> pd.DataFrame: - """ - Compute the depth and saturation of each exon in a gene. - - Parameters - ---------- - gene_depth : pandas.DataFrame - A DataFrame containing the depth and coverage information for each DNA position in the gene, along with the corresponding - protein position and exon rank. - gene_mut : pandas.DataFrame - A DataFrame containing the mutation information for the gene, including the DNA positions of the mutations (if dna=True) - or the protein positions of the mutations (if dna=False). - dna : bool, optional - A boolean indicating whether the input data is at the DNA level (True) or at the protein level (False). This affects how - saturation is calculated (default is False). - - Returns - ------- - exon_depth : pandas.DataFrame - A DataFrame containing the average depth, coverage, number of mutated positions, and saturation for each exon in the gene, - along with the starting protein position of each exon. - """ - # Exon average depth - gene_depth = gene_depth.copy() - exon_depth = gene_depth.groupby("EXON_RANK").apply( - lambda x: 0 if sum(x.COVERED) == 0 else sum(x.DEPTH) / sum(x.COVERED)).reset_index().rename(columns = {0 : "DEPTH"}) - exon_depth["START_PROT_POS"] = gene_depth.groupby("EXON_RANK").apply(lambda x: x.PROT_POS.min()) - - # Exon saturation - if dna: - exon_depth["COVERED"] = gene_depth.groupby("EXON_RANK").apply(lambda x: sum(x.COVERED)).values - gene_depth["MUTATED"] = gene_depth.DNA_POS.isin(gene_mut.DNA_POS.unique()).astype(int) - exon_depth["MUTATED"] = gene_depth.groupby("EXON_RANK").apply(lambda x: sum(x.MUTATED)).values - exon_depth["SATURATION"] = exon_depth["MUTATED"] / exon_depth["COVERED"] - else: - - # If an amino acids from the same codon are in two exons, consider the position only in the second one - # (but look at booth to know if the residue is in the panel and if it is mutated) - exon_depth_prot = gene_depth.groupby("PROT_POS").apply(lambda x: (x.COVERED.max(), x.EXON_RANK.max())).reset_index() - exon_depth_prot[["COVERED", "EXON_RANK"]] = pd.DataFrame(exon_depth_prot[0].tolist(), index=exon_depth_prot.index) - exon_depth_prot = exon_depth_prot.drop(columns=[0]) - - exon_depth["COVERED"] = exon_depth_prot.groupby("EXON_RANK").apply(lambda x: sum(x.COVERED)).values - exon_depth_prot["MUTATED"] = exon_depth_prot["PROT_POS"].isin(gene_mut["Pos"].unique()).astype(int) - exon_depth["MUTATED"] = exon_depth_prot.groupby("EXON_RANK").apply(lambda x: sum(x.MUTATED)).values - exon_depth["SATURATION"] = exon_depth["MUTATED"] / exon_depth["COVERED"] - - # check_mutated_masked = exon_depth_prot[(exon_depth_prot["MUTATED"] == 1) & (exon_depth_prot["COVERED"] == 0)] - # check_mutated_masked["GENE"] = gene_depth.GENE.unique()[0] - - return exon_depth - # Plots # ---------------------------------------------------------- @@ -645,7 +620,6 @@ def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size plt.tight_layout() pdf.savefig(dpi=300) - plt.show() plt.close() @click.command() @@ -653,13 +627,11 @@ def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size @click.option('--consensus-file', type=click.Path(exists=True), help='Input consensus panel file') @click.option('--depths-file', type=click.Path(exists=True), help='Input depths of all samples file') def main(mutations_file, consensus_file, depths_file): - click.echo("Starting to run plot saturation results...") - # Count each mutation only ones if it appears in multiple reads gene_n_transcript = get_transcript_gene_from_maf(mutations_file, consensus_file) exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file) - print("Exons coordinates and depth computed") + LOG.info("Exons coordinates and depth computed!") exons_depth.to_csv("depths_per_position_exon_gene.tsv", header = True, index = False, sep = '\t') exons_coordinates_bed_like = exons_coord_id[['Chr', 'Start', 'End', 'ID']] @@ -667,6 +639,8 @@ def main(mutations_file, consensus_file, depths_file): plot_coverage_per_gene(exons_depth) + LOG.info("All done!") + if __name__ == '__main__': main() From 385f3a9a615e24a574fcee18b2cd81dd8b6d8e00 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 14:05:51 +0100 Subject: [PATCH 06/19] refactor: add option to change ensembl release (#229) - Added click option to script - Added definition to nextflow process --- bin/panels_computedna2protein.py | 33 +++++++------------------------ modules/local/dna2protein/main.nf | 4 +++- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 11d4aaf8..58655d03 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -129,12 +129,11 @@ def get_gff_to_generator(release: int = 111): continue # Only "yield" lines that are exon or CDS - # This drastically reduces the data sent to the DataFrame parts = line_str.split('\t') if parts[2] in ["exon", "CDS"]: yield line_str -def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> pd.DataFrame: +def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int) -> pd.DataFrame: """ Transforms the yields from get_gff_to_generator into a filtered DataFrame. The reading and filtering is done with polars to improve efficiency. @@ -218,25 +217,6 @@ def parse_exon_coord(exon: pd.Series) -> tuple[str, list]: return exon_id, [chrom, start, end, strand] -def parse_cds_coord(exon: pd.Series) -> list: - """ - Parses the coordinates of a CDS row from the GFF DataFrame and returns the coordinates in a list format. - - Parameters - ------------ - exon : pandas.Series - A row from the GFF DataFrame corresponding to an exon feature. - - Returns - ------------ - coord : list - A list containing the chromosome, start, end, and strand information of the CDS. - """ - chrom, start, end, strand = _parse_strand_coords(exon) - - # Return the CDS ID and coordinates - return [chrom, start, end, strand] - def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ Wrapper function to retrieve exon and CDS coordinates for the genes and transcripts in the panel. @@ -284,7 +264,7 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame coord_lst = [] for i, exon in enumerate(cds_lookup.itertuples(index=False)): - exons_coord = parse_cds_coord(exon) + exons_coord = list(_parse_strand_coords(exon)) # Include the biological rank (i) coord_lst.append(exons_coord + [i]) @@ -474,7 +454,7 @@ def dna2prot_depth(gene_list: list, coord_df: pd.DataFrame, dna_sites: pd.DataFr return dna_prot_df -def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str) -> tuple[pd.DataFrame, pd.DataFrame]: +def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str, release: int) -> tuple[pd.DataFrame, pd.DataFrame]: """ Function to get the DNA to protein mapping for all positions in the provided list of genes, along with coverage and depth information for each position, and the definition of all exons of @@ -501,7 +481,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co consensus_df = pd.read_table(consensus_file) depth_df = pd.read_table(depth_file) - gff_df = gff_to_filtered_df(gene_n_transcript_info) + gff_df = gff_to_filtered_df(gene_n_transcript_info, release=release) consensus_df = consensus_df.merge(depth_df[["CHROM", "POS", "CONTEXT"]], on = ["CHROM", "POS"], how = 'left') consensus_df = consensus_df.rename(columns={"POS" : "DNA_POS"}) @@ -626,11 +606,12 @@ def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size @click.option('--mutations-file', type=click.Path(exists=True), help='Mutations file') @click.option('--consensus-file', type=click.Path(exists=True), help='Input consensus panel file') @click.option('--depths-file', type=click.Path(exists=True), help='Input depths of all samples file') -def main(mutations_file, consensus_file, depths_file): +@click.option('--ensembl-release', type=int, default=111, help='Ensembl release number to use for GFF file retrieval (default is 111)') +def main(mutations_file, consensus_file, depths_file, ensembl_release): # Count each mutation only ones if it appears in multiple reads gene_n_transcript = get_transcript_gene_from_maf(mutations_file, consensus_file) - exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file) + exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file, ensembl_release) LOG.info("Exons coordinates and depth computed!") exons_depth.to_csv("depths_per_position_exon_gene.tsv", header = True, index = False, sep = '\t') diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index 07d41544..b4ce0baa 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -20,12 +20,14 @@ process DNA_2_PROTEIN_MAPPING { script: + def ensembl_release = task.ext.vep_cache_version ? "--ensembl-release \"${task.ext.vep_cache_version}\"" : "" """ cut -f 1,2,6 ${panel_file} | uniq > ${meta2.id}.panel.unique.tsv panels_computedna2protein.py \\ --mutations-file ${mutations_file} \\ --consensus-file ${meta2.id}.panel.unique.tsv \\ - --depths-file ${all_samples_depths} + --depths-file ${all_samples_depths} \\ + ${ensembl_release} cat <<-END_VERSIONS > versions.yml "${task.process}": From 821537ac5b1e390ead93fe883b3828641a7868a9 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 14:43:35 +0100 Subject: [PATCH 07/19] fix: forgot to add the configuration (#229) --- conf/modules.config | 4 ++++ modules/local/dna2protein/main.nf | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 327648f0..2b8a0cef 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -72,6 +72,10 @@ process { ] } + withName: DNA2PROTEINMAPPING { + ext.ensembl_release = params.vep_cache_version ?: 111 + } + withName: QUERYDEPTHS { ext.prefix = { ".subset_depths" } ext.args = '' diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index b4ce0baa..5d48e45b 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -20,14 +20,14 @@ process DNA_2_PROTEIN_MAPPING { script: - def ensembl_release = task.ext.vep_cache_version ? "--ensembl-release \"${task.ext.vep_cache_version}\"" : "" + def ensembl_release = task.ext.ensembl_release """ cut -f 1,2,6 ${panel_file} | uniq > ${meta2.id}.panel.unique.tsv panels_computedna2protein.py \\ --mutations-file ${mutations_file} \\ --consensus-file ${meta2.id}.panel.unique.tsv \\ --depths-file ${all_samples_depths} \\ - ${ensembl_release} + --ensembl-release ${ensembl_release} cat <<-END_VERSIONS > versions.yml "${task.process}": From 6efbf68e122e31339b2c9ae6a7938e8956517ba3 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 14:44:41 +0100 Subject: [PATCH 08/19] fix: indent problem (#229) --- modules/local/dna2protein/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index 5d48e45b..f89dcf4b 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -20,7 +20,7 @@ process DNA_2_PROTEIN_MAPPING { script: - def ensembl_release = task.ext.ensembl_release + def ensembl_release = task.ext.ensembl_release """ cut -f 1,2,6 ${panel_file} | uniq > ${meta2.id}.panel.unique.tsv panels_computedna2protein.py \\ From 211803c91e75e0db04c432c68328e851f8e3b451 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Wed, 25 Feb 2026 15:57:46 +0100 Subject: [PATCH 09/19] remove all references to mm10 and hg19 --- bin/panel_postprocessing_annotation.py | 6 ++---- bin/postprocessing_annotation.py | 6 ++---- bin/sites_table_from_positions.py | 5 +---- bin/utils_context.py | 2 +- conf/modules.config | 18 ++++++------------ conf/tools/omega.config | 8 +++----- docs/usage.md | 2 +- 7 files changed, 16 insertions(+), 31 deletions(-) diff --git a/bin/panel_postprocessing_annotation.py b/bin/panel_postprocessing_annotation.py index e13e057b..9cf3f173 100755 --- a/bin/panel_postprocessing_annotation.py +++ b/bin/panel_postprocessing_annotation.py @@ -4,15 +4,13 @@ import pandas as pd import numpy as np from itertools import product -from bgreference import hg38, hg19, mm10, mm39 +from bgreference import hg38, mm39 from utils_context import transform_context from utils_impacts import * from read_utils import custom_na_values assembly_name2function = { "hg38": hg38, - "hg19": hg19, - "mm10": mm10, "mm39": mm39 } @@ -199,7 +197,7 @@ def vep2summarizedannotation_panel(VEP_output_file, all_possible_sites_annotated @click.command() @click.option('--vep_output_file', type=click.Path(exists=True), required=True, help='Path to the VEP output file.') -@click.option('--assembly', type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), default='hg38', help='Genome assembly.') +@click.option('--assembly', type=click.Choice(['hg38', 'mm39']), default='hg38', help='Genome assembly.') @click.option('--output_file', type=click.Path(), required=True, help='Path to the output annotated file.') @click.option('--only_canonical', is_flag=True, default=False, help='Use only canonical transcripts.') def main(vep_output_file, assembly, output_file, only_canonical): diff --git a/bin/postprocessing_annotation.py b/bin/postprocessing_annotation.py index 24cab413..50f83c72 100755 --- a/bin/postprocessing_annotation.py +++ b/bin/postprocessing_annotation.py @@ -4,7 +4,7 @@ import click import pandas as pd -from bgreference import hg38, hg19, mm10, mm39 +from bgreference import hg38, mm39 from utils import vartype from utils_context import transform_context @@ -12,8 +12,6 @@ from read_utils import custom_na_values assembly_name2function = {"hg38": hg38, - "hg19": hg19, - "mm10": mm10, "mm39": mm39} @@ -245,7 +243,7 @@ def vep2summarizedannotation(VEP_output_file, all_possible_sites_annotated_file, @click.command() @click.argument('vep_output_file', type=click.Path(exists=True)) @click.argument('all_possible_sites_annotated_file', type=click.Path()) -@click.option('--assembly-name', default='hg38', show_default=True, type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), help='Reference genome assembly name') +@click.option('--assembly-name', default='hg38', show_default=True, type=click.Choice(['hg38', 'mm39']), help='Reference genome assembly name') @click.option('--all-underscore', is_flag=True, default=False, show_default=True, help='Whether to use _ to separate all parts of MUT_ID (default: False)') @click.option('--hotspots-annotation-file', default=None, type=click.Path(exists=False), help='Path to hotspots annotation file') @click.option('--gnomad-af-threshold', default=0.001, show_default=True, type=float, help='gnomAD allele frequency threshold') diff --git a/bin/sites_table_from_positions.py b/bin/sites_table_from_positions.py index 979e49d8..892371fa 100755 --- a/bin/sites_table_from_positions.py +++ b/bin/sites_table_from_positions.py @@ -4,11 +4,9 @@ import click import pandas as pd -from bgreference import hg38, hg19, mm10, mm39 +from bgreference import hg38, mm39 assembly_name2function = {"hg38": hg38, - "hg19": hg19, - "mm10": mm10, "mm39": mm39 } @@ -49,7 +47,6 @@ def generate_all_sites_4VEP(input_positions, genome, output_file_with_sites): @click.command() @click.option('--input-positions', required=True, type=click.Path(exists=True), help='Input positions file (TSV)') -#@click.option('--genome-assembly', required=True, type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), help='Genome assembly (hg38, hg19, mm10, mm39)') @click.option('--genome-assembly', required=True, type=click.Choice(['hg38', 'mm39']), help='Genome assembly (hg38, mm39)') @click.option('--output-file-with-sites', required=True, type=click.Path(), help='Output file for sites (TSV)') def main(input_positions, genome_assembly, output_file_with_sites): diff --git a/bin/utils_context.py b/bin/utils_context.py index cf8e488e..3cacd0ae 100644 --- a/bin/utils_context.py +++ b/bin/utils_context.py @@ -1,4 +1,4 @@ -from bgreference import hg38, hg19, mm10, mm39 +from bgreference import hg38, mm39 from itertools import product diff --git a/conf/modules.config b/conf/modules.config index 327648f0..c9267ed4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -553,13 +553,9 @@ process { --cushion 100" ext.genome_assembly = params.vep_genome == 'GRCh38' ? 'GRCh38' - : params.vep_genome == 'GRCh37' - ? 'GRCh37' - : params.vep_genome == 'GRCm38' - ? 'mm10' - : params.vep_genome == 'GRCm39' - ? 'mm39' - : null + : params.vep_genome == 'GRCm39' + ? 'mm39' + : null publishDir = [ [ mode: params.publish_dir_mode, @@ -625,11 +621,9 @@ process { withName: 'SITESFROMPOSITIONS|SUMANNOTATION|SIGPROFILERASSIGNMENT|ONCODRIVECLUSTL|POSTPROCESSVEPPANEL' { ext.assembly = params.vep_genome == 'GRCh38' ? 'hg38' - : params.vep_genome == 'GRCm38' - ? 'mm10' - : params.vep_genome == 'GRCm39' - ? 'mm39' - : null + : params.vep_genome == 'GRCm39' + ? 'mm39' + : null } withLabel : deepcsa_core { diff --git a/conf/tools/omega.config b/conf/tools/omega.config index da5b089a..0d28c498 100644 --- a/conf/tools/omega.config +++ b/conf/tools/omega.config @@ -168,10 +168,8 @@ process { withName: 'PREPROCESSING.*|ESTIMATOR.*' { ext.assembly = params.vep_genome == 'GRCh38' ? 'hg38' - : params.vep_genome == 'GRCm38' - ? 'mm10' - : params.vep_genome == 'GRCm39' - ? 'mm39' - : null + : params.vep_genome == 'GRCm39' + ? 'mm39' + : null } } diff --git a/docs/usage.md b/docs/usage.md index c196e28e..e99ab95f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -274,7 +274,7 @@ These files identify sites overlapping common SNPs and noisy or variable genomic - Nanoseq SNP: Common SNP positions that should be excluded from analysis - Nanoseq Noise: Regions with high noise or variability -Both files are available for GRCh37 and GRCh38 at the [shared folder](https://drive.google.com/drive/folders/1wqkgpRTuf4EUhqCGSLA4fIg9qEEw3ZcL) from Iñigo Martincorena's group, at the Wellcome Sanger Institute. +Both files are available for GRCh38 at the [shared folder](https://drive.google.com/drive/folders/1wqkgpRTuf4EUhqCGSLA4fIg9qEEw3ZcL) from Iñigo Martincorena's group, at the Wellcome Sanger Institute. ## Additional customizable parameters From 48d1378841bcb3635f3c89bc1f3ad9dc9662f30e Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Wed, 25 Feb 2026 16:18:47 +0100 Subject: [PATCH 10/19] query exons consensus mutations once - then distribute in the appropriate steps --- subworkflows/local/adjmutdensity/main.nf | 4 +-- subworkflows/local/dnds/main.nf | 8 +---- subworkflows/local/mutability/main.nf | 11 +------ subworkflows/local/omega/main.nf | 6 ++-- subworkflows/local/oncodrive3d/main.nf | 4 +-- subworkflows/local/oncodriveclustl/main.nf | 2 +- workflows/deepcsa.nf | 35 +++++++++++++--------- 7 files changed, 29 insertions(+), 41 deletions(-) diff --git a/subworkflows/local/adjmutdensity/main.nf b/subworkflows/local/adjmutdensity/main.nf index 25cfbe53..1da8bcee 100644 --- a/subworkflows/local/adjmutdensity/main.nf +++ b/subworkflows/local/adjmutdensity/main.nf @@ -1,4 +1,4 @@ -include { TABIX_BGZIPTABIX_QUERY as QUERYMUTATIONS } from '../../../modules/nf-core/tabix/bgziptabixquery/main' +include { TABIX_BGZIPTABIX_QUERY as QUERYMUTATIONS } from '../../../modules/nf-core/tabix/bgziptabixquery/main' include { SUBSET_MAF as SUBSETMUTDENSITYADJUSTED } from '../../../modules/local/subsetmaf/main' @@ -16,7 +16,7 @@ workflow MUTATION_DENSITY { main: - // Intersect BED of all sites with BED of sample filtered sites + // Intersect BED of all sites of interest with mutations QUERYMUTATIONS(mutations, bedfile) SUBSETMUTDENSITYADJUSTED(QUERYMUTATIONS.out.subset) diff --git a/subworkflows/local/dnds/main.nf b/subworkflows/local/dnds/main.nf index 92960196..edf8e534 100644 --- a/subworkflows/local/dnds/main.nf +++ b/subworkflows/local/dnds/main.nf @@ -1,5 +1,3 @@ -include { TABIX_BGZIPTABIX_QUERY as QUERYMUTATIONS } from '../../../modules/nf-core/tabix/bgziptabixquery/main' - include { SUBSET_MAF as SUBSET_DNDS } from '../../../modules/local/subsetmaf/main' include { PREPROCESS_DNDS as PREPROCESSDEPTHS } from '../../../modules/local/dnds/preprocess/main' @@ -10,7 +8,6 @@ workflow DNDS { take: mutations depth - bedfile panel main: @@ -18,10 +15,7 @@ workflow DNDS { covariates = params.dnds_covariates ? channel.fromPath( params.dnds_covariates, checkIfExists: true).first() : channel.empty() ref_trans = params.dnds_ref_transcripts ? channel.fromPath( params.dnds_ref_transcripts, checkIfExists: true).first() : channel.empty() - - QUERYMUTATIONS(mutations, bedfile) - - SUBSET_DNDS(QUERYMUTATIONS.out.subset) + SUBSET_DNDS(mutations) PREPROCESSDEPTHS(depth, panel) diff --git a/subworkflows/local/mutability/main.nf b/subworkflows/local/mutability/main.nf index 3e1c2f2b..01132d80 100644 --- a/subworkflows/local/mutability/main.nf +++ b/subworkflows/local/mutability/main.nf @@ -16,19 +16,10 @@ workflow MUTABILITY { depth profile panel_file - bedfiles main: - // actual code - // this line was not needed nor used in the computation of mutabilities, - // since there is an additional left_join merge in the compute mutabilities code, - // the depths can be the full ones - // QUERYDEPTHS(depth, bedfiles) - QUERYMUTATIONS(mutations, bedfiles) - - - SUBSETMUTABILITY(QUERYMUTATIONS.out.subset) + SUBSETMUTABILITY(mutations) SUBSETMUTABILITY.out.mutations .join(profile) .join(depth) diff --git a/subworkflows/local/omega/main.nf b/subworkflows/local/omega/main.nf index 7b5f73d7..de583490 100644 --- a/subworkflows/local/omega/main.nf +++ b/subworkflows/local/omega/main.nf @@ -45,12 +45,10 @@ workflow OMEGA_ANALYSIS{ all_gloc_results = channel.empty() // Intersect BED of all sites with BED of sample filtered sites - QUERYMUTATIONS(mutations, bedfile) - QUERYPANEL(panel_captured_rich, bedfile) - SUBSETOMEGA(QUERYMUTATIONS.out.subset) - SUBSETOMEGAMULTI(QUERYMUTATIONS.out.subset) + SUBSETOMEGA(mutations) + SUBSETOMEGAMULTI(mutations) SUBSETOMEGA.out.mutations .join( depth ) diff --git a/subworkflows/local/oncodrive3d/main.nf b/subworkflows/local/oncodrive3d/main.nf index 82ca8974..cc4394dd 100644 --- a/subworkflows/local/oncodrive3d/main.nf +++ b/subworkflows/local/oncodrive3d/main.nf @@ -14,15 +14,13 @@ workflow ONCODRIVE3D_ANALYSIS{ take: mutations mutabilities - bedfile datasets annotations raw_vep main: - QUERYMUTATIONS(mutations, bedfile) - SUBSETONCODRIVE3D(QUERYMUTATIONS.out.subset) + SUBSETONCODRIVE3D(mutations) // mutations preprocessing if (params.o3d_raw_vep){ diff --git a/subworkflows/local/oncodriveclustl/main.nf b/subworkflows/local/oncodriveclustl/main.nf index 9639cf42..f7762278 100644 --- a/subworkflows/local/oncodriveclustl/main.nf +++ b/subworkflows/local/oncodriveclustl/main.nf @@ -1,6 +1,6 @@ include { CREATECUSTOMBEDFILE as ONCODRIVECLUSTLBED } from '../../../modules/local/createpanels/custombedfile/main' -include { SUBSET_MAF as SUBSETONCODRIVECLUSTL } from '../../../modules/local/subsetmaf/main' +include { SUBSET_MAF as SUBSETONCODRIVECLUSTL } from '../../../modules/local/subsetmaf/main' include { ONCODRIVECLUSTL } from '../../../modules/local/bbgtools/oncodriveclustl/main' // include { ONCODRIVECLUSTL as ONCODRIVECLUSTLSNVS } from '../../../modules/local/bbgtools/oncodriveclustl/main' diff --git a/workflows/deepcsa.nf b/workflows/deepcsa.nf index 7ad9dfb7..2a722251 100644 --- a/workflows/deepcsa.nf +++ b/workflows/deepcsa.nf @@ -101,6 +101,8 @@ include { TABLE_2_GROUP as TABLE2GROUP } from '../m include { ANNOTATE_DEPTHS as ANNOTATEDEPTHS } from '../modules/local/annotatedepth/main' include { DOWNSAMPLE_DEPTHS as DOWNSAMPLEDEPTHS } from '../modules/local/downsample/depths/main' +include { TABIX_BGZIPTABIX_QUERY as QUERYMUTATIONSEXONS } from '../modules/nf-core/tabix/bgziptabixquery/main' + include { SELECT_MUTDENSITIES as SYNMUTDENSITY } from '../modules/local/select_mutdensity/main' include { SELECT_MUTDENSITIES as SYNMUTREADSDENSITY } from '../modules/local/select_mutdensity/main' @@ -282,11 +284,18 @@ workflow DEEPCSA{ } + // Intersect BED of all sites with somatic mutations to keep only those mutations in the exons consensus panel + QUERYMUTATIONSEXONS(somatic_mutations, CREATEPANELS.out.exons_consensus_bed) + mutations_in_exons = QUERYMUTATIONSEXONS.out.subset + // Mutational profile if ( params.profileall || run_mutabilities || params.omega ){ MUTPROFILEALL(somatic_mutations, DEPTHSALLCONS.out.subset, CREATEPANELS.out.all_consensus_bed, wgs_trinucs, TABLE2GROUP.out.json_allgroups) if (run_mutdensity){ + + // TODO explore if we could use the ALL consensus panel instead of the exons one + // I am suggesting this since we are already distinguishing per consequence type within the script MUTDENSITYADJUSTED(somatic_mutations, DEPTHSALLCONS.out.subset, CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel, MUTPROFILEALL.out.profile, wgs_trinucs) // Concatenate all outputs into a single file @@ -313,19 +322,17 @@ workflow DEEPCSA{ if (run_mutabilities) { if (params.profileall){ - MUTABILITYALL(somatic_mutations, + MUTABILITYALL(mutations_in_exons, annotated_depths, MUTPROFILEALL.out.profile, - CREATEPANELS.out.exons_consensus_panel, - CREATEPANELS.out.exons_consensus_bed + CREATEPANELS.out.exons_consensus_panel ) } if (params.profilenonprot){ - MUTABILITYNONPROT(somatic_mutations, + MUTABILITYNONPROT(mutations_in_exons, annotated_depths, MUTPROFILENONPROT.out.profile, - CREATEPANELS.out.exons_consensus_panel, - CREATEPANELS.out.exons_consensus_bed + CREATEPANELS.out.exons_consensus_panel ) } } @@ -356,7 +363,7 @@ workflow DEEPCSA{ oncodrivefml_regressions_files = channel.empty() if (params.profileall){ mode = "all" - ONCODRIVEFMLALL(somatic_mutations, MUTABILITYALL.out.mutability, + ONCODRIVEFMLALL(mutations_in_exons, MUTABILITYALL.out.mutability, CREATEPANELS.out.exons_consensus_panel, cadd_scores, mode ) @@ -368,7 +375,7 @@ workflow DEEPCSA{ } if (params.profilenonprot && params.positive_selection_non_protein_affecting){ mode = "non_prot_aff" - ONCODRIVEFMLNONPROT(somatic_mutations, MUTABILITYNONPROT.out.mutability, + ONCODRIVEFMLNONPROT(mutations_in_exons, MUTABILITYNONPROT.out.mutability, CREATEPANELS.out.exons_consensus_panel, cadd_scores, mode ) @@ -381,7 +388,7 @@ workflow DEEPCSA{ if (params.oncodrive3d){ if (params.profileall){ // Oncodrive3D - ONCODRIVE3D(somatic_mutations, MUTABILITYALL.out.mutability, CREATEPANELS.out.exons_consensus_bed, + ONCODRIVE3D(mutations_in_exons, MUTABILITYALL.out.mutability, datasets3d, annotations3d, MUT_PREPROCESSING.out.all_raw_vep_annotation) positive_selection_results = positive_selection_results.join(ONCODRIVE3D.out.results, remainder: true) positive_selection_results = positive_selection_results.join(ONCODRIVE3D.out.results_pos, remainder: true) @@ -391,7 +398,7 @@ workflow DEEPCSA{ // if (params.expected_mutated_cells & params.dnds){ if (params.dnds){ - DNDS(somatic_mutations, + DNDS(mutations_in_exons, DEPTHSEXONSCONS.out.subset, CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel @@ -404,7 +411,7 @@ workflow DEEPCSA{ // Omega if (params.profileall){ - OMEGA(somatic_mutations, + OMEGA(mutations_in_exons, DEPTHSEXONSCONS.out.subset, MUTPROFILEALL.out.profile, CREATEPANELS.out.exons_consensus_bed.first(), @@ -434,7 +441,7 @@ workflow DEEPCSA{ if (params.omega_multi){ // Omega multi - OMEGAMULTI(somatic_mutations, + OMEGAMULTI(mutations_in_exons, DEPTHSEXONSCONS.out.subset, MUTPROFILEALL.out.profile, CREATEPANELS.out.exons_consensus_bed.first(), @@ -457,7 +464,7 @@ workflow DEEPCSA{ } } if (params.profilenonprot && params.positive_selection_non_protein_affecting){ - OMEGANONPROT(somatic_mutations, + OMEGANONPROT(mutations_in_exons, DEPTHSEXONSCONS.out.subset, MUTPROFILENONPROT.out.profile, CREATEPANELS.out.exons_consensus_bed.first(), @@ -475,7 +482,7 @@ workflow DEEPCSA{ } if (params.omega_multi){ - OMEGANONPROTMULTI(somatic_mutations, + OMEGANONPROTMULTI(mutations_in_exons, DEPTHSEXONSCONS.out.subset, MUTPROFILENONPROT.out.profile, CREATEPANELS.out.exons_consensus_bed.first(), From 6dd367f0a7fb752ffad7ddfc096c5f16556cfc42 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Wed, 25 Feb 2026 16:20:25 +0100 Subject: [PATCH 11/19] enable plotting summary with run_mutdensity - allow plotting summary of interindividual variability when run_mutdensity is enabled --- workflows/deepcsa.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/deepcsa.nf b/workflows/deepcsa.nf index 2a722251..e625b4fd 100644 --- a/workflows/deepcsa.nf +++ b/workflows/deepcsa.nf @@ -576,7 +576,7 @@ workflow DEEPCSA{ // DNA2PROTEINMAPPING.out.depths_exons_positions ) - if (params.omega || params.oncodrive3d || params.oncodrivefml || params.indels) { + if (params.omega || params.oncodrive3d || params.oncodrivefml || params.indels || run_mutdensity) { if (params.omega){ positive_selection_results = positive_selection_results.combine(PLOTTINGQC.out.flagged_omegas) } From 5c2460ff5f0c5fe4e87a8575bbef6c229b0cb80f Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Wed, 25 Feb 2026 16:23:37 +0100 Subject: [PATCH 12/19] refactor: add species and genome as parameters (#229) --- bin/panels_computedna2protein.py | 12 +++++++----- conf/modules.config | 2 ++ modules/local/dna2protein/main.nf | 6 +++++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 58655d03..82234d87 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -85,7 +85,7 @@ def get_transcript_gene_from_maf(path_maf, consensus_file): return gene_transcript_pairs # Generator to filter lines before they reach a dataframe -def get_gff_to_generator(release: int = 111): +def get_gff_to_generator(release: int = 111, species: str = "homo_sapiens", genome: str = "GRCh38"): """ Get GFF file from ensembl FTP and filter it on the fly to keep only exon and CDS lines. @@ -99,7 +99,7 @@ def get_gff_to_generator(release: int = 111): generator A generator that yields lines from the GFF file that correspond to exon and CDS features. """ - url = f"https://ftp.ensembl.org/pub/release-{release}/gff3/homo_sapiens/Homo_sapiens.GRCh38.{release}.gff3.gz" + url = f"https://ftp.ensembl.org/pub/release-{release}/gff3/{species}/{species.capitalize()}.{genome}.{release}.gff3.gz" # Open request try: @@ -454,7 +454,7 @@ def dna2prot_depth(gene_list: list, coord_df: pd.DataFrame, dna_sites: pd.DataFr return dna_prot_df -def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str, release: int) -> tuple[pd.DataFrame, pd.DataFrame]: +def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str, release: int, species: str, genome: str) -> tuple[pd.DataFrame, pd.DataFrame]: """ Function to get the DNA to protein mapping for all positions in the provided list of genes, along with coverage and depth information for each position, and the definition of all exons of @@ -606,12 +606,14 @@ def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size @click.option('--mutations-file', type=click.Path(exists=True), help='Mutations file') @click.option('--consensus-file', type=click.Path(exists=True), help='Input consensus panel file') @click.option('--depths-file', type=click.Path(exists=True), help='Input depths of all samples file') +@click.option('--ensembl-species', type=str, default="homo_sapiens", help='Ensembl species name to use for GFF file retrieval (default)') +@click.option('--ensembl-genome', type=str, default="GRCh38", help='Ensembl genome name to use for GFF file retrieval (default)') @click.option('--ensembl-release', type=int, default=111, help='Ensembl release number to use for GFF file retrieval (default is 111)') -def main(mutations_file, consensus_file, depths_file, ensembl_release): +def main(mutations_file, consensus_file, depths_file, ensembl_species, ensembl_genome, ensembl_release): # Count each mutation only ones if it appears in multiple reads gene_n_transcript = get_transcript_gene_from_maf(mutations_file, consensus_file) - exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file, ensembl_release) + exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file, ensembl_release, ensembl_species, ensembl_genome) LOG.info("Exons coordinates and depth computed!") exons_depth.to_csv("depths_per_position_exon_gene.tsv", header = True, index = False, sep = '\t') diff --git a/conf/modules.config b/conf/modules.config index 2b8a0cef..d48baa50 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -74,6 +74,8 @@ process { withName: DNA2PROTEINMAPPING { ext.ensembl_release = params.vep_cache_version ?: 111 + ext.ensembl_species = params.vep_species ?: 'homo_sapiens' + ext.ensembl_genome = params.vep_genome ?: 'GRCh38' } withName: QUERYDEPTHS { diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index f89dcf4b..d5fa8caa 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -21,13 +21,17 @@ process DNA_2_PROTEIN_MAPPING { script: def ensembl_release = task.ext.ensembl_release + def ensembl_species = task.ext.ensembl_species + def ensembl_genome = task.ext.ensembl_genome """ cut -f 1,2,6 ${panel_file} | uniq > ${meta2.id}.panel.unique.tsv panels_computedna2protein.py \\ --mutations-file ${mutations_file} \\ --consensus-file ${meta2.id}.panel.unique.tsv \\ --depths-file ${all_samples_depths} \\ - --ensembl-release ${ensembl_release} + --ensembl-release ${ensembl_release} \\ + --ensembl-species ${ensembl_species} \\ + --ensembl-genome ${ensembl_genome} cat <<-END_VERSIONS > versions.yml "${task.process}": From 2998b2c146959856104a5a757334cf5d5f9aa26d Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Wed, 25 Feb 2026 17:14:21 +0100 Subject: [PATCH 13/19] add hdp signatures decomposition -not working, to be tested --- .../signatures/hdp/process_results/main.nf | 5 +- .../signatures/hdp/reformat_sigs/main.nf | 46 ++++++++++++++ .../assignment/{ => cosmic_fit}/main.nf | 4 +- .../assignment/decompose_fit/main.nf | 61 +++++++++++++++++++ subworkflows/local/signatures/main.nf | 10 +-- subworkflows/local/signatures_hdp/main.nf | 13 ++-- 6 files changed, 125 insertions(+), 14 deletions(-) create mode 100644 modules/local/signatures/hdp/reformat_sigs/main.nf rename modules/local/signatures/sigprofiler/assignment/{ => cosmic_fit}/main.nf (95%) create mode 100644 modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf diff --git a/modules/local/signatures/hdp/process_results/main.nf b/modules/local/signatures/hdp/process_results/main.nf index eaa9bba7..40fececc 100644 --- a/modules/local/signatures/hdp/process_results/main.nf +++ b/modules/local/signatures/hdp/process_results/main.nf @@ -9,8 +9,9 @@ process PROCESS_HDP_RESULTS { tuple val(meta2), path (iteration_dir) output: - tuple val(meta), path("output_dir"), emit: processed_results - path "versions.yml" , topic: versions + tuple val(meta), path("output_dir") , emit: processed_results + tuple val(meta), path("**/components.txt") , emit: signatures + path "versions.yml" , topic: versions script: diff --git a/modules/local/signatures/hdp/reformat_sigs/main.nf b/modules/local/signatures/hdp/reformat_sigs/main.nf new file mode 100644 index 00000000..d450a2cc --- /dev/null +++ b/modules/local/signatures/hdp/reformat_sigs/main.nf @@ -0,0 +1,46 @@ + +process SIGNATURES_HDP_TO_SIGPROFILER { + tag "$meta.id" + label 'process_medium' + label 'deepcsa_core' + + + input: + tuple val(meta), path (signatures) + + output: + tuple val(meta), path("*.tsv") , emit: signatures_for_sp + path "versions.yml" , topic: versions + + + script: + def prefix = task.ext.prefix ?: "" + prefix = "${meta.id}${prefix}" + """ + #!/usr/bin/env python + + import pandas as pd + data = pd.read_table("${signatures}") + data = data.sort_index().reset_index() + data.columns = ["Type"] + [f"HDP_{x}" for x in data.columns[1:] ] + data.to_csv("${signatures.baseName}.sp_formatted.tsv", sep = '\t', header = True, index = False) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "" + prefix = "${meta.id}${prefix}" + """ + touch ${prefix}.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + +} \ No newline at end of file diff --git a/modules/local/signatures/sigprofiler/assignment/main.nf b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf similarity index 95% rename from modules/local/signatures/sigprofiler/assignment/main.nf rename to modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf index 0e725789..73bdacab 100644 --- a/modules/local/signatures/sigprofiler/assignment/main.nf +++ b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf @@ -1,4 +1,4 @@ -process SIGPROFILERASSIGNMENT { +process SIGPROFILERASSIGNMENT_COSMIC_FIT { tag "$meta.id" label 'process_medium' @@ -30,7 +30,7 @@ process SIGPROFILERASSIGNMENT { SigProfilerAssignment cosmic_fit \\ ${matrix} \\ output_${name} \\ - --signatures ${reference_signatures} \\ + --signature_database ${reference_signatures} \\ --genome_build ${assembly} \\ --cpu ${task.cpus} \\ --volume spa_volume diff --git a/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf b/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf new file mode 100644 index 00000000..5edfd7e9 --- /dev/null +++ b/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf @@ -0,0 +1,61 @@ +process SIGPROFILERASSIGNMENT_DECOMPOSE_FIT { + tag "$meta.id" + label 'process_medium' + + container 'docker.io/ferriolcalvet/sigprofiler_assignment:1.1.3' + + input: + tuple val(meta), val(type), path(matrix) + path (extracted_signatures) + path (reference_signatures) + + output: + tuple val(meta), path("**.pdf") , emit: plots + tuple val(meta), path("**.txt") , emit: stats + tuple val(meta), path("**Decomposed_MutationType_Probabilities.*.txt") , emit: mutation_probs + path "versions.yml" , topic: versions + + + script: + def name = "${meta.id}.${type}" + def assembly = task.ext.assembly ?: "GRCh38" + + // FIXME: the definition of subgroups to exclude seems not to work in the new CLI SigProfilerAssignment + // def exclude_signature_subgroups = params.exclude_subgroups ? "--exclude_signature_subgroups \"${params.exclude_subgroups}\"" : "" + """ + mkdir -p spa_volume + export SIGPROFILERMATRIXGENERATOR_VOLUME='./spa_volume' + export SIGPROFILERPLOTTING_VOLUME='./spa_volume' + export SIGPROFILERASSIGNMENT_VOLUME='./spa_volume' + + SigProfilerAssignment decompose_fit \\ + ${matrix} \\ + signature_decomposition_${name} \\ + --signatures ${extracted_signatures} \\ + --signature_database ${reference_signatures} \\ + --genome_build ${assembly} \\ + --cpu ${task.cpus} \\ + --volume spa_volume + + #mv signature_decomposition_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.txt signature_decomposition_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.${name}.txt; + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + SigProfilerAssignment : 1.1.3 + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "" + prefix = "${meta.id}${prefix}" + """ + touch ${prefix}.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + SigProfilerAssignment : 1.1.3 + END_VERSIONS + """ +} diff --git a/subworkflows/local/signatures/main.nf b/subworkflows/local/signatures/main.nf index ffc71f7a..1cc8798a 100644 --- a/subworkflows/local/signatures/main.nf +++ b/subworkflows/local/signatures/main.nf @@ -1,7 +1,8 @@ -include { MATRIX_CONCAT as MATRIXCONCATWGS } from '../../../modules/local/sig_matrix_concat/main' -include { SIGPROFILERASSIGNMENT } from '../../../modules/local/signatures/sigprofiler/assignment/main' -include { SIGNATURES_PROBABILITIES as SIGPROBS } from '../../../modules/local/combine_sbs/main' -include { HDP_EXTRACTION as HDPEXTRACTION } from '../signatures_hdp/main' +include { MATRIX_CONCAT as MATRIXCONCATWGS } from '../../../modules/local/sig_matrix_concat/main' +include { SIGPROFILERASSIGNMENT_COSMIC_FIT as SIGPROFILERASSIGNMENT } from '../../../modules/local/signatures/sigprofiler/assignment/cosmic_fit/main' +include { SIGPROFILERASSIGNMENT_DECOMPOSE_FIT as HDPREASSIGNMENT } from '../../../modules/local/signatures/sigprofiler/assignment/decompose_fit/main' +include { SIGNATURES_PROBABILITIES as SIGPROBS } from '../../../modules/local/combine_sbs/main' +include { HDP_EXTRACTION as HDPEXTRACTION } from '../signatures_hdp/main' workflow SIGNATURES { @@ -38,6 +39,7 @@ workflow SIGNATURES { HDPEXTRACTION(named_matrices_wgs_hdp, reference_signatures) + HDPREASSIGNMENT(named_matrices_wgs_sp, HDPEXTRACTION.out.signatures, reference_signatures) emit: plots = SIGPROFILERASSIGNMENT.out.plots // channel: [ val(meta), file(depths) ] diff --git a/subworkflows/local/signatures_hdp/main.nf b/subworkflows/local/signatures_hdp/main.nf index f6b1dd2e..e11082db 100644 --- a/subworkflows/local/signatures_hdp/main.nf +++ b/subworkflows/local/signatures_hdp/main.nf @@ -1,9 +1,8 @@ include { PREPARE_INPUT } from '../../../modules/local/signatures/hdp/prepareinput/main' include { RUN_HDP_CHAIN_SAMPLING } from '../../../modules/local/signatures/hdp/chainsampling/main' include { PROCESS_HDP_RESULTS } from '../../../modules/local/signatures/hdp/process_results/main' -include { COMPARE_SIGNATURES as COMPARESIGNATURES } from '../../../modules/local/signatures/hdp/compare_sigs/main' - - +include { COMPARE_SIGNATURES as COMPARESIGNATURES } from '../../../modules/local/signatures/hdp/compare_sigs/main' +include { SIGNATURES_HDP_TO_SIGPROFILER as REFORMATSIGNATURES } from '../../../modules/local/signatures/hdp/reformat_sigs/main' workflow HDP_EXTRACTION { @@ -32,9 +31,11 @@ workflow HDP_EXTRACTION { COMPARESIGNATURES(PROCESS_HDP_RESULTS.out.processed_results, reference_signatures) + REFORMATSIGNATURES(PROCESS_HDP_RESULTS.out.processed_results) + - // emit: - // plots = SIGPROFILERASSIGNMENT.out.plots // channel: [ val(meta), file(depths) ] - // // plots_extraction = MSIGHDP.out.plots // channel: [ val(meta), file(depths) ] + emit: + signatures = REFORMATSIGNATURES.out.signatures_for_sp // channel: [ val(meta), file(depths) ] + // plots_extraction = MSIGHDP.out.plots // channel: [ val(meta), file(depths) ] // mutation_probs = signature_probs_samples } From f665a5d7dc0d16cf6f41c5ebfd9f1c6155a4b2b4 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Thu, 26 Feb 2026 12:25:55 +0100 Subject: [PATCH 14/19] HDP decomposition and reassignment working - use SPA to decompose and reassign signatures to samples --- .../local/signatures/hdp/process_results/main.nf | 2 +- .../local/signatures/hdp/reformat_sigs/main.nf | 15 +++++++-------- .../sigprofiler/assignment/cosmic_fit/main.nf | 2 +- .../sigprofiler/assignment/decompose_fit/main.nf | 8 +++++--- subworkflows/local/signatures_hdp/main.nf | 2 +- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/modules/local/signatures/hdp/process_results/main.nf b/modules/local/signatures/hdp/process_results/main.nf index 40fececc..5b93dafd 100644 --- a/modules/local/signatures/hdp/process_results/main.nf +++ b/modules/local/signatures/hdp/process_results/main.nf @@ -10,7 +10,7 @@ process PROCESS_HDP_RESULTS { output: tuple val(meta), path("output_dir") , emit: processed_results - tuple val(meta), path("**/components.txt") , emit: signatures + tuple val(meta), path("**/components.txt") , emit: signatures path "versions.yml" , topic: versions diff --git a/modules/local/signatures/hdp/reformat_sigs/main.nf b/modules/local/signatures/hdp/reformat_sigs/main.nf index d450a2cc..b596f9e7 100644 --- a/modules/local/signatures/hdp/reformat_sigs/main.nf +++ b/modules/local/signatures/hdp/reformat_sigs/main.nf @@ -17,14 +17,13 @@ process SIGNATURES_HDP_TO_SIGPROFILER { def prefix = task.ext.prefix ?: "" prefix = "${meta.id}${prefix}" """ - #!/usr/bin/env python - - import pandas as pd - data = pd.read_table("${signatures}") - data = data.sort_index().reset_index() - data.columns = ["Type"] + [f"HDP_{x}" for x in data.columns[1:] ] - data.to_csv("${signatures.baseName}.sp_formatted.tsv", sep = '\t', header = True, index = False) - + python < versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') diff --git a/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf index 73bdacab..848ef692 100644 --- a/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf +++ b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf @@ -35,7 +35,7 @@ process SIGPROFILERASSIGNMENT_COSMIC_FIT { --cpu ${task.cpus} \\ --volume spa_volume - mv output_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.txt output_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.${name}.txt; + cp output_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.txt output_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.${name}.txt; cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf b/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf index 5edfd7e9..b5442b0c 100644 --- a/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf +++ b/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf @@ -6,7 +6,7 @@ process SIGPROFILERASSIGNMENT_DECOMPOSE_FIT { input: tuple val(meta), val(type), path(matrix) - path (extracted_signatures) + tuple val(meta2), path (extracted_signatures) path (reference_signatures) output: @@ -17,6 +17,7 @@ process SIGPROFILERASSIGNMENT_DECOMPOSE_FIT { script: + def args = task.ext.args ?: '' def name = "${meta.id}.${type}" def assembly = task.ext.assembly ?: "GRCh38" @@ -35,9 +36,10 @@ process SIGPROFILERASSIGNMENT_DECOMPOSE_FIT { --signature_database ${reference_signatures} \\ --genome_build ${assembly} \\ --cpu ${task.cpus} \\ - --volume spa_volume + --volume spa_volume \\ + ${args} - #mv signature_decomposition_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.txt signature_decomposition_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.${name}.txt; + cp signature_decomposition_${name}/Decompose_Solution/Activities/Decomposed_MutationType_Probabilities.txt signature_decomposition_${name}/Decompose_Solution/Activities/Decomposed_MutationType_Probabilities.${name}.txt; cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/signatures_hdp/main.nf b/subworkflows/local/signatures_hdp/main.nf index e11082db..9ece3ca2 100644 --- a/subworkflows/local/signatures_hdp/main.nf +++ b/subworkflows/local/signatures_hdp/main.nf @@ -31,7 +31,7 @@ workflow HDP_EXTRACTION { COMPARESIGNATURES(PROCESS_HDP_RESULTS.out.processed_results, reference_signatures) - REFORMATSIGNATURES(PROCESS_HDP_RESULTS.out.processed_results) + REFORMATSIGNATURES(PROCESS_HDP_RESULTS.out.signatures) emit: From 1f0650b024aa93a6696691ea0b2753318f060c13 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Thu, 26 Feb 2026 14:42:16 +0100 Subject: [PATCH 15/19] fix output and versions --- conf/modules.config | 20 +++++++++++++++++++ .../signatures/hdp/reformat_sigs/main.nf | 11 +++++----- .../sigprofiler/assignment/cosmic_fit/main.nf | 6 ++---- subworkflows/local/signatures/main.nf | 2 +- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c9267ed4..c2870b54 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -546,6 +546,26 @@ process { ] } + withName: HDPREASSIGNMENT { + publishDir = [ + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/signatures/hdp_decomposition_spa" }, + pattern: "**{txt,pdf}", + ] + ] + } + + withName: REFORMATSIGNATURES { + publishDir = [ + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/signatures/signatures_hdp/signatures_reformatted" }, + pattern: "**{tsv}", + ] + ] + } + withName: SIGPROMATRIXGENERATOR { ext.args = "--plot \ --tsb_stat \ diff --git a/modules/local/signatures/hdp/reformat_sigs/main.nf b/modules/local/signatures/hdp/reformat_sigs/main.nf index b596f9e7..d9c70d08 100644 --- a/modules/local/signatures/hdp/reformat_sigs/main.nf +++ b/modules/local/signatures/hdp/reformat_sigs/main.nf @@ -24,11 +24,12 @@ data = data.sort_index().reset_index() data.columns = ["Type"] + [f"HDP_{x}" for x in data.columns[1:]] data.to_csv("${signatures.baseName}.sp_formatted.tsv", sep='\\t', header=True, index=False) CODE - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ + +cat <<-END_VERSIONS > versions.yml +"${task.process}": + python: \$(python --version | sed 's/Python //g') +END_VERSIONS + """ stub: def prefix = task.ext.prefix ?: "" diff --git a/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf index 848ef692..720c8da7 100644 --- a/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf +++ b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf @@ -39,8 +39,7 @@ process SIGPROFILERASSIGNMENT_COSMIC_FIT { cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version | sed 's/Python //g') - SigProfilerAssignment : 0.1.1 + SigProfilerAssignment : 1.1.3 END_VERSIONS """ @@ -52,8 +51,7 @@ process SIGPROFILERASSIGNMENT_COSMIC_FIT { cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version | sed 's/Python //g') - SigProfilerAssignment : 0.1.1 + SigProfilerAssignment : 1.1.3 END_VERSIONS """ } diff --git a/subworkflows/local/signatures/main.nf b/subworkflows/local/signatures/main.nf index 1cc8798a..17da923c 100644 --- a/subworkflows/local/signatures/main.nf +++ b/subworkflows/local/signatures/main.nf @@ -39,7 +39,7 @@ workflow SIGNATURES { HDPEXTRACTION(named_matrices_wgs_hdp, reference_signatures) - HDPREASSIGNMENT(named_matrices_wgs_sp, HDPEXTRACTION.out.signatures, reference_signatures) + HDPREASSIGNMENT(named_matrices_wgs_sp, HDPEXTRACTION.out.signatures.first(), reference_signatures) emit: plots = SIGPROFILERASSIGNMENT.out.plots // channel: [ val(meta), file(depths) ] From 2056733a809471fc12dc50410c3ad19b07dee458 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Thu, 26 Feb 2026 15:24:23 +0100 Subject: [PATCH 16/19] fix output of masks --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index c2870b54..a8adaea6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -239,7 +239,7 @@ process { ], [ mode: params.publish_dir_mode, - path: { "${params.outdir}/flagged_positions" }, + path: { "${params.outdir}/processing_files/flagged_positions" }, pattern: "*{bed}", ] ] From e73286acb6e27a531125c11051dcf3e69ddf3f9d Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Fri, 27 Feb 2026 07:39:11 +0100 Subject: [PATCH 17/19] fix bug and adjust resources --- conf/modules.config | 2 +- modules/local/dna2protein/main.nf | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index b99c7c76..ee0956c0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -112,7 +112,7 @@ process { } - withName: QUERYMUTATIONS { + withName: 'QUERYMUTATIONS|QUERYMUTATIONSEXONS' { ext.prefix = { ".subset_mutations" } ext.args = '' ext.args2 = '-s 1 -b 2 -e 2' diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index d5fa8caa..2a6a97da 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -1,6 +1,7 @@ process DNA_2_PROTEIN_MAPPING { tag "$meta.id" label 'process_single' + label 'process_medium_high_memory' label 'deepcsa_core' From f34fad9c2a134acf5fa82421ca9eed8a3d4d999a Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Fri, 27 Feb 2026 07:44:45 +0100 Subject: [PATCH 18/19] fix bug dnds --- workflows/deepcsa.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/deepcsa.nf b/workflows/deepcsa.nf index e625b4fd..8fb08139 100644 --- a/workflows/deepcsa.nf +++ b/workflows/deepcsa.nf @@ -400,7 +400,6 @@ workflow DEEPCSA{ if (params.dnds){ DNDS(mutations_in_exons, DEPTHSEXONSCONS.out.subset, - CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel ) } From 437b83c9f5b24b098522069fecca2480dde2e097 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Fri, 27 Feb 2026 08:57:11 +0100 Subject: [PATCH 19/19] fix genome assembly def in sigprofiler --- conf/modules.config | 16 ++++++++++------ .../sigprofiler/assignment/cosmic_fit/main.nf | 2 +- .../sigprofiler/assignment/decompose_fit/main.nf | 2 +- .../signatures/sigprofiler/extractor/main.nf | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ee0956c0..94f6c967 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -542,6 +542,14 @@ process { --seed 123" } + withName: 'SIGPROFILERASSIGNMENT|HDPREASSIGNMENT|SIGPROMATRIXGENERATOR' { + ext.genome_assembly = params.vep_genome == 'GRCh38' + ? 'GRCh38' + : params.vep_genome == 'GRCm39' + ? 'mm39' + : null + } + withName: SIGPROFILERASSIGNMENT { publishDir = [ [ @@ -577,11 +585,7 @@ process { --tsb_stat \ --seqInfo \ --cushion 100" - ext.genome_assembly = params.vep_genome == 'GRCh38' - ? 'GRCh38' - : params.vep_genome == 'GRCm39' - ? 'mm39' - : null + publishDir = [ [ mode: params.publish_dir_mode, @@ -644,7 +648,7 @@ process { --missing_values mean_samples" } - withName: 'SITESFROMPOSITIONS|SUMANNOTATION|SIGPROFILERASSIGNMENT|ONCODRIVECLUSTL|POSTPROCESSVEPPANEL' { + withName: 'SITESFROMPOSITIONS|SUMANNOTATION|ONCODRIVECLUSTL|POSTPROCESSVEPPANEL' { ext.assembly = params.vep_genome == 'GRCh38' ? 'hg38' : params.vep_genome == 'GRCm39' diff --git a/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf index 720c8da7..b3f77163 100644 --- a/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf +++ b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf @@ -17,7 +17,7 @@ process SIGPROFILERASSIGNMENT_COSMIC_FIT { script: def name = "${meta.id}.${type}" - def assembly = task.ext.assembly ?: "GRCh38" + def assembly = task.ext.genome_assembly ?: "GRCh38" // FIXME: the definition of subgroups to exclude seems not to work in the new CLI SigProfilerAssignment // def exclude_signature_subgroups = params.exclude_subgroups ? "--exclude_signature_subgroups \"${params.exclude_subgroups}\"" : "" diff --git a/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf b/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf index b5442b0c..fded2c7d 100644 --- a/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf +++ b/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf @@ -19,7 +19,7 @@ process SIGPROFILERASSIGNMENT_DECOMPOSE_FIT { script: def args = task.ext.args ?: '' def name = "${meta.id}.${type}" - def assembly = task.ext.assembly ?: "GRCh38" + def assembly = task.ext.genome_assembly ?: "GRCh38" // FIXME: the definition of subgroups to exclude seems not to work in the new CLI SigProfilerAssignment // def exclude_signature_subgroups = params.exclude_subgroups ? "--exclude_signature_subgroups \"${params.exclude_subgroups}\"" : "" diff --git a/modules/local/signatures/sigprofiler/extractor/main.nf b/modules/local/signatures/sigprofiler/extractor/main.nf index 2673aeac..4f774066 100644 --- a/modules/local/signatures/sigprofiler/extractor/main.nf +++ b/modules/local/signatures/sigprofiler/extractor/main.nf @@ -25,7 +25,7 @@ process SIGPROFILEREXTRACTOR { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "" prefix = "${meta.id}${prefix}" - def assembly = task.ext.assembly ?: "GRCh38" + def assembly = task.ext.genome_assembly ?: "GRCh38" // python -c "from SigProfilerAssignment import Analyzer as Analyze; Analyze.cosmic_fit('${matrix}', 'output', input_type='matrix', context_type='96', // collapse_to_SBS96=True, cosmic_version=3.4, exome=False, // genome_build="GRCh38", signature_database='${reference_signatures}',