From ada77ebd98a02fcc6a9defdd1c1564013775a538 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Mon, 23 Feb 2026 14:49:47 +0100 Subject: [PATCH 1/9] documentation: add docstrings (#229) - also simplified plot_coverage_per_gene --- bin/panels_computedna2protein.py | 95 +++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 14 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 4a0437c5..b2a57771 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -1,8 +1,36 @@ #!/usr/bin/env python -import time +""" +Panels Compute DNA to Protein - MAF processing to protein positions and +coverage computation. + +This script processes the MAF file to retrieve the transcript-gene pairs, +then retrieves exon, and CDS coordinates and maps DNA positions to protein positions. +It also computes the coverage of each position and generates plots of coverage +per gene. + +Arguments: +---------------------- +--mutations-file : str + Path to the MAF file containing mutations. +--consensus-file : str + Path to the consensus panel file. +--depths-file : str + Path to the file containing depth information for all samples. + +Authors +---------------------- +Author : Stefano Pellegrini (@St3451) +Email : stefano.pellegrini@irbbarcelona.org + +Contributors +---------------------- +- Ferriol Calvet - @FerriolCalvet (ferriol.calvet@irbbarcelona.org) +- Marta Huertas - @m-huertasp (marta.huertas@irbbarcelona.org) +""" -# Third-party imports +import time +import logging import requests import numpy as np import pandas as pd @@ -10,14 +38,35 @@ import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages +# Logging +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s - %(message)s", level=logging.DEBUG, datefmt="%m/%d/%Y %I:%M:%S %p" +) +LOG = logging.getLogger("DNA2protein") + ##### # Define functions ##### def get_transcript_gene_from_maf(path_maf, consensus_file): + """ + Process MAF file to retrieve gene-transcript pairs for the genes in the consensus panel. + + Parameters + ---------- + path_maf : str + Path to the MAF file containing mutations. + consensus_file : str + Path to the consensus panel file. + + Returns + ------- + gene_transcript_pairs : pandas.DataFrame + DataFrame with gene-transcript pairs. + """ maf_df = pd.read_table(path_maf) genes_in_consensus = pd.read_table(consensus_file)["GENE"].unique() - # Final filter + # Filter MAF (keep only positions with canonical protein position) maf_df_f = maf_df[ (maf_df["TYPE"].isin(["SNV", "INSERTION", "DELETION"])) & (maf_df["canonical_SYMBOL"].isin(genes_in_consensus)) @@ -27,27 +76,45 @@ def get_transcript_gene_from_maf(path_maf, consensus_file): gene_transcript_pairs = maf_df_f[["canonical_SYMBOL", "canonical_Feature"]].drop_duplicates().reset_index(drop=True) gene_transcript_pairs.columns = ["Gene", "Ens_transcript_ID"] + LOG.info(f"Retrieved {len(gene_transcript_pairs)} gene-transcript pairs from MAF file.") return gene_transcript_pairs def plot_coverage_per_gene(depths_df): + """ + Wrapper function to plot coverage per gene for DNA, protein and exon levels. - dna_coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) - dna_coverage_summary = dna_coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - plot_single_coverage(dna_coverage_summary, "DNA") - - prot_coverage = depths_df.drop_duplicates(subset=['GENE', 'PROT_POS', 'COVERED']) - prot_coverage_summary = prot_coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - plot_single_coverage(prot_coverage_summary, "Protein") + Parameters + ---------- + depths_df : pandas.DataFrame + DataFrame containing depth and coverage information for DNA, protein and exon positions. + In this case, it will be the "exons_depth" created in `get_dna2prot_depth` function. + """ + coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) + coverage_summary = coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - exon_coverage = depths_df.drop_duplicates(subset=['GENE', 'EXON_RANK', 'COVERED']) - exon_coverage_summary = exon_coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - plot_single_coverage(exon_coverage_summary, "Exon") + for prefix in ["DNA", "Protein", "Exon"]: + LOG.info(f"Plotting coverage for {prefix}...") + plot_single_coverage(coverage_summary, prefix) def plot_single_coverage(coverage_summary, prefix, batch_size = 5): + """ + Plot coverage for a single prefix (DNA, protein or exon) and save the results in a PDF file. + + Parameters + ---------- + coverage_summary : pandas.DataFrame + DataFrame containing the coverage summary for each gene and coverage status. + prefix : str + The prefix to plot (DNA, Protein or Exon). + batch_size : int, optional + The number of genes to include in each batch when plotting (default is 5). + """ + # Generate copy of the coverage summary to avoid modifying the original DataFrame + coverage_summary_cp = coverage_summary.copy() - coverage_pivot = coverage_summary.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) + coverage_pivot = coverage_summary_cp.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) coverage_pivot = coverage_pivot.sort_values(1, ascending=False).reset_index() coverage_pivot = coverage_pivot.set_index('GENE') genes_list = coverage_pivot.index.tolist() From d538127f72b79242157f298b1514bb086ef7d62b Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 10:27:17 +0100 Subject: [PATCH 2/9] refactor: get gff through ftp instead of api calls (#229) - Removed functions to get the api calls - New functions to obtain the gff and filter it - Modified functions to process the data - Added docstrings --- bin/panels_computedna2protein.py | 215 ++++++++++++++++++++++--------- 1 file changed, 151 insertions(+), 64 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index b2a57771..ada6d51c 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -29,12 +29,16 @@ - Marta Huertas - @m-huertasp (marta.huertas@irbbarcelona.org) """ +import click import time import logging import requests import numpy as np import pandas as pd -import click +import polars as pl +import io +import gzip +import re import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages @@ -183,66 +187,118 @@ def plot_single_coverage(coverage_summary, prefix, batch_size = 5): # Depth # ===== +# Generator to filter lines before they reach a dataframe +def get_gff_to_generator(release: int = 111): + """ + Get GFF file from ensembl FTP and filter it on the fly to keep only exon and CDS lines. + Parameters + ------------ + release : int + The release number of the Ensembl GFF file to retrieve (default is 111). -def get_tr_lookup(transcript_id, max_iter = 20): + Returns + ------------ + generator + A generator that yields lines from the GFF file that correspond to exon and CDS features. """ - Get exons coord + url = f"https://ftp.ensembl.org/pub/release-{release}/gff3/homo_sapiens/Homo_sapiens.GRCh38.{release}.gff3.gz" + + # Open request + response = requests.get(url, stream=True) + # decompress the stream on the fly + with gzip.GzipFile(fileobj=response.raw) as gz: + for line in gz: + # Decode bytes to string + line_str = line.decode('utf-8') + + # Skip comments + if line_str.startswith('#'): + continue + + # Only "yield" lines that are exon or CDS + # This drastically reduces the data sent to the DataFrame + parts = line_str.split('\t') + if parts[2] in ["exon", "CDS"]: + yield line_str + +def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> pd.DataFrame: """ + Transforms the yields from get_gff_to_generator into a filtered DataFrame. The reading and filtering is done with polars + to improve efficiency. - server = "https://rest.ensembl.org" - ext = f"/lookup/id/{transcript_id}?expand=1" - - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - - iter_count = 0 - while not r.ok and iter_count < max_iter: - print("Retrying lookup... (attempt {}/{})".format(iter_count+1, max_iter)) - time.sleep(5) - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - iter_count += 1 - - return r.json() - + Parameters + ------------ + gene_n_transcript : pd.DataFrame + A DataFrame containing gene and transcript information. + release : int + The Ensembl release number to use for GFF file retrieval (default is 111). -def get_cds_coord(transcript_id, len_cds_with_utr, max_iter = 20): + Returns + ------------ + pd.DataFrame + A filtered DataFrame of GFF lines for the specified genes and release. """ - Get CDS coordinates + # Join the generator into a single buffer for Polars to read + filtered_buffer = io.StringIO("".join(get_gff_to_generator(release=release))) + + # Read generator with polars, generate transcript_id column and filter by the genes in the panel + df = ( + pl.read_csv( + filtered_buffer, + separator="\t", + has_header=False, + new_columns=["chr", "source", "feature", "start", "end", "score", "strand", "phase", "attributes"], + schema_overrides={"start": pl.Int64, "end": pl.Int64, "chr": pl.Utf8, "feature": pl.Utf8, "attributes": pl.Utf8} + ) + .with_columns([ + pl.col("attributes").str.extract(r"transcript:(ENST\d+)", 1).alias("transcript_id") + ]) + .filter( + [pl.col("transcript_id").is_in(gene_n_transcript["Ens_transcript_ID"].to_list())] + ) + ) + + # Transform to pandas for easier manipulation later on + return df.to_pandas() + +def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list[int]: """ - server = "https://rest.ensembl.org" - ext = f"/map/cds/{transcript_id}/1..{len_cds_with_utr}?" - - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - - iter_count = 0 - while not r.ok and iter_count < max_iter: - print("Retrying CDS map... (attempt {}/{})".format(iter_count+1, max_iter)) - time.sleep(5) - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - iter_count += 1 - - return r.json()["mappings"] + Parses the coordinates of an exon row from the GFF DataFrame and returns the exon ID and its coordinates in a list format. + Parameters + ---------- + exon : pandas.Series + A row from the GFF DataFrame corresponding to an exon feature. -def parse_cds_coord(exon): + Returns + ------- + exon_id : str + The ID of the exon extracted from the attributes column. + exons_coord : list + A list containing the chromosome, start, end, and strand information of the exon. + """ - strand = exon["strand"] + strand = 1 if exon.strand == "+" else -1 if strand == 1: - start = exon["start"] - end = exon["end"] + start = exon.start + end = exon.end else: - start = exon["end"] - end = exon["start"] + start = exon.end + end = exon.start - if "id" in exon: - exon_id = exon["id"] - chrom = f'chr{exon["seq_region_name"]}' + # Extract exon_id using regex + match = re.search(r"exon_id=([^;]+)", exon.attributes) + if match: + # Extract exon_id using regex + exon_id = match.group(1) + chrom = f'chr{exon.chr}' return exon_id, [chrom, start, end, strand] else: - chrom = exon["seq_region_name"] + chrom = exon.chr return [chrom, start, end, strand] @@ -295,38 +351,69 @@ def get_prot_coverage(dna_prot_df, gene, filter_masked_depth=True): return gene_dna_prot_df -def get_exon_coord_wrapper(gene_n_transcript): +def get_exon_coord_wrapper_new(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Wrapper function to retrieve exon and CDS coordinates for the genes and transcripts in the panel. - # Init df for coordinates - coord_df = gene_n_transcript + Parameters + ---------- + gene_n_transcript : pandas.DataFrame + A DataFrame containing gene and transcript information for the panel. + gff_df : pandas.DataFrame + A DataFrame containing the GFF data filtered for exon and CDS features. - # Get coord + Returns + ------- + final_coord_df : pandas.DataFrame + A DataFrame containing the CDS coordinates for each gene-transcript pair, along with protein position mapping. + final_exons_df : pandas.DataFrame + A DataFrame containing the exon coordinates (including UTRs) for each gene-transcript pair. + """ coord_df_lst = [] exons_coord_df_lst = [] - for gene, transcript in coord_df.values: - print("Processing gene:", gene) - coord_lst = [] - # Get the coord of exons with CDS and UTR as well as the length with UTR and exons ID - exons_lookup = get_tr_lookup(transcript) # We will use this to get Exons ID - for i, exon in enumerate(exons_lookup["Exon"]): - exon_id, exons_coord = parse_cds_coord(exon) - exons_coord_df_lst.append([f"{gene}--exon_{i+1}_{transcript}_{exon_id}"] + exons_coord) + for gene, transcript in gene_n_transcript.values: + print(f"Processing gene: {gene}") + + # Get all features for this transcript once to avoid double filtering + transcript_data = gff_df[gff_df["transcript_id"] == transcript] + if transcript_data.empty: + continue - # Get the CDS coordinates of the exons removing UTRs to map to protein positions - for i, exon in enumerate(get_cds_coord(transcript, exons_lookup["length"])): - coord_lst.append((parse_cds_coord(exon) + [i])) + # Determine strand from the first available entry + strand = transcript_data["strand"].iloc[0] + is_positive = (strand == "+") - gene_coord_df = pd.DataFrame(coord_lst, columns = ["Chr", "Start", "End", "Strand", "Exon_rank"]) - gene_coord_df["Gene"] = gene - gene_coord_df["Ens_transcript_ID"] = transcript - coord_df_lst.append(gene_coord_df) + # --- Handle exons (with UTRs) --- + exons_lookup = transcript_data[transcript_data["feature"] == "exon"] + # Sort based on strand: '+' is low-to-high, '-' is high-to-low + exons_lookup = exons_lookup.sort_values("start", ascending=is_positive) - coord_df = pd.concat(coord_df_lst) - exons_coord_df = pd.DataFrame(exons_coord_df_lst, columns = ["ID", "Chr", "Start", "End", "Strand"]) + for i, exon in enumerate(exons_lookup.itertuples(index=False)): + exon_id, exons_coord = parse_cds_coord(exon) + exons_coord_df_lst.append([f"{gene}--exon_{i+1}_{transcript}_{exon_id}"] + exons_coord) - return coord_df, exons_coord_df + # --- Handle CDS --- + cds_lookup = transcript_data[transcript_data["feature"] == "CDS"] + cds_lookup = cds_lookup.sort_values("start", ascending=is_positive) + coord_lst = [] + for i, exon in enumerate(cds_lookup.itertuples(index=False)): + exons_coord = parse_cds_coord(exon) + # Include the biological rank (i) + coord_lst.append(exons_coord + [i]) + + if coord_lst: + gene_coord_df = pd.DataFrame(coord_lst, columns=["Chr", "Start", "End", "Strand", "Exon_rank"]) + gene_coord_df["Gene"] = gene + gene_coord_df["Ens_transcript_ID"] = transcript + coord_df_lst.append(gene_coord_df) + + # Combine results + final_coord_df = pd.concat(coord_df_lst) if coord_df_lst else pd.DataFrame() + final_exons_df = pd.DataFrame(exons_coord_df_lst, columns=["ID", "Chr", "Start", "End", "Strand"]) + + return final_coord_df, final_exons_df def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): From 683d50f7dac0003ddbe9018bc674f3053992c6ac Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 10:57:21 +0100 Subject: [PATCH 3/9] documentation: add docstrings to rest of functions (#229) - Also modified the order - Removed unused functions --- bin/panels_computedna2protein.py | 483 ++++++++++++++++--------------- 1 file changed, 255 insertions(+), 228 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index ada6d51c..41793d2b 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -30,7 +30,6 @@ """ import click -import time import logging import requests import numpy as np @@ -48,9 +47,9 @@ ) LOG = logging.getLogger("DNA2protein") -##### -# Define functions -##### + +# Data parsing and processing functions +# ---------------------------------------------------------- def get_transcript_gene_from_maf(path_maf, consensus_file): """ Process MAF file to retrieve gene-transcript pairs for the genes in the consensus panel. @@ -83,110 +82,6 @@ def get_transcript_gene_from_maf(path_maf, consensus_file): LOG.info(f"Retrieved {len(gene_transcript_pairs)} gene-transcript pairs from MAF file.") return gene_transcript_pairs - -def plot_coverage_per_gene(depths_df): - """ - Wrapper function to plot coverage per gene for DNA, protein and exon levels. - - Parameters - ---------- - depths_df : pandas.DataFrame - DataFrame containing depth and coverage information for DNA, protein and exon positions. - In this case, it will be the "exons_depth" created in `get_dna2prot_depth` function. - """ - coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) - coverage_summary = coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - - for prefix in ["DNA", "Protein", "Exon"]: - LOG.info(f"Plotting coverage for {prefix}...") - plot_single_coverage(coverage_summary, prefix) - - -def plot_single_coverage(coverage_summary, prefix, batch_size = 5): - """ - Plot coverage for a single prefix (DNA, protein or exon) and save the results in a PDF file. - - Parameters - ---------- - coverage_summary : pandas.DataFrame - DataFrame containing the coverage summary for each gene and coverage status. - prefix : str - The prefix to plot (DNA, Protein or Exon). - batch_size : int, optional - The number of genes to include in each batch when plotting (default is 5). - """ - # Generate copy of the coverage summary to avoid modifying the original DataFrame - coverage_summary_cp = coverage_summary.copy() - - coverage_pivot = coverage_summary_cp.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) - coverage_pivot = coverage_pivot.sort_values(1, ascending=False).reset_index() - coverage_pivot = coverage_pivot.set_index('GENE') - genes_list = coverage_pivot.index.tolist() - coverage_perc = coverage_pivot.div(coverage_pivot.sum(axis=1), axis=0) * 100 - coverage_pivot.to_csv(f"coverage_count_{prefix}.tsv", header=True, index=True, sep='\t') - coverage_perc.to_csv(f"coverage_perc_{prefix}.tsv", header=True, index=True, sep='\t') - - if len(genes_list) < batch_size: - - fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - - covered_col = True if True in coverage_pivot.columns else (1 if 1 in coverage_pivot.columns else coverage_pivot.columns[-1]) - - coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff'], - ) - - axes[0].set_title(f'{prefix} : covered vs. non-covered') - axes[0].set_ylabel(f'Number of {prefix}') - axes[0].set_xlabel('Gene') - axes[0].legend(title='Covered', loc='upper right') - axes[0].tick_params(axis='x', rotation=45) - - coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') - axes[1].set_title(f'{prefix} : percentage of covered') - axes[1].set_ylabel('Percentage (%)') - axes[1].set_xlabel('Gene') - axes[1].set_ylim(0, 100) - axes[1].tick_params(axis='x', rotation=45) - - plt.tight_layout() - plt.savefig(f"coverage_per_{prefix}.pdf", dpi=300) - plt.show() - - - else: - with PdfPages(f"coverage_per_{prefix}_batches.pdf") as pdf: - # split into batches N genes - for i in range(0, len(genes_list), batch_size): - batch_genes = genes_list[i:i+batch_size] - batch_coverage_pivot = coverage_pivot.loc[batch_genes] - batch_coverage_perc = coverage_perc.loc[batch_genes] - - fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - - covered_col = True if True in batch_coverage_pivot.columns else (1 if 1 in batch_coverage_pivot.columns else batch_coverage_pivot.columns[-1]) - - batch_coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff']) - axes[0].set_title(f'{prefix} : covered vs. non-covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') - axes[0].set_ylabel(f'Number of {prefix}') - axes[0].set_xlabel('Gene') - axes[0].legend(title='Covered', loc='upper right') - axes[0].tick_params(axis='x', rotation=45) - - batch_coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') - axes[1].set_title(f'{prefix} : percentage of covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') - axes[1].set_ylabel('Percentage (%)') - axes[1].set_xlabel('Gene') - axes[1].set_ylim(0, 100) - axes[1].tick_params(axis='x', rotation=45) - - plt.tight_layout() - pdf.savefig(dpi=300) - plt.show() - plt.close() - - -# Depth -# ===== # Generator to filter lines before they reach a dataframe def get_gff_to_generator(release: int = 111): """ @@ -262,7 +157,7 @@ def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> p # Transform to pandas for easier manipulation later on return df.to_pandas() -def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list[int]: +def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list: """ Parses the coordinates of an exon row from the GFF DataFrame and returns the exon ID and its coordinates in a list format. @@ -302,56 +197,7 @@ def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list[int]: return [chrom, start, end, strand] - -# Get Exon coord to protein pos -# ----------------------------- - -def get_dna_exon_pos(exon_range, strand): - - if strand == -1: - return np.arange(exon_range[1], exon_range[0] + 1)[::-1] - else: - return np.arange(exon_range[0], exon_range[1] + 1) - - -def get_exon_ix(i, exon_range, strand): - - len_exon = len(get_dna_exon_pos(exon_range, strand)) - - return np.repeat(i, len_exon) - - -def get_dna_map_to_protein(coord_df): - - strand = coord_df["Strand"].unique()[0] - - exons_range = coord_df[["Start", "End"]].values - exons = np.concatenate([get_dna_exon_pos(exon, strand) for exon in exons_range]) - exons_ix = np.concatenate([get_exon_ix(i, exon, strand) for i, exon in enumerate(exons_range)]) - prot_pos = np.arange(len(exons)) // 3 + 1 - - df = pd.DataFrame({"GENE" : coord_df["Gene"].unique()[0], - "CHROM" : f'chr{coord_df["Chr"].unique()[0]}', - "DNA_POS" : exons, - "PROT_POS" : prot_pos, - "REVERSE_STRAND" : strand, - "EXON_RANK" : exons_ix, - "TRANSCRIPT_ID" : coord_df["Ens_transcript_ID"].unique()[0]}) - - return df - - -def get_prot_coverage(dna_prot_df, gene, filter_masked_depth=True): - - gene_dna_prot_df = dna_prot_df[dna_prot_df["GENE"] == gene] - gene_dna_prot_df = gene_dna_prot_df.dropna(subset=["PROT_POS"])[["PROT_POS", "COVERED", "DEPTH"]].reset_index(drop=True) - gene_dna_prot_df = gene_dna_prot_df.groupby("PROT_POS").sum().reset_index() - gene_dna_prot_df.COVERED = (gene_dna_prot_df.COVERED > 0).astype(int) - - return gene_dna_prot_df - - -def get_exon_coord_wrapper_new(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: +def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ Wrapper function to retrieve exon and CDS coordinates for the genes and transcripts in the panel. @@ -415,11 +261,137 @@ def get_exon_coord_wrapper_new(gene_n_transcript: pd.DataFrame, gff_df: pd.DataF return final_coord_df, final_exons_df +# Coordinate parsing and DNA-to-protein mapping functions +# ---------------------------------------------------------- +def get_dna_exon_pos(exon_range: list, strand: int) -> list: + """ + Get the DNA positions of an exon given its range and strand. + Parameters + ------------ + exon_range : list + A list containing the start and end positions of the exon. + strand : int + The strand of the exon (1 for positive strand, -1 for negative strand). + + Returns + ------------ + list + A list of DNA positions corresponding to the exon, ordered according to the strand. + """ + + if strand == -1: + return np.arange(exon_range[1], exon_range[0] + 1)[::-1] + else: + return np.arange(exon_range[0], exon_range[1] + 1) + + +def get_exon_ix(i: int, exon_range: list, strand: int) -> list: + """ + Get the exon index for each DNA position in the exon, ordered according to the strand. + + Parameters + ------------ + i : int + The exon index (starting from 0). + exon_range : list + A list containing the start and end positions of the exon. + strand : int + The strand of the exon (1 for positive strand, -1 for negative strand). + + Returns + ------------ + list + A list of exon indices corresponding to each DNA position in the exon, + ordered according to the strand. + """ + len_exon = len(get_dna_exon_pos(exon_range, strand)) + + return np.repeat(i, len_exon) + -def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): +def get_dna_map_to_protein(coord_df: pd.DataFrame) -> pd.DataFrame: + """ + Get a mapping of DNA positions to protein positions for a given gene-transcript pair. + + Parameters + ------------ + coord_df : pandas.DataFrame + A DataFrame containing the coordinates of the CDS for a specific gene-transcript pair. + + Returns + ------------ + pandas.DataFrame + A DataFrame containing the mapping of DNA positions to protein positions. + """ + strand = coord_df["Strand"].unique()[0] + + exons_range = coord_df[["Start", "End"]].values + exons = np.concatenate([get_dna_exon_pos(exon, strand) for exon in exons_range]) + exons_ix = np.concatenate([get_exon_ix(i, exon, strand) for i, exon in enumerate(exons_range)]) + prot_pos = np.arange(len(exons)) // 3 + 1 + + df = pd.DataFrame({"GENE" : coord_df["Gene"].unique()[0], + "CHROM" : f'chr{coord_df["Chr"].unique()[0]}', + "DNA_POS" : exons, + "PROT_POS" : prot_pos, + "REVERSE_STRAND" : strand, + "EXON_RANK" : exons_ix, + "TRANSCRIPT_ID" : coord_df["Ens_transcript_ID"].unique()[0]}) + + return df + +def find_exon(x_coord: dict, exon_coord_df: pd.DataFrame) -> str | np.nan: + """ + Find the exon ID for a given DNA position. + Parameters + ------------ + x_coord : dict + A dictionary containing the DNA position, chromosome, and strand information. + exon_coord_df : pandas.DataFrame + A DataFrame containing the coordinates of all exons. + + Returns + ------------ + str or np.nan + The ID of the exon that contains the given DNA position, or np.nan if no match is found. + """ + + dna_pos, chrom, strand = x_coord["DNA_POS"], x_coord["CHROM"], x_coord["REVERSE_STRAND"] + + if strand == -1: + matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] <= dna_pos) & (dna_pos <= exon_coord_df['Start'])] + + else: + matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] >= dna_pos) & (dna_pos >= exon_coord_df['Start'])] + + return matches['ID'].values[0] if not matches.empty else np.nan + + +# Depth computation and coverage functions +# ---------------------------------------------------------- +def dna2prot_depth(gene_list: list, coord_df: pd.DataFrame, dna_sites: pd.DataFrame, depth_df: pd.DataFrame) -> pd.DataFrame: """ Get a DNA to protein mapping of all positions in the provided list of genes - Add as well coverage info & DNA to GENE annotation + Add coverage info & DNA to GENE annotation + + Parameters + ------------ + gene_list : list + A list of gene names for which to compute the DNA to protein mapping and coverage information. + coord_df : pandas.DataFrame + A DataFrame containing the coordinates of the CDS + for the genes in the panel. + dna_sites : pandas.DataFrame + A DataFrame containing the DNA positions that are part of the consensus panel, + along with their coverage information. + depth_df : pandas.DataFrame + A DataFrame containing the depth information for all positions in the genome. + + Returns + ------------ + pandas.DataFrame + A DataFrame containing the mapping of DNA positions to protein positions for the specified genes, + along with coverage and depth information for each position. """ # Map DNA to protein pos, get exons index to protein pos, etc @@ -429,10 +401,6 @@ def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): dna_prot_df_lst.append(get_dna_map_to_protein(gene_coord_df)) dna_prot_df = pd.concat(dna_prot_df_lst) - # dna_prot_df - # contains all the protein positions of the genes in the panel - # mapped to its corresponding genomic position - # Merge CDS position with availble sites (not masked) and depth info # and any other site that was included in the panel (splicing sites out of the CDS) dna_prot_df = dna_sites.merge(dna_prot_df, on=["GENE", "CHROM", "DNA_POS"], how="outer") @@ -449,14 +417,29 @@ def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): return dna_prot_df -def get_dna2prot_depth(gene_n_transcript_info, depth_file, consensus_file): +def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str) -> tuple[pd.DataFrame, pd.DataFrame]: """ - This function outputs: - dna_prot_df: + Function to get the DNA to protein mapping for all positions in the provided list of genes, + along with coverage and depth information for each position, and the definition of all exons of + the genes/transcripts in the panel including UTR regions. - exons_coord_df: - df containing the definition of all exons of the genes/transcripts in the panel - including UTR regions + Parameters + ---------- + gene_n_transcript_info : pandas.DataFrame + A DataFrame containing gene and transcript information for the panel. + depth_file : str + Path to the file containing depth information for all positions in the genome. + consensus_file : str + Path to the consensus panel file containing DNA positions and coverage information. + + Returns + ---------- + dna_prot_df : pandas.DataFrame + A DataFrame containing the mapping of DNA positions to protein positions for the specified genes, + along with coverage and depth information for each position. + exons_coord_df_final : pandas.DataFrame + A DataFrame containing the coordinates of all exons (including UTRs) for the genes and transcripts + in the panel, formatted for BED files. """ consensus_df = pd.read_table(consensus_file) @@ -482,26 +465,28 @@ def get_dna2prot_depth(gene_n_transcript_info, depth_file, consensus_file): return dna_prot_df, exons_coord_df_final +def get_exon_depth_saturation(gene_depth: pd.DataFrame, gene_mut: pd.DataFrame, dna: bool = False) -> pd.DataFrame: + """ + Compute the depth and saturation of each exon in a gene. -# Utils function to retrieve exon ID from coordinate - -def find_exon(x_coord, exon_coord_df): - - dna_pos, chrom, strand = x_coord["DNA_POS"], x_coord["CHROM"], x_coord["REVERSE_STRAND"] - - if strand == -1: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] <= dna_pos) & (dna_pos <= exon_coord_df['Start'])] - - else: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] >= dna_pos) & (dna_pos >= exon_coord_df['Start'])] - - return matches['ID'].values[0] if not matches.empty else np.nan - - - - -def get_exon_depth_saturation(gene_depth, gene_mut, dna=False): - + Parameters + ---------- + gene_depth : pandas.DataFrame + A DataFrame containing the depth and coverage information for each DNA position in the gene, along with the corresponding + protein position and exon rank. + gene_mut : pandas.DataFrame + A DataFrame containing the mutation information for the gene, including the DNA positions of the mutations (if dna=True) + or the protein positions of the mutations (if dna=False). + dna : bool, optional + A boolean indicating whether the input data is at the DNA level (True) or at the protein level (False). This affects how + saturation is calculated (default is False). + + Returns + ------- + exon_depth : pandas.DataFrame + A DataFrame containing the average depth, coverage, number of mutated positions, and saturation for each exon in the gene, + along with the starting protein position of each exon. + """ # Exon average depth gene_depth = gene_depth.copy() exon_depth = gene_depth.groupby("EXON_RANK").apply( @@ -533,62 +518,104 @@ def get_exon_depth_saturation(gene_depth, gene_mut, dna=False): return exon_depth -def get_exon_mid_prot_pos(exon_info, prot_len): +# Plots +# ---------------------------------------------------------- +def plot_coverage_per_gene(depths_df: pd.DataFrame) -> None: + """ + Wrapper function to plot coverage per gene for DNA, protein and exon levels. - lst_end_pos = [] - for i in range(len(exon_info)): - exon = exon_info.iloc[i] - start_pos = int(exon.START_PROT_POS) - end_pos = int(exon_info.iloc[i+1].START_PROT_POS) if i < len(exon_info) -1 else prot_len - lst_end_pos.append(end_pos) - exon_info["END_PROT_POS"] = lst_end_pos - exon_info["MID_PROT_POS"] = (exon_info["START_PROT_POS"] + exon_info["END_PROT_POS"]) / 2 + Parameters + ---------- + depths_df : pandas.DataFrame + DataFrame containing depth and coverage information for DNA, protein and exon positions. + In this case, it will be the "exons_depth" created in `get_dna2prot_depth` function. + """ + coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) + coverage_summary = coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - return exon_info + for prefix in ["DNA", "Protein", "Exon"]: + LOG.info(f"Plotting coverage for {prefix}...") + plot_single_coverage(coverage_summary, prefix) -def get_res_coverage(dna_prot_df): +def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size: int = 5): + """ + Plot coverage for a single prefix (DNA, protein or exon) and save the results in a PDF file. - res_depth = dna_prot_df.dropna(subset=["PROT_POS"])[["PROT_POS", "COVERED", "DEPTH"]].reset_index(drop=True) - res_depth = res_depth.groupby("PROT_POS").mean().reset_index() + Parameters + ---------- + coverage_summary : pandas.DataFrame + DataFrame containing the coverage summary for each gene and coverage status. + prefix : str + The prefix to plot (DNA, Protein or Exon). + batch_size : int, optional + The number of genes to include in each batch when plotting (default is 5). + """ + # Generate copy of the coverage summary to avoid modifying the original DataFrame + coverage_summary_cp = coverage_summary.copy() - return res_depth + coverage_pivot = coverage_summary_cp.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) + coverage_pivot = coverage_pivot.sort_values(1, ascending=False).reset_index() + coverage_pivot = coverage_pivot.set_index('GENE') + genes_list = coverage_pivot.index.tolist() + coverage_perc = coverage_pivot.div(coverage_pivot.sum(axis=1), axis=0) * 100 + coverage_pivot.to_csv(f"coverage_count_{prefix}.tsv", header=True, index=True, sep='\t') + coverage_perc.to_csv(f"coverage_perc_{prefix}.tsv", header=True, index=True, sep='\t') + if len(genes_list) < batch_size: -def add_consecutive_numbers(nums, max_n): + fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - result = [] - for i in range(len(nums)): - result.append(nums[i]) - # Check if the current number is the start of a consecutive sequence - if i < len(nums) - 1 and nums[i] + 1 != nums[i + 1]: - result.append(nums[i] + 1) + covered_col = True if True in coverage_pivot.columns else (1 if 1 in coverage_pivot.columns else coverage_pivot.columns[-1]) - # Add the last consecutive number after the final element - result.append(nums[-1] + 1) + coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff'], + ) - return result + axes[0].set_title(f'{prefix} : covered vs. non-covered') + axes[0].set_ylabel(f'Number of {prefix}') + axes[0].set_xlabel('Gene') + axes[0].legend(title='Covered', loc='upper right') + axes[0].tick_params(axis='x', rotation=45) + coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') + axes[1].set_title(f'{prefix} : percentage of covered') + axes[1].set_ylabel('Percentage (%)') + axes[1].set_xlabel('Gene') + axes[1].set_ylim(0, 100) + axes[1].tick_params(axis='x', rotation=45) -def where_plus(condition): - """ - Util function to extend the color of mpl filling to the next position. - """ + plt.tight_layout() + plt.savefig(f"coverage_per_{prefix}.pdf", dpi=300) + else: + with PdfPages(f"coverage_per_{prefix}_batches.pdf") as pdf: + # split into batches N genes + for i in range(0, len(genes_list), batch_size): + batch_genes = genes_list[i:i+batch_size] + batch_coverage_pivot = coverage_pivot.loc[batch_genes] + batch_coverage_perc = coverage_perc.loc[batch_genes] - ix = np.where(condition)[0] + fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - if len(ix) > 0: - ix = add_consecutive_numbers(ix, max_n=len(condition)) - if len(condition) in ix: - ix.remove(len(condition)) - boolean_vector = np.zeros(len(condition), dtype=bool) - boolean_vector[ix] = True + covered_col = True if True in batch_coverage_pivot.columns else (1 if 1 in batch_coverage_pivot.columns else batch_coverage_pivot.columns[-1]) - return pd.Series(boolean_vector) + batch_coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff']) + axes[0].set_title(f'{prefix} : covered vs. non-covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') + axes[0].set_ylabel(f'Number of {prefix}') + axes[0].set_xlabel('Gene') + axes[0].legend(title='Covered', loc='upper right') + axes[0].tick_params(axis='x', rotation=45) - else: - return condition + batch_coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') + axes[1].set_title(f'{prefix} : percentage of covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') + axes[1].set_ylabel('Percentage (%)') + axes[1].set_xlabel('Gene') + axes[1].set_ylim(0, 100) + axes[1].tick_params(axis='x', rotation=45) + plt.tight_layout() + pdf.savefig(dpi=300) + plt.show() + plt.close() @click.command() @click.option('--mutations-file', type=click.Path(exists=True), help='Mutations file') From e9eb510696c441a1ee027968046662c1d21726e4 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 11:16:46 +0100 Subject: [PATCH 4/9] refactor: separate parse_cds_coord into exon and cds parsers (#229) --- bin/panels_computedna2protein.py | 73 +++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 41793d2b..517ff2d6 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -100,7 +100,22 @@ def get_gff_to_generator(release: int = 111): url = f"https://ftp.ensembl.org/pub/release-{release}/gff3/homo_sapiens/Homo_sapiens.GRCh38.{release}.gff3.gz" # Open request - response = requests.get(url, stream=True) + try: + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + except requests.exceptions.Timeout: + error = RuntimeError(f"Request timed out while fetching Ensembl GFF3 (release {release}) from: {url}") + LOG.error(error) + raise error + except requests.exceptions.HTTPError as e: + error = RuntimeError(f"HTTP error {response.status_code} while fetching Ensembl GFF3 (release {release}) from: {url}") + LOG.error(error) + raise error + except requests.exceptions.RequestException as e: + error = RuntimeError(f"Failed to fetch Ensembl GFF3 (release {release}) from: {url}") + LOG.error(error) + raise error + # decompress the stream on the fly with gzip.GzipFile(fileobj=response.raw) as gz: for line in gz: @@ -157,7 +172,16 @@ def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> p # Transform to pandas for easier manipulation later on return df.to_pandas() -def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list: +def _parse_strand_coords(exon) -> tuple[str, int, int, int]: + """Extract chromosome, strand-ordered start/end, and strand integer from a GFF row.""" + strand = 1 if exon.strand == "+" else -1 + if strand == 1: + start, end = exon.start, exon.end + else: + start, end = exon.end, exon.start + return exon.chr, start, end, strand + +def parse_exon_coord(exon: pd.Series) -> tuple[str, list]: """ Parses the coordinates of an exon row from the GFF DataFrame and returns the exon ID and its coordinates in a list format. @@ -173,29 +197,35 @@ def parse_cds_coord(exon: pd.Series) -> tuple[str, list] | list: exons_coord : list A list containing the chromosome, start, end, and strand information of the exon. """ - - strand = 1 if exon.strand == "+" else -1 - - if strand == 1: - start = exon.start - end = exon.end - else: - start = exon.end - end = exon.start + chrom, start, end, strand = _parse_strand_coords(exon) # Extract exon_id using regex match = re.search(r"exon_id=([^;]+)", exon.attributes) - if match: - # Extract exon_id using regex - exon_id = match.group(1) - chrom = f'chr{exon.chr}' + + # Extract exon_id using regex + exon_id = match.group(1) if match else "" + chrom = f'chr{exon.chr}' - return exon_id, [chrom, start, end, strand] + return exon_id, [chrom, start, end, strand] - else: - chrom = exon.chr +def parse_cds_coord(exon: pd.Series) -> list: + """ + Parses the coordinates of a CDS row from the GFF DataFrame and returns the coordinates in a list format. + + Parameters + ---------- + exon : pandas.Series + A row from the GFF DataFrame corresponding to an exon feature. + + Returns + ------- + coord : list + A list containing the chromosome, start, end, and strand information of the CDS. + """ + chrom, start, end, strand = _parse_strand_coords(exon) - return [chrom, start, end, strand] + # Return the CDS ID and coordinates + return [chrom, start, end, strand] def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ @@ -236,7 +266,7 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame exons_lookup = exons_lookup.sort_values("start", ascending=is_positive) for i, exon in enumerate(exons_lookup.itertuples(index=False)): - exon_id, exons_coord = parse_cds_coord(exon) + exon_id, exons_coord = parse_exon_coord(exon) exons_coord_df_lst.append([f"{gene}--exon_{i+1}_{transcript}_{exon_id}"] + exons_coord) # --- Handle CDS --- @@ -444,6 +474,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co consensus_df = pd.read_table(consensus_file) depth_df = pd.read_table(depth_file) + gff_df = gff_to_filtered_df(gene_n_transcript_info) consensus_df = consensus_df.merge(depth_df[["CHROM", "POS", "CONTEXT"]], on = ["CHROM", "POS"], how = 'left') consensus_df = consensus_df.rename(columns={"POS" : "DNA_POS"}) @@ -452,7 +483,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co depth_df["DEPTH"] = depth_df.drop(columns=["CHROM", "POS", "CONTEXT"]).mean(1) depth_df = depth_df[["CHROM", "POS", "DEPTH"]].rename(columns = {"POS" : "DNA_POS"}) - coord_df, exons_coord_df = get_exon_coord_wrapper(gene_n_transcript_info) + coord_df, exons_coord_df = get_exon_coord_wrapper(gene_n_transcript_info, gff_df) gene_list = list(coord_df["Gene"].unique()) dna_prot_df = dna2prot_depth(gene_list, coord_df, consensus_df, depth_df) From bca9ab1ffb754360923776a599f1e1953e2aecfc Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 12:38:50 +0100 Subject: [PATCH 5/9] refactor: vectorized find_exon (#229) - Improved efficiency of find_exon by applying vectorized dataframe operations - Fix errors in docstrings and parameter definition - Add log information and remove info from matplotlib --- bin/panels_computedna2protein.py | 160 +++++++++++++------------------ 1 file changed, 67 insertions(+), 93 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 517ff2d6..11d4aaf8 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -45,6 +45,8 @@ logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s - %(message)s", level=logging.DEBUG, datefmt="%m/%d/%Y %I:%M:%S %p" ) +logging.getLogger('matplotlib').setLevel(logging.WARNING) # Avoid too much logging + LOG = logging.getLogger("DNA2protein") @@ -62,7 +64,7 @@ def get_transcript_gene_from_maf(path_maf, consensus_file): Path to the consensus panel file. Returns - ------- + ------------ gene_transcript_pairs : pandas.DataFrame DataFrame with gene-transcript pairs. """ @@ -169,6 +171,7 @@ def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> p ) ) + LOG.info(f"Filtered GFF data to {df.shape[0]} rows for the specified genes and release.") # Transform to pandas for easier manipulation later on return df.to_pandas() @@ -186,25 +189,32 @@ def parse_exon_coord(exon: pd.Series) -> tuple[str, list]: Parses the coordinates of an exon row from the GFF DataFrame and returns the exon ID and its coordinates in a list format. Parameters - ---------- + ------------ exon : pandas.Series A row from the GFF DataFrame corresponding to an exon feature. Returns - ------- + ------------ exon_id : str The ID of the exon extracted from the attributes column. exons_coord : list A list containing the chromosome, start, end, and strand information of the exon. """ chrom, start, end, strand = _parse_strand_coords(exon) + + # Add "chr" prefix to chromosome + chrom = f'chr{exon.chr}' # Extract exon_id using regex match = re.search(r"exon_id=([^;]+)", exon.attributes) # Extract exon_id using regex - exon_id = match.group(1) if match else "" - chrom = f'chr{exon.chr}' + if match: + exon_id = match.group(1) + + else: + exon_id = "" + LOG.warning(f"Could not extract exon_id from attributes: {exon.transcript_id} - {exon.attributes}") return exon_id, [chrom, start, end, strand] @@ -213,12 +223,12 @@ def parse_cds_coord(exon: pd.Series) -> list: Parses the coordinates of a CDS row from the GFF DataFrame and returns the coordinates in a list format. Parameters - ---------- + ------------ exon : pandas.Series A row from the GFF DataFrame corresponding to an exon feature. Returns - ------- + ------------ coord : list A list containing the chromosome, start, end, and strand information of the CDS. """ @@ -232,14 +242,14 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame Wrapper function to retrieve exon and CDS coordinates for the genes and transcripts in the panel. Parameters - ---------- + ------------ gene_n_transcript : pandas.DataFrame A DataFrame containing gene and transcript information for the panel. gff_df : pandas.DataFrame A DataFrame containing the GFF data filtered for exon and CDS features. Returns - ------- + ------------ final_coord_df : pandas.DataFrame A DataFrame containing the CDS coordinates for each gene-transcript pair, along with protein position mapping. final_exons_df : pandas.DataFrame @@ -249,7 +259,6 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame exons_coord_df_lst = [] for gene, transcript in gene_n_transcript.values: - print(f"Processing gene: {gene}") # Get all features for this transcript once to avoid double filtering transcript_data = gff_df[gff_df["transcript_id"] == transcript] @@ -289,11 +298,12 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame final_coord_df = pd.concat(coord_df_lst) if coord_df_lst else pd.DataFrame() final_exons_df = pd.DataFrame(exons_coord_df_lst, columns=["ID", "Chr", "Start", "End", "Strand"]) + LOG.info(f"Retrieved coordinates for {len(final_coord_df['Gene'].unique())} genes and {len(final_exons_df['ID'].unique())} exons.") return final_coord_df, final_exons_df # Coordinate parsing and DNA-to-protein mapping functions # ---------------------------------------------------------- -def get_dna_exon_pos(exon_range: list, strand: int) -> list: +def get_dna_exon_pos(exon_range: np.ndarray, strand: int) -> list: """ Get the DNA positions of an exon given its range and strand. Parameters @@ -315,7 +325,7 @@ def get_dna_exon_pos(exon_range: list, strand: int) -> list: return np.arange(exon_range[0], exon_range[1] + 1) -def get_exon_ix(i: int, exon_range: list, strand: int) -> list: +def get_exon_ix(i: int, exon_range: np.ndarray, strand: int): """ Get the exon index for each DNA position in the exon, ordered according to the strand. @@ -323,8 +333,8 @@ def get_exon_ix(i: int, exon_range: list, strand: int) -> list: ------------ i : int The exon index (starting from 0). - exon_range : list - A list containing the start and end positions of the exon. + exon_range : np.ndarray + A numpy array containing the start and end positions of the exon. strand : int The strand of the exon (1 for positive strand, -1 for negative strand). @@ -370,31 +380,48 @@ def get_dna_map_to_protein(coord_df: pd.DataFrame) -> pd.DataFrame: return df -def find_exon(x_coord: dict, exon_coord_df: pd.DataFrame) -> str | np.nan: +def find_exon(dna_prot_df: pd.DataFrame, exon_coord_df: pd.DataFrame): """ - Find the exon ID for a given DNA position. + For each DNA position in *dna_prot_df* the function finds the matching exon + ID from *exon_coord_df* by merging on chromosome and filtering by positional + interval. This avoids a Python-level loop and is orders of magnitude faster + than the ``apply`` approach for large panels. + Parameters ------------ - x_coord : dict - A dictionary containing the DNA position, chromosome, and strand information. - exon_coord_df : pandas.DataFrame - A DataFrame containing the coordinates of all exons. + dna_prot_df : pd.DataFrame + DataFrame with at least ``CHROM`` and ``DNA_POS`` columns. + exon_coord_df : pd.DataFrame + DataFrame with ``Chr``, ``Start``, ``End``, and ``ID`` columns. Returns ------------ - str or np.nan - The ID of the exon that contains the given DNA position, or np.nan if no match is found. + np.ndarray + Array of exon IDs aligned to *dna_prot_df*\'s row order; + ``np.nan`` where no matching exon is found. """ + # Normalise strand-swapped coordinates so lo <= hi in all cases + exon_df = exon_coord_df[["Chr", "Start", "End", "ID"]].copy() + exon_df["lo"] = exon_df[["Start", "End"]].min(axis=1) + exon_df["hi"] = exon_df[["Start", "End"]].max(axis=1) + + # Cross-merge on chromosome; each row of dna_prot_df is paired only with + # exons on the same chromosome, then filtered to those whose interval + # contains the DNA position. + merged = dna_prot_df[["CHROM", "DNA_POS"]].merge( + exon_df[["Chr", "lo", "hi", "ID"]], + left_on="CHROM", right_on="Chr", + how="left" + ) + in_exon = (merged["DNA_POS"] >= merged["lo"]) & (merged["DNA_POS"] <= merged["hi"]) + # Keep the first matching exon per position + matched = merged[in_exon].drop_duplicates(subset=["CHROM", "DNA_POS"]) - dna_pos, chrom, strand = x_coord["DNA_POS"], x_coord["CHROM"], x_coord["REVERSE_STRAND"] - - if strand == -1: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] <= dna_pos) & (dna_pos <= exon_coord_df['Start'])] - - else: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] >= dna_pos) & (dna_pos >= exon_coord_df['Start'])] - - return matches['ID'].values[0] if not matches.empty else np.nan + # Re-align to the original DataFrame row order + result = dna_prot_df[["CHROM", "DNA_POS"]].merge( + matched[["CHROM", "DNA_POS", "ID"]], on=["CHROM", "DNA_POS"], how="left" + ) + return result["ID"].values # Depth computation and coverage functions @@ -454,7 +481,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co the genes/transcripts in the panel including UTR regions. Parameters - ---------- + ------------ gene_n_transcript_info : pandas.DataFrame A DataFrame containing gene and transcript information for the panel. depth_file : str @@ -463,7 +490,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co Path to the consensus panel file containing DNA positions and coverage information. Returns - ---------- + ------------ dna_prot_df : pandas.DataFrame A DataFrame containing the mapping of DNA positions to protein positions for the specified genes, along with coverage and depth information for each position. @@ -479,15 +506,15 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co consensus_df = consensus_df.merge(depth_df[["CHROM", "POS", "CONTEXT"]], on = ["CHROM", "POS"], how = 'left') consensus_df = consensus_df.rename(columns={"POS" : "DNA_POS"}) - if "DEPTH" not in depth_df: + if "DEPTH" not in depth_df.columns: depth_df["DEPTH"] = depth_df.drop(columns=["CHROM", "POS", "CONTEXT"]).mean(1) - depth_df = depth_df[["CHROM", "POS", "DEPTH"]].rename(columns = {"POS" : "DNA_POS"}) + depth_df = depth_df[["CHROM", "POS", "DEPTH"]] coord_df, exons_coord_df = get_exon_coord_wrapper(gene_n_transcript_info, gff_df) gene_list = list(coord_df["Gene"].unique()) - dna_prot_df = dna2prot_depth(gene_list, coord_df, consensus_df, depth_df) + dna_prot_df = dna2prot_depth(gene_list=gene_list, coord_df=coord_df, dna_sites=consensus_df, depth_df=depth_df) - dna_prot_df["EXON_ID"] = dna_prot_df.apply(lambda x: find_exon(x, exons_coord_df), axis=1) + dna_prot_df["EXON_ID"] = find_exon(dna_prot_df, exons_coord_df) # fix coordinates order in exons_coord_df for BED exons_coord_df_final = exons_coord_df.copy() @@ -496,58 +523,6 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co return dna_prot_df, exons_coord_df_final -def get_exon_depth_saturation(gene_depth: pd.DataFrame, gene_mut: pd.DataFrame, dna: bool = False) -> pd.DataFrame: - """ - Compute the depth and saturation of each exon in a gene. - - Parameters - ---------- - gene_depth : pandas.DataFrame - A DataFrame containing the depth and coverage information for each DNA position in the gene, along with the corresponding - protein position and exon rank. - gene_mut : pandas.DataFrame - A DataFrame containing the mutation information for the gene, including the DNA positions of the mutations (if dna=True) - or the protein positions of the mutations (if dna=False). - dna : bool, optional - A boolean indicating whether the input data is at the DNA level (True) or at the protein level (False). This affects how - saturation is calculated (default is False). - - Returns - ------- - exon_depth : pandas.DataFrame - A DataFrame containing the average depth, coverage, number of mutated positions, and saturation for each exon in the gene, - along with the starting protein position of each exon. - """ - # Exon average depth - gene_depth = gene_depth.copy() - exon_depth = gene_depth.groupby("EXON_RANK").apply( - lambda x: 0 if sum(x.COVERED) == 0 else sum(x.DEPTH) / sum(x.COVERED)).reset_index().rename(columns = {0 : "DEPTH"}) - exon_depth["START_PROT_POS"] = gene_depth.groupby("EXON_RANK").apply(lambda x: x.PROT_POS.min()) - - # Exon saturation - if dna: - exon_depth["COVERED"] = gene_depth.groupby("EXON_RANK").apply(lambda x: sum(x.COVERED)).values - gene_depth["MUTATED"] = gene_depth.DNA_POS.isin(gene_mut.DNA_POS.unique()).astype(int) - exon_depth["MUTATED"] = gene_depth.groupby("EXON_RANK").apply(lambda x: sum(x.MUTATED)).values - exon_depth["SATURATION"] = exon_depth["MUTATED"] / exon_depth["COVERED"] - else: - - # If an amino acids from the same codon are in two exons, consider the position only in the second one - # (but look at booth to know if the residue is in the panel and if it is mutated) - exon_depth_prot = gene_depth.groupby("PROT_POS").apply(lambda x: (x.COVERED.max(), x.EXON_RANK.max())).reset_index() - exon_depth_prot[["COVERED", "EXON_RANK"]] = pd.DataFrame(exon_depth_prot[0].tolist(), index=exon_depth_prot.index) - exon_depth_prot = exon_depth_prot.drop(columns=[0]) - - exon_depth["COVERED"] = exon_depth_prot.groupby("EXON_RANK").apply(lambda x: sum(x.COVERED)).values - exon_depth_prot["MUTATED"] = exon_depth_prot["PROT_POS"].isin(gene_mut["Pos"].unique()).astype(int) - exon_depth["MUTATED"] = exon_depth_prot.groupby("EXON_RANK").apply(lambda x: sum(x.MUTATED)).values - exon_depth["SATURATION"] = exon_depth["MUTATED"] / exon_depth["COVERED"] - - # check_mutated_masked = exon_depth_prot[(exon_depth_prot["MUTATED"] == 1) & (exon_depth_prot["COVERED"] == 0)] - # check_mutated_masked["GENE"] = gene_depth.GENE.unique()[0] - - return exon_depth - # Plots # ---------------------------------------------------------- @@ -645,7 +620,6 @@ def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size plt.tight_layout() pdf.savefig(dpi=300) - plt.show() plt.close() @click.command() @@ -653,13 +627,11 @@ def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size @click.option('--consensus-file', type=click.Path(exists=True), help='Input consensus panel file') @click.option('--depths-file', type=click.Path(exists=True), help='Input depths of all samples file') def main(mutations_file, consensus_file, depths_file): - click.echo("Starting to run plot saturation results...") - # Count each mutation only ones if it appears in multiple reads gene_n_transcript = get_transcript_gene_from_maf(mutations_file, consensus_file) exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file) - print("Exons coordinates and depth computed") + LOG.info("Exons coordinates and depth computed!") exons_depth.to_csv("depths_per_position_exon_gene.tsv", header = True, index = False, sep = '\t') exons_coordinates_bed_like = exons_coord_id[['Chr', 'Start', 'End', 'ID']] @@ -667,6 +639,8 @@ def main(mutations_file, consensus_file, depths_file): plot_coverage_per_gene(exons_depth) + LOG.info("All done!") + if __name__ == '__main__': main() From 385f3a9a615e24a574fcee18b2cd81dd8b6d8e00 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 14:05:51 +0100 Subject: [PATCH 6/9] refactor: add option to change ensembl release (#229) - Added click option to script - Added definition to nextflow process --- bin/panels_computedna2protein.py | 33 +++++++------------------------ modules/local/dna2protein/main.nf | 4 +++- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 11d4aaf8..58655d03 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -129,12 +129,11 @@ def get_gff_to_generator(release: int = 111): continue # Only "yield" lines that are exon or CDS - # This drastically reduces the data sent to the DataFrame parts = line_str.split('\t') if parts[2] in ["exon", "CDS"]: yield line_str -def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int = 111) -> pd.DataFrame: +def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int) -> pd.DataFrame: """ Transforms the yields from get_gff_to_generator into a filtered DataFrame. The reading and filtering is done with polars to improve efficiency. @@ -218,25 +217,6 @@ def parse_exon_coord(exon: pd.Series) -> tuple[str, list]: return exon_id, [chrom, start, end, strand] -def parse_cds_coord(exon: pd.Series) -> list: - """ - Parses the coordinates of a CDS row from the GFF DataFrame and returns the coordinates in a list format. - - Parameters - ------------ - exon : pandas.Series - A row from the GFF DataFrame corresponding to an exon feature. - - Returns - ------------ - coord : list - A list containing the chromosome, start, end, and strand information of the CDS. - """ - chrom, start, end, strand = _parse_strand_coords(exon) - - # Return the CDS ID and coordinates - return [chrom, start, end, strand] - def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ Wrapper function to retrieve exon and CDS coordinates for the genes and transcripts in the panel. @@ -284,7 +264,7 @@ def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame coord_lst = [] for i, exon in enumerate(cds_lookup.itertuples(index=False)): - exons_coord = parse_cds_coord(exon) + exons_coord = list(_parse_strand_coords(exon)) # Include the biological rank (i) coord_lst.append(exons_coord + [i]) @@ -474,7 +454,7 @@ def dna2prot_depth(gene_list: list, coord_df: pd.DataFrame, dna_sites: pd.DataFr return dna_prot_df -def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str) -> tuple[pd.DataFrame, pd.DataFrame]: +def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str, release: int) -> tuple[pd.DataFrame, pd.DataFrame]: """ Function to get the DNA to protein mapping for all positions in the provided list of genes, along with coverage and depth information for each position, and the definition of all exons of @@ -501,7 +481,7 @@ def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, co consensus_df = pd.read_table(consensus_file) depth_df = pd.read_table(depth_file) - gff_df = gff_to_filtered_df(gene_n_transcript_info) + gff_df = gff_to_filtered_df(gene_n_transcript_info, release=release) consensus_df = consensus_df.merge(depth_df[["CHROM", "POS", "CONTEXT"]], on = ["CHROM", "POS"], how = 'left') consensus_df = consensus_df.rename(columns={"POS" : "DNA_POS"}) @@ -626,11 +606,12 @@ def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size @click.option('--mutations-file', type=click.Path(exists=True), help='Mutations file') @click.option('--consensus-file', type=click.Path(exists=True), help='Input consensus panel file') @click.option('--depths-file', type=click.Path(exists=True), help='Input depths of all samples file') -def main(mutations_file, consensus_file, depths_file): +@click.option('--ensembl-release', type=int, default=111, help='Ensembl release number to use for GFF file retrieval (default is 111)') +def main(mutations_file, consensus_file, depths_file, ensembl_release): # Count each mutation only ones if it appears in multiple reads gene_n_transcript = get_transcript_gene_from_maf(mutations_file, consensus_file) - exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file) + exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file, ensembl_release) LOG.info("Exons coordinates and depth computed!") exons_depth.to_csv("depths_per_position_exon_gene.tsv", header = True, index = False, sep = '\t') diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index 07d41544..b4ce0baa 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -20,12 +20,14 @@ process DNA_2_PROTEIN_MAPPING { script: + def ensembl_release = task.ext.vep_cache_version ? "--ensembl-release \"${task.ext.vep_cache_version}\"" : "" """ cut -f 1,2,6 ${panel_file} | uniq > ${meta2.id}.panel.unique.tsv panels_computedna2protein.py \\ --mutations-file ${mutations_file} \\ --consensus-file ${meta2.id}.panel.unique.tsv \\ - --depths-file ${all_samples_depths} + --depths-file ${all_samples_depths} \\ + ${ensembl_release} cat <<-END_VERSIONS > versions.yml "${task.process}": From 821537ac5b1e390ead93fe883b3828641a7868a9 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 14:43:35 +0100 Subject: [PATCH 7/9] fix: forgot to add the configuration (#229) --- conf/modules.config | 4 ++++ modules/local/dna2protein/main.nf | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 327648f0..2b8a0cef 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -72,6 +72,10 @@ process { ] } + withName: DNA2PROTEINMAPPING { + ext.ensembl_release = params.vep_cache_version ?: 111 + } + withName: QUERYDEPTHS { ext.prefix = { ".subset_depths" } ext.args = '' diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index b4ce0baa..5d48e45b 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -20,14 +20,14 @@ process DNA_2_PROTEIN_MAPPING { script: - def ensembl_release = task.ext.vep_cache_version ? "--ensembl-release \"${task.ext.vep_cache_version}\"" : "" + def ensembl_release = task.ext.ensembl_release """ cut -f 1,2,6 ${panel_file} | uniq > ${meta2.id}.panel.unique.tsv panels_computedna2protein.py \\ --mutations-file ${mutations_file} \\ --consensus-file ${meta2.id}.panel.unique.tsv \\ --depths-file ${all_samples_depths} \\ - ${ensembl_release} + --ensembl-release ${ensembl_release} cat <<-END_VERSIONS > versions.yml "${task.process}": From 6efbf68e122e31339b2c9ae6a7938e8956517ba3 Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Tue, 24 Feb 2026 14:44:41 +0100 Subject: [PATCH 8/9] fix: indent problem (#229) --- modules/local/dna2protein/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index 5d48e45b..f89dcf4b 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -20,7 +20,7 @@ process DNA_2_PROTEIN_MAPPING { script: - def ensembl_release = task.ext.ensembl_release + def ensembl_release = task.ext.ensembl_release """ cut -f 1,2,6 ${panel_file} | uniq > ${meta2.id}.panel.unique.tsv panels_computedna2protein.py \\ From 5c2460ff5f0c5fe4e87a8575bbef6c229b0cb80f Mon Sep 17 00:00:00 2001 From: "Marta H." Date: Wed, 25 Feb 2026 16:23:37 +0100 Subject: [PATCH 9/9] refactor: add species and genome as parameters (#229) --- bin/panels_computedna2protein.py | 12 +++++++----- conf/modules.config | 2 ++ modules/local/dna2protein/main.nf | 6 +++++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 58655d03..82234d87 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -85,7 +85,7 @@ def get_transcript_gene_from_maf(path_maf, consensus_file): return gene_transcript_pairs # Generator to filter lines before they reach a dataframe -def get_gff_to_generator(release: int = 111): +def get_gff_to_generator(release: int = 111, species: str = "homo_sapiens", genome: str = "GRCh38"): """ Get GFF file from ensembl FTP and filter it on the fly to keep only exon and CDS lines. @@ -99,7 +99,7 @@ def get_gff_to_generator(release: int = 111): generator A generator that yields lines from the GFF file that correspond to exon and CDS features. """ - url = f"https://ftp.ensembl.org/pub/release-{release}/gff3/homo_sapiens/Homo_sapiens.GRCh38.{release}.gff3.gz" + url = f"https://ftp.ensembl.org/pub/release-{release}/gff3/{species}/{species.capitalize()}.{genome}.{release}.gff3.gz" # Open request try: @@ -454,7 +454,7 @@ def dna2prot_depth(gene_list: list, coord_df: pd.DataFrame, dna_sites: pd.DataFr return dna_prot_df -def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str, release: int) -> tuple[pd.DataFrame, pd.DataFrame]: +def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str, release: int, species: str, genome: str) -> tuple[pd.DataFrame, pd.DataFrame]: """ Function to get the DNA to protein mapping for all positions in the provided list of genes, along with coverage and depth information for each position, and the definition of all exons of @@ -606,12 +606,14 @@ def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size @click.option('--mutations-file', type=click.Path(exists=True), help='Mutations file') @click.option('--consensus-file', type=click.Path(exists=True), help='Input consensus panel file') @click.option('--depths-file', type=click.Path(exists=True), help='Input depths of all samples file') +@click.option('--ensembl-species', type=str, default="homo_sapiens", help='Ensembl species name to use for GFF file retrieval (default)') +@click.option('--ensembl-genome', type=str, default="GRCh38", help='Ensembl genome name to use for GFF file retrieval (default)') @click.option('--ensembl-release', type=int, default=111, help='Ensembl release number to use for GFF file retrieval (default is 111)') -def main(mutations_file, consensus_file, depths_file, ensembl_release): +def main(mutations_file, consensus_file, depths_file, ensembl_species, ensembl_genome, ensembl_release): # Count each mutation only ones if it appears in multiple reads gene_n_transcript = get_transcript_gene_from_maf(mutations_file, consensus_file) - exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file, ensembl_release) + exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file, ensembl_release, ensembl_species, ensembl_genome) LOG.info("Exons coordinates and depth computed!") exons_depth.to_csv("depths_per_position_exon_gene.tsv", header = True, index = False, sep = '\t') diff --git a/conf/modules.config b/conf/modules.config index 2b8a0cef..d48baa50 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -74,6 +74,8 @@ process { withName: DNA2PROTEINMAPPING { ext.ensembl_release = params.vep_cache_version ?: 111 + ext.ensembl_species = params.vep_species ?: 'homo_sapiens' + ext.ensembl_genome = params.vep_genome ?: 'GRCh38' } withName: QUERYDEPTHS { diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index f89dcf4b..d5fa8caa 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -21,13 +21,17 @@ process DNA_2_PROTEIN_MAPPING { script: def ensembl_release = task.ext.ensembl_release + def ensembl_species = task.ext.ensembl_species + def ensembl_genome = task.ext.ensembl_genome """ cut -f 1,2,6 ${panel_file} | uniq > ${meta2.id}.panel.unique.tsv panels_computedna2protein.py \\ --mutations-file ${mutations_file} \\ --consensus-file ${meta2.id}.panel.unique.tsv \\ --depths-file ${all_samples_depths} \\ - --ensembl-release ${ensembl_release} + --ensembl-release ${ensembl_release} \\ + --ensembl-species ${ensembl_species} \\ + --ensembl-genome ${ensembl_genome} cat <<-END_VERSIONS > versions.yml "${task.process}":