diff --git a/bin/panel_postprocessing_annotation.py b/bin/panel_postprocessing_annotation.py index e13e057b..9cf3f173 100755 --- a/bin/panel_postprocessing_annotation.py +++ b/bin/panel_postprocessing_annotation.py @@ -4,15 +4,13 @@ import pandas as pd import numpy as np from itertools import product -from bgreference import hg38, hg19, mm10, mm39 +from bgreference import hg38, mm39 from utils_context import transform_context from utils_impacts import * from read_utils import custom_na_values assembly_name2function = { "hg38": hg38, - "hg19": hg19, - "mm10": mm10, "mm39": mm39 } @@ -199,7 +197,7 @@ def vep2summarizedannotation_panel(VEP_output_file, all_possible_sites_annotated @click.command() @click.option('--vep_output_file', type=click.Path(exists=True), required=True, help='Path to the VEP output file.') -@click.option('--assembly', type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), default='hg38', help='Genome assembly.') +@click.option('--assembly', type=click.Choice(['hg38', 'mm39']), default='hg38', help='Genome assembly.') @click.option('--output_file', type=click.Path(), required=True, help='Path to the output annotated file.') @click.option('--only_canonical', is_flag=True, default=False, help='Use only canonical transcripts.') def main(vep_output_file, assembly, output_file, only_canonical): diff --git a/bin/panels_computedna2protein.py b/bin/panels_computedna2protein.py index 4a0437c5..82234d87 100755 --- a/bin/panels_computedna2protein.py +++ b/bin/panels_computedna2protein.py @@ -1,23 +1,77 @@ #!/usr/bin/env python -import time +""" +Panels Compute DNA to Protein - MAF processing to protein positions and +coverage computation. + +This script processes the MAF file to retrieve the transcript-gene pairs, +then retrieves exon, and CDS coordinates and maps DNA positions to protein positions. +It also computes the coverage of each position and generates plots of coverage +per gene. + +Arguments: +---------------------- +--mutations-file : str + Path to the MAF file containing mutations. +--consensus-file : str + Path to the consensus panel file. +--depths-file : str + Path to the file containing depth information for all samples. + +Authors +---------------------- +Author : Stefano Pellegrini (@St3451) +Email : stefano.pellegrini@irbbarcelona.org + +Contributors +---------------------- +- Ferriol Calvet - @FerriolCalvet (ferriol.calvet@irbbarcelona.org) +- Marta Huertas - @m-huertasp (marta.huertas@irbbarcelona.org) +""" -# Third-party imports +import click +import logging import requests import numpy as np import pandas as pd -import click +import polars as pl +import io +import gzip +import re import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages -##### -# Define functions -##### +# Logging +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s - %(message)s", level=logging.DEBUG, datefmt="%m/%d/%Y %I:%M:%S %p" +) +logging.getLogger('matplotlib').setLevel(logging.WARNING) # Avoid too much logging + +LOG = logging.getLogger("DNA2protein") + + +# Data parsing and processing functions +# ---------------------------------------------------------- def get_transcript_gene_from_maf(path_maf, consensus_file): + """ + Process MAF file to retrieve gene-transcript pairs for the genes in the consensus panel. + + Parameters + ---------- + path_maf : str + Path to the MAF file containing mutations. + consensus_file : str + Path to the consensus panel file. + + Returns + ------------ + gene_transcript_pairs : pandas.DataFrame + DataFrame with gene-transcript pairs. + """ maf_df = pd.read_table(path_maf) genes_in_consensus = pd.read_table(consensus_file)["GENE"].unique() - # Final filter + # Filter MAF (keep only positions with canonical protein position) maf_df_f = maf_df[ (maf_df["TYPE"].isin(["SNV", "INSERTION", "DELETION"])) & (maf_df["canonical_SYMBOL"].isin(genes_in_consensus)) @@ -27,163 +81,223 @@ def get_transcript_gene_from_maf(path_maf, consensus_file): gene_transcript_pairs = maf_df_f[["canonical_SYMBOL", "canonical_Feature"]].drop_duplicates().reset_index(drop=True) gene_transcript_pairs.columns = ["Gene", "Ens_transcript_ID"] + LOG.info(f"Retrieved {len(gene_transcript_pairs)} gene-transcript pairs from MAF file.") return gene_transcript_pairs +# Generator to filter lines before they reach a dataframe +def get_gff_to_generator(release: int = 111, species: str = "homo_sapiens", genome: str = "GRCh38"): + """ + Get GFF file from ensembl FTP and filter it on the fly to keep only exon and CDS lines. -def plot_coverage_per_gene(depths_df): - - dna_coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) - dna_coverage_summary = dna_coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - plot_single_coverage(dna_coverage_summary, "DNA") - - prot_coverage = depths_df.drop_duplicates(subset=['GENE', 'PROT_POS', 'COVERED']) - prot_coverage_summary = prot_coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - plot_single_coverage(prot_coverage_summary, "Protein") - - exon_coverage = depths_df.drop_duplicates(subset=['GENE', 'EXON_RANK', 'COVERED']) - exon_coverage_summary = exon_coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') - plot_single_coverage(exon_coverage_summary, "Exon") - - -def plot_single_coverage(coverage_summary, prefix, batch_size = 5): - - coverage_pivot = coverage_summary.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) - coverage_pivot = coverage_pivot.sort_values(1, ascending=False).reset_index() - coverage_pivot = coverage_pivot.set_index('GENE') - genes_list = coverage_pivot.index.tolist() - coverage_perc = coverage_pivot.div(coverage_pivot.sum(axis=1), axis=0) * 100 - coverage_pivot.to_csv(f"coverage_count_{prefix}.tsv", header=True, index=True, sep='\t') - coverage_perc.to_csv(f"coverage_perc_{prefix}.tsv", header=True, index=True, sep='\t') - - if len(genes_list) < batch_size: - - fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - - covered_col = True if True in coverage_pivot.columns else (1 if 1 in coverage_pivot.columns else coverage_pivot.columns[-1]) - - coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff'], - ) - - axes[0].set_title(f'{prefix} : covered vs. non-covered') - axes[0].set_ylabel(f'Number of {prefix}') - axes[0].set_xlabel('Gene') - axes[0].legend(title='Covered', loc='upper right') - axes[0].tick_params(axis='x', rotation=45) - - coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') - axes[1].set_title(f'{prefix} : percentage of covered') - axes[1].set_ylabel('Percentage (%)') - axes[1].set_xlabel('Gene') - axes[1].set_ylim(0, 100) - axes[1].tick_params(axis='x', rotation=45) - - plt.tight_layout() - plt.savefig(f"coverage_per_{prefix}.pdf", dpi=300) - plt.show() - + Parameters + ------------ + release : int + The release number of the Ensembl GFF file to retrieve (default is 111). + Returns + ------------ + generator + A generator that yields lines from the GFF file that correspond to exon and CDS features. + """ + url = f"https://ftp.ensembl.org/pub/release-{release}/gff3/{species}/{species.capitalize()}.{genome}.{release}.gff3.gz" + + # Open request + try: + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + except requests.exceptions.Timeout: + error = RuntimeError(f"Request timed out while fetching Ensembl GFF3 (release {release}) from: {url}") + LOG.error(error) + raise error + except requests.exceptions.HTTPError as e: + error = RuntimeError(f"HTTP error {response.status_code} while fetching Ensembl GFF3 (release {release}) from: {url}") + LOG.error(error) + raise error + except requests.exceptions.RequestException as e: + error = RuntimeError(f"Failed to fetch Ensembl GFF3 (release {release}) from: {url}") + LOG.error(error) + raise error + + # decompress the stream on the fly + with gzip.GzipFile(fileobj=response.raw) as gz: + for line in gz: + # Decode bytes to string + line_str = line.decode('utf-8') + + # Skip comments + if line_str.startswith('#'): + continue + + # Only "yield" lines that are exon or CDS + parts = line_str.split('\t') + if parts[2] in ["exon", "CDS"]: + yield line_str + +def gff_to_filtered_df(gene_n_transcript: pd.DataFrame, release: int) -> pd.DataFrame: + """ + Transforms the yields from get_gff_to_generator into a filtered DataFrame. The reading and filtering is done with polars + to improve efficiency. + + Parameters + ------------ + gene_n_transcript : pd.DataFrame + A DataFrame containing gene and transcript information. + release : int + The Ensembl release number to use for GFF file retrieval (default is 111). + + Returns + ------------ + pd.DataFrame + A filtered DataFrame of GFF lines for the specified genes and release. + """ + # Join the generator into a single buffer for Polars to read + filtered_buffer = io.StringIO("".join(get_gff_to_generator(release=release))) + + # Read generator with polars, generate transcript_id column and filter by the genes in the panel + df = ( + pl.read_csv( + filtered_buffer, + separator="\t", + has_header=False, + new_columns=["chr", "source", "feature", "start", "end", "score", "strand", "phase", "attributes"], + schema_overrides={"start": pl.Int64, "end": pl.Int64, "chr": pl.Utf8, "feature": pl.Utf8, "attributes": pl.Utf8} + ) + .with_columns([ + pl.col("attributes").str.extract(r"transcript:(ENST\d+)", 1).alias("transcript_id") + ]) + .filter( + [pl.col("transcript_id").is_in(gene_n_transcript["Ens_transcript_ID"].to_list())] + ) + ) + + LOG.info(f"Filtered GFF data to {df.shape[0]} rows for the specified genes and release.") + # Transform to pandas for easier manipulation later on + return df.to_pandas() + +def _parse_strand_coords(exon) -> tuple[str, int, int, int]: + """Extract chromosome, strand-ordered start/end, and strand integer from a GFF row.""" + strand = 1 if exon.strand == "+" else -1 + if strand == 1: + start, end = exon.start, exon.end else: - with PdfPages(f"coverage_per_{prefix}_batches.pdf") as pdf: - # split into batches N genes - for i in range(0, len(genes_list), batch_size): - batch_genes = genes_list[i:i+batch_size] - batch_coverage_pivot = coverage_pivot.loc[batch_genes] - batch_coverage_perc = coverage_perc.loc[batch_genes] - - fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - - covered_col = True if True in batch_coverage_pivot.columns else (1 if 1 in batch_coverage_pivot.columns else batch_coverage_pivot.columns[-1]) - - batch_coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff']) - axes[0].set_title(f'{prefix} : covered vs. non-covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') - axes[0].set_ylabel(f'Number of {prefix}') - axes[0].set_xlabel('Gene') - axes[0].legend(title='Covered', loc='upper right') - axes[0].tick_params(axis='x', rotation=45) - - batch_coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') - axes[1].set_title(f'{prefix} : percentage of covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') - axes[1].set_ylabel('Percentage (%)') - axes[1].set_xlabel('Gene') - axes[1].set_ylim(0, 100) - axes[1].tick_params(axis='x', rotation=45) - - plt.tight_layout() - pdf.savefig(dpi=300) - plt.show() - plt.close() - + start, end = exon.end, exon.start + return exon.chr, start, end, strand -# Depth -# ===== - - -def get_tr_lookup(transcript_id, max_iter = 20): +def parse_exon_coord(exon: pd.Series) -> tuple[str, list]: """ - Get exons coord + Parses the coordinates of an exon row from the GFF DataFrame and returns the exon ID and its coordinates in a list format. + + Parameters + ------------ + exon : pandas.Series + A row from the GFF DataFrame corresponding to an exon feature. + + Returns + ------------ + exon_id : str + The ID of the exon extracted from the attributes column. + exons_coord : list + A list containing the chromosome, start, end, and strand information of the exon. """ + chrom, start, end, strand = _parse_strand_coords(exon) + + # Add "chr" prefix to chromosome + chrom = f'chr{exon.chr}' + + # Extract exon_id using regex + match = re.search(r"exon_id=([^;]+)", exon.attributes) + + # Extract exon_id using regex + if match: + exon_id = match.group(1) + + else: + exon_id = "" + LOG.warning(f"Could not extract exon_id from attributes: {exon.transcript_id} - {exon.attributes}") - server = "https://rest.ensembl.org" - ext = f"/lookup/id/{transcript_id}?expand=1" - - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - - iter_count = 0 - while not r.ok and iter_count < max_iter: - print("Retrying lookup... (attempt {}/{})".format(iter_count+1, max_iter)) - time.sleep(5) - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - iter_count += 1 - - return r.json() - + return exon_id, [chrom, start, end, strand] -def get_cds_coord(transcript_id, len_cds_with_utr, max_iter = 20): +def get_exon_coord_wrapper(gene_n_transcript: pd.DataFrame, gff_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ - Get CDS coordinates + Wrapper function to retrieve exon and CDS coordinates for the genes and transcripts in the panel. + + Parameters + ------------ + gene_n_transcript : pandas.DataFrame + A DataFrame containing gene and transcript information for the panel. + gff_df : pandas.DataFrame + A DataFrame containing the GFF data filtered for exon and CDS features. + + Returns + ------------ + final_coord_df : pandas.DataFrame + A DataFrame containing the CDS coordinates for each gene-transcript pair, along with protein position mapping. + final_exons_df : pandas.DataFrame + A DataFrame containing the exon coordinates (including UTRs) for each gene-transcript pair. """ - server = "https://rest.ensembl.org" - ext = f"/map/cds/{transcript_id}/1..{len_cds_with_utr}?" - - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - - iter_count = 0 - while not r.ok and iter_count < max_iter: - print("Retrying CDS map... (attempt {}/{})".format(iter_count+1, max_iter)) - time.sleep(5) - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - iter_count += 1 - - return r.json()["mappings"] - - -def parse_cds_coord(exon): - - strand = exon["strand"] - - if strand == 1: - start = exon["start"] - end = exon["end"] - else: - start = exon["end"] - end = exon["start"] - - if "id" in exon: - exon_id = exon["id"] - chrom = f'chr{exon["seq_region_name"]}' + coord_df_lst = [] + exons_coord_df_lst = [] - return exon_id, [chrom, start, end, strand] + for gene, transcript in gene_n_transcript.values: + + # Get all features for this transcript once to avoid double filtering + transcript_data = gff_df[gff_df["transcript_id"] == transcript] + if transcript_data.empty: + continue - else: - chrom = exon["seq_region_name"] + # Determine strand from the first available entry + strand = transcript_data["strand"].iloc[0] + is_positive = (strand == "+") - return [chrom, start, end, strand] + # --- Handle exons (with UTRs) --- + exons_lookup = transcript_data[transcript_data["feature"] == "exon"] + # Sort based on strand: '+' is low-to-high, '-' is high-to-low + exons_lookup = exons_lookup.sort_values("start", ascending=is_positive) + for i, exon in enumerate(exons_lookup.itertuples(index=False)): + exon_id, exons_coord = parse_exon_coord(exon) + exons_coord_df_lst.append([f"{gene}--exon_{i+1}_{transcript}_{exon_id}"] + exons_coord) -# Get Exon coord to protein pos -# ----------------------------- + # --- Handle CDS --- + cds_lookup = transcript_data[transcript_data["feature"] == "CDS"] + cds_lookup = cds_lookup.sort_values("start", ascending=is_positive) -def get_dna_exon_pos(exon_range, strand): + coord_lst = [] + for i, exon in enumerate(cds_lookup.itertuples(index=False)): + exons_coord = list(_parse_strand_coords(exon)) + # Include the biological rank (i) + coord_lst.append(exons_coord + [i]) + + if coord_lst: + gene_coord_df = pd.DataFrame(coord_lst, columns=["Chr", "Start", "End", "Strand", "Exon_rank"]) + gene_coord_df["Gene"] = gene + gene_coord_df["Ens_transcript_ID"] = transcript + coord_df_lst.append(gene_coord_df) + + # Combine results + final_coord_df = pd.concat(coord_df_lst) if coord_df_lst else pd.DataFrame() + final_exons_df = pd.DataFrame(exons_coord_df_lst, columns=["ID", "Chr", "Start", "End", "Strand"]) + + LOG.info(f"Retrieved coordinates for {len(final_coord_df['Gene'].unique())} genes and {len(final_exons_df['ID'].unique())} exons.") + return final_coord_df, final_exons_df + +# Coordinate parsing and DNA-to-protein mapping functions +# ---------------------------------------------------------- +def get_dna_exon_pos(exon_range: np.ndarray, strand: int) -> list: + """ + Get the DNA positions of an exon given its range and strand. + Parameters + ------------ + exon_range : list + A list containing the start and end positions of the exon. + strand : int + The strand of the exon (1 for positive strand, -1 for negative strand). + + Returns + ------------ + list + A list of DNA positions corresponding to the exon, ordered according to the strand. + """ if strand == -1: return np.arange(exon_range[1], exon_range[0] + 1)[::-1] @@ -191,15 +305,44 @@ def get_dna_exon_pos(exon_range, strand): return np.arange(exon_range[0], exon_range[1] + 1) -def get_exon_ix(i, exon_range, strand): - +def get_exon_ix(i: int, exon_range: np.ndarray, strand: int): + """ + Get the exon index for each DNA position in the exon, ordered according to the strand. + + Parameters + ------------ + i : int + The exon index (starting from 0). + exon_range : np.ndarray + A numpy array containing the start and end positions of the exon. + strand : int + The strand of the exon (1 for positive strand, -1 for negative strand). + + Returns + ------------ + list + A list of exon indices corresponding to each DNA position in the exon, + ordered according to the strand. + """ len_exon = len(get_dna_exon_pos(exon_range, strand)) return np.repeat(i, len_exon) -def get_dna_map_to_protein(coord_df): - +def get_dna_map_to_protein(coord_df: pd.DataFrame) -> pd.DataFrame: + """ + Get a mapping of DNA positions to protein positions for a given gene-transcript pair. + + Parameters + ------------ + coord_df : pandas.DataFrame + A DataFrame containing the coordinates of the CDS for a specific gene-transcript pair. + + Returns + ------------ + pandas.DataFrame + A DataFrame containing the mapping of DNA positions to protein positions. + """ strand = coord_df["Strand"].unique()[0] exons_range = coord_df[["Start", "End"]].values @@ -217,55 +360,75 @@ def get_dna_map_to_protein(coord_df): return df - -def get_prot_coverage(dna_prot_df, gene, filter_masked_depth=True): - - gene_dna_prot_df = dna_prot_df[dna_prot_df["GENE"] == gene] - gene_dna_prot_df = gene_dna_prot_df.dropna(subset=["PROT_POS"])[["PROT_POS", "COVERED", "DEPTH"]].reset_index(drop=True) - gene_dna_prot_df = gene_dna_prot_df.groupby("PROT_POS").sum().reset_index() - gene_dna_prot_df.COVERED = (gene_dna_prot_df.COVERED > 0).astype(int) - - return gene_dna_prot_df - - -def get_exon_coord_wrapper(gene_n_transcript): - - # Init df for coordinates - coord_df = gene_n_transcript - - # Get coord - coord_df_lst = [] - exons_coord_df_lst = [] - for gene, transcript in coord_df.values: - print("Processing gene:", gene) - coord_lst = [] - - # Get the coord of exons with CDS and UTR as well as the length with UTR and exons ID - exons_lookup = get_tr_lookup(transcript) # We will use this to get Exons ID - for i, exon in enumerate(exons_lookup["Exon"]): - exon_id, exons_coord = parse_cds_coord(exon) - exons_coord_df_lst.append([f"{gene}--exon_{i+1}_{transcript}_{exon_id}"] + exons_coord) - - # Get the CDS coordinates of the exons removing UTRs to map to protein positions - for i, exon in enumerate(get_cds_coord(transcript, exons_lookup["length"])): - coord_lst.append((parse_cds_coord(exon) + [i])) - - gene_coord_df = pd.DataFrame(coord_lst, columns = ["Chr", "Start", "End", "Strand", "Exon_rank"]) - gene_coord_df["Gene"] = gene - gene_coord_df["Ens_transcript_ID"] = transcript - coord_df_lst.append(gene_coord_df) - - coord_df = pd.concat(coord_df_lst) - exons_coord_df = pd.DataFrame(exons_coord_df_lst, columns = ["ID", "Chr", "Start", "End", "Strand"]) - - return coord_df, exons_coord_df - - - -def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): +def find_exon(dna_prot_df: pd.DataFrame, exon_coord_df: pd.DataFrame): + """ + For each DNA position in *dna_prot_df* the function finds the matching exon + ID from *exon_coord_df* by merging on chromosome and filtering by positional + interval. This avoids a Python-level loop and is orders of magnitude faster + than the ``apply`` approach for large panels. + + Parameters + ------------ + dna_prot_df : pd.DataFrame + DataFrame with at least ``CHROM`` and ``DNA_POS`` columns. + exon_coord_df : pd.DataFrame + DataFrame with ``Chr``, ``Start``, ``End``, and ``ID`` columns. + + Returns + ------------ + np.ndarray + Array of exon IDs aligned to *dna_prot_df*\'s row order; + ``np.nan`` where no matching exon is found. + """ + # Normalise strand-swapped coordinates so lo <= hi in all cases + exon_df = exon_coord_df[["Chr", "Start", "End", "ID"]].copy() + exon_df["lo"] = exon_df[["Start", "End"]].min(axis=1) + exon_df["hi"] = exon_df[["Start", "End"]].max(axis=1) + + # Cross-merge on chromosome; each row of dna_prot_df is paired only with + # exons on the same chromosome, then filtered to those whose interval + # contains the DNA position. + merged = dna_prot_df[["CHROM", "DNA_POS"]].merge( + exon_df[["Chr", "lo", "hi", "ID"]], + left_on="CHROM", right_on="Chr", + how="left" + ) + in_exon = (merged["DNA_POS"] >= merged["lo"]) & (merged["DNA_POS"] <= merged["hi"]) + # Keep the first matching exon per position + matched = merged[in_exon].drop_duplicates(subset=["CHROM", "DNA_POS"]) + + # Re-align to the original DataFrame row order + result = dna_prot_df[["CHROM", "DNA_POS"]].merge( + matched[["CHROM", "DNA_POS", "ID"]], on=["CHROM", "DNA_POS"], how="left" + ) + return result["ID"].values + + +# Depth computation and coverage functions +# ---------------------------------------------------------- +def dna2prot_depth(gene_list: list, coord_df: pd.DataFrame, dna_sites: pd.DataFrame, depth_df: pd.DataFrame) -> pd.DataFrame: """ Get a DNA to protein mapping of all positions in the provided list of genes - Add as well coverage info & DNA to GENE annotation + Add coverage info & DNA to GENE annotation + + Parameters + ------------ + gene_list : list + A list of gene names for which to compute the DNA to protein mapping and coverage information. + coord_df : pandas.DataFrame + A DataFrame containing the coordinates of the CDS + for the genes in the panel. + dna_sites : pandas.DataFrame + A DataFrame containing the DNA positions that are part of the consensus panel, + along with their coverage information. + depth_df : pandas.DataFrame + A DataFrame containing the depth information for all positions in the genome. + + Returns + ------------ + pandas.DataFrame + A DataFrame containing the mapping of DNA positions to protein positions for the specified genes, + along with coverage and depth information for each position. """ # Map DNA to protein pos, get exons index to protein pos, etc @@ -275,10 +438,6 @@ def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): dna_prot_df_lst.append(get_dna_map_to_protein(gene_coord_df)) dna_prot_df = pd.concat(dna_prot_df_lst) - # dna_prot_df - # contains all the protein positions of the genes in the panel - # mapped to its corresponding genomic position - # Merge CDS position with availble sites (not masked) and depth info # and any other site that was included in the panel (splicing sites out of the CDS) dna_prot_df = dna_sites.merge(dna_prot_df, on=["GENE", "CHROM", "DNA_POS"], how="outer") @@ -295,31 +454,47 @@ def dna2prot_depth(gene_list, coord_df, dna_sites, depth_df): return dna_prot_df -def get_dna2prot_depth(gene_n_transcript_info, depth_file, consensus_file): +def get_dna2prot_depth(gene_n_transcript_info: pd.DataFrame, depth_file: str, consensus_file: str, release: int, species: str, genome: str) -> tuple[pd.DataFrame, pd.DataFrame]: """ - This function outputs: - dna_prot_df: - - exons_coord_df: - df containing the definition of all exons of the genes/transcripts in the panel - including UTR regions + Function to get the DNA to protein mapping for all positions in the provided list of genes, + along with coverage and depth information for each position, and the definition of all exons of + the genes/transcripts in the panel including UTR regions. + + Parameters + ------------ + gene_n_transcript_info : pandas.DataFrame + A DataFrame containing gene and transcript information for the panel. + depth_file : str + Path to the file containing depth information for all positions in the genome. + consensus_file : str + Path to the consensus panel file containing DNA positions and coverage information. + + Returns + ------------ + dna_prot_df : pandas.DataFrame + A DataFrame containing the mapping of DNA positions to protein positions for the specified genes, + along with coverage and depth information for each position. + exons_coord_df_final : pandas.DataFrame + A DataFrame containing the coordinates of all exons (including UTRs) for the genes and transcripts + in the panel, formatted for BED files. """ consensus_df = pd.read_table(consensus_file) depth_df = pd.read_table(depth_file) + gff_df = gff_to_filtered_df(gene_n_transcript_info, release=release) consensus_df = consensus_df.merge(depth_df[["CHROM", "POS", "CONTEXT"]], on = ["CHROM", "POS"], how = 'left') consensus_df = consensus_df.rename(columns={"POS" : "DNA_POS"}) - if "DEPTH" not in depth_df: + if "DEPTH" not in depth_df.columns: depth_df["DEPTH"] = depth_df.drop(columns=["CHROM", "POS", "CONTEXT"]).mean(1) - depth_df = depth_df[["CHROM", "POS", "DEPTH"]].rename(columns = {"POS" : "DNA_POS"}) + depth_df = depth_df[["CHROM", "POS", "DEPTH"]] - coord_df, exons_coord_df = get_exon_coord_wrapper(gene_n_transcript_info) + coord_df, exons_coord_df = get_exon_coord_wrapper(gene_n_transcript_info, gff_df) gene_list = list(coord_df["Gene"].unique()) - dna_prot_df = dna2prot_depth(gene_list, coord_df, consensus_df, depth_df) + dna_prot_df = dna2prot_depth(gene_list=gene_list, coord_df=coord_df, dna_sites=consensus_df, depth_df=depth_df) - dna_prot_df["EXON_ID"] = dna_prot_df.apply(lambda x: find_exon(x, exons_coord_df), axis=1) + dna_prot_df["EXON_ID"] = find_exon(dna_prot_df, exons_coord_df) # fix coordinates order in exons_coord_df for BED exons_coord_df_final = exons_coord_df.copy() @@ -329,125 +504,117 @@ def get_dna2prot_depth(gene_n_transcript_info, depth_file, consensus_file): return dna_prot_df, exons_coord_df_final -# Utils function to retrieve exon ID from coordinate - -def find_exon(x_coord, exon_coord_df): - - dna_pos, chrom, strand = x_coord["DNA_POS"], x_coord["CHROM"], x_coord["REVERSE_STRAND"] - - if strand == -1: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] <= dna_pos) & (dna_pos <= exon_coord_df['Start'])] - - else: - matches = exon_coord_df[(exon_coord_df['Chr'] == chrom) & (exon_coord_df['End'] >= dna_pos) & (dna_pos >= exon_coord_df['Start'])] - - return matches['ID'].values[0] if not matches.empty else np.nan - - - - -def get_exon_depth_saturation(gene_depth, gene_mut, dna=False): - - # Exon average depth - gene_depth = gene_depth.copy() - exon_depth = gene_depth.groupby("EXON_RANK").apply( - lambda x: 0 if sum(x.COVERED) == 0 else sum(x.DEPTH) / sum(x.COVERED)).reset_index().rename(columns = {0 : "DEPTH"}) - exon_depth["START_PROT_POS"] = gene_depth.groupby("EXON_RANK").apply(lambda x: x.PROT_POS.min()) - - # Exon saturation - if dna: - exon_depth["COVERED"] = gene_depth.groupby("EXON_RANK").apply(lambda x: sum(x.COVERED)).values - gene_depth["MUTATED"] = gene_depth.DNA_POS.isin(gene_mut.DNA_POS.unique()).astype(int) - exon_depth["MUTATED"] = gene_depth.groupby("EXON_RANK").apply(lambda x: sum(x.MUTATED)).values - exon_depth["SATURATION"] = exon_depth["MUTATED"] / exon_depth["COVERED"] - else: - - # If an amino acids from the same codon are in two exons, consider the position only in the second one - # (but look at booth to know if the residue is in the panel and if it is mutated) - exon_depth_prot = gene_depth.groupby("PROT_POS").apply(lambda x: (x.COVERED.max(), x.EXON_RANK.max())).reset_index() - exon_depth_prot[["COVERED", "EXON_RANK"]] = pd.DataFrame(exon_depth_prot[0].tolist(), index=exon_depth_prot.index) - exon_depth_prot = exon_depth_prot.drop(columns=[0]) - - exon_depth["COVERED"] = exon_depth_prot.groupby("EXON_RANK").apply(lambda x: sum(x.COVERED)).values - exon_depth_prot["MUTATED"] = exon_depth_prot["PROT_POS"].isin(gene_mut["Pos"].unique()).astype(int) - exon_depth["MUTATED"] = exon_depth_prot.groupby("EXON_RANK").apply(lambda x: sum(x.MUTATED)).values - exon_depth["SATURATION"] = exon_depth["MUTATED"] / exon_depth["COVERED"] - - # check_mutated_masked = exon_depth_prot[(exon_depth_prot["MUTATED"] == 1) & (exon_depth_prot["COVERED"] == 0)] - # check_mutated_masked["GENE"] = gene_depth.GENE.unique()[0] - - return exon_depth - - -def get_exon_mid_prot_pos(exon_info, prot_len): - - lst_end_pos = [] - for i in range(len(exon_info)): - exon = exon_info.iloc[i] - start_pos = int(exon.START_PROT_POS) - end_pos = int(exon_info.iloc[i+1].START_PROT_POS) if i < len(exon_info) -1 else prot_len - lst_end_pos.append(end_pos) - exon_info["END_PROT_POS"] = lst_end_pos - exon_info["MID_PROT_POS"] = (exon_info["START_PROT_POS"] + exon_info["END_PROT_POS"]) / 2 +# Plots +# ---------------------------------------------------------- +def plot_coverage_per_gene(depths_df: pd.DataFrame) -> None: + """ + Wrapper function to plot coverage per gene for DNA, protein and exon levels. - return exon_info + Parameters + ---------- + depths_df : pandas.DataFrame + DataFrame containing depth and coverage information for DNA, protein and exon positions. + In this case, it will be the "exons_depth" created in `get_dna2prot_depth` function. + """ + coverage = depths_df.drop_duplicates(subset=['GENE', 'DNA_POS', 'COVERED']) + coverage_summary = coverage.groupby(['GENE', 'COVERED']).size().reset_index(name='COUNT') + for prefix in ["DNA", "Protein", "Exon"]: + LOG.info(f"Plotting coverage for {prefix}...") + plot_single_coverage(coverage_summary, prefix) -def get_res_coverage(dna_prot_df): - res_depth = dna_prot_df.dropna(subset=["PROT_POS"])[["PROT_POS", "COVERED", "DEPTH"]].reset_index(drop=True) - res_depth = res_depth.groupby("PROT_POS").mean().reset_index() +def plot_single_coverage(coverage_summary: pd.DataFrame, prefix: str, batch_size: int = 5): + """ + Plot coverage for a single prefix (DNA, protein or exon) and save the results in a PDF file. + + Parameters + ---------- + coverage_summary : pandas.DataFrame + DataFrame containing the coverage summary for each gene and coverage status. + prefix : str + The prefix to plot (DNA, Protein or Exon). + batch_size : int, optional + The number of genes to include in each batch when plotting (default is 5). + """ + # Generate copy of the coverage summary to avoid modifying the original DataFrame + coverage_summary_cp = coverage_summary.copy() - return res_depth + coverage_pivot = coverage_summary_cp.pivot(index='GENE', columns='COVERED', values='COUNT').fillna(0) + coverage_pivot = coverage_pivot.sort_values(1, ascending=False).reset_index() + coverage_pivot = coverage_pivot.set_index('GENE') + genes_list = coverage_pivot.index.tolist() + coverage_perc = coverage_pivot.div(coverage_pivot.sum(axis=1), axis=0) * 100 + coverage_pivot.to_csv(f"coverage_count_{prefix}.tsv", header=True, index=True, sep='\t') + coverage_perc.to_csv(f"coverage_perc_{prefix}.tsv", header=True, index=True, sep='\t') + if len(genes_list) < batch_size: -def add_consecutive_numbers(nums, max_n): + fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - result = [] - for i in range(len(nums)): - result.append(nums[i]) - # Check if the current number is the start of a consecutive sequence - if i < len(nums) - 1 and nums[i] + 1 != nums[i + 1]: - result.append(nums[i] + 1) + covered_col = True if True in coverage_pivot.columns else (1 if 1 in coverage_pivot.columns else coverage_pivot.columns[-1]) - # Add the last consecutive number after the final element - result.append(nums[-1] + 1) + coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff'], + ) - return result + axes[0].set_title(f'{prefix} : covered vs. non-covered') + axes[0].set_ylabel(f'Number of {prefix}') + axes[0].set_xlabel('Gene') + axes[0].legend(title='Covered', loc='upper right') + axes[0].tick_params(axis='x', rotation=45) + coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') + axes[1].set_title(f'{prefix} : percentage of covered') + axes[1].set_ylabel('Percentage (%)') + axes[1].set_xlabel('Gene') + axes[1].set_ylim(0, 100) + axes[1].tick_params(axis='x', rotation=45) -def where_plus(condition): - """ - Util function to extend the color of mpl filling to the next position. - """ + plt.tight_layout() + plt.savefig(f"coverage_per_{prefix}.pdf", dpi=300) + else: + with PdfPages(f"coverage_per_{prefix}_batches.pdf") as pdf: + # split into batches N genes + for i in range(0, len(genes_list), batch_size): + batch_genes = genes_list[i:i+batch_size] + batch_coverage_pivot = coverage_pivot.loc[batch_genes] + batch_coverage_perc = coverage_perc.loc[batch_genes] - ix = np.where(condition)[0] + fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True) - if len(ix) > 0: - ix = add_consecutive_numbers(ix, max_n=len(condition)) - if len(condition) in ix: - ix.remove(len(condition)) - boolean_vector = np.zeros(len(condition), dtype=bool) - boolean_vector[ix] = True + covered_col = True if True in batch_coverage_pivot.columns else (1 if 1 in batch_coverage_pivot.columns else batch_coverage_pivot.columns[-1]) - return pd.Series(boolean_vector) + batch_coverage_pivot.plot(kind='bar', stacked=True, ax=axes[0], color=['#ff9999','#66b3ff']) + axes[0].set_title(f'{prefix} : covered vs. non-covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') + axes[0].set_ylabel(f'Number of {prefix}') + axes[0].set_xlabel('Gene') + axes[0].legend(title='Covered', loc='upper right') + axes[0].tick_params(axis='x', rotation=45) - else: - return condition + batch_coverage_perc[covered_col].plot(kind='bar', ax=axes[1], color='#66b3ff') + axes[1].set_title(f'{prefix} : percentage of covered (genes {i+1}-{min(i+batch_size, len(genes_list))})') + axes[1].set_ylabel('Percentage (%)') + axes[1].set_xlabel('Gene') + axes[1].set_ylim(0, 100) + axes[1].tick_params(axis='x', rotation=45) + plt.tight_layout() + pdf.savefig(dpi=300) + plt.close() @click.command() @click.option('--mutations-file', type=click.Path(exists=True), help='Mutations file') @click.option('--consensus-file', type=click.Path(exists=True), help='Input consensus panel file') @click.option('--depths-file', type=click.Path(exists=True), help='Input depths of all samples file') -def main(mutations_file, consensus_file, depths_file): - click.echo("Starting to run plot saturation results...") - +@click.option('--ensembl-species', type=str, default="homo_sapiens", help='Ensembl species name to use for GFF file retrieval (default)') +@click.option('--ensembl-genome', type=str, default="GRCh38", help='Ensembl genome name to use for GFF file retrieval (default)') +@click.option('--ensembl-release', type=int, default=111, help='Ensembl release number to use for GFF file retrieval (default is 111)') +def main(mutations_file, consensus_file, depths_file, ensembl_species, ensembl_genome, ensembl_release): # Count each mutation only ones if it appears in multiple reads gene_n_transcript = get_transcript_gene_from_maf(mutations_file, consensus_file) - exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file) - print("Exons coordinates and depth computed") + exons_depth, exons_coord_id = get_dna2prot_depth(gene_n_transcript, depths_file, consensus_file, ensembl_release, ensembl_species, ensembl_genome) + LOG.info("Exons coordinates and depth computed!") exons_depth.to_csv("depths_per_position_exon_gene.tsv", header = True, index = False, sep = '\t') exons_coordinates_bed_like = exons_coord_id[['Chr', 'Start', 'End', 'ID']] @@ -455,6 +622,8 @@ def main(mutations_file, consensus_file, depths_file): plot_coverage_per_gene(exons_depth) + LOG.info("All done!") + if __name__ == '__main__': main() diff --git a/bin/postprocessing_annotation.py b/bin/postprocessing_annotation.py index 24cab413..50f83c72 100755 --- a/bin/postprocessing_annotation.py +++ b/bin/postprocessing_annotation.py @@ -4,7 +4,7 @@ import click import pandas as pd -from bgreference import hg38, hg19, mm10, mm39 +from bgreference import hg38, mm39 from utils import vartype from utils_context import transform_context @@ -12,8 +12,6 @@ from read_utils import custom_na_values assembly_name2function = {"hg38": hg38, - "hg19": hg19, - "mm10": mm10, "mm39": mm39} @@ -245,7 +243,7 @@ def vep2summarizedannotation(VEP_output_file, all_possible_sites_annotated_file, @click.command() @click.argument('vep_output_file', type=click.Path(exists=True)) @click.argument('all_possible_sites_annotated_file', type=click.Path()) -@click.option('--assembly-name', default='hg38', show_default=True, type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), help='Reference genome assembly name') +@click.option('--assembly-name', default='hg38', show_default=True, type=click.Choice(['hg38', 'mm39']), help='Reference genome assembly name') @click.option('--all-underscore', is_flag=True, default=False, show_default=True, help='Whether to use _ to separate all parts of MUT_ID (default: False)') @click.option('--hotspots-annotation-file', default=None, type=click.Path(exists=False), help='Path to hotspots annotation file') @click.option('--gnomad-af-threshold', default=0.001, show_default=True, type=float, help='gnomAD allele frequency threshold') diff --git a/bin/sites_table_from_positions.py b/bin/sites_table_from_positions.py index 979e49d8..892371fa 100755 --- a/bin/sites_table_from_positions.py +++ b/bin/sites_table_from_positions.py @@ -4,11 +4,9 @@ import click import pandas as pd -from bgreference import hg38, hg19, mm10, mm39 +from bgreference import hg38, mm39 assembly_name2function = {"hg38": hg38, - "hg19": hg19, - "mm10": mm10, "mm39": mm39 } @@ -49,7 +47,6 @@ def generate_all_sites_4VEP(input_positions, genome, output_file_with_sites): @click.command() @click.option('--input-positions', required=True, type=click.Path(exists=True), help='Input positions file (TSV)') -#@click.option('--genome-assembly', required=True, type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), help='Genome assembly (hg38, hg19, mm10, mm39)') @click.option('--genome-assembly', required=True, type=click.Choice(['hg38', 'mm39']), help='Genome assembly (hg38, mm39)') @click.option('--output-file-with-sites', required=True, type=click.Path(), help='Output file for sites (TSV)') def main(input_positions, genome_assembly, output_file_with_sites): diff --git a/bin/utils_context.py b/bin/utils_context.py index cf8e488e..3cacd0ae 100644 --- a/bin/utils_context.py +++ b/bin/utils_context.py @@ -1,4 +1,4 @@ -from bgreference import hg38, hg19, mm10, mm39 +from bgreference import hg38, mm39 from itertools import product diff --git a/conf/modules.config b/conf/modules.config index 327648f0..94f6c967 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -72,6 +72,12 @@ process { ] } + withName: DNA2PROTEINMAPPING { + ext.ensembl_release = params.vep_cache_version ?: 111 + ext.ensembl_species = params.vep_species ?: 'homo_sapiens' + ext.ensembl_genome = params.vep_genome ?: 'GRCh38' + } + withName: QUERYDEPTHS { ext.prefix = { ".subset_depths" } ext.args = '' @@ -106,7 +112,7 @@ process { } - withName: QUERYMUTATIONS { + withName: 'QUERYMUTATIONS|QUERYMUTATIONSEXONS' { ext.prefix = { ".subset_mutations" } ext.args = '' ext.args2 = '-s 1 -b 2 -e 2' @@ -239,7 +245,7 @@ process { ], [ mode: params.publish_dir_mode, - path: { "${params.outdir}/flagged_positions" }, + path: { "${params.outdir}/processing_files/flagged_positions" }, pattern: "*{bed}", ] ] @@ -536,6 +542,14 @@ process { --seed 123" } + withName: 'SIGPROFILERASSIGNMENT|HDPREASSIGNMENT|SIGPROMATRIXGENERATOR' { + ext.genome_assembly = params.vep_genome == 'GRCh38' + ? 'GRCh38' + : params.vep_genome == 'GRCm39' + ? 'mm39' + : null + } + withName: SIGPROFILERASSIGNMENT { publishDir = [ [ @@ -546,20 +560,32 @@ process { ] } + withName: HDPREASSIGNMENT { + publishDir = [ + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/signatures/hdp_decomposition_spa" }, + pattern: "**{txt,pdf}", + ] + ] + } + + withName: REFORMATSIGNATURES { + publishDir = [ + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/signatures/signatures_hdp/signatures_reformatted" }, + pattern: "**{tsv}", + ] + ] + } + withName: SIGPROMATRIXGENERATOR { ext.args = "--plot \ --tsb_stat \ --seqInfo \ --cushion 100" - ext.genome_assembly = params.vep_genome == 'GRCh38' - ? 'GRCh38' - : params.vep_genome == 'GRCh37' - ? 'GRCh37' - : params.vep_genome == 'GRCm38' - ? 'mm10' - : params.vep_genome == 'GRCm39' - ? 'mm39' - : null + publishDir = [ [ mode: params.publish_dir_mode, @@ -622,14 +648,12 @@ process { --missing_values mean_samples" } - withName: 'SITESFROMPOSITIONS|SUMANNOTATION|SIGPROFILERASSIGNMENT|ONCODRIVECLUSTL|POSTPROCESSVEPPANEL' { + withName: 'SITESFROMPOSITIONS|SUMANNOTATION|ONCODRIVECLUSTL|POSTPROCESSVEPPANEL' { ext.assembly = params.vep_genome == 'GRCh38' ? 'hg38' - : params.vep_genome == 'GRCm38' - ? 'mm10' - : params.vep_genome == 'GRCm39' - ? 'mm39' - : null + : params.vep_genome == 'GRCm39' + ? 'mm39' + : null } withLabel : deepcsa_core { diff --git a/conf/tools/omega.config b/conf/tools/omega.config index da5b089a..0d28c498 100644 --- a/conf/tools/omega.config +++ b/conf/tools/omega.config @@ -168,10 +168,8 @@ process { withName: 'PREPROCESSING.*|ESTIMATOR.*' { ext.assembly = params.vep_genome == 'GRCh38' ? 'hg38' - : params.vep_genome == 'GRCm38' - ? 'mm10' - : params.vep_genome == 'GRCm39' - ? 'mm39' - : null + : params.vep_genome == 'GRCm39' + ? 'mm39' + : null } } diff --git a/docs/usage.md b/docs/usage.md index c196e28e..e99ab95f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -274,7 +274,7 @@ These files identify sites overlapping common SNPs and noisy or variable genomic - Nanoseq SNP: Common SNP positions that should be excluded from analysis - Nanoseq Noise: Regions with high noise or variability -Both files are available for GRCh37 and GRCh38 at the [shared folder](https://drive.google.com/drive/folders/1wqkgpRTuf4EUhqCGSLA4fIg9qEEw3ZcL) from Iñigo Martincorena's group, at the Wellcome Sanger Institute. +Both files are available for GRCh38 at the [shared folder](https://drive.google.com/drive/folders/1wqkgpRTuf4EUhqCGSLA4fIg9qEEw3ZcL) from Iñigo Martincorena's group, at the Wellcome Sanger Institute. ## Additional customizable parameters diff --git a/modules/local/dna2protein/main.nf b/modules/local/dna2protein/main.nf index 07d41544..2a6a97da 100644 --- a/modules/local/dna2protein/main.nf +++ b/modules/local/dna2protein/main.nf @@ -1,6 +1,7 @@ process DNA_2_PROTEIN_MAPPING { tag "$meta.id" label 'process_single' + label 'process_medium_high_memory' label 'deepcsa_core' @@ -20,12 +21,18 @@ process DNA_2_PROTEIN_MAPPING { script: + def ensembl_release = task.ext.ensembl_release + def ensembl_species = task.ext.ensembl_species + def ensembl_genome = task.ext.ensembl_genome """ cut -f 1,2,6 ${panel_file} | uniq > ${meta2.id}.panel.unique.tsv panels_computedna2protein.py \\ --mutations-file ${mutations_file} \\ --consensus-file ${meta2.id}.panel.unique.tsv \\ - --depths-file ${all_samples_depths} + --depths-file ${all_samples_depths} \\ + --ensembl-release ${ensembl_release} \\ + --ensembl-species ${ensembl_species} \\ + --ensembl-genome ${ensembl_genome} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/signatures/hdp/process_results/main.nf b/modules/local/signatures/hdp/process_results/main.nf index eaa9bba7..5b93dafd 100644 --- a/modules/local/signatures/hdp/process_results/main.nf +++ b/modules/local/signatures/hdp/process_results/main.nf @@ -9,8 +9,9 @@ process PROCESS_HDP_RESULTS { tuple val(meta2), path (iteration_dir) output: - tuple val(meta), path("output_dir"), emit: processed_results - path "versions.yml" , topic: versions + tuple val(meta), path("output_dir") , emit: processed_results + tuple val(meta), path("**/components.txt") , emit: signatures + path "versions.yml" , topic: versions script: diff --git a/modules/local/signatures/hdp/reformat_sigs/main.nf b/modules/local/signatures/hdp/reformat_sigs/main.nf new file mode 100644 index 00000000..d9c70d08 --- /dev/null +++ b/modules/local/signatures/hdp/reformat_sigs/main.nf @@ -0,0 +1,46 @@ + +process SIGNATURES_HDP_TO_SIGPROFILER { + tag "$meta.id" + label 'process_medium' + label 'deepcsa_core' + + + input: + tuple val(meta), path (signatures) + + output: + tuple val(meta), path("*.tsv") , emit: signatures_for_sp + path "versions.yml" , topic: versions + + + script: + def prefix = task.ext.prefix ?: "" + prefix = "${meta.id}${prefix}" + """ + python < versions.yml +"${task.process}": + python: \$(python --version | sed 's/Python //g') +END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "" + prefix = "${meta.id}${prefix}" + """ + touch ${prefix}.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + +} \ No newline at end of file diff --git a/modules/local/signatures/sigprofiler/assignment/main.nf b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf similarity index 81% rename from modules/local/signatures/sigprofiler/assignment/main.nf rename to modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf index 0e725789..b3f77163 100644 --- a/modules/local/signatures/sigprofiler/assignment/main.nf +++ b/modules/local/signatures/sigprofiler/assignment/cosmic_fit/main.nf @@ -1,4 +1,4 @@ -process SIGPROFILERASSIGNMENT { +process SIGPROFILERASSIGNMENT_COSMIC_FIT { tag "$meta.id" label 'process_medium' @@ -17,7 +17,7 @@ process SIGPROFILERASSIGNMENT { script: def name = "${meta.id}.${type}" - def assembly = task.ext.assembly ?: "GRCh38" + def assembly = task.ext.genome_assembly ?: "GRCh38" // FIXME: the definition of subgroups to exclude seems not to work in the new CLI SigProfilerAssignment // def exclude_signature_subgroups = params.exclude_subgroups ? "--exclude_signature_subgroups \"${params.exclude_subgroups}\"" : "" @@ -30,17 +30,16 @@ process SIGPROFILERASSIGNMENT { SigProfilerAssignment cosmic_fit \\ ${matrix} \\ output_${name} \\ - --signatures ${reference_signatures} \\ + --signature_database ${reference_signatures} \\ --genome_build ${assembly} \\ --cpu ${task.cpus} \\ --volume spa_volume - mv output_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.txt output_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.${name}.txt; + cp output_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.txt output_${name}/Assignment_Solution/Activities/Decomposed_MutationType_Probabilities.${name}.txt; cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version | sed 's/Python //g') - SigProfilerAssignment : 0.1.1 + SigProfilerAssignment : 1.1.3 END_VERSIONS """ @@ -52,8 +51,7 @@ process SIGPROFILERASSIGNMENT { cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version | sed 's/Python //g') - SigProfilerAssignment : 0.1.1 + SigProfilerAssignment : 1.1.3 END_VERSIONS """ } diff --git a/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf b/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf new file mode 100644 index 00000000..fded2c7d --- /dev/null +++ b/modules/local/signatures/sigprofiler/assignment/decompose_fit/main.nf @@ -0,0 +1,63 @@ +process SIGPROFILERASSIGNMENT_DECOMPOSE_FIT { + tag "$meta.id" + label 'process_medium' + + container 'docker.io/ferriolcalvet/sigprofiler_assignment:1.1.3' + + input: + tuple val(meta), val(type), path(matrix) + tuple val(meta2), path (extracted_signatures) + path (reference_signatures) + + output: + tuple val(meta), path("**.pdf") , emit: plots + tuple val(meta), path("**.txt") , emit: stats + tuple val(meta), path("**Decomposed_MutationType_Probabilities.*.txt") , emit: mutation_probs + path "versions.yml" , topic: versions + + + script: + def args = task.ext.args ?: '' + def name = "${meta.id}.${type}" + def assembly = task.ext.genome_assembly ?: "GRCh38" + + // FIXME: the definition of subgroups to exclude seems not to work in the new CLI SigProfilerAssignment + // def exclude_signature_subgroups = params.exclude_subgroups ? "--exclude_signature_subgroups \"${params.exclude_subgroups}\"" : "" + """ + mkdir -p spa_volume + export SIGPROFILERMATRIXGENERATOR_VOLUME='./spa_volume' + export SIGPROFILERPLOTTING_VOLUME='./spa_volume' + export SIGPROFILERASSIGNMENT_VOLUME='./spa_volume' + + SigProfilerAssignment decompose_fit \\ + ${matrix} \\ + signature_decomposition_${name} \\ + --signatures ${extracted_signatures} \\ + --signature_database ${reference_signatures} \\ + --genome_build ${assembly} \\ + --cpu ${task.cpus} \\ + --volume spa_volume \\ + ${args} + + cp signature_decomposition_${name}/Decompose_Solution/Activities/Decomposed_MutationType_Probabilities.txt signature_decomposition_${name}/Decompose_Solution/Activities/Decomposed_MutationType_Probabilities.${name}.txt; + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + SigProfilerAssignment : 1.1.3 + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "" + prefix = "${meta.id}${prefix}" + """ + touch ${prefix}.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + SigProfilerAssignment : 1.1.3 + END_VERSIONS + """ +} diff --git a/modules/local/signatures/sigprofiler/extractor/main.nf b/modules/local/signatures/sigprofiler/extractor/main.nf index 2673aeac..4f774066 100644 --- a/modules/local/signatures/sigprofiler/extractor/main.nf +++ b/modules/local/signatures/sigprofiler/extractor/main.nf @@ -25,7 +25,7 @@ process SIGPROFILEREXTRACTOR { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "" prefix = "${meta.id}${prefix}" - def assembly = task.ext.assembly ?: "GRCh38" + def assembly = task.ext.genome_assembly ?: "GRCh38" // python -c "from SigProfilerAssignment import Analyzer as Analyze; Analyze.cosmic_fit('${matrix}', 'output', input_type='matrix', context_type='96', // collapse_to_SBS96=True, cosmic_version=3.4, exome=False, // genome_build="GRCh38", signature_database='${reference_signatures}', diff --git a/subworkflows/local/adjmutdensity/main.nf b/subworkflows/local/adjmutdensity/main.nf index 25cfbe53..1da8bcee 100644 --- a/subworkflows/local/adjmutdensity/main.nf +++ b/subworkflows/local/adjmutdensity/main.nf @@ -1,4 +1,4 @@ -include { TABIX_BGZIPTABIX_QUERY as QUERYMUTATIONS } from '../../../modules/nf-core/tabix/bgziptabixquery/main' +include { TABIX_BGZIPTABIX_QUERY as QUERYMUTATIONS } from '../../../modules/nf-core/tabix/bgziptabixquery/main' include { SUBSET_MAF as SUBSETMUTDENSITYADJUSTED } from '../../../modules/local/subsetmaf/main' @@ -16,7 +16,7 @@ workflow MUTATION_DENSITY { main: - // Intersect BED of all sites with BED of sample filtered sites + // Intersect BED of all sites of interest with mutations QUERYMUTATIONS(mutations, bedfile) SUBSETMUTDENSITYADJUSTED(QUERYMUTATIONS.out.subset) diff --git a/subworkflows/local/dnds/main.nf b/subworkflows/local/dnds/main.nf index 92960196..edf8e534 100644 --- a/subworkflows/local/dnds/main.nf +++ b/subworkflows/local/dnds/main.nf @@ -1,5 +1,3 @@ -include { TABIX_BGZIPTABIX_QUERY as QUERYMUTATIONS } from '../../../modules/nf-core/tabix/bgziptabixquery/main' - include { SUBSET_MAF as SUBSET_DNDS } from '../../../modules/local/subsetmaf/main' include { PREPROCESS_DNDS as PREPROCESSDEPTHS } from '../../../modules/local/dnds/preprocess/main' @@ -10,7 +8,6 @@ workflow DNDS { take: mutations depth - bedfile panel main: @@ -18,10 +15,7 @@ workflow DNDS { covariates = params.dnds_covariates ? channel.fromPath( params.dnds_covariates, checkIfExists: true).first() : channel.empty() ref_trans = params.dnds_ref_transcripts ? channel.fromPath( params.dnds_ref_transcripts, checkIfExists: true).first() : channel.empty() - - QUERYMUTATIONS(mutations, bedfile) - - SUBSET_DNDS(QUERYMUTATIONS.out.subset) + SUBSET_DNDS(mutations) PREPROCESSDEPTHS(depth, panel) diff --git a/subworkflows/local/mutability/main.nf b/subworkflows/local/mutability/main.nf index 3e1c2f2b..01132d80 100644 --- a/subworkflows/local/mutability/main.nf +++ b/subworkflows/local/mutability/main.nf @@ -16,19 +16,10 @@ workflow MUTABILITY { depth profile panel_file - bedfiles main: - // actual code - // this line was not needed nor used in the computation of mutabilities, - // since there is an additional left_join merge in the compute mutabilities code, - // the depths can be the full ones - // QUERYDEPTHS(depth, bedfiles) - QUERYMUTATIONS(mutations, bedfiles) - - - SUBSETMUTABILITY(QUERYMUTATIONS.out.subset) + SUBSETMUTABILITY(mutations) SUBSETMUTABILITY.out.mutations .join(profile) .join(depth) diff --git a/subworkflows/local/omega/main.nf b/subworkflows/local/omega/main.nf index 7b5f73d7..de583490 100644 --- a/subworkflows/local/omega/main.nf +++ b/subworkflows/local/omega/main.nf @@ -45,12 +45,10 @@ workflow OMEGA_ANALYSIS{ all_gloc_results = channel.empty() // Intersect BED of all sites with BED of sample filtered sites - QUERYMUTATIONS(mutations, bedfile) - QUERYPANEL(panel_captured_rich, bedfile) - SUBSETOMEGA(QUERYMUTATIONS.out.subset) - SUBSETOMEGAMULTI(QUERYMUTATIONS.out.subset) + SUBSETOMEGA(mutations) + SUBSETOMEGAMULTI(mutations) SUBSETOMEGA.out.mutations .join( depth ) diff --git a/subworkflows/local/oncodrive3d/main.nf b/subworkflows/local/oncodrive3d/main.nf index 82ca8974..cc4394dd 100644 --- a/subworkflows/local/oncodrive3d/main.nf +++ b/subworkflows/local/oncodrive3d/main.nf @@ -14,15 +14,13 @@ workflow ONCODRIVE3D_ANALYSIS{ take: mutations mutabilities - bedfile datasets annotations raw_vep main: - QUERYMUTATIONS(mutations, bedfile) - SUBSETONCODRIVE3D(QUERYMUTATIONS.out.subset) + SUBSETONCODRIVE3D(mutations) // mutations preprocessing if (params.o3d_raw_vep){ diff --git a/subworkflows/local/oncodriveclustl/main.nf b/subworkflows/local/oncodriveclustl/main.nf index 9639cf42..f7762278 100644 --- a/subworkflows/local/oncodriveclustl/main.nf +++ b/subworkflows/local/oncodriveclustl/main.nf @@ -1,6 +1,6 @@ include { CREATECUSTOMBEDFILE as ONCODRIVECLUSTLBED } from '../../../modules/local/createpanels/custombedfile/main' -include { SUBSET_MAF as SUBSETONCODRIVECLUSTL } from '../../../modules/local/subsetmaf/main' +include { SUBSET_MAF as SUBSETONCODRIVECLUSTL } from '../../../modules/local/subsetmaf/main' include { ONCODRIVECLUSTL } from '../../../modules/local/bbgtools/oncodriveclustl/main' // include { ONCODRIVECLUSTL as ONCODRIVECLUSTLSNVS } from '../../../modules/local/bbgtools/oncodriveclustl/main' diff --git a/subworkflows/local/signatures/main.nf b/subworkflows/local/signatures/main.nf index ffc71f7a..17da923c 100644 --- a/subworkflows/local/signatures/main.nf +++ b/subworkflows/local/signatures/main.nf @@ -1,7 +1,8 @@ -include { MATRIX_CONCAT as MATRIXCONCATWGS } from '../../../modules/local/sig_matrix_concat/main' -include { SIGPROFILERASSIGNMENT } from '../../../modules/local/signatures/sigprofiler/assignment/main' -include { SIGNATURES_PROBABILITIES as SIGPROBS } from '../../../modules/local/combine_sbs/main' -include { HDP_EXTRACTION as HDPEXTRACTION } from '../signatures_hdp/main' +include { MATRIX_CONCAT as MATRIXCONCATWGS } from '../../../modules/local/sig_matrix_concat/main' +include { SIGPROFILERASSIGNMENT_COSMIC_FIT as SIGPROFILERASSIGNMENT } from '../../../modules/local/signatures/sigprofiler/assignment/cosmic_fit/main' +include { SIGPROFILERASSIGNMENT_DECOMPOSE_FIT as HDPREASSIGNMENT } from '../../../modules/local/signatures/sigprofiler/assignment/decompose_fit/main' +include { SIGNATURES_PROBABILITIES as SIGPROBS } from '../../../modules/local/combine_sbs/main' +include { HDP_EXTRACTION as HDPEXTRACTION } from '../signatures_hdp/main' workflow SIGNATURES { @@ -38,6 +39,7 @@ workflow SIGNATURES { HDPEXTRACTION(named_matrices_wgs_hdp, reference_signatures) + HDPREASSIGNMENT(named_matrices_wgs_sp, HDPEXTRACTION.out.signatures.first(), reference_signatures) emit: plots = SIGPROFILERASSIGNMENT.out.plots // channel: [ val(meta), file(depths) ] diff --git a/subworkflows/local/signatures_hdp/main.nf b/subworkflows/local/signatures_hdp/main.nf index f6b1dd2e..9ece3ca2 100644 --- a/subworkflows/local/signatures_hdp/main.nf +++ b/subworkflows/local/signatures_hdp/main.nf @@ -1,9 +1,8 @@ include { PREPARE_INPUT } from '../../../modules/local/signatures/hdp/prepareinput/main' include { RUN_HDP_CHAIN_SAMPLING } from '../../../modules/local/signatures/hdp/chainsampling/main' include { PROCESS_HDP_RESULTS } from '../../../modules/local/signatures/hdp/process_results/main' -include { COMPARE_SIGNATURES as COMPARESIGNATURES } from '../../../modules/local/signatures/hdp/compare_sigs/main' - - +include { COMPARE_SIGNATURES as COMPARESIGNATURES } from '../../../modules/local/signatures/hdp/compare_sigs/main' +include { SIGNATURES_HDP_TO_SIGPROFILER as REFORMATSIGNATURES } from '../../../modules/local/signatures/hdp/reformat_sigs/main' workflow HDP_EXTRACTION { @@ -32,9 +31,11 @@ workflow HDP_EXTRACTION { COMPARESIGNATURES(PROCESS_HDP_RESULTS.out.processed_results, reference_signatures) + REFORMATSIGNATURES(PROCESS_HDP_RESULTS.out.signatures) + - // emit: - // plots = SIGPROFILERASSIGNMENT.out.plots // channel: [ val(meta), file(depths) ] - // // plots_extraction = MSIGHDP.out.plots // channel: [ val(meta), file(depths) ] + emit: + signatures = REFORMATSIGNATURES.out.signatures_for_sp // channel: [ val(meta), file(depths) ] + // plots_extraction = MSIGHDP.out.plots // channel: [ val(meta), file(depths) ] // mutation_probs = signature_probs_samples } diff --git a/workflows/deepcsa.nf b/workflows/deepcsa.nf index 7ad9dfb7..8fb08139 100644 --- a/workflows/deepcsa.nf +++ b/workflows/deepcsa.nf @@ -101,6 +101,8 @@ include { TABLE_2_GROUP as TABLE2GROUP } from '../m include { ANNOTATE_DEPTHS as ANNOTATEDEPTHS } from '../modules/local/annotatedepth/main' include { DOWNSAMPLE_DEPTHS as DOWNSAMPLEDEPTHS } from '../modules/local/downsample/depths/main' +include { TABIX_BGZIPTABIX_QUERY as QUERYMUTATIONSEXONS } from '../modules/nf-core/tabix/bgziptabixquery/main' + include { SELECT_MUTDENSITIES as SYNMUTDENSITY } from '../modules/local/select_mutdensity/main' include { SELECT_MUTDENSITIES as SYNMUTREADSDENSITY } from '../modules/local/select_mutdensity/main' @@ -282,11 +284,18 @@ workflow DEEPCSA{ } + // Intersect BED of all sites with somatic mutations to keep only those mutations in the exons consensus panel + QUERYMUTATIONSEXONS(somatic_mutations, CREATEPANELS.out.exons_consensus_bed) + mutations_in_exons = QUERYMUTATIONSEXONS.out.subset + // Mutational profile if ( params.profileall || run_mutabilities || params.omega ){ MUTPROFILEALL(somatic_mutations, DEPTHSALLCONS.out.subset, CREATEPANELS.out.all_consensus_bed, wgs_trinucs, TABLE2GROUP.out.json_allgroups) if (run_mutdensity){ + + // TODO explore if we could use the ALL consensus panel instead of the exons one + // I am suggesting this since we are already distinguishing per consequence type within the script MUTDENSITYADJUSTED(somatic_mutations, DEPTHSALLCONS.out.subset, CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel, MUTPROFILEALL.out.profile, wgs_trinucs) // Concatenate all outputs into a single file @@ -313,19 +322,17 @@ workflow DEEPCSA{ if (run_mutabilities) { if (params.profileall){ - MUTABILITYALL(somatic_mutations, + MUTABILITYALL(mutations_in_exons, annotated_depths, MUTPROFILEALL.out.profile, - CREATEPANELS.out.exons_consensus_panel, - CREATEPANELS.out.exons_consensus_bed + CREATEPANELS.out.exons_consensus_panel ) } if (params.profilenonprot){ - MUTABILITYNONPROT(somatic_mutations, + MUTABILITYNONPROT(mutations_in_exons, annotated_depths, MUTPROFILENONPROT.out.profile, - CREATEPANELS.out.exons_consensus_panel, - CREATEPANELS.out.exons_consensus_bed + CREATEPANELS.out.exons_consensus_panel ) } } @@ -356,7 +363,7 @@ workflow DEEPCSA{ oncodrivefml_regressions_files = channel.empty() if (params.profileall){ mode = "all" - ONCODRIVEFMLALL(somatic_mutations, MUTABILITYALL.out.mutability, + ONCODRIVEFMLALL(mutations_in_exons, MUTABILITYALL.out.mutability, CREATEPANELS.out.exons_consensus_panel, cadd_scores, mode ) @@ -368,7 +375,7 @@ workflow DEEPCSA{ } if (params.profilenonprot && params.positive_selection_non_protein_affecting){ mode = "non_prot_aff" - ONCODRIVEFMLNONPROT(somatic_mutations, MUTABILITYNONPROT.out.mutability, + ONCODRIVEFMLNONPROT(mutations_in_exons, MUTABILITYNONPROT.out.mutability, CREATEPANELS.out.exons_consensus_panel, cadd_scores, mode ) @@ -381,7 +388,7 @@ workflow DEEPCSA{ if (params.oncodrive3d){ if (params.profileall){ // Oncodrive3D - ONCODRIVE3D(somatic_mutations, MUTABILITYALL.out.mutability, CREATEPANELS.out.exons_consensus_bed, + ONCODRIVE3D(mutations_in_exons, MUTABILITYALL.out.mutability, datasets3d, annotations3d, MUT_PREPROCESSING.out.all_raw_vep_annotation) positive_selection_results = positive_selection_results.join(ONCODRIVE3D.out.results, remainder: true) positive_selection_results = positive_selection_results.join(ONCODRIVE3D.out.results_pos, remainder: true) @@ -391,9 +398,8 @@ workflow DEEPCSA{ // if (params.expected_mutated_cells & params.dnds){ if (params.dnds){ - DNDS(somatic_mutations, + DNDS(mutations_in_exons, DEPTHSEXONSCONS.out.subset, - CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel ) } @@ -404,7 +410,7 @@ workflow DEEPCSA{ // Omega if (params.profileall){ - OMEGA(somatic_mutations, + OMEGA(mutations_in_exons, DEPTHSEXONSCONS.out.subset, MUTPROFILEALL.out.profile, CREATEPANELS.out.exons_consensus_bed.first(), @@ -434,7 +440,7 @@ workflow DEEPCSA{ if (params.omega_multi){ // Omega multi - OMEGAMULTI(somatic_mutations, + OMEGAMULTI(mutations_in_exons, DEPTHSEXONSCONS.out.subset, MUTPROFILEALL.out.profile, CREATEPANELS.out.exons_consensus_bed.first(), @@ -457,7 +463,7 @@ workflow DEEPCSA{ } } if (params.profilenonprot && params.positive_selection_non_protein_affecting){ - OMEGANONPROT(somatic_mutations, + OMEGANONPROT(mutations_in_exons, DEPTHSEXONSCONS.out.subset, MUTPROFILENONPROT.out.profile, CREATEPANELS.out.exons_consensus_bed.first(), @@ -475,7 +481,7 @@ workflow DEEPCSA{ } if (params.omega_multi){ - OMEGANONPROTMULTI(somatic_mutations, + OMEGANONPROTMULTI(mutations_in_exons, DEPTHSEXONSCONS.out.subset, MUTPROFILENONPROT.out.profile, CREATEPANELS.out.exons_consensus_bed.first(), @@ -569,7 +575,7 @@ workflow DEEPCSA{ // DNA2PROTEINMAPPING.out.depths_exons_positions ) - if (params.omega || params.oncodrive3d || params.oncodrivefml || params.indels) { + if (params.omega || params.oncodrive3d || params.oncodrivefml || params.indels || run_mutdensity) { if (params.omega){ positive_selection_results = positive_selection_results.combine(PLOTTINGQC.out.flagged_omegas) }