diff --git a/bin/depth_group_comparison.py b/bin/depth_group_comparison.py new file mode 100755 index 00000000..7d62184e --- /dev/null +++ b/bin/depth_group_comparison.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python + +""" +Plot depth of all genes and specific genes per sample groups. + +This script plots the average depth at all consensus exons across all genes and for specific genes of interest, either in the panel or in a custom subset of genes, stratified by sample groups defined in the metadata. +The output is stored in a pdf file. +""" + +import click +import pandas as pd +from utils_plot import plots_general_config +import matplotlib.pyplot as plt +import matplotlib as mpl +from matplotlib.backends.backend_pdf import PdfPages +import seaborn as sns +import ast +import re + +mpl.rcParams.update({ + 'axes.titlesize': plots_general_config["title_fontsize"], + 'axes.labelsize': plots_general_config["xylabel_fontsize"], + 'xtick.labelsize': plots_general_config["xyticks_fontsize"], + 'ytick.labelsize': plots_general_config["xyticks_fontsize"], + 'figure.titlesize': plots_general_config["title_fontsize"], +}) + +separator2character = { + 'tab' : '\t', + 'comma' : ',' +} + + +def plot_depth_per_group(df, group_col, data_type, pdf): + ''' + Function to plot depth within a group of samples in all genes or a specific subset of genes + ''' + + col_name = group_col[0] if isinstance(group_col, list) else group_col + + # Get the number of unique categories to plot + num_categories = df[col_name].nunique() + 2 + plt.figure(figsize=(num_categories, 4)) + + if data_type == 'ALL_GENES': + plot_df = df[df['GENE'] == 'ALL_GENES'] + title = f"Average Depth for ALL_GENES in {col_name} group" + + else: # data_type is the Gene Name + plot_df = df[df['GENE'] == data_type] + title = f"Average Depth for {data_type} in {col_name} group" + + if plot_df.empty: + print(f"No data available for {data_type} in {col_name} group. Skipping plot.") + return + + sns.boxplot(data=plot_df, x=col_name, y="MEAN_GENE_DEPTH", hue=col_name, showfliers=False, showmeans=False, legend=False) + sns.stripplot(data=plot_df, x=col_name, y="MEAN_GENE_DEPTH", color='grey', alpha=0.5, size=4, legend=False) + + plt.title(title, fontsize=plots_general_config["title_fontsize"]) + plt.xlabel('', fontsize=plots_general_config["xylabel_fontsize"]) + plt.ylabel(f"Average Cons Exons Depth", fontsize=plots_general_config["xylabel_fontsize"]) + plt.yticks( fontsize=plots_general_config["yticks_fontsize"]) + plt.xticks(fontsize=plots_general_config["xticks_fontsize"]) + plt.tick_params(axis='x', rotation=90) + plt.tight_layout() + pdf.savefig() + plt.close() + plt.show() + + return + + +@click.command() +@click.option('--table-filename', required=True, type=click.Path(exists=True), help='Input features table file') +@click.option('--depth-gene-sample', required=True, type=click.Path(exists=True), help='Input depth file per gene per sample') +@click.option('--depth-sample', required=True, type=click.Path(exists=True), help='Input depth file per sample') +@click.option('--separator', required=True, type=click.Choice(['tab', 'comma']), help='Separator used in features table: tab or comma') +@click.option('--unique-identifier', default=None, type=str, help='Unique identifier column name') +@click.option('--groups', required=True, type=str, help='List of columns with grouping information') +@click.option('--custom-genes', required=False, type=str, help='Comma separated list of custom genes') + + +def main(table_filename, depth_gene_sample, depth_sample, unique_identifier, separator, groups, custom_genes): + + sep_char = separator2character[separator] + uniq_name = unique_identifier if unique_identifier else "sample" + + # Read tables + features_table = pd.read_table(table_filename, header=0, sep=sep_char) + depth_genes_samples = pd.read_table(depth_gene_sample, header=0, sep="\t") + depth_per_sample = pd.read_table(depth_sample, header=0, sep="\t") + + # Process panel genes + panel_genes = sorted(set(depth_genes_samples['GENE'].unique())) + if custom_genes: + print(f'Custom genes provided, plotting custom genes only: {custom_genes}') + custom_gene_list = [g.strip() for g in custom_genes.split(",")] + panel_genes = sorted(set(custom_gene_list) & set(panel_genes)) + + else: + print(f'No custom genes provided, plotting all genes in the panel: {panel_genes}') + + + # Process depth per sample to add the 'ALL_GENES' depth value per sample + print('Processing per sample depth table to add the ALL_GENES depth column: ') + depth_per_sample['GENE'] = 'ALL_GENES' + depth_per_sample = depth_per_sample.rename(columns={'avg_depth_sample': 'MEAN_GENE_DEPTH'}) + + print('Depth per sample table after adding ALL_GENES value in GENES column:') + print(depth_per_sample.head()) + + depth_genes_samples = pd.concat([depth_genes_samples, depth_per_sample], ignore_index=True) + print('Added ALL_GENES depth values to depth genes samples table:') + print('Length of table:', len(depth_genes_samples)) + print(depth_genes_samples.head()) + + # Merge data with metadata table to get the group information for each sample + merged_depth_df = pd.merge(features_table, depth_genes_samples, how='left', left_on=uniq_name, right_on='SAMPLE_ID') + print('Merged depth and metadata table:') + print(merged_depth_df.head()) + print('Length of merged table:', len(merged_depth_df)) + + # Process groups + # First clean the string (adds quotes if the shell stripped them) + cleaned = re.sub(r'(? versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + touch depth_group_comparison.plot_depth_per_group.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 4faa435f..ffc7f647 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,6 +17,7 @@ params { features_table_separator = 'comma' features_unique_identifier = null features_groups_list = null + features_genes_list = null custom_groups = false custom_groups_file = null diff --git a/workflows/deepcsa.nf b/workflows/deepcsa.nf index 7ad9dfb7..30a57d08 100644 --- a/workflows/deepcsa.nf +++ b/workflows/deepcsa.nf @@ -101,6 +101,8 @@ include { TABLE_2_GROUP as TABLE2GROUP } from '../m include { ANNOTATE_DEPTHS as ANNOTATEDEPTHS } from '../modules/local/annotatedepth/main' include { DOWNSAMPLE_DEPTHS as DOWNSAMPLEDEPTHS } from '../modules/local/downsample/depths/main' +include { ANALYZE_DEPTHS_GROUPS as ANALYZEDEPTHSGROUPS } from '../modules/local/analyzedepths/main' + include { SELECT_MUTDENSITIES as SYNMUTDENSITY } from '../modules/local/select_mutdensity/main' include { SELECT_MUTDENSITIES as SYNMUTREADSDENSITY } from '../modules/local/select_mutdensity/main' @@ -227,6 +229,11 @@ workflow DEEPCSA{ } PLOTDEPTHSEXONSCONS(ANNOTATEDEPTHS.out.all_samples_depths, CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel) + // ANALYZEDEPTHSGROUPS should run only when user defines a group list + if (params.features_groups_list) { + ANALYZEDEPTHSGROUPS(features_table, PLOTDEPTHSEXONSCONS.out.average_depth_gene_sample) + } + // Enrich regions in consensus panels ENRICHPANELS(MUT_PREPROCESSING.out.mutations_all_samples, ANNOTATEDEPTHS.out.all_samples_depths,