From 89b9ed94970e5f35c0af4a3ce512abb5790e7125 Mon Sep 17 00:00:00 2001 From: FrancesBlow <63001177+FrancesBlow@users.noreply.github.com> Date: Fri, 4 Jul 2025 10:46:42 +0100 Subject: [PATCH 1/3] Add files via upload Python script to rescale mapping scores to 1000 per gene (NB only rescales junctions that map to a single gene) --- scripts/rescale_by_gene.py | 154 +++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 scripts/rescale_by_gene.py diff --git a/scripts/rescale_by_gene.py b/scripts/rescale_by_gene.py new file mode 100644 index 0000000..59bea8f --- /dev/null +++ b/scripts/rescale_by_gene.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python +# coding: utf-8 + +# This is a script to rescale scores in a BED file (output from regtools) to 1000 per gene. +# Uses Python env 'venv', see scripts/Python_env_MARS.txt + +# In[ ]: + + +import gffutils +import pandas as pd +import pyranges as pr +import matplotlib.pyplot as plt + + +# In[ ]: + + +# File paths +gff_file = "/mnt/data/project0061/ref/schistosoma_mansoni.PRJEA36577.WBPS19.annotations.gff3" +db_file = "annotations.db" +bed_file = "output.junctions.bed" + + +# In[ ]: + + +# Step 1: Create or load gffutils database +try: + db = gffutils.FeatureDB(db_file) + print("Loaded existing gffutils database.") +except Exception: + print("Creating gffutils database, this may take a while...") + db = gffutils.create_db( + gff_file, + dbfn=db_file, + force=True, + keep_order=True, + merge_strategy="merge", + sort_attribute_values=True, + ) + print("Database created.") + + +# In[ ]: + + +# Step 2: Extract gene features into a DataFrame +genes = [] +for gene in db.features_of_type("gene"): + genes.append({ + "Chromosome": gene.chrom, + "Start": gene.start - 1, # Convert to 0-based for BED + "End": gene.end, + "Strand": gene.strand, + "Gene": gene.id + }) + +genes_df = pd.DataFrame(genes) +genes_pr = pr.PyRanges(genes_df) + + +# In[ ]: + + +# Step 3: Load your BED file into PyRanges +bed = pr.read_bed(bed_file) +bed_df = bed.df.copy() +bed_df["original_index"] = bed_df.index # to track rows +bed = pr.PyRanges(bed_df) + + +# In[ ]: + + +# Step 4: Join BED with genes and count overlaps +joined = bed.join(genes_pr) +joined_df = joined.df + +# Count unique gene overlaps per original BED entry +gene_counts = joined_df.groupby("original_index")["Gene"].nunique() + +# Keep only those overlapping **exactly one** gene +keep_indices = gene_counts[gene_counts == 1].index + +# Filter original join to those rows +filtered_df = joined_df[joined_df["original_index"].isin(keep_indices)] + + +# In[ ]: + + +# Step 5: Plot histogram of gene overlaps per original BED entry +plt.figure(figsize=(8, 5)) +plt.hist(gene_counts, bins=range(1, gene_counts.max() + 2), color='steelblue', edgecolor='black', align='left') +plt.xlabel("Number of genes overlapped per BED entry") +plt.ylabel("Number of BED entries") +plt.title("Histogram of Gene Overlaps per BED Entry") +plt.xticks(range(1, gene_counts.max() + 1)) +plt.grid(axis='y', linestyle='--', alpha=0.5) +plt.tight_layout() +plt.savefig("bed_gene_overlap_histogram.png", dpi=300) +plt.show() + + +# In[ ]: + + +# Step 6: Rescale scores per gene +filtered_df = filtered_df.copy() +filtered_df.loc[:, "Score"] = pd.to_numeric(filtered_df["Score"], errors="coerce").fillna(0) + +def scale_scores(group): + scores = group["Score"].astype(float) + total = scores.sum() + if total == 0: + group["Score"] = 0.0 + else: + scaled = (scores / total) * 1000 + group["Score"] = scaled.round(2) + return group + +# Apply the scaling function per gene +scaled_df = ( + filtered_df + .copy() + .groupby("Gene", group_keys=False) + .apply(scale_scores) +) + +# Round and convert scores to integer +scaled_df["Score"] = scaled_df["Score"].round(2).astype(int) + + +# In[ ]: + + +# Step 7: Map scaled scores back to original BED rows +original_bed = bed.df.set_index("original_index") +scaled_df = scaled_df.set_index("original_index") +original_bed["Score"] = original_bed["Score"].astype(float) +original_bed.loc[scaled_df.index, "Score"] = scaled_df["Score"] + +# Clean up +output_df = original_bed.reset_index(drop=True) + + +# In[ ]: + + +# Step 8: Save to BED file +bed_cols = [col for col in output_df.columns if col not in ["Gene"]] +output_df[bed_cols].to_csv("scaled_output.filtered.bed", sep="\t", header=False, index=False) + From e405a0ad75cac75f943950be324b57c018b1b9b6 Mon Sep 17 00:00:00 2001 From: FrancesBlow <63001177+FrancesBlow@users.noreply.github.com> Date: Fri, 4 Jul 2025 10:48:32 +0100 Subject: [PATCH 2/3] Add files via upload Updated installation requirements in Python environment to rescale scores by gene in BED file. --- docs/Python_env_MARS.txt | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/Python_env_MARS.txt b/docs/Python_env_MARS.txt index e4446f6..c475c20 100644 --- a/docs/Python_env_MARS.txt +++ b/docs/Python_env_MARS.txt @@ -19,8 +19,24 @@ pip install pysam #Install jupyter pip install jupyter +#Install pyranges +pip install pyranges + +#Install gffutils +pip install gffutils + +#Install matplotlib +pip install matplotlib + +##To run Python script on MARS +#Convert jupyter notebook to python script +jupyter nbconvert --to python regtools_bed_by_celltype.ipynb #This produces regtools_bed_by_celltype.py (also in scripts folder) + +#Run the script +python3 regtools_bed_by_celltype.py + #Convert jupyter notebook to python script -jupyter nbconvert --to script regtools_bed_by_celltype.ipynb #This produces regtools_bed_by_celltype.py (also in scripts folder) +jupyter nbconvert --to python rescale_by_gene_v2.ipynb #This produces rescale_by_gene.py (also in scripts folder) #Run the script -python3 regtools_bed_by_celltype.py \ No newline at end of file +python3 rescale_by_gene_v2.py \ No newline at end of file From f2e4392c652d67cee5eb23d8aa7b79f87d5212af Mon Sep 17 00:00:00 2001 From: FrancesBlow <63001177+FrancesBlow@users.noreply.github.com> Date: Fri, 4 Jul 2025 10:50:54 +0100 Subject: [PATCH 3/3] Add files via upload Fixed a typo in the Python_env_MARS.txt file (wrong name of rescale by gene script). --- docs/Python_env_MARS.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Python_env_MARS.txt b/docs/Python_env_MARS.txt index c475c20..37d2304 100644 --- a/docs/Python_env_MARS.txt +++ b/docs/Python_env_MARS.txt @@ -36,7 +36,7 @@ jupyter nbconvert --to python regtools_bed_by_celltype.ipynb #This produces regt python3 regtools_bed_by_celltype.py #Convert jupyter notebook to python script -jupyter nbconvert --to python rescale_by_gene_v2.ipynb #This produces rescale_by_gene.py (also in scripts folder) +jupyter nbconvert --to python rescale_by_gene.ipynb #This produces rescale_by_gene.py (also in scripts folder) #Run the script -python3 rescale_by_gene_v2.py \ No newline at end of file +python3 rescale_by_gene.py \ No newline at end of file