From da37737e71b200ef348ed9fe105c366d20be04b3 Mon Sep 17 00:00:00 2001
From: Austin Wang
Date: Fri, 11 Apr 2025 11:34:22 -0700
Subject: [PATCH 01/39] Vectorize report graphics
---
pyproject.toml | 2 +-
src/finemo/evaluation.py | 35 ++++++++++++++++++++++----------
src/finemo/main.py | 6 ++----
src/finemo/templates/report.html | 20 +++++++++---------
4 files changed, 37 insertions(+), 26 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 111f8c9..0b33113 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
name = "finemo"
description = "Identification of regulatory elements from neural network contribution scores for DNA."
keywords = ["deep learning", "genomics"]
-version = "0.30"
+version = "0.31"
readme = "README.md"
license = {file = "LICENSE"}
authors = [
diff --git a/src/finemo/evaluation.py b/src/finemo/evaluation.py
index 8fecafc..a121fb5 100644
--- a/src/finemo/evaluation.py
+++ b/src/finemo/evaluation.py
@@ -77,8 +77,10 @@ def plot_hit_distributions(occ_df, motif_names, plot_dir):
y[unique] = freq
ax.bar(x, y)
- output_path = os.path.join(motifs_dir, f"{m}.png")
- plt.savefig(output_path, dpi=300)
+ output_path_png = os.path.join(motifs_dir, f"{m}.png")
+ plt.savefig(output_path_png, dpi=300)
+ output_path_svg = os.path.join(motifs_dir, f"{m}.svg")
+ plt.savefig(output_path_svg)
plt.close(fig)
@@ -95,13 +97,15 @@ def plot_hit_distributions(occ_df, motif_names, plot_dir):
ax.set_xlabel("Motifs per peak")
ax.set_ylabel("Frequency")
- output_path = os.path.join(plot_dir, "total_hit_distribution.png")
- plt.savefig(output_path, dpi=300)
+ output_path_png = os.path.join(plot_dir, "total_hit_distribution.png")
+ plt.savefig(output_path_png, dpi=300)
+ output_path_svg = os.path.join(plot_dir, "total_hit_distribution.svg")
+ plt.savefig(output_path_svg, dpi=300)
plt.close(fig)
-def plot_peak_motif_indicator_heatmap(peak_hit_counts, motif_names, output_path):
+def plot_peak_motif_indicator_heatmap(peak_hit_counts, motif_names, output_dir):
"""
Plots a simple indicator heatmap of the motifs in each peak.
"""
@@ -122,7 +126,10 @@ def plot_peak_motif_indicator_heatmap(peak_hit_counts, motif_names, output_path)
ax.set_xlabel("Motif i")
ax.set_ylabel("Motif j")
- plt.savefig(output_path, dpi=300)
+ output_path_png = os.path.join(output_dir, "motif_cooocurrence.png")
+ plt.savefig(output_path_png, dpi=300)
+ output_path_svg = os.path.join(output_dir, "motif_cooocurrence.svg")
+ plt.savefig(output_path_svg)
plt.close()
@@ -393,8 +400,6 @@ def plot_cwms(cwms, trim_bounds, out_dir, alphabet=LOGO_ALPHABET, colors=LOGO_CO
motif_dir = os.path.join(out_dir, m)
os.makedirs(motif_dir, exist_ok=True)
for cwm_type, cwm in v.items():
- output_path = os.path.join(motif_dir, f"{cwm_type}.png")
-
fig, ax = plt.subplots(figsize=(10,2))
plot_logo(ax, cwm, alphabet, colors=colors, font_props=font, shade_bounds=trim_bounds[m][cwm_type])
@@ -402,11 +407,15 @@ def plot_cwms(cwms, trim_bounds, out_dir, alphabet=LOGO_ALPHABET, colors=LOGO_CO
for name, spine in ax.spines.items():
spine.set_visible(False)
- plt.savefig(output_path, dpi=100)
+ output_path_png = os.path.join(motif_dir, f"{cwm_type}.png")
+ plt.savefig(output_path_png, dpi=100)
+ output_path_svg = os.path.join(motif_dir, f"{cwm_type}.svg")
+ plt.savefig(output_path_svg)
+
plt.close(fig)
-def plot_hit_vs_seqlet_counts(recall_data, output_path):
+def plot_hit_vs_seqlet_counts(recall_data, output_dir):
x = []
y = []
m = []
@@ -430,7 +439,11 @@ def plot_hit_vs_seqlet_counts(recall_data, output_path):
ax.set_xlabel("Hits per motif")
ax.set_ylabel("Seqlets per motif")
- plt.savefig(output_path, dpi=300)
+ output_path_png = os.path.join(output_dir, "hit_vs_seqlet_counts.png")
+ plt.savefig(output_path_png, dpi=300)
+ output_path_svg = os.path.join(output_dir, "hit_vs_seqlet_counts.svg")
+ plt.savefig(output_path_svg)
+
plt.close()
diff --git a/src/finemo/main.py b/src/finemo/main.py
index 77ce256..fd031fd 100644
--- a/src/finemo/main.py
+++ b/src/finemo/main.py
@@ -216,8 +216,7 @@ def report(regions_path, hits_dir, modisco_h5_path, peaks_path, motifs_include_p
evaluation.plot_hit_distributions(occ_df, motif_names, out_dir)
- coooc_path = os.path.join(out_dir, "motif_cooocurrence.png")
- evaluation.plot_peak_motif_indicator_heatmap(coooc, motif_names, coooc_path)
+ evaluation.plot_peak_motif_indicator_heatmap(coooc, motif_names, out_dir)
plot_dir = os.path.join(out_dir, "CWMs")
evaluation.plot_cwms(cwms, trim_bounds, plot_dir)
@@ -227,8 +226,7 @@ def report(regions_path, hits_dir, modisco_h5_path, peaks_path, motifs_include_p
seqlets_path = os.path.join(out_dir, "seqlets.tsv")
data_io.write_modisco_seqlets(seqlets_df, seqlets_path)
- plot_path = os.path.join(out_dir, "hit_vs_seqlet_counts.png")
- evaluation.plot_hit_vs_seqlet_counts(report_data, plot_path)
+ evaluation.plot_hit_vs_seqlet_counts(report_data, out_dir)
report_path = os.path.join(out_dir, "report.html")
evaluation.write_report(report_df, motif_names, report_path, compute_recall, seqlets_df is not None)
diff --git a/src/finemo/templates/report.html b/src/finemo/templates/report.html
index dff2f92..2b58a7e 100644
--- a/src/finemo/templates/report.html
+++ b/src/finemo/templates/report.html
@@ -201,7 +201,7 @@ Hit vs. seqlet counts
The dashed line is the identity line.
When comparing a shared set of regions, the hit counts should be mostly greater than the corresponding seqlet counts, since TF-MoDISco stringently filters seqlets and usually uses a smaller input window.
-
+
{% endif %}
Hit and seqlet motif comparisons
@@ -309,13 +309,13 @@ Hit and seqlet motif comparisons
{{ item.num_seqlets_only }} |
{{ item.num_hits_restricted_only }} |
{% endif %}
-  |
-  |
-  |
-  |
+  |
+  |
+  |
+  |
{% if compute_recall %}
-  |
-  |
+  |
+  |
{% endif %}
{% endfor %}
@@ -332,7 +332,7 @@ Overall distribution of hits per peak
This plot shows the distribution of hit counts per peak for any motif.
The number of peaks with no hits should be near zero.
-
+
Per-motif distributions of hits per peak
@@ -349,7 +349,7 @@
Per-motif distributions of hits per peak
{% for m in motif_names %}
{{ m }} |
-  |
+  |
{% endfor %}
@@ -361,7 +361,7 @@ Motif co-occurrence
The color intensity here represents the cosine similarity between the motifs' occurrence across peaks,
where occurrence is defined as the presence of a hit for a motif in a peak.
-
+