From 3dd19eff964661d3e53ad12984c823d4d756dd44 Mon Sep 17 00:00:00 2001
From: SamBryce-Smith
Date: Fri, 10 Jan 2025 16:21:30 +0000
Subject: [PATCH 1/3] handle edge case where no putative novel ALEs found

---
 scripts/get_novel_last_exons.py | 90 ++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 41 deletions(-)

diff --git a/scripts/get_novel_last_exons.py b/scripts/get_novel_last_exons.py
index 198d7f0..70018ff 100644
--- a/scripts/get_novel_last_exons.py
+++ b/scripts/get_novel_last_exons.py
@@ -523,6 +523,10 @@ def find_spliced_events(novel_li,
     # Only novel last SJs overlapping ref SJs kept
     novel_spliced = novel_li.join(ref_introns, strandedness="same", suffix=suffix)
 
+    if len(novel_spliced) == 0:
+        # no putative events found, return empty ranges
+        return pr.PyRanges()
+
     eprint(f"Number of putative novel spliced events - {_n_ids(novel_spliced, id_col)}")
 
     # eprint(f"ref exon ranks\n {novel_spliced.as_df()[rank_col].drop_duplicates()}")
@@ -777,47 +781,51 @@ def main(input_gtf_path,
 
     eprint(f"Complete - took {end - start} s")
 
-    # Spliced is last introns, want corresponding last exons in output
-    spliced_le = novel_le.subset(lambda df: df["transcript_id"].isin(set(spliced.transcript_id)))
-
-    # Want to retain metadata from matching reference regions
-    # These coordinates/metadata correspond to matching overlapping ref last intron/SJ
-    spliced = spliced[["transcript_id",
-                       "gene_id_ref",
-                       "transcript_id_ref",
-                       "gene_name", # comes from ref GTF, but not always gene_name in StringTie output ('ref_gene_name') will prefer matching from ref
-                       "Start_ref",
-                       "End_ref",
-                       "event_type"]]
-
-    # Add the _ref suffix so it's obvious the col came from the reference GTF
-    spliced = spliced.apply(lambda df: df.rename(columns={"gene_name": "gene_name_ref"}))
-
-    spliced_cols = spliced.columns.tolist()
-
-    # eprint(spliced_le.columns)
-
-    spliced_le = spliced_le.apply_pair(spliced,
-                                       lambda df, df2:_pd_merge_gr(df,
-                                                                   df2.drop(columns=["Chromosome",
-                                                                                     "Start",
-                                                                                     "End",
-                                                                                     "Strand"
-                                                                                     ]
-                                                                            ),
-                                                                   how="left",
-                                                                   on="transcript_id",
-                                                                   suffixes=[None, "_spl"],
-                                                                   to_merge_cols=spliced_cols),
-                                       )
-    # eprint(spliced_le.columns)
-
-    # Since drop default cols from spliced, should have no cols with suffix
-    # spliced_le = spliced_le.drop(like="_spl$")
-
-    # eprint(spliced_le.columns)
-
-    combined = pr.concat([extensions, spliced_le])
+    if len(spliced) != 0:
+        # Spliced is last introns, want corresponding last exons in output
+        spliced_le = novel_le.subset(lambda df: df["transcript_id"].isin(set(spliced.transcript_id)))
+
+        # Want to retain metadata from matching reference regions
+        # These coordinates/metadata correspond to matching overlapping ref last intron/SJ
+        spliced = spliced[["transcript_id",
+                           "gene_id_ref",
+                           "transcript_id_ref",
+                           "gene_name", # comes from ref GTF, but not always gene_name in StringTie output ('ref_gene_name') will prefer matching from ref
+                           "Start_ref",
+                           "End_ref",
+                           "event_type"]]
+
+        # Add the _ref suffix so it's obvious the col came from the reference GTF
+        spliced = spliced.apply(lambda df: df.rename(columns={"gene_name": "gene_name_ref"}))
+
+        spliced_cols = spliced.columns.tolist()
+
+        # eprint(spliced_le.columns)
+
+        spliced_le = spliced_le.apply_pair(spliced,
+                                           lambda df, df2:_pd_merge_gr(df,
+                                                                       df2.drop(columns=["Chromosome",
+                                                                                         "Start",
+                                                                                         "End",
+                                                                                         "Strand"
+                                                                                         ]
+                                                                                ),
+                                                                       how="left",
+                                                                       on="transcript_id",
+                                                                       suffixes=[None, "_spl"],
+                                                                       to_merge_cols=spliced_cols),
+                                           )
+        # eprint(spliced_le.columns)
+
+        # Since drop default cols from spliced, should have no cols with suffix
+        # spliced_le = spliced_le.drop(like="_spl$")
+
+        # eprint(spliced_le.columns)
+
+        combined = pr.concat([extensions, spliced_le])
+
+    else:
+        combined = extensions
 
     # Finally, collapse metadata/duplicate attribute values for each last exon (transcript ID)
     # This can occur if same last exon matches to multiple reference transcripts/junctions
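Why the early return in patch 1 matters: a strand-aware pyranges join that finds no overlaps comes back as an empty PyRanges, and any later column access on its empty DataFrame raises a KeyError deep inside find_spliced_events. A minimal standalone sketch of that failure mode (toy coordinates and transcript IDs invented for illustration; assumes pyranges 0.x semantics):

    import pyranges as pr

    novel = pr.from_dict({"Chromosome": ["chr1"], "Start": [100], "End": [200],
                          "Strand": ["+"], "transcript_id": ["novel_tx1"]})
    ref = pr.from_dict({"Chromosome": ["chr1"], "Start": [5000], "End": [6000],
                        "Strand": ["+"], "transcript_id": ["ref_tx1"]})

    # the intervals never overlap, so the join yields an empty PyRanges
    joined = novel.join(ref, strandedness="same", suffix="_ref")
    assert len(joined) == 0

    # on an empty PyRanges, as_df() has no columns, so this would raise KeyError:
    # ids = set(joined.as_df()["transcript_id"])
    # hence the guard added in the patch above:
    if len(joined) == 0:
        spliced = pr.PyRanges()  # empty but type-stable

Returning an empty pr.PyRanges() rather than None keeps the return type uniform, so main only needs the cheap len(spliced) != 0 branch before calling pr.concat.
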
From c6d8406891cf1836f35f0f98b24dbd669682776e Mon Sep 17 00:00:00 2001
From: SamBryce-Smith
Date: Wed, 15 Jan 2025 14:13:37 +0000
Subject: [PATCH 2/3] bugfix: return empty objects when no extensions found

---
 scripts/get_novel_last_exons.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/scripts/get_novel_last_exons.py b/scripts/get_novel_last_exons.py
index 70018ff..4b6333d 100644
--- a/scripts/get_novel_last_exons.py
+++ b/scripts/get_novel_last_exons.py
@@ -449,9 +449,21 @@ def find_extension_events(novel_le,
                                                suffix
                                                )
                             )
-    post_tol_ids = set(novel_le_ext.as_df()[id_col])
+
+    try:
+        post_tol_ids = set(novel_le_ext.as_df()[id_col])
+    except KeyError:
+        # empty df/no extensions found
+        post_tol_ids = set()
 
     eprint(f"After 5'end match tolerance filter, number of events - {len(post_tol_ids)}")
 
+    if len(post_tol_ids) == 0:
+        # no extensions found, return empty pyranges
+        if return_filtered_ids:
+            # return tuple of empty gr & empty sets
+            return pr.PyRanges(), set(), set()
+        else:
+            return pr.PyRanges()
     if return_filtered_ids:
         end_5p_filt_ids = post_tol_ids - post_len_ids

From ca8644bacc60749eba85d9143c4d2af3af93f923 Mon Sep 17 00:00:00 2001
From: SamBryce-Smith
Date: Wed, 15 Jan 2025 14:29:25 +0000
Subject: [PATCH 3/3] bugfix: handle edge case where no ids fail the 3p end filters

---
 scripts/filter_tx_by_three_end.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/scripts/filter_tx_by_three_end.py b/scripts/filter_tx_by_three_end.py
index b2c310c..f1970b6 100644
--- a/scripts/filter_tx_by_three_end.py
+++ b/scripts/filter_tx_by_three_end.py
@@ -720,6 +720,7 @@ def main(gtf_path,
     # eprint(le_combined_pass.columns)
 
     pass_ids = set(le_combined_pass.as_df()[le_id_col])
+    eprint(f"Number of events passing database/PAS motif filters - {len(pass_ids)}")
 
     #6. Generate 'match_stats' dataframe plus summary counts dfs
     # Subset to df of Tx_id, atlas_filter, motif_filter, atlas_distance & motifs_found
@@ -736,11 +737,16 @@ def main(gtf_path,
                "event_type"
                ]
 
-    fail_match_stats = (le.subset(lambda df: ~df[le_id_col].isin(pass_ids))
-                        .as_df()
-                        [ms_cols]
-                        .drop_duplicates(subset=["transcript_id"])
-                        )
+    fail_match_stats = le.subset(lambda df: ~df[le_id_col].isin(pass_ids))
+    # Subset to minimal output columns
+    if len(fail_match_stats) > 0:
+        fail_match_stats = (fail_match_stats
+                            .as_df()
+                            [ms_cols]
+                            .drop_duplicates(subset=["transcript_id"])
+                            )
+    else:
+        fail_match_stats = pd.DataFrame(columns=ms_cols)
 
     pass_match_stats = le_combined_pass.as_df()[ms_cols]
 
@@ -906,7 +912,7 @@ def main(gtf_path,
         pas_motifs = gruber_pas_motifs
 
     elif args.motifs.capitalize() == "Beaudoing":
-        pas_motifs = Beaudoing_pas_motifs
+        pas_motifs = beaudoing_pas_motifs
 
     elif os.path.exists(args.motifs):
         eprint(f"reading in pas motifs from file - {args.motifs}")
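Patches 2 and 3 apply the same defensive pattern in two more places: when a filtering step selects nothing, hand back an empty object that still carries the expected shape (an empty PyRanges plus empty ID sets in find_extension_events; a zero-row DataFrame with the ms_cols schema in filter_tx_by_three_end.py) rather than letting a KeyError escape from an empty frame. A small sketch of the DataFrame half of the pattern — the helper name subset_fail and the trimmed-down ms_cols list are illustrative stand-ins, not the repo's API:

    import pandas as pd

    ms_cols = ["transcript_id", "atlas_filter", "motif_filter"]

    def subset_fail(df: pd.DataFrame, pass_ids: set) -> pd.DataFrame:
        # keep rows whose IDs did NOT pass the 3' end filters
        fail = df[~df["transcript_id"].isin(pass_ids)]
        if len(fail) > 0:
            return fail[ms_cols].drop_duplicates(subset=["transcript_id"])
        # edge case: every ID passed -> zero rows, but keep the schema
        return pd.DataFrame(columns=ms_cols)

    df = pd.DataFrame({"transcript_id": ["tx1"],
                       "atlas_filter": [True],
                       "motif_filter": [True]})
    out = subset_fail(df, pass_ids={"tx1"})
    assert out.empty and list(out.columns) == ms_cols

Preserving the column schema in the zero-row case means downstream [ms_cols] indexing, concatenation, and table writing behave identically whether or not any transcripts failed.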