From 3dd19eff964661d3e53ad12984c823d4d756dd44 Mon Sep 17 00:00:00 2001
From: SamBryce-Smith
Date: Fri, 10 Jan 2025 16:21:30 +0000
Subject: [PATCH 1/3] handle edge case where no putative novel ALEs found

---
 scripts/get_novel_last_exons.py | 90 ++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 41 deletions(-)

diff --git a/scripts/get_novel_last_exons.py b/scripts/get_novel_last_exons.py
index 198d7f0..70018ff 100644
--- a/scripts/get_novel_last_exons.py
+++ b/scripts/get_novel_last_exons.py
@@ -523,6 +523,10 @@ def find_spliced_events(novel_li,
     # Only novel last SJs overlapping ref SJs kept
     novel_spliced = novel_li.join(ref_introns, strandedness="same", suffix=suffix)
 
+    if len(novel_spliced) == 0:
+        # no putative events found, return empty ranges
+        return pr.PyRanges()
+
     eprint(f"Number of putative novel spliced events - {_n_ids(novel_spliced, id_col)}")
 
     # eprint(f"ref exon ranks\n {novel_spliced.as_df()[rank_col].drop_duplicates()}")
@@ -777,47 +781,51 @@ def main(input_gtf_path,
 
     eprint(f"Complete - took {end - start} s")
 
-    # Spliced is last introns, want corresponding last exons in output
-    spliced_le = novel_le.subset(lambda df: df["transcript_id"].isin(set(spliced.transcript_id)))
-
-    # Want to retain metadata from matching reference regions
-    # These coordinates/metadata correspond to matching overlapping ref last intron/SJ
-    spliced = spliced[["transcript_id",
-                       "gene_id_ref",
-                       "transcript_id_ref",
-                       "gene_name", # comes from ref GTF, but not always gene_name in StringTie output ('ref_gene_name') will prefer matching from ref
-                       "Start_ref",
-                       "End_ref",
-                       "event_type"]]
-
-    # Add the _ref suffix so it's obvious the col came from the reference GTF
-    spliced = spliced.apply(lambda df: df.rename(columns={"gene_name": "gene_name_ref"}))
-
-    spliced_cols = spliced.columns.tolist()
-
-    # eprint(spliced_le.columns)
-
-    spliced_le = spliced_le.apply_pair(spliced,
-                                       lambda df, df2:_pd_merge_gr(df,
-                                                                   df2.drop(columns=["Chromosome",
-                                                                                     "Start",
-                                                                                     "End",
-                                                                                     "Strand"
-                                                                                     ]
-                                                                            ),
-                                                                   how="left",
-                                                                   on="transcript_id",
-                                                                   suffixes=[None, "_spl"],
-                                                                   to_merge_cols=spliced_cols),
-                                       )
-    # eprint(spliced_le.columns)
-
-    # Since drop default cols from spliced, should have no cols with suffix
-    # spliced_le = spliced_le.drop(like="_spl$")
-
-    # eprint(spliced_le.columns)
-
-    combined = pr.concat([extensions, spliced_le])
+    if len(spliced) != 0:
+        # Spliced is last introns, want corresponding last exons in output
+        spliced_le = novel_le.subset(lambda df: df["transcript_id"].isin(set(spliced.transcript_id)))
+
+        # Want to retain metadata from matching reference regions
+        # These coordinates/metadata correspond to matching overlapping ref last intron/SJ
+        spliced = spliced[["transcript_id",
+                           "gene_id_ref",
+                           "transcript_id_ref",
+                           "gene_name", # comes from ref GTF, but not always gene_name in StringTie output ('ref_gene_name') will prefer matching from ref
+                           "Start_ref",
+                           "End_ref",
+                           "event_type"]]
+
+        # Add the _ref suffix so it's obvious the col came from the reference GTF
+        spliced = spliced.apply(lambda df: df.rename(columns={"gene_name": "gene_name_ref"}))
+
+        spliced_cols = spliced.columns.tolist()
+
+        # eprint(spliced_le.columns)
+
+        spliced_le = spliced_le.apply_pair(spliced,
+                                           lambda df, df2:_pd_merge_gr(df,
+                                                                       df2.drop(columns=["Chromosome",
+                                                                                         "Start",
+                                                                                         "End",
+                                                                                         "Strand"
+                                                                                         ]
+                                                                                ),
+                                                                       how="left",
+                                                                       on="transcript_id",
+                                                                       suffixes=[None, "_spl"],
+                                                                       to_merge_cols=spliced_cols),
+                                           )
+        # eprint(spliced_le.columns)
+
+        # Since drop default cols from spliced, should have no cols with suffix
+        # spliced_le = spliced_le.drop(like="_spl$")
+
+        # eprint(spliced_le.columns)
+
+        combined = pr.concat([extensions, spliced_le])
+
+    else:
+        combined = extensions
 
     # Finally, collapse metadata/duplicate attribute values for each last exon (transcript ID)
     # This can occur if same last exon matches to multiple reference transcripts/junctions
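Why the early return in patch 1 matters: a strand-aware pyranges join that finds no overlaps comes back as an empty PyRanges, and any later column access on its empty DataFrame raises a KeyError deep inside find_spliced_events. A minimal standalone sketch of that failure mode (toy coordinates and transcript IDs invented for illustration; assumes pyranges 0.x semantics):

    import pyranges as pr

    novel = pr.from_dict({"Chromosome": ["chr1"], "Start": [100], "End": [200],
                          "Strand": ["+"], "transcript_id": ["novel_tx1"]})
    ref = pr.from_dict({"Chromosome": ["chr1"], "Start": [5000], "End": [6000],
                        "Strand": ["+"], "transcript_id": ["ref_tx1"]})

    # the intervals never overlap, so the join yields an empty PyRanges
    joined = novel.join(ref, strandedness="same", suffix="_ref")
    assert len(joined) == 0

    # on an empty PyRanges, as_df() has no columns, so this would raise KeyError:
    # ids = set(joined.as_df()["transcript_id"])
    # hence the guard added in the patch above:
    if len(joined) == 0:
        spliced = pr.PyRanges()  # empty but type-stable

Returning an empty pr.PyRanges() rather than None keeps the return type uniform, so main only needs the cheap len(spliced) != 0 branch before calling pr.concat.
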
From c6d8406891cf1836f35f0f98b24dbd669682776e Mon Sep 17 00:00:00 2001
From: SamBryce-Smith
Date: Wed, 15 Jan 2025 14:13:37 +0000
Subject: [PATCH 2/3] bugfix: return empty objects when no extensions found

---
 scripts/get_novel_last_exons.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/scripts/get_novel_last_exons.py b/scripts/get_novel_last_exons.py
index 70018ff..4b6333d 100644
--- a/scripts/get_novel_last_exons.py
+++ b/scripts/get_novel_last_exons.py
@@ -449,9 +449,21 @@ def find_extension_events(novel_le,
                                                suffix
                                                )
                             )
-    post_tol_ids = set(novel_le_ext.as_df()[id_col])
+
+    try:
+        post_tol_ids = set(novel_le_ext.as_df()[id_col])
+    except KeyError:
+        # empty df/no extensions found
+        post_tol_ids = set()
 
     eprint(f"After 5'end match tolerance filter, number of events - {len(post_tol_ids)}")
 
+    if len(post_tol_ids) == 0:
+        # no extensions found, return empty pyranges
+        if return_filtered_ids:
+            # return tuple of empty gr & empty sets
+            return pr.PyRanges(), set(), set()
+        else:
+            return pr.PyRanges()
     if return_filtered_ids:
         end_5p_filt_ids = post_tol_ids - post_len_ids

From ca8644bacc60749eba85d9143c4d2af3af93f923 Mon Sep 17 00:00:00 2001
From: SamBryce-Smith
Date: Wed, 15 Jan 2025 14:29:25 +0000
Subject: [PATCH 3/3] bugfix: handle edge case where no ids fail the 3p end filters

---
 scripts/filter_tx_by_three_end.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/scripts/filter_tx_by_three_end.py b/scripts/filter_tx_by_three_end.py
index b2c310c..f1970b6 100644
--- a/scripts/filter_tx_by_three_end.py
+++ b/scripts/filter_tx_by_three_end.py
@@ -720,6 +720,7 @@ def main(gtf_path,
     # eprint(le_combined_pass.columns)
 
     pass_ids = set(le_combined_pass.as_df()[le_id_col])
+    eprint(f"Number of events passing database/PAS motif filters - {len(pass_ids)}")
 
     #6. Generate 'match_stats' dataframe plus summary counts dfs
     # Subset to df of Tx_id, atlas_filter, motif_filter, atlas_distance & motifs_found
@@ -736,11 +737,16 @@ def main(gtf_path,
                "event_type"
                ]
 
-    fail_match_stats = (le.subset(lambda df: ~df[le_id_col].isin(pass_ids))
-                        .as_df()
-                        [ms_cols]
-                        .drop_duplicates(subset=["transcript_id"])
-                        )
+    fail_match_stats = le.subset(lambda df: ~df[le_id_col].isin(pass_ids))
+    # Subset to minimal output columns
+    if len(fail_match_stats) > 0:
+        fail_match_stats = (fail_match_stats
+                            .as_df()
+                            [ms_cols]
+                            .drop_duplicates(subset=["transcript_id"])
+                            )
+    else:
+        fail_match_stats = pd.DataFrame(columns=ms_cols)
 
     pass_match_stats = le_combined_pass.as_df()[ms_cols]
 
@@ -906,7 +912,7 @@ def main(gtf_path,
         pas_motifs = gruber_pas_motifs
 
     elif args.motifs.capitalize() == "Beaudoing":
-        pas_motifs = Beaudoing_pas_motifs
+        pas_motifs = beaudoing_pas_motifs
 
     elif os.path.exists(args.motifs):
         eprint(f"reading in pas motifs from file - {args.motifs}")
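Patches 2 and 3 apply the same defensive pattern in two more places: when a filtering step selects nothing, hand back an empty object that still carries the expected shape (an empty PyRanges plus empty ID sets in find_extension_events; a zero-row DataFrame with the ms_cols schema in filter_tx_by_three_end.py) rather than letting a KeyError escape from an empty frame. A small sketch of the DataFrame half of the pattern — the helper name subset_fail and the trimmed-down ms_cols list are illustrative stand-ins, not the repo's API:

    import pandas as pd

    ms_cols = ["transcript_id", "atlas_filter", "motif_filter"]

    def subset_fail(df: pd.DataFrame, pass_ids: set) -> pd.DataFrame:
        # keep rows whose IDs did NOT pass the 3' end filters
        fail = df[~df["transcript_id"].isin(pass_ids)]
        if len(fail) > 0:
            return fail[ms_cols].drop_duplicates(subset=["transcript_id"])
        # edge case: every ID passed -> zero rows, but keep the schema
        return pd.DataFrame(columns=ms_cols)

    df = pd.DataFrame({"transcript_id": ["tx1"],
                       "atlas_filter": [True],
                       "motif_filter": [True]})
    out = subset_fail(df, pass_ids={"tx1"})
    assert out.empty and list(out.columns) == ms_cols

Preserving the column schema in the zero-row case means downstream [ms_cols] indexing, concatenation, and table writing behave identically whether or not any transcripts failed.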