From 0f55da5780974592f4adafbf208c342b5f345e49 Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 11 Nov 2025 09:38:22 -0800 Subject: [PATCH 01/18] Adjust environment name --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index b0b6a75..d74efd5 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: dimelo +name: dimelo-toolkit channels: - conda-forge - nanoporetech From f2bc1695b57ace2c1b5cf097d91da193b1bd2b74 Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 11 Nov 2025 13:10:26 -0800 Subject: [PATCH 02/18] Change default colors closer to those for the manuscript --- dimelo/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dimelo/utils.py b/dimelo/utils.py index e4ad62b..c44d0d2 100644 --- a/dimelo/utils.py +++ b/dimelo/utils.py @@ -27,7 +27,9 @@ "CG,0": "orange", "CG,0,m": "yellow", "CG,0,h": "red", - "GCH,1": "purple", + "HCG,1":"orange", + "WCG,1":"red", + "GCH,1": "green", } ) # Default colorscales for plotly; based off of DEFAULT_COLORS From cdf9defb3e2d199218b7b8d24f06b5471ab8244e Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Wed, 12 Nov 2025 10:49:53 -0800 Subject: [PATCH 03/18] Add relative=False option (absolute coordinates) for enrichment profiles --- dimelo/plot_enrichment_profile.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/dimelo/plot_enrichment_profile.py b/dimelo/plot_enrichment_profile.py index 97ec461..c01310d 100644 --- a/dimelo/plot_enrichment_profile.py +++ b/dimelo/plot_enrichment_profile.py @@ -12,6 +12,7 @@ def plot_enrichment_profile( motifs: list[str], sample_names: list[str], window_size: int, + relative: bool = True, single_strand: bool = False, regions_5to3prime: bool = False, smooth_window: int | None = None, @@ -39,6 +40,7 @@ def plot_enrichment_profile( mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files sample_names: list of names to use for labeling traces in the output; legend entries window_size: half-size of the desired window to plot; how far the window stretches on either side of the center point + relative: True means x-axis is centered around region centers, False means x-axis is absolute genome positions single_strand: True means we only grab counts from reads from the same strand as the region of interest, False means we always grab both strands within the regions regions_5to3prime: True means negative strand regions get flipped, False means no flipping @@ -65,8 +67,21 @@ def plot_enrichment_profile( cores=cores, ) + if relative: + offset_center = 0 + else: + regions_dict = utils.regions_dict_from_input( + regions_list[0], + window_size, + ) + if len(regions_dict)==1 and len(list(regions_dict.values())[0])==1: + region_tuple = list(regions_dict.values())[0][0] + offset_center = (region_tuple[0] + region_tuple[1]) // 2 + else: + raise ValueError("relative=False must be used when plotting more than one region.") + axes = make_enrichment_profile_plot( - trace_vectors=trace_vectors, sample_names=sample_names, **kwargs + trace_vectors=trace_vectors, sample_names=sample_names, offset_center=offset_center, **kwargs ) return axes @@ -241,6 +256,7 @@ def get_enrichment_profiles( def make_enrichment_profile_plot( trace_vectors: list[np.ndarray], sample_names: list[str], + offset_center: int = 0, **kwargs, ) -> Axes: """ @@ -252,6 +268,7 @@ def make_enrichment_profile_plot( Args: trace_vectors: list of enrichment profile traces sample_names: list of names to use for labeling traces in the output; legend entries + offset_center: position offset to apply to x-axis (e.g., when plotting absolute genome positions) kwargs: other keyword parameters passed through to utils.line_plot Returns: @@ -261,8 +278,8 @@ def make_enrichment_profile_plot( raise ValueError("Unequal number of inputs") axes = utils.line_plot( indep_vector=np.arange( - -len(trace_vectors[0]) // 2, - len(trace_vectors[0]) // 2 + len(trace_vectors[0]) % 2, + offset_center - len(trace_vectors[0]) // 2, + offset_center + len(trace_vectors[0]) // 2 + len(trace_vectors[0]) % 2, ), indep_name="pos", dep_vectors=trace_vectors, From 011da257f4841af91d235088506fcfb8e9bfd0d2 Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Wed, 12 Nov 2025 11:09:03 -0800 Subject: [PATCH 04/18] Add support for pseudouridine by supporting mod codes more than one character long --- dimelo/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dimelo/utils.py b/dimelo/utils.py index c44d0d2..7d501e1 100644 --- a/dimelo/utils.py +++ b/dimelo/utils.py @@ -75,7 +75,7 @@ def __init__(self, motif_string): if self.modified_pos >= len(self.motif_seq): raise ValueError(f"Motif {motif_string} has an out-of-range mod index.") self.modified_base = self.motif_seq[self.modified_pos] - self.mod_codes = set(parts[2]) + self.mod_codes = set([parts[2]]) else: # Motifs need both a sequence and an index, separated by a comma raise ValueError( From 9513e7e8fc5275a443cd60a3723a6a168b52c1ad Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Thu, 13 Nov 2025 13:27:37 -0800 Subject: [PATCH 05/18] Improve plot_read_browser customizability --- dimelo/plot_read_browser.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/dimelo/plot_read_browser.py b/dimelo/plot_read_browser.py index 4ead839..fd3a18b 100644 --- a/dimelo/plot_read_browser.py +++ b/dimelo/plot_read_browser.py @@ -67,7 +67,7 @@ def plot_read_browser( motifs=motifs, single_strand=single_strand, sort_by=sort_by, - calculate_mod_fractions=False, + calculate_mod_fractions=True, subset_parameters=subset_parameters, ) @@ -107,6 +107,7 @@ def plot_read_browser( region_start=region_start, region_end=region_end, hover=hover, + thresh=thresh, **kwargs, ) @@ -294,6 +295,10 @@ def make_browser_figure( region_start: int, region_end: int, hover: bool = True, + thresh: int | float | None = None, + width: int = 1, + size: int = 4, + colorscales: dict = utils.DEFAULT_COLORSCALES, **kwargs, ) -> plotly.graph_objs.Figure: """ @@ -341,7 +346,7 @@ def make_browser_figure( x=[row.read_start, row.read_end], y=[row.y_index, row.y_index], mode="lines", - line=dict(width=1, color="lightgrey"), + line=dict(width=width, color="lightgrey"), showlegend=False, hoverinfo="text", hovertext=row.read_name, @@ -365,9 +370,11 @@ def make_browser_figure( ] ), marker=dict( - size=4, + size=size, color=motif_df["prob"], - colorscale=utils.DEFAULT_COLORSCALES[motif], + colorscale=colorscales[motif], + cmin=thresh, + cmax=1.0, colorbar=dict( title=dict( text=f"{motif} probability", From e0aad85133cb9125ca4af7de17bcf3a08a66b245 Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Fri, 14 Nov 2025 11:01:30 -0800 Subject: [PATCH 06/18] Read length management adjusted for gapped alignments e.g. RNA with splicing. TODO: adjust logic once we get true read start and end in modkit --- dimelo/parse_bam.py | 23 ++++++++++++++--------- dimelo/plot_read_browser.py | 10 +++++++--- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/dimelo/parse_bam.py b/dimelo/parse_bam.py index 88a5310..9130076 100644 --- a/dimelo/parse_bam.py +++ b/dimelo/parse_bam.py @@ -968,6 +968,13 @@ def read_by_base_txt_to_hdf5( canonical_base = fields[15] prob = float(fields[10]) mod_code = fields[11] + read_len = int(fields[9]) + ref_strand = fields[5] + # TODO: verify that read position is in the right (ref) coordinate system + if ref_strand == "+": + pos_in_read_ref = int(fields[1]) + elif ref_strand == "-": + pos_in_read_ref = read_len - int(fields[1]) - 1 if read_name != fields[0]: # Record the previous read details unless this is the first line @@ -1037,22 +1044,20 @@ def read_by_base_txt_to_hdf5( read_counter += 1 ## Set up for next read + # Metadata read_name = fields[0] read_chrom = fields[3] - read_len = int(fields[9]) - ref_strand = fields[5] - # TODO: verify that read position is in the right (ref) coordinate system - if ref_strand == "+": - pos_in_read_ref = int(fields[1]) - elif ref_strand == "-": - pos_in_read_ref = read_len - int(fields[1]) - 1 - # Calculate read info + # Calculate read start (leftmost position on ref genome) + # TODO: logic can be replaced when we switch to true read start/end from modkit read_start = pos_in_genome - pos_in_read_ref - read_end = read_start + read_len # Instantiate lists mod_values_list = [] valid_coordinates_list = [] + # Adjust the read_end (rightmost position on ref genome) each time there's a new mod + # This will lead to the most accurate end positions for gapped reads + # TODO: logic can be replaced when we switch to true read start/end from modkit + read_end = pos_in_genome + (read_len - pos_in_read_ref) # Regardless of whether its a new read or not, # add modification to vector if motif type is correct # for the motif in question diff --git a/dimelo/plot_read_browser.py b/dimelo/plot_read_browser.py index fd3a18b..4e404f8 100644 --- a/dimelo/plot_read_browser.py +++ b/dimelo/plot_read_browser.py @@ -170,9 +170,13 @@ def format_browser_data( # differences are caused by the assumptions made in parse_bam.extract h5 conversion # and don't matter for much but do cause the full duplicate check to leave duplicates, # which then cause non-unique indices for mapping in collapse mode - read_extent_df = read_df[ - ["read_start", "read_end", "y_index", "read_name"] - ].drop_duplicates(subset=["read_name"]) + read_extent_df = ( + read_df[["read_start", "read_end", "y_index", "read_name"]] + .assign(read_length=lambda df: df.read_end - df.read_start) + .sort_values("read_length", ascending=False) # TODO: logic can be removed once we reference true read lengths from modkit + .drop_duplicates(subset=["read_name"]) + .drop(columns=["read_length"]) + ) # * represents the methylation events mod_event_df = ( read_df[["y_index", "read_name", "motif", "pos_vector", "prob_vector"]] From aaab7d134b2bbb599144de16204b6c42d16ab04a Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Fri, 14 Nov 2025 12:02:44 -0800 Subject: [PATCH 07/18] Add span_full_window parameter to select only reads that both are within and cover the entirety of the window used for loading --- dimelo/load_processed.py | 4 ++++ dimelo/plot_read_browser.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/dimelo/load_processed.py b/dimelo/load_processed.py index 62a4556..af7f011 100644 --- a/dimelo/load_processed.py +++ b/dimelo/load_processed.py @@ -629,6 +629,7 @@ def read_vectors_from_hdf5( quiet: bool = True, # currently unused; change to default False when pbars are implemented cores: int | None = None, # currently unused subset_parameters: dict | None = None, + span_full_window: bool = False, ) -> tuple[list[tuple], list[str], dict | None]: """ User-facing function. @@ -679,6 +680,7 @@ def read_vectors_from_hdf5( subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the reads to be returned. If not None, at least one of n or frac must be provided. The array parameter should not be provided here. + span_full_window: If True, only load reads that fully span the window defined by region_start-region_end Returns: a list of tuples, each tuple containing all datasets corresponding to an individual read that @@ -731,6 +733,8 @@ def read_vectors_from_hdf5( relevant_read_indices = np.flatnonzero( (read_ends > region_start) & (read_starts < region_end) + & (read_starts <= region_start if span_full_window else True) + & (read_ends >= region_end if span_full_window else True) & np.isin(read_motifs, motifs) & (read_chromosomes == chrom) & ( diff --git a/dimelo/plot_read_browser.py b/dimelo/plot_read_browser.py index 4e404f8..e80c760 100644 --- a/dimelo/plot_read_browser.py +++ b/dimelo/plot_read_browser.py @@ -16,6 +16,7 @@ def plot_read_browser( sort_by: str | list[str] = "shuffle", hover: bool = True, subset_parameters: dict | None = None, + span_full_window: bool = False, **kwargs, ) -> plotly.graph_objs.Figure: """ @@ -45,6 +46,7 @@ def plot_read_browser( hover: if False, disables display of information on mouse hover subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the reads to be returned. If not None, at least one of n or frac must be provided. + span_full_window: if True, only plot reads that fully span the window defined by region_start-region_end Returns: plotly Figure object containing the plot @@ -69,6 +71,7 @@ def plot_read_browser( sort_by=sort_by, calculate_mod_fractions=True, subset_parameters=subset_parameters, + span_full_window=span_full_window, ) mod_vector_index = entry_labels.index("mod_vector") From 9f0addcad2ce0cffda24f522d8f705b4b7f73ebc Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Fri, 14 Nov 2025 13:06:41 -0800 Subject: [PATCH 08/18] Move metadata per-read parsing to below the read saving functionality. The previous order introduced a bug and failed tests because of strand mismatch. This now passes all tests while maintaining the closer-to-correct read_end information in gapped reads. --- dimelo/parse_bam.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/dimelo/parse_bam.py b/dimelo/parse_bam.py index 9130076..5305a3b 100644 --- a/dimelo/parse_bam.py +++ b/dimelo/parse_bam.py @@ -968,13 +968,6 @@ def read_by_base_txt_to_hdf5( canonical_base = fields[15] prob = float(fields[10]) mod_code = fields[11] - read_len = int(fields[9]) - ref_strand = fields[5] - # TODO: verify that read position is in the right (ref) coordinate system - if ref_strand == "+": - pos_in_read_ref = int(fields[1]) - elif ref_strand == "-": - pos_in_read_ref = read_len - int(fields[1]) - 1 if read_name != fields[0]: # Record the previous read details unless this is the first line @@ -1047,6 +1040,13 @@ def read_by_base_txt_to_hdf5( # Metadata read_name = fields[0] read_chrom = fields[3] + read_len = int(fields[9]) + ref_strand = fields[5] + # TODO: verify that read position is in the right (ref) coordinate system + if ref_strand == "+": + pos_in_read_ref = int(fields[1]) + elif ref_strand == "-": + pos_in_read_ref = read_len - int(fields[1]) - 1 # Calculate read start (leftmost position on ref genome) # TODO: logic can be replaced when we switch to true read start/end from modkit read_start = pos_in_genome - pos_in_read_ref @@ -1057,6 +1057,10 @@ def read_by_base_txt_to_hdf5( # Adjust the read_end (rightmost position on ref genome) each time there's a new mod # This will lead to the most accurate end positions for gapped reads # TODO: logic can be replaced when we switch to true read start/end from modkit + if ref_strand == "+": + pos_in_read_ref = int(fields[1]) + elif ref_strand == "-": + pos_in_read_ref = read_len - int(fields[1]) - 1 read_end = pos_in_genome + (read_len - pos_in_read_ref) # Regardless of whether its a new read or not, # add modification to vector if motif type is correct From 8a9c7c1780d240473c1cf3be1fa6055a6fbcc50a Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Fri, 14 Nov 2025 22:47:07 -0800 Subject: [PATCH 09/18] Add sorting option for read_length --- dimelo/load_processed.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dimelo/load_processed.py b/dimelo/load_processed.py index af7f011..1c9da0a 100644 --- a/dimelo/load_processed.py +++ b/dimelo/load_processed.py @@ -796,7 +796,7 @@ def read_vectors_from_hdf5( ) # We add region information (start, end, and strand; chromosome is already present!) # so that it is possible to sort by and process based on these - readwise_datasets += ["region_start", "region_end", "region_strand"] + readwise_datasets += ["region_start", "region_end", "region_strand", "read_length"] # This is sanitizing the dataset entries and adjusting prob values if needed if binarized: @@ -813,6 +813,9 @@ def read_vectors_from_hdf5( for tup in read_tuples_raw ] + read_start_idx = readwise_datasets.index("read_start") + read_end_idx = readwise_datasets.index("read_end") + if calculate_mod_fractions: # Add the MOTIF_mod_fraction entries to the readwise_datasets list for future reference in sorting readwise_datasets += [f"{motif}_mod_fraction" for motif in motifs] @@ -832,8 +835,10 @@ def read_vectors_from_hdf5( read_tuples_all = [] for read_tuple in read_tuples_processed: + read_length = read_tuple[read_end_idx] - read_tuple[read_start_idx] read_tuples_all.append( tuple(val for val in read_tuple) + + (read_length,) + tuple( mod_frac for mod_frac in mod_fractions_by_read_name_by_motif[ @@ -842,7 +847,10 @@ def read_vectors_from_hdf5( ) ) else: - read_tuples_all = read_tuples_processed + read_tuples_all = [] + for read_tuple in read_tuples_processed: + read_length = read_tuple[read_end_idx] - read_tuple[read_start_idx] + read_tuples_all.append(tuple(val for val in read_tuple) + (read_length,)) ## Sort the reads From 20c48a5371518d1ea898b5abd6f1836dcdfee77a Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Fri, 14 Nov 2025 23:01:04 -0800 Subject: [PATCH 10/18] Add asc vs desc options to sort_by for read_vectors_from_h5. Passes extract unit tests. --- dimelo/load_processed.py | 43 +++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/dimelo/load_processed.py b/dimelo/load_processed.py index 1c9da0a..2164263 100644 --- a/dimelo/load_processed.py +++ b/dimelo/load_processed.py @@ -854,29 +854,54 @@ def read_vectors_from_hdf5( ## Sort the reads + # Normalize sort_by to a list of tuples (field, order) + # Support formats: + # - "field" or ["field1", "field2"] -> [("field1", "asc"), ("field2", "asc")] + # - [("field1", "desc"), "field2"] -> [("field1", "desc"), ("field2", "asc")] + # Enforce that sort_by is a list if not isinstance(sort_by, list): sort_by = [sort_by] + # Parse into (field, order) tuples + sort_by_normalized = [] + for item in sort_by: + if isinstance(item, tuple): + field, order = item + if order not in ["asc", "desc"]: + raise ValueError( + f"Sort order must be 'asc' or 'desc', got '{order}' for field '{field}'" + ) + sort_by_normalized.append((field, order)) + else: + # Default to ascending order + sort_by_normalized.append((item, "asc")) + # If 'shuffle' appears anywhere in sort_by, we first shuffle the list - if "shuffle" in sort_by: + if any(field == "shuffle" for field, _ in sort_by_normalized): utils.rng.shuffle(read_tuples_all) + # Build sorting configuration try: - sort_by_indices = [ - readwise_datasets.index(sort_item) - for sort_item in sort_by - if sort_item != "shuffle" + sort_config = [ + (readwise_datasets.index(field), order == "desc") + for field, order in sort_by_normalized + if field != "shuffle" ] except ValueError as e: raise ValueError( f"Sorting error. {e}. Datasets include {readwise_datasets}. If you need mod fraction sorting make sure you are not setting calculate_read_fraction to False." ) from e - if len(sort_by_indices) > 0: - sorted_read_tuples = sorted( - read_tuples_all, key=lambda x: tuple(x[index] for index in sort_by_indices) - ) + if len(sort_config) > 0: + # Use stable sort from right to left (reverse order of sort keys) + sorted_read_tuples = read_tuples_all + for idx, is_reverse in reversed(sort_config): + sorted_read_tuples = sorted( + sorted_read_tuples, + key=lambda x: x[idx], + reverse=is_reverse + ) else: sorted_read_tuples = read_tuples_all From 72caadfd9288dd9ab78285f3d141c8d34c722712 Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 18 Nov 2025 16:23:04 -0800 Subject: [PATCH 11/18] ruff format --- dimelo/load_processed.py | 4 +- dimelo/parse_bam.py | 4 +- dimelo/plot_enrichment_profile.py | 11 +- dimelo/plot_read_browser.py | 4 +- dimelo/test/dimelo_test.py | 171 +++++++++++++++--------------- dimelo/utils.py | 4 +- 6 files changed, 102 insertions(+), 96 deletions(-) diff --git a/dimelo/load_processed.py b/dimelo/load_processed.py index 2164263..b14e4f0 100644 --- a/dimelo/load_processed.py +++ b/dimelo/load_processed.py @@ -898,9 +898,7 @@ def read_vectors_from_hdf5( sorted_read_tuples = read_tuples_all for idx, is_reverse in reversed(sort_config): sorted_read_tuples = sorted( - sorted_read_tuples, - key=lambda x: x[idx], - reverse=is_reverse + sorted_read_tuples, key=lambda x: x[idx], reverse=is_reverse ) else: sorted_read_tuples = read_tuples_all diff --git a/dimelo/parse_bam.py b/dimelo/parse_bam.py index 5305a3b..e86b7ee 100644 --- a/dimelo/parse_bam.py +++ b/dimelo/parse_bam.py @@ -1041,8 +1041,8 @@ def read_by_base_txt_to_hdf5( read_name = fields[0] read_chrom = fields[3] read_len = int(fields[9]) - ref_strand = fields[5] - # TODO: verify that read position is in the right (ref) coordinate system + ref_strand = fields[5] + # TODO: verify that read position is in the right (ref) coordinate system if ref_strand == "+": pos_in_read_ref = int(fields[1]) elif ref_strand == "-": diff --git a/dimelo/plot_enrichment_profile.py b/dimelo/plot_enrichment_profile.py index c01310d..afd873f 100644 --- a/dimelo/plot_enrichment_profile.py +++ b/dimelo/plot_enrichment_profile.py @@ -74,14 +74,19 @@ def plot_enrichment_profile( regions_list[0], window_size, ) - if len(regions_dict)==1 and len(list(regions_dict.values())[0])==1: + if len(regions_dict) == 1 and len(list(regions_dict.values())[0]) == 1: region_tuple = list(regions_dict.values())[0][0] offset_center = (region_tuple[0] + region_tuple[1]) // 2 else: - raise ValueError("relative=False must be used when plotting more than one region.") + raise ValueError( + "relative=False must be used when plotting more than one region." + ) axes = make_enrichment_profile_plot( - trace_vectors=trace_vectors, sample_names=sample_names, offset_center=offset_center, **kwargs + trace_vectors=trace_vectors, + sample_names=sample_names, + offset_center=offset_center, + **kwargs, ) return axes diff --git a/dimelo/plot_read_browser.py b/dimelo/plot_read_browser.py index e80c760..5d10592 100644 --- a/dimelo/plot_read_browser.py +++ b/dimelo/plot_read_browser.py @@ -176,7 +176,9 @@ def format_browser_data( read_extent_df = ( read_df[["read_start", "read_end", "y_index", "read_name"]] .assign(read_length=lambda df: df.read_end - df.read_start) - .sort_values("read_length", ascending=False) # TODO: logic can be removed once we reference true read lengths from modkit + .sort_values( + "read_length", ascending=False + ) # TODO: logic can be removed once we reference true read lengths from modkit .drop_duplicates(subset=["read_name"]) .drop(columns=["read_length"]) ) diff --git a/dimelo/test/dimelo_test.py b/dimelo/test/dimelo_test.py index f7b84b2..b25f7cd 100644 --- a/dimelo/test/dimelo_test.py +++ b/dimelo/test/dimelo_test.py @@ -60,12 +60,12 @@ def test_unit__pileup( # Read and compare file contents file1_contents = f1.read() file2_contents = f2.read() - assert ( - file1_contents == file2_contents - ), f"{test_case}: {pileup_bed} does not match {pileup_target}." - assert filecmp.cmp( - regions_processed, regions_target, shallow=False - ), f"{test_case}: {regions_processed} does not match {regions_target}." + assert file1_contents == file2_contents, ( + f"{test_case}: {pileup_bed} does not match {pileup_target}." + ) + assert filecmp.cmp(regions_processed, regions_target, shallow=False), ( + f"{test_case}: {regions_processed} does not match {regions_target}." + ) else: print(f"{test_case} skipped for pileup.") @@ -117,13 +117,13 @@ def test_unit__extract( for target_item in target_dataset ], f"{test_case}: {dataset} does not match." else: - assert ( - test_dataset == target_dataset - ), f"{test_case}: {dataset} does not match." + assert test_dataset == target_dataset, ( + f"{test_case}: {dataset} does not match." + ) # assert os.path.getsize(extract_h5) == os.path.getsize(extract_target), f"{test_case}: {extract_h5} does not match {extract_target}." - assert filecmp.cmp( - regions_processed, regions_target, shallow=False - ), f"{test_case}: {regions_processed} does not match {regions_target}." + assert filecmp.cmp(regions_processed, regions_target, shallow=False), ( + f"{test_case}: {regions_processed} does not match {regions_target}." + ) else: print(f"{test_case} skipped for extract.") @@ -157,9 +157,9 @@ def test_integration__pileup_load_plot( motif=motif, **kwargs_counts_from_bedmethyl, ) - assert ( - actual == expected - ), f"{test_case}: Counts for motif {motif} are not equal" + assert actual == expected, ( + f"{test_case}: Counts for motif {motif} are not equal" + ) kwargs_vectors_from_bedmethyl = filter_kwargs_for_func( dm.load_processed.pileup_vectors_from_bedmethyl, kwargs @@ -171,16 +171,16 @@ def test_integration__pileup_load_plot( motif=motif, **kwargs_vectors_from_bedmethyl, ) - assert len(expected_tuple) == len( - actual_tuple - ), f"{test_case}: Unexpected number of arrays returned for {motif}" + assert len(expected_tuple) == len(actual_tuple), ( + f"{test_case}: Unexpected number of arrays returned for {motif}" + ) for expected, actual in zip(expected_tuple, actual_tuple): # TODO: The following was the original assertion error message, but it was not written in a functional way. Find a way to make it work as intended. # assert np.array_equal(expected, actual), f"{test_case}: Arrays for motif {motif} are not equal: expected {value} but got {actual[key]}" - assert np.array_equal( - expected, actual - ), f"{test_case}: Arrays for motif {motif} are not equal." + assert np.array_equal(expected, actual), ( + f"{test_case}: Arrays for motif {motif} are not equal." + ) else: print( f"{test_case} loading skipped for pileup_load_plot, continuing to plotting." @@ -275,13 +275,13 @@ def test_integration__extract_load_plot( {value[np.where(value != actual[key])[0]]} vs {actual[key][np.where(value != actual[key])[0]]}. """ elif isinstance(value, (str, int, bool)): - assert ( - actual[key] == expected[key] - ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." + assert actual[key] == expected[key], ( + f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." + ) else: - assert np.isclose( - actual[key], value, atol=1e-4 - ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." + assert np.isclose(actual[key], value, atol=1e-4), ( + f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." + ) else: print("{test_case} skipped for read_vectors_from_hdf5.") kwargs_plot_reads_plot_reads = filter_kwargs_for_func( @@ -299,9 +299,9 @@ def test_integration__extract_load_plot( mod_file_name=extract_h5, **kwargs_plot_reads_plot_reads, ) - assert "No threshold has been applied" in str( - excinfo.value - ), f"{test_case}: unexpected exception {excinfo.value}" + assert "No threshold has been applied" in str(excinfo.value), ( + f"{test_case}: unexpected exception {excinfo.value}" + ) # providing a threshold should be enough to run plot_reads.plot_reads without an error kwargs_plot_reads_plot_reads["thresh"] = 0.75 ax = dm.plot_reads.plot_reads( @@ -382,9 +382,9 @@ def test_unit__pileup_counts_from_bedmethyl( motif=motif, **kwargs_counts_from_bedmethyl, ) - assert ( - actual == expected - ), f"{test_case}: Counts for motif {motif} are not equal" + assert actual == expected, ( + f"{test_case}: Counts for motif {motif} are not equal" + ) else: print(f"{test_case} skipped for pileup_counts_from_bedmethyl.") @@ -405,14 +405,14 @@ def test_unit__pileup_vectors_from_bedmethyl( motif=motif, **kwargs_vectors_from_bedmethyl, ) - assert len(expected_tuple) == len( - actual_tuple - ), f"{test_case}: Unexpected number of arrays returned for {motif}" + assert len(expected_tuple) == len(actual_tuple), ( + f"{test_case}: Unexpected number of arrays returned for {motif}" + ) for expected, actual in zip(expected_tuple, actual_tuple): - assert np.array_equal( - expected, actual - ), f"{test_case}: Arrays for motif {motif} are not equal" + assert np.array_equal(expected, actual), ( + f"{test_case}: Arrays for motif {motif} are not equal" + ) else: print(f"{test_case} skipped for pileup_vectors_from_bedmethyl.") @@ -448,13 +448,13 @@ def test_unit__read_vectors_from_hdf5( {value[np.where(value != actual[key])[0]]} vs {actual[key][np.where(value != actual[key])[0]]}. """ elif isinstance(value, (str, int, bool)): - assert ( - actual[key] == expected[key] - ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." + assert actual[key] == expected[key], ( + f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." + ) else: - assert np.isclose( - actual[key], value, atol=1e-4 - ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." + assert np.isclose(actual[key], value, atol=1e-4), ( + f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." + ) else: print("{test_case} skipped for read_vectors_from_hdf5.") @@ -558,9 +558,9 @@ def test_unit__plot_enrichment_plot_enrichment( sample_names=["label" for _ in regions_list], **kwargs_plot_enrichment_plot_enrichment, ) - assert isinstance( - ax, Axes - ), f"{test_case}: plotting failed for {motif}." + assert isinstance(ax, Axes), ( + f"{test_case}: plotting failed for {motif}." + ) else: print(f"{test_case} skipped for plot_enrichment.plot_enrichment.") @@ -587,9 +587,9 @@ def test_unit__plot_enrichment_by_regions( sample_names=["label" for _ in regions_list], **kwargs_plot_enrichment_by_regions, ) - assert isinstance( - ax, Axes - ), f"{test_case}: plotting failed for {motif}." + assert isinstance(ax, Axes), ( + f"{test_case}: plotting failed for {motif}." + ) else: print(f"{test_case} skipped for plot_enrichment.by_regions.") @@ -718,9 +718,9 @@ def test_unit__plot_enrichment_profile_plot_enrichment_profile( sample_names=["label" for _ in regions_list], **kwargs_plot_enrichment_profile_plot_enrichment_profile, ) - assert isinstance( - ax, Axes - ), f"{test_case}: plotting failed for {motif}." + assert isinstance(ax, Axes), ( + f"{test_case}: plotting failed for {motif}." + ) else: print( f"{test_case} skipped for plot_enrichment_profile.plot_enrichment_profile." @@ -751,9 +751,9 @@ def test_unit__plot_enrichment_profile_by_regions( sample_names=["label" for _ in regions_list], **kwargs_plot_enrichment_profile_by_regions, ) - assert isinstance( - ax, Axes - ), f"{test_case}: plotting failed for {motif}." + assert isinstance(ax, Axes), ( + f"{test_case}: plotting failed for {motif}." + ) else: print(f"{test_case} skipped for plot_enrichment_profile.by_regions.") @@ -884,9 +884,9 @@ def test_unit__plot_depth_profile_plot_depth_profile( sample_names=["label" for _ in regions_list], **kwargs_plot_depth_profile_plot_depth_profile, ) - assert isinstance( - ax, Axes - ), f"{test_case}: plotting failed for {motif}." + assert isinstance(ax, Axes), ( + f"{test_case}: plotting failed for {motif}." + ) else: print(f"{test_case} skipped for plot_depth_profile.plot_depth_profile.") @@ -915,9 +915,9 @@ def test_unit__plot_depth_profile_by_regions( sample_names=["label" for _ in regions_list], **kwargs_plot_depth_profile_by_regions, ) - assert isinstance( - ax, Axes - ), f"{test_case}: plotting failed for {motif}." + assert isinstance(ax, Axes), ( + f"{test_case}: plotting failed for {motif}." + ) else: print(f"{test_case} skipped for plot_depth_profile.by_regions.") @@ -1044,9 +1044,9 @@ def test_unit__plot_depth_histogram_plot_depth_histogram( sample_names=["label" for _ in regions_list], **kwargs_plot_depth_histogram_plot_depth_histogram, ) - assert isinstance( - ax, Axes - ), f"{test_case}: plotting failed for {motif}." + assert isinstance(ax, Axes), ( + f"{test_case}: plotting failed for {motif}." + ) else: print(f"{test_case} skipped for plot_depth_histogram.plot_depth_histogram.") @@ -1075,9 +1075,9 @@ def test_unit__plot_depth_histogram_by_regions( sample_names=["label" for _ in regions_list], **kwargs_plot_depth_histogram_by_regions, ) - assert isinstance( - ax, Axes - ), f"{test_case}: plotting failed for {motif}." + assert isinstance(ax, Axes), ( + f"{test_case}: plotting failed for {motif}." + ) else: print(f"{test_case} skipped for plot_depth_histogram.by_regions.") @@ -1166,9 +1166,9 @@ def test_unit__plot_reads_plot_reads( mod_file_name=results["extract"][0], **kwargs_plot_reads_plot_reads, ) - assert "No threshold has been applied" in str( - excinfo.value - ), f"{test_case}: unexpected exception {excinfo.value}" + assert "No threshold has been applied" in str(excinfo.value), ( + f"{test_case}: unexpected exception {excinfo.value}" + ) # providing a threshold should be enough to run plot_reads.plot_reads without an error kwargs_plot_reads_plot_reads["thresh"] = 0.75 ax = dm.plot_reads.plot_reads( @@ -1206,9 +1206,9 @@ def test_unit__plot_read_browser( region=kwargs["regions"], **kwargs_plot_read_browser, ) - assert isinstance( - fig, plotly.graph_objs.Figure - ), f"{test_case}: plotting failed." + assert isinstance(fig, plotly.graph_objs.Figure), ( + f"{test_case}: plotting failed." + ) else: with pytest.raises(ValueError) as excinfo: fig = dm.plot_read_browser.plot_read_browser( @@ -1220,22 +1220,23 @@ def test_unit__plot_read_browser( isinstance(kwargs["regions"], list) or Path(kwargs["regions"]).suffix == ".bed" ) and kwargs["thresh"] is None: - assert ( - "Invalid region" in str(excinfo.value) - ), f"{test_case}: unexpected exception for no-threshold bad-region case {excinfo.value}" + assert "Invalid region" in str(excinfo.value), ( + f"{test_case}: unexpected exception for no-threshold bad-region case {excinfo.value}" + ) elif ( kwargs["thresh"] is not None and not isinstance(kwargs["regions"], list) and Path(kwargs["regions"]).suffix != ".bed" ): - assert ( - "A threshold has been applied" in str(excinfo.value) - ), f"{test_case}: unexpected exception thresholded valid-region case {excinfo.value}" + assert "A threshold has been applied" in str(excinfo.value), ( + f"{test_case}: unexpected exception thresholded valid-region case {excinfo.value}" + ) else: - assert ( - "A threshold has been applied" in str(excinfo.value) - or "Invalid region" in str(excinfo.value) - ), f"{test_case}: unexpected exception thresholded bad-region case {excinfo.value}" + assert "A threshold has been applied" in str( + excinfo.value + ) or "Invalid region" in str(excinfo.value), ( + f"{test_case}: unexpected exception thresholded bad-region case {excinfo.value}" + ) else: print(f"{test_case} skipped for test_unit__plot_read_browser") diff --git a/dimelo/utils.py b/dimelo/utils.py index 7d501e1..91a2661 100644 --- a/dimelo/utils.py +++ b/dimelo/utils.py @@ -27,8 +27,8 @@ "CG,0": "orange", "CG,0,m": "yellow", "CG,0,h": "red", - "HCG,1":"orange", - "WCG,1":"red", + "HCG,1": "orange", + "WCG,1": "red", "GCH,1": "green", } ) From e06fca1bc3a4051fe23d5d1a6c7558d09f0f2d4e Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 25 Nov 2025 10:46:35 -0800 Subject: [PATCH 12/18] Documentation changes and top-level parameter exposure per PR comments --- dimelo/plot_enrichment_profile.py | 2 +- dimelo/plot_read_browser.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/dimelo/plot_enrichment_profile.py b/dimelo/plot_enrichment_profile.py index afd873f..20a625c 100644 --- a/dimelo/plot_enrichment_profile.py +++ b/dimelo/plot_enrichment_profile.py @@ -40,7 +40,7 @@ def plot_enrichment_profile( mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files sample_names: list of names to use for labeling traces in the output; legend entries window_size: half-size of the desired window to plot; how far the window stretches on either side of the center point - relative: True means x-axis is centered around region centers, False means x-axis is absolute genome positions + relative: True means x-axis is centered around region centers, False means x-axis is absolute genome positions. Must be True when plotting more than one region. single_strand: True means we only grab counts from reads from the same strand as the region of interest, False means we always grab both strands within the regions regions_5to3prime: True means negative strand regions get flipped, False means no flipping diff --git a/dimelo/plot_read_browser.py b/dimelo/plot_read_browser.py index 5d10592..f5e7404 100644 --- a/dimelo/plot_read_browser.py +++ b/dimelo/plot_read_browser.py @@ -17,6 +17,9 @@ def plot_read_browser( hover: bool = True, subset_parameters: dict | None = None, span_full_window: bool = False, + width: int = 1, + size: int = 4, + colorscales: dict = utils.DEFAULT_COLORSCALES, **kwargs, ) -> plotly.graph_objs.Figure: """ @@ -47,6 +50,9 @@ def plot_read_browser( subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the reads to be returned. If not None, at least one of n or frac must be provided. span_full_window: if True, only plot reads that fully span the window defined by region_start-region_end + width: width of the read lines in the browser + size: size of the modification event markers in the browser + colorscales: dictionary mapping motif names to plotly colorscale specifications Returns: plotly Figure object containing the plot @@ -173,6 +179,8 @@ def format_browser_data( # differences are caused by the assumptions made in parse_bam.extract h5 conversion # and don't matter for much but do cause the full duplicate check to leave duplicates, # which then cause non-unique indices for mapping in collapse mode + # The sorting below is just to make sure that when we drop duplicates, we keep + # the longest read extent for each read_name. It doesn't effect final figure sort order. read_extent_df = ( read_df[["read_start", "read_end", "y_index", "read_name"]] .assign(read_length=lambda df: df.read_end - df.read_start) @@ -325,6 +333,10 @@ def make_browser_figure( region_start: start position of the region being browsed region_end: end position of the region being browsed hover: if False, disables display of information on mouse hover + thresh: pass down threshold for color scaling of modification probabilities + width: width of the read lines in the browser + size: size of the modification event markers in the browser + colorscales: dictionary mapping motif names to plotly colorscale specifications TODO: Think about how this interfaces with different types of initial sorting... TODO: Make it so that this method does NOT modify the input dataframe From 841faf949874e7b1fbc46c3f82eeaa2fc8e35a38 Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 25 Nov 2025 10:54:09 -0800 Subject: [PATCH 13/18] Shorten long lines to see if that helps with ruff format --- dimelo/test/dimelo_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dimelo/test/dimelo_test.py b/dimelo/test/dimelo_test.py index b25f7cd..cdf9953 100644 --- a/dimelo/test/dimelo_test.py +++ b/dimelo/test/dimelo_test.py @@ -51,8 +51,10 @@ def test_unit__pileup( pileup_target, regions_target = results["pileup"] if pileup_target is not None and regions_target is not None: - # This is necessary because the gzipped files are different on mac vs linux, but the contents should be identical (and are, so far) - # Not sure why the compression ratio is better on Linux when both are using pysam.tabix_compress with pysam 0.22.0 and zlib 1.2.13 but whatcha gonna do + # This is necessary because the gzipped files are different on mac vs linux, + # but the contents should be identical (and are, so far) + # Not sure why the compression ratio is better on Linux when both are using + # pysam.tabix_compress with pysam 0.22.0 and zlib 1.2.13 but whatcha gonna do with ( gzip.open(pileup_bed, "rt") as f1, gzip.open(pileup_target, "rt") as f2, From b0b78e0429ba033e8d92f776bee10573a6e411a0 Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 25 Nov 2025 10:57:10 -0800 Subject: [PATCH 14/18] ruff format after downgrading to ruff 0.6.8 --- dimelo/test/dimelo_test.py | 171 ++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 86 deletions(-) diff --git a/dimelo/test/dimelo_test.py b/dimelo/test/dimelo_test.py index cdf9953..170397a 100644 --- a/dimelo/test/dimelo_test.py +++ b/dimelo/test/dimelo_test.py @@ -62,12 +62,12 @@ def test_unit__pileup( # Read and compare file contents file1_contents = f1.read() file2_contents = f2.read() - assert file1_contents == file2_contents, ( - f"{test_case}: {pileup_bed} does not match {pileup_target}." - ) - assert filecmp.cmp(regions_processed, regions_target, shallow=False), ( - f"{test_case}: {regions_processed} does not match {regions_target}." - ) + assert ( + file1_contents == file2_contents + ), f"{test_case}: {pileup_bed} does not match {pileup_target}." + assert filecmp.cmp( + regions_processed, regions_target, shallow=False + ), f"{test_case}: {regions_processed} does not match {regions_target}." else: print(f"{test_case} skipped for pileup.") @@ -119,13 +119,13 @@ def test_unit__extract( for target_item in target_dataset ], f"{test_case}: {dataset} does not match." else: - assert test_dataset == target_dataset, ( - f"{test_case}: {dataset} does not match." - ) + assert ( + test_dataset == target_dataset + ), f"{test_case}: {dataset} does not match." # assert os.path.getsize(extract_h5) == os.path.getsize(extract_target), f"{test_case}: {extract_h5} does not match {extract_target}." - assert filecmp.cmp(regions_processed, regions_target, shallow=False), ( - f"{test_case}: {regions_processed} does not match {regions_target}." - ) + assert filecmp.cmp( + regions_processed, regions_target, shallow=False + ), f"{test_case}: {regions_processed} does not match {regions_target}." else: print(f"{test_case} skipped for extract.") @@ -159,9 +159,9 @@ def test_integration__pileup_load_plot( motif=motif, **kwargs_counts_from_bedmethyl, ) - assert actual == expected, ( - f"{test_case}: Counts for motif {motif} are not equal" - ) + assert ( + actual == expected + ), f"{test_case}: Counts for motif {motif} are not equal" kwargs_vectors_from_bedmethyl = filter_kwargs_for_func( dm.load_processed.pileup_vectors_from_bedmethyl, kwargs @@ -173,16 +173,16 @@ def test_integration__pileup_load_plot( motif=motif, **kwargs_vectors_from_bedmethyl, ) - assert len(expected_tuple) == len(actual_tuple), ( - f"{test_case}: Unexpected number of arrays returned for {motif}" - ) + assert len(expected_tuple) == len( + actual_tuple + ), f"{test_case}: Unexpected number of arrays returned for {motif}" for expected, actual in zip(expected_tuple, actual_tuple): # TODO: The following was the original assertion error message, but it was not written in a functional way. Find a way to make it work as intended. # assert np.array_equal(expected, actual), f"{test_case}: Arrays for motif {motif} are not equal: expected {value} but got {actual[key]}" - assert np.array_equal(expected, actual), ( - f"{test_case}: Arrays for motif {motif} are not equal." - ) + assert np.array_equal( + expected, actual + ), f"{test_case}: Arrays for motif {motif} are not equal." else: print( f"{test_case} loading skipped for pileup_load_plot, continuing to plotting." @@ -277,13 +277,13 @@ def test_integration__extract_load_plot( {value[np.where(value != actual[key])[0]]} vs {actual[key][np.where(value != actual[key])[0]]}. """ elif isinstance(value, (str, int, bool)): - assert actual[key] == expected[key], ( - f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." - ) + assert ( + actual[key] == expected[key] + ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." else: - assert np.isclose(actual[key], value, atol=1e-4), ( - f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." - ) + assert np.isclose( + actual[key], value, atol=1e-4 + ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." else: print("{test_case} skipped for read_vectors_from_hdf5.") kwargs_plot_reads_plot_reads = filter_kwargs_for_func( @@ -301,9 +301,9 @@ def test_integration__extract_load_plot( mod_file_name=extract_h5, **kwargs_plot_reads_plot_reads, ) - assert "No threshold has been applied" in str(excinfo.value), ( - f"{test_case}: unexpected exception {excinfo.value}" - ) + assert "No threshold has been applied" in str( + excinfo.value + ), f"{test_case}: unexpected exception {excinfo.value}" # providing a threshold should be enough to run plot_reads.plot_reads without an error kwargs_plot_reads_plot_reads["thresh"] = 0.75 ax = dm.plot_reads.plot_reads( @@ -384,9 +384,9 @@ def test_unit__pileup_counts_from_bedmethyl( motif=motif, **kwargs_counts_from_bedmethyl, ) - assert actual == expected, ( - f"{test_case}: Counts for motif {motif} are not equal" - ) + assert ( + actual == expected + ), f"{test_case}: Counts for motif {motif} are not equal" else: print(f"{test_case} skipped for pileup_counts_from_bedmethyl.") @@ -407,14 +407,14 @@ def test_unit__pileup_vectors_from_bedmethyl( motif=motif, **kwargs_vectors_from_bedmethyl, ) - assert len(expected_tuple) == len(actual_tuple), ( - f"{test_case}: Unexpected number of arrays returned for {motif}" - ) + assert len(expected_tuple) == len( + actual_tuple + ), f"{test_case}: Unexpected number of arrays returned for {motif}" for expected, actual in zip(expected_tuple, actual_tuple): - assert np.array_equal(expected, actual), ( - f"{test_case}: Arrays for motif {motif} are not equal" - ) + assert np.array_equal( + expected, actual + ), f"{test_case}: Arrays for motif {motif} are not equal" else: print(f"{test_case} skipped for pileup_vectors_from_bedmethyl.") @@ -450,13 +450,13 @@ def test_unit__read_vectors_from_hdf5( {value[np.where(value != actual[key])[0]]} vs {actual[key][np.where(value != actual[key])[0]]}. """ elif isinstance(value, (str, int, bool)): - assert actual[key] == expected[key], ( - f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." - ) + assert ( + actual[key] == expected[key] + ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." else: - assert np.isclose(actual[key], value, atol=1e-4), ( - f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." - ) + assert np.isclose( + actual[key], value, atol=1e-4 + ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}." else: print("{test_case} skipped for read_vectors_from_hdf5.") @@ -560,9 +560,9 @@ def test_unit__plot_enrichment_plot_enrichment( sample_names=["label" for _ in regions_list], **kwargs_plot_enrichment_plot_enrichment, ) - assert isinstance(ax, Axes), ( - f"{test_case}: plotting failed for {motif}." - ) + assert isinstance( + ax, Axes + ), f"{test_case}: plotting failed for {motif}." else: print(f"{test_case} skipped for plot_enrichment.plot_enrichment.") @@ -589,9 +589,9 @@ def test_unit__plot_enrichment_by_regions( sample_names=["label" for _ in regions_list], **kwargs_plot_enrichment_by_regions, ) - assert isinstance(ax, Axes), ( - f"{test_case}: plotting failed for {motif}." - ) + assert isinstance( + ax, Axes + ), f"{test_case}: plotting failed for {motif}." else: print(f"{test_case} skipped for plot_enrichment.by_regions.") @@ -720,9 +720,9 @@ def test_unit__plot_enrichment_profile_plot_enrichment_profile( sample_names=["label" for _ in regions_list], **kwargs_plot_enrichment_profile_plot_enrichment_profile, ) - assert isinstance(ax, Axes), ( - f"{test_case}: plotting failed for {motif}." - ) + assert isinstance( + ax, Axes + ), f"{test_case}: plotting failed for {motif}." else: print( f"{test_case} skipped for plot_enrichment_profile.plot_enrichment_profile." @@ -753,9 +753,9 @@ def test_unit__plot_enrichment_profile_by_regions( sample_names=["label" for _ in regions_list], **kwargs_plot_enrichment_profile_by_regions, ) - assert isinstance(ax, Axes), ( - f"{test_case}: plotting failed for {motif}." - ) + assert isinstance( + ax, Axes + ), f"{test_case}: plotting failed for {motif}." else: print(f"{test_case} skipped for plot_enrichment_profile.by_regions.") @@ -886,9 +886,9 @@ def test_unit__plot_depth_profile_plot_depth_profile( sample_names=["label" for _ in regions_list], **kwargs_plot_depth_profile_plot_depth_profile, ) - assert isinstance(ax, Axes), ( - f"{test_case}: plotting failed for {motif}." - ) + assert isinstance( + ax, Axes + ), f"{test_case}: plotting failed for {motif}." else: print(f"{test_case} skipped for plot_depth_profile.plot_depth_profile.") @@ -917,9 +917,9 @@ def test_unit__plot_depth_profile_by_regions( sample_names=["label" for _ in regions_list], **kwargs_plot_depth_profile_by_regions, ) - assert isinstance(ax, Axes), ( - f"{test_case}: plotting failed for {motif}." - ) + assert isinstance( + ax, Axes + ), f"{test_case}: plotting failed for {motif}." else: print(f"{test_case} skipped for plot_depth_profile.by_regions.") @@ -1046,9 +1046,9 @@ def test_unit__plot_depth_histogram_plot_depth_histogram( sample_names=["label" for _ in regions_list], **kwargs_plot_depth_histogram_plot_depth_histogram, ) - assert isinstance(ax, Axes), ( - f"{test_case}: plotting failed for {motif}." - ) + assert isinstance( + ax, Axes + ), f"{test_case}: plotting failed for {motif}." else: print(f"{test_case} skipped for plot_depth_histogram.plot_depth_histogram.") @@ -1077,9 +1077,9 @@ def test_unit__plot_depth_histogram_by_regions( sample_names=["label" for _ in regions_list], **kwargs_plot_depth_histogram_by_regions, ) - assert isinstance(ax, Axes), ( - f"{test_case}: plotting failed for {motif}." - ) + assert isinstance( + ax, Axes + ), f"{test_case}: plotting failed for {motif}." else: print(f"{test_case} skipped for plot_depth_histogram.by_regions.") @@ -1168,9 +1168,9 @@ def test_unit__plot_reads_plot_reads( mod_file_name=results["extract"][0], **kwargs_plot_reads_plot_reads, ) - assert "No threshold has been applied" in str(excinfo.value), ( - f"{test_case}: unexpected exception {excinfo.value}" - ) + assert "No threshold has been applied" in str( + excinfo.value + ), f"{test_case}: unexpected exception {excinfo.value}" # providing a threshold should be enough to run plot_reads.plot_reads without an error kwargs_plot_reads_plot_reads["thresh"] = 0.75 ax = dm.plot_reads.plot_reads( @@ -1208,9 +1208,9 @@ def test_unit__plot_read_browser( region=kwargs["regions"], **kwargs_plot_read_browser, ) - assert isinstance(fig, plotly.graph_objs.Figure), ( - f"{test_case}: plotting failed." - ) + assert isinstance( + fig, plotly.graph_objs.Figure + ), f"{test_case}: plotting failed." else: with pytest.raises(ValueError) as excinfo: fig = dm.plot_read_browser.plot_read_browser( @@ -1222,23 +1222,22 @@ def test_unit__plot_read_browser( isinstance(kwargs["regions"], list) or Path(kwargs["regions"]).suffix == ".bed" ) and kwargs["thresh"] is None: - assert "Invalid region" in str(excinfo.value), ( - f"{test_case}: unexpected exception for no-threshold bad-region case {excinfo.value}" - ) + assert ( + "Invalid region" in str(excinfo.value) + ), f"{test_case}: unexpected exception for no-threshold bad-region case {excinfo.value}" elif ( kwargs["thresh"] is not None and not isinstance(kwargs["regions"], list) and Path(kwargs["regions"]).suffix != ".bed" ): - assert "A threshold has been applied" in str(excinfo.value), ( - f"{test_case}: unexpected exception thresholded valid-region case {excinfo.value}" - ) + assert ( + "A threshold has been applied" in str(excinfo.value) + ), f"{test_case}: unexpected exception thresholded valid-region case {excinfo.value}" else: - assert "A threshold has been applied" in str( - excinfo.value - ) or "Invalid region" in str(excinfo.value), ( - f"{test_case}: unexpected exception thresholded bad-region case {excinfo.value}" - ) + assert ( + "A threshold has been applied" in str(excinfo.value) + or "Invalid region" in str(excinfo.value) + ), f"{test_case}: unexpected exception thresholded bad-region case {excinfo.value}" else: print(f"{test_case} skipped for test_unit__plot_read_browser") From 1e8d6766bbbcf13f9586ab33aea3e51569f5b9d4 Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 25 Nov 2025 11:28:39 -0800 Subject: [PATCH 15/18] Update package name and add new kwargs to function descriptions --- README.md | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 6dda4b7..bcd9191 100644 --- a/README.md +++ b/README.md @@ -51,27 +51,27 @@ This README document contains installation instructions and documentation for va **Platforms:** Mac and Linux operating systems, ARM (e.g. M1/M2 mac) and x86 (e.g. Intel mac) architectures. The package has been tested on HPC clusters, but there may be additional complexities depending on how these systems are set up. -*For Windows, we recommend using [Google Colab](https://colab.research.google.com/). We have not tested on [Windows Linux Subsystem](https://learn.microsoft.com/en-us/windows/wsl/install) but in principle that should work too. Windows support is possible in future, but blocked by [conda availability for modkit executables](https://anaconda.org/nanoporetech/modkit) and the [current implementation](dimelo/run_modkit.py) of live error/progress tracking during modkit execution, which relies on a unix-only library as of Python 3.11. The urgency of a Windows implementation will depend on user need, so please let us know if this is important for you.* +*For Windows, we recommend using [Google Colab](https://colab.research.google.com/). We have not tested on [Windows Linux Subsystem](https://learn.microsoft.com/en-us/windows/wsl/install) but in principle that should work too. Windows support is possible in future, but blocked by [conda availability for modkit executables](https://anaconda.org/nanoporetech/modkit) and the [current implementation](dimelo-toolkit/run_modkit.py) of live error/progress tracking during modkit execution, which relies on a unix-only library as of Python 3.11. The urgency of a Windows implementation will depend on user need, so please let us know if this is important for you.* **Conda and Python:** The default installation requires conda, or alternatives like mamba. See [here](https://www.anaconda.com/download) for conda installation. The installation instructions below will install Python 3.11 for you within a conda virtual environment, but depending on your system configuration you may need to ensure that you are not also loading a different version of Python on your path. If you encounter unexpected errors when importing `dimelo`, e.g. complaining about syntax, consider checking your Python version. -### Load source code from the modkit_parsing_beta branch +### Load source code for dimelo-toolkit -Open your terminal or command line and navigate to wherever you want to keep the `dimelo` source code (e.g. your Documents folder, `cd Documents`) and clone the repo +Open your terminal or command line and navigate to wherever you want to keep the `dimelo-toolkit` source code (e.g. your Documents folder, `cd Documents`) and clone the repo ``` -git clone https://github.com/streetslab/dimelo +git clone https://github.com/streetslab/dimelo-toolkit ``` ### Set up virtual environment -Navigate into the dimelo directory +Navigate into the dimelo-toolkit directory ``` -cd dimelo +cd dimelo-toolkit ``` -Create a conda environment using environment.yml. This will make a new conda environment with the name `dimelo`. +Create a conda environment using environment.yml. This will make a new conda environment with the name `dimelo-toolkit`. ``` conda env create -f environment.yml @@ -79,15 +79,15 @@ conda env create -f environment.yml *If you want to handle environment creation yourself, see [the alternative installation instructions](#alternative-installations).* -### Install pip dependencies and core dimelo package +### Install pip dependencies and core dimelo-toolkit package Activate your conda environment, which should now contain python 3.11 and a modkit executable on the path and executable on your system. ``` -conda activate dimelo +conda activate dimelo-toolkit ``` -Ensure that you are still in the top-level dimelo directory. Install the dimelo package and its dependencies from source. +Ensure that you are still in the top-level dimelo-toolkit directory. Install the dimelo-toolkit package and its dependencies from source. ``` pip install . @@ -95,7 +95,7 @@ pip install . ## Google Colab Installation -Run the following code in the first cell of your notebook to grab `modkit v0.2.4` from conda and install the `dimelo modkit_parsing_beta` branch. This will have to be run whenever you make a new Colab instance, unless you have a better way of managing this, in which case please reach out. The tutorial notebook runs equivalent code blocks to set up your environment, so if you are trying to run the tutorial you can skip to [Basic Use](#basic-use). +Run the following code in the first cell of your notebook to grab `modkit v0.2.4` from conda and install the `dimelo-toolkit main` branch. This will have to be run whenever you make a new Colab instance, unless you have a better way of managing this, in which case please reach out. The tutorial notebook runs equivalent code blocks to set up your environment, so if you are trying to run the tutorial you can skip to [Basic Use](#basic-use). ``` from google.colab import drive @@ -104,14 +104,14 @@ drive.mount('/content/drive') import condacolab condacolab.install() !conda install nanoporetech::modkit==0.2.4 -!git clone https://github.com/streetslab/dimelo -!cd dimelo && pip install ipywidgets==7.7.1 . +!git clone https://github.com/streetslab/dimelo-toolkit +!cd dimelo-toolkit && pip install ipywidgets==7.7.1 . import dimelo ``` ## Alternative Installations -Alternatively, you can install modkit into any conda environment you like. If you want to, you can install modkit some other way, and then add it to the path of your notebook or script. *NOTE: if you are creating the environment yourself, be sure to use python 3.10 or greater. Some dimelo package features require relatively new python releases.* +Alternatively, you can install modkit into any conda environment you like. If you want to, you can install modkit some other way, and then add it to the path of your notebook or script. *NOTE: if you are creating the environment yourself, be sure to use python 3.10 or greater. Some dimelo-toolkit features require relatively new python releases.* ``` conda install nanoporetech::modkit==0.2.4 @@ -125,7 +125,7 @@ sys.path.append('path_to_modkit_executable_directory') ``` ## Developer Installation -If you are planning on developing for the `dimelo` package, change the `pip install` command to install the package in "editable" mode, so that your code changes are reflected in your environment: +If you are planning on developing for the `dimelo-toolkit` package, change the `pip install` command to install the package in "editable" mode, so that your code changes are reflected in your environment: ``` pip install . -e ``` @@ -159,7 +159,7 @@ See the [tutorial](tutorial.ipynb) as a starting point. For local operation on Mac or Linux, you will already have cloned the repo to disk in the installation step. Activate your conda environment, make sure you have jupyter installed, and then launch a jupyter notebook server and navigate to `tutorial.ipynb`. You can also use other tools to open the jupyter notebook or you can simply reference it as an example. ``` -conda activate dimelo +conda activate dimelo-toolkit jupyter notebook ``` @@ -224,7 +224,7 @@ You should expect to see some text outputs and a series of progress bars. Progre There should not be such issues for command line operation. See below an example of command line progress outputs: you should expect relatively fast pre-processing, 10-90 seconds, and then contig processing times depending heavily on the size of your `.bam` file and the extent of your `regions`. ``` -(dimelo_modkit_parsing) oberondixon-luinenburg@Oberons-MacBook-Pro package_test_notebooks % python dimelo_cmd.py +(dimelo-toolkt) oberondixon-luinenburg@Oberons-MacBook-Pro package_test_notebooks % python dimelo_cmd.py modkit found with expected version 0.2.4 No output directory provided, using input directory /Users/oberondixon-luinenburg/Documents/Ioannidis-Streets/dimelo_test_data/20230702_jm_lmnb1_acessibility_redux No specified number of cores requested. 8 available on machine, allocating all. @@ -246,6 +246,7 @@ def plot_enrichment_profile( motifs: list[str], sample_names: list[str], window_size: int, + relative: bool = True, single_strand: bool = False, regions_5to3prime: bool = False, smooth_window: int | None = None, @@ -268,6 +269,7 @@ def plot_enrichment_profile( mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files sample_names: list of names to use for labeling traces in the output; legend entries window_size: half-size of the desired window to plot; how far the window stretches on either side of the center point + relative: True means x-axis is centered around region centers, False means x-axis is absolute genome positions. Must be True when plotting more than one region. single_strand: True means we only grab counts from reads from the same strand as the region of interest, False means we always grab both strands within the regions regions_5to3prime: True means negative strand regions get flipped, False means no flipping @@ -440,6 +442,10 @@ def plot_read_browser( sort_by: str | list[str] = "shuffle", hover: bool = True, subset_parameters: dict | None = None, + span_full_window: bool = False, + width: int = 1, + size: int = 4, + colorscales: dict = utils.DEFAULT_COLORSCALES, **kwargs, ) -> plotly.graph_objs.Figure: """ @@ -469,6 +475,10 @@ def plot_read_browser( hover: if False, disables display of information on mouse hover subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the reads to be returned. If not None, at least one of n or frac must be provided. + span_full_window: if True, only plot reads that fully span the window defined by region_start-region_end + width: width of the read lines in the browser + size: size of the modification event markers in the browser + colorscales: dictionary mapping motif names to plotly colorscale specifications Returns: plotly Figure object containing the plot @@ -747,6 +757,7 @@ def read_vectors_from_hdf5( quiet: bool = True, # currently unused; change to default False when pbars are implemented cores: int | None = None, # currently unused subset_parameters: dict | None = None, + span_full_window: bool = False, ) -> tuple[list[tuple], list[str], dict | None]: """ User-facing function. @@ -795,6 +806,7 @@ def read_vectors_from_hdf5( subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the reads to be returned. If not None, at least one of n or frac must be provided. The array parameter should not be provided here. + span_full_window: If True, only load reads that fully span the window defined by region_start-region_end Returns: a list of tuples, each tuple containing all datasets corresponding to an individual read that From 473500bd9b11a59c1f4febad0f683e780870dcdf Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 25 Nov 2025 12:17:29 -0800 Subject: [PATCH 16/18] Update kwargs handling to avoid bugs in the collapse sorting case --- dimelo/plot_read_browser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dimelo/plot_read_browser.py b/dimelo/plot_read_browser.py index f5e7404..001be85 100644 --- a/dimelo/plot_read_browser.py +++ b/dimelo/plot_read_browser.py @@ -117,7 +117,9 @@ def plot_read_browser( region_end=region_end, hover=hover, thresh=thresh, - **kwargs, + width=width, + size=size, + colorscales=colorscales, ) return fig @@ -316,7 +318,6 @@ def make_browser_figure( width: int = 1, size: int = 4, colorscales: dict = utils.DEFAULT_COLORSCALES, - **kwargs, ) -> plotly.graph_objs.Figure: """ Make a browser figure, using the provided pre-processed data From e70da7a23041dc731108df56f327b1499f09efff Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 25 Nov 2025 12:20:22 -0800 Subject: [PATCH 17/18] Fix mistake in removing kwargs in too many places --- dimelo/plot_read_browser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dimelo/plot_read_browser.py b/dimelo/plot_read_browser.py index 001be85..a49d496 100644 --- a/dimelo/plot_read_browser.py +++ b/dimelo/plot_read_browser.py @@ -120,6 +120,7 @@ def plot_read_browser( width=width, size=size, colorscales=colorscales, + **kwargs, ) return fig @@ -318,6 +319,7 @@ def make_browser_figure( width: int = 1, size: int = 4, colorscales: dict = utils.DEFAULT_COLORSCALES, + **kwargs, ) -> plotly.graph_objs.Figure: """ Make a browser figure, using the provided pre-processed data From ba5b9c4c44528cdb72fe1ff17644d669b4df74e6 Mon Sep 17 00:00:00 2001 From: OberonDixon Date: Tue, 25 Nov 2025 13:15:57 -0800 Subject: [PATCH 18/18] Fix typo and remove personal info --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index bcd9191..57948a8 100644 --- a/README.md +++ b/README.md @@ -224,9 +224,8 @@ You should expect to see some text outputs and a series of progress bars. Progre There should not be such issues for command line operation. See below an example of command line progress outputs: you should expect relatively fast pre-processing, 10-90 seconds, and then contig processing times depending heavily on the size of your `.bam` file and the extent of your `regions`. ``` -(dimelo-toolkt) oberondixon-luinenburg@Oberons-MacBook-Pro package_test_notebooks % python dimelo_cmd.py +(dimelo-toolkit) % python dimelo_cmd.py modkit found with expected version 0.2.4 -No output directory provided, using input directory /Users/oberondixon-luinenburg/Documents/Ioannidis-Streets/dimelo_test_data/20230702_jm_lmnb1_acessibility_redux No specified number of cores requested. 8 available on machine, allocating all. Modification threshold of 0.9 will be treated as coming from range 0-1. ████████████████████| Preprocessing complete for motifs ['A,0'] in chm13.draft_v1.1.fasta: 100% | 00:30