From a46bf9ce6038aac8f3ace0a0e982ee0bc496cf33 Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Mon, 26 Jan 2026 21:57:12 -0800 Subject: [PATCH 1/3] Fix link rendering --- docs/conf.py | 1 + docs/requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 1a0d14c..33b329e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -69,6 +69,7 @@ "colon_fence", "deflist", "fieldlist", + "linkify", "substitution", "tasklist", ] diff --git a/docs/requirements.txt b/docs/requirements.txt index 3b80d46..ec05c16 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,3 +2,4 @@ sphinx==9.1.0 furo==2025.12.19 myst-parser==5.0.0 sphinx-copybutton==0.5.2 +linkify-it-py>=2.0.0 From bbcd47e02da1458a2c116336f25a0ecfe0c45c29 Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Mon, 26 Jan 2026 22:27:15 -0800 Subject: [PATCH 2/3] Refactor shared test utilities to module --- docs/api.rst | 8 + src/snps/testing.py | 374 ++++++++++++++++++++++++++++++++++++++++++++ tests/__init__.py | 182 ++------------------- 3 files changed, 396 insertions(+), 168 deletions(-) create mode 100644 src/snps/testing.py diff --git a/docs/api.rst b/docs/api.rst index ebe8003..490fc68 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -100,3 +100,11 @@ Helper functions and utilities. :members: :undoc-members: :show-inheritance: + +snps.testing +~~~~~~~~~~~~ + +.. automodule:: snps.testing + :members: + :undoc-members: + :show-inheritance: diff --git a/src/snps/testing.py b/src/snps/testing.py new file mode 100644 index 0000000..5627fe8 --- /dev/null +++ b/src/snps/testing.py @@ -0,0 +1,374 @@ +"""Shared test utilities for snps.""" + +from __future__ import annotations + +import os +from typing import Any + +import numpy as np +import pandas as pd +from pandas.api.types import is_object_dtype, is_string_dtype + +# Standard dtypes for normalized SNP DataFrames +NORMALIZED_DTYPES = { + "rsid": object, + "chrom": object, + "pos": np.uint32, + "genotype": object, +} + + +def get_complement(base: str) -> str: + """Get the complement of a DNA base. + + Parameters + ---------- + base : str + A single DNA base (A, C, G, or T) + + Returns + ------- + str + The complementary base (A<->T, C<->G), or the original if not a valid base + """ + complements = {"A": "T", "T": "A", "C": "G", "G": "C"} + return complements.get(base, base) + + +def complement_genotype(genotype: str) -> str: + """Get the complement of a genotype (both alleles). + + Parameters + ---------- + genotype : str + A two-character genotype string (e.g., "AT", "CG") + + Returns + ------- + str + The complemented genotype, or np.nan if input is null + """ + if pd.isnull(genotype): + return np.nan + return "".join(get_complement(base) for base in genotype) + + +def complement_one_allele(genotype: str) -> str: + """Get the complement of only the first allele of a genotype. + + The second allele is preserved unchanged. This is useful for simulating + partial strand complementation in test data. + + Parameters + ---------- + genotype : str + A two-character genotype string (e.g., "AT", "CG") + + Returns + ------- + str + Genotype with first allele complemented, or np.nan if input is null + """ + if pd.isnull(genotype): + return np.nan + return get_complement(genotype[0]) + genotype[1] + + +def create_snp_df( + rsid: list[str], + chrom: list[str], + pos: list[int], + genotype: list[str], +) -> pd.DataFrame: + """Create a normalized SNP DataFrame. + + Parameters + ---------- + rsid : list of str + SNP identifiers (becomes the index) + chrom : list of str + Chromosome values + pos : list of int + Position values + genotype : list of str + Genotype values + + Returns + ------- + ~pandas.DataFrame + DataFrame with rsid index and chrom, pos, genotype columns + """ + df = pd.DataFrame( + {"rsid": rsid, "chrom": chrom, "pos": pos, "genotype": genotype}, + columns=["rsid", "chrom", "pos", "genotype"], + ) + df = df.astype(NORMALIZED_DTYPES) + df = df.set_index("rsid") + return df + + +def create_simulated_snp_df( + chrom: str = "1", + pos_start: int = 1, + pos_max: int = 248140902, + pos_step: int = 100, + pos_dtype: type = np.uint32, + genotype: str = "AA", + insert_nulls: bool = True, + null_snp_step: int = 101, + complement_genotype_one_allele: bool = False, + complement_genotype_two_alleles: bool = False, + complement_snp_step: int = 50, +) -> pd.DataFrame: + """Create a simulated SNP DataFrame for testing. + + This is the core logic for creating simulated SNP data. Each project + can wrap this to assign to their specific object types. + + Parameters + ---------- + chrom : str + Chromosome value for all SNPs (default: "1") + pos_start : int + Starting position (default: 1) + pos_max : int + Maximum position (default: 248140902) + pos_step : int + Step between positions (default: 100) + pos_dtype : type + Numpy dtype for positions (default: np.uint32) + genotype : str + Default genotype for all SNPs (default: "AA") + insert_nulls : bool + Whether to insert null genotypes (default: True) + null_snp_step : int + Insert null every N SNPs (default: 101) + complement_genotype_one_allele : bool + Complement first allele at intervals (default: False) + complement_genotype_two_alleles : bool + Complement both alleles at intervals (default: False) + complement_snp_step : int + Apply complement every N SNPs (default: 50) + + Returns + ------- + ~pandas.DataFrame + DataFrame with rsid index and chrom, pos, genotype columns + """ + positions = np.arange(pos_start, pos_max, pos_step, dtype=pos_dtype) + snps = pd.DataFrame( + {"chrom": chrom}, + index=pd.Index([f"rs{x + 1}" for x in range(len(positions))], name="rsid"), + ) + snps["pos"] = positions + snps["genotype"] = genotype + + if insert_nulls: + snps.loc[snps.iloc[0::null_snp_step, :].index, "genotype"] = np.nan + + indices = snps.iloc[0::complement_snp_step, :].index + if complement_genotype_two_alleles: + snps.loc[indices, "genotype"] = snps.loc[indices, "genotype"].apply( + complement_genotype + ) + elif complement_genotype_one_allele: + snps.loc[indices, "genotype"] = snps.loc[indices, "genotype"].apply( + complement_one_allele + ) + + return snps + + +def assert_series_equal_with_string_dtype( + left: pd.Series, + right: pd.Series, + test_case: Any = None, + **kwargs, +) -> None: + """Assert Series are equal, accepting both object and StringDtype for string data. + + In Python 3.14+, pandas infers StringDtype for string data instead of object. + This function compares Series without strict dtype matching for string data. + + Parameters + ---------- + left : ~pandas.Series + First Series to compare + right : ~pandas.Series + Second Series to compare + test_case : object, optional + Object with assertTrue method for assertions (uses assert if None) + **kwargs : dict + Additional arguments passed to pd.testing.assert_series_equal + """ + import pandas as pd + + # Verify string series have string or object dtypes + if is_string_dtype(left.dtype) or is_object_dtype(left.dtype): + right_is_string = is_string_dtype(right.dtype) or is_object_dtype(right.dtype) + if test_case: + test_case.assertTrue( + right_is_string, + f"Right series dtype {right.dtype} should be string/object type", + ) + else: + assert right_is_string, ( + f"Right series dtype {right.dtype} should be string/object type" + ) + # Compare Series without strict dtype matching + pd.testing.assert_series_equal(left, right, check_dtype=False, **kwargs) + + +def assert_frame_equal_with_string_index( + left: pd.DataFrame, + right: pd.DataFrame, + test_case: Any = None, + **kwargs, +) -> None: + """Assert DataFrames are equal, accepting both object and StringDtype for string columns. + + In Python 3.14+, pandas infers StringDtype for string columns/indices instead of object. + This function validates that string columns have string types, then compares the + DataFrames without strict dtype matching for object/string columns. + + Parameters + ---------- + left : ~pandas.DataFrame + First DataFrame to compare + right : ~pandas.DataFrame + Second DataFrame to compare + test_case : object, optional + Object with assertTrue method for assertions (uses assert if None) + **kwargs : dict + Additional arguments passed to pd.testing.assert_frame_equal + """ + import pandas as pd + + def _assert(condition: bool, message: str) -> None: + if test_case: + test_case.assertTrue(condition, message) + else: + assert condition, message + + # Verify index dtypes are string types if they're named 'rsid' + if left.index.name == "rsid": + _assert( + is_string_dtype(left.index.dtype), + f"Left index dtype {left.index.dtype} is not a string type", + ) + if right.index.name == "rsid": + _assert( + is_string_dtype(right.index.dtype), + f"Right index dtype {right.index.dtype} is not a string type", + ) + + # Verify string columns (chrom, genotype) have string dtypes + for col in ["chrom", "genotype"]: + if col in left.columns: + _assert( + is_string_dtype(left[col].dtype) or is_object_dtype(left[col].dtype), + f"Left column '{col}' dtype {left[col].dtype} is not a string/object type", + ) + if col in right.columns: + _assert( + is_string_dtype(right[col].dtype) or is_object_dtype(right[col].dtype), + f"Right column '{col}' dtype {right[col].dtype} is not a string/object type", + ) + + # Compare DataFrames without strict dtype matching for string columns + pd.testing.assert_frame_equal( + left, right, check_index_type=False, check_dtype=False, **kwargs + ) + + +class SNPsTestMixin: + """Mixin class providing common test assertions and utilities for SNP DataFrames. + + This mixin can be combined with unittest.TestCase to add convenient + assertion methods for comparing SNP DataFrames with flexible string dtype handling, + plus common test utilities like creating test DataFrames. + + Example + ------- + >>> class MyTestCase(SNPsTestMixin, TestCase): + ... def test_something(self): + ... df = self.generic_snps() + ... self.assert_frame_equal_with_string_index(df, expected_df) + """ + + @property + def downloads_enabled(self) -> bool: + """Check if external downloads are enabled for tests. + + Only download from external resources when an environment variable named + "DOWNLOADS_ENABLED" is set to "true". + + Returns + ------- + bool + """ + return os.getenv("DOWNLOADS_ENABLED") == "true" + + @staticmethod + def get_complement(base: str) -> str: + """Get the complement of a DNA base. + + See :func:`get_complement` for details. + """ + return get_complement(base) + + def complement_genotype(self, genotype: str) -> str: + """Get the complement of a genotype (both alleles). + + See :func:`complement_genotype` for details. + """ + return complement_genotype(genotype) + + def complement_one_allele(self, genotype: str) -> str: + """Get the complement of only the first allele of a genotype. + + See :func:`complement_one_allele` for details. + """ + return complement_one_allele(genotype) + + @staticmethod + def create_snp_df( + rsid: list[str], + chrom: list[str], + pos: list[int], + genotype: list[str], + ) -> pd.DataFrame: + """Create a normalized SNP DataFrame. + + See :func:`create_snp_df` for details. + """ + return create_snp_df(rsid, chrom, pos, genotype) + + def generic_snps(self) -> pd.DataFrame: + """Create a generic SNP DataFrame for testing. + + Returns + ------- + ~pandas.DataFrame + DataFrame with 8 SNPs (rs1-rs8) on chromosome 1 + """ + return create_snp_df( + rsid=[f"rs{i}" for i in range(1, 9)], + chrom=["1"] * 8, + pos=list(range(101, 109)), + genotype=["AA", "CC", "GG", "TT", np.nan, "GC", "TC", "AT"], + ) + + def assert_series_equal_with_string_dtype(self, left, right, **kwargs): + """Assert Series are equal, accepting both object and StringDtype for string data. + + See :func:`assert_series_equal_with_string_dtype` for details. + """ + assert_series_equal_with_string_dtype(left, right, test_case=self, **kwargs) + + def assert_frame_equal_with_string_index(self, left, right, **kwargs): + """Assert DataFrames are equal, accepting both object and StringDtype for string columns. + + See :func:`assert_frame_equal_with_string_index` for details. + """ + assert_frame_equal_with_string_index(left, right, test_case=self, **kwargs) diff --git a/tests/__init__.py b/tests/__init__.py index d7d1a0a..d0cb797 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -9,10 +9,11 @@ from pandas.api.types import is_object_dtype, is_string_dtype, is_unsigned_integer_dtype from snps import SNPs +from snps.testing import SNPsTestMixin, create_simulated_snp_df from snps.utils import gzip_file, zip_file -class BaseSNPsTestCase(TestCase): +class BaseSNPsTestCase(SNPsTestMixin, TestCase): def simulate_snps( self, chrom="1", @@ -27,97 +28,22 @@ def simulate_snps( complement_snp_step=50, ): s = SNPs() - s._build = 37 - - positions = np.arange(pos_start, pos_max, pos_step, dtype=np.uint32) - snps = pd.DataFrame( - {"chrom": chrom}, - index=pd.Index( - ["rs" + str(x + 1) for x in range(len(positions))], name="rsid" - ), + s._snps = create_simulated_snp_df( + chrom=chrom, + pos_start=pos_start, + pos_max=pos_max, + pos_step=pos_step, + pos_dtype=np.uint32, + genotype=genotype, + insert_nulls=insert_nulls, + null_snp_step=null_snp_step, + complement_genotype_one_allele=complement_genotype_one_chrom, + complement_genotype_two_alleles=complement_genotype_two_chroms, + complement_snp_step=complement_snp_step, ) - snps["pos"] = positions - snps["genotype"] = genotype - - if insert_nulls: - snps.loc[snps.iloc[0::null_snp_step, :].index, "genotype"] = np.nan - - indices = snps.iloc[0::complement_snp_step, :].index - if complement_genotype_two_chroms: - snps.loc[indices, "genotype"] = snps.loc[indices, "genotype"].apply( - self.complement_two_chroms - ) - elif complement_genotype_one_chrom: - snps.loc[indices, "genotype"] = snps.loc[indices, "genotype"].apply( - self.complement_one_chrom - ) - - s._snps = snps - return s - @property - def downloads_enabled(self): - """Property indicating if downloads are enabled. - - Only download from external resources when an environment variable named - "DOWNLOADS_ENABLED" is set to "true". - - Returns - ------- - bool - """ - return True if os.getenv("DOWNLOADS_ENABLED") == "true" else False - - @staticmethod - def get_complement(base): - if base == "A": - return "T" - elif base == "G": - return "C" - elif base == "C": - return "G" - elif base == "T": - return "A" - else: - return base - - def complement_one_chrom(self, genotype): - if pd.isnull(genotype): - return np.nan - - complement = "" - - for base in list(genotype): - complement += self.get_complement(base) - complement += genotype[1] - return complement - - def complement_two_chroms(self, genotype): - if pd.isnull(genotype): - return np.nan - - complement = "" - - for base in list(genotype): - complement += self.get_complement(base) - - return complement - - @staticmethod - def create_snp_df(rsid, chrom, pos, genotype): - df = pd.DataFrame( - {"rsid": rsid, "chrom": chrom, "pos": pos, "genotype": genotype}, - columns=["rsid", "chrom", "pos", "genotype"], - ) - df.rsid = df.rsid.astype(object) - df.chrom = df.chrom.astype(object) - df.pos = df.pos.astype(np.uint32) - df.genotype = df.genotype.astype(object) - df = df.set_index("rsid") - return df - def load_assign_PAR_SNPs(self, path): """Load and assign PAR SNPs. @@ -502,14 +428,6 @@ def snps_GRCh38_PAR(self): genotype=["AA", "AA", "AA"], ) - def generic_snps(self): - return self.create_snp_df( - rsid=["rs" + str(i) for i in range(1, 9)], - chrom=["1"] * 8, - pos=list(range(101, 109)), - genotype=["AA", "CC", "GG", "TT", np.nan, "GC", "TC", "AT"], - ) - def generic_snps_vcf(self): df = self.generic_snps() return pd.concat( @@ -646,78 +564,6 @@ def run_parsing_tests_vcf( snps_df, ) - def assert_series_equal_with_string_dtype(self, left, right, **kwargs): - """Assert Series are equal, accepting both object and StringDtype for string data. - - In Python 3.14+, pandas infers StringDtype for string data instead of object. - This wrapper compares Series without strict dtype matching for string data. - - Parameters - ---------- - left : pd.Series - First Series to compare - right : pd.Series - Second Series to compare - **kwargs : dict - Additional arguments passed to pd.testing.assert_series_equal - """ - # Verify string series have string or object dtypes - if is_string_dtype(left.dtype) or is_object_dtype(left.dtype): - self.assertTrue( - is_string_dtype(right.dtype) or is_object_dtype(right.dtype), - f"Right series dtype {right.dtype} should be string/object type", - ) - # Compare Series without strict dtype matching - pd.testing.assert_series_equal(left, right, check_dtype=False, **kwargs) - - def assert_frame_equal_with_string_index(self, left, right, **kwargs): - """Assert DataFrames are equal, accepting both object and StringDtype for string columns. - - In Python 3.14+, pandas infers StringDtype for string columns/indices instead of object. - This wrapper validates that string columns have string types, then compares the - DataFrames without strict dtype matching for object/string columns. - - Parameters - ---------- - left : pd.DataFrame - First DataFrame to compare - right : pd.DataFrame - Second DataFrame to compare - **kwargs : dict - Additional arguments passed to pd.testing.assert_frame_equal - """ - # Verify index dtypes are string types if they're named 'rsid' - if left.index.name == "rsid": - self.assertTrue( - is_string_dtype(left.index.dtype), - f"Left index dtype {left.index.dtype} is not a string type", - ) - if right.index.name == "rsid": - self.assertTrue( - is_string_dtype(right.index.dtype), - f"Right index dtype {right.index.dtype} is not a string type", - ) - - # Verify string columns (chrom, genotype) have string dtypes - for col in ["chrom", "genotype"]: - if col in left.columns: - self.assertTrue( - is_string_dtype(left[col].dtype) - or is_object_dtype(left[col].dtype), - f"Left column '{col}' dtype {left[col].dtype} is not a string/object type", - ) - if col in right.columns: - self.assertTrue( - is_string_dtype(right[col].dtype) - or is_object_dtype(right[col].dtype), - f"Right column '{col}' dtype {right[col].dtype} is not a string/object type", - ) - - # Compare DataFrames without strict dtype matching for string columns - pd.testing.assert_frame_equal( - left, right, check_index_type=False, check_dtype=False, **kwargs - ) - def make_normalized_dataframe_assertions(self, df): self.assertEqual(df.index.name, "rsid") # Accept both object dtype and StringDtype (used in Python 3.14+) From 4f0e47e2230eabb9c40f0c753ce2e69a12dde2bb Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Mon, 26 Jan 2026 22:38:33 -0800 Subject: [PATCH 3/3] Update description --- docs/conf.py | 2 +- src/snps/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 33b329e..9a4d5a7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -143,7 +143,7 @@ "snps Documentation", author, "snps", - "Tools for reading, writing, merging, and remapping SNPs.", + "tools for reading, writing, generating, merging, and remapping SNPs", "Miscellaneous", ) ] diff --git a/src/snps/__init__.py b/src/snps/__init__.py index 37b5c9c..8f975a0 100644 --- a/src/snps/__init__.py +++ b/src/snps/__init__.py @@ -1,6 +1,6 @@ """`snps` -tools for reading, writing, merging, and remapping SNPs +tools for reading, writing, generating, merging, and remapping SNPs """