From 90cd8bfa6540de3bae5b4960fb834b27547d3fb8 Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Fri, 9 Jan 2026 21:00:37 -0800 Subject: [PATCH 1/2] Add synthetic SNPs generation feature --- .gitignore | 2 + .vscode/tasks.json | 30 + README.rst | 207 ++----- analysis/parse-opensnp-files/README.rst | 15 - analysis/parse-opensnp-files/get_file.py | 21 - .../parse_opensnp_files.py | 131 ----- analysis/xy-chrom-snp-ratios/README.rst | 11 + docs/index.rst | 2 +- docs/snps.rst | 8 + setup.cfg | 1 + src/snps/build_constants.py | 133 +++++ src/snps/io/__init__.py | 3 +- src/snps/io/generator.py | 545 ++++++++++++++++++ src/snps/io/reader.py | 24 +- src/snps/resources.py | 148 ++--- src/snps/snps.py | 68 +-- tests/io/test_generator.py | 480 +++++++++++++++ tests/test_resources.py | 107 ++-- 18 files changed, 1376 insertions(+), 560 deletions(-) create mode 100644 .vscode/tasks.json delete mode 100644 analysis/parse-opensnp-files/README.rst delete mode 100644 analysis/parse-opensnp-files/get_file.py delete mode 100644 analysis/parse-opensnp-files/parse_opensnp_files.py create mode 100644 src/snps/build_constants.py create mode 100644 src/snps/io/generator.py create mode 100644 tests/io/test_generator.py diff --git a/.gitignore b/.gitignore index 038d98ca..98ecec51 100644 --- a/.gitignore +++ b/.gitignore @@ -103,10 +103,12 @@ ENV/ # development .idea/ +.DS_Store # snps output*/ resources/* +synthetic_23andme.txt.gz # snps tests tests/output/* diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 00000000..9127f594 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,30 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Run All Tests", + "type": "shell", + "command": "source .venv/bin/activate && python -m pytest tests README.rst", + "group": { + "kind": "test", + "isDefault": true + }, + "presentation": { + "reveal": "always", + "panel": "dedicated" + }, + "problemMatcher": [] + }, + { + "label": "Test README Examples", + "type": "shell", + "command": "source .venv/bin/activate && python -m doctest README.rst -v", + "group": "test", + "presentation": { + "reveal": "always", + "panel": "dedicated" + }, + "problemMatcher": [] + } + ] +} diff --git a/README.rst b/README.rst index 43ffd4a4..a7831a20 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ snps ==== -tools for reading, writing, merging, and remapping SNPs 🧬 +tools for reading, writing, generating, merging, and remapping SNPs 🧬 ``snps`` *strives to be an easy-to-use and accessible open-source library for working with genotype data* @@ -19,8 +19,9 @@ Input / Output - Read and write VCF files (e.g., convert `23andMe `_ to VCF) - Merge raw data files from different DNA tests, identifying discrepant SNPs in the process - Read data in a variety of formats (e.g., files, bytes, compressed with `gzip` or `zip`) -- Handle several variations of file types, validated via - `openSNP parsing analysis `_ +- Handle several variations of file types, historically validated using + data from `openSNP `_ +- Generate synthetic genotype data for testing and examples Build / Assembly Detection and Remapping ```````````````````````````````````````` @@ -90,175 +91,91 @@ capability, ``snps`` can be installed with `ezancestry >> import logging, sys ->>> logger = logging.getLogger() ->>> logger.setLevel(logging.INFO) ->>> logger.addHandler(logging.StreamHandler(sys.stdout)) - -Now we're ready to download some example data from `openSNP `_: +To try these examples, first generate some sample data: >>> from snps.resources import Resources ->>> r = Resources() ->>> paths = r.download_example_datasets() -Downloading resources/662.23andme.340.txt.gz -Downloading resources/662.ftdna-illumina.341.csv.gz +>>> paths = Resources().create_example_datasets() -Load Raw Data -````````````` -Load a `23andMe `_ raw data file: +Load a Raw Data File +```````````````````` +Load a raw data file exported from a DNA testing source (e.g., +`23andMe `_, `AncestryDNA `_, +`Family Tree DNA `_): >>> from snps import SNPs ->>> s = SNPs("resources/662.23andme.340.txt.gz") +>>> s = SNPs("resources/sample1.23andme.txt.gz") + +``snps`` automatically detects the source format and `normalizes +`_ the data: + >>> s.source '23andMe' >>> s.count -991786 +991767 +>>> s.build +37 +>>> s.assembly +'GRCh37' -The ``SNPs`` class accepts a path to a file or a bytes object. A ``Reader`` class attempts to -infer the data source and load the SNPs. The loaded SNPs are -`normalized `_ and -available via a ``pandas.DataFrame``: +The SNPs are available as a ``pandas.DataFrame``: >>> df = s.snps >>> df.columns.values array(['chrom', 'pos', 'genotype'], dtype=object) ->>> df.index.name -'rsid' ->>> df.chrom.dtype.name -'object' ->>> df.pos.dtype.name -'uint32' ->>> df.genotype.dtype.name -'object' >>> len(df) -991786 - -``snps`` also attempts to detect the build / assembly of the data: - ->>> s.build -37 ->>> s.build_detected -True ->>> s.assembly -'GRCh37' +991767 Merge Raw Data Files ```````````````````` -The dataset consists of raw data files from two different DNA testing sources - let's combine -these files. Specifically, we'll update the ``SNPs`` object with SNPs from a -`Family Tree DNA `_ file. - ->>> merge_results = s.merge([SNPs("resources/662.ftdna-illumina.341.csv.gz")]) -Merging SNPs('662.ftdna-illumina.341.csv.gz') -SNPs('662.ftdna-illumina.341.csv.gz') has Build 36; remapping to Build 37 -Downloading resources/NCBI36_GRCh37.tar.gz -27 SNP positions were discrepant; keeping original positions -151 SNP genotypes were discrepant; marking those as null ->>> s.source -'23andMe, FTDNA' ->>> s.count -1006960 ->>> s.build -37 ->>> s.build_detected -True +Combine SNPs from multiple files (e.g., combine data from different testing companies): -If the SNPs being merged have a build that differs from the destination build, the SNPs to merge -will be remapped automatically. After this example merge, the build is still detected, since the -build was detected for all ``SNPs`` objects that were merged. +>>> results = s.merge([SNPs("resources/sample2.ftdna.csv.gz")]) +>>> s.count +1006949 -As the data gets added, it's compared to the existing data, and SNP position and genotype -discrepancies are identified. (The discrepancy thresholds can be tuned via parameters.) These -discrepant SNPs are available for inspection after the merge via properties of the ``SNPs`` object. +SNPs are compared during the merge. Position and genotype discrepancies are identified and +can be inspected via properties of the ``SNPs`` object: +>>> len(s.discrepant_merge_positions) +27 >>> len(s.discrepant_merge_genotypes) -151 - -Additionally, any non-called / null genotypes will be updated during the merge, if the file -being merged has a called genotype for the SNP. - -Moreover, ``merge`` takes a ``chrom`` parameter - this enables merging of only SNPs associated -with the specified chromosome (e.g., "Y" or "MT"). - -Finally, ``merge`` returns a list of ``dict``, where each ``dict`` has information corresponding -to the results of each merge (e.g., SNPs in common). - ->>> sorted(list(merge_results[0].keys())) -['common_rsids', 'discrepant_genotype_rsids', 'discrepant_position_rsids', 'merged'] ->>> merge_results[0]["merged"] -True ->>> len(merge_results[0]["common_rsids"]) -692918 +156 Remap SNPs `````````` -Now, let's remap the merged SNPs to change the assembly / build: +Convert SNPs between genome assemblies (Build 36/NCBI36, Build 37/GRCh37, Build 38/GRCh38): ->>> s.snps.loc["rs3094315"].pos -752566 >>> chromosomes_remapped, chromosomes_not_remapped = s.remap(38) -Downloading resources/GRCh37_GRCh38.tar.gz ->>> s.build -38 >>> s.assembly 'GRCh38' ->>> s.snps.loc["rs3094315"].pos -817186 - -SNPs can be remapped between Build 36 (``NCBI36``), Build 37 (``GRCh37``), and Build 38 -(``GRCh38``). Save SNPs ````````` -Ok, so far we've merged the SNPs from two files (ensuring the same build in the process and -identifying discrepancies along the way). Then, we remapped the SNPs to Build 38. Now, let's save -the merged and remapped dataset consisting of 1M+ SNPs to a tab-separated values (TSV) file: - ->>> saved_snps = s.to_tsv("out.txt") -Saving output/out.txt ->>> print(saved_snps) -output/out.txt - -Moreover, let's get the reference sequences for this assembly and save the SNPs as a VCF file: - ->>> saved_snps = s.to_vcf("out.vcf") -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.2.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.3.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.4.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.5.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.6.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.7.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.8.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.9.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.10.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.11.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.12.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.13.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.14.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.15.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.16.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.17.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.18.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.19.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.21.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.22.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.X.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.Y.fa.gz -Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz -Saving output/out.vcf -1 SNP positions were found to be discrepant when saving VCF - -When saving a VCF, if any SNPs have positions outside of the reference sequence, they are marked -as discrepant and are available via a property of the ``SNPs`` object. - -All `output files `_ are saved to the -output directory. +Save SNPs to common file formats: + +>>> _ = s.to_tsv("output.txt") +>>> _ = s.to_csv("output.csv") + +To save as VCF, ``snps`` automatically downloads the required reference sequences for the +assembly. This ensures the REF alleles in the VCF are accurate: + +>>> _ = s.to_vcf("output.vcf") # doctest: +SKIP + +All output files are saved to the `output directory +`_. + +Generate Synthetic Data +``````````````````````` +Generate synthetic genotype data for testing, examples, or demonstrations: + +>>> from snps.io import SyntheticSNPGenerator +>>> gen = SyntheticSNPGenerator(build=37, seed=123) +>>> gen.save_as_23andme("synthetic_23andme.txt.gz", num_snps=10000) +'synthetic_23andme.txt.gz' + +The generator supports multiple output formats (23andMe, AncestryDNA, FTDNA) and +automatically injects build-specific marker SNPs to ensure accurate build detection. Documentation ------------- @@ -266,11 +183,13 @@ Documentation is available `here `_. Acknowledgements ---------------- -Thanks to Mike Agostino, Padma Reddy, Kevin Arvai, `openSNP `_, -`Open Humans `_, and `Sano Genetics `_. +Thanks to Mike Agostino, Padma Reddy, Kevin Arvai, `Open Humans `_, +and `Sano Genetics `_. This project was historically validated using +data from `openSNP `_. -``snps`` incorporates code and concepts generated with the assistance of -`OpenAI's `_ `ChatGPT `_. ✨ +``snps`` incorporates code and concepts generated with the assistance of various +generative AI tools (including but not limited to `ChatGPT `_, +`Grok `_, and `Claude `_). ✨ License ------- diff --git a/analysis/parse-opensnp-files/README.rst b/analysis/parse-opensnp-files/README.rst deleted file mode 100644 index 6d6f8b90..00000000 --- a/analysis/parse-opensnp-files/README.rst +++ /dev/null @@ -1,15 +0,0 @@ -parse-opensnp-files -=================== -scripts to load and debug parsing of openSNP datadump files - -Method ------- -Attempt to parse each file in the `openSNP `_ datadump by creating a -``SNPs`` object. For files where SNPs were loaded, save summary statistics to a dataframe and -output as a CSV. For files where no SNPs were loaded, save a message for each file indicating -the issue and optionally extract these files from the datadump for debugging. - -Results -------- -As of May 2020, ``snps`` can parse ~96.6% of the genotype files in the datadump. Additionally, -``snps`` can detect the build in ~99.9% of those files. diff --git a/analysis/parse-opensnp-files/get_file.py b/analysis/parse-opensnp-files/get_file.py deleted file mode 100644 index 2e1ae536..00000000 --- a/analysis/parse-opensnp-files/get_file.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Get a file from the openSNP datadump for debugging.""" - -import os - -from atomicwrites import atomic_write - -from snps.resources import Resources -from snps.utils import create_dir - -OUTPUT_DIR = "output" -FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt" - -if __name__ == "__main__": - # create output directory for this example - create_dir(OUTPUT_DIR) - - # assume script is being run from examples dir - r = Resources(resources_dir="../../resources") - - with atomic_write(os.path.join(OUTPUT_DIR, FILE), mode="wb") as f: - f.write(r.load_opensnp_datadump_file(FILE)) diff --git a/analysis/parse-opensnp-files/parse_opensnp_files.py b/analysis/parse-opensnp-files/parse_opensnp_files.py deleted file mode 100644 index 7a55112f..00000000 --- a/analysis/parse-opensnp-files/parse_opensnp_files.py +++ /dev/null @@ -1,131 +0,0 @@ -"""Parse openSNP datadump files. - -Attempt to parse each file in the openSNP datadump. For files where SNPs were loaded, -save summary statistics to a dataframe and output as a CSV. For files where no SNPs were -loaded, save a message for each file indicating the issue and optionally extract these -files from the datadump for debugging. - -""" - -import logging -import os -import random - -import pandas as pd -from atomicwrites import atomic_write - -from snps import SNPs -from snps.resources import Resources -from snps.utils import Parallelizer, clean_str, create_dir, save_df_as_csv - -OUTPUT_DIR = "output" -EXTRACT_FILES = True - -# create output directory for this example -create_dir(OUTPUT_DIR) - -# assume script is being run from examples dir -r = Resources(resources_dir="../../resources") - -# setup logger to output to file in output directory -logging.basicConfig( - filename=f"{os.path.join(OUTPUT_DIR, 'parse-opensnp-files.txt')}", - format="%(asctime)s: %(message)s", - filemode="w", - level=logging.INFO, -) - -logger = logging.getLogger() - - -def load_file(task): - file = task["file"] - - try: - s = SNPs(r.load_opensnp_datadump_file(file), assign_par_snps=False) - except Exception as err: - return {"msg": str(err).strip()[:100], "file": file} - - if s.count != 0: - d = s.summary - d.update({"file": file}) - return d - else: - return {"msg": "no SNPs processed", "file": file} - - -def main(): - logger.info("start") - - # get filenames from openSNP data dump - filenames = r.get_opensnp_datadump_filenames() - - filenames = [ - filename - for filename in filenames - if "readme" not in filename and "phenotype" not in filename - ] - - # draw a sample from the observations - random.seed(1) - SAMPLE_SIZE = len(filenames) - # SAMPLE_SIZE = 10 - samples = random.sample(range(len(filenames)), SAMPLE_SIZE) - - # setup tasks for parallelizing / execution on multiple cores - p = Parallelizer(parallelize=True) - tasks = [{"file": filenames[i]} for i in samples] - - # run tasks; results is a list of dicts - results = p(load_file, tasks) - - # get results from `load_file` where `count` was non-zero - rows = [item for item in results if "msg" not in item] - - df = pd.DataFrame( - rows, - columns=["file", "source", "build", "build_detected", "chromosomes", "count"], - ) - - save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv") - - # log parsing statistics - file_count = len(filenames) - logger.info(f"{file_count} files in the openSNP datadump") - logger.info(f"{(len(df) / file_count):.2%} of openSNP datadump files parsed") - logger.info( - f"build detected in {len(df.loc[df.build_detected]) / len(df):.2%} of files parsed" - ) - - # extract files from the datadump where `load_file` returned a message - if EXTRACT_FILES: - # group files with same message (e.g., {"some message": ["file1", "file2"], ...}) - d = {} - for result in results: - if "msg" in result: - if result["msg"] in d: - d[result["msg"]].append(result["file"]) - else: - d[result["msg"]] = [result["file"]] - - # add messages / file filters as necessary... - d["build not detected"] = list(df.loc[~df.build_detected].file.values) - - # extract files that have messages for debugging - for msg, files in d.items(): - if len(files) == 0: - continue - - # create a directory for each message (prefix indicates number of files) - path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}") - create_dir(path) - # save each file with message into created directory - for filename in files: - with atomic_write(os.path.join(path, filename), mode="wb") as f: - f.write(r.load_opensnp_datadump_file(filename)) - - logger.info("stop") - - -if __name__ == "__main__": - main() diff --git a/analysis/xy-chrom-snp-ratios/README.rst b/analysis/xy-chrom-snp-ratios/README.rst index 8fedb789..5191e83c 100644 --- a/analysis/xy-chrom-snp-ratios/README.rst +++ b/analysis/xy-chrom-snp-ratios/README.rst @@ -2,6 +2,17 @@ xy-chrom-snp-ratios =================== an analysis of heterozygous X SNP and not-null Y SNP ratios +.. note:: + + **Historical Analysis** + + This analysis was conducted using openSNP data dump functionality that has been + removed from the library. The code in this directory is **non-functional** with + current versions of ``snps`` and is preserved for historical reference only. + + The analysis results and visualization remain valid as a demonstration of sex + determination from SNP data. + Method ------ All files in the `openSNP `_ data dump that ``snps`` could read were diff --git a/docs/index.rst b/docs/index.rst index 9dfa6878..986d3292 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,7 +5,7 @@ `snps` ====== -*tools for reading, writing, merging, and remapping SNPs* +*tools for reading, writing, generating, merging, and remapping SNPs* .. toctree:: diff --git a/docs/snps.rst b/docs/snps.rst index d9110f28..ebf89552 100644 --- a/docs/snps.rst +++ b/docs/snps.rst @@ -47,6 +47,14 @@ snps.io.writer :show-inheritance: :exclude-members: write_file +snps.io.generator +~~~~~~~~~~~~~~~~~ + +.. automodule:: snps.io.generator + :members: + :undoc-members: + :show-inheritance: + snps.resources -------------- diff --git a/setup.cfg b/setup.cfg index fb025f86..180fc9f2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,6 +19,7 @@ addopts = -ra --strict --tb=short + --doctest-glob=README.rst # http://coverage.readthedocs.io/en/latest/ [coverage:run] diff --git a/src/snps/build_constants.py b/src/snps/build_constants.py new file mode 100644 index 00000000..7fd1a9e4 --- /dev/null +++ b/src/snps/build_constants.py @@ -0,0 +1,133 @@ +"""Centralized constants for genome builds and marker SNPs. + +This module provides single source of truth for: +- Valid genome builds +- Marker SNPs for build detection with positions +- Chromosome sizes for each build + +References +---------- +1. Genome Reference Consortium, https://www.ncbi.nlm.nih.gov/grc + - GRCh38: https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh38 + - GRCh37: https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh37 +2. Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center + for Biotechnology Information, National Library of Medicine. dbSNP accession: rs3094315, + rs11928389, rs2500347, rs964481, rs2341354, rs3850290, and rs1329546 + (dbSNP Build ID: 151). Available from: http://www.ncbi.nlm.nih.gov/SNP/ +3. UCSC Genome Browser chromosome size data: + - NCBI36/hg18: https://hgdownload.soe.ucsc.edu/goldenPath/hg18/bigZips/hg18.chrom.sizes + - GRCh37/hg19: https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes + - GRCh38/hg38: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes +""" + +from typing import Dict, Final, Tuple, Union + +# Valid genome builds supported by this library +VALID_BUILDS: Final[Tuple[int, int, int]] = (36, 37, 38) + +# Build name mappings +BUILD_ASSEMBLY_MAP: Final[Dict[int, str]] = { + 36: "NCBI36", + 37: "GRCh37", + 38: "GRCh38", +} + +# Marker SNPs for build detection +# These SNPs have known positions across different builds and are used +# to automatically detect which build a genotype file uses +# Format: {rsid: {"chrom": str, 36: pos, 37: pos, 38: pos}} +BUILD_MARKER_SNPS: Final[Dict[str, Dict[Union[str, int], Union[str, int]]]] = { + "rs3094315": {"chrom": "1", 36: 742429, 37: 752566, 38: 817186}, + "rs11928389": {"chrom": "1", 36: 50908372, 37: 50927009, 38: 50889578}, + "rs2500347": {"chrom": "1", 36: 143649677, 37: 144938320, 38: 148946169}, + "rs964481": {"chrom": "20", 36: 27566744, 37: 27656823, 38: 27638706}, + "rs2341354": {"chrom": "1", 36: 908436, 37: 918573, 38: 983193}, + "rs3850290": {"chrom": "2", 36: 22315141, 37: 23245301, 38: 22776092}, + "rs1329546": {"chrom": "1", 36: 135302086, 37: 135474420, 38: 136392261}, +} + +# Chromosome sizes indexed by build, then chromosome +# Values are approximate sizes in base pairs +CHROM_SIZES: Final[Dict[int, Dict[str, int]]] = { + 36: { + "1": 247249719, + "2": 242951149, + "3": 199501827, + "4": 191273063, + "5": 180857866, + "6": 170899992, + "7": 158821424, + "8": 146274826, + "9": 140273252, + "10": 135374737, + "11": 134452384, + "12": 132349534, + "13": 114142980, + "14": 106368585, + "15": 100338915, + "16": 88827254, + "17": 78774742, + "18": 76117153, + "19": 63811651, + "20": 62435964, + "21": 46944323, + "22": 49691432, + "X": 154913754, + "Y": 57772954, + "MT": 16571, + }, + 37: { + "1": 249250621, + "2": 243199373, + "3": 198022430, + "4": 191154276, + "5": 180915260, + "6": 171115067, + "7": 159138663, + "8": 146364022, + "9": 141213431, + "10": 135534747, + "11": 135006516, + "12": 133851895, + "13": 115169878, + "14": 107349540, + "15": 102531392, + "16": 90354753, + "17": 81195210, + "18": 78077248, + "19": 59128983, + "20": 63025520, + "21": 48129895, + "22": 51304566, + "X": 155270560, + "Y": 59373566, + "MT": 16571, + }, + 38: { + "1": 248956422, + "2": 242193529, + "3": 198295559, + "4": 190214555, + "5": 181538259, + "6": 170805979, + "7": 159345973, + "8": 145138636, + "9": 138394717, + "10": 133797422, + "11": 135086622, + "12": 133275309, + "13": 114364328, + "14": 107043718, + "15": 101991189, + "16": 90338345, + "17": 83257441, + "18": 80373285, + "19": 58617616, + "20": 64444167, + "21": 46709983, + "22": 50818468, + "X": 156040895, + "Y": 57227415, + "MT": 16569, + }, +} diff --git a/src/snps/io/__init__.py b/src/snps/io/__init__.py index 455b39e9..c500c3ea 100644 --- a/src/snps/io/__init__.py +++ b/src/snps/io/__init__.py @@ -1,5 +1,6 @@ -"""Classes for reading and writing SNPs.""" +"""Classes for reading, writing, and generating SNPs.""" +from .generator import SyntheticSNPGenerator as SyntheticSNPGenerator from .reader import Reader as Reader from .reader import get_empty_snps_dataframe as get_empty_snps_dataframe from .writer import Writer as Writer diff --git a/src/snps/io/generator.py b/src/snps/io/generator.py new file mode 100644 index 00000000..ab4a1efa --- /dev/null +++ b/src/snps/io/generator.py @@ -0,0 +1,545 @@ +"""Generate synthetic genotype data for testing and examples.""" + +from __future__ import annotations + +import gzip +import logging +import os +from collections.abc import Callable +from io import StringIO +from typing import Any + +import numpy as np +import pandas as pd +from atomicwrites import atomic_write +from numpy.typing import NDArray +from pandas.api.types import CategoricalDtype + +from snps.build_constants import BUILD_MARKER_SNPS, CHROM_SIZES, VALID_BUILDS +from snps.constants import REFERENCE_SEQUENCE_CHROMS +from snps.io.reader import get_empty_snps_dataframe + +logger = logging.getLogger(__name__) + + +class SyntheticSNPGenerator: + """Generate realistic synthetic genotype data. + + This class generates synthetic SNP data that mimics real genotype files + from various DNA testing companies. The generated data is suitable for + testing, examples, and documentation. + + Parameters + ---------- + build : int + Genome build (36, 37, or 38), default is 37 + seed : int, optional + Random seed for reproducibility + + Examples + -------- + >>> gen = SyntheticSNPGenerator(build=37, seed=123) + >>> gen.save_as_23andme("output.txt", num_snps=10000) + 'output.txt' + """ + + def __init__(self, build: int = 37, seed: int | None = None) -> None: + if build not in VALID_BUILDS: + raise ValueError(f"Unsupported build: {build}. Use {VALID_BUILDS}.") + self.build: int = build + self.seed: int | None = seed + self.rng: np.random.Generator = np.random.default_rng(seed) + self.chrom_sizes: dict[str, int] = CHROM_SIZES[build] + + def generate_snps( + self, + num_snps: int = 10000, + chromosomes: list[str] | None = None, + missing_rate: float = 0.01, + inject_build_markers: bool = True, + ) -> pd.DataFrame: + """Generate a DataFrame of synthetic SNPs. + + Parameters + ---------- + num_snps : int + Approximate number of SNPs to generate + chromosomes : list of str, optional + Chromosomes to include (default: all autosomes plus X, Y, MT) + missing_rate : float + Proportion of SNPs with missing genotypes (default: 0.01) + inject_build_markers : bool + Inject known marker SNPs for build detection (default: True) + + Returns + ------- + pd.DataFrame + DataFrame with columns: rsid (index), chrom, pos, genotype + """ + if chromosomes is None: + chromosomes = list(REFERENCE_SEQUENCE_CHROMS) + + snps_per_chrom = self._calculate_snps_per_chromosome(num_snps, chromosomes) + all_snps = [] + rsid_counter = 1 + + for chrom in chromosomes: + n = snps_per_chrom[chrom] + chrom_size = self.chrom_sizes[chrom] + + # Cap SNP count to available positions (chromosome size - 1) + max_positions = chrom_size - 1 + n = min(n, max_positions) + + # Generate unique positions efficiently + positions = self._generate_unique_positions(n, chrom_size) + genotypes = self._generate_genotypes(n, missing_rate) + rsids = [f"rs{rsid_counter + i}" for i in range(n)] + rsid_counter += n + + all_snps.append( + pd.DataFrame( + { + "rsid": rsids, + "chrom": chrom, + "pos": positions, + "genotype": genotypes, + } + ) + ) + + if not all_snps: + return get_empty_snps_dataframe() + + df = pd.concat(all_snps, ignore_index=True).set_index("rsid") + df["chrom"] = df["chrom"].astype(object) + df["pos"] = df["pos"].astype(np.uint32) + df["genotype"] = df["genotype"].astype(object) + + if inject_build_markers: + df = self._inject_build_markers(df, chromosomes) + + # Renumber RSIDs sequentially while preserving build markers + df = self._renumber_rsids_sequentially(df) + + return df + + def create_example_dataset_pair(self, output_dir: str = ".") -> tuple[str, str]: + """Create a pair of realistic example datasets suitable for merging. + + Generates two correlated genotype files that share a large number of + common SNPs, with some discrepancies to demonstrate merge functionality. + + Parameters + ---------- + output_dir : str + Directory for output files + + Returns + ------- + tuple of (str, str) + Paths to (file1_23andme, file2_ftdna) + """ + # Generate base dataset (~700K SNPs shared between files) + base_snps = self._generate_base_snps(700000) + + # File 1: 23andMe format with ~991K SNPs + file1_df = self._create_file1_dataframe(base_snps, 291786) + + # File 2: FTDNA format with ~715K SNPs, including discrepancies + file2_df = self._create_file2_dataframe(base_snps, 15194) + + # Save files + path1 = os.path.join(output_dir, "sample1.23andme.txt.gz") + path2 = os.path.join(output_dir, "sample2.ftdna.csv.gz") + + logger.info(f"Creating {os.path.relpath(path1)}") + logger.info(f"Creating {os.path.relpath(path2)}") + + self._write_snps_as_23andme(file1_df, path1) + self._write_snps_as_ftdna(file2_df, path2) + + return path1, path2 + + def _generate_base_snps(self, num_snps: int) -> pd.DataFrame: + """Generate base SNPs with sequential rsIDs.""" + base = self.generate_snps(num_snps=num_snps, inject_build_markers=False) + base = self._sort_snps_dataframe(base).reset_index(drop=True) + base.index = pd.Index([f"rs{i + 1}" for i in range(len(base))], name="rsid") + return base + + def _create_file1_dataframe( + self, base_snps: pd.DataFrame, unique_count: int + ) -> pd.DataFrame: + """Create file 1 DataFrame (23andMe format). + + Note: RSIDs are NOT renumbered here to preserve RSID correspondence + with file2 for merge functionality. Unique SNPs use offset 10_000_001 + to clearly distinguish them from base SNPs (rs1-rs700000). + """ + # Use large offset to clearly separate unique SNPs from base SNPs + unique = self._generate_unique_snps(unique_count, 10_000_000) + df = self._sort_snps_dataframe(pd.concat([base_snps, unique])) + return self._inject_build_markers(df, list(REFERENCE_SEQUENCE_CHROMS)) + + def _create_file2_dataframe( + self, base_snps: pd.DataFrame, unique_count: int + ) -> pd.DataFrame: + """Create file 2 DataFrame (FTDNA format) with discrepancies. + + Note: RSIDs are NOT renumbered here to preserve RSID correspondence + with file1 for merge functionality. Unique SNPs use offset 20_000_001 + to clearly distinguish them from base SNPs and file1's unique SNPs. + + This file only includes autosomal chromosomes (1-22), no X, Y, or MT. + """ + # Only include autosomal chromosomes (1-22) for FTDNA format + autosomal_chroms = [str(i) for i in range(1, 23)] + + # Filter base SNPs to autosomal chromosomes only + file2_base = base_snps[base_snps["chrom"].isin(autosomal_chroms)].copy() + + # Create position discrepancies (27 SNPs) + pos_disc_indices = self._introduce_position_discrepancies(file2_base, 27) + + # Create genotype discrepancies (151 SNPs, excluding position-discrepant ones) + self._introduce_genotype_discrepancies( + file2_base, 151, exclude=pos_disc_indices + ) + + # Use large offset to clearly separate unique SNPs from base SNPs + # Generate unique SNPs only for autosomal chromosomes + unique = self._generate_unique_snps( + unique_count, 20_000_000, chromosomes=autosomal_chroms + ) + df = self._sort_snps_dataframe(pd.concat([file2_base, unique])) + return self._inject_build_markers(df, autosomal_chroms) + + def _generate_unique_snps( + self, count: int, rsid_offset: int, chromosomes: list[str] | None = None + ) -> pd.DataFrame: + """Generate unique SNPs with rsIDs starting after offset.""" + unique = self.generate_snps( + num_snps=count, inject_build_markers=False, chromosomes=chromosomes + ) + unique = unique.reset_index(drop=True) + unique.index = pd.Index( + [f"rs{rsid_offset + i + 1}" for i in range(len(unique))], name="rsid" + ) + return unique + + def _introduce_position_discrepancies( + self, df: pd.DataFrame, count: int + ) -> set[Any]: + """Introduce position discrepancies in a DataFrame. Returns affected indices.""" + indices = self.rng.choice(df.index, size=count, replace=False) + for idx in indices: + shift = int(self.rng.integers(-1000, 1000)) + chrom = str(df.loc[idx, "chrom"]) + max_pos = self.chrom_sizes[chrom] + new_pos = max(1, min(int(df.loc[idx, "pos"]) + shift, max_pos)) # type: ignore[arg-type] + df.loc[idx, "pos"] = np.uint32(new_pos) + return set(indices) + + def _introduce_genotype_discrepancies( + self, df: pd.DataFrame, count: int, exclude: set[Any] | None = None + ) -> None: + """Introduce genotype discrepancies in a DataFrame, optionally excluding indices.""" + valid_indices = [ + idx for idx in df.index if exclude is None or idx not in exclude + ] + indices = self.rng.choice(valid_indices, size=count, replace=False) + bases = ["A", "C", "G", "T"] + for idx in indices: + current = str(df.loc[idx, "genotype"]) + if current != "--": + new_allele = self.rng.choice([b for b in bases if b not in current]) + df.loc[idx, "genotype"] = new_allele + new_allele + + def _calculate_snps_per_chromosome( + self, num_snps: int, chromosomes: list[str] + ) -> dict[str, int]: + """Calculate proportional SNP distribution across chromosomes.""" + total_size = sum(self.chrom_sizes[c] for c in chromosomes) + return { + chrom: max(1, int(num_snps * self.chrom_sizes[chrom] / total_size)) + for chrom in chromosomes + } + + def _generate_unique_positions(self, n: int, chrom_size: int) -> NDArray[np.uint32]: + """Generate n unique positions within chromosome efficiently. + + Uses fast randint with deduplication for large chromosomes (where + collisions are rare) and np.random.choice for small chromosomes + (where collisions would be frequent). + """ + max_pos = chrom_size - 1 + + # For small chromosomes or high density, use choice (guaranteed unique) + # Threshold: if requesting more than 1% of positions, use choice + if n > max_pos * 0.01: + positions = self.rng.choice(max_pos, size=n, replace=False) + 1 + else: + # For large chromosomes, use randint with deduplication (faster) + # Generate extra to account for potential duplicates + oversample = int(n * 1.1) + 100 + positions = self.rng.integers(1, chrom_size, size=oversample) + positions = np.unique(positions) + + # If we don't have enough unique positions, sample more + while len(positions) < n: + extra = self.rng.integers(1, chrom_size, size=n - len(positions) + 100) + positions = np.unique(np.concatenate([positions, extra])) + + # Take exactly n positions + positions = self.rng.choice(positions, size=n, replace=False) + + return np.sort(positions).astype(np.uint32) + + def _generate_genotypes(self, n: int, missing_rate: float) -> NDArray[np.object_]: + """Generate n genotypes using vectorized operations.""" + bases = np.array(["A", "C", "G", "T"]) + + missing_mask = self.rng.random(n) < missing_rate + homo_mask = self.rng.random(n) < 0.7 + + homo_bases = self.rng.integers(0, 4, size=n) + het_base1 = self.rng.integers(0, 4, size=n) + het_base2 = (het_base1 + self.rng.integers(1, 4, size=n)) % 4 + + genotypes = np.empty(n, dtype=object) + genotypes[missing_mask] = "--" + + homo_idx = ~missing_mask & homo_mask + homo_alleles = bases[homo_bases[homo_idx]] + genotypes[homo_idx] = np.char.add(homo_alleles, homo_alleles) + + het_idx = ~missing_mask & ~homo_mask + allele1, allele2 = bases[het_base1[het_idx]], bases[het_base2[het_idx]] + genotypes[het_idx] = np.array( + ["".join(sorted([a1, a2])) for a1, a2 in zip(allele1, allele2)] + ) + + return genotypes + + def _inject_build_markers( + self, df: pd.DataFrame, chromosomes: list[str] + ) -> pd.DataFrame: + """Inject known marker SNPs at their correct positions for build detection.""" + marker_rows = [] + for rsid, marker_data in BUILD_MARKER_SNPS.items(): + chrom = marker_data["chrom"] + if chrom in chromosomes and self.build in marker_data: + marker_rows.append( + { + "rsid": rsid, + "chrom": chrom, + "pos": np.uint32(marker_data[self.build]), + "genotype": self._random_genotype(), + } + ) + + if marker_rows: + marker_df = pd.DataFrame(marker_rows).set_index("rsid") + df = df[~df.index.isin(marker_df.index)] + df = self._sort_snps_dataframe(pd.concat([df, marker_df])) + + return df + + def _renumber_rsids_sequentially(self, df: pd.DataFrame) -> pd.DataFrame: + """Renumber RSIDs sequentially while preserving build marker RSIDs. + + After sorting by chromosome and position, RSIDs are renumbered to be + sequential (rs1, rs2, rs3, ...) for a cleaner output. Build marker + RSIDs are preserved to ensure build detection continues to work. + """ + # Get the set of build marker rsids that must be preserved + marker_rsids = set(BUILD_MARKER_SNPS.keys()) + + # Create new index with sequential rsids, preserving markers + new_index = [] + counter = 1 + for rsid in df.index: + if rsid in marker_rsids: + new_index.append(rsid) + else: + new_index.append(f"rs{counter}") + counter += 1 + + df.index = pd.Index(new_index, name="rsid") + return df + + def _sort_snps_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: + """Sort SNPs dataframe by chromosome and position. + + Uses the same chromosome ordering as SNPs.sort(): numeric chromosomes + in natural order (1, 2, ..., 22), then X, Y, and MT at the end. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with 'chrom' and 'pos' columns + + Returns + ------- + pd.DataFrame + Sorted DataFrame with chrom as object dtype + """ + # Get unique chromosomes and determine sort order + unique_chroms = df["chrom"].unique() + + # Use REFERENCE_SEQUENCE_CHROMS order, filtering to only include present chroms + chrom_order = [c for c in REFERENCE_SEQUENCE_CHROMS if c in unique_chroms] + + # Add any chromosomes not in standard list (e.g., PAR) + for c in unique_chroms: + if c not in chrom_order: + chrom_order.append(c) + + # Convert to categorical for proper sorting + df["chrom"] = df["chrom"].astype( + CategoricalDtype(categories=chrom_order, ordered=True) + ) + + # Sort by chromosome and position + df = df.sort_values(["chrom", "pos"]) + + # Convert back to object dtype + df["chrom"] = df["chrom"].astype(object) + + return df + + def _random_genotype(self) -> str: + """Generate a single random genotype.""" + bases = ["A", "C", "G", "T"] + if self.rng.random() < 0.7: + base = self.rng.choice(bases) + return base + base + alleles = self.rng.choice(bases, size=2, replace=False) + return "".join(sorted(alleles)) + + # ------------------------------------------------------------------------- + # File Format Writers + # ------------------------------------------------------------------------- + + def _save_as( + self, + output_path: str, + writer_method: Callable[[pd.DataFrame, str], None], + num_snps: int, + **kwargs: Any, + ) -> str: + """Generate SNPs and save using the specified writer method.""" + df = self.generate_snps(num_snps=num_snps, **kwargs) + logger.info(f"Creating {os.path.relpath(output_path)}") + writer_method(df, output_path) + return output_path + + def save_as_23andme( + self, output_path: str, num_snps: int = 991786, **kwargs: Any + ) -> str: + """Save SNPs in 23andMe format.""" + return self._save_as( + output_path, self._write_snps_as_23andme, num_snps, **kwargs + ) + + def save_as_ancestry( + self, output_path: str, num_snps: int = 700000, **kwargs: Any + ) -> str: + """Save SNPs in AncestryDNA format.""" + return self._save_as( + output_path, self._write_snps_as_ancestry, num_snps, **kwargs + ) + + def save_as_ftdna( + self, output_path: str, num_snps: int = 715194, **kwargs: Any + ) -> str: + """Save SNPs in Family Tree DNA (FTDNA) format.""" + return self._save_as(output_path, self._write_snps_as_ftdna, num_snps, **kwargs) + + def save_as_generic( + self, + output_path: str, + format: str = "csv", + num_snps: int = 10000, + **kwargs: Any, + ) -> str: + """Save SNPs in generic CSV or TSV format.""" + df = self.generate_snps(num_snps=num_snps, **kwargs) + logger.info(f"Creating {os.path.relpath(output_path)}") + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + sep = "," if format == "csv" else "\t" + df_out = df.reset_index() + if output_path.endswith(".gz"): + with gzip.open(output_path, "wt") as f: + df_out.to_csv(f, sep=sep, index=False) + else: + df_out.to_csv(output_path, sep=sep, index=False) + return output_path + + def _write_snps_as_23andme(self, df: pd.DataFrame, output_path: str) -> None: + """Write a DataFrame of SNPs in 23andMe format.""" + header = ( + "# 23andMe\n#\n" + "# This is synthetic genotype data generated for demonstration purposes.\n#\n" + "# rsid\tchromosome\tposition\tgenotype\n" + ) + buffer = StringIO() + buffer.write(header) + df.reset_index().to_csv(buffer, sep="\t", header=False, index=False) + self._write_file(output_path, buffer.getvalue()) + + def _write_snps_as_ftdna(self, df: pd.DataFrame, output_path: str) -> None: + """Write a DataFrame of SNPs in FTDNA format.""" + df_out = df.reset_index() + df_out.columns = ["RSID", "CHROMOSOME", "POSITION", "RESULT"] + buffer = StringIO() + buffer.write("RSID,CHROMOSOME,POSITION,RESULT\n") + df_out.to_csv(buffer, index=False, header=False, quoting=1) + self._write_file(output_path, buffer.getvalue()) + + def _write_snps_as_ancestry(self, df: pd.DataFrame, output_path: str) -> None: + """Write a DataFrame of SNPs in AncestryDNA format.""" + header = ( + "#AncestryDNA\n#\n" + "# This is synthetic genotype data generated for demonstration purposes.\n#\n" + "rsid\tchromosome\tposition\tallele1\tallele2\n" + ) + df_out = df.reset_index() + genotypes = df_out["genotype"].values + + allele1 = np.where( + genotypes == "--", + "0", + np.array([g[0] if len(g) >= 1 else "0" for g in genotypes]), + ) + allele2 = np.where( + genotypes == "--", + "0", + np.array([g[1] if len(g) >= 2 else "0" for g in genotypes]), + ) + + result = pd.DataFrame( + { + "rsid": df_out["rsid"], + "chrom": df_out["chrom"], + "pos": df_out["pos"], + "allele1": allele1, + "allele2": allele2, + } + ) + buffer = StringIO() + buffer.write(header) + result.to_csv(buffer, sep="\t", header=False, index=False) + self._write_file(output_path, buffer.getvalue()) + + def _write_file(self, path: str, content: str) -> None: + """Write content to file atomically, handling gzip compression if needed.""" + os.makedirs(os.path.dirname(path) or ".", exist_ok=True) + if path.endswith(".gz"): + with atomic_write(path, mode="wb", overwrite=True) as f: + with gzip.open(f, "wt", compresslevel=1) as f_gzip: + f_gzip.write(content) + else: + with atomic_write(path, mode="w", overwrite=True) as f: + f.write(content) diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index 59562ba3..ddb5464b 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -14,6 +14,8 @@ import numpy as np import pandas as pd +from snps.build_constants import CHROM_SIZES + logger = logging.getLogger(__name__) NORMALIZED_DTYPES = { @@ -265,10 +267,13 @@ def _detect_build_from_comments(self, comments, source): elif "38" in part_value: return 38 elif part_key.lower() == "length": - if "249250621" == part_value: - return 37 # length of chromosome 1 - elif "248956422" == part_value: - return 38 # length of chromosome 1 + # Check chromosome 1 length against known builds + if part_value == str(CHROM_SIZES[37]["1"]): + return 37 + elif part_value == str(CHROM_SIZES[38]["1"]): + return 38 + elif part_value == str(CHROM_SIZES[36]["1"]): + return 36 # couldn't find anything return 0 else: @@ -292,10 +297,13 @@ def _detect_build_from_comments(self, comments, source): return 38 elif "grch37" in comments.lower(): return 37 - elif "249250621" in comments.lower(): - return 37 # length of chromosome 1 - elif "248956422" in comments.lower(): - return 38 # length of chromosome 1 + # Check for chromosome 1 length in comments + elif str(CHROM_SIZES[37]["1"]) in comments.lower(): + return 37 + elif str(CHROM_SIZES[38]["1"]) in comments.lower(): + return 38 + elif str(CHROM_SIZES[36]["1"]) in comments.lower(): + return 36 else: return 0 diff --git a/src/snps/resources.py b/src/snps/resources.py index c483a1e9..e7e17fc3 100644 --- a/src/snps/resources.py +++ b/src/snps/resources.py @@ -24,7 +24,6 @@ import tempfile import urllib.error import urllib.request -import zipfile import numpy as np import pandas as pd @@ -57,7 +56,6 @@ def _init_resource_attributes(self): self._gsa_rsid_map = None self._gsa_chrpos_map = None self._dbsnp_151_37_reverse = None - self._opensnp_datadump_filenames = [] self._chip_clusters = None self._low_quality_snps = None @@ -125,48 +123,52 @@ def get_assembly_mapping_data(self, source_assembly, target_assembly): self._get_path_assembly_mapping_data(source_assembly, target_assembly) ) - def download_example_datasets(self): - """Download example datasets from `openSNP `_. + def create_example_datasets(self, output_dir=None): + """Create synthetic example datasets for demonstrations. - Per openSNP, "the data is donated into the public domain using `CC0 1.0 - `_." + Generates two correlated genotype files in different formats and builds, + suitable for demonstrating merging and remapping functionality. The files + share ~700K common SNPs with intentional discrepancies to demonstrate + merge conflict detection. + + Parameters + ---------- + output_dir : str, optional + Directory for output files (default: resources directory) Returns ------- - paths : list of str or empty str - paths to example datasets + paths : list of str + Paths to created example datasets - References - ---------- - 1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource - for Personal Genomics," PLOS ONE, 9(3): e89204, - https://doi.org/10.1371/journal.pone.0089204 + Examples + -------- + >>> from snps.resources import Resources + >>> r = Resources() + >>> paths = r.create_example_datasets() + Creating resources/sample1.23andme.txt.gz + Creating resources/sample2.ftdna.csv.gz """ - paths = [] - paths.append( - self._download_file( - "https://opensnp.org/data/662.23andme.340", - "662.23andme.340.txt.gz", - compress=True, - ) - ) - paths.append( - self._download_file( - "https://opensnp.org/data/662.ftdna-illumina.341", - "662.ftdna-illumina.341.csv.gz", - compress=True, - ) - ) + from snps.io.generator import SyntheticSNPGenerator + + if output_dir is None: + output_dir = self._resources_dir - return paths + # Ensure output directory exists + os.makedirs(output_dir, exist_ok=True) + + # Create correlated dataset pair with realistic merge characteristics + gen = SyntheticSNPGenerator(build=37, seed=47) + path1, path2 = gen.create_example_dataset_pair(output_dir) + + return [path1, path2] def get_all_resources(self): """Get / download all resources used throughout `snps`. Notes ----- - This function does not download reference sequences and the openSNP datadump, - due to their large sizes. + This function does not download reference sequences due to their large sizes. Returns ------- @@ -358,84 +360,6 @@ def get_dbsnp_151_37_reverse(self): return self._dbsnp_151_37_reverse - def get_opensnp_datadump_filenames(self): - """Get filenames internal to the `openSNP `_ datadump zip. - - Per openSNP, "the data is donated into the public domain using `CC0 1.0 - `_." - - Notes - ----- - This function can download over 27GB of data. If the download is not successful, - try using a different tool like `wget` or `curl` to download the file and move it - to the resources directory (see `_get_path_opensnp_datadump`). - - Returns - ------- - filenames : list of str - filenames internal to the openSNP datadump - - References - ---------- - 1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource - for Personal Genomics," PLOS ONE, 9(3): e89204, - https://doi.org/10.1371/journal.pone.0089204 - """ - if not self._opensnp_datadump_filenames: - self._opensnp_datadump_filenames = self._get_opensnp_datadump_filenames( - self._get_path_opensnp_datadump() - ) - return self._opensnp_datadump_filenames - - def load_opensnp_datadump_file(self, filename): - """Load the specified file from the openSNP datadump. - - Per openSNP, "the data is donated into the public domain using `CC0 1.0 - `_." - - Parameters - ---------- - filename : str - filename internal to the openSNP datadump - - Returns - ------- - bytes - content of specified file internal to the openSNP datadump - - References - ---------- - 1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource - for Personal Genomics," PLOS ONE, 9(3): e89204, - https://doi.org/10.1371/journal.pone.0089204 - """ - if self._get_path_opensnp_datadump(): - with zipfile.ZipFile(self._get_path_opensnp_datadump()) as z: - with z.open(filename, "r") as f: - return f.read() - else: - return bytes() - - @staticmethod - def _get_opensnp_datadump_filenames(filename): - """Get list of filenames internal to the openSNP datadump zip. - - Parameters - ---------- - filename : str - path to openSNP datadump - - Returns - ------- - filenames : list of str - filenames internal to the openSNP datadump - """ - if filename: - with zipfile.ZipFile(filename) as z: - return z.namelist() - else: - return [] - @staticmethod def _write_data_to_gzip(f, data): """Write `data` to `f` in `gzip` format. @@ -728,12 +652,6 @@ def get_gsa_chrpos(self): self._gsa_chrpos_map = chrpos return self._gsa_chrpos_map - def _get_path_opensnp_datadump(self): - return self._download_file( - "https://opensnp.org/data/zip/opensnp_datadump.current.zip", - "opensnp_datadump.current.zip", - ) - def _download_file(self, url, filename, compress=False, timeout=30): """Download a file to the resources folder. diff --git a/src/snps/snps.py b/src/snps/snps.py index 894658de..b10fab88 100644 --- a/src/snps/snps.py +++ b/src/snps/snps.py @@ -11,6 +11,7 @@ import pandas as pd from pandas.api.types import CategoricalDtype +from snps.build_constants import BUILD_MARKER_SNPS from snps.ensembl import EnsemblRestClient from snps.io import Reader, Writer, get_empty_snps_dataframe from snps.resources import Resources @@ -943,67 +944,14 @@ def detect_build(self): rs11928389, rs2500347, rs964481, rs2341354, rs3850290, and rs1329546 (dbSNP Build ID: 151). Available from: http://www.ncbi.nlm.nih.gov/SNP/ """ - - def lookup_build_with_snp_pos(pos, s): - try: - return int(s.loc[s == pos].index[0]) - except Exception: - return 0 - - build = 0 - - rsids = [ - "rs3094315", - "rs11928389", - "rs2500347", - "rs964481", - "rs2341354", - "rs3850290", - "rs1329546", - ] - df = pd.DataFrame( - { - 36: [ - 742429, - 50908372, - 143649677, - 27566744, - 908436, - 22315141, - 135302086, - ], - 37: [ - 752566, - 50927009, - 144938320, - 27656823, - 918573, - 23245301, - 135474420, - ], - 38: [ - 817186, - 50889578, - 148946169, - 27638706, - 983193, - 22776092, - 136392261, - ], - }, - index=rsids, - ) - - for rsid in rsids: + # Check each marker SNP until we find a matching build + for rsid, marker_data in BUILD_MARKER_SNPS.items(): if rsid in self._snps.index: - build = lookup_build_with_snp_pos( - self._snps.loc[rsid].pos, df.loc[rsid] - ) - - if build: - break - - return build + pos = self._snps.loc[rsid].pos + for build in (37, 38, 36): # Check most common builds first + if marker_data[build] == pos: + return build + return 0 def get_count(self, chrom=""): """Count of SNPs. diff --git a/tests/io/test_generator.py b/tests/io/test_generator.py new file mode 100644 index 00000000..6856dd3c --- /dev/null +++ b/tests/io/test_generator.py @@ -0,0 +1,480 @@ +"""Test synthetic SNP data generator.""" + +from __future__ import annotations + +import gzip +import os +import tempfile +from unittest import TestCase + +import numpy as np +import pandas as pd + +from snps import SNPs +from snps.build_constants import BUILD_MARKER_SNPS +from snps.io.generator import SyntheticSNPGenerator + + +class TestSyntheticSNPGenerator(TestCase): + def test_init_build_37(self) -> None: + """Test generator initialization with Build 37.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + self.assertEqual(gen.build, 37) + self.assertEqual(gen.seed, 123) + self.assertIsNotNone(gen.chrom_sizes) + + def test_init_build_36(self) -> None: + """Test generator initialization with Build 36.""" + gen = SyntheticSNPGenerator(build=36, seed=456) + self.assertEqual(gen.build, 36) + self.assertEqual(gen.seed, 456) + + def test_init_build_38(self) -> None: + """Test generator initialization with Build 38.""" + gen = SyntheticSNPGenerator(build=38, seed=789) + self.assertEqual(gen.build, 38) + self.assertEqual(gen.seed, 789) + + def test_init_invalid_build(self) -> None: + """Test generator initialization with invalid build.""" + with self.assertRaises(ValueError): + SyntheticSNPGenerator(build=99) + + def test_generate_snps_empty_chromosomes(self) -> None: + """Test SNP generation with empty chromosomes list.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + df = gen.generate_snps(num_snps=1000, chromosomes=[]) + + # Verify empty DataFrame with correct structure + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 0) + self.assertEqual(df.index.name, "rsid") + self.assertListEqual(list(df.columns), ["chrom", "pos", "genotype"]) + + # Verify dtypes + self.assertEqual(df["chrom"].dtype, object) + self.assertEqual(df["pos"].dtype, np.uint32) + self.assertEqual(df["genotype"].dtype, object) + + def test_generate_snps_basic(self) -> None: + """Test basic SNP generation.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + df = gen.generate_snps(num_snps=1000) + + # Verify DataFrame structure + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(df.index.name, "rsid") + self.assertListEqual(list(df.columns), ["chrom", "pos", "genotype"]) + + # Verify dtypes + self.assertEqual(df["chrom"].dtype, object) + self.assertEqual(df["pos"].dtype, np.uint32) + self.assertEqual(df["genotype"].dtype, object) + + # Verify approximate SNP count (may vary due to distribution) + self.assertGreater(len(df), 900) + self.assertLess(len(df), 1100) + + def test_generate_snps_chromosomes(self) -> None: + """Test SNP generation with specific chromosomes.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + df = gen.generate_snps(num_snps=500, chromosomes=["1", "2", "X"]) + + # Verify only specified chromosomes are present + unique_chroms = set(df["chrom"].unique()) + self.assertTrue(unique_chroms.issubset({"1", "2", "X"})) + + def test_generate_snps_missing_rate(self) -> None: + """Test SNP generation with missing genotypes.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + df = gen.generate_snps(num_snps=10000, missing_rate=0.1) + + # Count missing genotypes + missing_count = (df["genotype"] == "--").sum() + total_count = len(df) + + # Verify approximately 10% are missing (with some tolerance) + missing_rate = missing_count / total_count + self.assertGreater(missing_rate, 0.05) # At least 5% + self.assertLess(missing_rate, 0.15) # At most 15% + + def test_generate_snps_reproducibility(self) -> None: + """Test that same seed produces same results.""" + gen1 = SyntheticSNPGenerator(build=37, seed=42) + gen2 = SyntheticSNPGenerator(build=37, seed=42) + + df1 = gen1.generate_snps(num_snps=1000) + df2 = gen2.generate_snps(num_snps=1000) + + # Verify identical output + pd.testing.assert_frame_equal(df1, df2) + + def test_generate_snps_rsid_format(self) -> None: + """Test that rsIDs are correctly formatted.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + df = gen.generate_snps(num_snps=100) + + # Verify all rsIDs start with "rs" followed by digits + for rsid in df.index: + self.assertTrue(rsid.startswith("rs")) + self.assertTrue(rsid[2:].isdigit()) + + def test_generate_snps_positions_sorted(self) -> None: + """Test that positions within each chromosome are sorted.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + df = gen.generate_snps(num_snps=5000) + + # Check each chromosome separately + for chrom in df["chrom"].unique(): + chrom_df = df[df["chrom"] == chrom] + positions = chrom_df["pos"].values + # Verify positions are sorted + self.assertTrue(np.all(positions[:-1] <= positions[1:])) + + def test_generate_snps_genotype_format(self) -> None: + """Test that genotypes are correctly formatted.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + df = gen.generate_snps(num_snps=1000, missing_rate=0.05) + + # Valid bases + valid_bases = {"A", "C", "G", "T", "-"} + + # Verify all genotypes are valid + for genotype in df["genotype"].unique(): + if pd.notna(genotype): + # Should be either 2 characters or "--" + self.assertIn(len(genotype), [2]) + for base in genotype: + self.assertIn(base, valid_bases) + + def test_save_as_23andme(self) -> None: + """Test saving SNPs in 23andMe format.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test_23andme.txt.gz") + result_path = gen.save_as_23andme(output_path, num_snps=10000) + + # Verify file was created + self.assertEqual(result_path, output_path) + self.assertTrue(os.path.exists(output_path)) + + # Verify file can be read + with gzip.open(output_path, "rt") as f: + lines = f.readlines() + + # Verify header + self.assertTrue(lines[0].startswith("# 23andMe")) + + # Verify format (tab-separated) + data_lines = [line for line in lines if not line.startswith("#")] + self.assertGreater(len(data_lines), 9000) # Should have ~10000 SNPs + first_data = data_lines[0].strip().split("\t") + self.assertEqual(len(first_data), 4) # rsid, chrom, pos, genotype + + # Verify SNPs can load the file + s = SNPs(output_path) + self.assertEqual(s.source, "23andMe") + # Build detection should work with injected marker SNPs + self.assertEqual(s.build, 37) + self.assertTrue(s.build_detected) + + def test_save_as_ancestry(self) -> None: + """Test saving SNPs in AncestryDNA format.""" + gen = SyntheticSNPGenerator(build=37, seed=456) + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test_ancestry.txt.gz") + result_path = gen.save_as_ancestry(output_path, num_snps=5000) + + # Verify file was created + self.assertEqual(result_path, output_path) + self.assertTrue(os.path.exists(output_path)) + + # Verify file can be read + with gzip.open(output_path, "rt") as f: + lines = f.readlines() + + # Verify header + self.assertTrue(lines[0].startswith("#AncestryDNA")) + + # Verify format (allele1 and allele2 columns) + header_line = [line for line in lines if line.startswith("rsid")][0] + self.assertIn("allele1", header_line) + self.assertIn("allele2", header_line) + + # Verify SNPs can load the file + s = SNPs(output_path) + self.assertEqual(s.source, "AncestryDNA") + + def test_save_as_ftdna(self) -> None: + """Test saving SNPs in FTDNA format.""" + gen = SyntheticSNPGenerator(build=36, seed=789) + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test_ftdna.csv.gz") + result_path = gen.save_as_ftdna(output_path, num_snps=7000) + + # Verify file was created + self.assertEqual(result_path, output_path) + self.assertTrue(os.path.exists(output_path)) + + # Verify file can be read + with gzip.open(output_path, "rt") as f: + lines = f.readlines() + + # Verify CSV format with quotes + header = lines[0].strip() + self.assertEqual(header, "RSID,CHROMOSOME,POSITION,RESULT") + + # Verify data line format + self.assertTrue(lines[1].startswith('"rs')) + + # Verify SNPs can load the file + s = SNPs(output_path) + self.assertEqual(s.source, "FTDNA") + # Build detection should work with injected marker SNPs + self.assertEqual(s.build, 36) + self.assertTrue(s.build_detected) + + def test_save_as_generic_csv(self) -> None: + """Test saving SNPs in generic CSV format.""" + gen = SyntheticSNPGenerator(build=37, seed=202) + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test.csv.gz") + result_path = gen.save_as_generic(output_path, format="csv", num_snps=2000) + + # Verify file was created + self.assertEqual(result_path, output_path) + self.assertTrue(os.path.exists(output_path)) + + # Verify file can be read + with gzip.open(output_path, "rt") as f: + lines = f.readlines() + + # Verify CSV header + header = lines[0].strip() + self.assertEqual(header, "rsid,chrom,pos,genotype") + + # Verify data format (comma-separated with 4 fields) + data_lines = [line.strip() for line in lines[1:] if line.strip()] + self.assertGreater(len(data_lines), 0, "No data lines found") + self.assertEqual(len(data_lines[0].split(",")), 4) + + # Verify SNPs can load the file + s = SNPs(output_path) + self.assertGreater(s.count, 1900) + + def test_save_as_generic_tsv(self) -> None: + """Test saving SNPs in generic TSV format.""" + gen = SyntheticSNPGenerator(build=37, seed=303) + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test.tsv") + result_path = gen.save_as_generic(output_path, format="tsv", num_snps=1500) + + # Verify file was created (not gzipped) + self.assertEqual(result_path, output_path) + self.assertTrue(os.path.exists(output_path)) + + # Verify file can be read + with open(output_path, "r") as f: + lines = f.readlines() + + # Verify TSV header + header = lines[0].strip() + self.assertEqual(header, "rsid\tchrom\tpos\tgenotype") + + # Verify data format (tab-separated with 4 fields) + data_lines = [line.strip() for line in lines[1:] if line.strip()] + self.assertGreater(len(data_lines), 0, "No data lines found") + self.assertEqual(len(data_lines[0].split("\t")), 4) + + def test_save_creates_directory(self) -> None: + """Test that save methods create output directory if needed.""" + gen = SyntheticSNPGenerator(build=37, seed=404) + + with tempfile.TemporaryDirectory() as tmpdir: + # Create a nested path that doesn't exist + output_path = os.path.join(tmpdir, "subdir1", "subdir2", "test.txt.gz") + gen.save_as_23andme(output_path, num_snps=1000) + + # Verify file was created + self.assertTrue(os.path.exists(output_path)) + + def test_large_dataset(self) -> None: + """Test generating a large dataset.""" + gen = SyntheticSNPGenerator(build=37, seed=505) + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "large.txt.gz") + gen.save_as_23andme(output_path, num_snps=1000000) + + # Verify file was created + self.assertTrue(os.path.exists(output_path)) + + # Verify SNPs can load it + s = SNPs(output_path) + self.assertGreater(s.count, 950000) # Should be close to 1M + self.assertLess(s.count, 1050000) + + def test_different_builds_have_different_chromosome_sizes(self) -> None: + """Test that different builds use different chromosome size ranges.""" + gen36 = SyntheticSNPGenerator(build=36) + gen37 = SyntheticSNPGenerator(build=37) + gen38 = SyntheticSNPGenerator(build=38) + + # Verify different chromosome sizes are used + self.assertNotEqual(gen36.chrom_sizes["1"], gen37.chrom_sizes["1"]) + self.assertNotEqual(gen37.chrom_sizes["1"], gen38.chrom_sizes["1"]) + self.assertNotEqual(gen36.chrom_sizes["1"], gen38.chrom_sizes["1"]) + + def test_build_marker_injection(self) -> None: + """Test that build marker SNPs are injected correctly.""" + # Test Build 37 + gen37 = SyntheticSNPGenerator(build=37, seed=123) + df37 = gen37.generate_snps(num_snps=10000, inject_build_markers=True) + + # Check that at least one marker SNP was injected + found_markers = [rsid for rsid in BUILD_MARKER_SNPS if rsid in df37.index] + self.assertGreater(len(found_markers), 0) + + # Verify marker positions are correct for Build 37 + if "rs3094315" in df37.index: + self.assertEqual(df37.loc["rs3094315"]["pos"], 752566) + self.assertEqual(df37.loc["rs3094315"]["chrom"], "1") + + # Test Build 36 + gen36 = SyntheticSNPGenerator(build=36, seed=456) + df36 = gen36.generate_snps(num_snps=10000, inject_build_markers=True) + + # Verify marker position is different for Build 36 + if "rs3094315" in df36.index: + self.assertEqual(df36.loc["rs3094315"]["pos"], 742429) + self.assertEqual(df36.loc["rs3094315"]["chrom"], "1") + + def test_build_detection_with_markers(self) -> None: + """Test that SNPs with injected markers can be correctly build-detected.""" + gen37 = SyntheticSNPGenerator(build=37, seed=789) + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test_build37.txt.gz") + gen37.save_as_23andme(output_path, num_snps=50000) + + # Load and verify build detection works + s = SNPs(output_path) + self.assertEqual(s.build, 37) + self.assertTrue(s.build_detected) + + def test_generate_without_markers(self) -> None: + """Test generating SNPs without build markers.""" + gen = SyntheticSNPGenerator(build=37, seed=999) + df = gen.generate_snps(num_snps=1000, inject_build_markers=False) + + # Verify no marker SNPs are present + found_markers = [rsid for rsid in BUILD_MARKER_SNPS if rsid in df.index] + self.assertEqual(len(found_markers), 0) + + def test_inject_build_markers_no_matching_chromosomes(self) -> None: + """Test _inject_build_markers when chromosomes have no marker SNPs. + + Marker SNPs are on chromosomes 1, 2, and 20. Using chromosomes + without markers should skip marker injection (empty marker_rows branch). + """ + gen = SyntheticSNPGenerator(build=37, seed=123) + + # Generate SNPs only on chromosomes that don't have marker SNPs + # Marker SNPs are on chroms: 1, 2, 20 + df = gen.generate_snps( + num_snps=500, chromosomes=["MT", "X", "Y"], inject_build_markers=True + ) + + # Verify DataFrame was created with correct structure + self.assertIsInstance(df, pd.DataFrame) + self.assertGreater(len(df), 0) + self.assertEqual(df.index.name, "rsid") + + # Verify only the specified chromosomes are present + unique_chroms = set(df["chrom"].unique()) + self.assertTrue(unique_chroms.issubset({"MT", "X", "Y"})) + + # Verify no marker SNPs were injected (since they're on other chromosomes) + found_markers = [rsid for rsid in BUILD_MARKER_SNPS if rsid in df.index] + self.assertEqual(len(found_markers), 0) + + def test_create_example_dataset_pair(self) -> None: + """Test creating a pair of example datasets for merge demonstration.""" + gen = SyntheticSNPGenerator(build=37, seed=47) + + with tempfile.TemporaryDirectory() as tmpdir: + path1, path2 = gen.create_example_dataset_pair(tmpdir) + + # Verify files were created + self.assertTrue(os.path.exists(path1)) + self.assertTrue(os.path.exists(path2)) + self.assertTrue(path1.endswith("sample1.23andme.txt.gz")) + self.assertTrue(path2.endswith("sample2.ftdna.csv.gz")) + + # Load both files + s1 = SNPs(path1) + s2 = SNPs(path2) + + # Verify sources + self.assertEqual(s1.source, "23andMe") + self.assertEqual(s2.source, "FTDNA") + + # Verify builds + self.assertEqual(s1.build, 37) + self.assertEqual(s2.build, 37) + + # Verify file 1 includes all chromosomes (1-22, X, Y, MT) + s1_chroms = set(s1.snps["chrom"].unique()) + self.assertIn("X", s1_chroms) + self.assertIn("Y", s1_chroms) + self.assertIn("MT", s1_chroms) + + # Verify file 2 only has autosomal chromosomes (1-22) + s2_chroms = set(s2.snps["chrom"].unique()) + self.assertNotIn("X", s2_chroms) + self.assertNotIn("Y", s2_chroms) + self.assertNotIn("MT", s2_chroms) + + # Verify approximate SNP counts + # File 1: ~700K base + ~292K unique = ~992K + self.assertGreater(s1.count, 950000) + self.assertLess(s1.count, 1050000) + + # File 2: autosomal base SNPs + ~15K unique (no X, Y, MT) + self.assertGreater(s2.count, 600000) + self.assertLess(s2.count, 750000) + + # Verify shared RSIDs exist (base SNPs rs1-rs700000) + common_rsids = set(s1.snps.index) & set(s2.snps.index) + self.assertGreater(len(common_rsids), 600000) + + def test_unique_positions_small_chromosome(self) -> None: + """Test that positions are unique even on small chromosomes like MT.""" + gen = SyntheticSNPGenerator(build=37, seed=42) + + # MT chromosome is only ~16,569 bp, generate many SNPs + df = gen.generate_snps(num_snps=10000, chromosomes=["MT"]) + + # Check for duplicate positions within chromosome + duplicates = df.duplicated(subset=["chrom", "pos"], keep=False) + self.assertEqual(duplicates.sum(), 0, "Found duplicate positions on MT") + + # Verify all positions are within valid range + self.assertTrue((df["pos"] >= 1).all()) + self.assertTrue((df["pos"] <= gen.chrom_sizes["MT"]).all()) + + def test_unique_positions_large_dataset(self) -> None: + """Test that positions are unique in large datasets.""" + gen = SyntheticSNPGenerator(build=37, seed=123) + + # Generate 100K SNPs across all chromosomes + df = gen.generate_snps(num_snps=100000) + + # Check for duplicate positions within each chromosome + duplicates = df.duplicated(subset=["chrom", "pos"], keep=False) + self.assertEqual(duplicates.sum(), 0, "Found duplicate positions") diff --git a/tests/test_resources.py b/tests/test_resources.py index 9ebecaf9..9e169082 100644 --- a/tests/test_resources.py +++ b/tests/test_resources.py @@ -3,13 +3,9 @@ import socket import tempfile import urllib.error -import warnings -import zipfile from unittest.mock import Mock, mock_open, patch import numpy as np -import pandas as pd -from atomicwrites import atomic_write from snps import SNPs from snps.resources import ReferenceSequence, Resources @@ -122,20 +118,6 @@ def f(): for k, v in resources.items(): self.assertGreater(len(v), 0) - def test_download_example_datasets(self): - def f(): - with patch("urllib.request.urlopen", mock_open(read_data=b"")): - return self.resource.download_example_datasets() - - paths = ( - self.resource.download_example_datasets() if self.downloads_enabled else f() - ) - - for path in paths: - if not path or not os.path.exists(path): - warnings.warn("Example dataset(s) not currently available") - return - def test_get_paths_reference_sequences_invalid_assembly(self): assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences( assembly="36" @@ -415,53 +397,50 @@ def test_reference_sequence_generic_load_sequence(self): self.assertEqual(seq.end, 117) self.assertEqual(seq.length, 117) - def test_get_opensnp_datadump_filenames(self): - with tempfile.TemporaryDirectory() as tmpdir: - # temporarily set resources dir to tests - self.resource._resources_dir = tmpdir - - # write test openSNP datadump zip - with atomic_write( - os.path.join(tmpdir, "opensnp_datadump.current.zip"), - mode="wb", - overwrite=True, - ) as f: - with zipfile.ZipFile(f, "w") as f_zip: - f_zip.write("tests/input/generic.csv", arcname="generic1.csv") - f_zip.write("tests/input/generic.csv", arcname="generic2.csv") - - filenames = self.resource.get_opensnp_datadump_filenames() - - self.assertListEqual(filenames, ["generic1.csv", "generic2.csv"]) - - self.resource._resources_dir = "resources" - - def test_load_opensnp_datadump_file(self): + def test_create_example_datasets(self): + """Test creating synthetic example datasets.""" with tempfile.TemporaryDirectory() as tmpdir: - # temporarily set resources dir to tests - self.resource._resources_dir = tmpdir - - # write test openSNP datadump zip - with atomic_write( - os.path.join(tmpdir, "opensnp_datadump.current.zip"), - mode="wb", - overwrite=True, - ) as f: - with zipfile.ZipFile(f, "w") as f_zip: - f_zip.write("tests/input/generic.csv", arcname="generic1.csv") - f_zip.write("tests/input/generic.csv", arcname="generic2.csv") - - snps1 = SNPs(self.resource.load_opensnp_datadump_file("generic1.csv")) - snps2 = SNPs(self.resource.load_opensnp_datadump_file("generic2.csv")) - - pd.testing.assert_frame_equal( - snps1.snps, self.generic_snps(), check_exact=True - ) - pd.testing.assert_frame_equal( - snps2.snps, self.generic_snps(), check_exact=True - ) - - self.resource._resources_dir = "resources" + paths = self.resource.create_example_datasets(tmpdir) + + # Verify two files were created + self.assertEqual(len(paths), 2) + self.assertTrue(os.path.exists(paths[0])) + self.assertTrue(os.path.exists(paths[1])) + + # Verify filenames + self.assertTrue(paths[0].endswith("sample1.23andme.txt.gz")) + self.assertTrue(paths[1].endswith("sample2.ftdna.csv.gz")) + + # Verify files can be loaded + s1 = SNPs(paths[0]) + s2 = SNPs(paths[1]) + + # Verify file sources are detected correctly + self.assertEqual(s1.source, "23andMe") + self.assertEqual(s2.source, "FTDNA") + + # Build detection should work with injected marker SNPs + self.assertEqual(s1.build, 37) + self.assertTrue(s1.build_detected) + self.assertEqual(s2.build, 37) + self.assertTrue(s2.build_detected) + + # Verify SNP counts are approximately correct + self.assertGreater(s1.count, 900000) + # FTDNA file only contains autosomal chromosomes (1-22), so count is lower + self.assertGreater(s2.count, 650000) + + # Verify 23andMe file includes all chromosomes + s1_chroms = set(s1.snps["chrom"].unique()) + self.assertIn("X", s1_chroms) + self.assertIn("Y", s1_chroms) + self.assertIn("MT", s1_chroms) + + # Verify FTDNA file only includes autosomal chromosomes (1-22) + s2_chroms = set(s2.snps["chrom"].unique()) + self.assertNotIn("X", s2_chroms) + self.assertNotIn("Y", s2_chroms) + self.assertNotIn("MT", s2_chroms) def _generate_test_chip_clusters(self): s = "1:1\tc1\n" * 2135214 From 779bb1051af3efb39e42503eddcbd70ec341e26f Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Thu, 22 Jan 2026 22:22:41 -0800 Subject: [PATCH 2/2] Support modern Python versions --- .github/workflows/ci.yml | 12 ++-- README.rst | 2 +- setup.py | 5 +- tests/__init__.py | 93 ++++++++++++++++++++++-- tests/io/test_generator.py | 6 +- tests/io/test_writer.py | 3 +- tests/test_snps.py | 140 +++++++++++++++++++++++-------------- 7 files changed, 189 insertions(+), 72 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7120e898..5ce8af4c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,7 +32,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: '3.14' - name: Install Ruff run: | pip install ruff @@ -50,7 +50,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: '3.14' - name: Install Sphinx run: | pip install -r docs/requirements.txt @@ -65,12 +65,12 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14'] include: - os: macos-latest - python-version: '3.12' + python-version: '3.14' - os: windows-latest - python-version: '3.12' + python-version: '3.14' env: JOB_ID: ${{ strategy.job-index }} @@ -127,7 +127,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v4 diff --git a/README.rst b/README.rst index a7831a20..dfff8ae1 100644 --- a/README.rst +++ b/README.rst @@ -69,7 +69,7 @@ Additionally, ``snps`` can read a variety of "generic" CSV and TSV files. Dependencies ------------ -``snps`` requires `Python `_ 3.8+ and the following Python +``snps`` requires `Python `_ 3.9+ and the following Python packages: - `numpy `_ diff --git a/setup.py b/setup.py index e9746617..de6ae323 100644 --- a/setup.py +++ b/setup.py @@ -63,11 +63,12 @@ "License :: OSI Approved :: BSD License", "Natural Language :: English", "Operating System :: OS Independent", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Scientific/Engineering :: Information Analysis", @@ -81,6 +82,6 @@ keywords="snps dna chromosomes bioinformatics vcf", install_requires=["numpy", "pandas", "atomicwrites"], extras_require={"ezancestry": ["ezancestry"]}, - python_requires=">=3.8", + python_requires=">=3.9", platforms=["any"], ) diff --git a/tests/__init__.py b/tests/__init__.py index 375268ce..d7d1a0aa 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd -from pandas.api.types import is_object_dtype, is_unsigned_integer_dtype +from pandas.api.types import is_object_dtype, is_string_dtype, is_unsigned_integer_dtype from snps import SNPs from snps.utils import gzip_file, zip_file @@ -646,12 +646,89 @@ def run_parsing_tests_vcf( snps_df, ) + def assert_series_equal_with_string_dtype(self, left, right, **kwargs): + """Assert Series are equal, accepting both object and StringDtype for string data. + + In Python 3.14+, pandas infers StringDtype for string data instead of object. + This wrapper compares Series without strict dtype matching for string data. + + Parameters + ---------- + left : pd.Series + First Series to compare + right : pd.Series + Second Series to compare + **kwargs : dict + Additional arguments passed to pd.testing.assert_series_equal + """ + # Verify string series have string or object dtypes + if is_string_dtype(left.dtype) or is_object_dtype(left.dtype): + self.assertTrue( + is_string_dtype(right.dtype) or is_object_dtype(right.dtype), + f"Right series dtype {right.dtype} should be string/object type", + ) + # Compare Series without strict dtype matching + pd.testing.assert_series_equal(left, right, check_dtype=False, **kwargs) + + def assert_frame_equal_with_string_index(self, left, right, **kwargs): + """Assert DataFrames are equal, accepting both object and StringDtype for string columns. + + In Python 3.14+, pandas infers StringDtype for string columns/indices instead of object. + This wrapper validates that string columns have string types, then compares the + DataFrames without strict dtype matching for object/string columns. + + Parameters + ---------- + left : pd.DataFrame + First DataFrame to compare + right : pd.DataFrame + Second DataFrame to compare + **kwargs : dict + Additional arguments passed to pd.testing.assert_frame_equal + """ + # Verify index dtypes are string types if they're named 'rsid' + if left.index.name == "rsid": + self.assertTrue( + is_string_dtype(left.index.dtype), + f"Left index dtype {left.index.dtype} is not a string type", + ) + if right.index.name == "rsid": + self.assertTrue( + is_string_dtype(right.index.dtype), + f"Right index dtype {right.index.dtype} is not a string type", + ) + + # Verify string columns (chrom, genotype) have string dtypes + for col in ["chrom", "genotype"]: + if col in left.columns: + self.assertTrue( + is_string_dtype(left[col].dtype) + or is_object_dtype(left[col].dtype), + f"Left column '{col}' dtype {left[col].dtype} is not a string/object type", + ) + if col in right.columns: + self.assertTrue( + is_string_dtype(right[col].dtype) + or is_object_dtype(right[col].dtype), + f"Right column '{col}' dtype {right[col].dtype} is not a string/object type", + ) + + # Compare DataFrames without strict dtype matching for string columns + pd.testing.assert_frame_equal( + left, right, check_index_type=False, check_dtype=False, **kwargs + ) + def make_normalized_dataframe_assertions(self, df): self.assertEqual(df.index.name, "rsid") - self.assertTrue(is_object_dtype(df.index.dtype)) - self.assertTrue(is_object_dtype(df.chrom.dtype)) + # Accept both object dtype and StringDtype (used in Python 3.14+) + self.assertTrue(is_string_dtype(df.index.dtype)) + self.assertTrue( + is_string_dtype(df.chrom.dtype) or is_object_dtype(df.chrom.dtype) + ) self.assertTrue(is_unsigned_integer_dtype(df.pos.dtype)) - self.assertTrue(is_object_dtype(df.genotype.dtype)) + self.assertTrue( + is_string_dtype(df.genotype.dtype) or is_object_dtype(df.genotype.dtype) + ) def parse_file(self, file, rsids=()): return SNPs(file, rsids=rsids) @@ -675,7 +752,7 @@ def make_parsing_assertions( print(snps_df.info()) self.assertEqual(snps.source, source) - pd.testing.assert_frame_equal(snps.snps, snps_df, check_exact=True) + self.assert_frame_equal_with_string_index(snps.snps, snps_df, check_exact=True) self.assertTrue(snps.phased) if phased else self.assertFalse(snps.phased) self.assertEqual(snps.build, build) ( @@ -699,11 +776,13 @@ def make_parsing_assertions_vcf( else: self.assertFalse(snps.unannotated_vcf) ( - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( snps.snps, snps_df.loc[rsids], check_exact=True ) if rsids - else pd.testing.assert_frame_equal(snps.snps, snps_df, check_exact=True) + else self.assert_frame_equal_with_string_index( + snps.snps, snps_df, check_exact=True + ) ) self.assertTrue(snps.phased) if phased else self.assertFalse(snps.phased) diff --git a/tests/io/test_generator.py b/tests/io/test_generator.py index 6856dd3c..11b77cdb 100644 --- a/tests/io/test_generator.py +++ b/tests/io/test_generator.py @@ -5,7 +5,6 @@ import gzip import os import tempfile -from unittest import TestCase import numpy as np import pandas as pd @@ -13,9 +12,10 @@ from snps import SNPs from snps.build_constants import BUILD_MARKER_SNPS from snps.io.generator import SyntheticSNPGenerator +from tests import BaseSNPsTestCase -class TestSyntheticSNPGenerator(TestCase): +class TestSyntheticSNPGenerator(BaseSNPsTestCase): def test_init_build_37(self) -> None: """Test generator initialization with Build 37.""" gen = SyntheticSNPGenerator(build=37, seed=123) @@ -107,7 +107,7 @@ def test_generate_snps_reproducibility(self) -> None: df2 = gen2.generate_snps(num_snps=1000) # Verify identical output - pd.testing.assert_frame_equal(df1, df2) + self.assert_frame_equal_with_string_index(df1, df2) def test_generate_snps_rsid_format(self) -> None: """Test that rsIDs are correctly formatted.""" diff --git a/tests/io/test_writer.py b/tests/io/test_writer.py index 9e23acbe..155ee0b2 100755 --- a/tests/io/test_writer.py +++ b/tests/io/test_writer.py @@ -2,7 +2,6 @@ import tempfile import numpy as np -import pandas as pd from snps import SNPs from snps.resources import ReferenceSequence, Resources @@ -169,7 +168,7 @@ def test_save_snps_vcf_discrepant_pos(self): self.assertEqual(s.to_vcf(), output) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.discrepant_vcf_position, self.create_snp_df( rsid=["rs1", "rs17"], diff --git a/tests/test_snps.py b/tests/test_snps.py index 176f29af..f3be4260 100755 --- a/tests/test_snps.py +++ b/tests/test_snps.py @@ -67,7 +67,7 @@ def test_build_detected_PAR_snps(self): snps = self.load_assign_PAR_SNPs("tests/input/GRCh37_PAR.csv") self.assertEqual(snps.build, 37) self.assertTrue(snps.build_detected) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( snps.snps, self.snps_GRCh37_PAR(), check_exact=True ) @@ -100,7 +100,7 @@ def test_deduplicate_false(self): pos=[101, 102, 103], genotype=["AA", "CC", "GG"], ) - pd.testing.assert_frame_equal(snps.snps, result, check_exact=True) + self.assert_frame_equal_with_string_index(snps.snps, result, check_exact=True) def test_deduplicate_MT_chrom(self): snps = SNPs("tests/input/ancestry_mt.txt") @@ -111,12 +111,12 @@ def test_deduplicate_MT_chrom(self): pos=[101, 102], genotype=["A", np.nan], ) - pd.testing.assert_frame_equal(snps.snps, result, check_exact=True) + self.assert_frame_equal_with_string_index(snps.snps, result, check_exact=True) heterozygous_MT_snps = self.create_snp_df( rsid=["rs3"], chrom=["MT"], pos=[103], genotype=["GC"] ) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( snps.heterozygous_MT, heterozygous_MT_snps, check_exact=True ) @@ -129,7 +129,7 @@ def test_deduplicate_MT_chrom_false(self): pos=[101, 102, 103], genotype=["AA", np.nan, "GC"], ) - pd.testing.assert_frame_equal(snps.snps, result, check_exact=True) + self.assert_frame_equal_with_string_index(snps.snps, result, check_exact=True) def test_duplicate_rsids(self): snps = SNPs("tests/input/duplicate_rsids.csv") @@ -139,8 +139,10 @@ def test_duplicate_rsids(self): duplicate = self.create_snp_df( rsid=["rs1", "rs1"], chrom=["1", "1"], pos=[102, 103], genotype=["CC", "GG"] ) - pd.testing.assert_frame_equal(snps.snps, result, check_exact=True) - pd.testing.assert_frame_equal(snps.duplicate, duplicate, check_exact=True) + self.assert_frame_equal_with_string_index(snps.snps, result, check_exact=True) + self.assert_frame_equal_with_string_index( + snps.duplicate, duplicate, check_exact=True + ) def test_empty_dataframe(self): for snps in self.empty_snps(): @@ -174,7 +176,7 @@ def test_summary_no_snps(self): def test_heterozygous(self): s = SNPs("tests/input/generic.csv") - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.heterozygous(), self.create_snp_df( rsid=["rs6", "rs7", "rs8"], @@ -187,7 +189,7 @@ def test_heterozygous(self): def test_homozygous(self): s = SNPs("tests/input/generic.csv") - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.homozygous(), self.create_snp_df( rsid=["rs1", "rs2", "rs3", "rs4"], @@ -200,7 +202,7 @@ def test_homozygous(self): def test_homozygous_chrom(self): s = SNPs("tests/input/generic.csv") - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.homozygous("1"), self.create_snp_df( rsid=["rs1", "rs2", "rs3", "rs4"], @@ -223,7 +225,7 @@ def test_notnull(self): s = SNPs("tests/input/generic.csv") snps = self.generic_snps() snps.drop("rs5", inplace=True) - pd.testing.assert_frame_equal(s.notnull(), snps, check_exact=True) + self.assert_frame_equal_with_string_index(s.notnull(), snps, check_exact=True) def test_only_detect_source(self): s = SNPs("tests/input/generic.csv", only_detect_source=True) @@ -246,7 +248,9 @@ def f(): self.assertEqual(s.assembly, "GRCh37") self.assertEqual(len(chromosomes_remapped), 2) self.assertEqual(len(chromosomes_not_remapped), 0) - pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True) + self.assert_frame_equal_with_string_index( + s.snps, self.snps_GRCh37(), check_exact=True + ) self._run_remap_test(f, self.NCBI36_GRCh37()) @@ -258,7 +262,9 @@ def f(): self.assertEqual(s.assembly, "GRCh37") self.assertEqual(len(chromosomes_remapped), 2) self.assertEqual(len(chromosomes_not_remapped), 0) - pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True) + self.assert_frame_equal_with_string_index( + s.snps, self.snps_GRCh37(), check_exact=True + ) self._run_remap_test(f, self.NCBI36_GRCh37()) @@ -270,7 +276,9 @@ def f(): self.assertEqual(s.assembly, "NCBI36") self.assertEqual(len(chromosomes_remapped), 2) self.assertEqual(len(chromosomes_not_remapped), 0) - pd.testing.assert_frame_equal(s.snps, self.snps_NCBI36(), check_exact=True) + self.assert_frame_equal_with_string_index( + s.snps, self.snps_NCBI36(), check_exact=True + ) self._run_remap_test(f, self.GRCh37_NCBI36()) @@ -282,7 +290,9 @@ def f(): self.assertEqual(s.assembly, "GRCh38") self.assertEqual(len(chromosomes_remapped), 2) self.assertEqual(len(chromosomes_not_remapped), 0) - pd.testing.assert_frame_equal(s.snps, self.snps_GRCh38(), check_exact=True) + self.assert_frame_equal_with_string_index( + s.snps, self.snps_GRCh38(), check_exact=True + ) self._run_remap_test(f, self.GRCh37_GRCh38()) @@ -296,7 +306,7 @@ def f(): self.assertEqual(len(chromosomes_remapped), 2) self.assertEqual(len(chromosomes_not_remapped), 1) self.assertEqual(s.count, 3) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.snps, self.snps_GRCh38_PAR(), check_exact=True ) @@ -309,7 +319,9 @@ def test_remap_37_to_37(self): self.assertEqual(s.assembly, "GRCh37") self.assertEqual(len(chromosomes_remapped), 0) self.assertEqual(len(chromosomes_not_remapped), 2) - pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True) + self.assert_frame_equal_with_string_index( + s.snps, self.snps_GRCh37(), check_exact=True + ) def test_remap_invalid_assembly(self): s = SNPs("tests/input/GRCh37.csv") @@ -356,7 +368,7 @@ def test_save_source(self): self.assertTrue(snps.build_detected) self.assertEqual(snps.source, "generic") self.assertListEqual(snps._source, ["generic"]) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( snps.snps, self.snps_GRCh38(), check_exact=True ) @@ -393,7 +405,9 @@ def test_sex_Male_X_chrom_discrepant_XY(self): result = self.create_snp_df( rsid=["rs8001"], chrom=["X"], pos=[80000001], genotype=["AC"] ) - pd.testing.assert_frame_equal(s.discrepant_XY, result, check_exact=True) + self.assert_frame_equal_with_string_index( + s.discrepant_XY, result, check_exact=True + ) self.assertEqual(s.sex, "Male") def test_sex_Male_Y_chrom(self): @@ -595,11 +609,11 @@ def test_snps_qc(self): def f(): s = SNPs("tests/input/generic.csv") # identify quality controlled SNPs - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.snps_qc, self.generic_snps().drop(["rs4", "rs6"]) ) # return already identified quality controlled SNPs (test branch) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.snps_qc, self.generic_snps().drop(["rs4", "rs6"]) ) @@ -609,11 +623,11 @@ def test_low_quality(self): def f(): s = SNPs("tests/input/generic.csv") # identify low quality SNPs - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.low_quality, self.generic_snps().loc[["rs4", "rs6"]] ) # return already identified low quality SNPs (test branch) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.low_quality, self.generic_snps().loc[["rs4", "rs6"]] ) @@ -622,7 +636,7 @@ def f(): def test_snps_qc_low_quality_no_cluster(self): def f(): s = SNPs("tests/input/generic.csv") - pd.testing.assert_frame_equal(s.snps_qc, self.generic_snps()) + self.assert_frame_equal_with_string_index(s.snps_qc, self.generic_snps()) self.assertEqual(len(s.low_quality), 0) self.run_low_quality_snps_test(f, self.get_low_quality_snps(), cluster="") @@ -634,10 +648,10 @@ def f(): s._snps.drop(["rs4", "rs5", "rs6", "rs7", "rs8"], inplace=True) s._build = 36 # manually set build 36 s.identify_low_quality_snps() - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.snps_qc, self.generic_snps().loc[["rs1", "rs3"]] ) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.low_quality, self.generic_snps().loc[["rs2"]] ) self.assertEqual(s.build, 36) # ensure copy gets remapped @@ -709,7 +723,7 @@ def test_source_snps(self): s = SNPs(dest) self.assertEqual(s.source, "generic, 23andMe") self.assertListEqual(s._source, ["generic", "23andMe"]) - pd.testing.assert_frame_equal(s.snps, s.snps, check_exact=True) + self.assert_frame_equal_with_string_index(s.snps, s.snps, check_exact=True) self.assert_results(results, [{"merged": True}]) def test_merge_list(self): @@ -717,7 +731,9 @@ def test_merge_list(self): results = s.merge( [SNPs("tests/input/GRCh37.csv"), SNPs("tests/input/GRCh37.csv")] ) - pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True) + self.assert_frame_equal_with_string_index( + s.snps, self.snps_GRCh37(), check_exact=True + ) self.assertEqual(s.source, "generic, generic") self.assertListEqual(s._source, ["generic", "generic"]) self.assert_results( @@ -740,7 +756,9 @@ def f(): results = s.merge([SNPs("tests/input/GRCh37.csv")]) self.assertEqual(len(s.discrepant_merge_positions), 0) self.assertEqual(len(s.discrepant_merge_genotypes), 0) - pd.testing.assert_frame_equal(s.snps, self.snps_NCBI36(), check_exact=True) + self.assert_frame_equal_with_string_index( + s.snps, self.snps_NCBI36(), check_exact=True + ) self.assert_results( results, [ @@ -784,7 +802,7 @@ def test_merge_remap_false(self): expected.loc["rs11928389", "genotype"] = ( np.nan ) # discrepant genotype is set to null / NA - pd.testing.assert_frame_equal(s.snps, expected, check_exact=True) + self.assert_frame_equal_with_string_index(s.snps, expected, check_exact=True) self.assert_results( results, [ @@ -811,7 +829,9 @@ def test_merge_phased(self): results = s1.merge([s2]) self.assertTrue(s1.phased) - pd.testing.assert_frame_equal(s1.snps, self.generic_snps(), check_exact=True) + self.assert_frame_equal_with_string_index( + s1.snps, self.generic_snps(), check_exact=True + ) self.assert_results( results, [ @@ -832,7 +852,9 @@ def test_merge_unphased(self): results = s1.merge([s2]) self.assertFalse(s1.phased) - pd.testing.assert_frame_equal(s1.snps, self.generic_snps(), check_exact=True) + self.assert_frame_equal_with_string_index( + s1.snps, self.generic_snps(), check_exact=True + ) self.assert_results( results, [ @@ -851,7 +873,9 @@ def test_merge_non_existent_file(self): results = s.merge( [SNPs("tests/input/non_existent_file.csv"), SNPs("tests/input/GRCh37.csv")] ) - pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True) + self.assert_frame_equal_with_string_index( + s.snps, self.snps_GRCh37(), check_exact=True + ) self.assert_results(results, [{}, {"merged": True}]) def test_merge_invalid_file(self): @@ -859,7 +883,9 @@ def test_merge_invalid_file(self): results = s.merge( [SNPs("tests/input/GRCh37.csv"), SNPs("tests/input/empty.txt")] ) - pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True) + self.assert_frame_equal_with_string_index( + s.snps, self.snps_GRCh37(), check_exact=True + ) self.assert_results(results, [{"merged": True}, {}]) def test_merge_exceed_discrepant_positions_threshold(self): @@ -871,7 +897,9 @@ def test_merge_exceed_discrepant_positions_threshold(self): self.assertEqual(len(s1.discrepant_merge_positions), 0) self.assertEqual(len(s1.discrepant_merge_genotypes), 0) self.assertEqual(len(s1.discrepant_merge_positions_genotypes), 0) - pd.testing.assert_frame_equal(s1.snps, self.generic_snps(), check_exact=True) + self.assert_frame_equal_with_string_index( + s1.snps, self.generic_snps(), check_exact=True + ) self.assert_results(results, [{}]) def test_merge_exceed_discrepant_genotypes_threshold(self): @@ -883,7 +911,9 @@ def test_merge_exceed_discrepant_genotypes_threshold(self): self.assertEqual(len(s1.discrepant_merge_positions), 0) self.assertEqual(len(s1.discrepant_merge_genotypes), 0) self.assertEqual(len(s1.discrepant_merge_positions_genotypes), 0) - pd.testing.assert_frame_equal(s1.snps, self.generic_snps(), check_exact=True) + self.assert_frame_equal_with_string_index( + s1.snps, self.generic_snps(), check_exact=True + ) self.assert_results(results, [{}]) def test_merging_files_discrepant_snps(self): @@ -960,7 +990,9 @@ def test_merging_files_discrepant_snps(self): ) pd.testing.assert_series_equal(s.snps["pos"], expected["pos"]) - pd.testing.assert_series_equal(s.snps["genotype"], expected["genotype"]) + self.assert_series_equal_with_string_dtype( + s.snps["genotype"], expected["genotype"] + ) def test_appending_dfs(self): s = SNPs() @@ -977,12 +1009,12 @@ def test_appending_dfs(self): df = self.create_snp_df( rsid=["rs1", "rs1"], chrom=["1", "1"], pos=[1, 1], genotype=["AA", "AA"] ) - pd.testing.assert_frame_equal(s.duplicate, df, check_exact=True) - pd.testing.assert_frame_equal(s.discrepant_XY, df, check_exact=True) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index(s.duplicate, df, check_exact=True) + self.assert_frame_equal_with_string_index(s.discrepant_XY, df, check_exact=True) + self.assert_frame_equal_with_string_index( s.heterozygous_MT, get_empty_snps_dataframe(), check_exact=True ) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.discrepant_vcf_position, get_empty_snps_dataframe(), check_exact=True ) @@ -1021,7 +1053,7 @@ def test_merge_chrom(self): results = s1.merge([s2], chrom="Y") - pd.testing.assert_frame_equal(s1.snps, df, check_exact=True) + self.assert_frame_equal_with_string_index(s1.snps, df, check_exact=True) self.assert_results( results, @@ -1069,7 +1101,7 @@ def f2(): self.assertEqual(s.assembly, "GRCh37") self.assertEqual(len(chromosomes_remapped), 2) self.assertEqual(len(chromosomes_not_remapped), 0) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.snps, self.snps_GRCh37(), check_exact=True ) @@ -1127,7 +1159,9 @@ def f(): s = SNPs("tests/input/generic.csv") snps = self.generic_snps() snps.drop("rs5", inplace=True) - pd.testing.assert_frame_equal(s.not_null_snps(), snps, check_exact=True) + self.assert_frame_equal_with_string_index( + s.not_null_snps(), snps, check_exact=True + ) self.run_deprecated_test(f, "This method has been renamed to `notnull`.") @@ -1184,8 +1218,10 @@ def f(): pos=[102, 103], genotype=["CC", "GG"], ) - pd.testing.assert_frame_equal(snps.snps, result, check_exact=True) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( + snps.snps, result, check_exact=True + ) + self.assert_frame_equal_with_string_index( snps.duplicate_snps, duplicate, check_exact=True ) @@ -1203,7 +1239,7 @@ def f(): result = self.create_snp_df( rsid=["rs8001"], chrom=["X"], pos=[80000001], genotype=["AC"] ) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.discrepant_XY_snps, result, check_exact=True ) self.assertEqual(s.sex, "Male") @@ -1222,12 +1258,14 @@ def f(): pos=[101, 102], genotype=["A", np.nan], ) - pd.testing.assert_frame_equal(snps.snps, result, check_exact=True) + self.assert_frame_equal_with_string_index( + snps.snps, result, check_exact=True + ) heterozygous_MT_snps = self.create_snp_df( rsid=["rs3"], chrom=["MT"], pos=[103], genotype=["GC"] ) - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( snps.heterozygous_MT_snps, heterozygous_MT_snps, check_exact=True ) @@ -1238,7 +1276,7 @@ def f(): def test_heterozygous_snps(self): def f(): s = SNPs("tests/input/generic.csv") - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.heterozygous_snps(), self.create_snp_df( rsid=["rs6", "rs7", "rs8"], @@ -1254,7 +1292,7 @@ def f(): def test_homozygous_snps(self): def f(): s = SNPs("tests/input/generic.csv") - pd.testing.assert_frame_equal( + self.assert_frame_equal_with_string_index( s.homozygous_snps(), self.create_snp_df( rsid=["rs1", "rs2", "rs3", "rs4"],