From 90cd8bfa6540de3bae5b4960fb834b27547d3fb8 Mon Sep 17 00:00:00 2001
From: Andrew Riha <apriha@gmail.com>
Date: Fri, 9 Jan 2026 21:00:37 -0800
Subject: [PATCH 1/2] Add synthetic SNPs generation feature

---
 .gitignore                                    |   2 +
 .vscode/tasks.json                            |  30 +
 README.rst                                    | 207 ++-----
 analysis/parse-opensnp-files/README.rst       |  15 -
 analysis/parse-opensnp-files/get_file.py      |  21 -
 .../parse_opensnp_files.py                    | 131 -----
 analysis/xy-chrom-snp-ratios/README.rst       |  11 +
 docs/index.rst                                |   2 +-
 docs/snps.rst                                 |   8 +
 setup.cfg                                     |   1 +
 src/snps/build_constants.py                   | 133 +++++
 src/snps/io/__init__.py                       |   3 +-
 src/snps/io/generator.py                      | 545 ++++++++++++++++++
 src/snps/io/reader.py                         |  24 +-
 src/snps/resources.py                         | 148 ++---
 src/snps/snps.py                              |  68 +--
 tests/io/test_generator.py                    | 480 +++++++++++++++
 tests/test_resources.py                       | 107 ++--
 18 files changed, 1376 insertions(+), 560 deletions(-)
 create mode 100644 .vscode/tasks.json
 delete mode 100644 analysis/parse-opensnp-files/README.rst
 delete mode 100644 analysis/parse-opensnp-files/get_file.py
 delete mode 100644 analysis/parse-opensnp-files/parse_opensnp_files.py
 create mode 100644 src/snps/build_constants.py
 create mode 100644 src/snps/io/generator.py
 create mode 100644 tests/io/test_generator.py

diff --git a/.gitignore b/.gitignore
index 038d98ca..98ecec51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,10 +103,12 @@ ENV/
 
 # development
 .idea/
+.DS_Store
 
 # snps
 output*/
 resources/*
+synthetic_23andme.txt.gz
 
 # snps tests
 tests/output/*
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 00000000..9127f594
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,30 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Run All Tests",
+            "type": "shell",
+            "command": "source .venv/bin/activate && python -m pytest tests README.rst",
+            "group": {
+                "kind": "test",
+                "isDefault": true
+            },
+            "presentation": {
+                "reveal": "always",
+                "panel": "dedicated"
+            },
+            "problemMatcher": []
+        },
+        {
+            "label": "Test README Examples",
+            "type": "shell",
+            "command": "source .venv/bin/activate && python -m doctest README.rst -v",
+            "group": "test",
+            "presentation": {
+                "reveal": "always",
+                "panel": "dedicated"
+            },
+            "problemMatcher": []
+        }
+    ]
+}
diff --git a/README.rst b/README.rst
index 43ffd4a4..a7831a20 100644
--- a/README.rst
+++ b/README.rst
@@ -4,7 +4,7 @@
 
 snps
 ====
-tools for reading, writing, merging, and remapping SNPs 🧬
+tools for reading, writing, generating, merging, and remapping SNPs 🧬
 
 ``snps`` *strives to be an easy-to-use and accessible open-source library for working with
 genotype data*
@@ -19,8 +19,9 @@ Input / Output
 - Read and write VCF files (e.g., convert `23andMe <https://www.23andme.com>`_ to VCF)
 - Merge raw data files from different DNA tests, identifying discrepant SNPs in the process
 - Read data in a variety of formats (e.g., files, bytes, compressed with `gzip` or `zip`)
-- Handle several variations of file types, validated via
-  `openSNP parsing analysis <https://github.com/apriha/snps/tree/main/analysis/parse-opensnp-files>`_
+- Handle several variations of file types, historically validated using
+  data from `openSNP <https://opensnp.org>`_
+- Generate synthetic genotype data for testing and examples
 
 Build / Assembly Detection and Remapping
 ````````````````````````````````````````
@@ -90,175 +91,91 @@ capability, ``snps`` can be installed with `ezancestry <https://github.com/arvke
 
 Examples
 --------
-Download Example Data
-`````````````````````
-First, let's setup logging to get some helpful output:
 
->>> import logging, sys
->>> logger = logging.getLogger()
->>> logger.setLevel(logging.INFO)
->>> logger.addHandler(logging.StreamHandler(sys.stdout))
-
-Now we're ready to download some example data from `openSNP <https://opensnp.org>`_:
+To try these examples, first generate some sample data:
 
 >>> from snps.resources import Resources
->>> r = Resources()
->>> paths = r.download_example_datasets()
-Downloading resources/662.23andme.340.txt.gz
-Downloading resources/662.ftdna-illumina.341.csv.gz
+>>> paths = Resources().create_example_datasets()
 
-Load Raw Data
-`````````````
-Load a `23andMe <https://www.23andme.com>`_ raw data file:
+Load a Raw Data File
+````````````````````
+Load a raw data file exported from a DNA testing source (e.g.,
+`23andMe <https://www.23andme.com>`_, `AncestryDNA <https://www.ancestry.com>`_,
+`Family Tree DNA <https://www.familytreedna.com>`_):
 
 >>> from snps import SNPs
->>> s = SNPs("resources/662.23andme.340.txt.gz")
+>>> s = SNPs("resources/sample1.23andme.txt.gz")
+
+``snps`` automatically detects the source format and `normalizes
+<https://snps.readthedocs.io/en/stable/snps.html#snps.snps.SNPs.snps>`_ the data:
+
 >>> s.source
 '23andMe'
 >>> s.count
-991786
+991767
+>>> s.build
+37
+>>> s.assembly
+'GRCh37'
 
-The ``SNPs`` class accepts a path to a file or a bytes object. A ``Reader`` class attempts to
-infer the data source and load the SNPs. The loaded SNPs are
-`normalized <https://snps.readthedocs.io/en/stable/snps.html#snps.snps.SNPs.snps>`_ and
-available via a ``pandas.DataFrame``:
+The SNPs are available as a ``pandas.DataFrame``:
 
 >>> df = s.snps
 >>> df.columns.values
 array(['chrom', 'pos', 'genotype'], dtype=object)
->>> df.index.name
-'rsid'
->>> df.chrom.dtype.name
-'object'
->>> df.pos.dtype.name
-'uint32'
->>> df.genotype.dtype.name
-'object'
 >>> len(df)
-991786
-
-``snps`` also attempts to detect the build / assembly of the data:
-
->>> s.build
-37
->>> s.build_detected
-True
->>> s.assembly
-'GRCh37'
+991767
 
 Merge Raw Data Files
 ````````````````````
-The dataset consists of raw data files from two different DNA testing sources - let's combine
-these files. Specifically, we'll update the ``SNPs`` object with SNPs from a
-`Family Tree DNA <https://www.familytreedna.com>`_ file.
-
->>> merge_results = s.merge([SNPs("resources/662.ftdna-illumina.341.csv.gz")])
-Merging SNPs('662.ftdna-illumina.341.csv.gz')
-SNPs('662.ftdna-illumina.341.csv.gz') has Build 36; remapping to Build 37
-Downloading resources/NCBI36_GRCh37.tar.gz
-27 SNP positions were discrepant; keeping original positions
-151 SNP genotypes were discrepant; marking those as null
->>> s.source
-'23andMe, FTDNA'
->>> s.count
-1006960
->>> s.build
-37
->>> s.build_detected
-True
+Combine SNPs from multiple files (e.g., combine data from different testing companies):
 
-If the SNPs being merged have a build that differs from the destination build, the SNPs to merge
-will be remapped automatically. After this example merge, the build is still detected, since the
-build was detected for all ``SNPs`` objects that were merged.
+>>> results = s.merge([SNPs("resources/sample2.ftdna.csv.gz")])
+>>> s.count
+1006949
 
-As the data gets added, it's compared to the existing data, and SNP position and genotype
-discrepancies are identified. (The discrepancy thresholds can be tuned via parameters.) These
-discrepant SNPs are available for inspection after the merge via properties of the ``SNPs`` object.
+SNPs are compared during the merge. Position and genotype discrepancies are identified and
+can be inspected via properties of the ``SNPs`` object:
 
+>>> len(s.discrepant_merge_positions)
+27
 >>> len(s.discrepant_merge_genotypes)
-151
-
-Additionally, any non-called / null genotypes will be updated during the merge, if the file
-being merged has a called genotype for the SNP.
-
-Moreover, ``merge`` takes a ``chrom`` parameter - this enables merging of only SNPs associated
-with the specified chromosome (e.g., "Y" or "MT").
-
-Finally, ``merge`` returns a list of ``dict``, where each ``dict`` has information corresponding
-to the results of each merge (e.g., SNPs in common).
-
->>> sorted(list(merge_results[0].keys()))
-['common_rsids', 'discrepant_genotype_rsids', 'discrepant_position_rsids', 'merged']
->>> merge_results[0]["merged"]
-True
->>> len(merge_results[0]["common_rsids"])
-692918
+156
 
 Remap SNPs
 ``````````
-Now, let's remap the merged SNPs to change the assembly / build:
+Convert SNPs between genome assemblies (Build 36/NCBI36, Build 37/GRCh37, Build 38/GRCh38):
 
->>> s.snps.loc["rs3094315"].pos
-752566
 >>> chromosomes_remapped, chromosomes_not_remapped = s.remap(38)
-Downloading resources/GRCh37_GRCh38.tar.gz
->>> s.build
-38
 >>> s.assembly
 'GRCh38'
->>> s.snps.loc["rs3094315"].pos
-817186
-
-SNPs can be remapped between Build 36 (``NCBI36``), Build 37 (``GRCh37``), and Build 38
-(``GRCh38``).
 
 Save SNPs
 `````````
-Ok, so far we've merged the SNPs from two files (ensuring the same build in the process and
-identifying discrepancies along the way). Then, we remapped the SNPs to Build 38. Now, let's save
-the merged and remapped dataset consisting of 1M+ SNPs to a tab-separated values (TSV) file:
-
->>> saved_snps = s.to_tsv("out.txt")
-Saving output/out.txt
->>> print(saved_snps)
-output/out.txt
-
-Moreover, let's get the reference sequences for this assembly and save the SNPs as a VCF file:
-
->>> saved_snps = s.to_vcf("out.vcf")
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.2.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.3.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.4.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.5.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.6.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.7.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.8.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.9.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.10.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.11.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.12.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.13.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.14.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.15.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.16.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.17.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.18.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.19.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.21.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.22.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.X.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.Y.fa.gz
-Downloading resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz
-Saving output/out.vcf
-1 SNP positions were found to be discrepant when saving VCF
-
-When saving a VCF, if any SNPs have positions outside of the reference sequence, they are marked
-as discrepant and are available via a property of the ``SNPs`` object.
-
-All `output files <https://snps.readthedocs.io/en/stable/output_files.html>`_ are saved to the
-output directory.
+Save SNPs to common file formats:
+
+>>> _ = s.to_tsv("output.txt")
+>>> _ = s.to_csv("output.csv")
+
+To save as VCF, ``snps`` automatically downloads the required reference sequences for the
+assembly. This ensures the REF alleles in the VCF are accurate:
+
+>>> _ = s.to_vcf("output.vcf")  # doctest: +SKIP
+
+All output files are saved to the `output directory
+<https://snps.readthedocs.io/en/stable/output_files.html>`_.
+
+Generate Synthetic Data
+```````````````````````
+Generate synthetic genotype data for testing, examples, or demonstrations:
+
+>>> from snps.io import SyntheticSNPGenerator
+>>> gen = SyntheticSNPGenerator(build=37, seed=123)
+>>> gen.save_as_23andme("synthetic_23andme.txt.gz", num_snps=10000)
+'synthetic_23andme.txt.gz'
+
+The generator supports multiple output formats (23andMe, AncestryDNA, FTDNA) and
+automatically injects build-specific marker SNPs to ensure accurate build detection.
 
 Documentation
 -------------
@@ -266,11 +183,13 @@ Documentation is available `here <https://snps.readthedocs.io/>`_.
 
 Acknowledgements
 ----------------
-Thanks to Mike Agostino, Padma Reddy, Kevin Arvai, `openSNP <https://opensnp.org>`_,
-`Open Humans <https://www.openhumans.org>`_, and `Sano Genetics <https://sanogenetics.com>`_.
+Thanks to Mike Agostino, Padma Reddy, Kevin Arvai, `Open Humans <https://www.openhumans.org>`_,
+and `Sano Genetics <https://sanogenetics.com>`_. This project was historically validated using
+data from `openSNP <https://opensnp.org>`_.
 
-``snps`` incorporates code and concepts generated with the assistance of
-`OpenAI's <https://openai.com>`_ `ChatGPT <https://chatgpt.com>`_. ✨
+``snps`` incorporates code and concepts generated with the assistance of various
+generative AI tools (including but not limited to `ChatGPT <https://chatgpt.com>`_,
+`Grok <https://grok.com>`_, and `Claude <https://claude.ai>`_). ✨
 
 License
 -------
diff --git a/analysis/parse-opensnp-files/README.rst b/analysis/parse-opensnp-files/README.rst
deleted file mode 100644
index 6d6f8b90..00000000
--- a/analysis/parse-opensnp-files/README.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-parse-opensnp-files
-===================
-scripts to load and debug parsing of openSNP datadump files
-
-Method
-------
-Attempt to parse each file in the `openSNP <https://opensnp.org>`_ datadump by creating a
-``SNPs`` object. For files where SNPs were loaded, save summary statistics to a dataframe and
-output as a CSV. For files where no SNPs were loaded, save a message for each file indicating
-the issue and optionally extract these files from the datadump for debugging.
-
-Results
--------
-As of May 2020, ``snps`` can parse ~96.6% of the genotype files in the datadump. Additionally,
-``snps`` can detect the build in ~99.9% of those files.
diff --git a/analysis/parse-opensnp-files/get_file.py b/analysis/parse-opensnp-files/get_file.py
deleted file mode 100644
index 2e1ae536..00000000
--- a/analysis/parse-opensnp-files/get_file.py
+++ /dev/null
@@ -1,21 +0,0 @@
-"""Get a file from the openSNP datadump for debugging."""
-
-import os
-
-from atomicwrites import atomic_write
-
-from snps.resources import Resources
-from snps.utils import create_dir
-
-OUTPUT_DIR = "output"
-FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt"
-
-if __name__ == "__main__":
-    # create output directory for this example
-    create_dir(OUTPUT_DIR)
-
-    # assume script is being run from examples dir
-    r = Resources(resources_dir="../../resources")
-
-    with atomic_write(os.path.join(OUTPUT_DIR, FILE), mode="wb") as f:
-        f.write(r.load_opensnp_datadump_file(FILE))
diff --git a/analysis/parse-opensnp-files/parse_opensnp_files.py b/analysis/parse-opensnp-files/parse_opensnp_files.py
deleted file mode 100644
index 7a55112f..00000000
--- a/analysis/parse-opensnp-files/parse_opensnp_files.py
+++ /dev/null
@@ -1,131 +0,0 @@
-"""Parse openSNP datadump files.
-
-Attempt to parse each file in the openSNP datadump. For files where SNPs were loaded,
-save summary statistics to a dataframe and output as a CSV. For files where no SNPs were
-loaded, save a message for each file indicating the issue and optionally extract these
-files from the datadump for debugging.
-
-"""
-
-import logging
-import os
-import random
-
-import pandas as pd
-from atomicwrites import atomic_write
-
-from snps import SNPs
-from snps.resources import Resources
-from snps.utils import Parallelizer, clean_str, create_dir, save_df_as_csv
-
-OUTPUT_DIR = "output"
-EXTRACT_FILES = True
-
-# create output directory for this example
-create_dir(OUTPUT_DIR)
-
-# assume script is being run from examples dir
-r = Resources(resources_dir="../../resources")
-
-# setup logger to output to file in output directory
-logging.basicConfig(
-    filename=f"{os.path.join(OUTPUT_DIR, 'parse-opensnp-files.txt')}",
-    format="%(asctime)s: %(message)s",
-    filemode="w",
-    level=logging.INFO,
-)
-
-logger = logging.getLogger()
-
-
-def load_file(task):
-    file = task["file"]
-
-    try:
-        s = SNPs(r.load_opensnp_datadump_file(file), assign_par_snps=False)
-    except Exception as err:
-        return {"msg": str(err).strip()[:100], "file": file}
-
-    if s.count != 0:
-        d = s.summary
-        d.update({"file": file})
-        return d
-    else:
-        return {"msg": "no SNPs processed", "file": file}
-
-
-def main():
-    logger.info("start")
-
-    # get filenames from openSNP data dump
-    filenames = r.get_opensnp_datadump_filenames()
-
-    filenames = [
-        filename
-        for filename in filenames
-        if "readme" not in filename and "phenotype" not in filename
-    ]
-
-    # draw a sample from the observations
-    random.seed(1)
-    SAMPLE_SIZE = len(filenames)
-    # SAMPLE_SIZE = 10
-    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)
-
-    # setup tasks for parallelizing / execution on multiple cores
-    p = Parallelizer(parallelize=True)
-    tasks = [{"file": filenames[i]} for i in samples]
-
-    # run tasks; results is a list of dicts
-    results = p(load_file, tasks)
-
-    # get results from `load_file` where `count` was non-zero
-    rows = [item for item in results if "msg" not in item]
-
-    df = pd.DataFrame(
-        rows,
-        columns=["file", "source", "build", "build_detected", "chromosomes", "count"],
-    )
-
-    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")
-
-    # log parsing statistics
-    file_count = len(filenames)
-    logger.info(f"{file_count} files in the openSNP datadump")
-    logger.info(f"{(len(df) / file_count):.2%} of openSNP datadump files parsed")
-    logger.info(
-        f"build detected in {len(df.loc[df.build_detected]) / len(df):.2%} of files parsed"
-    )
-
-    # extract files from the datadump where `load_file` returned a message
-    if EXTRACT_FILES:
-        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
-        d = {}
-        for result in results:
-            if "msg" in result:
-                if result["msg"] in d:
-                    d[result["msg"]].append(result["file"])
-                else:
-                    d[result["msg"]] = [result["file"]]
-
-        # add messages / file filters as necessary...
-        d["build not detected"] = list(df.loc[~df.build_detected].file.values)
-
-        # extract files that have messages for debugging
-        for msg, files in d.items():
-            if len(files) == 0:
-                continue
-
-            # create a directory for each message (prefix indicates number of files)
-            path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}")
-            create_dir(path)
-            # save each file with message into created directory
-            for filename in files:
-                with atomic_write(os.path.join(path, filename), mode="wb") as f:
-                    f.write(r.load_opensnp_datadump_file(filename))
-
-    logger.info("stop")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/analysis/xy-chrom-snp-ratios/README.rst b/analysis/xy-chrom-snp-ratios/README.rst
index 8fedb789..5191e83c 100644
--- a/analysis/xy-chrom-snp-ratios/README.rst
+++ b/analysis/xy-chrom-snp-ratios/README.rst
@@ -2,6 +2,17 @@ xy-chrom-snp-ratios
 ===================
 an analysis of heterozygous X SNP and not-null Y SNP ratios
 
+.. note::
+
+   **Historical Analysis**
+
+   This analysis was conducted using openSNP data dump functionality that has been
+   removed from the library. The code in this directory is **non-functional** with
+   current versions of ``snps`` and is preserved for historical reference only.
+
+   The analysis results and visualization remain valid as a demonstration of sex
+   determination from SNP data.
+
 Method
 ------
 All files in the `openSNP <https://opensnp.org>`_ data dump that ``snps`` could read were
diff --git a/docs/index.rst b/docs/index.rst
index 9dfa6878..986d3292 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -5,7 +5,7 @@
 
 `snps`
 ======
-*tools for reading, writing, merging, and remapping SNPs*
+*tools for reading, writing, generating, merging, and remapping SNPs*
 
 
 .. toctree::
diff --git a/docs/snps.rst b/docs/snps.rst
index d9110f28..ebf89552 100644
--- a/docs/snps.rst
+++ b/docs/snps.rst
@@ -47,6 +47,14 @@ snps.io.writer
     :show-inheritance:
     :exclude-members: write_file
 
+snps.io.generator
+~~~~~~~~~~~~~~~~~
+
+.. automodule:: snps.io.generator
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 snps.resources
 --------------
 
diff --git a/setup.cfg b/setup.cfg
index fb025f86..180fc9f2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,6 +19,7 @@ addopts =
     -ra
     --strict
     --tb=short
+    --doctest-glob=README.rst
 
 # http://coverage.readthedocs.io/en/latest/
 [coverage:run]
diff --git a/src/snps/build_constants.py b/src/snps/build_constants.py
new file mode 100644
index 00000000..7fd1a9e4
--- /dev/null
+++ b/src/snps/build_constants.py
@@ -0,0 +1,133 @@
+"""Centralized constants for genome builds and marker SNPs.
+
+This module provides single source of truth for:
+- Valid genome builds
+- Marker SNPs for build detection with positions
+- Chromosome sizes for each build
+
+References
+----------
+1. Genome Reference Consortium, https://www.ncbi.nlm.nih.gov/grc
+   - GRCh38: https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh38
+   - GRCh37: https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh37
+2. Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center
+   for Biotechnology Information, National Library of Medicine. dbSNP accession: rs3094315,
+   rs11928389, rs2500347, rs964481, rs2341354, rs3850290, and rs1329546
+   (dbSNP Build ID: 151). Available from: http://www.ncbi.nlm.nih.gov/SNP/
+3. UCSC Genome Browser chromosome size data:
+   - NCBI36/hg18: https://hgdownload.soe.ucsc.edu/goldenPath/hg18/bigZips/hg18.chrom.sizes
+   - GRCh37/hg19: https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes
+   - GRCh38/hg38: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes
+"""
+
+from typing import Dict, Final, Tuple, Union
+
+# Valid genome builds supported by this library
+VALID_BUILDS: Final[Tuple[int, int, int]] = (36, 37, 38)
+
+# Build name mappings
+BUILD_ASSEMBLY_MAP: Final[Dict[int, str]] = {
+    36: "NCBI36",
+    37: "GRCh37",
+    38: "GRCh38",
+}
+
+# Marker SNPs for build detection
+# These SNPs have known positions across different builds and are used
+# to automatically detect which build a genotype file uses
+# Format: {rsid: {"chrom": str, 36: pos, 37: pos, 38: pos}}
+BUILD_MARKER_SNPS: Final[Dict[str, Dict[Union[str, int], Union[str, int]]]] = {
+    "rs3094315": {"chrom": "1", 36: 742429, 37: 752566, 38: 817186},
+    "rs11928389": {"chrom": "1", 36: 50908372, 37: 50927009, 38: 50889578},
+    "rs2500347": {"chrom": "1", 36: 143649677, 37: 144938320, 38: 148946169},
+    "rs964481": {"chrom": "20", 36: 27566744, 37: 27656823, 38: 27638706},
+    "rs2341354": {"chrom": "1", 36: 908436, 37: 918573, 38: 983193},
+    "rs3850290": {"chrom": "2", 36: 22315141, 37: 23245301, 38: 22776092},
+    "rs1329546": {"chrom": "1", 36: 135302086, 37: 135474420, 38: 136392261},
+}
+
+# Chromosome sizes indexed by build, then chromosome
+# Values are approximate sizes in base pairs
+CHROM_SIZES: Final[Dict[int, Dict[str, int]]] = {
+    36: {
+        "1": 247249719,
+        "2": 242951149,
+        "3": 199501827,
+        "4": 191273063,
+        "5": 180857866,
+        "6": 170899992,
+        "7": 158821424,
+        "8": 146274826,
+        "9": 140273252,
+        "10": 135374737,
+        "11": 134452384,
+        "12": 132349534,
+        "13": 114142980,
+        "14": 106368585,
+        "15": 100338915,
+        "16": 88827254,
+        "17": 78774742,
+        "18": 76117153,
+        "19": 63811651,
+        "20": 62435964,
+        "21": 46944323,
+        "22": 49691432,
+        "X": 154913754,
+        "Y": 57772954,
+        "MT": 16571,
+    },
+    37: {
+        "1": 249250621,
+        "2": 243199373,
+        "3": 198022430,
+        "4": 191154276,
+        "5": 180915260,
+        "6": 171115067,
+        "7": 159138663,
+        "8": 146364022,
+        "9": 141213431,
+        "10": 135534747,
+        "11": 135006516,
+        "12": 133851895,
+        "13": 115169878,
+        "14": 107349540,
+        "15": 102531392,
+        "16": 90354753,
+        "17": 81195210,
+        "18": 78077248,
+        "19": 59128983,
+        "20": 63025520,
+        "21": 48129895,
+        "22": 51304566,
+        "X": 155270560,
+        "Y": 59373566,
+        "MT": 16571,
+    },
+    38: {
+        "1": 248956422,
+        "2": 242193529,
+        "3": 198295559,
+        "4": 190214555,
+        "5": 181538259,
+        "6": 170805979,
+        "7": 159345973,
+        "8": 145138636,
+        "9": 138394717,
+        "10": 133797422,
+        "11": 135086622,
+        "12": 133275309,
+        "13": 114364328,
+        "14": 107043718,
+        "15": 101991189,
+        "16": 90338345,
+        "17": 83257441,
+        "18": 80373285,
+        "19": 58617616,
+        "20": 64444167,
+        "21": 46709983,
+        "22": 50818468,
+        "X": 156040895,
+        "Y": 57227415,
+        "MT": 16569,
+    },
+}
diff --git a/src/snps/io/__init__.py b/src/snps/io/__init__.py
index 455b39e9..c500c3ea 100644
--- a/src/snps/io/__init__.py
+++ b/src/snps/io/__init__.py
@@ -1,5 +1,6 @@
-"""Classes for reading and writing SNPs."""
+"""Classes for reading, writing, and generating SNPs."""
 
+from .generator import SyntheticSNPGenerator as SyntheticSNPGenerator
 from .reader import Reader as Reader
 from .reader import get_empty_snps_dataframe as get_empty_snps_dataframe
 from .writer import Writer as Writer
diff --git a/src/snps/io/generator.py b/src/snps/io/generator.py
new file mode 100644
index 00000000..ab4a1efa
--- /dev/null
+++ b/src/snps/io/generator.py
@@ -0,0 +1,545 @@
+"""Generate synthetic genotype data for testing and examples."""
+
+from __future__ import annotations
+
+import gzip
+import logging
+import os
+from collections.abc import Callable
+from io import StringIO
+from typing import Any
+
+import numpy as np
+import pandas as pd
+from atomicwrites import atomic_write
+from numpy.typing import NDArray
+from pandas.api.types import CategoricalDtype
+
+from snps.build_constants import BUILD_MARKER_SNPS, CHROM_SIZES, VALID_BUILDS
+from snps.constants import REFERENCE_SEQUENCE_CHROMS
+from snps.io.reader import get_empty_snps_dataframe
+
+logger = logging.getLogger(__name__)
+
+
+class SyntheticSNPGenerator:
+    """Generate realistic synthetic genotype data.
+
+    This class generates synthetic SNP data that mimics real genotype files
+    from various DNA testing companies. The generated data is suitable for
+    testing, examples, and documentation.
+
+    Parameters
+    ----------
+    build : int
+        Genome build (36, 37, or 38), default is 37
+    seed : int, optional
+        Random seed for reproducibility
+
+    Examples
+    --------
+    >>> gen = SyntheticSNPGenerator(build=37, seed=123)
+    >>> gen.save_as_23andme("output.txt", num_snps=10000)
+    'output.txt'
+    """
+
+    def __init__(self, build: int = 37, seed: int | None = None) -> None:
+        if build not in VALID_BUILDS:
+            raise ValueError(f"Unsupported build: {build}. Use {VALID_BUILDS}.")
+        self.build: int = build
+        self.seed: int | None = seed
+        self.rng: np.random.Generator = np.random.default_rng(seed)
+        self.chrom_sizes: dict[str, int] = CHROM_SIZES[build]
+
+    def generate_snps(
+        self,
+        num_snps: int = 10000,
+        chromosomes: list[str] | None = None,
+        missing_rate: float = 0.01,
+        inject_build_markers: bool = True,
+    ) -> pd.DataFrame:
+        """Generate a DataFrame of synthetic SNPs.
+
+        Parameters
+        ----------
+        num_snps : int
+            Approximate number of SNPs to generate
+        chromosomes : list of str, optional
+            Chromosomes to include (default: all autosomes plus X, Y, MT)
+        missing_rate : float
+            Proportion of SNPs with missing genotypes (default: 0.01)
+        inject_build_markers : bool
+            Inject known marker SNPs for build detection (default: True)
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame with columns: rsid (index), chrom, pos, genotype
+        """
+        if chromosomes is None:
+            chromosomes = list(REFERENCE_SEQUENCE_CHROMS)
+
+        snps_per_chrom = self._calculate_snps_per_chromosome(num_snps, chromosomes)
+        all_snps = []
+        rsid_counter = 1
+
+        for chrom in chromosomes:
+            n = snps_per_chrom[chrom]
+            chrom_size = self.chrom_sizes[chrom]
+
+            # Cap SNP count to available positions (chromosome size - 1)
+            max_positions = chrom_size - 1
+            n = min(n, max_positions)
+
+            # Generate unique positions efficiently
+            positions = self._generate_unique_positions(n, chrom_size)
+            genotypes = self._generate_genotypes(n, missing_rate)
+            rsids = [f"rs{rsid_counter + i}" for i in range(n)]
+            rsid_counter += n
+
+            all_snps.append(
+                pd.DataFrame(
+                    {
+                        "rsid": rsids,
+                        "chrom": chrom,
+                        "pos": positions,
+                        "genotype": genotypes,
+                    }
+                )
+            )
+
+        if not all_snps:
+            return get_empty_snps_dataframe()
+
+        df = pd.concat(all_snps, ignore_index=True).set_index("rsid")
+        df["chrom"] = df["chrom"].astype(object)
+        df["pos"] = df["pos"].astype(np.uint32)
+        df["genotype"] = df["genotype"].astype(object)
+
+        if inject_build_markers:
+            df = self._inject_build_markers(df, chromosomes)
+
+        # Renumber RSIDs sequentially while preserving build markers
+        df = self._renumber_rsids_sequentially(df)
+
+        return df
+
+    def create_example_dataset_pair(self, output_dir: str = ".") -> tuple[str, str]:
+        """Create a pair of realistic example datasets suitable for merging.
+
+        Generates two correlated genotype files that share a large number of
+        common SNPs, with some discrepancies to demonstrate merge functionality.
+
+        Parameters
+        ----------
+        output_dir : str
+            Directory for output files
+
+        Returns
+        -------
+        tuple of (str, str)
+            Paths to (file1_23andme, file2_ftdna)
+        """
+        # Generate base dataset (~700K SNPs shared between files)
+        base_snps = self._generate_base_snps(700000)
+
+        # File 1: 23andMe format with ~991K SNPs
+        file1_df = self._create_file1_dataframe(base_snps, 291786)
+
+        # File 2: FTDNA format with ~715K SNPs, including discrepancies
+        file2_df = self._create_file2_dataframe(base_snps, 15194)
+
+        # Save files
+        path1 = os.path.join(output_dir, "sample1.23andme.txt.gz")
+        path2 = os.path.join(output_dir, "sample2.ftdna.csv.gz")
+
+        logger.info(f"Creating {os.path.relpath(path1)}")
+        logger.info(f"Creating {os.path.relpath(path2)}")
+
+        self._write_snps_as_23andme(file1_df, path1)
+        self._write_snps_as_ftdna(file2_df, path2)
+
+        return path1, path2
+
+    def _generate_base_snps(self, num_snps: int) -> pd.DataFrame:
+        """Generate base SNPs with sequential rsIDs."""
+        base = self.generate_snps(num_snps=num_snps, inject_build_markers=False)
+        base = self._sort_snps_dataframe(base).reset_index(drop=True)
+        base.index = pd.Index([f"rs{i + 1}" for i in range(len(base))], name="rsid")
+        return base
+
+    def _create_file1_dataframe(
+        self, base_snps: pd.DataFrame, unique_count: int
+    ) -> pd.DataFrame:
+        """Create file 1 DataFrame (23andMe format).
+
+        Note: RSIDs are NOT renumbered here to preserve RSID correspondence
+        with file2 for merge functionality. Unique SNPs use offset 10_000_001
+        to clearly distinguish them from base SNPs (rs1-rs700000).
+        """
+        # Use large offset to clearly separate unique SNPs from base SNPs
+        unique = self._generate_unique_snps(unique_count, 10_000_000)
+        df = self._sort_snps_dataframe(pd.concat([base_snps, unique]))
+        return self._inject_build_markers(df, list(REFERENCE_SEQUENCE_CHROMS))
+
+    def _create_file2_dataframe(
+        self, base_snps: pd.DataFrame, unique_count: int
+    ) -> pd.DataFrame:
+        """Create file 2 DataFrame (FTDNA format) with discrepancies.
+
+        Note: RSIDs are NOT renumbered here to preserve RSID correspondence
+        with file1 for merge functionality. Unique SNPs use offset 20_000_001
+        to clearly distinguish them from base SNPs and file1's unique SNPs.
+
+        This file only includes autosomal chromosomes (1-22), no X, Y, or MT.
+        """
+        # Only include autosomal chromosomes (1-22) for FTDNA format
+        autosomal_chroms = [str(i) for i in range(1, 23)]
+
+        # Filter base SNPs to autosomal chromosomes only
+        file2_base = base_snps[base_snps["chrom"].isin(autosomal_chroms)].copy()
+
+        # Create position discrepancies (27 SNPs)
+        pos_disc_indices = self._introduce_position_discrepancies(file2_base, 27)
+
+        # Create genotype discrepancies (151 SNPs, excluding position-discrepant ones)
+        self._introduce_genotype_discrepancies(
+            file2_base, 151, exclude=pos_disc_indices
+        )
+
+        # Use large offset to clearly separate unique SNPs from base SNPs
+        # Generate unique SNPs only for autosomal chromosomes
+        unique = self._generate_unique_snps(
+            unique_count, 20_000_000, chromosomes=autosomal_chroms
+        )
+        df = self._sort_snps_dataframe(pd.concat([file2_base, unique]))
+        return self._inject_build_markers(df, autosomal_chroms)
+
+    def _generate_unique_snps(
+        self, count: int, rsid_offset: int, chromosomes: list[str] | None = None
+    ) -> pd.DataFrame:
+        """Generate unique SNPs with rsIDs starting after offset."""
+        unique = self.generate_snps(
+            num_snps=count, inject_build_markers=False, chromosomes=chromosomes
+        )
+        unique = unique.reset_index(drop=True)
+        unique.index = pd.Index(
+            [f"rs{rsid_offset + i + 1}" for i in range(len(unique))], name="rsid"
+        )
+        return unique
+
+    def _introduce_position_discrepancies(
+        self, df: pd.DataFrame, count: int
+    ) -> set[Any]:
+        """Introduce position discrepancies in a DataFrame. Returns affected indices."""
+        indices = self.rng.choice(df.index, size=count, replace=False)
+        for idx in indices:
+            shift = int(self.rng.integers(-1000, 1000))
+            chrom = str(df.loc[idx, "chrom"])
+            max_pos = self.chrom_sizes[chrom]
+            new_pos = max(1, min(int(df.loc[idx, "pos"]) + shift, max_pos))  # type: ignore[arg-type]
+            df.loc[idx, "pos"] = np.uint32(new_pos)
+        return set(indices)
+
+    def _introduce_genotype_discrepancies(
+        self, df: pd.DataFrame, count: int, exclude: set[Any] | None = None
+    ) -> None:
+        """Introduce genotype discrepancies in a DataFrame, optionally excluding indices."""
+        valid_indices = [
+            idx for idx in df.index if exclude is None or idx not in exclude
+        ]
+        indices = self.rng.choice(valid_indices, size=count, replace=False)
+        bases = ["A", "C", "G", "T"]
+        for idx in indices:
+            current = str(df.loc[idx, "genotype"])
+            if current != "--":
+                new_allele = self.rng.choice([b for b in bases if b not in current])
+                df.loc[idx, "genotype"] = new_allele + new_allele
+
+    def _calculate_snps_per_chromosome(
+        self, num_snps: int, chromosomes: list[str]
+    ) -> dict[str, int]:
+        """Calculate proportional SNP distribution across chromosomes."""
+        total_size = sum(self.chrom_sizes[c] for c in chromosomes)
+        return {
+            chrom: max(1, int(num_snps * self.chrom_sizes[chrom] / total_size))
+            for chrom in chromosomes
+        }
+
+    def _generate_unique_positions(self, n: int, chrom_size: int) -> NDArray[np.uint32]:
+        """Generate n unique positions within chromosome efficiently.
+
+        Uses fast randint with deduplication for large chromosomes (where
+        collisions are rare) and np.random.choice for small chromosomes
+        (where collisions would be frequent).
+        """
+        max_pos = chrom_size - 1
+
+        # For small chromosomes or high density, use choice (guaranteed unique)
+        # Threshold: if requesting more than 1% of positions, use choice
+        if n > max_pos * 0.01:
+            positions = self.rng.choice(max_pos, size=n, replace=False) + 1
+        else:
+            # For large chromosomes, use randint with deduplication (faster)
+            # Generate extra to account for potential duplicates
+            oversample = int(n * 1.1) + 100
+            positions = self.rng.integers(1, chrom_size, size=oversample)
+            positions = np.unique(positions)
+
+            # If we don't have enough unique positions, sample more
+            while len(positions) < n:
+                extra = self.rng.integers(1, chrom_size, size=n - len(positions) + 100)
+                positions = np.unique(np.concatenate([positions, extra]))
+
+            # Take exactly n positions
+            positions = self.rng.choice(positions, size=n, replace=False)
+
+        return np.sort(positions).astype(np.uint32)
+
+    def _generate_genotypes(self, n: int, missing_rate: float) -> NDArray[np.object_]:
+        """Generate n genotypes using vectorized operations."""
+        bases = np.array(["A", "C", "G", "T"])
+
+        missing_mask = self.rng.random(n) < missing_rate
+        homo_mask = self.rng.random(n) < 0.7
+
+        homo_bases = self.rng.integers(0, 4, size=n)
+        het_base1 = self.rng.integers(0, 4, size=n)
+        het_base2 = (het_base1 + self.rng.integers(1, 4, size=n)) % 4
+
+        genotypes = np.empty(n, dtype=object)
+        genotypes[missing_mask] = "--"
+
+        homo_idx = ~missing_mask & homo_mask
+        homo_alleles = bases[homo_bases[homo_idx]]
+        genotypes[homo_idx] = np.char.add(homo_alleles, homo_alleles)
+
+        het_idx = ~missing_mask & ~homo_mask
+        allele1, allele2 = bases[het_base1[het_idx]], bases[het_base2[het_idx]]
+        genotypes[het_idx] = np.array(
+            ["".join(sorted([a1, a2])) for a1, a2 in zip(allele1, allele2)]
+        )
+
+        return genotypes
+
+    def _inject_build_markers(
+        self, df: pd.DataFrame, chromosomes: list[str]
+    ) -> pd.DataFrame:
+        """Inject known marker SNPs at their correct positions for build detection."""
+        marker_rows = []
+        for rsid, marker_data in BUILD_MARKER_SNPS.items():
+            chrom = marker_data["chrom"]
+            if chrom in chromosomes and self.build in marker_data:
+                marker_rows.append(
+                    {
+                        "rsid": rsid,
+                        "chrom": chrom,
+                        "pos": np.uint32(marker_data[self.build]),
+                        "genotype": self._random_genotype(),
+                    }
+                )
+
+        if marker_rows:
+            marker_df = pd.DataFrame(marker_rows).set_index("rsid")
+            df = df[~df.index.isin(marker_df.index)]
+            df = self._sort_snps_dataframe(pd.concat([df, marker_df]))
+
+        return df
+
+    def _renumber_rsids_sequentially(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Renumber RSIDs sequentially while preserving build marker RSIDs.
+
+        After sorting by chromosome and position, RSIDs are renumbered to be
+        sequential (rs1, rs2, rs3, ...) for a cleaner output. Build marker
+        RSIDs are preserved to ensure build detection continues to work.
+        """
+        # Get the set of build marker rsids that must be preserved
+        marker_rsids = set(BUILD_MARKER_SNPS.keys())
+
+        # Create new index with sequential rsids, preserving markers
+        new_index = []
+        counter = 1
+        for rsid in df.index:
+            if rsid in marker_rsids:
+                new_index.append(rsid)
+            else:
+                new_index.append(f"rs{counter}")
+                counter += 1
+
+        df.index = pd.Index(new_index, name="rsid")
+        return df
+
+    def _sort_snps_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Sort SNPs dataframe by chromosome and position.
+
+        Uses the same chromosome ordering as SNPs.sort(): numeric chromosomes
+        in natural order (1, 2, ..., 22), then X, Y, and MT at the end.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            DataFrame with 'chrom' and 'pos' columns
+
+        Returns
+        -------
+        pd.DataFrame
+            Sorted DataFrame with chrom as object dtype
+        """
+        # Get unique chromosomes and determine sort order
+        unique_chroms = df["chrom"].unique()
+
+        # Use REFERENCE_SEQUENCE_CHROMS order, filtering to only include present chroms
+        chrom_order = [c for c in REFERENCE_SEQUENCE_CHROMS if c in unique_chroms]
+
+        # Add any chromosomes not in standard list (e.g., PAR)
+        for c in unique_chroms:
+            if c not in chrom_order:
+                chrom_order.append(c)
+
+        # Convert to categorical for proper sorting
+        df["chrom"] = df["chrom"].astype(
+            CategoricalDtype(categories=chrom_order, ordered=True)
+        )
+
+        # Sort by chromosome and position
+        df = df.sort_values(["chrom", "pos"])
+
+        # Convert back to object dtype
+        df["chrom"] = df["chrom"].astype(object)
+
+        return df
+
+    def _random_genotype(self) -> str:
+        """Generate a single random genotype."""
+        bases = ["A", "C", "G", "T"]
+        if self.rng.random() < 0.7:
+            base = self.rng.choice(bases)
+            return base + base
+        alleles = self.rng.choice(bases, size=2, replace=False)
+        return "".join(sorted(alleles))
+
+    # -------------------------------------------------------------------------
+    # File Format Writers
+    # -------------------------------------------------------------------------
+
+    def _save_as(
+        self,
+        output_path: str,
+        writer_method: Callable[[pd.DataFrame, str], None],
+        num_snps: int,
+        **kwargs: Any,
+    ) -> str:
+        """Generate SNPs and save using the specified writer method."""
+        df = self.generate_snps(num_snps=num_snps, **kwargs)
+        logger.info(f"Creating {os.path.relpath(output_path)}")
+        writer_method(df, output_path)
+        return output_path
+
+    def save_as_23andme(
+        self, output_path: str, num_snps: int = 991786, **kwargs: Any
+    ) -> str:
+        """Save SNPs in 23andMe format."""
+        return self._save_as(
+            output_path, self._write_snps_as_23andme, num_snps, **kwargs
+        )
+
+    def save_as_ancestry(
+        self, output_path: str, num_snps: int = 700000, **kwargs: Any
+    ) -> str:
+        """Save SNPs in AncestryDNA format."""
+        return self._save_as(
+            output_path, self._write_snps_as_ancestry, num_snps, **kwargs
+        )
+
+    def save_as_ftdna(
+        self, output_path: str, num_snps: int = 715194, **kwargs: Any
+    ) -> str:
+        """Save SNPs in Family Tree DNA (FTDNA) format."""
+        return self._save_as(output_path, self._write_snps_as_ftdna, num_snps, **kwargs)
+
+    def save_as_generic(
+        self,
+        output_path: str,
+        format: str = "csv",
+        num_snps: int = 10000,
+        **kwargs: Any,
+    ) -> str:
+        """Save SNPs in generic CSV or TSV format."""
+        df = self.generate_snps(num_snps=num_snps, **kwargs)
+        logger.info(f"Creating {os.path.relpath(output_path)}")
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+        sep = "," if format == "csv" else "\t"
+        df_out = df.reset_index()
+        if output_path.endswith(".gz"):
+            with gzip.open(output_path, "wt") as f:
+                df_out.to_csv(f, sep=sep, index=False)
+        else:
+            df_out.to_csv(output_path, sep=sep, index=False)
+        return output_path
+
+    def _write_snps_as_23andme(self, df: pd.DataFrame, output_path: str) -> None:
+        """Write a DataFrame of SNPs in 23andMe format."""
+        header = (
+            "# 23andMe\n#\n"
+            "# This is synthetic genotype data generated for demonstration purposes.\n#\n"
+            "# rsid\tchromosome\tposition\tgenotype\n"
+        )
+        buffer = StringIO()
+        buffer.write(header)
+        df.reset_index().to_csv(buffer, sep="\t", header=False, index=False)
+        self._write_file(output_path, buffer.getvalue())
+
+    def _write_snps_as_ftdna(self, df: pd.DataFrame, output_path: str) -> None:
+        """Write a DataFrame of SNPs in FTDNA format."""
+        df_out = df.reset_index()
+        df_out.columns = ["RSID", "CHROMOSOME", "POSITION", "RESULT"]
+        buffer = StringIO()
+        buffer.write("RSID,CHROMOSOME,POSITION,RESULT\n")
+        df_out.to_csv(buffer, index=False, header=False, quoting=1)
+        self._write_file(output_path, buffer.getvalue())
+
+    def _write_snps_as_ancestry(self, df: pd.DataFrame, output_path: str) -> None:
+        """Write a DataFrame of SNPs in AncestryDNA format."""
+        header = (
+            "#AncestryDNA\n#\n"
+            "# This is synthetic genotype data generated for demonstration purposes.\n#\n"
+            "rsid\tchromosome\tposition\tallele1\tallele2\n"
+        )
+        df_out = df.reset_index()
+        genotypes = df_out["genotype"].values
+
+        allele1 = np.where(
+            genotypes == "--",
+            "0",
+            np.array([g[0] if len(g) >= 1 else "0" for g in genotypes]),
+        )
+        allele2 = np.where(
+            genotypes == "--",
+            "0",
+            np.array([g[1] if len(g) >= 2 else "0" for g in genotypes]),
+        )
+
+        result = pd.DataFrame(
+            {
+                "rsid": df_out["rsid"],
+                "chrom": df_out["chrom"],
+                "pos": df_out["pos"],
+                "allele1": allele1,
+                "allele2": allele2,
+            }
+        )
+        buffer = StringIO()
+        buffer.write(header)
+        result.to_csv(buffer, sep="\t", header=False, index=False)
+        self._write_file(output_path, buffer.getvalue())
+
+    def _write_file(self, path: str, content: str) -> None:
+        """Write content to file atomically, handling gzip compression if needed."""
+        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+        if path.endswith(".gz"):
+            with atomic_write(path, mode="wb", overwrite=True) as f:
+                with gzip.open(f, "wt", compresslevel=1) as f_gzip:
+                    f_gzip.write(content)
+        else:
+            with atomic_write(path, mode="w", overwrite=True) as f:
+                f.write(content)
diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
index 59562ba3..ddb5464b 100644
--- a/src/snps/io/reader.py
+++ b/src/snps/io/reader.py
@@ -14,6 +14,8 @@
 import numpy as np
 import pandas as pd
 
+from snps.build_constants import CHROM_SIZES
+
 logger = logging.getLogger(__name__)
 
 NORMALIZED_DTYPES = {
@@ -265,10 +267,13 @@ def _detect_build_from_comments(self, comments, source):
                             elif "38" in part_value:
                                 return 38
                         elif part_key.lower() == "length":
-                            if "249250621" == part_value:
-                                return 37  # length of chromosome 1
-                            elif "248956422" == part_value:
-                                return 38  # length of chromosome 1
+                            # Check chromosome 1 length against known builds
+                            if part_value == str(CHROM_SIZES[37]["1"]):
+                                return 37
+                            elif part_value == str(CHROM_SIZES[38]["1"]):
+                                return 38
+                            elif part_value == str(CHROM_SIZES[36]["1"]):
+                                return 36
             # couldn't find anything
             return 0
         else:
@@ -292,10 +297,13 @@ def _detect_build_from_comments(self, comments, source):
                 return 38
             elif "grch37" in comments.lower():
                 return 37
-            elif "249250621" in comments.lower():
-                return 37  # length of chromosome 1
-            elif "248956422" in comments.lower():
-                return 38  # length of chromosome 1
+            # Check for chromosome 1 length in comments
+            elif str(CHROM_SIZES[37]["1"]) in comments.lower():
+                return 37
+            elif str(CHROM_SIZES[38]["1"]) in comments.lower():
+                return 38
+            elif str(CHROM_SIZES[36]["1"]) in comments.lower():
+                return 36
             else:
                 return 0
 
diff --git a/src/snps/resources.py b/src/snps/resources.py
index c483a1e9..e7e17fc3 100644
--- a/src/snps/resources.py
+++ b/src/snps/resources.py
@@ -24,7 +24,6 @@
 import tempfile
 import urllib.error
 import urllib.request
-import zipfile
 
 import numpy as np
 import pandas as pd
@@ -57,7 +56,6 @@ def _init_resource_attributes(self):
         self._gsa_rsid_map = None
         self._gsa_chrpos_map = None
         self._dbsnp_151_37_reverse = None
-        self._opensnp_datadump_filenames = []
         self._chip_clusters = None
         self._low_quality_snps = None
 
@@ -125,48 +123,52 @@ def get_assembly_mapping_data(self, source_assembly, target_assembly):
             self._get_path_assembly_mapping_data(source_assembly, target_assembly)
         )
 
-    def download_example_datasets(self):
-        """Download example datasets from `openSNP <https://opensnp.org>`_.
+    def create_example_datasets(self, output_dir=None):
+        """Create synthetic example datasets for demonstrations.
 
-        Per openSNP, "the data is donated into the public domain using `CC0 1.0
-        <http://creativecommons.org/publicdomain/zero/1.0/>`_."
+        Generates two correlated genotype files in different formats and builds,
+        suitable for demonstrating merging and remapping functionality. The files
+        share ~700K common SNPs with intentional discrepancies to demonstrate
+        merge conflict detection.
+
+        Parameters
+        ----------
+        output_dir : str, optional
+            Directory for output files (default: resources directory)
 
         Returns
         -------
-        paths : list of str or empty str
-            paths to example datasets
+        paths : list of str
+            Paths to created example datasets
 
-        References
-        ----------
-        1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource
-           for Personal Genomics," PLOS ONE, 9(3): e89204,
-           https://doi.org/10.1371/journal.pone.0089204
+        Examples
+        --------
+        >>> from snps.resources import Resources
+        >>> r = Resources()
+        >>> paths = r.create_example_datasets()
+        Creating resources/sample1.23andme.txt.gz
+        Creating resources/sample2.ftdna.csv.gz
         """
-        paths = []
-        paths.append(
-            self._download_file(
-                "https://opensnp.org/data/662.23andme.340",
-                "662.23andme.340.txt.gz",
-                compress=True,
-            )
-        )
-        paths.append(
-            self._download_file(
-                "https://opensnp.org/data/662.ftdna-illumina.341",
-                "662.ftdna-illumina.341.csv.gz",
-                compress=True,
-            )
-        )
+        from snps.io.generator import SyntheticSNPGenerator
+
+        if output_dir is None:
+            output_dir = self._resources_dir
 
-        return paths
+        # Ensure output directory exists
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Create correlated dataset pair with realistic merge characteristics
+        gen = SyntheticSNPGenerator(build=37, seed=47)
+        path1, path2 = gen.create_example_dataset_pair(output_dir)
+
+        return [path1, path2]
 
     def get_all_resources(self):
         """Get / download all resources used throughout `snps`.
 
         Notes
         -----
-        This function does not download reference sequences and the openSNP datadump,
-        due to their large sizes.
+        This function does not download reference sequences due to their large sizes.
 
         Returns
         -------
@@ -358,84 +360,6 @@ def get_dbsnp_151_37_reverse(self):
 
         return self._dbsnp_151_37_reverse
 
-    def get_opensnp_datadump_filenames(self):
-        """Get filenames internal to the `openSNP <https://opensnp.org>`_ datadump zip.
-
-        Per openSNP, "the data is donated into the public domain using `CC0 1.0
-        <http://creativecommons.org/publicdomain/zero/1.0/>`_."
-
-        Notes
-        -----
-        This function can download over 27GB of data. If the download is not successful,
-        try using a different tool like `wget` or `curl` to download the file and move it
-        to the resources directory (see `_get_path_opensnp_datadump`).
-
-        Returns
-        -------
-        filenames : list of str
-            filenames internal to the openSNP datadump
-
-        References
-        ----------
-        1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource
-           for Personal Genomics," PLOS ONE, 9(3): e89204,
-           https://doi.org/10.1371/journal.pone.0089204
-        """
-        if not self._opensnp_datadump_filenames:
-            self._opensnp_datadump_filenames = self._get_opensnp_datadump_filenames(
-                self._get_path_opensnp_datadump()
-            )
-        return self._opensnp_datadump_filenames
-
-    def load_opensnp_datadump_file(self, filename):
-        """Load the specified file from the openSNP datadump.
-
-        Per openSNP, "the data is donated into the public domain using `CC0 1.0
-        <http://creativecommons.org/publicdomain/zero/1.0/>`_."
-
-        Parameters
-        ----------
-        filename : str
-            filename internal to the openSNP datadump
-
-        Returns
-        -------
-        bytes
-            content of specified file internal to the openSNP datadump
-
-        References
-        ----------
-        1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource
-           for Personal Genomics," PLOS ONE, 9(3): e89204,
-           https://doi.org/10.1371/journal.pone.0089204
-        """
-        if self._get_path_opensnp_datadump():
-            with zipfile.ZipFile(self._get_path_opensnp_datadump()) as z:
-                with z.open(filename, "r") as f:
-                    return f.read()
-        else:
-            return bytes()
-
-    @staticmethod
-    def _get_opensnp_datadump_filenames(filename):
-        """Get list of filenames internal to the openSNP datadump zip.
-
-        Parameters
-        ----------
-        filename : str
-            path to openSNP datadump
-
-        Returns
-        -------
-        filenames : list of str
-            filenames internal to the openSNP datadump
-        """
-        if filename:
-            with zipfile.ZipFile(filename) as z:
-                return z.namelist()
-        else:
-            return []
-
     @staticmethod
     def _write_data_to_gzip(f, data):
         """Write `data` to `f` in `gzip` format.
@@ -728,12 +652,6 @@ def get_gsa_chrpos(self):
             self._gsa_chrpos_map = chrpos
         return self._gsa_chrpos_map
 
-    def _get_path_opensnp_datadump(self):
-        return self._download_file(
-            "https://opensnp.org/data/zip/opensnp_datadump.current.zip",
-            "opensnp_datadump.current.zip",
-        )
-
     def _download_file(self, url, filename, compress=False, timeout=30):
         """Download a file to the resources folder.
 
diff --git a/src/snps/snps.py b/src/snps/snps.py
index 894658de..b10fab88 100644
--- a/src/snps/snps.py
+++ b/src/snps/snps.py
@@ -11,6 +11,7 @@
 import pandas as pd
 from pandas.api.types import CategoricalDtype
 
+from snps.build_constants import BUILD_MARKER_SNPS
 from snps.ensembl import EnsemblRestClient
 from snps.io import Reader, Writer, get_empty_snps_dataframe
 from snps.resources import Resources
@@ -943,67 +944,14 @@ def detect_build(self):
            rs11928389, rs2500347, rs964481, rs2341354, rs3850290, and rs1329546
            (dbSNP Build ID: 151). Available from: http://www.ncbi.nlm.nih.gov/SNP/
         """
-
-        def lookup_build_with_snp_pos(pos, s):
-            try:
-                return int(s.loc[s == pos].index[0])
-            except Exception:
-                return 0
-
-        build = 0
-
-        rsids = [
-            "rs3094315",
-            "rs11928389",
-            "rs2500347",
-            "rs964481",
-            "rs2341354",
-            "rs3850290",
-            "rs1329546",
-        ]
-        df = pd.DataFrame(
-            {
-                36: [
-                    742429,
-                    50908372,
-                    143649677,
-                    27566744,
-                    908436,
-                    22315141,
-                    135302086,
-                ],
-                37: [
-                    752566,
-                    50927009,
-                    144938320,
-                    27656823,
-                    918573,
-                    23245301,
-                    135474420,
-                ],
-                38: [
-                    817186,
-                    50889578,
-                    148946169,
-                    27638706,
-                    983193,
-                    22776092,
-                    136392261,
-                ],
-            },
-            index=rsids,
-        )
-
-        for rsid in rsids:
+        # Check each marker SNP until we find a matching build
+        for rsid, marker_data in BUILD_MARKER_SNPS.items():
             if rsid in self._snps.index:
-                build = lookup_build_with_snp_pos(
-                    self._snps.loc[rsid].pos, df.loc[rsid]
-                )
-
-            if build:
-                break
-
-        return build
+                pos = self._snps.loc[rsid].pos
+                for build in (37, 38, 36):  # Check most common builds first
+                    if marker_data[build] == pos:
+                        return build
+        return 0
 
     def get_count(self, chrom=""):
         """Count of SNPs.
diff --git a/tests/io/test_generator.py b/tests/io/test_generator.py
new file mode 100644
index 00000000..6856dd3c
--- /dev/null
+++ b/tests/io/test_generator.py
@@ -0,0 +1,480 @@
+"""Test synthetic SNP data generator."""
+
+from __future__ import annotations
+
+import gzip
+import os
+import tempfile
+from unittest import TestCase
+
+import numpy as np
+import pandas as pd
+
+from snps import SNPs
+from snps.build_constants import BUILD_MARKER_SNPS
+from snps.io.generator import SyntheticSNPGenerator
+
+
+class TestSyntheticSNPGenerator(TestCase):
+    def test_init_build_37(self) -> None:
+        """Test generator initialization with Build 37."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+        self.assertEqual(gen.build, 37)
+        self.assertEqual(gen.seed, 123)
+        self.assertIsNotNone(gen.chrom_sizes)
+
+    def test_init_build_36(self) -> None:
+        """Test generator initialization with Build 36."""
+        gen = SyntheticSNPGenerator(build=36, seed=456)
+        self.assertEqual(gen.build, 36)
+        self.assertEqual(gen.seed, 456)
+
+    def test_init_build_38(self) -> None:
+        """Test generator initialization with Build 38."""
+        gen = SyntheticSNPGenerator(build=38, seed=789)
+        self.assertEqual(gen.build, 38)
+        self.assertEqual(gen.seed, 789)
+
+    def test_init_invalid_build(self) -> None:
+        """Test generator initialization with invalid build."""
+        with self.assertRaises(ValueError):
+            SyntheticSNPGenerator(build=99)
+
+    def test_generate_snps_empty_chromosomes(self) -> None:
+        """Test SNP generation with empty chromosomes list."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+        df = gen.generate_snps(num_snps=1000, chromosomes=[])
+
+        # Verify empty DataFrame with correct structure
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertEqual(len(df), 0)
+        self.assertEqual(df.index.name, "rsid")
+        self.assertListEqual(list(df.columns), ["chrom", "pos", "genotype"])
+
+        # Verify dtypes
+        self.assertEqual(df["chrom"].dtype, object)
+        self.assertEqual(df["pos"].dtype, np.uint32)
+        self.assertEqual(df["genotype"].dtype, object)
+
+    def test_generate_snps_basic(self) -> None:
+        """Test basic SNP generation."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+        df = gen.generate_snps(num_snps=1000)
+
+        # Verify DataFrame structure
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertEqual(df.index.name, "rsid")
+        self.assertListEqual(list(df.columns), ["chrom", "pos", "genotype"])
+
+        # Verify dtypes
+        self.assertEqual(df["chrom"].dtype, object)
+        self.assertEqual(df["pos"].dtype, np.uint32)
+        self.assertEqual(df["genotype"].dtype, object)
+
+        # Verify approximate SNP count (may vary due to distribution)
+        self.assertGreater(len(df), 900)
+        self.assertLess(len(df), 1100)
+
+    def test_generate_snps_chromosomes(self) -> None:
+        """Test SNP generation with specific chromosomes."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+        df = gen.generate_snps(num_snps=500, chromosomes=["1", "2", "X"])
+
+        # Verify only specified chromosomes are present
+        unique_chroms = set(df["chrom"].unique())
+        self.assertTrue(unique_chroms.issubset({"1", "2", "X"}))
+
+    def test_generate_snps_missing_rate(self) -> None:
+        """Test SNP generation with missing genotypes."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+        df = gen.generate_snps(num_snps=10000, missing_rate=0.1)
+
+        # Count missing genotypes
+        missing_count = (df["genotype"] == "--").sum()
+        total_count = len(df)
+
+        # Verify approximately 10% are missing (with some tolerance)
+        missing_rate = missing_count / total_count
+        self.assertGreater(missing_rate, 0.05)  # At least 5%
+        self.assertLess(missing_rate, 0.15)  # At most 15%
+
+    def test_generate_snps_reproducibility(self) -> None:
+        """Test that same seed produces same results."""
+        gen1 = SyntheticSNPGenerator(build=37, seed=42)
+        gen2 = SyntheticSNPGenerator(build=37, seed=42)
+
+        df1 = gen1.generate_snps(num_snps=1000)
+        df2 = gen2.generate_snps(num_snps=1000)
+
+        # Verify identical output
+        pd.testing.assert_frame_equal(df1, df2)
+
+    def test_generate_snps_rsid_format(self) -> None:
+        """Test that rsIDs are correctly formatted."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+        df = gen.generate_snps(num_snps=100)
+
+        # Verify all rsIDs start with "rs" followed by digits
+        for rsid in df.index:
+            self.assertTrue(rsid.startswith("rs"))
+            self.assertTrue(rsid[2:].isdigit())
+
+    def test_generate_snps_positions_sorted(self) -> None:
+        """Test that positions within each chromosome are sorted."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+        df = gen.generate_snps(num_snps=5000)
+
+        # Check each chromosome separately
+        for chrom in df["chrom"].unique():
+            chrom_df = df[df["chrom"] == chrom]
+            positions = chrom_df["pos"].values
+            # Verify positions are sorted
+            self.assertTrue(np.all(positions[:-1] <= positions[1:]))
+
+    def test_generate_snps_genotype_format(self) -> None:
+        """Test that genotypes are correctly formatted."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+        df = gen.generate_snps(num_snps=1000, missing_rate=0.05)
+
+        # Valid bases
+        valid_bases = {"A", "C", "G", "T", "-"}
+
+        # Verify all genotypes are valid
+        for genotype in df["genotype"].unique():
+            if pd.notna(genotype):
+                # Should be either 2 characters or "--"
+                self.assertIn(len(genotype), [2])
+                for base in genotype:
+                    self.assertIn(base, valid_bases)
+
+    def test_save_as_23andme(self) -> None:
+        """Test saving SNPs in 23andMe format."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = os.path.join(tmpdir, "test_23andme.txt.gz")
+            result_path = gen.save_as_23andme(output_path, num_snps=10000)
+
+            # Verify file was created
+            self.assertEqual(result_path, output_path)
+            self.assertTrue(os.path.exists(output_path))
+
+            # Verify file can be read
+            with gzip.open(output_path, "rt") as f:
+                lines = f.readlines()
+
+            # Verify header
+            self.assertTrue(lines[0].startswith("# 23andMe"))
+
+            # Verify format (tab-separated)
+            data_lines = [line for line in lines if not line.startswith("#")]
+            self.assertGreater(len(data_lines), 9000)  # Should have ~10000 SNPs
+            first_data = data_lines[0].strip().split("\t")
+            self.assertEqual(len(first_data), 4)  # rsid, chrom, pos, genotype
+
+            # Verify SNPs can load the file
+            s = SNPs(output_path)
+            self.assertEqual(s.source, "23andMe")
+            # Build detection should work with injected marker SNPs
+            self.assertEqual(s.build, 37)
+            self.assertTrue(s.build_detected)
+
+    def test_save_as_ancestry(self) -> None:
+        """Test saving SNPs in AncestryDNA format."""
+        gen = SyntheticSNPGenerator(build=37, seed=456)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = os.path.join(tmpdir, "test_ancestry.txt.gz")
+            result_path = gen.save_as_ancestry(output_path, num_snps=5000)
+
+            # Verify file was created
+            self.assertEqual(result_path, output_path)
+            self.assertTrue(os.path.exists(output_path))
+
+            # Verify file can be read
+            with gzip.open(output_path, "rt") as f:
+                lines = f.readlines()
+
+            # Verify header
+            self.assertTrue(lines[0].startswith("#AncestryDNA"))
+
+            # Verify format (allele1 and allele2 columns)
+            header_line = [line for line in lines if line.startswith("rsid")][0]
+            self.assertIn("allele1", header_line)
+            self.assertIn("allele2", header_line)
+
+            # Verify SNPs can load the file
+            s = SNPs(output_path)
+            self.assertEqual(s.source, "AncestryDNA")
+
+    def test_save_as_ftdna(self) -> None:
+        """Test saving SNPs in FTDNA format."""
+        gen = SyntheticSNPGenerator(build=36, seed=789)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = os.path.join(tmpdir, "test_ftdna.csv.gz")
+            result_path = gen.save_as_ftdna(output_path, num_snps=7000)
+
+            # Verify file was created
+            self.assertEqual(result_path, output_path)
+            self.assertTrue(os.path.exists(output_path))
+
+            # Verify file can be read
+            with gzip.open(output_path, "rt") as f:
+                lines = f.readlines()
+
+            # Verify CSV format with quotes
+            header = lines[0].strip()
+            self.assertEqual(header, "RSID,CHROMOSOME,POSITION,RESULT")
+
+            # Verify data line format
+            self.assertTrue(lines[1].startswith('"rs'))
+
+            # Verify SNPs can load the file
+            s = SNPs(output_path)
+            self.assertEqual(s.source, "FTDNA")
+            # Build detection should work with injected marker SNPs
+            self.assertEqual(s.build, 36)
+            self.assertTrue(s.build_detected)
+
+    def test_save_as_generic_csv(self) -> None:
+        """Test saving SNPs in generic CSV format."""
+        gen = SyntheticSNPGenerator(build=37, seed=202)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = os.path.join(tmpdir, "test.csv.gz")
+            result_path = gen.save_as_generic(output_path, format="csv", num_snps=2000)
+
+            # Verify file was created
+            self.assertEqual(result_path, output_path)
+            self.assertTrue(os.path.exists(output_path))
+
+            # Verify file can be read
+            with gzip.open(output_path, "rt") as f:
+                lines = f.readlines()
+
+            # Verify CSV header
+            header = lines[0].strip()
+            self.assertEqual(header, "rsid,chrom,pos,genotype")
+
+            # Verify data format (comma-separated with 4 fields)
+            data_lines = [line.strip() for line in lines[1:] if line.strip()]
+            self.assertGreater(len(data_lines), 0, "No data lines found")
+            self.assertEqual(len(data_lines[0].split(",")), 4)
+
+            # Verify SNPs can load the file
+            s = SNPs(output_path)
+            self.assertGreater(s.count, 1900)
+
+    def test_save_as_generic_tsv(self) -> None:
+        """Test saving SNPs in generic TSV format."""
+        gen = SyntheticSNPGenerator(build=37, seed=303)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = os.path.join(tmpdir, "test.tsv")
+            result_path = gen.save_as_generic(output_path, format="tsv", num_snps=1500)
+
+            # Verify file was created (not gzipped)
+            self.assertEqual(result_path, output_path)
+            self.assertTrue(os.path.exists(output_path))
+
+            # Verify file can be read
+            with open(output_path, "r") as f:
+                lines = f.readlines()
+
+            # Verify TSV header
+            header = lines[0].strip()
+            self.assertEqual(header, "rsid\tchrom\tpos\tgenotype")
+
+            # Verify data format (tab-separated with 4 fields)
+            data_lines = [line.strip() for line in lines[1:] if line.strip()]
+            self.assertGreater(len(data_lines), 0, "No data lines found")
+            self.assertEqual(len(data_lines[0].split("\t")), 4)
+
+    def test_save_creates_directory(self) -> None:
+        """Test that save methods create output directory if needed."""
+        gen = SyntheticSNPGenerator(build=37, seed=404)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create a nested path that doesn't exist
+            output_path = os.path.join(tmpdir, "subdir1", "subdir2", "test.txt.gz")
+            gen.save_as_23andme(output_path, num_snps=1000)
+
+            # Verify file was created
+            self.assertTrue(os.path.exists(output_path))
+
+    def test_large_dataset(self) -> None:
+        """Test generating a large dataset."""
+        gen = SyntheticSNPGenerator(build=37, seed=505)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = os.path.join(tmpdir, "large.txt.gz")
+            gen.save_as_23andme(output_path, num_snps=1000000)
+
+            # Verify file was created
+            self.assertTrue(os.path.exists(output_path))
+
+            # Verify SNPs can load it
+            s = SNPs(output_path)
+            self.assertGreater(s.count, 950000)  # Should be close to 1M
+            self.assertLess(s.count, 1050000)
+
+    def test_different_builds_have_different_chromosome_sizes(self) -> None:
+        """Test that different builds use different chromosome size ranges."""
+        gen36 = SyntheticSNPGenerator(build=36)
+        gen37 = SyntheticSNPGenerator(build=37)
+        gen38 = SyntheticSNPGenerator(build=38)
+
+        # Verify different chromosome sizes are used
+        self.assertNotEqual(gen36.chrom_sizes["1"], gen37.chrom_sizes["1"])
+        self.assertNotEqual(gen37.chrom_sizes["1"], gen38.chrom_sizes["1"])
+        self.assertNotEqual(gen36.chrom_sizes["1"], gen38.chrom_sizes["1"])
+
+    def test_build_marker_injection(self) -> None:
+        """Test that build marker SNPs are injected correctly."""
+        # Test Build 37
+        gen37 = SyntheticSNPGenerator(build=37, seed=123)
+        df37 = gen37.generate_snps(num_snps=10000, inject_build_markers=True)
+
+        # Check that at least one marker SNP was injected
+        found_markers = [rsid for rsid in BUILD_MARKER_SNPS if rsid in df37.index]
+        self.assertGreater(len(found_markers), 0)
+
+        # Verify marker positions are correct for Build 37
+        if "rs3094315" in df37.index:
+            self.assertEqual(df37.loc["rs3094315"]["pos"], 752566)
+            self.assertEqual(df37.loc["rs3094315"]["chrom"], "1")
+
+        # Test Build 36
+        gen36 = SyntheticSNPGenerator(build=36, seed=456)
+        df36 = gen36.generate_snps(num_snps=10000, inject_build_markers=True)
+
+        # Verify marker position is different for Build 36
+        if "rs3094315" in df36.index:
+            self.assertEqual(df36.loc["rs3094315"]["pos"], 742429)
+            self.assertEqual(df36.loc["rs3094315"]["chrom"], "1")
+
+    def test_build_detection_with_markers(self) -> None:
+        """Test that SNPs with injected markers can be correctly build-detected."""
+        gen37 = SyntheticSNPGenerator(build=37, seed=789)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = os.path.join(tmpdir, "test_build37.txt.gz")
+            gen37.save_as_23andme(output_path, num_snps=50000)
+
+            # Load and verify build detection works
+            s = SNPs(output_path)
+            self.assertEqual(s.build, 37)
+            self.assertTrue(s.build_detected)
+
+    def test_generate_without_markers(self) -> None:
+        """Test generating SNPs without build markers."""
+        gen = SyntheticSNPGenerator(build=37, seed=999)
+        df = gen.generate_snps(num_snps=1000, inject_build_markers=False)
+
+        # Verify no marker SNPs are present
+        found_markers = [rsid for rsid in BUILD_MARKER_SNPS if rsid in df.index]
+        self.assertEqual(len(found_markers), 0)
+
+    def test_inject_build_markers_no_matching_chromosomes(self) -> None:
+        """Test _inject_build_markers when chromosomes have no marker SNPs.
+
+        Marker SNPs are on chromosomes 1, 2, and 20. Using chromosomes
+        without markers should skip marker injection (empty marker_rows branch).
+        """
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+
+        # Generate SNPs only on chromosomes that don't have marker SNPs
+        # Marker SNPs are on chroms: 1, 2, 20
+        df = gen.generate_snps(
+            num_snps=500, chromosomes=["MT", "X", "Y"], inject_build_markers=True
+        )
+
+        # Verify DataFrame was created with correct structure
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertGreater(len(df), 0)
+        self.assertEqual(df.index.name, "rsid")
+
+        # Verify only the specified chromosomes are present
+        unique_chroms = set(df["chrom"].unique())
+        self.assertTrue(unique_chroms.issubset({"MT", "X", "Y"}))
+
+        # Verify no marker SNPs were injected (since they're on other chromosomes)
+        found_markers = [rsid for rsid in BUILD_MARKER_SNPS if rsid in df.index]
+        self.assertEqual(len(found_markers), 0)
+
+    def test_create_example_dataset_pair(self) -> None:
+        """Test creating a pair of example datasets for merge demonstration."""
+        gen = SyntheticSNPGenerator(build=37, seed=47)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path1, path2 = gen.create_example_dataset_pair(tmpdir)
+
+            # Verify files were created
+            self.assertTrue(os.path.exists(path1))
+            self.assertTrue(os.path.exists(path2))
+            self.assertTrue(path1.endswith("sample1.23andme.txt.gz"))
+            self.assertTrue(path2.endswith("sample2.ftdna.csv.gz"))
+
+            # Load both files
+            s1 = SNPs(path1)
+            s2 = SNPs(path2)
+
+            # Verify sources
+            self.assertEqual(s1.source, "23andMe")
+            self.assertEqual(s2.source, "FTDNA")
+
+            # Verify builds
+            self.assertEqual(s1.build, 37)
+            self.assertEqual(s2.build, 37)
+
+            # Verify file 1 includes all chromosomes (1-22, X, Y, MT)
+            s1_chroms = set(s1.snps["chrom"].unique())
+            self.assertIn("X", s1_chroms)
+            self.assertIn("Y", s1_chroms)
+            self.assertIn("MT", s1_chroms)
+
+            # Verify file 2 only has autosomal chromosomes (1-22)
+            s2_chroms = set(s2.snps["chrom"].unique())
+            self.assertNotIn("X", s2_chroms)
+            self.assertNotIn("Y", s2_chroms)
+            self.assertNotIn("MT", s2_chroms)
+
+            # Verify approximate SNP counts
+            # File 1: ~700K base + ~292K unique = ~992K
+            self.assertGreater(s1.count, 950000)
+            self.assertLess(s1.count, 1050000)
+
+            # File 2: autosomal base SNPs + ~15K unique (no X, Y, MT)
+            self.assertGreater(s2.count, 600000)
+            self.assertLess(s2.count, 750000)
+
+            # Verify shared RSIDs exist (base SNPs rs1-rs700000)
+            common_rsids = set(s1.snps.index) & set(s2.snps.index)
+            self.assertGreater(len(common_rsids), 600000)
+
+    def test_unique_positions_small_chromosome(self) -> None:
+        """Test that positions are unique even on small chromosomes like MT."""
+        gen = SyntheticSNPGenerator(build=37, seed=42)
+
+        # MT chromosome is only ~16,569 bp, generate many SNPs
+        df = gen.generate_snps(num_snps=10000, chromosomes=["MT"])
+
+        # Check for duplicate positions within chromosome
+        duplicates = df.duplicated(subset=["chrom", "pos"], keep=False)
+        self.assertEqual(duplicates.sum(), 0, "Found duplicate positions on MT")
+
+        # Verify all positions are within valid range
+        self.assertTrue((df["pos"] >= 1).all())
+        self.assertTrue((df["pos"] <= gen.chrom_sizes["MT"]).all())
+
+    def test_unique_positions_large_dataset(self) -> None:
+        """Test that positions are unique in large datasets."""
+        gen = SyntheticSNPGenerator(build=37, seed=123)
+
+        # Generate 100K SNPs across all chromosomes
+        df = gen.generate_snps(num_snps=100000)
+
+        # Check for duplicate positions within each chromosome
+        duplicates = df.duplicated(subset=["chrom", "pos"], keep=False)
+        self.assertEqual(duplicates.sum(), 0, "Found duplicate positions")
diff --git a/tests/test_resources.py b/tests/test_resources.py
index 9ebecaf9..9e169082 100644
--- a/tests/test_resources.py
+++ b/tests/test_resources.py
@@ -3,13 +3,9 @@
 import socket
 import tempfile
 import urllib.error
-import warnings
-import zipfile
 from unittest.mock import Mock, mock_open, patch
 
 import numpy as np
-import pandas as pd
-from atomicwrites import atomic_write
 
 from snps import SNPs
 from snps.resources import ReferenceSequence, Resources
@@ -122,20 +118,6 @@ def f():
         for k, v in resources.items():
             self.assertGreater(len(v), 0)
 
-    def test_download_example_datasets(self):
-        def f():
-            with patch("urllib.request.urlopen", mock_open(read_data=b"")):
-                return self.resource.download_example_datasets()
-
-        paths = (
-            self.resource.download_example_datasets() if self.downloads_enabled else f()
-        )
-
-        for path in paths:
-            if not path or not os.path.exists(path):
-                warnings.warn("Example dataset(s) not currently available")
-                return
-
     def test_get_paths_reference_sequences_invalid_assembly(self):
         assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
             assembly="36"
@@ -415,53 +397,50 @@ def test_reference_sequence_generic_load_sequence(self):
             self.assertEqual(seq.end, 117)
             self.assertEqual(seq.length, 117)
 
-    def test_get_opensnp_datadump_filenames(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # temporarily set resources dir to tests
-            self.resource._resources_dir = tmpdir
-
-            # write test openSNP datadump zip
-            with atomic_write(
-                os.path.join(tmpdir, "opensnp_datadump.current.zip"),
-                mode="wb",
-                overwrite=True,
-            ) as f:
-                with zipfile.ZipFile(f, "w") as f_zip:
-                    f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
-                    f_zip.write("tests/input/generic.csv", arcname="generic2.csv")
-
-            filenames = self.resource.get_opensnp_datadump_filenames()
-
-            self.assertListEqual(filenames, ["generic1.csv", "generic2.csv"])
-
-            self.resource._resources_dir = "resources"
-
-    def test_load_opensnp_datadump_file(self):
+    def test_create_example_datasets(self):
+        """Test creating synthetic example datasets."""
         with tempfile.TemporaryDirectory() as tmpdir:
-            # temporarily set resources dir to tests
-            self.resource._resources_dir = tmpdir
-
-            # write test openSNP datadump zip
-            with atomic_write(
-                os.path.join(tmpdir, "opensnp_datadump.current.zip"),
-                mode="wb",
-                overwrite=True,
-            ) as f:
-                with zipfile.ZipFile(f, "w") as f_zip:
-                    f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
-                    f_zip.write("tests/input/generic.csv", arcname="generic2.csv")
-
-            snps1 = SNPs(self.resource.load_opensnp_datadump_file("generic1.csv"))
-            snps2 = SNPs(self.resource.load_opensnp_datadump_file("generic2.csv"))
-
-            pd.testing.assert_frame_equal(
-                snps1.snps, self.generic_snps(), check_exact=True
-            )
-            pd.testing.assert_frame_equal(
-                snps2.snps, self.generic_snps(), check_exact=True
-            )
-
-            self.resource._resources_dir = "resources"
+            paths = self.resource.create_example_datasets(tmpdir)
+
+            # Verify two files were created
+            self.assertEqual(len(paths), 2)
+            self.assertTrue(os.path.exists(paths[0]))
+            self.assertTrue(os.path.exists(paths[1]))
+
+            # Verify filenames
+            self.assertTrue(paths[0].endswith("sample1.23andme.txt.gz"))
+            self.assertTrue(paths[1].endswith("sample2.ftdna.csv.gz"))
+
+            # Verify files can be loaded
+            s1 = SNPs(paths[0])
+            s2 = SNPs(paths[1])
+
+            # Verify file sources are detected correctly
+            self.assertEqual(s1.source, "23andMe")
+            self.assertEqual(s2.source, "FTDNA")
+
+            # Build detection should work with injected marker SNPs
+            self.assertEqual(s1.build, 37)
+            self.assertTrue(s1.build_detected)
+            self.assertEqual(s2.build, 37)
+            self.assertTrue(s2.build_detected)
+
+            # Verify SNP counts are approximately correct
+            self.assertGreater(s1.count, 900000)
+            # FTDNA file only contains autosomal chromosomes (1-22), so count is lower
+            self.assertGreater(s2.count, 650000)
+
+            # Verify 23andMe file includes all chromosomes
+            s1_chroms = set(s1.snps["chrom"].unique())
+            self.assertIn("X", s1_chroms)
+            self.assertIn("Y", s1_chroms)
+            self.assertIn("MT", s1_chroms)
+
+            # Verify FTDNA file only includes autosomal chromosomes (1-22)
+            s2_chroms = set(s2.snps["chrom"].unique())
+            self.assertNotIn("X", s2_chroms)
+            self.assertNotIn("Y", s2_chroms)
+            self.assertNotIn("MT", s2_chroms)
 
     def _generate_test_chip_clusters(self):
         s = "1:1\tc1\n" * 2135214

From 779bb1051af3efb39e42503eddcbd70ec341e26f Mon Sep 17 00:00:00 2001
From: Andrew Riha <apriha@gmail.com>
Date: Thu, 22 Jan 2026 22:22:41 -0800
Subject: [PATCH 2/2] Support modern Python versions

---
 .github/workflows/ci.yml   |  12 ++--
 README.rst                 |   2 +-
 setup.py                   |   5 +-
 tests/__init__.py          |  93 ++++++++++++++++++++++--
 tests/io/test_generator.py |   6 +-
 tests/io/test_writer.py    |   3 +-
 tests/test_snps.py         | 140 +++++++++++++++++++++++--------------
 7 files changed, 189 insertions(+), 72 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7120e898..5ce8af4c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -32,7 +32,7 @@ jobs:
     - uses: actions/checkout@v4
     - uses: actions/setup-python@v5
       with:
-        python-version: '3.12'
+        python-version: '3.14'
     - name: Install Ruff
       run: |
         pip install ruff
@@ -50,7 +50,7 @@ jobs:
     - uses: actions/checkout@v4
     - uses: actions/setup-python@v5
       with:
-        python-version: '3.12'
+        python-version: '3.14'
     - name: Install Sphinx
       run: |
         pip install -r docs/requirements.txt
@@ -65,12 +65,12 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14']
         include:
           - os: macos-latest
-            python-version: '3.12'
+            python-version: '3.14'
           - os: windows-latest
-            python-version: '3.12'
+            python-version: '3.14'
 
     env:
       JOB_ID: ${{ strategy.job-index }}
@@ -127,7 +127,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: ['3.8', '3.9', '3.10', '3.11']
+        python-version: ['3.9', '3.10', '3.11']
 
     steps:
     - uses: actions/checkout@v4
diff --git a/README.rst b/README.rst
index a7831a20..dfff8ae1 100644
--- a/README.rst
+++ b/README.rst
@@ -69,7 +69,7 @@ Additionally, ``snps`` can read a variety of "generic" CSV and TSV files.
 
 Dependencies
 ------------
-``snps`` requires `Python <https://www.python.org>`_ 3.8+ and the following Python
+``snps`` requires `Python <https://www.python.org>`_ 3.9+ and the following Python
 packages:
 
 - `numpy <http://www.numpy.org>`_
diff --git a/setup.py b/setup.py
index e9746617..de6ae323 100644
--- a/setup.py
+++ b/setup.py
@@ -63,11 +63,12 @@
         "License :: OSI Approved :: BSD License",
         "Natural Language :: English",
         "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
         "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
+        "Programming Language :: Python :: 3.14",
         "Topic :: Scientific/Engineering",
         "Topic :: Scientific/Engineering :: Bio-Informatics",
         "Topic :: Scientific/Engineering :: Information Analysis",
@@ -81,6 +82,6 @@
     keywords="snps dna chromosomes bioinformatics vcf",
     install_requires=["numpy", "pandas", "atomicwrites"],
     extras_require={"ezancestry": ["ezancestry"]},
-    python_requires=">=3.8",
+    python_requires=">=3.9",
     platforms=["any"],
 )
diff --git a/tests/__init__.py b/tests/__init__.py
index 375268ce..d7d1a0aa 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 import pandas as pd
-from pandas.api.types import is_object_dtype, is_unsigned_integer_dtype
+from pandas.api.types import is_object_dtype, is_string_dtype, is_unsigned_integer_dtype
 
 from snps import SNPs
 from snps.utils import gzip_file, zip_file
@@ -646,12 +646,89 @@ def run_parsing_tests_vcf(
                 snps_df,
             )
 
+    def assert_series_equal_with_string_dtype(self, left, right, **kwargs):
+        """Assert Series are equal, accepting both object and StringDtype for string data.
+
+        In Python 3.14+, pandas infers StringDtype for string data instead of object.
+        This wrapper compares Series without strict dtype matching for string data.
+
+        Parameters
+        ----------
+        left : pd.Series
+            First Series to compare
+        right : pd.Series
+            Second Series to compare
+        **kwargs : dict
+            Additional arguments passed to pd.testing.assert_series_equal
+        """
+        # Verify string series have string or object dtypes
+        if is_string_dtype(left.dtype) or is_object_dtype(left.dtype):
+            self.assertTrue(
+                is_string_dtype(right.dtype) or is_object_dtype(right.dtype),
+                f"Right series dtype {right.dtype} should be string/object type",
+            )
+        # Compare Series without strict dtype matching
+        pd.testing.assert_series_equal(left, right, check_dtype=False, **kwargs)
+
+    def assert_frame_equal_with_string_index(self, left, right, **kwargs):
+        """Assert DataFrames are equal, accepting both object and StringDtype for string columns.
+
+        In Python 3.14+, pandas infers StringDtype for string columns/indices instead of object.
+        This wrapper validates that string columns have string types, then compares the
+        DataFrames without strict dtype matching for object/string columns.
+
+        Parameters
+        ----------
+        left : pd.DataFrame
+            First DataFrame to compare
+        right : pd.DataFrame
+            Second DataFrame to compare
+        **kwargs : dict
+            Additional arguments passed to pd.testing.assert_frame_equal
+        """
+        # Verify index dtypes are string types if they're named 'rsid'
+        if left.index.name == "rsid":
+            self.assertTrue(
+                is_string_dtype(left.index.dtype),
+                f"Left index dtype {left.index.dtype} is not a string type",
+            )
+        if right.index.name == "rsid":
+            self.assertTrue(
+                is_string_dtype(right.index.dtype),
+                f"Right index dtype {right.index.dtype} is not a string type",
+            )
+
+        # Verify string columns (chrom, genotype) have string dtypes
+        for col in ["chrom", "genotype"]:
+            if col in left.columns:
+                self.assertTrue(
+                    is_string_dtype(left[col].dtype)
+                    or is_object_dtype(left[col].dtype),
+                    f"Left column '{col}' dtype {left[col].dtype} is not a string/object type",
+                )
+            if col in right.columns:
+                self.assertTrue(
+                    is_string_dtype(right[col].dtype)
+                    or is_object_dtype(right[col].dtype),
+                    f"Right column '{col}' dtype {right[col].dtype} is not a string/object type",
+                )
+
+        # Compare DataFrames without strict dtype matching for string columns
+        pd.testing.assert_frame_equal(
+            left, right, check_index_type=False, check_dtype=False, **kwargs
+        )
+
     def make_normalized_dataframe_assertions(self, df):
         self.assertEqual(df.index.name, "rsid")
-        self.assertTrue(is_object_dtype(df.index.dtype))
-        self.assertTrue(is_object_dtype(df.chrom.dtype))
+        # Accept both object dtype and StringDtype (used in Python 3.14+)
+        self.assertTrue(is_string_dtype(df.index.dtype))
+        self.assertTrue(
+            is_string_dtype(df.chrom.dtype) or is_object_dtype(df.chrom.dtype)
+        )
         self.assertTrue(is_unsigned_integer_dtype(df.pos.dtype))
-        self.assertTrue(is_object_dtype(df.genotype.dtype))
+        self.assertTrue(
+            is_string_dtype(df.genotype.dtype) or is_object_dtype(df.genotype.dtype)
+        )
 
     def parse_file(self, file, rsids=()):
         return SNPs(file, rsids=rsids)
@@ -675,7 +752,7 @@ def make_parsing_assertions(
         print(snps_df.info())
 
         self.assertEqual(snps.source, source)
-        pd.testing.assert_frame_equal(snps.snps, snps_df, check_exact=True)
+        self.assert_frame_equal_with_string_index(snps.snps, snps_df, check_exact=True)
         self.assertTrue(snps.phased) if phased else self.assertFalse(snps.phased)
         self.assertEqual(snps.build, build)
         (
@@ -699,11 +776,13 @@ def make_parsing_assertions_vcf(
         else:
             self.assertFalse(snps.unannotated_vcf)
             (
-                pd.testing.assert_frame_equal(
+                self.assert_frame_equal_with_string_index(
                     snps.snps, snps_df.loc[rsids], check_exact=True
                 )
                 if rsids
-                else pd.testing.assert_frame_equal(snps.snps, snps_df, check_exact=True)
+                else self.assert_frame_equal_with_string_index(
+                    snps.snps, snps_df, check_exact=True
+                )
             )
 
         self.assertTrue(snps.phased) if phased else self.assertFalse(snps.phased)
diff --git a/tests/io/test_generator.py b/tests/io/test_generator.py
index 6856dd3c..11b77cdb 100644
--- a/tests/io/test_generator.py
+++ b/tests/io/test_generator.py
@@ -5,7 +5,6 @@
 import gzip
 import os
 import tempfile
-from unittest import TestCase
 
 import numpy as np
 import pandas as pd
@@ -13,9 +12,10 @@
 from snps import SNPs
 from snps.build_constants import BUILD_MARKER_SNPS
 from snps.io.generator import SyntheticSNPGenerator
+from tests import BaseSNPsTestCase
 
 
-class TestSyntheticSNPGenerator(TestCase):
+class TestSyntheticSNPGenerator(BaseSNPsTestCase):
     def test_init_build_37(self) -> None:
         """Test generator initialization with Build 37."""
         gen = SyntheticSNPGenerator(build=37, seed=123)
@@ -107,7 +107,7 @@ def test_generate_snps_reproducibility(self) -> None:
         df2 = gen2.generate_snps(num_snps=1000)
 
         # Verify identical output
-        pd.testing.assert_frame_equal(df1, df2)
+        self.assert_frame_equal_with_string_index(df1, df2)
 
     def test_generate_snps_rsid_format(self) -> None:
         """Test that rsIDs are correctly formatted."""
diff --git a/tests/io/test_writer.py b/tests/io/test_writer.py
index 9e23acbe..155ee0b2 100755
--- a/tests/io/test_writer.py
+++ b/tests/io/test_writer.py
@@ -2,7 +2,6 @@
 import tempfile
 
 import numpy as np
-import pandas as pd
 
 from snps import SNPs
 from snps.resources import ReferenceSequence, Resources
@@ -169,7 +168,7 @@ def test_save_snps_vcf_discrepant_pos(self):
 
                 self.assertEqual(s.to_vcf(), output)
 
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.discrepant_vcf_position,
                 self.create_snp_df(
                     rsid=["rs1", "rs17"],
diff --git a/tests/test_snps.py b/tests/test_snps.py
index 176f29af..f3be4260 100755
--- a/tests/test_snps.py
+++ b/tests/test_snps.py
@@ -67,7 +67,7 @@ def test_build_detected_PAR_snps(self):
         snps = self.load_assign_PAR_SNPs("tests/input/GRCh37_PAR.csv")
         self.assertEqual(snps.build, 37)
         self.assertTrue(snps.build_detected)
-        pd.testing.assert_frame_equal(
+        self.assert_frame_equal_with_string_index(
             snps.snps, self.snps_GRCh37_PAR(), check_exact=True
         )
 
@@ -100,7 +100,7 @@ def test_deduplicate_false(self):
             pos=[101, 102, 103],
             genotype=["AA", "CC", "GG"],
         )
-        pd.testing.assert_frame_equal(snps.snps, result, check_exact=True)
+        self.assert_frame_equal_with_string_index(snps.snps, result, check_exact=True)
 
     def test_deduplicate_MT_chrom(self):
         snps = SNPs("tests/input/ancestry_mt.txt")
@@ -111,12 +111,12 @@ def test_deduplicate_MT_chrom(self):
             pos=[101, 102],
             genotype=["A", np.nan],
         )
-        pd.testing.assert_frame_equal(snps.snps, result, check_exact=True)
+        self.assert_frame_equal_with_string_index(snps.snps, result, check_exact=True)
 
         heterozygous_MT_snps = self.create_snp_df(
             rsid=["rs3"], chrom=["MT"], pos=[103], genotype=["GC"]
         )
-        pd.testing.assert_frame_equal(
+        self.assert_frame_equal_with_string_index(
             snps.heterozygous_MT, heterozygous_MT_snps, check_exact=True
         )
 
@@ -129,7 +129,7 @@ def test_deduplicate_MT_chrom_false(self):
             pos=[101, 102, 103],
             genotype=["AA", np.nan, "GC"],
         )
-        pd.testing.assert_frame_equal(snps.snps, result, check_exact=True)
+        self.assert_frame_equal_with_string_index(snps.snps, result, check_exact=True)
 
     def test_duplicate_rsids(self):
         snps = SNPs("tests/input/duplicate_rsids.csv")
@@ -139,8 +139,10 @@ def test_duplicate_rsids(self):
         duplicate = self.create_snp_df(
             rsid=["rs1", "rs1"], chrom=["1", "1"], pos=[102, 103], genotype=["CC", "GG"]
         )
-        pd.testing.assert_frame_equal(snps.snps, result, check_exact=True)
-        pd.testing.assert_frame_equal(snps.duplicate, duplicate, check_exact=True)
+        self.assert_frame_equal_with_string_index(snps.snps, result, check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            snps.duplicate, duplicate, check_exact=True
+        )
 
     def test_empty_dataframe(self):
         for snps in self.empty_snps():
@@ -174,7 +176,7 @@ def test_summary_no_snps(self):
 
     def test_heterozygous(self):
         s = SNPs("tests/input/generic.csv")
-        pd.testing.assert_frame_equal(
+        self.assert_frame_equal_with_string_index(
             s.heterozygous(),
             self.create_snp_df(
                 rsid=["rs6", "rs7", "rs8"],
@@ -187,7 +189,7 @@ def test_heterozygous(self):
 
     def test_homozygous(self):
         s = SNPs("tests/input/generic.csv")
-        pd.testing.assert_frame_equal(
+        self.assert_frame_equal_with_string_index(
             s.homozygous(),
             self.create_snp_df(
                 rsid=["rs1", "rs2", "rs3", "rs4"],
@@ -200,7 +202,7 @@ def test_homozygous(self):
 
     def test_homozygous_chrom(self):
         s = SNPs("tests/input/generic.csv")
-        pd.testing.assert_frame_equal(
+        self.assert_frame_equal_with_string_index(
             s.homozygous("1"),
             self.create_snp_df(
                 rsid=["rs1", "rs2", "rs3", "rs4"],
@@ -223,7 +225,7 @@ def test_notnull(self):
         s = SNPs("tests/input/generic.csv")
         snps = self.generic_snps()
         snps.drop("rs5", inplace=True)
-        pd.testing.assert_frame_equal(s.notnull(), snps, check_exact=True)
+        self.assert_frame_equal_with_string_index(s.notnull(), snps, check_exact=True)
 
     def test_only_detect_source(self):
         s = SNPs("tests/input/generic.csv", only_detect_source=True)
@@ -246,7 +248,9 @@ def f():
             self.assertEqual(s.assembly, "GRCh37")
             self.assertEqual(len(chromosomes_remapped), 2)
             self.assertEqual(len(chromosomes_not_remapped), 0)
-            pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
+            self.assert_frame_equal_with_string_index(
+                s.snps, self.snps_GRCh37(), check_exact=True
+            )
 
         self._run_remap_test(f, self.NCBI36_GRCh37())
 
@@ -258,7 +262,9 @@ def f():
             self.assertEqual(s.assembly, "GRCh37")
             self.assertEqual(len(chromosomes_remapped), 2)
             self.assertEqual(len(chromosomes_not_remapped), 0)
-            pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
+            self.assert_frame_equal_with_string_index(
+                s.snps, self.snps_GRCh37(), check_exact=True
+            )
 
         self._run_remap_test(f, self.NCBI36_GRCh37())
 
@@ -270,7 +276,9 @@ def f():
             self.assertEqual(s.assembly, "NCBI36")
             self.assertEqual(len(chromosomes_remapped), 2)
             self.assertEqual(len(chromosomes_not_remapped), 0)
-            pd.testing.assert_frame_equal(s.snps, self.snps_NCBI36(), check_exact=True)
+            self.assert_frame_equal_with_string_index(
+                s.snps, self.snps_NCBI36(), check_exact=True
+            )
 
         self._run_remap_test(f, self.GRCh37_NCBI36())
 
@@ -282,7 +290,9 @@ def f():
             self.assertEqual(s.assembly, "GRCh38")
             self.assertEqual(len(chromosomes_remapped), 2)
             self.assertEqual(len(chromosomes_not_remapped), 0)
-            pd.testing.assert_frame_equal(s.snps, self.snps_GRCh38(), check_exact=True)
+            self.assert_frame_equal_with_string_index(
+                s.snps, self.snps_GRCh38(), check_exact=True
+            )
 
         self._run_remap_test(f, self.GRCh37_GRCh38())
 
@@ -296,7 +306,7 @@ def f():
             self.assertEqual(len(chromosomes_remapped), 2)
             self.assertEqual(len(chromosomes_not_remapped), 1)
             self.assertEqual(s.count, 3)
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.snps, self.snps_GRCh38_PAR(), check_exact=True
             )
 
@@ -309,7 +319,9 @@ def test_remap_37_to_37(self):
         self.assertEqual(s.assembly, "GRCh37")
         self.assertEqual(len(chromosomes_remapped), 0)
         self.assertEqual(len(chromosomes_not_remapped), 2)
-        pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            s.snps, self.snps_GRCh37(), check_exact=True
+        )
 
     def test_remap_invalid_assembly(self):
         s = SNPs("tests/input/GRCh37.csv")
@@ -356,7 +368,7 @@ def test_save_source(self):
             self.assertTrue(snps.build_detected)
             self.assertEqual(snps.source, "generic")
             self.assertListEqual(snps._source, ["generic"])
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 snps.snps, self.snps_GRCh38(), check_exact=True
             )
 
@@ -393,7 +405,9 @@ def test_sex_Male_X_chrom_discrepant_XY(self):
         result = self.create_snp_df(
             rsid=["rs8001"], chrom=["X"], pos=[80000001], genotype=["AC"]
         )
-        pd.testing.assert_frame_equal(s.discrepant_XY, result, check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            s.discrepant_XY, result, check_exact=True
+        )
         self.assertEqual(s.sex, "Male")
 
     def test_sex_Male_Y_chrom(self):
@@ -595,11 +609,11 @@ def test_snps_qc(self):
         def f():
             s = SNPs("tests/input/generic.csv")
             # identify quality controlled SNPs
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.snps_qc, self.generic_snps().drop(["rs4", "rs6"])
             )
             # return already identified quality controlled SNPs (test branch)
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.snps_qc, self.generic_snps().drop(["rs4", "rs6"])
             )
 
@@ -609,11 +623,11 @@ def test_low_quality(self):
         def f():
             s = SNPs("tests/input/generic.csv")
             # identify low quality SNPs
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.low_quality, self.generic_snps().loc[["rs4", "rs6"]]
             )
             # return already identified low quality SNPs (test branch)
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.low_quality, self.generic_snps().loc[["rs4", "rs6"]]
             )
 
@@ -622,7 +636,7 @@ def f():
     def test_snps_qc_low_quality_no_cluster(self):
         def f():
             s = SNPs("tests/input/generic.csv")
-            pd.testing.assert_frame_equal(s.snps_qc, self.generic_snps())
+            self.assert_frame_equal_with_string_index(s.snps_qc, self.generic_snps())
             self.assertEqual(len(s.low_quality), 0)
 
         self.run_low_quality_snps_test(f, self.get_low_quality_snps(), cluster="")
@@ -634,10 +648,10 @@ def f():
             s._snps.drop(["rs4", "rs5", "rs6", "rs7", "rs8"], inplace=True)
             s._build = 36  # manually set build 36
             s.identify_low_quality_snps()
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.snps_qc, self.generic_snps().loc[["rs1", "rs3"]]
             )
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.low_quality, self.generic_snps().loc[["rs2"]]
             )
             self.assertEqual(s.build, 36)  # ensure copy gets remapped
@@ -709,7 +723,7 @@ def test_source_snps(self):
             s = SNPs(dest)
             self.assertEqual(s.source, "generic, 23andMe")
             self.assertListEqual(s._source, ["generic", "23andMe"])
-            pd.testing.assert_frame_equal(s.snps, s.snps, check_exact=True)
+            self.assert_frame_equal_with_string_index(s.snps, s.snps, check_exact=True)
             self.assert_results(results, [{"merged": True}])
 
     def test_merge_list(self):
@@ -717,7 +731,9 @@ def test_merge_list(self):
         results = s.merge(
             [SNPs("tests/input/GRCh37.csv"), SNPs("tests/input/GRCh37.csv")]
         )
-        pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            s.snps, self.snps_GRCh37(), check_exact=True
+        )
         self.assertEqual(s.source, "generic, generic")
         self.assertListEqual(s._source, ["generic", "generic"])
         self.assert_results(
@@ -740,7 +756,9 @@ def f():
             results = s.merge([SNPs("tests/input/GRCh37.csv")])
             self.assertEqual(len(s.discrepant_merge_positions), 0)
             self.assertEqual(len(s.discrepant_merge_genotypes), 0)
-            pd.testing.assert_frame_equal(s.snps, self.snps_NCBI36(), check_exact=True)
+            self.assert_frame_equal_with_string_index(
+                s.snps, self.snps_NCBI36(), check_exact=True
+            )
             self.assert_results(
                 results,
                 [
@@ -784,7 +802,7 @@ def test_merge_remap_false(self):
         expected.loc["rs11928389", "genotype"] = (
             np.nan
         )  # discrepant genotype is set to null / NA
-        pd.testing.assert_frame_equal(s.snps, expected, check_exact=True)
+        self.assert_frame_equal_with_string_index(s.snps, expected, check_exact=True)
         self.assert_results(
             results,
             [
@@ -811,7 +829,9 @@ def test_merge_phased(self):
 
         results = s1.merge([s2])
         self.assertTrue(s1.phased)
-        pd.testing.assert_frame_equal(s1.snps, self.generic_snps(), check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            s1.snps, self.generic_snps(), check_exact=True
+        )
         self.assert_results(
             results,
             [
@@ -832,7 +852,9 @@ def test_merge_unphased(self):
 
         results = s1.merge([s2])
         self.assertFalse(s1.phased)
-        pd.testing.assert_frame_equal(s1.snps, self.generic_snps(), check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            s1.snps, self.generic_snps(), check_exact=True
+        )
         self.assert_results(
             results,
             [
@@ -851,7 +873,9 @@ def test_merge_non_existent_file(self):
         results = s.merge(
             [SNPs("tests/input/non_existent_file.csv"), SNPs("tests/input/GRCh37.csv")]
         )
-        pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            s.snps, self.snps_GRCh37(), check_exact=True
+        )
         self.assert_results(results, [{}, {"merged": True}])
 
     def test_merge_invalid_file(self):
@@ -859,7 +883,9 @@ def test_merge_invalid_file(self):
         results = s.merge(
             [SNPs("tests/input/GRCh37.csv"), SNPs("tests/input/empty.txt")]
         )
-        pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            s.snps, self.snps_GRCh37(), check_exact=True
+        )
         self.assert_results(results, [{"merged": True}, {}])
 
     def test_merge_exceed_discrepant_positions_threshold(self):
@@ -871,7 +897,9 @@ def test_merge_exceed_discrepant_positions_threshold(self):
         self.assertEqual(len(s1.discrepant_merge_positions), 0)
         self.assertEqual(len(s1.discrepant_merge_genotypes), 0)
         self.assertEqual(len(s1.discrepant_merge_positions_genotypes), 0)
-        pd.testing.assert_frame_equal(s1.snps, self.generic_snps(), check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            s1.snps, self.generic_snps(), check_exact=True
+        )
         self.assert_results(results, [{}])
 
     def test_merge_exceed_discrepant_genotypes_threshold(self):
@@ -883,7 +911,9 @@ def test_merge_exceed_discrepant_genotypes_threshold(self):
         self.assertEqual(len(s1.discrepant_merge_positions), 0)
         self.assertEqual(len(s1.discrepant_merge_genotypes), 0)
         self.assertEqual(len(s1.discrepant_merge_positions_genotypes), 0)
-        pd.testing.assert_frame_equal(s1.snps, self.generic_snps(), check_exact=True)
+        self.assert_frame_equal_with_string_index(
+            s1.snps, self.generic_snps(), check_exact=True
+        )
         self.assert_results(results, [{}])
 
     def test_merging_files_discrepant_snps(self):
@@ -960,7 +990,9 @@ def test_merging_files_discrepant_snps(self):
             )
 
             pd.testing.assert_series_equal(s.snps["pos"], expected["pos"])
-            pd.testing.assert_series_equal(s.snps["genotype"], expected["genotype"])
+            self.assert_series_equal_with_string_dtype(
+                s.snps["genotype"], expected["genotype"]
+            )
 
     def test_appending_dfs(self):
         s = SNPs()
@@ -977,12 +1009,12 @@ def test_appending_dfs(self):
         df = self.create_snp_df(
             rsid=["rs1", "rs1"], chrom=["1", "1"], pos=[1, 1], genotype=["AA", "AA"]
         )
-        pd.testing.assert_frame_equal(s.duplicate, df, check_exact=True)
-        pd.testing.assert_frame_equal(s.discrepant_XY, df, check_exact=True)
-        pd.testing.assert_frame_equal(
+        self.assert_frame_equal_with_string_index(s.duplicate, df, check_exact=True)
+        self.assert_frame_equal_with_string_index(s.discrepant_XY, df, check_exact=True)
+        self.assert_frame_equal_with_string_index(
             s.heterozygous_MT, get_empty_snps_dataframe(), check_exact=True
         )
-        pd.testing.assert_frame_equal(
+        self.assert_frame_equal_with_string_index(
             s.discrepant_vcf_position, get_empty_snps_dataframe(), check_exact=True
         )
 
@@ -1021,7 +1053,7 @@ def test_merge_chrom(self):
 
         results = s1.merge([s2], chrom="Y")
 
-        pd.testing.assert_frame_equal(s1.snps, df, check_exact=True)
+        self.assert_frame_equal_with_string_index(s1.snps, df, check_exact=True)
 
         self.assert_results(
             results,
@@ -1069,7 +1101,7 @@ def f2():
                 self.assertEqual(s.assembly, "GRCh37")
                 self.assertEqual(len(chromosomes_remapped), 2)
                 self.assertEqual(len(chromosomes_not_remapped), 0)
-                pd.testing.assert_frame_equal(
+                self.assert_frame_equal_with_string_index(
                     s.snps, self.snps_GRCh37(), check_exact=True
                 )
 
@@ -1127,7 +1159,9 @@ def f():
             s = SNPs("tests/input/generic.csv")
             snps = self.generic_snps()
             snps.drop("rs5", inplace=True)
-            pd.testing.assert_frame_equal(s.not_null_snps(), snps, check_exact=True)
+            self.assert_frame_equal_with_string_index(
+                s.not_null_snps(), snps, check_exact=True
+            )
 
         self.run_deprecated_test(f, "This method has been renamed to `notnull`.")
 
@@ -1184,8 +1218,10 @@ def f():
                 pos=[102, 103],
                 genotype=["CC", "GG"],
             )
-            pd.testing.assert_frame_equal(snps.snps, result, check_exact=True)
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
+                snps.snps, result, check_exact=True
+            )
+            self.assert_frame_equal_with_string_index(
                 snps.duplicate_snps, duplicate, check_exact=True
             )
 
@@ -1203,7 +1239,7 @@ def f():
             result = self.create_snp_df(
                 rsid=["rs8001"], chrom=["X"], pos=[80000001], genotype=["AC"]
             )
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.discrepant_XY_snps, result, check_exact=True
             )
             self.assertEqual(s.sex, "Male")
@@ -1222,12 +1258,14 @@ def f():
                 pos=[101, 102],
                 genotype=["A", np.nan],
             )
-            pd.testing.assert_frame_equal(snps.snps, result, check_exact=True)
+            self.assert_frame_equal_with_string_index(
+                snps.snps, result, check_exact=True
+            )
 
             heterozygous_MT_snps = self.create_snp_df(
                 rsid=["rs3"], chrom=["MT"], pos=[103], genotype=["GC"]
             )
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 snps.heterozygous_MT_snps, heterozygous_MT_snps, check_exact=True
             )
 
@@ -1238,7 +1276,7 @@ def f():
     def test_heterozygous_snps(self):
         def f():
             s = SNPs("tests/input/generic.csv")
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.heterozygous_snps(),
                 self.create_snp_df(
                     rsid=["rs6", "rs7", "rs8"],
@@ -1254,7 +1292,7 @@ def f():
     def test_homozygous_snps(self):
         def f():
             s = SNPs("tests/input/generic.csv")
-            pd.testing.assert_frame_equal(
+            self.assert_frame_equal_with_string_index(
                 s.homozygous_snps(),
                 self.create_snp_df(
                     rsid=["rs1", "rs2", "rs3", "rs4"],