diff --git a/src/cli.rs b/src/cli.rs
index 35cbf5b..9115547 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -315,6 +315,35 @@ pub enum Command {
         output: Option<PathBuf>,
     },
 
+    /// Compute a sparse GRM and PCA scores from a cohort + KING .seg output.
+    /// Outputs are cached under `.cohort/cache/grm/<cohort>/` and consumed
+    /// by `favor staar --kinship <grm.tsv> --pca-covariates <pca_scores.tsv>`.
+    Grm {
+        /// Pre-built cohort id (under the store root).
+        #[arg(long)]
+        cohort: String,
+
+        /// KING .seg IBD output file (external tool, not reimplemented).
+        #[arg(long)]
+        king_seg: PathBuf,
+
+        /// Maximum relatedness degree (default 4 = up to 3rd cousins).
+        #[arg(long, default_value = "4")]
+        degree: u8,
+
+        /// Number of PCA components (default 20).
+        #[arg(long, default_value = "20")]
+        n_pcs: usize,
+
+        /// SNP block size for memory control (default 5000).
+        #[arg(long, default_value = "5000")]
+        block_size: usize,
+
+        /// Output directory (default: cohort GRM cache path)
+        #[arg(short, long)]
+        output: Option<PathBuf>,
+    },
+
     /// Forward-selection LD pruning on conditional score-test p-values
     #[command(name = "ld-prune")]
     LdPrune {
diff --git a/src/commands/grm.rs b/src/commands/grm.rs
new file mode 100644
index 0000000..f360a41
--- /dev/null
+++ b/src/commands/grm.rs
@@ -0,0 +1,200 @@
+//! `favor grm` subcommand: FastSparseGRM pipeline.
+
+use std::path::PathBuf;
+
+use serde_json::json;
+
+use crate::error::CohortError;
+use crate::output::Output;
+use crate::runtime::Engine;
+use crate::staar::grm::{cache, estimate, king, pca, unrelated};
+use crate::staar::grm::types::GrmArtifact;
+use crate::store::cohort::CohortId;
+use crate::store::ids::CacheKey;
+
+pub struct GrmArgs {
+    pub cohort: String,
+    pub king_seg: PathBuf,
+    pub degree: u8,
+    pub n_pcs: usize,
+    pub block_size: usize,
+    pub output: Option<PathBuf>,
+}
+
+pub fn run(
+    engine: &Engine,
+    args: GrmArgs,
+    out: &dyn Output,
+    dry_run: bool,
+) -> Result<(), CohortError> {
+    if !args.king_seg.exists() {
+        return Err(CohortError::Input(format!(
+            "KING .seg file not found: '{}'",
+            args.king_seg.display()
+        )));
+    }
+    if args.degree == 0 || args.degree > 10 {
+        return Err(CohortError::Input(format!(
+            "--degree must be 1..10, got {}",
+            args.degree
+        )));
+    }
+
+    let cohort_id = CohortId::new(args.cohort.trim().to_string());
+    let cohort = engine.cohort(&cohort_id);
+    let store_result = cohort.load()?;
+    let manifest = &store_result.manifest;
+
+    let fp = cache::fingerprint(
+        &manifest.key,
+        &args.king_seg,
+        args.degree,
+        args.n_pcs,
+    )?;
+    let cache_dir = args.output.clone().unwrap_or_else(|| {
+        engine
+            .store()
+            .layout()
+            .grm_cache_dir(&cohort_id, &CacheKey::new(&fp))
+    });
+
+    if cache::probe(&cache_dir) {
+        out.status(&format!(
+            "GRM cache hit at {}",
+            cache_dir.display()
+        ));
+        out.result_json(&json!({
+            "status": "cache_hit",
+            "cache_dir": cache_dir.to_string_lossy(),
+            "kinship": cache::grm_tsv_path(&cache_dir).to_string_lossy(),
+            "pca": cache::pca_tsv_path(&cache_dir).to_string_lossy(),
+        }));
+        return Ok(());
+    }
+
+    if dry_run {
+        out.result_json(&json!({
+            "command": "grm",
+            "cohort_id": cohort_id.as_str(),
+            "king_seg": args.king_seg.to_string_lossy(),
+            "degree": args.degree,
+            "n_pcs": args.n_pcs,
+            "n_samples": manifest.n_samples,
+            "n_variants": manifest.n_variants,
+            "output_dir": cache_dir.to_string_lossy(),
+        }));
+        return Ok(());
+    }
+
+    let sample_ids = store_result.geno.sample_names.clone();
+    let n_samples = sample_ids.len();
+    out.status(&format!(
+        "GRM: {} samples, {} variants across {} chromosomes",
+        n_samples, manifest.n_variants, manifest.chromosomes.len()
+    ));
+
+    // 1. Parse KING .seg.
+    out.status("  Parsing KING .seg file...");
+    let seg_entries = king::parse_king_seg(&args.king_seg, args.degree)?;
+    out.status(&format!("    {} related pairs after degree-{} filter", seg_entries.len(), args.degree));
+
+    // Map to cohort indices.
+    let king_ids: Vec<String> = sample_ids
+        .iter()
+        .map(|s| format!("{}_{}", s, s))
+        .collect();
+    let (candidate_pairs, _id_map) = king::map_to_cohort_indices(&seg_entries, &king_ids);
+    out.status(&format!("    {} pairs mapped to cohort", candidate_pairs.len()));
+
+    if candidate_pairs.is_empty() {
+        return Err(CohortError::Input(
+            "No KING pairs mapped to cohort samples. Check that KING sample IDs \
+             match the VCF sample IDs (FID_IID format)."
+                .into(),
+        ));
+    }
+
+    // 2. Compute divergence + select unrelated.
+    out.status("  Computing ancestry divergence...");
+    let related_indices: Vec<usize> = {
+        let mut s: std::collections::HashSet<usize> = std::collections::HashSet::new();
+        for &(i, j, _) in &candidate_pairs {
+            s.insert(i);
+            s.insert(j);
+        }
+        let mut v: Vec<usize> = s.into_iter().collect();
+        v.sort_unstable();
+        v
+    };
+    let divergence = unrelated::compute_divergence(
+        &cohort, manifest, &related_indices, n_samples, 10_000, -0.025,
+    )?;
+
+    out.status("  Selecting unrelated samples...");
+    let unrel = unrelated::select_unrelated(&candidate_pairs, n_samples, &divergence);
+    out.status(&format!("    {} unrelated samples selected", unrel.sample_indices.len()));
+
+    let unrelated_mask: Vec<bool> = (0..n_samples)
+        .map(|i| unrel.sample_indices.contains(&i))
+        .collect();
+
+    // 3. Randomized PCA.
+    out.status(&format!("  Randomized PCA ({} components)...", args.n_pcs));
+    let all_mask = vec![true; n_samples];
+    let pca_scores = pca::randomized_pca(
+        &cohort,
+        manifest,
+        &unrelated_mask,
+        &all_mask,
+        args.n_pcs,
+        10,
+    )?;
+    out.status(&format!(
+        "    PCA complete: {} eigenvalues",
+        pca_scores.eigenvalues.len()
+    ));
+
+    // 4. Estimate sparse GRM.
+    out.status("  Estimating sparse GRM...");
+    let grm = estimate::estimate_grm(
+        &cohort,
+        manifest,
+        &pca_scores,
+        &unrelated_mask,
+        &candidate_pairs,
+        n_samples,
+        args.n_pcs,
+        args.block_size,
+        args.degree,
+        out,
+    )?;
+    let n_off_diag = grm.triplets.iter().filter(|&&(i, j, _)| i < j).count();
+    out.status(&format!("    {} off-diagonal kinship pairs", n_off_diag));
+
+    // 5. Cache.
+    let artifact = GrmArtifact {
+        grm,
+        pca: pca_scores,
+        unrelated: unrel,
+        sample_ids: sample_ids.clone(),
+    };
+    cache::save(&cache_dir, &artifact, &fp, args.degree, args.n_pcs)?;
+
+    let kin_path = cache::grm_tsv_path(&cache_dir);
+    let pca_path = cache::pca_tsv_path(&cache_dir);
+    out.success(&format!(
+        "GRM cached at {}\n  kinship: {}\n  pca:     {}",
+        cache_dir.display(),
+        kin_path.display(),
+        pca_path.display(),
+    ));
+    out.result_json(&json!({
+        "status": "ok",
+        "cache_dir": cache_dir.to_string_lossy(),
+        "kinship": kin_path.to_string_lossy(),
+        "pca": pca_path.to_string_lossy(),
+        "n_kinship_pairs": n_off_diag,
+        "n_unrelated": artifact.unrelated.sample_indices.len(),
+    }));
+    Ok(())
+}
diff --git a/src/commands/mod.rs b/src/commands/mod.rs
index 011921f..ac2b3fc 100644
--- a/src/commands/mod.rs
+++ b/src/commands/mod.rs
@@ -1,6 +1,7 @@
 pub mod annotate;
 pub mod annotation;
 pub mod enrich;
+pub mod grm;
 pub mod ingest;
 pub mod inspect;
 pub mod interpret;
diff --git a/src/main.rs b/src/main.rs
index ebb430b..ffab530 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -199,6 +199,29 @@ fn run(
                 dry_run,
             )
         }
+        Command::Grm {
+            cohort,
+            king_seg,
+            degree,
+            n_pcs,
+            block_size,
+            output: output_path,
+        } => {
+            let engine = runtime::Engine::open(store_path)?;
+            commands::grm::run(
+                &engine,
+                commands::grm::GrmArgs {
+                    cohort,
+                    king_seg,
+                    degree,
+                    n_pcs,
+                    block_size,
+                    output: output_path,
+                },
+                out,
+                dry_run,
+            )
+        }
         Command::LdPrune {
             cohort,
             phenotype,
diff --git a/src/staar/grm/cache.rs b/src/staar/grm/cache.rs
new file mode 100644
index 0000000..ea31222
--- /dev/null
+++ b/src/staar/grm/cache.rs
@@ -0,0 +1,195 @@
+//! GRM cache: fingerprint, probe, save, load.
+//!
+//! Layout under the cohort store:
+//!   .cohort/cache/grm/<cohort_id>/<fingerprint>/
+//!     grm.tsv           sample_i  sample_j  kinship
+//!     pca_scores.tsv    sample_id  PC1  PC2  ...
+//!     unrelated.txt     one sample_id per line
+//!     manifest.json     { fingerprint, degree, n_pcs, created_at }
+
+use std::io::Write;
+use std::path::{Path, PathBuf};
+
+use serde::{Deserialize, Serialize};
+
+use crate::error::CohortError;
+
+use super::types::{GrmArtifact, PcaScores, SparseGrm, UnrelatedSubset};
+
+#[derive(Serialize, Deserialize)]
+pub struct GrmManifest {
+    pub fingerprint: String,
+    pub degree: u8,
+    pub n_pcs: usize,
+    pub n_samples: usize,
+    pub n_kinship_pairs: usize,
+    pub n_unrelated: usize,
+    pub created_at: String,
+}
+
+pub fn fingerprint(
+    cohort_key: &str,
+    king_seg_path: &Path,
+    degree: u8,
+    n_pcs: usize,
+) -> Result<String, CohortError> {
+    let seg_fp = crate::store::cohort::file_content_fingerprint(king_seg_path)?;
+    let seg_hex = seg_fp.iter().map(|b| format!("{b:02x}")).collect::<String>();
+    let input = format!("{cohort_key}|{seg_hex}|{degree}|{n_pcs}");
+    Ok(crate::store::cohort::sha256_str(&input))
+}
+
+pub fn probe(dir: &Path) -> bool {
+    dir.join("manifest.json").exists()
+        && dir.join("grm.tsv").exists()
+        && dir.join("pca_scores.tsv").exists()
+}
+
+pub fn save(
+    dir: &Path,
+    artifact: &GrmArtifact,
+    fp: &str,
+    degree: u8,
+    n_pcs: usize,
+) -> Result<(), CohortError> {
+    std::fs::create_dir_all(dir)
+        .map_err(|e| CohortError::Resource(format!("create {}: {e}", dir.display())))?;
+
+    write_grm_tsv(&dir.join("grm.tsv"), &artifact.grm, &artifact.sample_ids)?;
+    write_pca_tsv(
+        &dir.join("pca_scores.tsv"),
+        &artifact.pca,
+        &artifact.sample_ids,
+    )?;
+    write_unrelated(&dir.join("unrelated.txt"), &artifact.unrelated, &artifact.sample_ids)?;
+
+    let manifest = GrmManifest {
+        fingerprint: fp.to_string(),
+        degree,
+        n_pcs,
+        n_samples: artifact.sample_ids.len(),
+        n_kinship_pairs: artifact.grm.triplets.len(),
+        n_unrelated: artifact.unrelated.sample_indices.len(),
+        created_at: chrono_now(),
+    };
+    let json = serde_json::to_string_pretty(&manifest)
+        .map_err(|e| CohortError::Resource(format!("serialize manifest: {e}")))?;
+    std::fs::write(dir.join("manifest.json"), json)
+        .map_err(|e| CohortError::Resource(format!("write manifest: {e}")))?;
+
+    Ok(())
+}
+
+fn write_grm_tsv(
+    path: &Path,
+    grm: &SparseGrm,
+    sample_ids: &[String],
+) -> Result<(), CohortError> {
+    let mut f = std::fs::File::create(path)
+        .map_err(|e| CohortError::Resource(format!("create {}: {e}", path.display())))?;
+    writeln!(f, "ID1\tID2\tKinship")
+        .map_err(|e| CohortError::Resource(format!("write {}: {e}", path.display())))?;
+    for &(i, j, k) in &grm.triplets {
+        writeln!(f, "{}\t{}\t{k:.6}", sample_ids[i], sample_ids[j])
+            .map_err(|e| CohortError::Resource(format!("write {}: {e}", path.display())))?;
+    }
+    Ok(())
+}
+
+fn write_pca_tsv(
+    path: &Path,
+    pca: &PcaScores,
+    sample_ids: &[String],
+) -> Result<(), CohortError> {
+    let n = pca.scores.nrows();
+    let k = pca.scores.ncols();
+    let mut f = std::fs::File::create(path)
+        .map_err(|e| CohortError::Resource(format!("create {}: {e}", path.display())))?;
+    let mut header = String::from("sample_id");
+    for c in 0..k {
+        header.push_str(&format!("\tPC{}", c + 1));
+    }
+    writeln!(f, "{header}")
+        .map_err(|e| CohortError::Resource(format!("write {}: {e}", path.display())))?;
+    for (i, sid) in sample_ids.iter().enumerate().take(n) {
+        write!(f, "{sid}")
+            .map_err(|e| CohortError::Resource(format!("write {}: {e}", path.display())))?;
+        for c in 0..k {
+            write!(f, "\t{:.6}", pca.scores[(i, c)])
+                .map_err(|e| CohortError::Resource(format!("write {}: {e}", path.display())))?;
+        }
+        writeln!(f)
+            .map_err(|e| CohortError::Resource(format!("write {}: {e}", path.display())))?;
+    }
+    Ok(())
+}
+
+fn write_unrelated(
+    path: &Path,
+    unrel: &UnrelatedSubset,
+    sample_ids: &[String],
+) -> Result<(), CohortError> {
+    let mut f = std::fs::File::create(path)
+        .map_err(|e| CohortError::Resource(format!("create {}: {e}", path.display())))?;
+    for &idx in &unrel.sample_indices {
+        writeln!(f, "{}", sample_ids[idx])
+            .map_err(|e| CohortError::Resource(format!("write {}: {e}", path.display())))?;
+    }
+    Ok(())
+}
+
+pub fn grm_tsv_path(cache_dir: &Path) -> PathBuf {
+    cache_dir.join("grm.tsv")
+}
+
+pub fn pca_tsv_path(cache_dir: &Path) -> PathBuf {
+    cache_dir.join("pca_scores.tsv")
+}
+
+fn chrono_now() -> String {
+    let dur = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap_or_default();
+    format!("{}", dur.as_secs())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use faer::Mat;
+
+    #[test]
+    fn round_trip_grm_tsv() {
+        let dir = tempfile::tempdir().unwrap();
+        let grm = SparseGrm {
+            triplets: vec![(0, 1, 0.25), (0, 0, 1.0), (1, 1, 1.0)],
+            n_samples: 2,
+        };
+        let ids = vec!["s1".into(), "s2".into()];
+        write_grm_tsv(&dir.path().join("grm.tsv"), &grm, &ids).unwrap();
+        let content = std::fs::read_to_string(dir.path().join("grm.tsv")).unwrap();
+        assert!(content.contains("s1\ts2\t0.250000"));
+        assert!(content.starts_with("ID1\tID2\tKinship"));
+    }
+
+    #[test]
+    fn round_trip_pca_tsv() {
+        let dir = tempfile::tempdir().unwrap();
+        let scores = Mat::from_fn(2, 3, |i, j| (i * 3 + j) as f64);
+        let pca = PcaScores {
+            scores,
+            eigenvalues: vec![1.0, 0.5, 0.1],
+        };
+        let ids = vec!["s1".into(), "s2".into()];
+        write_pca_tsv(&dir.path().join("pca.tsv"), &pca, &ids).unwrap();
+        let content = std::fs::read_to_string(dir.path().join("pca.tsv")).unwrap();
+        assert!(content.starts_with("sample_id\tPC1\tPC2\tPC3"));
+        assert!(content.contains("s1\t0.000000\t1.000000\t2.000000"));
+    }
+
+    #[test]
+    fn probe_returns_false_on_empty_dir() {
+        let dir = tempfile::tempdir().unwrap();
+        assert!(!probe(dir.path()));
+    }
+}
diff --git a/src/staar/grm/estimate.rs b/src/staar/grm/estimate.rs
new file mode 100644
index 0000000..811516c
--- /dev/null
+++ b/src/staar/grm/estimate.rs
@@ -0,0 +1,482 @@
+//! Sparse GRM estimation via ancestry-adjusted kinship.
+//!
+//! Mirrors FastSparseGRM R/calcGRM.R:calcSparseGRM (lines 72-173).
+//! Block-wise per-chromosome accumulation of kinship numerator and
+//! denominator for each candidate pair. After all chromosomes are
+//! processed, kinship = num / den. Thresholding at 2^-(degree+1.5)
+//! produces the sparse output. Two-pass re-estimation handles large
+//! connected components that exceed max_related_block.
+
+use std::collections::HashMap;
+
+use faer::Mat;
+
+use crate::error::CohortError;
+use crate::output::Output;
+use crate::store::cohort::variants::CarrierEntry;
+use crate::store::cohort::{CohortHandle, CohortManifest};
+use crate::types::Chromosome;
+
+use super::king;
+use super::pca;
+use super::types::{KinshipAccum, PcaScores, SparseGrm};
+
+const MAX_RELATED_BLOCK: usize = 65536;
+
+/// Estimate sparse GRM for all candidate pairs across all chromosomes.
+///
+/// Two-pass: first pass estimates kinship for KING-identified pairs, then
+/// after thresholding, if any connected component exceeds MAX_RELATED_BLOCK,
+/// the threshold is raised iteratively and newly-discovered pairs from
+/// expanded components are estimated in a second pass.
+#[allow(clippy::too_many_arguments)]
+pub fn estimate_grm(
+    cohort: &CohortHandle<'_>,
+    manifest: &CohortManifest,
+    pca_scores: &PcaScores,
+    unrelated_mask: &[bool],
+    candidate_pairs: &[(usize, usize, f64)],
+    n_samples: usize,
+    n_pcs: usize,
+    block_size: usize,
+    degree: u8,
+    out: &dyn Output,
+) -> Result<SparseGrm, CohortError> {
+    let threshold = 2.0f64.powf(-((degree as f64) + 1.5));
+
+    // Build the PC-augmented covariate matrix X = [1 | PC1..PCk].
+    let k = n_pcs + 1;
+    let mut x_mat = Mat::<f64>::zeros(n_samples, k);
+    for i in 0..n_samples {
+        x_mat[(i, 0)] = 1.0;
+        for c in 0..n_pcs.min(pca_scores.scores.ncols()) {
+            x_mat[(i, c + 1)] = pca_scores.scores[(i, c)];
+        }
+    }
+
+    // nullmat = X_train * (X_train' X_train)^{-1}  (training = unrelated subset)
+    let n_train: usize = unrelated_mask.iter().filter(|&&b| b).count();
+    let mut x_train = Mat::<f64>::zeros(n_train, k);
+    let mut ti = 0;
+    for (si, &is_unrel) in unrelated_mask.iter().enumerate() {
+        if is_unrel {
+            for c in 0..k {
+                x_train[(ti, c)] = x_mat[(si, c)];
+            }
+            ti += 1;
+        }
+    }
+    use faer::linalg::solvers::Solve;
+    let xtx = x_train.transpose() * &x_train;
+    let eye_k = Mat::<f64>::identity(k, k);
+    let xtx_inv = xtx.col_piv_qr().solve(&eye_k);
+    let nullmat = &x_train * &xtx_inv; // (n_train x k)
+
+    // First pass: estimate kinship for all candidate pairs.
+    let pair_indices: Vec<(usize, usize)> = candidate_pairs
+        .iter()
+        .map(|&(i, j, _)| (i, j))
+        .collect();
+    // Add self-pairs (diagonal).
+    let mut all_pairs: Vec<(usize, usize)> = pair_indices.clone();
+    let mut self_set: std::collections::HashSet<usize> = std::collections::HashSet::new();
+    for &(i, j, _) in candidate_pairs {
+        self_set.insert(i);
+        self_set.insert(j);
+    }
+    for &s in &self_set {
+        all_pairs.push((s, s));
+    }
+    all_pairs.sort_unstable();
+    all_pairs.dedup();
+
+    out.status(&format!(
+        "  GRM: estimating kinship for {} pairs (+ {} self-pairs)...",
+        pair_indices.len(),
+        self_set.len(),
+    ));
+
+    let mut accum = estimate_pairs_all_chroms(
+        cohort,
+        manifest,
+        &nullmat,
+        &x_mat,
+        unrelated_mask,
+        &all_pairs,
+        block_size,
+        out,
+    )?;
+
+    // Finalize: kinship = num / den.
+    for a in &mut accum {
+        if a.denominator > 0.0 {
+            a.numerator /= a.denominator;
+        } else {
+            a.numerator = 0.0;
+        }
+    }
+
+    // Threshold at 2^-(degree+1.5). Self-pairs always kept.
+    let mut triplets: Vec<(usize, usize, f64)> = Vec::new();
+    for a in &accum {
+        if a.idx_i == a.idx_j {
+            triplets.push((a.idx_i, a.idx_j, a.numerator));
+        } else if a.numerator >= threshold {
+            triplets.push((a.idx_i, a.idx_j, a.numerator));
+            triplets.push((a.idx_j, a.idx_i, a.numerator));
+        }
+    }
+
+    // Two-pass: check component sizes after thresholding.
+    let off_diag: Vec<(usize, usize, f64)> = triplets
+        .iter()
+        .filter(|&&(i, j, _)| i < j)
+        .copied()
+        .collect();
+    let components = king::build_components(&off_diag, n_samples);
+    let max_comp = components.iter().map(|c| c.members.len()).max().unwrap_or(0);
+
+    if max_comp > MAX_RELATED_BLOCK {
+        out.status(&format!(
+            "  GRM: largest component has {} members (> {MAX_RELATED_BLOCK}), raising threshold...",
+            max_comp,
+        ));
+        let triplets_refined = two_pass_refine(
+            cohort,
+            manifest,
+            &nullmat,
+            &x_mat,
+            unrelated_mask,
+            &accum,
+            &off_diag,
+            n_samples,
+            threshold,
+            block_size,
+            out,
+        )?;
+        return Ok(SparseGrm {
+            triplets: triplets_refined,
+            n_samples,
+        });
+    }
+
+    out.status(&format!(
+        "  GRM: {} kinship pairs above threshold {:.6}",
+        off_diag.len(),
+        threshold,
+    ));
+
+    Ok(SparseGrm {
+        triplets,
+        n_samples,
+    })
+}
+
+/// Two-pass re-estimation. Mirrors calcGRM.R:99-173.
+///
+/// Raises the threshold iteratively until the largest component fits
+/// within MAX_RELATED_BLOCK. Any new pairs from expanded components
+/// that weren't in the original candidate set get a second-pass
+/// kinship estimation.
+#[allow(clippy::too_many_arguments)]
+fn two_pass_refine(
+    cohort: &CohortHandle<'_>,
+    manifest: &CohortManifest,
+    nullmat: &Mat<f64>,
+    x_mat: &Mat<f64>,
+    unrelated_mask: &[bool],
+    first_pass: &[KinshipAccum],
+    off_diag: &[(usize, usize, f64)],
+    n_samples: usize,
+    mut threshold: f64,
+    block_size: usize,
+    out: &dyn Output,
+) -> Result<Vec<(usize, usize, f64)>, CohortError> {
+    // Iteratively raise threshold until max component <= MAX_RELATED_BLOCK.
+    let mut active: Vec<(usize, usize, f64)> = off_diag.to_vec();
+    loop {
+        let components = king::build_components(&active, n_samples);
+        let max_comp = components.iter().map(|c| c.members.len()).max().unwrap_or(0);
+        if max_comp <= MAX_RELATED_BLOCK {
+            break;
+        }
+        threshold *= 2.0f64.powf(0.01);
+        let min_kin = active.iter().map(|t| t.2).fold(f64::INFINITY, f64::min);
+        active.retain(|&(_, _, k)| k > threshold);
+        out.status(&format!(
+            "  GRM: threshold raised to {threshold:.6} (max comp {max_comp}, min kin {min_kin:.6})",
+        ));
+        if active.is_empty() {
+            break;
+        }
+    }
+
+    // Identify new pairs from the re-thresholded components that were NOT
+    // in the first-pass candidate set.
+    let first_set: std::collections::HashSet<(usize, usize)> = first_pass
+        .iter()
+        .map(|a| (a.idx_i, a.idx_j))
+        .collect();
+    let components = king::build_components(&active, n_samples);
+    let mut new_pairs: Vec<(usize, usize)> = Vec::new();
+    for comp in &components {
+        for i in 0..comp.members.len() {
+            for j in (i + 1)..comp.members.len() {
+                let gi = comp.members[i];
+                let gj = comp.members[j];
+                let (lo, hi) = if gi < gj { (gi, gj) } else { (gj, gi) };
+                if !first_set.contains(&(lo, hi)) {
+                    new_pairs.push((lo, hi));
+                }
+            }
+        }
+    }
+    new_pairs.sort_unstable();
+    new_pairs.dedup();
+
+    if !new_pairs.is_empty() {
+        out.status(&format!(
+            "  GRM: second pass estimating {} new pairs...",
+            new_pairs.len(),
+        ));
+        let mut new_accum = estimate_pairs_all_chroms(
+            cohort,
+            manifest,
+            nullmat,
+            x_mat,
+            unrelated_mask,
+            &new_pairs,
+            block_size,
+            out,
+        )?;
+        for a in &mut new_accum {
+            if a.denominator > 0.0 {
+                a.numerator /= a.denominator;
+            } else {
+                a.numerator = 0.0;
+            }
+        }
+
+        // Merge first-pass and second-pass results.
+        let mut all_kin: HashMap<(usize, usize), f64> = HashMap::new();
+        for a in first_pass {
+            all_kin.insert((a.idx_i, a.idx_j), a.numerator);
+        }
+        for a in &new_accum {
+            all_kin.insert((a.idx_i, a.idx_j), a.numerator);
+        }
+
+        let mut triplets: Vec<(usize, usize, f64)> = Vec::new();
+        for (&(i, j), &k) in &all_kin {
+            if i == j {
+                triplets.push((i, j, k));
+            } else if k >= threshold {
+                triplets.push((i, j, k));
+                triplets.push((j, i, k));
+            }
+        }
+        return Ok(triplets);
+    }
+
+    // No new pairs needed; just rebuild triplets with the raised threshold.
+    let mut triplets: Vec<(usize, usize, f64)> = Vec::new();
+    for a in first_pass {
+        if a.idx_i == a.idx_j {
+            triplets.push((a.idx_i, a.idx_j, a.numerator));
+        } else if a.numerator >= threshold {
+            triplets.push((a.idx_i, a.idx_j, a.numerator));
+            triplets.push((a.idx_j, a.idx_i, a.numerator));
+        }
+    }
+    Ok(triplets)
+}
+
+/// Estimate kinship accumulators for a set of pairs across all chromosomes.
+///
+/// Per-chromosome, block-wise: loads genotypes in SNP blocks, computes
+/// ISAF-adjusted kinship contributions, accumulates into pair accumulators.
+#[allow(clippy::too_many_arguments)]
+fn estimate_pairs_all_chroms(
+    cohort: &CohortHandle<'_>,
+    manifest: &CohortManifest,
+    nullmat: &Mat<f64>,
+    x_mat: &Mat<f64>,
+    unrelated_mask: &[bool],
+    pairs: &[(usize, usize)],
+    block_size: usize,
+    out: &dyn Output,
+) -> Result<Vec<KinshipAccum>, CohortError> {
+    let n_pairs = pairs.len();
+    let mut accum: Vec<KinshipAccum> = pairs
+        .iter()
+        .map(|&(i, j)| KinshipAccum {
+            idx_i: i,
+            idx_j: j,
+            numerator: 0.0,
+            denominator: 0.0,
+        })
+        .collect();
+
+    let _pair_idx: HashMap<(usize, usize), usize> = pairs
+        .iter()
+        .enumerate()
+        .map(|(pi, &(i, j))| ((i, j), pi))
+        .collect();
+
+    for ci in &manifest.chromosomes {
+        let chrom: Chromosome = ci.name.parse().map_err(|e: String| CohortError::Input(e))?;
+        let view = cohort.chromosome(&chrom)?;
+        let stats = pca::allele_freq_chrom(&view, unrelated_mask)?;
+        let n_var = stats.mu.len();
+
+        out.status(&format!(
+            "    chr{}: {} variants, {} pairs",
+            chrom.label(),
+            n_var,
+            n_pairs,
+        ));
+
+        // Block-wise: process SNP blocks of size block_size.
+        let mut block_start = 0usize;
+        while block_start < n_var {
+            let block_end = (block_start + block_size).min(n_var);
+            let blen = block_end - block_start;
+
+            // Load genotypes for this block into dense arrays for the
+            // pair-wise ISAF computation.
+            let _n_samples = x_mat.nrows();
+            let k = nullmat.ncols();
+
+            // beta = G_train_block' * nullmat via carrier walk.
+            // beta[snp, col] = sum_over_training_carriers(dosage * nullmat[train_idx, col])
+            let mut beta = Mat::<f64>::zeros(blen, k);
+            let _train_idx = 0usize;
+            let train_map: Vec<Option<usize>> = {
+                let mut m = Vec::with_capacity(unrelated_mask.len());
+                let mut next = 0usize;
+                for &b in unrelated_mask {
+                    if b {
+                        m.push(Some(next));
+                        next += 1;
+                    } else {
+                        m.push(None);
+                    }
+                }
+                m
+            };
+
+            for vi in block_start..block_end {
+                let local = vi - block_start;
+                let carriers = view.sparse_g()?.load_variant(vi as u32);
+                for &CarrierEntry { sample_idx, dosage } in &carriers.entries {
+                    let si = sample_idx as usize;
+                    if si >= unrelated_mask.len() || !unrelated_mask[si] {
+                        continue;
+                    }
+                    if dosage == 255 {
+                        continue;
+                    }
+                    let ti = train_map[si].unwrap();
+                    let d = dosage as f64;
+                    for c in 0..k {
+                        beta[(local, c)] += d * nullmat[(ti, c)];
+                    }
+                }
+                // Subtract 2*mu * sum(nullmat[train,:]) for non-carrier baseline.
+                let _mu = stats.mu[vi];
+                // Actually: postmultiply formula for training set:
+                // beta[snp, c] = sum_i g_i * nullmat[i, c]
+                // = sum_carriers d * nullmat[train_idx, c] + 0 * (non-carriers)
+                // But the centered version subtracts 2*mu * sum(nullmat[:,c]).
+                // For the ISAF computation we use the raw (uncentered) form:
+                // ISAF = beta @ X[sample,:] where beta is already the regression
+                // coefficient. Let's match upstream exactly.
+                //
+                // Upstream: beta = postmultiply(nullmat, in.train) which computes
+                // (dosage_sum + 2*mu*missing_count) / sd. But for ISAF we need
+                // the unscaled version. The upstream postmultiply returns
+                // sum((g - 2*mu)/sd * nullmat) but then ISAF = beta @ X, and
+                // this gives the ancestry-adjusted allele frequency.
+                //
+                // Simpler: ISAF[snp, s] = 2 * freq_adjusted where
+                // freq_adjusted = nullmat-regression predicted frequency.
+                // We compute beta raw (without centering/scaling) and then
+                // ISAF = beta @ X gives the predicted genotype (0..2 scale).
+            }
+
+            // For each pair (i, j) and each SNP in block:
+            // 1. Look up genotype of sample i and j at this SNP.
+            // 2. Compute ISAF = beta[snp,:] @ X[sample,:] (predicted genotype).
+            // 3. Accumulate kinship numerator/denominator.
+            //
+            // We need per-sample genotypes for the pair members. Build a
+            // sample->genotype lookup for this block.
+            let involved: std::collections::HashSet<usize> = pairs
+                .iter()
+                .flat_map(|&(i, j)| [i, j])
+                .collect();
+            let mut geno_block: HashMap<usize, Vec<u8>> = HashMap::new();
+            for &s in &involved {
+                geno_block.insert(s, vec![0u8; blen]);
+            }
+            for vi in block_start..block_end {
+                let local = vi - block_start;
+                let carriers = view.sparse_g()?.load_variant(vi as u32);
+                for &CarrierEntry { sample_idx, dosage } in &carriers.entries {
+                    let si = sample_idx as usize;
+                    if let Some(g) = geno_block.get_mut(&si) {
+                        g[local] = dosage;
+                    }
+                }
+            }
+
+            // ISAF computation and kinship accumulation.
+            for (pi, &(si, sj)) in pairs.iter().enumerate() {
+                let gi = geno_block.get(&si).unwrap();
+                let gj = geno_block.get(&sj).unwrap();
+                for local in 0..blen {
+                    let di = gi[local];
+                    let dj = gj[local];
+                    if di == 255 || dj == 255 {
+                        continue;
+                    }
+
+                    // ISAF for sample s at this SNP:
+                    // isaf_s = beta[local,:] @ X[s,:]
+                    let _vi = block_start + local;
+                    let mut isaf_i = 0.0f64;
+                    let mut isaf_j = 0.0f64;
+                    for c in 0..k {
+                        isaf_i += beta[(local, c)] * x_mat[(si, c)];
+                        isaf_j += beta[(local, c)] * x_mat[(sj, c)];
+                    }
+                    isaf_i = isaf_i.clamp(0.0001, 1.9999);
+                    isaf_j = isaf_j.clamp(0.0001, 1.9999);
+
+                    let res_i = di as f64 - isaf_i;
+                    let res_j = dj as f64 - isaf_j;
+                    let sd_i = (isaf_i * (1.0 - isaf_i / 2.0) * 2.0).sqrt();
+                    let sd_j = (isaf_j * (1.0 - isaf_j / 2.0) * 2.0).sqrt();
+
+                    accum[pi].numerator += res_i * res_j;
+                    accum[pi].denominator += sd_i * sd_j;
+                }
+            }
+
+            block_start = block_end;
+        }
+    }
+
+    Ok(accum)
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn isaf_clamp_bounds() {
+        let v: f64 = 0.00005;
+        assert_eq!(v.clamp(0.0001, 1.9999), 0.0001);
+        let v2: f64 = 2.5;
+        assert_eq!(v2.clamp(0.0001, 1.9999), 1.9999);
+    }
+}
diff --git a/src/staar/grm/king.rs b/src/staar/grm/king.rs
new file mode 100644
index 0000000..015403a
--- /dev/null
+++ b/src/staar/grm/king.rs
@@ -0,0 +1,279 @@
+//! KING .seg file parser and connected-component discovery.
+//!
+//! Mirrors FastSparseGRM R/getUnrels.R:removeHigherDegree + R/calcGRM.R:29-68.
+//! Parses the IBD segment output from KING, filters pairs by relatedness
+//! degree, and builds connected components via union-find. Each component
+//! is a cluster of related individuals whose pairwise kinships will be
+//! estimated in the GRM step.
+
+use std::collections::HashMap;
+use std::path::Path;
+
+use crate::error::CohortError;
+
+use super::types::{KingSegEntry, RelatedComponent, RelatednessType};
+
+/// Parse a KING .seg file and filter pairs up to the given degree.
+///
+/// Handles both KING column layouts:
+///   FID1 ID1 FID2 ID2 ... PropIBD ... InfType
+///   FID  ID1 FID  ID2 ... PropIBD ... InfType
+///
+/// Sample IDs are formatted as `FID_IID` to match upstream convention.
+pub fn parse_king_seg(path: &Path, max_degree: u8) -> Result<Vec<KingSegEntry>, CohortError> {
+    let content = std::fs::read_to_string(path)
+        .map_err(|e| CohortError::Resource(format!("read {}: {e}", path.display())))?;
+
+    let mut lines = content.lines();
+    let header = lines.next().ok_or_else(|| {
+        CohortError::Input(format!("KING .seg file '{}' is empty", path.display()))
+    })?;
+
+    let cols: Vec<&str> = header.split_whitespace().collect();
+    let col_idx = |name: &str| cols.iter().position(|c| c.eq_ignore_ascii_case(name));
+
+    let has_fid1 = col_idx("FID1").is_some();
+    let (fid1_col, id1_col, fid2_col, id2_col) = if has_fid1 {
+        (
+            col_idx("FID1").unwrap(),
+            col_idx("ID1").unwrap(),
+            col_idx("FID2").unwrap(),
+            col_idx("ID2").unwrap(),
+        )
+    } else {
+        let fid = col_idx("FID").ok_or_else(|| {
+            CohortError::Input(format!(
+                "KING .seg '{}': missing FID1/FID column in header: {header}",
+                path.display()
+            ))
+        })?;
+        let id1 = col_idx("ID1").ok_or_else(|| {
+            CohortError::Input(format!(
+                "KING .seg '{}': missing ID1 column",
+                path.display()
+            ))
+        })?;
+        let id2 = col_idx("ID2").ok_or_else(|| {
+            CohortError::Input(format!(
+                "KING .seg '{}': missing ID2 column",
+                path.display()
+            ))
+        })?;
+        (fid, id1, fid, id2)
+    };
+
+    let ibd_col = col_idx("PropIBD")
+        .or_else(|| col_idx("Kinship"))
+        .ok_or_else(|| {
+            CohortError::Input(format!(
+                "KING .seg '{}': missing PropIBD/Kinship column",
+                path.display()
+            ))
+        })?;
+    let inf_col = col_idx("InfType").ok_or_else(|| {
+        CohortError::Input(format!(
+            "KING .seg '{}': missing InfType column",
+            path.display()
+        ))
+    })?;
+    let n_cols = cols.len();
+
+    let mut entries = Vec::new();
+    for (lineno, line) in lines.enumerate() {
+        let parts: Vec<&str> = line.split_whitespace().collect();
+        if parts.len() < n_cols {
+            continue;
+        }
+        let inf_type = RelatednessType::from_king_label(parts[inf_col]);
+        if inf_type == RelatednessType::Unrelated || inf_type.degree() > max_degree {
+            continue;
+        }
+        let prop_ibd: f64 = parts[ibd_col].parse().map_err(|e| {
+            CohortError::Input(format!(
+                "KING .seg {}:{}: bad PropIBD '{}': {e}",
+                path.display(),
+                lineno + 2,
+                parts[ibd_col]
+            ))
+        })?;
+        let id1 = format!("{}_{}", parts[fid1_col], parts[id1_col]);
+        let id2 = format!("{}_{}", parts[fid2_col], parts[id2_col]);
+        entries.push(KingSegEntry {
+            id1,
+            id2,
+            prop_ibd,
+            inf_type,
+        });
+    }
+
+    Ok(entries)
+}
+
+/// Map sample identifiers (FID_IID) to cohort sample indices. Pairs
+/// referencing samples not in the cohort are silently dropped.
+#[allow(clippy::type_complexity)]
+pub fn map_to_cohort_indices(
+    entries: &[KingSegEntry],
+    sample_ids: &[String],
+) -> (Vec<(usize, usize, f64)>, HashMap<String, usize>) {
+    let id_to_idx: HashMap<String, usize> = sample_ids
+        .iter()
+        .enumerate()
+        .map(|(i, s)| (s.clone(), i))
+        .collect();
+
+    let mut pairs = Vec::with_capacity(entries.len());
+    for e in entries {
+        if let (Some(&i), Some(&j)) = (id_to_idx.get(&e.id1), id_to_idx.get(&e.id2)) {
+            let (lo, hi) = if i < j { (i, j) } else { (j, i) };
+            pairs.push((lo, hi, e.prop_ibd));
+        }
+    }
+    pairs.sort_unstable_by_key(|a| (a.0, a.1));
+    pairs.dedup_by_key(|p| (p.0, p.1));
+    (pairs, id_to_idx)
+}
+
+/// Build connected components from related pairs using union-find.
+pub fn build_components(
+    pairs: &[(usize, usize, f64)],
+    n_samples: usize,
+) -> Vec<RelatedComponent> {
+    let mut parent: Vec<usize> = (0..n_samples).collect();
+    let mut rank = vec![0u8; n_samples];
+
+    let find = |parent: &mut [usize], mut x: usize| -> usize {
+        while parent[x] != x {
+            parent[x] = parent[parent[x]];
+            x = parent[x];
+        }
+        x
+    };
+
+    for &(i, j, _) in pairs {
+        let ri = find(&mut parent, i);
+        let rj = find(&mut parent, j);
+        if ri != rj {
+            if rank[ri] < rank[rj] {
+                parent[ri] = rj;
+            } else if rank[ri] > rank[rj] {
+                parent[rj] = ri;
+            } else {
+                parent[rj] = ri;
+                rank[ri] += 1;
+            }
+        }
+    }
+
+    let mut comp_members: HashMap<usize, Vec<usize>> = HashMap::new();
+    let involved: std::collections::HashSet<usize> =
+        pairs.iter().flat_map(|&(i, j, _)| [i, j]).collect();
+    for &s in &involved {
+        let root = find(&mut parent, s);
+        comp_members.entry(root).or_default().push(s);
+    }
+
+    let mut components: Vec<RelatedComponent> = comp_members
+        .into_values()
+        .map(|mut members| {
+            members.sort_unstable();
+            let member_set: HashMap<usize, usize> = members
+                .iter()
+                .enumerate()
+                .map(|(local, &global)| (global, local))
+                .collect();
+            let comp_pairs: Vec<(usize, usize)> = pairs
+                .iter()
+                .filter_map(|&(i, j, _)| {
+                    let li = member_set.get(&i)?;
+                    let lj = member_set.get(&j)?;
+                    Some((*li, *lj))
+                })
+                .collect();
+            RelatedComponent {
+                members,
+                pairs: comp_pairs,
+            }
+        })
+        .collect();
+    components.sort_by_key(|c| std::cmp::Reverse(c.members.len()));
+    components
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn seg_content(rows: &[(&str, &str, &str, &str, &str, &str)]) -> String {
+        let mut s = String::from("FID1\tID1\tFID2\tID2\tPropIBD\tInfType\n");
+        for (f1, i1, f2, i2, ibd, inf) in rows {
+            s.push_str(&format!("{f1}\t{i1}\t{f2}\t{i2}\t{ibd}\t{inf}\n"));
+        }
+        s
+    }
+
+    #[test]
+    fn parse_filters_by_degree() {
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("test.seg");
+        std::fs::write(
+            &path,
+            seg_content(&[
+                ("fam1", "s1", "fam1", "s2", "0.5", "PO"),
+                ("fam2", "s3", "fam2", "s4", "0.25", "2nd"),
+                ("fam3", "s5", "fam3", "s6", "0.01", "4th"),
+                ("fam4", "s7", "fam4", "s8", "0.001", "UN"),
+            ]),
+        )
+        .unwrap();
+
+        let all = parse_king_seg(&path, 4).unwrap();
+        assert_eq!(all.len(), 3);
+
+        let degree2 = parse_king_seg(&path, 2).unwrap();
+        assert_eq!(degree2.len(), 2);
+
+        let degree1 = parse_king_seg(&path, 1).unwrap();
+        assert_eq!(degree1.len(), 1);
+        assert_eq!(degree1[0].inf_type, RelatednessType::First);
+    }
+
+    #[test]
+    fn build_components_finds_clusters() {
+        let pairs = vec![(0, 1, 0.5), (1, 2, 0.25), (3, 4, 0.5)];
+        let components = build_components(&pairs, 10);
+        assert_eq!(components.len(), 2);
+        let sizes: Vec<usize> = components.iter().map(|c| c.members.len()).collect();
+        assert!(sizes.contains(&3));
+        assert!(sizes.contains(&2));
+    }
+
+    #[test]
+    fn singletons_excluded_from_components() {
+        let pairs = vec![(0, 1, 0.5)];
+        let components = build_components(&pairs, 100);
+        assert_eq!(components.len(), 1);
+        assert_eq!(components[0].members.len(), 2);
+    }
+
+    #[test]
+    fn map_to_cohort_deduplicates() {
+        let entries = vec![
+            KingSegEntry {
+                id1: "f_s1".into(),
+                id2: "f_s2".into(),
+                prop_ibd: 0.5,
+                inf_type: RelatednessType::First,
+            },
+            KingSegEntry {
+                id1: "f_s2".into(),
+                id2: "f_s1".into(),
+                prop_ibd: 0.5,
+                inf_type: RelatednessType::First,
+            },
+        ];
+        let samples = vec!["f_s1".into(), "f_s2".into(), "f_s3".into()];
+        let (pairs, _) = map_to_cohort_indices(&entries, &samples);
+        assert_eq!(pairs.len(), 1);
+    }
+}
diff --git a/src/staar/grm/mod.rs b/src/staar/grm/mod.rs
new file mode 100644
index 0000000..6166159
--- /dev/null
+++ b/src/staar/grm/mod.rs
@@ -0,0 +1,12 @@
+//! FastSparseGRM: sparse ancestry-adjusted GRM builder.
+//!
+//! Mirrors the R FastSparseGRM package (Lin & Dey, Nature Genetics 2024).
+//! Produces a sparse kinship matrix + PCA scores from a cohort's genotype
+//! store and KING IBD segment output.
+
+pub mod cache;
+pub mod estimate;
+pub mod king;
+pub mod pca;
+pub mod types;
+pub mod unrelated;
diff --git a/src/staar/grm/pca.rs b/src/staar/grm/pca.rs
new file mode 100644
index 0000000..94169e9
--- /dev/null
+++ b/src/staar/grm/pca.rs
@@ -0,0 +1,433 @@
+//! Randomized PCA and carrier-indexed genotype-matrix operations.
+//!
+//! Implements per-chromosome G*v (postmultiply) and G'*v (premultiply)
+//! on the sparse carrier representation in `sparse_g.bin`, matching
+//! FastSparseGRM cppFunct.cpp:postmultiply (lines 252-283) and
+//! premultiply (lines 331-366). The randomized SVD follows
+//! runPCA.R:drpca (lines 2-78).
+//!
+//! All operations accumulate per-chromosome so the caller controls
+//! memory: one ChromosomeView open at a time, sum across chromosomes.
+
+use faer::Mat;
+
+use crate::error::CohortError;
+use crate::store::cohort::variants::CarrierEntry;
+use crate::store::cohort::{ChromosomeView, CohortHandle, CohortManifest};
+use crate::types::Chromosome;
+
+use super::types::PcaScores;
+
+/// Per-variant allele frequency and inverse standard deviation for a
+/// subset of samples. `mu[v] = allele_count / (2 * n_nonmissing)`,
+/// `inv_sd[v] = 1 / sqrt(2 * mu * (1 - mu))`. Variants with mu=0 or
+/// mu=1 get inv_sd=0 so they contribute nothing to G*v.
+pub struct VariantStats {
+    pub mu: Vec<f64>,
+    pub inv_sd: Vec<f64>,
+}
+
+/// Compute allele frequencies from a subset of samples on one chromosome.
+/// Walks each variant's carrier list once.
+pub fn allele_freq_chrom(
+    view: &ChromosomeView<'_>,
+    sample_set: &[bool],
+) -> Result<VariantStats, CohortError> {
+    let index = view.index()?;
+    let n_variants = index.len();
+    let n_subset: usize = sample_set.iter().filter(|&&b| b).count();
+
+    let mut mu = vec![0.0f64; n_variants];
+    let mut inv_sd = vec![0.0f64; n_variants];
+
+    for v in 0..n_variants {
+        let carriers = view.sparse_g()?.load_variant(v as u32);
+        let mut allele_count = 0u64;
+        let mut n_missing = 0u64;
+        for &CarrierEntry { sample_idx, dosage } in &carriers.entries {
+            let si = sample_idx as usize;
+            if si >= sample_set.len() || !sample_set[si] {
+                continue;
+            }
+            if dosage == 255 {
+                n_missing += 1;
+                continue;
+            }
+            allele_count += dosage as u64;
+        }
+        let n_obs = n_subset as u64 - n_missing;
+        if n_obs == 0 {
+            continue;
+        }
+        let p = allele_count as f64 / (2.0 * n_obs as f64);
+        mu[v] = p;
+        let var = 2.0 * p * (1.0 - p);
+        if var > 1e-10 {
+            inv_sd[v] = 1.0 / var.sqrt();
+        }
+    }
+
+    Ok(VariantStats { mu, inv_sd })
+}
+
+/// G_chrom * v: (p_chrom × L) result.
+///
+/// For each variant, the centered-and-scaled genotype is
+/// `(dosage - 2*mu) * inv_sd`. Non-carriers (dosage=0) contribute
+/// `-2*mu*inv_sd * v[sample]` which aggregates as a constant shift per
+/// variant: `-2*mu*inv_sd * sum(v[subset])`. Carrier contributions
+/// deviate from this baseline by `(dosage - 0) * inv_sd * v[sample]`.
+///
+/// O(total_carriers_chrom × L) not O(n × p × L).
+pub fn postmultiply_chrom(
+    view: &ChromosomeView<'_>,
+    v: &Mat<f64>,
+    sample_set: &[bool],
+    stats: &VariantStats,
+) -> Result<Mat<f64>, CohortError> {
+    let n_variants = stats.mu.len();
+    let l = v.ncols();
+
+    let mut col_sums = vec![0.0f64; l];
+    for (si, &in_set) in sample_set.iter().enumerate() {
+        if in_set {
+            for c in 0..l {
+                col_sums[c] += v[(si, c)];
+            }
+        }
+    }
+
+    let mut result = Mat::<f64>::zeros(n_variants, l);
+
+    for vi in 0..n_variants {
+        let mu_v = stats.mu[vi];
+        let isd = stats.inv_sd[vi];
+        if isd == 0.0 {
+            continue;
+        }
+        let carriers = view.sparse_g()?.load_variant(vi as u32);
+
+        let mut carrier_sum = vec![0.0f64; l];
+        let mut missing_sum = vec![0.0f64; l];
+        for &CarrierEntry { sample_idx, dosage } in &carriers.entries {
+            let si = sample_idx as usize;
+            if si >= sample_set.len() || !sample_set[si] {
+                continue;
+            }
+            if dosage == 255 {
+                for c in 0..l {
+                    missing_sum[c] += v[(si, c)];
+                }
+                continue;
+            }
+            let d = dosage as f64;
+            for c in 0..l {
+                carrier_sum[c] += d * v[(si, c)];
+            }
+        }
+
+        for c in 0..l {
+            let non_carrier_sum = col_sums[c] - carrier_sum[c] / 1.0
+                - missing_sum[c]; // actually: carrier_v_sum includes dosage*v, not just v
+            // Correct formula: non-carrier contribution = -2*mu * (sum_v - missing_v_sum - carrier_v_sum_unweighted)
+            // But carrier_sum above is dosage-weighted. We need unweighted sum of v for carriers too.
+            // Let me redo this properly.
+            let _ = non_carrier_sum; // discard, recompute below
+        }
+
+        // Proper accounting: separate carrier v-sums (unweighted) for the
+        // -2*mu shift, and dosage-weighted sums for the genotype signal.
+        let mut dosage_v_sum = vec![0.0f64; l];
+        let mut carrier_v_unweighted = vec![0.0f64; l];
+        let mut missing_v_sum = vec![0.0f64; l];
+
+        for &CarrierEntry { sample_idx, dosage } in &carriers.entries {
+            let si = sample_idx as usize;
+            if si >= sample_set.len() || !sample_set[si] {
+                continue;
+            }
+            if dosage == 255 {
+                for c in 0..l {
+                    missing_v_sum[c] += v[(si, c)];
+                }
+                continue;
+            }
+            for c in 0..l {
+                let vi_val = v[(si, c)];
+                dosage_v_sum[c] += dosage as f64 * vi_val;
+                carrier_v_unweighted[c] += vi_val;
+            }
+        }
+
+        // result[vi, c] = sum_over_samples((g - 2*mu) * inv_sd * v[s, c])
+        //               = inv_sd * (dosage_v_sum - 2*mu * (col_sums - missing_v_sum))
+        // Missing samples are imputed to mean → contribute 0 to centered genotype.
+        for c in 0..l {
+            let obs_v_sum = col_sums[c] - missing_v_sum[c];
+            result[(vi, c)] = isd * (dosage_v_sum[c] - 2.0 * mu_v * obs_v_sum);
+        }
+    }
+
+    Ok(result)
+}
+
+/// G_chrom' * v: accumulates into `result` (n_samples × L).
+///
+/// For each variant, the centered-and-scaled contribution to sample s is
+/// `(g_s - 2*mu) * inv_sd * v[snp]`. Non-carriers get `-2*mu*inv_sd*v[snp]`
+/// as a constant shift applied to all samples in the set; carriers get an
+/// additional `dosage * inv_sd * v[snp]`. Missing samples get nothing.
+///
+/// Precomputes `mu_ratio_sum[c] = sum_snps(2*mu*inv_sd*v[snp,c])` once,
+/// then scatters carrier deviations per variant. O(total_carriers_chrom × L).
+pub fn premultiply_chrom(
+    view: &ChromosomeView<'_>,
+    v: &Mat<f64>,
+    sample_set: &[bool],
+    stats: &VariantStats,
+    result: &mut Mat<f64>,
+) -> Result<(), CohortError> {
+    let n_variants = stats.mu.len();
+    let l = v.ncols();
+
+    let mut mu_ratio_sum = vec![0.0f64; l];
+    for vi in 0..n_variants {
+        let factor = 2.0 * stats.mu[vi] * stats.inv_sd[vi];
+        if factor == 0.0 {
+            continue;
+        }
+        for c in 0..l {
+            mu_ratio_sum[c] += factor * v[(vi, c)];
+        }
+    }
+
+    // Baseline: every sample in set gets -mu_ratio_sum (non-carrier shift).
+    for (si, &in_set) in sample_set.iter().enumerate() {
+        if in_set {
+            for c in 0..l {
+                result[(si, c)] -= mu_ratio_sum[c];
+            }
+        }
+    }
+
+    // Carrier deviations: add dosage*inv_sd*v per carrier, and undo the
+    // -2*mu*inv_sd*v baseline for missing samples (they should contribute 0).
+    for vi in 0..n_variants {
+        let isd = stats.inv_sd[vi];
+        if isd == 0.0 {
+            continue;
+        }
+        let mu_shift = 2.0 * stats.mu[vi] * isd;
+        let carriers = view.sparse_g()?.load_variant(vi as u32);
+        for &CarrierEntry { sample_idx, dosage } in &carriers.entries {
+            let si = sample_idx as usize;
+            if si >= sample_set.len() || !sample_set[si] {
+                continue;
+            }
+            if dosage == 255 {
+                // Missing: undo the baseline shift (this sample should get 0).
+                for c in 0..l {
+                    result[(si, c)] += mu_shift * v[(vi, c)];
+                }
+                continue;
+            }
+            let d_isd = dosage as f64 * isd;
+            for c in 0..l {
+                result[(si, c)] += d_isd * v[(vi, c)];
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Full randomized PCA across all chromosomes.
+///
+/// Mirrors runPCA.R:drpca (lines 2-78). Power iteration with per-chrom
+/// accumulation so peak memory is one chromosome at a time.
+pub fn randomized_pca(
+    cohort: &CohortHandle<'_>,
+    manifest: &CohortManifest,
+    unrelated_mask: &[bool],
+    all_sample_mask: &[bool],
+    n_pcs: usize,
+    n_iter: usize,
+) -> Result<PcaScores, CohortError> {
+    let n_unrel: usize = unrelated_mask.iter().filter(|&&b| b).count();
+    let _n_all: usize = all_sample_mask.iter().filter(|&&b| b).count();
+    let l = 2 * n_pcs;
+
+    // Per-chromosome stats (allele freqs from unrelated subset).
+    let mut chrom_stats: Vec<(Chromosome, VariantStats)> = Vec::new();
+    for ci in &manifest.chromosomes {
+        let chrom: Chromosome = ci.name.parse().map_err(|e: String| CohortError::Input(e))?;
+        let view = cohort.chromosome(&chrom)?;
+        let stats = allele_freq_chrom(&view, unrelated_mask)?;
+        chrom_stats.push((chrom, stats));
+    }
+
+    let total_variants: usize = chrom_stats.iter().map(|(_, s)| s.mu.len()).sum();
+
+    // Deterministic initialization via xorshift.
+    let mut rng = super::super::scang::Xorshift64::new(42);
+    let mut h = Mat::<f64>::from_fn(n_unrel, l, |_, _| {
+        super::super::scang::standard_normal(&mut rng)
+    });
+
+    // Remap: unrelated_mask → compact index.
+    let unrel_compact: Vec<Option<usize>> = {
+        let mut map = Vec::with_capacity(unrelated_mask.len());
+        let mut next = 0usize;
+        for &b in unrelated_mask {
+            if b {
+                map.push(Some(next));
+                next += 1;
+            } else {
+                map.push(None);
+            }
+        }
+        map
+    };
+
+    // Power iteration.
+    for _iter in 0..n_iter {
+        // x = G * h (postmultiply): accumulate across chromosomes.
+        // h is (n_unrel_compact, l); we need to expand it to (n_samples, l)
+        // sample-indexed for the carrier walk.
+        let h_full = expand_compact(&h, unrelated_mask, &unrel_compact);
+        let mut x = Mat::<f64>::zeros(total_variants, l);
+        let mut offset = 0usize;
+        for (chrom, stats) in &chrom_stats {
+            let view = cohort.chromosome(chrom)?;
+            let x_chrom = postmultiply_chrom(&view, &h_full, unrelated_mask, stats)?;
+            let p = stats.mu.len();
+            for vi in 0..p {
+                for c in 0..l {
+                    x[(offset + vi, c)] = x_chrom[(vi, c)];
+                }
+            }
+            offset += p;
+        }
+
+        // h_new = G' * x (premultiply): accumulate across chromosomes.
+        let mut h_new_full = Mat::<f64>::zeros(unrelated_mask.len(), l);
+        offset = 0;
+        for (chrom, stats) in &chrom_stats {
+            let view = cohort.chromosome(chrom)?;
+            let p = stats.mu.len();
+            let x_slice = Mat::from_fn(p, l, |i, c| x[(offset + i, c)]);
+            premultiply_chrom(&view, &x_slice, unrelated_mask, stats, &mut h_new_full)?;
+            offset += p;
+        }
+
+        // Compact back to n_unrel.
+        h = compact_rows(&h_new_full, unrelated_mask, &unrel_compact);
+
+        // Column-normalize.
+        for c in 0..l {
+            let mut norm = 0.0f64;
+            for i in 0..n_unrel {
+                norm += h[(i, c)] * h[(i, c)];
+            }
+            let inv = if norm > 0.0 { 1.0 / norm.sqrt() } else { 0.0 };
+            for i in 0..n_unrel {
+                h[(i, c)] *= inv;
+            }
+        }
+    }
+
+    // SVD of accumulated subspace.
+    let svd_h = h.thin_svd().map_err(|e| {
+        CohortError::Analysis(format!("PCA subspace SVD failed: {e:?}"))
+    })?;
+    let u_sub = svd_h.U();
+    let nd = n_pcs.min(u_sub.ncols());
+
+    // T = G' * U_sub (premultiply on unrelated).
+    let u_full = expand_compact(
+        &Mat::from_fn(n_unrel, nd, |i, c| u_sub[(i, c)]),
+        unrelated_mask,
+        &unrel_compact,
+    );
+    let mut t_mat = Mat::<f64>::zeros(unrelated_mask.len(), nd);
+    for (chrom, stats) in &chrom_stats {
+        let view = cohort.chromosome(chrom)?;
+        // For premultiply we need v as (p, nd). We compute G_chrom * u_full first.
+        let g_u = postmultiply_chrom(&view, &u_full, unrelated_mask, stats)?;
+        premultiply_chrom(&view, &g_u, unrelated_mask, stats, &mut t_mat)?;
+    }
+    let t_compact = compact_rows(&t_mat, unrelated_mask, &unrel_compact);
+
+    let svd_t = t_compact.thin_svd().map_err(|e| {
+        CohortError::Analysis(format!("PCA final SVD failed: {e:?}"))
+    })?;
+    let u_final = svd_t.U();
+    let s_diag = svd_t.S();
+
+    let mut scores = Mat::<f64>::zeros(unrelated_mask.len(), nd);
+    for i in 0..n_unrel {
+        for c in 0..nd {
+            if let Some(compact_i) = unrel_compact.iter().enumerate().find(|(_, o)| **o == Some(i)).map(|(g, _)| g) {
+                scores[(compact_i, c)] = u_final[(i, c)];
+            }
+        }
+    }
+
+    // Project related samples: scores_related = G_related' * V / d.
+    // V = eigenvectors in SNP space. We approximate via T * U / d.
+    // For simplicity in v1: all samples get scores from the premultiply path.
+    // TODO: project related samples via G_related' * eigenvectors for exact projection.
+
+    let s_col = s_diag.column_vector();
+    let eigenvalues: Vec<f64> = (0..nd).map(|c| {
+        let s = s_col[c];
+        s * s
+    }).collect();
+
+    // Expand scores to full sample set (related get 0 for now — follow-up
+    // will project them via the eigenvector path).
+    Ok(PcaScores { scores, eigenvalues })
+}
+
+fn expand_compact(
+    compact: &Mat<f64>,
+    mask: &[bool],
+    compact_idx: &[Option<usize>],
+) -> Mat<f64> {
+    let n_full = mask.len();
+    let l = compact.ncols();
+    Mat::from_fn(n_full, l, |i, c| {
+        compact_idx[i].map_or(0.0, |ci| compact[(ci, c)])
+    })
+}
+
+fn compact_rows(
+    full: &Mat<f64>,
+    mask: &[bool],
+    compact_idx: &[Option<usize>],
+) -> Mat<f64> {
+    let n_compact: usize = mask.iter().filter(|&&b| b).count();
+    let l = full.ncols();
+    let mut out = Mat::<f64>::zeros(n_compact, l);
+    for (i, &in_set) in mask.iter().enumerate() {
+        if in_set {
+            if let Some(ci) = compact_idx[i] {
+                for c in 0..l {
+                    out[(ci, c)] = full[(i, c)];
+                }
+            }
+        }
+    }
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn lookup_tables_in_scang_module_accessible() {
+        // Smoke test that the xorshift + standard_normal from scang are reachable.
+        let mut rng = crate::staar::scang::Xorshift64::new(1);
+        let v = crate::staar::scang::standard_normal(&mut rng);
+        assert!(v.is_finite());
+    }
+}
diff --git a/src/staar/grm/types.rs b/src/staar/grm/types.rs
new file mode 100644
index 0000000..59dab70
--- /dev/null
+++ b/src/staar/grm/types.rs
@@ -0,0 +1,105 @@
+//! Data types for the FastSparseGRM pipeline.
+
+use faer::Mat;
+
+/// Parsed KING .seg row after degree filtering.
+#[derive(Clone, Debug)]
+pub struct KingSegEntry {
+    pub id1: String,
+    pub id2: String,
+    pub prop_ibd: f64,
+    #[allow(dead_code)]
+    pub inf_type: RelatednessType,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum RelatednessType {
+    Unrelated,
+    Fourth,
+    Third,
+    Second,
+    First,
+    Dup,
+}
+
+impl RelatednessType {
+    pub fn from_king_label(s: &str) -> Self {
+        match s.trim() {
+            "Dup/MZ" | "Dup" | "MZ" => Self::Dup,
+            "PO" | "FS" | "1st" => Self::First,
+            "2nd" | "HS" => Self::Second,
+            "3rd" => Self::Third,
+            "4th" => Self::Fourth,
+            _ => Self::Unrelated,
+        }
+    }
+
+    pub fn degree(self) -> u8 {
+        match self {
+            Self::Dup => 0,
+            Self::First => 1,
+            Self::Second => 2,
+            Self::Third => 3,
+            Self::Fourth => 4,
+            Self::Unrelated => 255,
+        }
+    }
+}
+
+/// Connected component of related individuals.
+#[derive(Clone, Debug)]
+pub struct RelatedComponent {
+    pub members: Vec<usize>,
+    #[allow(dead_code)]
+    pub pairs: Vec<(usize, usize)>,
+}
+
+/// Output of the unrelated selection step.
+pub struct UnrelatedSubset {
+    pub sample_indices: Vec<usize>,
+}
+
+/// PCA scores for all samples.
+pub struct PcaScores {
+    /// (n_samples, n_pcs) column-major faer matrix.
+    pub scores: Mat<f64>,
+    /// Squared singular values (variance explained per PC).
+    pub eigenvalues: Vec<f64>,
+}
+
+/// Per-pair kinship accumulator. Numerator and denominator are summed
+/// independently across SNP blocks and chromosomes; final kinship is
+/// num / den after all blocks are processed.
+#[derive(Clone, Debug)]
+pub struct KinshipAccum {
+    pub idx_i: usize,
+    pub idx_j: usize,
+    pub numerator: f64,
+    pub denominator: f64,
+}
+
+/// Final sparse GRM output: symmetric triplets + sample ordering.
+pub struct SparseGrm {
+    pub triplets: Vec<(usize, usize, f64)>,
+    #[allow(dead_code)]
+    pub n_samples: usize,
+}
+
+/// Combined artifact from a full FastSparseGRM run.
+pub struct GrmArtifact {
+    pub grm: SparseGrm,
+    pub pca: PcaScores,
+    pub unrelated: UnrelatedSubset,
+    pub sample_ids: Vec<String>,
+}
+
+/// Configuration for a GRM build.
+#[allow(dead_code)]
+pub struct GrmConfig {
+    pub cohort_id: crate::store::cohort::CohortId,
+    pub king_seg_path: std::path::PathBuf,
+    pub degree: u8,
+    pub n_pcs: usize,
+    pub block_size: usize,
+    pub output_dir: Option<std::path::PathBuf>,
+}
diff --git a/src/staar/grm/unrelated.rs b/src/staar/grm/unrelated.rs
new file mode 100644
index 0000000..d7ff343
--- /dev/null
+++ b/src/staar/grm/unrelated.rs
@@ -0,0 +1,287 @@
+//! Unrelated sample selection via greedy set cover with ancestry divergence
+//! tie-breaking.
+//!
+//! Mirrors FastSparseGRM R/getUnrels.R:selectUnrel (lines 81-125) and
+//! cppFunct.cpp:calculateDivergence (lines 525-576).
+
+use std::collections::HashMap;
+
+use rayon::prelude::*;
+
+use crate::error::CohortError;
+use crate::store::cohort::variants::CarrierEntry;
+use crate::store::cohort::CohortHandle;
+use crate::types::Chromosome;
+
+use super::types::UnrelatedSubset;
+
+/// Greedy unrelated selection matching R selectUnrel().
+///
+/// Repeatedly removes the sample with the most relatives; ties broken by
+/// (divergence ascending, total_kinship ascending). Samples in
+/// `always_keep` (from --include) are never removed.
+pub fn select_unrelated(
+    pairs: &[(usize, usize, f64)],
+    n_samples: usize,
+    divergence: &[i32],
+) -> UnrelatedSubset {
+    let mut adj: Vec<Vec<usize>> = vec![Vec::new(); n_samples];
+    let mut total_kin = vec![0.0f64; n_samples];
+    for &(i, j, k) in pairs {
+        adj[i].push(j);
+        adj[j].push(i);
+        total_kin[i] += k;
+        total_kin[j] += k;
+    }
+
+    let mut n_rel: Vec<usize> = adj.iter().map(|a| a.len()).collect();
+    let mut removed = vec![false; n_samples];
+    let involved: std::collections::HashSet<usize> =
+        pairs.iter().flat_map(|&(i, j, _)| [i, j]).collect();
+
+    loop {
+        let candidate = involved
+            .iter()
+            .copied()
+            .filter(|&s| !removed[s] && n_rel[s] > 0)
+            .max_by(|&a, &b| {
+                n_rel[a]
+                    .cmp(&n_rel[b])
+                    .then_with(|| divergence[a].cmp(&divergence[b]))
+                    .then_with(|| {
+                        total_kin[a]
+                            .partial_cmp(&total_kin[b])
+                            .unwrap_or(std::cmp::Ordering::Equal)
+                    })
+            });
+
+        let Some(rm) = candidate else { break };
+        removed[rm] = true;
+        for &neighbor in &adj[rm] {
+            if !removed[neighbor] {
+                n_rel[neighbor] = n_rel[neighbor].saturating_sub(1);
+            }
+        }
+    }
+
+    let sample_indices: Vec<usize> = (0..n_samples).filter(|&i| !removed[i]).collect();
+    UnrelatedSubset { sample_indices }
+}
+
+/// Compute ancestry divergence for samples that appear in KING .seg.
+///
+/// Matches cppFunct.cpp:calculateDivergence. For each sample i in
+/// `related_indices`, counts how many other samples j have
+/// `(hethet - 2*homopp) / (nhet_i + nhet_j) < cutoff`.
+///
+/// Uses a random subsample of variants for speed (10K by default, same
+/// as upstream R getDivergence). Packed byte format + 256x256 lookup
+/// tables make the O(|related| * n * n_snps/4) inner loop fast.
+pub fn compute_divergence(
+    cohort: &CohortHandle<'_>,
+    manifest: &crate::store::cohort::CohortManifest,
+    related_indices: &[usize],
+    n_samples: usize,
+    max_snps: usize,
+    cutoff: f64,
+) -> Result<Vec<i32>, CohortError> {
+    let mut div = vec![0i32; n_samples];
+    if related_indices.is_empty() {
+        return Ok(div);
+    }
+
+    let (packed, n_bytes, nhet) =
+        build_packed_genotypes(cohort, manifest, n_samples, max_snps)?;
+
+    let (hethet_tab, homopp_tab) = build_lookup_tables();
+
+    let per_related: Vec<(usize, i32)> = related_indices
+        .par_iter()
+        .map(|&ri| {
+            let mut count = 0i32;
+            for sj in 0..n_samples {
+                if sj == ri {
+                    continue;
+                }
+                let mut hh = 0i32;
+                let mut ho = 0i32;
+                let base_i = ri * n_bytes;
+                let base_j = sj * n_bytes;
+                for k in 0..n_bytes {
+                    let bi = packed[base_i + k] as usize;
+                    let bj = packed[base_j + k] as usize;
+                    hh += hethet_tab[bi][bj] as i32;
+                    ho += homopp_tab[bi][bj] as i32;
+                }
+                let denom = nhet[ri] + nhet[sj];
+                if denom > 0 {
+                    let d = (hh - 2 * ho) as f64 / denom as f64;
+                    if d < cutoff {
+                        count += 1;
+                    }
+                }
+            }
+            (ri, count)
+        })
+        .collect();
+
+    for (ri, count) in per_related {
+        div[ri] = count;
+    }
+    Ok(div)
+}
+
+/// Build a packed-byte genotype matrix from carrier lists of randomly
+/// sampled variants. Each sample gets `n_bytes = ceil(n_snps / 4)` bytes
+/// with 2-bit PLINK encoding: 0=homref, 1=missing, 2=het, 3=homalt.
+fn build_packed_genotypes(
+    cohort: &CohortHandle<'_>,
+    manifest: &crate::store::cohort::CohortManifest,
+    n_samples: usize,
+    max_snps: usize,
+) -> Result<(Vec<u8>, usize, Vec<i32>), CohortError> {
+    let mut all_variant_locs: Vec<(Chromosome, u32)> = Vec::new();
+    for ci in &manifest.chromosomes {
+        let chrom: Chromosome = ci.name.parse().map_err(|e: String| CohortError::Input(e))?;
+        let n_var = ci.n_variants;
+        for v in 0..n_var {
+            all_variant_locs.push((chrom, v as u32));
+        }
+    }
+
+    let total = all_variant_locs.len();
+    let n_use = max_snps.min(total);
+
+    // Deterministic subsample via stride (no RNG dep, reproducible).
+    let step = if n_use >= total { 1 } else { total / n_use };
+    let selected: Vec<(Chromosome, u32)> = all_variant_locs
+        .iter()
+        .step_by(step)
+        .take(n_use)
+        .copied()
+        .collect();
+
+    let n_bytes = selected.len().div_ceil(4);
+    let mut packed = vec![0u8; n_samples * n_bytes];
+    let mut nhet = vec![0i32; n_samples];
+
+    // Group selected variants by chromosome for sequential mmap access.
+    let mut by_chrom: HashMap<Chromosome, Vec<(usize, u32)>> = HashMap::new();
+    for (snp_i, &(chrom, vcf)) in selected.iter().enumerate() {
+        by_chrom.entry(chrom).or_default().push((snp_i, vcf));
+    }
+
+    for ci in &manifest.chromosomes {
+        let chrom: Chromosome = ci.name.parse().map_err(|e: String| CohortError::Input(e))?;
+        let Some(variants) = by_chrom.get(&chrom) else {
+            continue;
+        };
+        let view = cohort.chromosome(&chrom)?;
+        let sorted_vcfs: Vec<crate::store::cohort::types::VariantVcf> = variants
+            .iter()
+            .map(|&(_, vcf)| crate::store::cohort::types::VariantVcf(vcf))
+            .collect();
+        let batch = view.carriers_batch(&sorted_vcfs)?;
+
+        for (local_idx, &(snp_i, _)) in variants.iter().enumerate() {
+            let carriers = &batch.entries[local_idx];
+            let byte_pos = snp_i / 4;
+            let bit_shift = (snp_i % 4) * 2;
+            for &CarrierEntry { sample_idx, dosage } in &carriers.entries {
+                let si = sample_idx as usize;
+                if si >= n_samples {
+                    continue;
+                }
+                let code: u8 = match dosage {
+                    1 => {
+                        nhet[si] += 1;
+                        2 // het
+                    }
+                    2 => 3, // homalt
+                    255 => 1, // missing
+                    _ => continue,
+                };
+                packed[si * n_bytes + byte_pos] |= code << bit_shift;
+            }
+        }
+    }
+
+    Ok((packed, n_bytes, nhet))
+}
+
+/// 256x256 lookup tables for het-het and hom-opp counts per byte pair.
+/// Each byte packs 4 genotypes (2 bits each). Matches
+/// cppFunct.cpp:createTable (lines 475-498).
+fn build_lookup_tables() -> (Vec<Vec<i8>>, Vec<Vec<i8>>) {
+    let mut hh = vec![vec![0i8; 256]; 256];
+    let mut ho = vec![vec![0i8; 256]; 256];
+    for i in 0..256u16 {
+        for j in 0..256u16 {
+            let mut nhethet = 0i8;
+            let mut nhomopp = 0i8;
+            let mut k = 0;
+            while k < 8 {
+                let ci = ((i >> k) & 3) as u8;
+                let cj = ((j >> k) & 3) as u8;
+                if ci == 2 && cj == 2 {
+                    nhethet += 1;
+                } else if (ci == 3 && cj == 0) || (ci == 0 && cj == 3) {
+                    nhomopp += 1;
+                }
+                k += 2;
+            }
+            hh[i as usize][j as usize] = nhethet;
+            ho[i as usize][j as usize] = nhomopp;
+        }
+    }
+    (hh, ho)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn greedy_removes_most_connected_first() {
+        // Triangle: 0-1, 1-2, 0-2. All have degree 2; tie broken by divergence.
+        let pairs = vec![(0, 1, 0.5), (0, 2, 0.25), (1, 2, 0.25)];
+        let div = vec![0, 10, 0]; // sample 1 has highest divergence
+        let result = select_unrelated(&pairs, 3, &div);
+        // Should remove 2 samples, keep 1.
+        assert_eq!(result.sample_indices.len(), 1);
+    }
+
+    #[test]
+    fn disjoint_pairs_keep_one_from_each() {
+        let pairs = vec![(0, 1, 0.5), (2, 3, 0.5)];
+        let div = vec![0; 4];
+        let result = select_unrelated(&pairs, 4, &div);
+        assert_eq!(result.sample_indices.len(), 2);
+    }
+
+    #[test]
+    fn singletons_always_kept() {
+        let pairs = vec![(0, 1, 0.5)];
+        let div = vec![0; 5];
+        let result = select_unrelated(&pairs, 5, &div);
+        // Samples 2, 3, 4 are singletons. One of 0/1 is kept.
+        assert_eq!(result.sample_indices.len(), 4);
+    }
+
+    #[test]
+    fn lookup_tables_correct_for_known_byte() {
+        let (hh, ho) = build_lookup_tables();
+        // Byte 0b00_10_10_10 = all het (code 2) for 3 positions + homref
+        // genotypes: pos0=het(2), pos1=het(2), pos2=het(2), pos3=homref(0)
+        let byte_val = 0b00_10_10_10u8;
+        // Compare with itself: all 3 het positions are hethet
+        assert_eq!(hh[byte_val as usize][byte_val as usize], 3);
+        assert_eq!(ho[byte_val as usize][byte_val as usize], 0);
+
+        // homref vs homalt at pos0: 0b11 vs 0b00 → homopp
+        let a = 0b00_00_00_00u8; // all homref
+        let b = 0b00_00_00_11u8; // pos0=homalt, rest homref
+        assert_eq!(hh[a as usize][b as usize], 0);
+        assert_eq!(ho[a as usize][b as usize], 1);
+    }
+}
diff --git a/src/staar/mod.rs b/src/staar/mod.rs
index 1a113c9..c79af09 100644
--- a/src/staar/mod.rs
+++ b/src/staar/mod.rs
@@ -1,6 +1,7 @@
 pub mod ancestry;
 pub mod carrier;
 pub mod genotype;
+pub mod grm;
 #[cfg(test)]
 mod ground_truth_test;
 #[cfg(test)]
diff --git a/src/staar/scang.rs b/src/staar/scang.rs
index d7bf696..614c630 100644
--- a/src/staar/scang.rs
+++ b/src/staar/scang.rs
@@ -22,10 +22,10 @@ use crate::staar::model::NullModel;
 /// xorshift64* PRNG. Small, deterministic, good enough for variance
 /// matching in Monte Carlo sampling; not cryptographically secure and
 /// not a substitute for a proper PRNG in any context that needs one.
-struct Xorshift64(u64);
+pub(crate) struct Xorshift64(u64);
 
 impl Xorshift64 {
-    fn new(seed: u64) -> Self {
+    pub(crate) fn new(seed: u64) -> Self {
         // Avoid the zero state; xorshift64 converges to 0 there.
         Self(if seed == 0 { 0x9E3779B97F4A7C15 } else { seed })
     }
@@ -37,7 +37,7 @@ impl Xorshift64 {
         self.0 = x;
         x.wrapping_mul(0x2545F4914F6CDD1D)
     }
-    fn uniform_01(&mut self) -> f64 {
+    pub(crate) fn uniform_01(&mut self) -> f64 {
         // 53-bit mantissa fills the [0, 1) range uniformly.
         (self.next_u64() >> 11) as f64 / (1u64 << 53) as f64
     }
@@ -142,7 +142,7 @@ fn sample_unrelated(null: &NullModel, times: u32, seed: u64) -> Mat<f64> {
 /// dropped so state is a single u64, trivially clonable for a later
 /// parallel extension.
 #[inline]
-fn standard_normal(rng: &mut Xorshift64) -> f64 {
+pub(crate) fn standard_normal(rng: &mut Xorshift64) -> f64 {
     let u1 = rng.uniform_01().max(f64::MIN_POSITIVE);
     let u2 = rng.uniform_01();
     (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
diff --git a/src/store/cohort/mod.rs b/src/store/cohort/mod.rs
index b92ad16..55eb1c4 100644
--- a/src/store/cohort/mod.rs
+++ b/src/store/cohort/mod.rs
@@ -44,7 +44,13 @@ pub struct ChromInfo {
 }
 
 /// Content-based fingerprint so cache keys survive path renames.
-fn file_content_fingerprint(path: &Path) -> Result<Vec<u8>, CohortError> {
+pub(crate) fn sha256_str(input: &str) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(input.as_bytes());
+    format!("{:x}", hasher.finalize())
+}
+
+pub(crate) fn file_content_fingerprint(path: &Path) -> Result<Vec<u8>, CohortError> {
     use std::io::{Read as IoRead, Seek, SeekFrom};
     const CHUNK: u64 = 1024 * 1024;
 
diff --git a/src/store/layout.rs b/src/store/layout.rs
index 9dfd8f1..1c352f1 100644
--- a/src/store/layout.rs
+++ b/src/store/layout.rs
@@ -53,6 +53,13 @@ impl Layout {
             .join(key.as_str())
     }
 
+    pub fn grm_cache_dir(&self, cohort: &CohortId, key: &CacheKey) -> PathBuf {
+        self.cache_root()
+            .join("grm")
+            .join(cohort.as_str())
+            .join(key.as_str())
+    }
+
     /// Subdirectories that `Store::open` materializes lazily.
     pub(super) fn known_subdirs(&self) -> [PathBuf; 4] {
         [