sunlabuiuc · jhnwu3 · Nov 30, 2025 · Nov 23, 2025 · Nov 24, 2025
diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
@@ -48,5 +48,8 @@ Available Datasets
     datasets/pyhealth.datasets.ChestXray14Dataset
     datasets/pyhealth.datasets.TUABDataset
     datasets/pyhealth.datasets.TUEVDataset
+    datasets/pyhealth.datasets.ClinVarDataset
+    datasets/pyhealth.datasets.COSMICDataset
+    datasets/pyhealth.datasets.TCGAPRADDataset
     datasets/pyhealth.datasets.splitter
     datasets/pyhealth.datasets.utils
diff --git a/docs/api/datasets/pyhealth.datasets.COSMICDataset.rst b/docs/api/datasets/pyhealth.datasets.COSMICDataset.rst
@@ -0,0 +1,9 @@
+pyhealth.datasets.COSMICDataset
+===============================
+
+The COSMIC (Catalogue of Somatic Mutations in Cancer) dataset provides comprehensive information about somatic mutations in human cancers. For more information see `COSMIC <https://cancer.sanger.ac.uk/cosmic>`_. This dataset was contributed as part of the Prostate-VarBench benchmarking work (`arXiv:2511.09576 <https://arxiv.org/abs/2511.09576>`_).
+
+.. autoclass:: pyhealth.datasets.COSMICDataset
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/api/datasets/pyhealth.datasets.ClinVarDataset.rst b/docs/api/datasets/pyhealth.datasets.ClinVarDataset.rst
@@ -0,0 +1,9 @@
+pyhealth.datasets.ClinVarDataset
+================================
+
+The ClinVar dataset provides information about genomic variants and their clinical significance based on ACMG/AMP guidelines. For more information see `ClinVar <https://www.ncbi.nlm.nih.gov/clinvar/>`_. This dataset was contributed as part of the Prostate-VarBench benchmarking work (`arXiv:2511.09576 <https://arxiv.org/abs/2511.09576>`_).
+
+.. autoclass:: pyhealth.datasets.ClinVarDataset
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/api/datasets/pyhealth.datasets.TCGAPRADDataset.rst b/docs/api/datasets/pyhealth.datasets.TCGAPRADDataset.rst
@@ -0,0 +1,9 @@
+pyhealth.datasets.TCGAPRADDataset
+=================================
+
+The Cancer Genome Atlas Prostate Adenocarcinoma (TCGA-PRAD) dataset provides multi-omics data including somatic mutations and clinical information for prostate cancer patients. For more information see `TCGA-PRAD <https://portal.gdc.cancer.gov/projects/TCGA-PRAD>`_. This dataset was contributed as part of the Prostate-VarBench benchmarking work (`arXiv:2511.09576 <https://arxiv.org/abs/2511.09576>`_).
+
+.. autoclass:: pyhealth.datasets.TCGAPRADDataset
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/api/tasks.rst b/docs/api/tasks.rst
@@ -95,3 +95,7 @@ Available Tasks
     Benchmark EHRShot <tasks/pyhealth.tasks.benchmark_ehrshot>
     ChestX-ray14 Binary Classification <tasks/pyhealth.tasks.ChestXray14BinaryClassification>
     ChestX-ray14 Multilabel Classification <tasks/pyhealth.tasks.ChestXray14MultilabelClassification>
+    Variant Classification (ClinVar) <tasks/pyhealth.tasks.VariantClassificationClinVar>
+    Mutation Pathogenicity (COSMIC) <tasks/pyhealth.tasks.MutationPathogenicityPrediction>
+    Cancer Survival Prediction (TCGA) <tasks/pyhealth.tasks.CancerSurvivalPrediction>
+    Cancer Mutation Burden (TCGA) <tasks/pyhealth.tasks.CancerMutationBurden>
diff --git a/docs/api/tasks/pyhealth.tasks.CancerMutationBurden.rst b/docs/api/tasks/pyhealth.tasks.CancerMutationBurden.rst
@@ -0,0 +1,7 @@
+pyhealth.tasks.CancerMutationBurden
+===================================
+
+.. autoclass:: pyhealth.tasks.CancerMutationBurden
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/api/tasks/pyhealth.tasks.CancerSurvivalPrediction.rst b/docs/api/tasks/pyhealth.tasks.CancerSurvivalPrediction.rst
@@ -0,0 +1,7 @@
+pyhealth.tasks.CancerSurvivalPrediction
+=======================================
+
+.. autoclass:: pyhealth.tasks.CancerSurvivalPrediction
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/api/tasks/pyhealth.tasks.MutationPathogenicityPrediction.rst b/docs/api/tasks/pyhealth.tasks.MutationPathogenicityPrediction.rst
@@ -0,0 +1,7 @@
+pyhealth.tasks.MutationPathogenicityPrediction
+==============================================
+
+.. autoclass:: pyhealth.tasks.MutationPathogenicityPrediction
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/api/tasks/pyhealth.tasks.VariantClassificationClinVar.rst b/docs/api/tasks/pyhealth.tasks.VariantClassificationClinVar.rst
@@ -0,0 +1,7 @@
+pyhealth.tasks.VariantClassificationClinVar
+===========================================
+
+.. autoclass:: pyhealth.tasks.VariantClassificationClinVar
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py
@@ -49,6 +49,8 @@ def __init__(self, *args, **kwargs):
 from .base_dataset import BaseDataset
 from .cardiology import CardiologyDataset
 from .chestxray14 import ChestXray14Dataset
+from .clinvar import ClinVarDataset
+from .cosmic import COSMICDataset
 from .covid19_cxr import COVID19CXRDataset
 from .dreamt import DREAMTDataset
 from .ehrshot import EHRShotDataset
@@ -64,6 +66,7 @@ def __init__(self, *args, **kwargs):
 from .sleepedf import SleepEDFDataset
 from .bmd_hs import BMDHSDataset
 from .support2 import Support2Dataset
+from .tcga_prad import TCGAPRADDataset
 from .splitter import (
     split_by_patient,
     split_by_patient_conformal,

diff --git a/pyhealth/datasets/clinvar.py b/pyhealth/datasets/clinvar.py
@@ -0,0 +1,169 @@
+"""ClinVar dataset for PyHealth.
+
+This module provides the ClinVarDataset class for loading and processing
+ClinVar variant data for machine learning tasks.
+"""
+
+import logging
+import os
+from pathlib import Path
+from typing import List, Optional
+
+import pandas as pd
+
+from .base_dataset import BaseDataset
+
+logger = logging.getLogger(__name__)
+
+
+class ClinVarDataset(BaseDataset):
+    """ClinVar dataset for variant classification.
+
+    ClinVar is a freely accessible, public archive of reports of the relationships
+    among human variations and phenotypes, with supporting evidence. This dataset
+    enables variant pathogenicity prediction tasks.
+
+    Dataset is available at:
+    https://ftp.ncbi.nlm.nih.gov/pub/clinvar/
+
+    On construction, if ``clinvar-pyhealth.csv`` is not already present in
+    ``root``, it is generated from the raw ``variant_summary`` export via
+    :meth:`prepare_metadata`.
+
+    Args:
+        root: Root directory of the raw data containing the ClinVar files.
+        tables: Optional additional tables to load after the default
+            ``"variants"`` table.
+        dataset_name: Optional name of the dataset. Defaults to "clinvar".
+        config_path: Optional path to the configuration file. If not provided,
+            uses ``configs/clinvar.yaml`` next to this module.
+        **kwargs: Forwarded unchanged to ``BaseDataset.__init__``.
+
+    Attributes:
+        root: Root directory of the raw data.
+        dataset_name: Name of the dataset.
+        config_path: Path to the configuration file.
+
+    Examples:
+        >>> from pyhealth.datasets import ClinVarDataset
+        >>> dataset = ClinVarDataset(root="/path/to/clinvar")
+        >>> dataset.stats()
+        >>> samples = dataset.set_task()
+        >>> print(samples[0])
+    """
+
+    # Standardized CSV written into ``root``; must match ``file_path`` in
+    # configs/clinvar.yaml.
+    _PYHEALTH_CSV = "clinvar-pyhealth.csv"
+
+    # Tab-delimited variant_summary exports, probed in this order. VCF files
+    # are intentionally not listed: they do not carry the variant_summary
+    # columns mapped below and cannot be parsed as a plain TSV table.
+    _RAW_CANDIDATES = (
+        "variant_summary.txt",
+        "variant_summary.txt.gz",
+        "clinvar_variant_summary.txt",
+    )
+
+    # Raw variant_summary column -> standardized column. The insertion order
+    # defines the column order of the standardized CSV.
+    _COLUMN_MAPPING = {
+        "GeneSymbol": "gene_symbol",
+        "ClinicalSignificance": "clinical_significance",
+        "ReviewStatus": "review_status",
+        "Chromosome": "chromosome",
+        "PositionVCF": "position",
+        "ReferenceAlleleVCF": "reference_allele",
+        "AlternateAlleleVCF": "alternate_allele",
+        "Type": "variant_type",
+        "Assembly": "assembly",
+    }
+
+    def __init__(
+        self,
+        root: str,
+        tables: Optional[List[str]] = None,
+        dataset_name: Optional[str] = None,
+        config_path: Optional[str] = None,
+        **kwargs,
+    ) -> None:
+        if config_path is None:
+            logger.info("No config path provided, using default config")
+            config_path = Path(__file__).parent / "configs" / "clinvar.yaml"
+
+        # Build the standardized CSV only when it is not already on disk.
+        if not os.path.exists(os.path.join(root, self._PYHEALTH_CSV)):
+            logger.info("Preparing ClinVar metadata...")
+            self.prepare_metadata(root)
+
+        super().__init__(
+            root=root,
+            # list(...) lets callers pass any sequence (e.g. a tuple) of names.
+            tables=["variants"] + list(tables or []),
+            dataset_name=dataset_name or "clinvar",
+            config_path=config_path,
+            **kwargs,
+        )
+
+    @staticmethod
+    def prepare_metadata(root: str) -> None:
+        """Convert the raw ClinVar variant summary to ``clinvar-pyhealth.csv``.
+
+        Looks in ``root`` for the first existing variant_summary export
+        (plain or gzip-compressed), keeps the columns in ``_COLUMN_MAPPING``
+        under their standardized names, restricts rows to the GRCh38 assembly
+        when an assembly column is present, and writes the result to
+        ``clinvar-pyhealth.csv`` in ``root``.
+
+        The written file always has the full set of standardized columns in a
+        fixed order; a column absent from the raw file is written empty and a
+        warning is logged. If no raw file is found, a header-only placeholder
+        CSV is written and a warning is logged.
+
+        Args:
+            root: Root directory containing the ClinVar files.
+        """
+        column_mapping = ClinVarDataset._COLUMN_MAPPING
+        standard_columns = list(column_mapping.values())
+        output_path = os.path.join(root, ClinVarDataset._PYHEALTH_CSV)
+
+        candidate_paths = (
+            os.path.join(root, fname) for fname in ClinVarDataset._RAW_CANDIDATES
+        )
+        raw_file = next((p for p in candidate_paths if os.path.exists(p)), None)
+
+        if raw_file is None:
+            logger.warning(
+                "No raw ClinVar file found in %s. "
+                "Please download from https://ftp.ncbi.nlm.nih.gov/pub/clinvar/ "
+                "and place variant_summary.txt in the root directory.",
+                root,
+            )
+            # Header-only placeholder so the table file exists.
+            # NOTE(review): __init__ only calls prepare_metadata when this CSV
+            # is absent, so the placeholder has to be deleted by hand before a
+            # raw file added later gets processed.
+            pd.DataFrame(columns=standard_columns).to_csv(output_path, index=False)
+            return
+
+        logger.info("Processing ClinVar file: %s", raw_file)
+
+        # Only parse the columns that are kept; compression is inferred from
+        # the ".gz" suffix by pandas' default ``compression="infer"``.
+        df = pd.read_csv(
+            raw_file,
+            sep="\t",
+            usecols=lambda name: name in column_mapping,
+            low_memory=False,
+        )
+        df_out = df.rename(columns=column_mapping)
+
+        # Filter on assembly before padding missing columns: an absent
+        # assembly column must not discard every row.
+        if "assembly" in df_out.columns:
+            df_out = df_out[df_out["assembly"] == "GRCh38"]
+
+        missing = [col for col in standard_columns if col not in df_out.columns]
+        if missing:
+            logger.warning(
+                "ClinVar file %s lacks column(s) for %s; writing them empty",
+                raw_file,
+                ", ".join(missing),
+            )
+        # Same columns and order as the placeholder and configs/clinvar.yaml.
+        df_out = df_out.reindex(columns=standard_columns)
+
+        df_out.to_csv(output_path, index=False)
+        logger.info("Saved %d variants to %s", len(df_out), output_path)
+
+    @property
+    def default_task(self):
+        """Returns the default task for this dataset.
+
+        Returns:
+            VariantClassificationClinVar: The default classification task.
+        """
+        # Imported lazily, at call time rather than module import time.
+        from pyhealth.tasks import VariantClassificationClinVar
+
+        return VariantClassificationClinVar()
diff --git a/pyhealth/datasets/configs/clinvar.yaml b/pyhealth/datasets/configs/clinvar.yaml
@@ -0,0 +1,16 @@
+version: "1.0"
+tables:
+  variants:
+    file_path: "clinvar-pyhealth.csv"
+    patient_id: null
+    timestamp: null
+    attributes:
+    - "gene_symbol"
+    - "clinical_significance"
+    - "review_status"
+    - "chromosome"
+    - "position"
+    - "reference_allele"
+    - "alternate_allele"
+    - "variant_type"
+    - "assembly"
diff --git a/pyhealth/datasets/configs/cosmic.yaml b/pyhealth/datasets/configs/cosmic.yaml
@@ -0,0 +1,15 @@
+version: "1.0"
+tables:
+  mutations:
+    file_path: "cosmic-pyhealth.csv"
+    patient_id: "sample_id"
+    timestamp: null
+    attributes:
+    - "gene_name"
+    - "hgvsc"
+    - "hgvsp"
+    - "mutation_description"
+    - "fathmm_prediction"
+    - "primary_site"
+    - "primary_histology"
+    - "mutation_somatic_status"
diff --git a/pyhealth/datasets/configs/tcga_prad.yaml b/pyhealth/datasets/configs/tcga_prad.yaml
@@ -0,0 +1,23 @@
+version: "1.0"
+tables:
+  mutations:
+    file_path: "tcga_prad_mutations-pyhealth.csv"
+    patient_id: "patient_id"
+    timestamp: null
+    attributes:
+    - "hugo_symbol"
+    - "variant_classification"
+    - "variant_type"
+    - "hgvsc"
+    - "hgvsp"
+    - "tumor_sample_barcode"
+  clinical:
+    file_path: "tcga_prad_clinical-pyhealth.csv"
+    patient_id: "patient_id"
+    timestamp: null
+    attributes:
+    - "age_at_diagnosis"
+    - "gleason_score"
+    - "vital_status"
+    - "days_to_death"
+    - "tumor_stage"