Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,20 @@ Output: Score
3. System for extracting drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc.
4. Continuously fetch new pharmacogenomic articles

## Setup
To get started, you need two sources of data locally:
1. The annotations for the articles (data/variantAnnotations/var_drug_ann.tsv)
2. The articles themselves (data/articles)
These can be populated using the following commands:
```
pixi run download-variants
pixi run update-download-map
pixi run download-articles
```
The download-articles step takes the longest; it can be skipped by unzipping data/articles.zip, which creates the set of article XML files in the data/articles directory.
If you are running download-articles, make sure to create a .env at the root with your email using the format
NCBI_EMAIL=YOUR_EMAIL@SCHOOL.EDU

## Description

This repository contains Python scripts for running and building a Pharmacogenomic Agentic system to annotate and label genetic variants based on their phenotypical associations from journal articles.
Expand Down
2 changes: 1 addition & 1 deletion pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ platforms = ["osx-arm64"]
version = "0.1.0"

[tasks]
download-variants = "python -m src.load_variants.load_clinical_variants"
download-variants = "python -m src.load_variants.download_annotations_pipeline"
update-download-map = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
download-articles = "python -m src.fetch_articles.article_downloader"

Expand Down
10 changes: 10 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
# SPDX-FileCopyrightText: 2025 Stanford University and the project authors (see CONTRIBUTORS.md)
# SPDX-License-Identifier: Apache-2.0

requests
pandas
openai
biopython
beautifulsoup4
tqdm
matplotlib
loguru
python-dotenv
2 changes: 1 addition & 1 deletion src/benchmark/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Benchmark

## Functions
1. Calculate the niave difference between an extracted variant and the ground truth variant on Variant Annotation ID
1. Calculate the naive difference between an extracted variant and the ground truth variant on Variant Annotation ID
2 changes: 2 additions & 0 deletions src/fetch_articles/article_downloader.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from loguru import logger
from src.fetch_articles.pmcid_converter import get_unique_pmcids
from src.utils.file_paths import get_project_root
from src.variant_extraction.config import ENTREZ_EMAIL
from Bio import Entrez
import os
import json
from tqdm import tqdm

Entrez.email = ENTREZ_EMAIL

def fetch_pmc_content(pmcid):
"""
Expand Down
4 changes: 0 additions & 4 deletions src/fetch_articles/pmcid_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@
# Email for NCBI
Entrez.email = os.getenv("NCBI_EMAIL")

# Step 1: Function to get PMCID from PMID
import requests
from loguru import logger

import requests
import time
from loguru import logger
Expand Down
3 changes: 1 addition & 2 deletions src/load_variants/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from .load_clinical_variants import (
from src.load_variants.load_clinical_variants import (
    load_raw_variant_annotations,
    get_pmid_list,
)
# variant_annotations_pipeline was moved out of load_clinical_variants into
# its own module; importing it from the old location raises ImportError, so
# re-export it from its new home to keep
# `from src.load_variants import variant_annotations_pipeline` working.
from src.load_variants.download_annotations_pipeline import (
    variant_annotations_pipeline,
)
23 changes: 23 additions & 0 deletions src/load_variants/download_annotations_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from loguru import logger
from src.load_variants.load_clinical_variants import download_and_extract_variant_annotations, load_raw_variant_annotations, get_pmid_list

def variant_annotations_pipeline():
    """
    Download the PharmGKB variant annotations, load them into a DataFrame,
    and save the unique PMIDs to a JSON file.

    Side effects: downloads/extracts the annotations archive and writes the
    PMID list to disk (via get_pmid_list). Returns None.
    """
    # Download and extract the variant annotations archive
    logger.info("Downloading and extracting variant annotations...")
    download_and_extract_variant_annotations()

    # Load the variant annotations TSV; log the row count so the load step
    # is visible in the pipeline output (df was previously assigned unused)
    logger.info("Loading variant annotations...")
    df = load_raw_variant_annotations()
    logger.info(f"Loaded {len(df)} variant annotation rows")

    # Extract the unique PMIDs (get_pmid_list also persists them to JSON)
    logger.info("Getting PMIDs...")
    pmid_list = get_pmid_list()
    logger.info(f"Number of unique PMIDs: {len(pmid_list)}")


if __name__ == "__main__":
    variant_annotations_pipeline()
22 changes: 0 additions & 22 deletions src/load_variants/load_clinical_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,25 +112,3 @@ def get_pmid_list(override: bool = False) -> list:
with open(pmid_list_path, "w") as f:
json.dump(pmid_list, f)
return pmid_list


def variant_annotations_pipeline():
"""
Loads the variant annotations tsv file and saves the unique PMIDs to a json file.
"""
# Download and extract the variant annotations
logger.info("Downloading and extracting variant annotations...")
download_and_extract_variant_annotations()

# Load the variant annotations
logger.info("Loading variant annotations...")
df = load_raw_variant_annotations()

# Get the PMIDs
logger.info("Getting PMIDs...")
pmid_list = get_pmid_list()
logger.info(f"Number of unique PMIDs: {len(pmid_list)}")


if __name__ == "__main__":
variant_annotations_pipeline()
150 changes: 150 additions & 0 deletions src/variant_extraction/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Variant Extraction Module

This is organized into the following Python modules, each handling a specific aspect of the workflow:

- **config.py**: Stores configuration variables, such as URLs, file paths, and API settings.
- **ncbi_fetch.py**: Manages fetching PMCID and content from NCBI using the Entrez API.
- **processing.py**: Loads and processes the variant annotation dataset, including enumeration cleaning and DataFrame processing. Also interacts with the OpenAI API to extract structured genetic variant data from publication content.
- **variant_matching.py**: Compares extracted data with ground truth for accuracy evaluation.
- **visualization.py**: Generates visualizations to summarize match rates and analysis results.
- **run_variant_extraction.py**: Orchestrates the entire workflow, integrating all modules.

## config.py
This module centralizes configuration settings to avoid hardcoding values in the codebase.

**Variables**:
- URLs for downloading PharmGKB data (CLINICAL_VARIANTS_URL, VARIANT_ANNOTATIONS_URL).
- File paths for input and output data (VAR_DRUG_ANN_PATH, CHECKPOINT_PATH, OUTPUT_CSV_PATH, DF_NEW_CSV_PATH, WHOLE_CSV_PATH).
- NCBI Entrez email (ENTREZ_EMAIL) for API compliance.
- OpenAI model name (OPENAI_MODEL) and JSON schema (SCHEMA_TEXT) for structured API responses.
- System message template (SYSTEM_MESSAGE_TEMPLATE) for API prompts.

## processing.py
This module handles interactions with the OpenAI API to extract structured genetic variant data.

`clean_enum_list(enum_list)`:

Cleans and normalizes enumeration lists by removing NaN values, splitting comma-separated strings, and ensuring uniqueness.
Used to prepare valid enumeration values for the JSON schema.


`load_and_prepare_data(file_path)`:

Loads the variant annotation TSV file into a pandas DataFrame.
Extracts unique values for Phenotype Category, Significance, Metabolizer types, and Population types to create enumeration lists.
Returns the DataFrame and a dictionary of cleaned enumeration values.


`create_schema(enum_values)`:

Creates a JSON schema for API responses based on the provided enumeration values.
Defines a structure for an array of gene objects with fields like gene, variant, drug(s), and others, enforcing strict validation.


`create_messages(content_text, schema_text, custom_template=None)`:

Generates API messages with a system prompt (using SYSTEM_MESSAGE_TEMPLATE or a custom template) and user content.
The system prompt instructs the API to extract genetic variant information in the specified schema format.


`call_api(client, messages, schema)`:

Makes an API call to the OpenAI model (gpt-4o-2024-08-06) with the provided messages and schema.
Returns the parsed JSON response containing extracted gene data.


`load_checkpoint(checkpoint_path)`:

Loads previously processed PMIDs and results from a checkpoint file to avoid redundant API calls.
Returns a set of processed PMIDs and a list of results.


`save_checkpoint(checkpoint_path, processed_pmids, results)`:

Saves processed PMIDs and results to a checkpoint file for persistence.
Ensures progress is saved after each processed row to handle interruptions.


`process_responses(df, client, schema_text, schema, checkpoint_path, custom_template=None)`:

Iterates through the DataFrame to process each row’s Content_text using the OpenAI API.
Skips previously processed PMIDs based on checkpoint data.
Saves results and updates the checkpoint after each row to ensure progress is not lost.
Returns a list of flattened JSON objects with extracted gene data and associated PMIDs.



## variant_matching.py
This module compares extracted data with ground truth to evaluate accuracy.

`SimplifiedVariantMatcher`:

`split_variants(variant_string)`:
Splits variant strings into individual components, handling delimiters like commas and slashes.


`preprocess_variants(variant_string, gene=None)`:
Preprocesses variant strings to handle rsIDs, star alleles, and SNP notations.
Attaches gene names to star alleles and processes complex notations (e.g., CYP2C19*2-1234G>A).


`match_row(row)`:
Compares ground truth and predicted variants for a single row.
Returns Exact Match, Partial Match, or No Match based on set intersections.


`align_and_compare_datasets(df_new, flattened_df)`:

Renames columns in input DataFrames to distinguish ground truth (_truth) and predicted (_output) data.
Merges DataFrames on PMID using an inner join.
Applies variant matching using SimplifiedVariantMatcher and compares other fields (gene, drug(s), phenotype category, significance, metabolizer types, specialty population).
Returns a DataFrame with match indicators for each field.



## visualization.py
This module generates visualizations to summarize match rates and analysis results.

`plot_match_rates(match_stats)`:

Creates a bar plot of exact match rates for Gene, Drug, Phenotype, Significance, and Variant categories.
Uses a professional color scheme and ensures readability with appropriate labels and limits.


`plot_pie_charts(match_stats)`:

Generates nested pie charts showing partial_match_rate (outer) and exact_match_rate (inner).
Includes a legend with percentage values for clarity.


`plot_grouped_match_rates(average_gene_match_rate, average_drug_match_rate, average_variant_match_rate)`:

Plots a bar chart of match rates for Gene, Drug, and Variant categories, calculated by grouping data by PMID.
Adds percentage labels above bars for clarity.


`plot_attribute_match_rates(wholecsv)`:

Creates a bar plot of match percentages for attributes like Match metabolizer, Match significance, etc., from the wholecsv dataset.
Returns a DataFrame summarizing the match statistics for inclusion in reports or posters.



## run_variant_extraction.py
This module orchestrates the entire workflow.

`main()`:
- Initializes the OpenAI client with the API key from the environment.
- Downloads and extracts PharmGKB data using data_download.download_and_extract_zip.
- Loads and prepares the variant annotation dataset using data_processing.load_and_prepare_data.
- Processes a subset of the DataFrame (e.g., 5 rows) to fetch NCBI data using data_processing.process_dataframe.
- Creates a JSON schema using processing.create_schema.
- Processes API responses to extract gene data using processing.process_responses.
- Aligns and compares datasets using variant_matching.align_and_compare_datasets.
- Calculates match statistics for various fields and grouped match rates by PMID.
- Saves output DataFrames to CSV files (DF_NEW_CSV_PATH, OUTPUT_CSV_PATH).
- Generates visualizations using visualization module functions.
- Prints match statistics and attribute match table to the console.

## Run the variant extraction:
`python -m src.variant_extraction.run_variant_extraction`
55 changes: 55 additions & 0 deletions src/variant_extraction/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# config.py
# Configuration file for the variant annotation extraction process

import os

# URLs for downloading PharmGKB data archives
CLINICAL_VARIANTS_URL = "https://api.pharmgkb.org/v1/download/file/data/clinicalVariants.zip"
VARIANT_ANNOTATIONS_URL = "https://api.pharmgkb.org/v1/download/file/data/variantAnnotations.zip"

# File paths for input data and pipeline outputs
VAR_DRUG_ANN_PATH = "./data/variantAnnotations/var_drug_ann.tsv"
CHECKPOINT_PATH = "./data/api_processing_checkpoint.json"
OUTPUT_CSV_PATH = "./data/variant_extraction/merged.csv"
DF_NEW_CSV_PATH = "./data/variant_extraction/df_new.csv"
WHOLE_CSV_PATH = "./data/variant_extraction/wholecsv.csv"

# NCBI email: Entrez requires a contact email address. Prefer the NCBI_EMAIL
# environment variable (see README .env setup) so a personal address is not
# hardcoded; the previous literal is kept as a backward-compatible fallback.
ENTREZ_EMAIL = os.getenv("NCBI_EMAIL", "aron7628@gmail.com")

# API settings
OPENAI_MODEL = "gpt-4o-2024-08-06"

# JSON schema for structured extraction responses.
# NOTE: the `required` list must name keys that exist in `properties`
# (strict structured-output validation rejects unknown required keys) —
# the previous "variant/haplotyptes" misspelling is fixed below, and the
# nonstandard singular "example" keys are normalized to "examples".
SCHEMA_TEXT = '''
{
  "type": "object",
  "properties": {
    "gene": {"type": "string", "description": "The specific gene related to the drug response or phenotype (e.g., CYP3A4).", "examples": ["CYP2C19", "UGT1A3"]},
    "variant/haplotypes": {"type": "string", "description": "full star allele including gene, full rsid, or full haplotype", "examples": ["CYP2C19*17"]},
    "drug(s)": {"type": "string", "description": "The drug(s) that are influenced by the gene variant(s).", "examples": ["abrocitinib", "mirabegron"]},
    "phenotype category": {"type": "string", "description": "Describes the type of phenotype related to the gene-drug interaction (e.g., Metabolism/PK, toxicity).", "enum": ["Metabolism/PK", "Efficacy", "Toxicity", "Other"], "examples": ["Metabolism/PK"]},
    "significance": {"type": "string", "description": "The level of importance or statistical significance of the gene-drug interaction.", "enum": ["significant", "not significant", "not stated"], "examples": ["significant", "not stated"]},
    "metabolizer types": {"type": "string", "description": "Indicates the metabolizer status of the patient based on the gene variant.", "enum": ["poor", "intermediate", "extensive", "ultrarapid"], "examples": ["poor", "extensive"]},
    "specialty population": {"type": "string", "description": "Refers to specific populations where this gene-drug interaction may have different effects.", "examples": ["healthy individuals", "African American", "pediatric"], "default": "Not specified"},
    "PMID": {"type": "integer", "description": "PMID from source spreadsheet", "examples": [123345]}
  },
  "required": ["gene", "variant/haplotypes", "drug(s)", "phenotype category", "significance", "metabolizer types", "PMID"]
}
'''

# System message template for the extraction prompt; {schema} is filled in
# with SCHEMA_TEXT by the message-building code.
SYSTEM_MESSAGE_TEMPLATE = (
    "You are tasked with extracting information from scientific articles to assist in genetic variant annotation. "
    "Focus on identifying key details related to genetic variants, including but not limited to:\n"
    "- Variant identifiers (e.g., rsIDs, gene names, protein changes like p.Val600Glu, or DNA changes like c.1799T>A).\n"
    "- Associated genes, transcripts, and protein products.\n"
    "- Contextual information such as clinical significance, population frequency, or related diseases and drugs.\n"
    "- Methodologies or evidence supporting the findings (e.g., experimental results, population studies, computational predictions).\n\n"
    "Your output must be in the form of an array of JSON objects adhering to the following schema:\n"
    "{schema}\n\n"
    "Each JSON object should include:\n"
    "1. A unique variant identifier.\n"
    "2. Relevant metadata (e.g., associated gene, protein change, clinical significance).\n"
    "3. Contextual evidence supporting the variant's importance.\n\n"
    "Ensure the extracted information is accurate and directly relevant to variant annotation. "
    "When extracting, prioritize structured data, avoiding ambiguous or irrelevant information."
)
Loading
Loading