Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,20 @@ Output: Score
3. System for extracting drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc.
4. Continuously fetch new pharmacogenomic articles

## Setup
To get started, you need two sources of data locally:
1. The annotations for the articles (data/variantAnnotations/var_drug_ann.tsv)
2. The articles themselves (data/articles)
These can be populated using the following commands:
```
pixi run download-variants
pixi run update-download-map
pixi run download-articles
```
The download-articles step takes the longest; it can be skipped by unzipping data/articles.zip, which creates the set of article XML files in the data/articles directory.
If you are running download-articles, make sure to create a .env at the root with your email using the format
NCBI_EMAIL=YOUR_EMAIL@SCHOOL.EDU

## Description

This repository contains Python scripts for running and building a Pharmacogenomic Agentic system to annotate and label genetic variants based on their phenotypical associations from journal articles.
Expand Down
2 changes: 1 addition & 1 deletion pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ platforms = ["osx-arm64"]
version = "0.1.0"

[tasks]
download-variants = "python -m src.load_variants.load_clinical_variants"
download-variants = "python -m src.load_variants.download_annotations_pipeline"
update-download-map = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
download-articles = "python -m src.fetch_articles.article_downloader"

Expand Down
10 changes: 10 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
# SPDX-FileCopyrightText: 2025 Stanford University and the project authors (see CONTRIBUTORS.md)
# SPDX-License-Identifier: Apache-2.0

requests
pandas
openai
biopython
beautifulsoup4
tqdm
matplotlib
loguru
python-dotenv
2 changes: 1 addition & 1 deletion src/benchmark/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Benchmark

## Functions
1. Calculate the niave difference between an extracted variant and the ground truth variant on Variant Annotation ID
1. Calculate the naive difference between an extracted variant and the ground truth variant on Variant Annotation ID
2 changes: 2 additions & 0 deletions src/fetch_articles/article_downloader.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from loguru import logger
from src.fetch_articles.pmcid_converter import get_unique_pmcids
from src.utils.file_paths import get_project_root
from src.variant_extraction.config import ENTREZ_EMAIL
from Bio import Entrez
import os
import json
from tqdm import tqdm

Entrez.email = ENTREZ_EMAIL

def fetch_pmc_content(pmcid):
"""
Expand Down
4 changes: 0 additions & 4 deletions src/fetch_articles/pmcid_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@
# Email for NCBI
Entrez.email = os.getenv("NCBI_EMAIL")

# Step 1: Function to get PMCID from PMID
import requests
from loguru import logger

import requests
import time
from loguru import logger
Expand Down
3 changes: 1 addition & 2 deletions src/load_variants/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from .load_clinical_variants import (
from src.load_variants.load_clinical_variants import (
    load_raw_variant_annotations,
    get_pmid_list,
)
# variant_annotations_pipeline was moved out of load_clinical_variants into
# its own module; importing it from the old location raises ImportError, so
# re-export it from its new home to keep
# `from src.load_variants import variant_annotations_pipeline` working.
from src.load_variants.download_annotations_pipeline import (
    variant_annotations_pipeline,
)
23 changes: 23 additions & 0 deletions src/load_variants/download_annotations_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from loguru import logger
from src.load_variants.load_clinical_variants import download_and_extract_variant_annotations, load_raw_variant_annotations, get_pmid_list

def variant_annotations_pipeline():
    """
    Download the PharmGKB variant annotations, load them into a DataFrame,
    and save the unique PMIDs to a JSON file.

    Side effects: downloads/extracts the annotations archive and writes the
    PMID list to disk (via get_pmid_list). Returns None.
    """
    # Download and extract the variant annotations archive
    logger.info("Downloading and extracting variant annotations...")
    download_and_extract_variant_annotations()

    # Load the variant annotations TSV; log the row count so the load step
    # is visible in the pipeline output (df was previously assigned unused)
    logger.info("Loading variant annotations...")
    df = load_raw_variant_annotations()
    logger.info(f"Loaded {len(df)} variant annotation rows")

    # Extract the unique PMIDs (get_pmid_list also persists them to JSON)
    logger.info("Getting PMIDs...")
    pmid_list = get_pmid_list()
    logger.info(f"Number of unique PMIDs: {len(pmid_list)}")


if __name__ == "__main__":
    variant_annotations_pipeline()
22 changes: 0 additions & 22 deletions src/load_variants/load_clinical_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,25 +112,3 @@ def get_pmid_list(override: bool = False) -> list:
with open(pmid_list_path, "w") as f:
json.dump(pmid_list, f)
return pmid_list


def variant_annotations_pipeline():
"""
Loads the variant annotations tsv file and saves the unique PMIDs to a json file.
"""
# Download and extract the variant annotations
logger.info("Downloading and extracting variant annotations...")
download_and_extract_variant_annotations()

# Load the variant annotations
logger.info("Loading variant annotations...")
df = load_raw_variant_annotations()

# Get the PMIDs
logger.info("Getting PMIDs...")
pmid_list = get_pmid_list()
logger.info(f"Number of unique PMIDs: {len(pmid_list)}")


if __name__ == "__main__":
variant_annotations_pipeline()
150 changes: 150 additions & 0 deletions src/variant_extraction/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Variant Extraction Module

This is organized into the following Python modules, each handling a specific aspect of the workflow:

- **config.py**: Stores configuration variables, such as URLs, file paths, and API settings.
- **ncbi_fetch.py**: Manages fetching PMCID and content from NCBI using the Entrez API.
- **processing.py**: Loads and processes the variant annotation dataset, including enumeration cleaning and DataFrame processing. Also interacts with the OpenAI API to extract structured genetic variant data from publication content.
- **variant_matching.py**: Compares extracted data with ground truth for accuracy evaluation.
- **visualization.py**: Generates visualizations to summarize match rates and analysis results.
- **run_variant_extraction.py**: Orchestrates the entire workflow, integrating all modules.

## config.py
This module centralizes configuration settings to avoid hardcoding values in the codebase.

**Variables**:
- URLs for downloading PharmGKB data (CLINICAL_VARIANTS_URL, VARIANT_ANNOTATIONS_URL).
- File paths for input and output data (VAR_DRUG_ANN_PATH, CHECKPOINT_PATH, OUTPUT_CSV_PATH, DF_NEW_CSV_PATH, WHOLE_CSV_PATH).
- NCBI Entrez email (ENTREZ_EMAIL) for API compliance.
- OpenAI model name (OPENAI_MODEL) and JSON schema (SCHEMA_TEXT) for structured API responses.
- System message template (SYSTEM_MESSAGE_TEMPLATE) for API prompts.

## processing.py
This module handles interactions with the OpenAI API to extract structured genetic variant data.

`clean_enum_list(enum_list)`:

Cleans and normalizes enumeration lists by removing NaN values, splitting comma-separated strings, and ensuring uniqueness.
Used to prepare valid enumeration values for the JSON schema.


`load_and_prepare_data(file_path)`:

Loads the variant annotation TSV file into a pandas DataFrame.
Extracts unique values for Phenotype Category, Significance, Metabolizer types, and Population types to create enumeration lists.
Returns the DataFrame and a dictionary of cleaned enumeration values.


`create_schema(enum_values)`:

Creates a JSON schema for API responses based on the provided enumeration values.
Defines a structure for an array of gene objects with fields like gene, variant, drug(s), and others, enforcing strict validation.


`create_messages(content_text, schema_text, custom_template=None)`:

Generates API messages with a system prompt (using SYSTEM_MESSAGE_TEMPLATE or a custom template) and user content.
The system prompt instructs the API to extract genetic variant information in the specified schema format.


`call_api(client, messages, schema)`:

Makes an API call to the OpenAI model (gpt-4o-2024-08-06) with the provided messages and schema.
Returns the parsed JSON response containing extracted gene data.


`load_checkpoint(checkpoint_path)`:

Loads previously processed PMIDs and results from a checkpoint file to avoid redundant API calls.
Returns a set of processed PMIDs and a list of results.


`save_checkpoint(checkpoint_path, processed_pmids, results)`:

Saves processed PMIDs and results to a checkpoint file for persistence.
Ensures progress is saved after each processed row to handle interruptions.


`process_responses(df, client, schema_text, schema, checkpoint_path, custom_template=None)`:

Iterates through the DataFrame to process each row’s Content_text using the OpenAI API.
Skips previously processed PMIDs based on checkpoint data.
Saves results and updates the checkpoint after each row to ensure progress is not lost.
Returns a list of flattened JSON objects with extracted gene data and associated PMIDs.



## variant_matching.py
This module compares extracted data with ground truth to evaluate accuracy.

`SimplifiedVariantMatcher`:

`split_variants(variant_string)`:
Splits variant strings into individual components, handling delimiters like commas and slashes.


`preprocess_variants(variant_string, gene=None)`:
Preprocesses variant strings to handle rsIDs, star alleles, and SNP notations.
Attaches gene names to star alleles and processes complex notations (e.g., CYP2C19*2-1234G>A).


`match_row(row)`:
Compares ground truth and predicted variants for a single row.
Returns Exact Match, Partial Match, or No Match based on set intersections.


`align_and_compare_datasets(df_new, flattened_df)`:

Renames columns in input DataFrames to distinguish ground truth (_truth) and predicted (_output) data.
Merges DataFrames on PMID using an inner join.
Applies variant matching using SimplifiedVariantMatcher and compares other fields (gene, drug(s), phenotype category, significance, metabolizer types, specialty population).
Returns a DataFrame with match indicators for each field.



## visualization.py
This module generates visualizations to summarize match rates and analysis results.

`plot_match_rates(match_stats)`:

Creates a bar plot of exact match rates for Gene, Drug, Phenotype, Significance, and Variant categories.
Uses a professional color scheme and ensures readability with appropriate labels and limits.


`plot_pie_charts(match_stats)`:

Generates nested pie charts showing partial_match_rate (outer) and exact_match_rate (inner).
Includes a legend with percentage values for clarity.


`plot_grouped_match_rates(average_gene_match_rate, average_drug_match_rate, average_variant_match_rate)`:

Plots a bar chart of match rates for Gene, Drug, and Variant categories, calculated by grouping data by PMID.
Adds percentage labels above bars for clarity.


`plot_attribute_match_rates(wholecsv)`:

Creates a bar plot of match percentages for attributes like Match metabolizer, Match significance, etc., from the wholecsv dataset.
Returns a DataFrame summarizing the match statistics for inclusion in reports or posters.



## run_variant_extraction.py
This module orchestrates the entire workflow.

`main()`:
- Initializes the OpenAI client with the API key from the environment.
- Downloads and extracts PharmGKB data using data_download.download_and_extract_zip.
- Loads and prepares the variant annotation dataset using data_processing.load_and_prepare_data.
- Processes a subset of the DataFrame (e.g., 5 rows) to fetch NCBI data using data_processing.process_dataframe.
- Creates a JSON schema using processing.create_schema.
- Processes API responses to extract gene data using processing.process_responses.
- Aligns and compares datasets using variant_matching.align_and_compare_datasets.
- Calculates match statistics for various fields and grouped match rates by PMID.
- Saves output DataFrames to CSV files (DF_NEW_CSV_PATH, OUTPUT_CSV_PATH).
- Generates visualizations using visualization module functions.
- Prints match statistics and attribute match table to the console.

## Run the variant extraction:
`python -m src.variant_extraction.run_variant_extraction`
55 changes: 55 additions & 0 deletions src/variant_extraction/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# config.py
# Configuration file for the variant annotation extraction process

import os

# URLs for downloading PharmGKB data archives
CLINICAL_VARIANTS_URL = "https://api.pharmgkb.org/v1/download/file/data/clinicalVariants.zip"
VARIANT_ANNOTATIONS_URL = "https://api.pharmgkb.org/v1/download/file/data/variantAnnotations.zip"

# File paths for input data and pipeline outputs
VAR_DRUG_ANN_PATH = "./data/variantAnnotations/var_drug_ann.tsv"
CHECKPOINT_PATH = "./data/api_processing_checkpoint.json"
OUTPUT_CSV_PATH = "./data/variant_extraction/merged.csv"
DF_NEW_CSV_PATH = "./data/variant_extraction/df_new.csv"
WHOLE_CSV_PATH = "./data/variant_extraction/wholecsv.csv"

# NCBI email: Entrez requires a contact email address. Prefer the NCBI_EMAIL
# environment variable (see README .env setup) so a personal address is not
# hardcoded; the previous literal is kept as a backward-compatible fallback.
ENTREZ_EMAIL = os.getenv("NCBI_EMAIL", "aron7628@gmail.com")

# API settings
OPENAI_MODEL = "gpt-4o-2024-08-06"

# JSON schema for structured extraction responses.
# NOTE: the `required` list must name keys that exist in `properties`
# (strict structured-output validation rejects unknown required keys) —
# the previous "variant/haplotyptes" misspelling is fixed below, and the
# nonstandard singular "example" keys are normalized to "examples".
SCHEMA_TEXT = '''
{
  "type": "object",
  "properties": {
    "gene": {"type": "string", "description": "The specific gene related to the drug response or phenotype (e.g., CYP3A4).", "examples": ["CYP2C19", "UGT1A3"]},
    "variant/haplotypes": {"type": "string", "description": "full star allele including gene, full rsid, or full haplotype", "examples": ["CYP2C19*17"]},
    "drug(s)": {"type": "string", "description": "The drug(s) that are influenced by the gene variant(s).", "examples": ["abrocitinib", "mirabegron"]},
    "phenotype category": {"type": "string", "description": "Describes the type of phenotype related to the gene-drug interaction (e.g., Metabolism/PK, toxicity).", "enum": ["Metabolism/PK", "Efficacy", "Toxicity", "Other"], "examples": ["Metabolism/PK"]},
    "significance": {"type": "string", "description": "The level of importance or statistical significance of the gene-drug interaction.", "enum": ["significant", "not significant", "not stated"], "examples": ["significant", "not stated"]},
    "metabolizer types": {"type": "string", "description": "Indicates the metabolizer status of the patient based on the gene variant.", "enum": ["poor", "intermediate", "extensive", "ultrarapid"], "examples": ["poor", "extensive"]},
    "specialty population": {"type": "string", "description": "Refers to specific populations where this gene-drug interaction may have different effects.", "examples": ["healthy individuals", "African American", "pediatric"], "default": "Not specified"},
    "PMID": {"type": "integer", "description": "PMID from source spreadsheet", "examples": [123345]}
  },
  "required": ["gene", "variant/haplotypes", "drug(s)", "phenotype category", "significance", "metabolizer types", "PMID"]
}
'''

# System message template for the extraction prompt; {schema} is filled in
# with SCHEMA_TEXT by the message-building code.
SYSTEM_MESSAGE_TEMPLATE = (
    "You are tasked with extracting information from scientific articles to assist in genetic variant annotation. "
    "Focus on identifying key details related to genetic variants, including but not limited to:\n"
    "- Variant identifiers (e.g., rsIDs, gene names, protein changes like p.Val600Glu, or DNA changes like c.1799T>A).\n"
    "- Associated genes, transcripts, and protein products.\n"
    "- Contextual information such as clinical significance, population frequency, or related diseases and drugs.\n"
    "- Methodologies or evidence supporting the findings (e.g., experimental results, population studies, computational predictions).\n\n"
    "Your output must be in the form of an array of JSON objects adhering to the following schema:\n"
    "{schema}\n\n"
    "Each JSON object should include:\n"
    "1. A unique variant identifier.\n"
    "2. Relevant metadata (e.g., associated gene, protein change, clinical significance).\n"
    "3. Contextual evidence supporting the variant's importance.\n\n"
    "Ensure the extracted information is accurate and directly relevant to variant annotation. "
    "When extracting, prioritize structured data, avoiding ambiguous or irrelevant information."
)
Loading
Loading