Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,9 @@ __pycache__

# environments
.pyenv
.env
.env

# data
src/load_data/saved_data/
src/fetch_articles/saved_data/downloaded_pmcids.json
src/fetch_articles/saved_data/articles/
13 changes: 12 additions & 1 deletion README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,15 @@

## Description

This repository contains Python scripts for running and building a Pharmacogenomic Agentic system to annotate and label genetic variants based on their phenotypical associations from journal articles.
This repository contains Python scripts for running and building a Pharmacogenomic Agentic system to annotate and label genetic variants based on their phenotypical associations from journal articles.


## Progress Tracker
| Task | Status |
| --- | --- |
| Download the zip of variants from pharmgkb | ✅ |
| Get a PMID list from the variants tsv (column PMID) | ✅ |
| Convert the PMID to PMCID | ✅ |
| Update to use the non-official PMID-to-PMCID conversion API | |
| Fetch the content from the PMCID | |
| Create pairing of annotations to article | |
1,782 changes: 1,782 additions & 0 deletions pixi.lock

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,18 @@ platforms = ["osx-arm64"]
version = "0.1.0"

[tasks]
update-downloaded-pmcids = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"

[dependencies]
seaborn = ">=0.13.2,<0.14"
tqdm = ">=4.67.1,<5"
requests = ">=2.32.3,<3"
biopython = ">=1.85,<2"
ipykernel = ">=6.29.5,<7"
pandas = ">=2.2.3,<3"
numpy = ">=2.2.5,<3"
openai = ">=1.76.2,<2"
playwright = ">=1.52.0,<2"
loguru = ">=0.7.2,<0.8"
python-dotenv = ">=1.1.0,<2"
black = ">=25.1.0,<26"
13 changes: 13 additions & 0 deletions src/fetch_articles/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# PubMed Document Fetching
## Goal
Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues

## Process Overview
1. Download the zip of variants from pharmgkb (handled in load_data module)
2. Get a PMID list from the variants tsv (column PMID) (handled in load_data module)
3. Convert the PMID to PMCID
4. Fetch the content from the PMCID

## Saved Data
pmcid_mapping.json: Maps each PMID to its PMCID (`{"PMID": "PMCID" or null, ...}`)
unique_pmcids.json: List of all the unique PMCIDs from pmcid_mapping.json (["PMCID1", "PMCID2", ...])
Empty file added src/fetch_articles/__init__.py
Empty file.
101 changes: 101 additions & 0 deletions src/fetch_articles/article_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from loguru import logger
from src.fetch_articles.pmcid_converter import get_unique_pmcids
from Bio import Entrez
import os
import json
from tqdm import tqdm


def fetch_pmc_content(pmcid):
    """Fetch the full-text XML record for one PMCID from PubMed Central.

    Args:
        pmcid (str): The PMCID to fetch (e.g. "PMC1234567").

    Returns:
        The raw record returned by Entrez.efetch (str or bytes depending on
        the Biopython version — TODO confirm), or None if the fetch failed.
    """
    try:
        handle = Entrez.efetch(db="pmc", id=pmcid, rettype="full", retmode="xml")
        try:
            # finally-close so the handle is not leaked if read() raises
            return handle.read()
        finally:
            handle.close()
    except Exception as e:
        # Use the module logger (not print) so failures land in the same
        # log stream as the rest of this module's output.
        logger.error(f"An error occurred while fetching content for PMCID {pmcid}: {e}")
        return None


def update_downloaded_pmcids() -> None:
    """
    Sync saved_data/downloaded_pmcids.json with the XML files actually
    present in the saved_data/articles directory.

    Every "<PMCID>.xml" file found is recorded as {pmcid: "<pmcid>.xml"};
    existing entries in the json (including failed downloads stored as None
    by download_articles) are preserved.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    articles_dir = os.path.join(base_dir, "saved_data", "articles")
    # Create the directory tree up front so a fresh checkout does not crash
    # on os.listdir (and so the json write below has a parent directory).
    os.makedirs(articles_dir, exist_ok=True)
    downloaded_pmcids_path = os.path.join(
        base_dir, "saved_data", "downloaded_pmcids.json"
    )

    # Only count real article files; hidden files such as .DS_Store would
    # otherwise produce bogus entries claiming "<stem>.xml" exists.
    article_pmcids = [
        os.path.splitext(f)[0] for f in os.listdir(articles_dir) if f.endswith(".xml")
    ]
    article_pmcids_mapping = {pmcid: f"{pmcid}.xml" for pmcid in article_pmcids}

    logger.info(f"Found {len(article_pmcids)} existing XML files in {articles_dir}")
    # Merge into the existing json file (or start fresh if missing/corrupt)
    if os.path.exists(downloaded_pmcids_path):
        with open(downloaded_pmcids_path, "r") as f:
            try:
                downloaded_pmcids = json.load(f)
            except json.JSONDecodeError:
                logger.error(
                    f"Error loading {downloaded_pmcids_path}. Creating new json file."
                )
                downloaded_pmcids = {}
    else:
        downloaded_pmcids = {}
    downloaded_pmcids.update(article_pmcids_mapping)
    with open(downloaded_pmcids_path, "w") as f:
        json.dump(downloaded_pmcids, f)
    logger.info(
        f"Updated {downloaded_pmcids_path} with {len(article_pmcids)} new PMCIDs"
    )


def download_articles(pmcids: list[str]):
    """
    Download articles from PubMed Central using PMCIDs.

    PMCIDs already recorded in saved_data/downloaded_pmcids.json are
    skipped (both successes and previous failures, which are stored as
    None). Each fetched article is written to saved_data/articles/<PMCID>.xml.

    Args:
        pmcids (list[str]): List of PMCIDs to download.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    saved_dir = os.path.join(base_dir, "saved_data", "articles")
    os.makedirs(saved_dir, exist_ok=True)

    # Load the downloaded PMCIDs from the json file
    downloaded_pmcids_path = os.path.join(
        base_dir, "saved_data", "downloaded_pmcids.json"
    )
    if os.path.exists(downloaded_pmcids_path):
        with open(downloaded_pmcids_path, "r") as f:
            downloaded_pmcids = json.load(f)
    else:
        downloaded_pmcids = {}

    new_pmcids = [pmcid for pmcid in pmcids if pmcid not in downloaded_pmcids]
    logger.warning(f"{len(downloaded_pmcids)} existing articles found")
    logger.info(f"{len(new_pmcids)} new articles to download")

    try:
        for pmcid in tqdm(new_pmcids):
            record = fetch_pmc_content(pmcid)
            if record:
                # Biopython's Entrez handle may return str or bytes depending
                # on version — normalise to str before writing.
                text = record if isinstance(record, str) else record.decode("utf-8")
                with open(os.path.join(saved_dir, f"{pmcid}.xml"), "w") as f:
                    f.write(text)
                downloaded_pmcids[pmcid] = f"{pmcid}.xml"
            else:
                downloaded_pmcids[pmcid] = None
                logger.warning(f"No record found for PMCID {pmcid}")
        logger.info(f"Downloaded {len(downloaded_pmcids)} articles")
    finally:
        # Persist progress even if the (potentially long) download loop is
        # interrupted, and reuse the path computed above instead of
        # rebuilding it by hand.
        with open(downloaded_pmcids_path, "w") as f:
            json.dump(downloaded_pmcids, f)


if __name__ == "__main__":
    # Sync the tracking file with any XML already on disk, then download
    # every unique PMCID that is not yet present locally.
    update_downloaded_pmcids()
    pmcids = get_unique_pmcids()
    download_articles(pmcids)
165 changes: 165 additions & 0 deletions src/fetch_articles/pmcid_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Standard library
import json
import os
import random
import time
from typing import Dict, List, Optional, Set

# Third-party (requests, time, and loguru were previously imported twice;
# consolidated here per PEP 8 — no names removed)
import pandas as pd
import requests
from Bio import Entrez
from dotenv import load_dotenv
from loguru import logger
from tqdm import tqdm

# Local
from src.load_data import get_pmid_list

load_dotenv()
# Email identifies this tool to NCBI Entrez (required by their usage policy)
Entrez.email = os.getenv("NCBI_EMAIL")


def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]:
    """
    Load the saved PMID→PMCID mapping from saved_data/pmcid_mapping.json.

    Returns:
        Dict mapping PMID (str) to PMCID (str), or None for PMIDs with no
        PMCID. Returns an empty dict when the file does not exist yet.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    # The mapping file lives under saved_data/ (see get_unique_pmcids and the
    # module README); the previous path pointed at the module directory itself.
    results_path = os.path.join(base_dir, "saved_data", "pmcid_mapping.json")
    if os.path.exists(results_path):
        with open(results_path, "r") as f:
            existing_results = json.load(f)
        logger.info(
            f"Loaded {len(existing_results)} existing PMCID mappings from {results_path}"
        )
    else:
        logger.info(
            f"No PMCID mapping found at {results_path}. Creating empty mapping."
        )
        existing_results = {}
    return existing_results


def batch_pmid_to_pmcid(
    pmids: List[str], email: str, batch_size: int = 100, delay: float = 0.4
) -> Dict[str, Optional[str]]:
    """
    Convert a list of PMIDs to PMCIDs using NCBI's ID Converter API.

    Previously-converted PMIDs (loaded via load_saved_pmcid_mapping) are
    skipped; new results are merged in and written back to
    saved_data/pmcid_mapping.json.

    Args:
        pmids: List of PMIDs (strings or ints; compared as strings).
        email: Your email address for NCBI tool identification.
        batch_size: Number of PMIDs to send per request (max: 200).
        delay: Seconds to wait between requests (default 0.4 to respect NCBI).

    Returns:
        Dict mapping each PMID (str) to a PMCID (or None if not available).
    """
    url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    results: Dict[str, Optional[str]] = {}
    existing_results = load_saved_pmcid_mapping()

    # Skip PMIDs that were already converted on a previous run
    existing_pmids = set(existing_results.keys())
    filtered_pmids = [x for x in pmids if str(x) not in existing_pmids]

    logger.info(f"Remaining PMIDs to process: {len(filtered_pmids)}")
    if len(filtered_pmids) == 0:
        logger.warning("No PMIDs to process. Exiting.")
        return existing_results

    # Process remaining PMIDs in batches
    for i in range(0, len(filtered_pmids), batch_size):
        batch = filtered_pmids[i : i + batch_size]
        ids_str = ",".join(str(pmid) for pmid in batch)
        logger.info(f"Processing PMIDs {i + 1} to {i + len(batch)}...")

        params = {
            "tool": "pmid2pmcid_tool",
            "email": email,
            "ids": ids_str,
            "format": "json",
        }

        try:
            # Timeout so a stalled connection cannot hang the whole run
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            for record in data.get("records", []):
                pmid = record.get("pmid")
                pmcid = record.get("pmcid")
                results[pmid] = pmcid if pmcid else None
                if pmcid:
                    logger.info(f"PMID {pmid} → PMCID {pmcid}")
                else:
                    logger.warning(f"PMID {pmid} has no PMCID available.")
        except Exception as e:
            logger.error(f"Failed batch starting at index {i}: {e}")
            # Record failures with str keys to match the API-derived keys.
            # NOTE(review): these None entries are persisted, so a transient
            # network failure means the batch is never retried — consider
            # leaving failed batches out of the saved mapping instead.
            for pmid in batch:
                results[str(pmid)] = None

        time.sleep(delay)

    # Merge existing results with new results
    existing_results.update(results)

    # Save under saved_data/ so load_saved_pmcid_mapping and
    # get_unique_pmcids read the same file this writes.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    saved_dir = os.path.join(base_dir, "saved_data")
    os.makedirs(saved_dir, exist_ok=True)
    results_path = os.path.join(saved_dir, "pmcid_mapping.json")
    with open(results_path, "w") as f:
        json.dump(existing_results, f)
    logger.info(f"Updated PMCID mappings saved to {results_path}")

    return existing_results


def get_unique_pmcids() -> List[str]:
    """
    Get the list of unique PMCIDs derived from the PMID→PMCID mapping.

    Returns the cached saved_data/unique_pmcids.json if it exists; otherwise
    regenerates the list from saved_data/pmcid_mapping.json — dropping PMIDs
    that have no PMCID (None values) — and caches it.

    NOTE: the cache is not refreshed when the mapping gains new entries;
    delete unique_pmcids.json to force a rebuild.

    Returns:
        Sorted list of unique PMCID strings.

    Raises:
        json.JSONDecodeError: if the cached unique_pmcids.json is corrupt.
        FileNotFoundError: if neither the cache nor the mapping file exists.
    """
    # Load the unique PMCIDs if they've already been saved
    base_dir = os.path.dirname(os.path.abspath(__file__))
    unique_pmcids_path = os.path.join(base_dir, "saved_data", "unique_pmcids.json")
    if os.path.exists(unique_pmcids_path):
        with open(unique_pmcids_path, "r") as f:
            try:
                pmcids = json.load(f)
            except json.JSONDecodeError as e:
                logger.error(
                    f"Error loading unique PMCIDs from {unique_pmcids_path}: {e}"
                )
                raise e
        logger.warning(
            f"Loaded {len(pmcids)} pre-existing unique PMCIDs from {unique_pmcids_path}"
        )
        return pmcids

    # Regenerate from pmcid_mapping.json if the cache doesn't exist
    results_path = os.path.join(base_dir, "saved_data", "pmcid_mapping.json")
    with open(results_path, "r") as f:
        existing_results = json.load(f)
    # Filter out None values (PMIDs without a PMCID) — previously a literal
    # None leaked into the list and was passed to the article downloader.
    # Sorted for a deterministic cache file.
    pmcids = sorted({pmcid for pmcid in existing_results.values() if pmcid})

    # Save the unique pmcids to a json file
    with open(unique_pmcids_path, "w") as f:
        json.dump(pmcids, f)
    logger.info(f"Unique PMCIDs saved to {unique_pmcids_path}")
    return pmcids


if __name__ == "__main__":
    # The PMID→PMCID conversion steps are currently commented out; this
    # entry point only reports how many unique PMCIDs the saved mapping
    # yields.
    # pmid_list = get_pmid_list()
    # results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))
    # logger.info(f"PMCID mapping complete. {len(results)} PMIDs mapped to PMCIDs.")
    pmcids = get_unique_pmcids()
    logger.info(f"Number of unique PMCIDs: {len(pmcids)}")
1 change: 1 addition & 0 deletions src/fetch_articles/saved_data/pmcid_mapping.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/fetch_articles/saved_data/unique_pmcids.json

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions src/load_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Load Data Module

This module handles the loading and preprocessing of PharmGKB clinical variants data.

## Methods

1. **`download_and_extract_variant_annotations(override: bool = False)`**
- Downloads and extracts the variant annotations ZIP file from PharmGKB
- Saves data to `saved_data/variantAnnotations/`
- Can override existing downloads if needed

2. **`load_variant_annotations_tsv(override: bool = False)`**
- Loads the variant annotations TSV file into a pandas DataFrame
- Automatically downloads data if not present
- Returns the DataFrame containing variant-drug annotations

3. **`unique_variants(df: pd.DataFrame)`**
- Helper function that generates a dictionary of unique values for each column
- Used for data analysis and validation

4. **`get_pmid_list(override: bool = False)`**
- Main function to extract PMIDs from the variant annotations
- Returns a list of unique PMIDs
- Caches results in `saved_data/pmid_list.json`
- Used as input for PMCID conversion

The module handles all data downloading, extraction, and preprocessing steps needed to get the PMID list for subsequent steps in the pipeline.

1 change: 1 addition & 0 deletions src/load_data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .load_clinical_variants import load_variant_annotations_tsv, get_pmid_list
Loading
Loading