Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,9 @@ __pycache__

# environments
.pyenv
.env
.env

# data
src/load_data/saved_data/
src/fetch_articles/saved_data/downloaded_pmcids.json
src/fetch_articles/saved_data/articles/
13 changes: 12 additions & 1 deletion README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,15 @@

## Description

This repository contains Python scripts for running and building a Pharmacogenomic Agentic system to annotate and label genetic variants based on their phenotypical associations from journal articles.
This repository contains Python scripts for running and building a Pharmacogenomic Agentic system to annotate and label genetic variants based on their phenotypical associations from journal articles.


## Progress Tracker
| Task | Status |
| --- | --- |
| Download the zip of variants from pharmgkb | ✅ |
| Get a PMID list from the variants tsv (column PMID) | ✅ |
| Convert the PMID to PMCID | ✅ |
| Update to use the non-official PMID-to-PMCID conversion API | |
| Fetch the content from the PMCID | |
| Create pairing of annotations to article | |
1,782 changes: 1,782 additions & 0 deletions pixi.lock

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,18 @@ platforms = ["osx-arm64"]
version = "0.1.0"

[tasks]
update-downloaded-pmcids = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"

[dependencies]
seaborn = ">=0.13.2,<0.14"
tqdm = ">=4.67.1,<5"
requests = ">=2.32.3,<3"
biopython = ">=1.85,<2"
ipykernel = ">=6.29.5,<7"
pandas = ">=2.2.3,<3"
numpy = ">=2.2.5,<3"
openai = ">=1.76.2,<2"
playwright = ">=1.52.0,<2"
loguru = ">=0.7.2,<0.8"
python-dotenv = ">=1.1.0,<2"
black = ">=25.1.0,<26"
13 changes: 13 additions & 0 deletions src/fetch_articles/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# PubMed Document Fetching
## Goal
Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues

## Process Overview
1. Download the zip of variants from pharmgkb (handled in load_data module)
2. Get a PMID list from the variants tsv (column PMID) (handled in load_data module)
3. Convert the PMID to PMCID
4. Fetch the content from the PMCID

## Saved Data
pmcid_mapping.json: Maps each PMID to its PMCID (`{"PMID": "PMCID" or null, ...}`)
unique_pmcids.json: List of all the unique PMCIDs from pmcid_mapping.json (["PMCID1", "PMCID2", ...])
Empty file added src/fetch_articles/__init__.py
Empty file.
101 changes: 101 additions & 0 deletions src/fetch_articles/article_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from loguru import logger
from src.fetch_articles.pmcid_converter import get_unique_pmcids
from Bio import Entrez
import os
import json
from tqdm import tqdm


def fetch_pmc_content(pmcid):
    """Fetch the full-text XML record for one PMCID from PubMed Central.

    Args:
        pmcid (str): The PMCID to fetch (e.g. "PMC1234567").

    Returns:
        The raw record returned by Entrez.efetch (str or bytes depending on
        the Biopython version — TODO confirm), or None if the fetch failed.
    """
    try:
        handle = Entrez.efetch(db="pmc", id=pmcid, rettype="full", retmode="xml")
        try:
            # finally-close so the handle is not leaked if read() raises
            return handle.read()
        finally:
            handle.close()
    except Exception as e:
        # Use the module logger (not print) so failures land in the same
        # log stream as the rest of this module's output.
        logger.error(f"An error occurred while fetching content for PMCID {pmcid}: {e}")
        return None


def update_downloaded_pmcids() -> None:
    """
    Sync saved_data/downloaded_pmcids.json with the XML files actually
    present in the saved_data/articles directory.

    Every "<PMCID>.xml" file found is recorded as {pmcid: "<pmcid>.xml"};
    existing entries in the json (including failed downloads stored as None
    by download_articles) are preserved.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    articles_dir = os.path.join(base_dir, "saved_data", "articles")
    # Create the directory tree up front so a fresh checkout does not crash
    # on os.listdir (and so the json write below has a parent directory).
    os.makedirs(articles_dir, exist_ok=True)
    downloaded_pmcids_path = os.path.join(
        base_dir, "saved_data", "downloaded_pmcids.json"
    )

    # Only count real article files; hidden files such as .DS_Store would
    # otherwise produce bogus entries claiming "<stem>.xml" exists.
    article_pmcids = [
        os.path.splitext(f)[0] for f in os.listdir(articles_dir) if f.endswith(".xml")
    ]
    article_pmcids_mapping = {pmcid: f"{pmcid}.xml" for pmcid in article_pmcids}

    logger.info(f"Found {len(article_pmcids)} existing XML files in {articles_dir}")
    # Merge into the existing json file (or start fresh if missing/corrupt)
    if os.path.exists(downloaded_pmcids_path):
        with open(downloaded_pmcids_path, "r") as f:
            try:
                downloaded_pmcids = json.load(f)
            except json.JSONDecodeError:
                logger.error(
                    f"Error loading {downloaded_pmcids_path}. Creating new json file."
                )
                downloaded_pmcids = {}
    else:
        downloaded_pmcids = {}
    downloaded_pmcids.update(article_pmcids_mapping)
    with open(downloaded_pmcids_path, "w") as f:
        json.dump(downloaded_pmcids, f)
    logger.info(
        f"Updated {downloaded_pmcids_path} with {len(article_pmcids)} new PMCIDs"
    )


def download_articles(pmcids: list[str]):
    """
    Download articles from PubMed Central using PMCIDs.

    PMCIDs already recorded in saved_data/downloaded_pmcids.json are
    skipped (both successes and previous failures, which are stored as
    None). Each fetched article is written to saved_data/articles/<PMCID>.xml.

    Args:
        pmcids (list[str]): List of PMCIDs to download.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    saved_dir = os.path.join(base_dir, "saved_data", "articles")
    os.makedirs(saved_dir, exist_ok=True)

    # Load the downloaded PMCIDs from the json file
    downloaded_pmcids_path = os.path.join(
        base_dir, "saved_data", "downloaded_pmcids.json"
    )
    if os.path.exists(downloaded_pmcids_path):
        with open(downloaded_pmcids_path, "r") as f:
            downloaded_pmcids = json.load(f)
    else:
        downloaded_pmcids = {}

    new_pmcids = [pmcid for pmcid in pmcids if pmcid not in downloaded_pmcids]
    logger.warning(f"{len(downloaded_pmcids)} existing articles found")
    logger.info(f"{len(new_pmcids)} new articles to download")

    try:
        for pmcid in tqdm(new_pmcids):
            record = fetch_pmc_content(pmcid)
            if record:
                # Biopython's Entrez handle may return str or bytes depending
                # on version — normalise to str before writing.
                text = record if isinstance(record, str) else record.decode("utf-8")
                with open(os.path.join(saved_dir, f"{pmcid}.xml"), "w") as f:
                    f.write(text)
                downloaded_pmcids[pmcid] = f"{pmcid}.xml"
            else:
                downloaded_pmcids[pmcid] = None
                logger.warning(f"No record found for PMCID {pmcid}")
        logger.info(f"Downloaded {len(downloaded_pmcids)} articles")
    finally:
        # Persist progress even if the (potentially long) download loop is
        # interrupted, and reuse the path computed above instead of
        # rebuilding it by hand.
        with open(downloaded_pmcids_path, "w") as f:
            json.dump(downloaded_pmcids, f)


if __name__ == "__main__":
    # Sync the tracking file with any XML already on disk, then download
    # every unique PMCID that is not yet present locally.
    update_downloaded_pmcids()
    pmcids = get_unique_pmcids()
    download_articles(pmcids)
165 changes: 165 additions & 0 deletions src/fetch_articles/pmcid_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Standard library
import json
import os
import random
import time
from typing import Dict, List, Optional, Set

# Third-party (requests, time, and loguru were previously imported twice;
# consolidated here per PEP 8 — no names removed)
import pandas as pd
import requests
from Bio import Entrez
from dotenv import load_dotenv
from loguru import logger
from tqdm import tqdm

# Local
from src.load_data import get_pmid_list

load_dotenv()
# Email identifies this tool to NCBI Entrez (required by their usage policy)
Entrez.email = os.getenv("NCBI_EMAIL")


def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]:
    """
    Load the saved PMID→PMCID mapping from saved_data/pmcid_mapping.json.

    Returns:
        Dict mapping PMID (str) to PMCID (str), or None for PMIDs with no
        PMCID. Returns an empty dict when the file does not exist yet.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    # The mapping file lives under saved_data/ (see get_unique_pmcids and the
    # module README); the previous path pointed at the module directory itself.
    results_path = os.path.join(base_dir, "saved_data", "pmcid_mapping.json")
    if os.path.exists(results_path):
        with open(results_path, "r") as f:
            existing_results = json.load(f)
        logger.info(
            f"Loaded {len(existing_results)} existing PMCID mappings from {results_path}"
        )
    else:
        logger.info(
            f"No PMCID mapping found at {results_path}. Creating empty mapping."
        )
        existing_results = {}
    return existing_results


def batch_pmid_to_pmcid(
    pmids: List[str], email: str, batch_size: int = 100, delay: float = 0.4
) -> Dict[str, Optional[str]]:
    """
    Convert a list of PMIDs to PMCIDs using NCBI's ID Converter API.

    Previously-converted PMIDs (loaded via load_saved_pmcid_mapping) are
    skipped; new results are merged in and written back to
    saved_data/pmcid_mapping.json.

    Args:
        pmids: List of PMIDs (strings or ints; compared as strings).
        email: Your email address for NCBI tool identification.
        batch_size: Number of PMIDs to send per request (max: 200).
        delay: Seconds to wait between requests (default 0.4 to respect NCBI).

    Returns:
        Dict mapping each PMID (str) to a PMCID (or None if not available).
    """
    url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    results: Dict[str, Optional[str]] = {}
    existing_results = load_saved_pmcid_mapping()

    # Skip PMIDs that were already converted on a previous run
    existing_pmids = set(existing_results.keys())
    filtered_pmids = [x for x in pmids if str(x) not in existing_pmids]

    logger.info(f"Remaining PMIDs to process: {len(filtered_pmids)}")
    if len(filtered_pmids) == 0:
        logger.warning("No PMIDs to process. Exiting.")
        return existing_results

    # Process remaining PMIDs in batches
    for i in range(0, len(filtered_pmids), batch_size):
        batch = filtered_pmids[i : i + batch_size]
        ids_str = ",".join(str(pmid) for pmid in batch)
        logger.info(f"Processing PMIDs {i + 1} to {i + len(batch)}...")

        params = {
            "tool": "pmid2pmcid_tool",
            "email": email,
            "ids": ids_str,
            "format": "json",
        }

        try:
            # Timeout so a stalled connection cannot hang the whole run
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            for record in data.get("records", []):
                pmid = record.get("pmid")
                pmcid = record.get("pmcid")
                results[pmid] = pmcid if pmcid else None
                if pmcid:
                    logger.info(f"PMID {pmid} → PMCID {pmcid}")
                else:
                    logger.warning(f"PMID {pmid} has no PMCID available.")
        except Exception as e:
            logger.error(f"Failed batch starting at index {i}: {e}")
            # Record failures with str keys to match the API-derived keys.
            # NOTE(review): these None entries are persisted, so a transient
            # network failure means the batch is never retried — consider
            # leaving failed batches out of the saved mapping instead.
            for pmid in batch:
                results[str(pmid)] = None

        time.sleep(delay)

    # Merge existing results with new results
    existing_results.update(results)

    # Save under saved_data/ so load_saved_pmcid_mapping and
    # get_unique_pmcids read the same file this writes.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    saved_dir = os.path.join(base_dir, "saved_data")
    os.makedirs(saved_dir, exist_ok=True)
    results_path = os.path.join(saved_dir, "pmcid_mapping.json")
    with open(results_path, "w") as f:
        json.dump(existing_results, f)
    logger.info(f"Updated PMCID mappings saved to {results_path}")

    return existing_results


def get_unique_pmcids() -> List[str]:
    """
    Get the list of unique PMCIDs derived from the PMID→PMCID mapping.

    Returns the cached saved_data/unique_pmcids.json if it exists; otherwise
    regenerates the list from saved_data/pmcid_mapping.json — dropping PMIDs
    that have no PMCID (None values) — and caches it.

    NOTE: the cache is not refreshed when the mapping gains new entries;
    delete unique_pmcids.json to force a rebuild.

    Returns:
        Sorted list of unique PMCID strings.

    Raises:
        json.JSONDecodeError: if the cached unique_pmcids.json is corrupt.
        FileNotFoundError: if neither the cache nor the mapping file exists.
    """
    # Load the unique PMCIDs if they've already been saved
    base_dir = os.path.dirname(os.path.abspath(__file__))
    unique_pmcids_path = os.path.join(base_dir, "saved_data", "unique_pmcids.json")
    if os.path.exists(unique_pmcids_path):
        with open(unique_pmcids_path, "r") as f:
            try:
                pmcids = json.load(f)
            except json.JSONDecodeError as e:
                logger.error(
                    f"Error loading unique PMCIDs from {unique_pmcids_path}: {e}"
                )
                raise e
        logger.warning(
            f"Loaded {len(pmcids)} pre-existing unique PMCIDs from {unique_pmcids_path}"
        )
        return pmcids

    # Regenerate from pmcid_mapping.json if the cache doesn't exist
    results_path = os.path.join(base_dir, "saved_data", "pmcid_mapping.json")
    with open(results_path, "r") as f:
        existing_results = json.load(f)
    # Filter out None values (PMIDs without a PMCID) — previously a literal
    # None leaked into the list and was passed to the article downloader.
    # Sorted for a deterministic cache file.
    pmcids = sorted({pmcid for pmcid in existing_results.values() if pmcid})

    # Save the unique pmcids to a json file
    with open(unique_pmcids_path, "w") as f:
        json.dump(pmcids, f)
    logger.info(f"Unique PMCIDs saved to {unique_pmcids_path}")
    return pmcids


if __name__ == "__main__":
    # The PMID→PMCID conversion steps are currently commented out; this
    # entry point only reports how many unique PMCIDs the saved mapping
    # yields.
    # pmid_list = get_pmid_list()
    # results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))
    # logger.info(f"PMCID mapping complete. {len(results)} PMIDs mapped to PMCIDs.")
    pmcids = get_unique_pmcids()
    logger.info(f"Number of unique PMCIDs: {len(pmcids)}")
1 change: 1 addition & 0 deletions src/fetch_articles/saved_data/pmcid_mapping.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/fetch_articles/saved_data/unique_pmcids.json

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions src/load_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Load Data Module

This module handles the loading and preprocessing of PharmGKB clinical variants data.

## Methods

1. **`download_and_extract_variant_annotations(override: bool = False)`**
- Downloads and extracts the variant annotations ZIP file from PharmGKB
- Saves data to `saved_data/variantAnnotations/`
- Can override existing downloads if needed

2. **`load_variant_annotations_tsv(override: bool = False)`**
- Loads the variant annotations TSV file into a pandas DataFrame
- Automatically downloads data if not present
- Returns the DataFrame containing variant-drug annotations

3. **`unique_variants(df: pd.DataFrame)`**
- Helper function that generates a dictionary of unique values for each column
- Used for data analysis and validation

4. **`get_pmid_list(override: bool = False)`**
- Main function to extract PMIDs from the variant annotations
- Returns a list of unique PMIDs
- Caches results in `saved_data/pmid_list.json`
- Used as input for PMCID conversion

The module handles all data downloading, extraction, and preprocessing steps needed to get the PMID list for subsequent steps in the pipeline.

1 change: 1 addition & 0 deletions src/load_data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .load_clinical_variants import load_variant_annotations_tsv, get_pmid_list
Loading
Loading