19 changes: 16 additions & 3 deletions .gitignore
@@ -19,6 +19,19 @@ __pycache__
.env

# data
src/load_data/saved_data/
src/fetch_articles/saved_data/downloaded_pmcids.json
src/fetch_articles/saved_data/articles/
data/articles/
data/variantAnnotations/
data/unique_pmcids.json
data/pmid_list.json
data/downloaded_pmcids.json

*.zip
*.tar.gz
*.tar.bz2
*.tar.xz
*.tar.lzma
*.tar.lz
*.tar.lzo

.DS_Store

36 changes: 27 additions & 9 deletions README.MD
@@ -6,19 +6,37 @@

# AutoGKB


Goals:
1. Fetch the annotated articles referenced by the variantAnnotations data in the PharmGKB API
2. Create a general benchmark that outputs a score for an extraction system
   - Given: Article, ground-truth variants (manually extracted and recorded in `var_drug_ann.tsv`)
   - Input: Extracted variants
   - Output: Score
3. Build a system for extracting drug-related variant annotations from an article: associations in which the variant affects a drug's dose, response, metabolism, etc.
4. Continuously fetch new pharmacogenomic articles

## Description

This repository contains Python scripts for running and building a Pharmacogenomic Agentic system to annotate and label genetic variants based on their phenotypical associations from journal articles.


## Progress Tracker
| Task | Status |
| --- | --- |
| Download the zip of variants from pharmgkb | ✅ |
| Get a PMID list from the variants tsv (column PMID) | ✅ |
| Convert the PMID to PMCID | ✅ |
| Update to use non-official pmid to pmcid | |
| Fetch the content from the PMCID | |
| Create pairing of annotations to article | |
| Category | Task | Status |
| --- | --- | --- |
| Initial Download | Download the zip of variants from pharmgkb | ✅ |
| | Get a PMID list from the variants tsv (column PMID) | ✅ |
| | Convert the PMID to PMCID | ✅ |
| | Update to use non-official pmid to pmcid (Aaron's method) | |
| | Fetch the content from the PMCID | ✅ |
| Benchmark | Create pairings of annotations to articles | |
| | Create a naive score of the number of matches | |
| | Create a group-wise score | |
| | Look into advanced scoring based on distance from truth per term | |
| Workflows | Integrate Aaron's current approach | |
| | Document individual annotation meanings | |
| | Delegate annotation groupings to team members | |
| New Article Fetching | Replicate PharmGKB's current workflow | |

## System Overview
![Annotations Diagram](assets/annotations_diagram.svg)

1 change: 1 addition & 0 deletions assets/annotations_diagram.svg
17 changes: 17 additions & 0 deletions data/README.md
@@ -0,0 +1,17 @@
# Data

This directory contains the primary data files used by the AutoGKB project.

## Directory Structure

- **articles/** - Contains XML files of articles from PubMed Central (PMC), identified by their PMCID (e.g., PMC1234567.xml). These articles are used for text mining and information extraction.

- **variantAnnotations/** - Contains clinical variant annotations and related data:
- `var_drug_ann.tsv` - Variant-drug annotations; this is the file used in this repo.
  - It can be downloaded using `download_and_extract_variant_annotations` from the `load_variants` module

- **Support Files**:
- `pmcid_mapping.json` - Maps between PMIDs and PMCIDs
- `unique_pmcids.json` - List of unique PMCIDs in the dataset
- `pmid_list.json` - List of PMIDs in the dataset
- `downloaded_pmcids.json` - Tracking which PMCIDs have been downloaded
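A rough sketch of how these support files relate (the sample PMID/PMCID values below are made up for illustration):

```python
# Hypothetical slice of pmcid_mapping.json: PMID -> PMCID, or None when no
# PMC version of the article exists
pmcid_mapping = {"12345678": "PMC1234567", "23456789": None}

# unique_pmcids.json is derived from the mapping: deduplicated, non-null PMCIDs
unique_pmcids = sorted({pmcid for pmcid in pmcid_mapping.values() if pmcid})
print(unique_pmcids)  # ['PMC1234567']
```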
File renamed without changes.
4 changes: 3 additions & 1 deletion pixi.toml
Expand Up @@ -12,7 +12,9 @@ platforms = ["osx-arm64"]
version = "0.1.0"

[tasks]
update-downloaded-pmcids = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
download-variants = "python -m src.load_variants.load_clinical_variants"
update-download-map = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
download-articles = "python -m src.fetch_articles.article_downloader"

[dependencies]
seaborn = ">=0.13.2,<0.14"
4 changes: 4 additions & 0 deletions src/benchmark/README.md
@@ -0,0 +1,4 @@
# Benchmark

## Functions
1. Calculate the naive difference between an extracted variant and the ground-truth variant on Variant Annotation ID
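A minimal sketch of what that naive score could look like (the function name and scoring choice here are assumptions, not the final implementation):

```python
def naive_match_score(extracted_ids, truth_ids):
    """Fraction of ground-truth Variant Annotation IDs recovered (hypothetical helper)."""
    truth = set(truth_ids)
    if not truth:
        return 0.0
    return len(set(extracted_ids) & truth) / len(truth)

# One of the two ground-truth IDs is recovered -> 0.5
print(naive_match_score(["981755803", "981755804"], ["981755803", "981755999"]))
```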
Empty file added src/benchmark/__init__.py
Empty file.
8 changes: 8 additions & 0 deletions src/dataset/README.md
@@ -0,0 +1,8 @@
# Dataset

## Goal
Convert the loaded files into a dataset where the annotations and raw text are paired with each other

## Subgoals
1. Understand the formats of the annotations
2. Choose a format for the dataset
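One possible shape for such a pairing, keyed on PMID (keys and sample values are illustrative, not a committed format):

```python
# Each record joins one annotation row (from var_drug_ann.tsv) with the raw
# text of the article it came from
annotations = [{"Variant Annotation ID": "981755803", "PMID": "12345678"}]
articles = {"12345678": "Full text of the article..."}

dataset = [
    {"annotation": ann, "text": articles.get(ann["PMID"])}
    for ann in annotations
]
print(dataset[0]["annotation"]["Variant Annotation ID"])  # 981755803
```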
Empty file added src/dataset/__init__.py
Empty file.
160 changes: 154 additions & 6 deletions src/fetch_articles/README.md
@@ -1,13 +1,161 @@
# PubMed Document Fetching

## Goal
Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues
Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues.

## Process Overview
1. Download the zip of variants from pharmgkb (handled in load_data module)
2. Get a PMID list from the variants tsv (column PMID) (handled in load_data module)
1. Download the zip of variants from pharmgkb (handled in load_variants module)
2. Get a PMID list from the variants tsv (column PMID) (handled in load_variants module)
3. Convert the PMID to PMCID
4. Fetch the content from the PMCID

## Saved Data
pmcid_mapping.json: Maps the PMID to the PMCID {"PMID": "PMCID" or Null, ..}
unique_pmcids.json: List of all the unique PMCIDs from pmcid_mapping.json (["PMCID1", "PMCID2", ...])
## Key Functions

### PMCID Converter (`pmcid_converter.py`)

- `batch_pmid_to_pmcid(pmids, email, batch_size, delay)`: Converts a list of PMIDs to PMCIDs using NCBI's ID Converter API. Processes PMIDs in batches and handles rate limiting.
- Arguments:
- `pmids`: List of PMIDs (as strings)
- `email`: Your email for NCBI tool identification
- `batch_size`: Number of PMIDs per request (max: 200)
- `delay`: Seconds between requests (default: 0.4)
- Returns: Dict mapping each PMID to PMCID (or None if not available)

- `get_unique_pmcids()`: Returns a list of unique PMCIDs from the PMCID mapping file.

- `load_saved_pmcid_mapping()`: Loads previously saved PMCID mappings from disk.

- `get_project_root()`: Returns the project root directory path.

### Article Downloader (`article_downloader.py`)

- `fetch_pmc_content(pmcid)`: Fetches a single article's content from PubMed Central.
- Arguments:
- `pmcid`: The PubMed Central ID to fetch
- Returns: Article content in XML format or None if fetching failed

- `download_articles(pmcids)`: Downloads multiple articles from PubMed Central.
- Arguments:
- `pmcids`: List of PMCIDs to download
- Saves downloaded articles to `data/articles/` as XML files
- Tracks downloaded PMCIDs to avoid duplicating work

- `update_downloaded_pmcids()`: Updates tracking of downloaded PMCIDs from files in `data/articles/` directory.

## Created Data
- `pmcid_mapping.json`: Maps the PMID to the PMCID `{"PMID": "PMCID" or Null, ..}`
- `unique_pmcids.json`: List of all the unique PMCIDs from pmcid_mapping.json `["PMCID1", "PMCID2", ...]`
- `downloaded_pmcids.json`: Maps PMCIDs to filenames or None if download failed `{"PMCID": "PMCID.xml" or null, ..}`
- `<articles>.xml`: Downloaded articles

## Usage Examples

### Convert PMIDs to PMCIDs

```python
from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid
from src.load_variants import get_pmid_list
import os
from dotenv import load_dotenv

load_dotenv() # Load environment variables (NCBI_EMAIL)

# Get list of PMIDs from variant data
pmid_list = get_pmid_list()

# Convert PMIDs to PMCIDs
pmcid_mapping = batch_pmid_to_pmcid(
    pmids=pmid_list,
    email=os.getenv("NCBI_EMAIL"),
    batch_size=100,
    delay=0.4
)

print(f"Successfully mapped {len(pmcid_mapping)} PMIDs to PMCIDs")
```

### Download Articles Using PMCIDs

```python
from src.fetch_articles.article_downloader import download_articles
from src.fetch_articles.pmcid_converter import get_unique_pmcids

# Get unique PMCIDs from saved mapping
pmcids = get_unique_pmcids()

# Download articles
download_articles(pmcids)
```

### Download a Single Article

```python
from src.fetch_articles.article_downloader import fetch_pmc_content
from src.fetch_articles.pmcid_converter import get_project_root
import os

# Get project root
project_root = get_project_root()

# Fetch a single article
pmcid = "PMC1234567"
content = fetch_pmc_content(pmcid)

if content:
    # Save the article content
    articles_dir = project_root / "data" / "articles"
    os.makedirs(articles_dir, exist_ok=True)

    with open(articles_dir / f"{pmcid}.xml", "w") as f:
        f.write(content.decode("utf-8"))
    print(f"Successfully downloaded article {pmcid}")
else:
    print(f"Failed to download article {pmcid}")
```

### Update Downloaded PMCIDs

```python
from src.fetch_articles.article_downloader import update_downloaded_pmcids

# Update downloaded_pmcids.json with articles in data/articles/
update_downloaded_pmcids()
```

## Full Pipeline Execution

To run the complete pipeline (convert PMIDs to PMCIDs and download articles):

```python
# Full pipeline from PMIDs to downloaded articles
from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid
from src.fetch_articles.article_downloader import download_articles
from src.load_variants import get_pmid_list
import os
from dotenv import load_dotenv

load_dotenv()

# 1. Get PMIDs from variant data
pmid_list = get_pmid_list()

# 2. Convert PMIDs to PMCIDs
pmcid_mapping = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))

# 3. Extract only valid PMCIDs (not None)
valid_pmcids = [pmcid for pmcid in pmcid_mapping.values() if pmcid]

# 4. Download articles
download_articles(valid_pmcids)
```

Alternatively, run the module scripts directly:

```bash
# First convert PMIDs to PMCIDs
python -m src.fetch_articles.pmcid_converter

# Then download articles
python -m src.fetch_articles.article_downloader
```
51 changes: 34 additions & 17 deletions src/fetch_articles/article_downloader.py
@@ -1,12 +1,22 @@
from loguru import logger
from src.fetch_articles.pmcid_converter import get_unique_pmcids
from src.utils.file_paths import get_project_root
from Bio import Entrez
import os
import json
from tqdm import tqdm


def fetch_pmc_content(pmcid):
    """
    Fetch content for a single article from PubMed Central.

    Args:
        pmcid (str): The PubMed Central ID to fetch

    Returns:
        bytes or None: The article content in XML format or None if fetching failed
    """
    try:
        handle = Entrez.efetch(db="pmc", id=pmcid, rettype="full", retmode="xml")
        record = handle.read()
@@ -19,18 +29,20 @@ def fetch_pmc_content(pmcid):

def update_downloaded_pmcids() -> None:
    """
    Update the downloaded_pmcids.json file with PMCIDs found in the saved_data/articles directory.
    Update the downloaded_pmcids.json file with PMCIDs found in the data/articles directory.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    downloaded_pmcids_path = os.path.join(
        base_dir, "saved_data", "downloaded_pmcids.json"
    )
    # Check for all the filenames in the saved_data/articles directory
    articles_dir = os.path.join(base_dir, "saved_data", "articles")
    project_root = get_project_root()
    downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json"

    # Check for all the filenames in the data/articles directory
    articles_dir = project_root / "data" / "articles"
    os.makedirs(articles_dir, exist_ok=True)

    article_pmcids = [f.split(".")[0] for f in os.listdir(articles_dir)]
    article_pmcids_mapping = {pmcid: f"{pmcid}.xml" for pmcid in article_pmcids}

    logger.info(f"Found {len(article_pmcids)} existing XML files in {articles_dir}")

    # Add the new PMCIDs to the json file
    if os.path.exists(downloaded_pmcids_path):
        with open(downloaded_pmcids_path, "r") as f:
@@ -43,9 +55,12 @@ def update_downloaded_pmcids() -> None:
                downloaded_pmcids = {}
    else:
        downloaded_pmcids = {}

    downloaded_pmcids.update(article_pmcids_mapping)

    with open(downloaded_pmcids_path, "w") as f:
        json.dump(downloaded_pmcids, f)

    logger.info(
        f"Updated {downloaded_pmcids_path} with {len(article_pmcids)} new PMCIDs"
    )
@@ -55,19 +70,18 @@ def download_articles(pmcids: list[str]):
    """
    Download articles from PubMed Central using PMCIDs.
    Keeps track of the PMCIDs that have been downloaded and skips them.
    Saves the downloaded articles to the saved_data/articles directory.
    Saves the downloaded articles to the data/articles directory.

    Args:
        pmcids (list[str]): List of PMCIDs to download.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    saved_dir = os.path.join(base_dir, "saved_data", "articles")
    os.makedirs(saved_dir, exist_ok=True)
    project_root = get_project_root()
    articles_dir = project_root / "data" / "articles"
    os.makedirs(articles_dir, exist_ok=True)

    # Load the downloaded PMCIDs from the json file
    downloaded_pmcids_path = os.path.join(
        base_dir, "saved_data", "downloaded_pmcids.json"
    )
    downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json"

    if os.path.exists(downloaded_pmcids_path):
        with open(downloaded_pmcids_path, "r") as f:
            downloaded_pmcids = json.load(f)
@@ -82,16 +96,19 @@ def download_articles(pmcids: list[str]):
    for pmcid in tqdm(new_pmcids):
        record = fetch_pmc_content(pmcid)
        if record:
            with open(os.path.join(saved_dir, f"{pmcid}.xml"), "w") as f:
            with open(articles_dir / f"{pmcid}.xml", "w") as f:
                f.write(record.decode("utf-8"))
            downloaded_pmcids[pmcid] = f"{pmcid}.xml"
        else:
            downloaded_pmcids[pmcid] = None
            logger.warning(f"No record found for PMCID {pmcid}")
    logger.info(f"Downloaded {len(downloaded_pmcids)} articles")

    logger.info(
        f"Downloaded {len(new_pmcids)} new articles, total articles: {len(downloaded_pmcids)}"
    )

    # Save the downloaded PMCIDs to a json file
    with open(os.path.join(base_dir, "saved_data", "downloaded_pmcids.json"), "w") as f:
    with open(downloaded_pmcids_path, "w") as f:
        json.dump(downloaded_pmcids, f)

