From 695255f63e5c40c23587295790e56fb011f04d03 Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Fri, 2 May 2025 17:14:19 -0700
Subject: [PATCH 01/15] feat: pixi commands, gitignore, progress tracker

---
 .gitignore | 11 ++++++++++-
 README.MD  |  2 +-
 pixi.toml  |  3 ++-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4122350..6c9c9f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,4 +21,13 @@ __pycache__
 # data
 src/load_data/saved_data/
 src/fetch_articles/saved_data/downloaded_pmcids.json
-src/fetch_articles/saved_data/articles/
\ No newline at end of file
+src/fetch_articles/saved_data/articles/
+*.zip
+*.tar.gz
+*.tar.bz2
+*.tar.xz
+*.tar.lzma
+*.tar.lz
+*.tar.lzo
+
+.DS_Store
diff --git a/README.MD b/README.MD
index c3bc5c9..135f8a7 100644
--- a/README.MD
+++ b/README.MD
@@ -20,5 +20,5 @@ This repository contains Python scripts for running and building a Pharmacogenom
 | Get a PMID list from the variants tsv (column PMID) | ✅ |
 | Convert the PMID to PMCID | ✅ |
 | Update to use non-official pmid to pmcid | |
-| Fetch the content from the PMCID |  |
+| Fetch the content from the PMCID | ✅ |
 | Create pairing of annotations to article | |
\ No newline at end of file
diff --git a/pixi.toml b/pixi.toml
index e6d800e..6c5d7fd 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -12,7 +12,8 @@ platforms = ["osx-arm64"]
 version = "0.1.0"
 
 [tasks]
-update-downloaded-pmcids = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
+update-download-map = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
+download-articles = "python -m src.fetch_articles.article_downloader"
 
 [dependencies]
 seaborn = ">=0.13.2,<0.14"

From 0d0282fffb3833f431e18cf59925fe6e04b1dfcb Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 5 May 2025 18:23:13 -0700
Subject: [PATCH 02/15] docs: goals breakdown

---
 README.MD | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.MD b/README.MD
index 135f8a7..fc110e9 100644
--- a/README.MD
+++ b/README.MD
@@ -6,7 +6,10 @@
 
 # AutoGKB
 
-
+Goals:
+1. Continously fetch publications on pharmacogenomic relationships
+2. Extract variants annotations from an article
+3. Create a general benchmark for an extraction system
 
 ## Description
 

From 0940450970e72014b29d3c1ba8dab947aec3df0b Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 5 May 2025 18:27:05 -0700
Subject: [PATCH 03/15] docs: update goals

---
 README.MD | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.MD b/README.MD
index fc110e9..e3e07f0 100644
--- a/README.MD
+++ b/README.MD
@@ -8,8 +8,11 @@
 
 Goals:
 1. Continously fetch publications on pharmacogenomic relationships
-2. Extract variants annotations from an article
-3. Create a general benchmark for an extraction system
+2. Extract drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc.
+3. Create a general benchmark for an extraction system that can output a score for an extraction system
+Given: Article, Ground Truth Variants (Manually extracted and recorded in var_drug_ann.tsv:)
+Input: Extraction System or Extracted Variants
+Output: Score 
 
 ## Description
 

From 1b29657924bc5a950f659e2bc0217d37c51815d9 Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 5 May 2025 18:28:31 -0700
Subject: [PATCH 04/15] docs: goals

---
 README.MD | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.MD b/README.MD
index e3e07f0..2e35714 100644
--- a/README.MD
+++ b/README.MD
@@ -7,12 +7,13 @@
 # AutoGKB
 
 Goals:
-1. Continously fetch publications on pharmacogenomic relationships
+1. Fetch annotated articles from variantAnnotations stored in PharmGKB API
 2. Extract drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc.
 3. Create a general benchmark for an extraction system that can output a score for an extraction system
 Given: Article, Ground Truth Variants (Manually extracted and recorded in var_drug_ann.tsv:)
 Input: Extraction System or Extracted Variants
 Output: Score 
+4. Continously fetch new pharmacogenomic articles
 
 ## Description
 

From 31cbdc8ad7c0fe6c9616f8fa976a238b108461af Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 5 May 2025 18:29:42 -0700
Subject: [PATCH 05/15] docs: goals

---
 README.MD | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.MD b/README.MD
index 2e35714..54f7b48 100644
--- a/README.MD
+++ b/README.MD
@@ -8,11 +8,11 @@
 
 Goals:
 1. Fetch annotated articles from variantAnnotations stored in PharmGKB API
-2. Extract drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc.
-3. Create a general benchmark for an extraction system that can output a score for an extraction system
+2. Create a general benchmark for an extraction system that can output a score for an extraction system
 Given: Article, Ground Truth Variants (Manually extracted and recorded in var_drug_ann.tsv:)
 Input: Extraction System or Extracted Variants
 Output: Score 
+3. System for extracting drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc.
 4. Continously fetch new pharmacogenomic articles
 
 ## Description

From 3d273f5ecef3f07e44cc0f333c362f7b687a4dd9 Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Thu, 15 May 2025 14:53:57 -0700
Subject: [PATCH 06/15] feat: dataset folder and progress trackers

---
 README.MD                               | 15 ++++++--
 src/benchmark/README.md                 |  4 +++
 src/benchmark/__init__.py               |  0
 src/benchmark/annotation.py             | 46 +++++++++++++++++++++++++
 src/dataset/README.md                   |  8 +++++
 src/dataset/__init__.py                 |  0
 src/load_data/README.md                 |  2 +-
 src/load_data/__init__.py               |  2 +-
 src/load_data/load_clinical_variants.py |  7 ++--
 9 files changed, 75 insertions(+), 9 deletions(-)
 create mode 100644 src/benchmark/README.md
 create mode 100644 src/benchmark/__init__.py
 create mode 100644 src/benchmark/annotation.py
 create mode 100644 src/dataset/README.md
 create mode 100644 src/dataset/__init__.py

diff --git a/README.MD b/README.MD
index 54f7b48..e326fd1 100644
--- a/README.MD
+++ b/README.MD
@@ -10,7 +10,7 @@ Goals:
 1. Fetch annotated articles from variantAnnotations stored in PharmGKB API
 2. Create a general benchmark for an extraction system that can output a score for an extraction system
 Given: Article, Ground Truth Variants (Manually extracted and recorded in var_drug_ann.tsv:)
-Input: Extraction System or Extracted Variants
+Input: Extracted Variants
 Output: Score 
 3. System for extracting drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc.
 4. Continously fetch new pharmacogenomic articles
@@ -26,6 +26,15 @@ This repository contains Python scripts for running and building a Pharmacogenom
 | Download the zip of variants from pharmgkb | ✅  |
 | Get a PMID list from the variants tsv (column PMID) | ✅ |
 | Convert the PMID to PMCID | ✅ |
-| Update to use non-official pmid to pmcid | |
+| Update to use non-official pmid to pmcid (aaron's method) | |
 | Fetch the content from the PMCID | ✅ |
-| Create pairing of annotations to article | |
\ No newline at end of file
+| Create pairings of annotations to articles | |
+| Create a niave score of number of matches | |
+| Create group wise score | |
+| Look into advanced scoring based on distance from truth per term | |
+
+
+## Notes
+### 5/15
+Current state of the repo:
+- 
\ No newline at end of file
diff --git a/src/benchmark/README.md b/src/benchmark/README.md
new file mode 100644
index 0000000..d12e18e
--- /dev/null
+++ b/src/benchmark/README.md
@@ -0,0 +1,4 @@
+# Benchmark
+
+## Functions
+1. Calculate the naive difference between an extracted variant and the ground truth variant on Variant Annotation ID
diff --git a/src/benchmark/__init__.py b/src/benchmark/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/benchmark/annotation.py b/src/benchmark/annotation.py
new file mode 100644
index 0000000..87b0da0
--- /dev/null
+++ b/src/benchmark/annotation.py
@@ -0,0 +1,46 @@
+from pydantic import BaseModel
+from src.load_data import load_raw_variant_annotations
+
+"""
+Denotes a class for a variant annotation (row in var_drug_ann.tsv)
+"""
+
+class VariantAnnotation(BaseModel):
+    variant_annotation_id: str
+    variant_haplotypes: str
+    gene: str
+    drug: str
+    pmid: str
+    phenotype_category: str
+    significance: str
+    notes: str
+    sentence: str
+    alleles: str
+    specialty_population: str
+    metabolizer_types: str
+    phenotype_category: str
+    significance: str
+    notes: str
+    sentence: str
+    alleles: str
+    specialty_population: str
+    metabolizer_types: str
+    is_plural: str
+    is_associated: str
+    direction_of_effect: str
+    pd_pk_terms: str
+    multiple_drugs_and_or: str
+    population_types: str
+    population_phenotypes_or_diseases: str
+    multiple_phenotypes_or_diseases_and_or: str
+    comparison_alleles_or_genotypes: str
+    comparison_metabolizer_types: str
+    
+
+    
+"""
+1. Load the ground truth variants
+2. Load the extracted variants
+3. Calculate the naive difference between an extracted variant and the ground truth variant on Variant Annotation ID
+"""
+
diff --git a/src/dataset/README.md b/src/dataset/README.md
new file mode 100644
index 0000000..8a30cd7
--- /dev/null
+++ b/src/dataset/README.md
@@ -0,0 +1,8 @@
+# Dataset
+
+## Goal
+Convert the loaded files into a dataset where the annotations and raw text are paired with each other
+
+## Subgoals
+1. Understand the formats of the annotations
+2. Choose a format for the dataset
diff --git a/src/dataset/__init__.py b/src/dataset/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/load_data/README.md b/src/load_data/README.md
index 997987a..311203f 100644
--- a/src/load_data/README.md
+++ b/src/load_data/README.md
@@ -9,7 +9,7 @@ This module handles the loading and preprocessing of PharmGKB clinical variants
    - Saves data to `saved_data/variantAnnotations/`
    - Can override existing downloads if needed
 
-2. **`load_variant_annotations_tsv(override: bool = False)`**
+2. **`load_raw_variant_annotations(override: bool = False)`**
    - Loads the variant annotations TSV file into a pandas DataFrame
    - Automatically downloads data if not present
    - Returns the DataFrame containing variant-drug annotations
diff --git a/src/load_data/__init__.py b/src/load_data/__init__.py
index c2694d6..90cbc32 100644
--- a/src/load_data/__init__.py
+++ b/src/load_data/__init__.py
@@ -1 +1 @@
-from .load_clinical_variants import load_variant_annotations_tsv, get_pmid_list
+from .load_clinical_variants import load_raw_variant_annotations, get_pmid_list
\ No newline at end of file
diff --git a/src/load_data/load_clinical_variants.py b/src/load_data/load_clinical_variants.py
index 4176343..492abfc 100644
--- a/src/load_data/load_clinical_variants.py
+++ b/src/load_data/load_clinical_variants.py
@@ -49,7 +49,7 @@ def download_and_extract_variant_annotations(override: bool = False) -> str:
     return extract_dir
 
 
-def load_variant_annotations_tsv(override: bool = False) -> pd.DataFrame:
+def load_raw_variant_annotations(override: bool = False) -> pd.DataFrame:
     """
     Loads the variant annotations tsv file.
     If the file does not exist, it will be downloaded and extracted.
@@ -112,7 +112,7 @@ def load_unique_variants(save_results: bool = True) -> dict:
         logger.info(
             f"Unique variants not found at {unique_variants_path}. Loading from tsv file..."
         )
-        df = load_variant_annotations_tsv()
+        df = load_raw_variant_annotations()
         unique_values_per_column = unique_variants(df)
         if save_results:
             logger.info(f"Saving unique variants to {unique_variants_path}")
@@ -132,14 +132,13 @@ def get_pmid_list(override: bool = False) -> list:
         with open(pmid_list_path, "r") as f:
             pmid_list = json.load(f)
     else:
-        df = load_variant_annotations_tsv(override)
+        df = load_raw_variant_annotations(override)
         pmid_list = df["PMID"].unique().tolist()
         logger.info(f"Saving PMIDs to {pmid_list_path}")
         with open(pmid_list_path, "w") as f:
             json.dump(pmid_list, f)
     return pmid_list
 
-
 if __name__ == "__main__":
     pmid_list = get_pmid_list()
     print(f"Number of unique PMIDs: {len(pmid_list)}")

From 1084188ccdd3eb5daffcc82ab858d7f6f2d85301 Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 19 May 2025 15:58:30 -0700
Subject: [PATCH 07/15] docs: progress tracker

---
 README.MD | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/README.MD b/README.MD
index e326fd1..3ae321f 100644
--- a/README.MD
+++ b/README.MD
@@ -21,17 +21,20 @@ This repository contains Python scripts for running and building a Pharmacogenom
 
 
 ## Progress Tracker
-| Task | Status |
-| --- | --- |
-| Download the zip of variants from pharmgkb | ✅  |
-| Get a PMID list from the variants tsv (column PMID) | ✅ |
-| Convert the PMID to PMCID | ✅ |
-| Update to use non-official pmid to pmcid (aaron's method) | |
-| Fetch the content from the PMCID | ✅ |
-| Create pairings of annotations to articles | |
-| Create a niave score of number of matches | |
-| Create group wise score | |
-| Look into advanced scoring based on distance from truth per term | |
+| Category | Task | Status |
+| --- | --- | --- |
+| Initial Download | Download the zip of variants from pharmgkb | ✅  |
+|                  | Get a PMID list from the variants tsv (column PMID) | ✅ |
+|                  | Convert the PMID to PMCID | ✅ |
+|                  | Update to use non-official pmid to pmcid (Aaron's method) | |
+|                  | Fetch the content from the PMCID | ✅ |
+| Benchmark        | Create pairings of annotations to articles | |
+|                  | Create a naive score of number of matches | |
+|                  | Create group wise score | |
+|                  | Look into advanced scoring based on distance from truth per term | |
+| Workflows        | Integrate Aaron's current approach | |
+|                  | Document on individual annotation meanings | |
+|                  | Delegate annotation groupings to team members | |
 
 
 ## Notes

From 96e099c046a30a9cdceb4bdd4abb3f1f3e1fd345 Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 19 May 2025 15:59:09 -0700
Subject: [PATCH 08/15] docs: cleaned up readme

---
 README.MD | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/README.MD b/README.MD
index 3ae321f..7a6b7e5 100644
--- a/README.MD
+++ b/README.MD
@@ -36,8 +36,3 @@ This repository contains Python scripts for running and building a Pharmacogenom
 |                  | Document on individual annotation meanings | |
 |                  | Delegate annotation groupings to team members | |
 
-
-## Notes
-### 5/15
-Current state of the repo:
-- 
\ No newline at end of file

From 50c9e6aa4548760f0c868ed33b6a5aedf912df1e Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 19 May 2025 16:00:02 -0700
Subject: [PATCH 09/15] docs: progress tracker

---
 README.MD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.MD b/README.MD
index 7a6b7e5..c7ad26e 100644
--- a/README.MD
+++ b/README.MD
@@ -35,4 +35,5 @@ This repository contains Python scripts for running and building a Pharmacogenom
 | Workflows        | Integrate Aaron's current approach | |
 |                  | Document on individual annotation meanings | |
 |                  | Delegate annotation groupings to team members | |
+| New Article Fetching | Replicate PharmGKB's current workflow | |
 

From bf89b08b7f37c1eb7a54753d15c92d497aa85d42 Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 19 May 2025 17:02:22 -0700
Subject: [PATCH 10/15] feat: moved data to root

---
 .gitignore                                    |   6 +
 data/README.md                                |  16 ++
 .../saved_data => data}/pmcid_mapping.json    |   0
 src/fetch_articles/README.md                  | 156 +++++++++++++++++-
 src/fetch_articles/article_downloader.py      |  51 ++++--
 src/fetch_articles/pmcid_converter.py         |  53 ++++--
 .../saved_data/unique_pmcids.json             |   1 -
 src/load_data/load_clinical_variants.py       |  13 +-
 src/utils/__init__.py                         |   1 +
 src/utils/file_paths.py                       |  10 ++
 10 files changed, 261 insertions(+), 46 deletions(-)
 create mode 100644 data/README.md
 rename {src/fetch_articles/saved_data => data}/pmcid_mapping.json (100%)
 delete mode 100644 src/fetch_articles/saved_data/unique_pmcids.json
 create mode 100644 src/utils/__init__.py
 create mode 100644 src/utils/file_paths.py

diff --git a/.gitignore b/.gitignore
index 6c9c9f4..662a2c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,9 @@ src/fetch_articles/saved_data/articles/
 *.tar.lzo
 
 .DS_Store
+
+data/articles/
+data/variantAnnotations/
+data/unique_pmcids.json
+data/pmid_list.json
+data/downloaded_pmcids.json
\ No newline at end of file
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..1d26479
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,16 @@
+# Data
+
+This directory contains the primary data files used by the AutoGKB project.
+
+## Directory Structure
+
+- **articles/** - Contains XML files of articles from PubMed Central (PMC), identified by their PMCID (e.g., PMC1234567.xml). These articles are used for text mining and information extraction.
+
+- **variantAnnotations/** - Contains clinical variant annotations and related data:
+  - `var_drug_ann.tsv` - Variant-drug annotations. This is what is used in this repo.
+
+- **Support Files**:
+  - `pmcid_mapping.json` - Maps between PMIDs and PMCIDs
+  - `unique_pmcids.json` - List of unique PMCIDs in the dataset
+  - `pmid_list.json` - List of PMIDs in the dataset
+  - `downloaded_pmcids.json` - Tracking which PMCIDs have been downloaded
\ No newline at end of file
diff --git a/src/fetch_articles/saved_data/pmcid_mapping.json b/data/pmcid_mapping.json
similarity index 100%
rename from src/fetch_articles/saved_data/pmcid_mapping.json
rename to data/pmcid_mapping.json
diff --git a/src/fetch_articles/README.md b/src/fetch_articles/README.md
index eed6e26..dbade90 100644
--- a/src/fetch_articles/README.md
+++ b/src/fetch_articles/README.md
@@ -1,6 +1,7 @@
 # PubMed Document Fetching
+
 ## Goal
-Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues
+Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues.
 
 ## Process Overview
 1. Download the zip of variants from pharmgkb (handled in load_data module)
@@ -8,6 +9,153 @@ Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall
 3. Convert the PMID to PMCID 
 4. Fetch the content from the PMCID
 
-## Saved Data
-pmcid_mapping.json: Maps the PMID to the PMCID {"PMID": "PMCID" or Null, ..}
-unique_pmcids.json: List of all the unique PMCIDs from pmcid_mapping.json (["PMCID1", "PMCID2", ...])
\ No newline at end of file
+## Key Functions
+
+### PMCID Converter (`pmcid_converter.py`)
+
+- `batch_pmid_to_pmcid(pmids, email, batch_size, delay)`: Converts a list of PMIDs to PMCIDs using NCBI's ID Converter API. Processes PMIDs in batches and handles rate limiting.
+  - Arguments:
+    - `pmids`: List of PMIDs (as strings)
+    - `email`: Your email for NCBI tool identification
+    - `batch_size`: Number of PMIDs per request (max: 200)
+    - `delay`: Seconds between requests (default: 0.4)
+  - Returns: Dict mapping each PMID to PMCID (or None if not available)
+
+- `get_unique_pmcids()`: Returns a list of unique PMCIDs from the PMCID mapping file.
+
+- `load_saved_pmcid_mapping()`: Loads previously saved PMCID mappings from disk.
+
+- `get_project_root()`: Returns the project root directory path.
+
+### Article Downloader (`article_downloader.py`)
+
+- `fetch_pmc_content(pmcid)`: Fetches a single article's content from PubMed Central.
+  - Arguments:
+    - `pmcid`: The PubMed Central ID to fetch
+  - Returns: Article content in XML format or None if fetching failed
+
+- `download_articles(pmcids)`: Downloads multiple articles from PubMed Central.
+  - Arguments:
+    - `pmcids`: List of PMCIDs to download
+  - Saves downloaded articles to `data/articles/` as XML files
+  - Tracks downloaded PMCIDs to avoid duplicating work
+
+- `update_downloaded_pmcids()`: Updates tracking of downloaded PMCIDs from files in `data/articles/` directory.
+
+## Created Data
+- `pmcid_mapping.json`: Maps the PMID to the PMCID `{"PMID": "PMCID" or null, ..}`
+- `unique_pmcids.json`: List of all the unique PMCIDs from pmcid_mapping.json `["PMCID1", "PMCID2", ...]`
+- `downloaded_pmcids.json`: Maps PMCIDs to filenames or None if download failed `{"PMCID": "PMCID.xml" or null, ..}`
+- `<articles>.xml`: Downloaded articles
+
+## Usage Examples
+
+### Convert PMIDs to PMCIDs
+
+```python
+from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid
+from src.load_data import get_pmid_list
+import os
+from dotenv import load_dotenv
+
+load_dotenv()  # Load environment variables (NCBI_EMAIL)
+
+# Get list of PMIDs from variant data
+pmid_list = get_pmid_list()
+
+# Convert PMIDs to PMCIDs
+pmcid_mapping = batch_pmid_to_pmcid(
+    pmids=pmid_list,
+    email=os.getenv("NCBI_EMAIL"),
+    batch_size=100,
+    delay=0.4
+)
+
+print(f"Successfully mapped {len(pmcid_mapping)} PMIDs to PMCIDs")
+```
+
+### Download Articles Using PMCIDs
+
+```python
+from src.fetch_articles.article_downloader import download_articles
+from src.fetch_articles.pmcid_converter import get_unique_pmcids
+
+# Get unique PMCIDs from saved mapping
+pmcids = get_unique_pmcids()
+
+# Download articles
+download_articles(pmcids)
+```
+
+### Download a Single Article
+
+```python
+from src.fetch_articles.article_downloader import fetch_pmc_content
+from src.fetch_articles.pmcid_converter import get_project_root
+import os
+from pathlib import Path
+
+# Get project root
+project_root = get_project_root()
+
+# Fetch a single article
+pmcid = "PMC1234567"
+content = fetch_pmc_content(pmcid)
+
+if content:
+    # Save the article content
+    articles_dir = project_root / "data" / "articles"
+    os.makedirs(articles_dir, exist_ok=True)
+    
+    with open(articles_dir / f"{pmcid}.xml", "w") as f:
+        f.write(content.decode("utf-8"))
+    print(f"Successfully downloaded article {pmcid}")
+else:
+    print(f"Failed to download article {pmcid}")
+```
+
+### Update Downloaded PMCIDs
+
+```python
+from src.fetch_articles.article_downloader import update_downloaded_pmcids
+
+# Update downloaded_pmcids.json with articles in data/articles/
+update_downloaded_pmcids()
+```
+
+## Full Pipeline Execution
+
+To run the complete pipeline (convert PMIDs to PMCIDs and download articles):
+
+```python
+# Full pipeline from PMIDs to downloaded articles
+from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid
+from src.fetch_articles.article_downloader import download_articles
+from src.load_data import get_pmid_list
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# 1. Get PMIDs from variant data
+pmid_list = get_pmid_list()
+
+# 2. Convert PMIDs to PMCIDs
+pmcid_mapping = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))
+
+# 3. Extract only valid PMCIDs (not None)
+valid_pmcids = [pmcid for pmcid in pmcid_mapping.values() if pmcid]
+
+# 4. Download articles
+download_articles(valid_pmcids)
+```
+
+Alternatively, run the module scripts directly:
+
+```bash
+# First convert PMIDs to PMCIDs
+python -m src.fetch_articles.pmcid_converter
+
+# Then download articles
+python -m src.fetch_articles.article_downloader
+```
\ No newline at end of file
diff --git a/src/fetch_articles/article_downloader.py b/src/fetch_articles/article_downloader.py
index 13c5cf1..c9179f7 100644
--- a/src/fetch_articles/article_downloader.py
+++ b/src/fetch_articles/article_downloader.py
@@ -1,5 +1,6 @@
 from loguru import logger
 from src.fetch_articles.pmcid_converter import get_unique_pmcids
+from src.utils.file_paths import get_project_root
 from Bio import Entrez
 import os
 import json
@@ -7,6 +8,15 @@
 
 
 def fetch_pmc_content(pmcid):
+    """
+    Fetch content for a single article from PubMed Central.
+    
+    Args:
+        pmcid (str): The PubMed Central ID to fetch
+        
+    Returns:
+        bytes or None: The article content in XML format or None if fetching failed
+    """
     try:
         handle = Entrez.efetch(db="pmc", id=pmcid, rettype="full", retmode="xml")
         record = handle.read()
@@ -19,18 +29,20 @@ def fetch_pmc_content(pmcid):
 
 def update_downloaded_pmcids() -> None:
     """
-    Update the downloaded_pmcids.json file with PMCIDs found in the saved_data/articles directory.
+    Update the downloaded_pmcids.json file with PMCIDs found in the data/articles directory.
     """
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    downloaded_pmcids_path = os.path.join(
-        base_dir, "saved_data", "downloaded_pmcids.json"
-    )
-    # Check for all the filenames in the saved_data/articles directory
-    articles_dir = os.path.join(base_dir, "saved_data", "articles")
+    project_root = get_project_root()
+    downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json"
+    
+    # Check for all the filenames in the data/articles directory
+    articles_dir = project_root / "data" / "articles"
+    os.makedirs(articles_dir, exist_ok=True)
+    
     article_pmcids = [f.split(".")[0] for f in os.listdir(articles_dir)]
     article_pmcids_mapping = {pmcid: f"{pmcid}.xml" for pmcid in article_pmcids}
 
     logger.info(f"Found {len(article_pmcids)} existing XML files in {articles_dir}")
+    
     # Add the new PMCIDs to the json file
     if os.path.exists(downloaded_pmcids_path):
         with open(downloaded_pmcids_path, "r") as f:
@@ -43,9 +55,12 @@ def update_downloaded_pmcids() -> None:
                 downloaded_pmcids = {}
     else:
         downloaded_pmcids = {}
+        
     downloaded_pmcids.update(article_pmcids_mapping)
+    
     with open(downloaded_pmcids_path, "w") as f:
         json.dump(downloaded_pmcids, f)
+        
     logger.info(
         f"Updated {downloaded_pmcids_path} with {len(article_pmcids)} new PMCIDs"
     )
@@ -55,19 +70,18 @@ def download_articles(pmcids: list[str]):
     """
     Download articles from PubMed Central using PMCIDs.
     Keeps track of the PMCIDs that have been downloaded and skips them.
-    Saves the downloaded articles to the saved_data/articles directory.
+    Saves the downloaded articles to the data/articles directory.
 
     Args:
         pmcids (list[str]): List of PMCIDs to download.
     """
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    saved_dir = os.path.join(base_dir, "saved_data", "articles")
-    os.makedirs(saved_dir, exist_ok=True)
+    project_root = get_project_root()
+    articles_dir = project_root / "data" / "articles"
+    os.makedirs(articles_dir, exist_ok=True)
 
     # Load the downloaded PMCIDs from the json file
-    downloaded_pmcids_path = os.path.join(
-        base_dir, "saved_data", "downloaded_pmcids.json"
-    )
+    downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json"
+    
     if os.path.exists(downloaded_pmcids_path):
         with open(downloaded_pmcids_path, "r") as f:
             downloaded_pmcids = json.load(f)
@@ -82,20 +96,21 @@ def download_articles(pmcids: list[str]):
     for pmcid in tqdm(new_pmcids):
         record = fetch_pmc_content(pmcid)
         if record:
-            with open(os.path.join(saved_dir, f"{pmcid}.xml"), "w") as f:
+            with open(articles_dir / f"{pmcid}.xml", "w") as f:
                 f.write(record.decode("utf-8"))
             downloaded_pmcids[pmcid] = f"{pmcid}.xml"
         else:
             downloaded_pmcids[pmcid] = None
             logger.warning(f"No record found for PMCID {pmcid}")
-    logger.info(f"Downloaded {len(downloaded_pmcids)} articles")
+    
+    logger.info(f"Downloaded {len(new_pmcids)} new articles, total articles: {len(downloaded_pmcids)}")
 
     # Save the downloaded PMCIDs to a json file
-    with open(os.path.join(base_dir, "saved_data", "downloaded_pmcids.json"), "w") as f:
+    with open(downloaded_pmcids_path, "w") as f:
         json.dump(downloaded_pmcids, f)
 
 
 if __name__ == "__main__":
     update_downloaded_pmcids()
     pmcids = get_unique_pmcids()
-    download_articles(pmcids)
+    download_articles(pmcids)
\ No newline at end of file
diff --git a/src/fetch_articles/pmcid_converter.py b/src/fetch_articles/pmcid_converter.py
index dc5a57c..10b9347 100644
--- a/src/fetch_articles/pmcid_converter.py
+++ b/src/fetch_articles/pmcid_converter.py
@@ -7,6 +7,7 @@
 import os
 from src.load_data import get_pmid_list
 import json
+from src.utils.file_paths import get_project_root
 
 load_dotenv()
 # Email for NCBI
@@ -22,12 +23,19 @@
 from typing import List, Set, Dict, Optional
 
 
+
+
+
 def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]:
     """
     Load the saved PMCID mapping from the json file.
     """
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    results_path = os.path.join(base_dir, "pmcid_mapping.json")
+    project_root = get_project_root()
+    results_path = project_root / "data" / "pmcid_mapping.json"
+    
+    # Create data directory if it doesn't exist
+    os.makedirs(project_root / "data", exist_ok=True)
+    
     if os.path.exists(results_path):
         with open(results_path, "r") as f:
             existing_results = json.load(f)
@@ -43,7 +51,7 @@ def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]:
 
 
 def batch_pmid_to_pmcid(
-    pmids: List[str], email: str, batch_size: int = 100, delay: float = 0.4
+    pmids: List[str], email: str = os.getenv("NCBI_EMAIL"), batch_size: int = 100, delay: float = 0.4
 ) -> Dict[str, Optional[str]]:
     """
     Convert a list of PMIDs to PMCIDs using NCBI's ID Converter API.
@@ -110,8 +118,12 @@ def batch_pmid_to_pmcid(
     existing_results.update(results)
 
     # Save updated results
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    results_path = os.path.join(base_dir, "pmcid_mapping.json")
+    project_root = get_project_root()
+    results_path = project_root / "data" / "pmcid_mapping.json"
+    
+    # Create data directory if it doesn't exist
+    os.makedirs(project_root / "data", exist_ok=True)
+    
     with open(results_path, "w") as f:
         json.dump(existing_results, f)
     logger.info(f"Updated PMCID mappings saved to {results_path}")
@@ -125,9 +137,14 @@ def get_unique_pmcids() -> List[str]:
     NOTE: Could add functionality to check for new PMCIDs in mapping and update the unique_pmcids.json file
     Currently function returns the pre-existing unique PMCIDs if they exist or regenerates the list from the mapping.
     """
+    project_root = get_project_root()
+    
     # Load the unique PMCIDs if they've already been saved
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    unique_pmcids_path = os.path.join(base_dir, "saved_data", "unique_pmcids.json")
+    unique_pmcids_path = project_root / "data" / "unique_pmcids.json"
+    
+    # Create data directory if it doesn't exist
+    os.makedirs(project_root / "data", exist_ok=True)
+    
     if os.path.exists(unique_pmcids_path):
         with open(unique_pmcids_path, "r") as f:
             try:
@@ -143,14 +160,20 @@ def get_unique_pmcids() -> List[str]:
         return pmcids
 
     # Load from pmcid_mapping.json if unique pmcids haven't been saved
-    results_path = os.path.join(base_dir, "saved_data", "pmcid_mapping.json")
+    results_path = project_root / "data" / "pmcid_mapping.json"
+    
+    if not os.path.exists(results_path):
+        logger.error(f"No PMCID mapping found at {results_path}. Cannot generate unique PMCIDs.")
+        return []
+        
     with open(results_path, "r") as f:
         existing_results = json.load(f)
-    # get the unique pmcids
-    pmcids = list(set(existing_results.values()))
+    
+    # Get the unique pmcids (remove None values)
+    pmcids = [value for value in existing_results.values() if value is not None]
+    pmcids = list(set(pmcids))
 
     # Save the unique pmcids to a json file
-    unique_pmcids_path = os.path.join(base_dir, "saved_data", "unique_pmcids.json")
     with open(unique_pmcids_path, "w") as f:
         json.dump(pmcids, f)
     logger.info(f"Unique PMCIDs saved to {unique_pmcids_path}")
@@ -158,8 +181,8 @@ def get_unique_pmcids() -> List[str]:
 
 
 if __name__ == "__main__":
-    # pmid_list = get_pmid_list()
-    # results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))
-    # logger.info(f"PMCID mapping complete. {len(results)} PMIDs mapped to PMCIDs.")
+    pmid_list = get_pmid_list()
+    results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))
+    logger.info(f"PMCID mapping complete. {len(results)} PMIDs mapped to PMCIDs.")
     pmcids = get_unique_pmcids()
-    logger.info(f"Number of unique PMCIDs: {len(pmcids)}")
+    logger.info(f"Number of unique PMCIDs: {len(pmcids)}")
\ No newline at end of file
diff --git a/src/fetch_articles/saved_data/unique_pmcids.json b/src/fetch_articles/saved_data/unique_pmcids.json
deleted file mode 100644
index c1a43e7..0000000
--- a/src/fetch_articles/saved_data/unique_pmcids.json
+++ /dev/null
@@ -1 +0,0 @@
-["PMC11850035", "PMC2812115", "PMC2911553", "PMC5583388", "PMC4526634", "PMC6247602", "PMC11758033", "PMC2884029", "PMC3778124", "PMC3044738", "PMC5700353", "PMC1884342", "PMC6289816", "PMC4609097", "PMC1364741", "PMC2857717", "PMC10898793", "PMC4909584", "PMC4630174", "PMC5391214", "PMC10675244", "PMC6647927", "PMC5074472", "PMC11667419", "PMC8185249", "PMC2352037", "PMC4280295", "PMC11512548", "PMC7377539", "PMC11258238", "PMC7499297", "PMC5859345", "PMC11921366", "PMC3872414", "PMC3797132", "PMC5373545", "PMC8455325", "PMC3786328", "PMC3081375", "PMC11825576", "PMC10418744", "PMC1563530", "PMC4032230", "PMC5886039", "PMC3068061", "PMC48077", "PMC4537319", "PMC4730664", "PMC4713720", "PMC3433845", "PMC11241034", "PMC6142943", "PMC7710914", "PMC5496343", "PMC2966433", "PMC6777349", "PMC3310336", "PMC4278770", "PMC5904126", "PMC4342329", "PMC3594083", "PMC6714829", "PMC5496345", "PMC2048549", "PMC3574284", "PMC3454425", "PMC3657889", "PMC3555061", "PMC10618485", "PMC3692386", "PMC3902809", "PMC9468554", "PMC5309131", "PMC2757009", "PMC2592852", "PMC2928561", "PMC2762391", "PMC4583245", "PMC6432766", "PMC7616417", "PMC5749368", "PMC4522133", "PMC3983993", "PMC2920450", "PMC4181635", "PMC3703617", "PMC4702321", "PMC2879959", "PMC4746878", "PMC11871410", "PMC7164646", "PMC5203947", "PMC4867099", "PMC3434304", "PMC6462825", "PMC4435089", "PMC2681284", "PMC1952551", "PMC5866313", "PMC3016221", "PMC3837290", "PMC10848431", "PMC3565812", "PMC3787223", "PMC4226857", "PMC10675623", "PMC9768477", "PMC2485247", "PMC4503374", "PMC4565152", "PMC4206345", "PMC1873971", "PMC2364178", "PMC3230303", "PMC11016593", "PMC3622803", "PMC11666798", "PMC4956330", "PMC5354739", "PMC4307337", "PMC3836273", "PMC2292110", "PMC2630264", "PMC4959996", "PMC11421434", "PMC11787782", "PMC3561425", "PMC2921956", "PMC5538123", "PMC6612579", "PMC3539557", "PMC8553963", "PMC9585281", "PMC3938989", "PMC11552228", "PMC9501307", "PMC11203291", "PMC3786570", "PMC5514947", "PMC10566653", "PMC3944214", 
"PMC4476880", "PMC2733171", "PMC3164274", "PMC5142600", "PMC7274090", "PMC10537526", "PMC6493603", "PMC3567337", "PMC6587209", "PMC7455128", "PMC5324942", "PMC2666924", "PMC11544447", "PMC3603284", "PMC11786019", "PMC4560372", "PMC3734199", "PMC4525256", "PMC4488893", "PMC3786668", "PMC11803932", "PMC4513254", "PMC6005582", "PMC5940523", "PMC6923423", "PMC2913479", "PMC8429954", "PMC5306247", "PMC3529147", "PMC3358293", "PMC6038204", "PMC5541380", "PMC11717999", "PMC10179231", "PMC2966981", "PMC10995391", "PMC4254688", "PMC8702453", "PMC11257390", "PMC11052159", null, "PMC10858860", "PMC4154892", "PMC4613195", "PMC4387236", "PMC4628029", "PMC1754569", "PMC3381232", "PMC3579501", "PMC4716887", "PMC11855146", "PMC3093392", "PMC10668244", "PMC4437521", "PMC9875006", "PMC10532907", "PMC4890827", "PMC3639978", "PMC4876188", "PMC4868001", "PMC4330076", "PMC8758337", "PMC4533232", "PMC4982581", "PMC2432487", "PMC11160041", "PMC4134280", "PMC11913886", "PMC5558541", "PMC4011617", "PMC7005197", "PMC7674153", "PMC5678480", "PMC2773991", "PMC11492722", "PMC4503103", "PMC11063049", "PMC11082567", "PMC5862636", "PMC3617060", "PMC12043259", "PMC3279522", "PMC8438567", "PMC5871545", "PMC3625373", "PMC2756088", "PMC7260086", "PMC10298263", "PMC4270923", "PMC4390701", "PMC6003833", "PMC3769669", "PMC11049768", "PMC1755496", "PMC2952572", "PMC9536193", "PMC4762905", "PMC1773505", "PMC5386607", "PMC3390407", "PMC9914414", "PMC6357964", "PMC3946972", "PMC4023787", "PMC6523194", "PMC5483245", "PMC2673121", "PMC5903228", "PMC3369131", "PMC11528939", "PMC11159193", "PMC2291274", "PMC4296935", "PMC3158597", "PMC3273458", "PMC6174029", "PMC4490522", "PMC5148898", "PMC6248022", "PMC5468510", "PMC4015881", "PMC2853591", "PMC3396003", "PMC2515139", "PMC3292264", "PMC3632552", "PMC2680291", "PMC4938133", "PMC5614982", "PMC4812555", "PMC2492917", "PMC4616511", "PMC6987567", "PMC9608913", "PMC3555879", "PMC2943151", "PMC4448076", "PMC2766479", "PMC4835128", "PMC3944116", "PMC10931982", 
"PMC4452656", "PMC10159199", "PMC2561120", "PMC4613221", "PMC10214567", "PMC9801627", "PMC3505921", "PMC7968507", "PMC2966859", "PMC5743122", "PMC4641035", "PMC3760990", "PMC11059713", "PMC4892970", "PMC2722908", "PMC3746708", "PMC4601717", "PMC4462610", "PMC1874463", "PMC11003701", "PMC4591203", "PMC1474035", "PMC3880259", "PMC4892378", "PMC9820603", "PMC3952719", "PMC5564514", "PMC7870766", "PMC5411458", "PMC10827494", "PMC11524821", "PMC8182957", "PMC6265082", "PMC5342670", "PMC3940150", "PMC2810514", "PMC3808494", "PMC2762405", "PMC11638344", "PMC2386778", "PMC4012347", "PMC5207665", "PMC4433569", "PMC4697903", "PMC5427048", "PMC3523080", "PMC6046471", "PMC7993015", "PMC6034060", "PMC10880038", "PMC4199712", "PMC6409308", "PMC5346037", "PMC4594719", "PMC1974827", "PMC5266160", "PMC3729209", "PMC8954661", "PMC5684285", "PMC3845218", "PMC3161212", "PMC5500390", "PMC4631184", "PMC2664151", "PMC3860742", "PMC4099069", "PMC6179259", "PMC10091789", "PMC4160394", "PMC6033076", "PMC7375060", "PMC8578201", "PMC4078496", "PMC3522814", "PMC5563830", "PMC4996314", "PMC11354576", "PMC3291838", "PMC5306492", "PMC3384479", "PMC3899768", "PMC5432414", "PMC3518380", "PMC4432150", "PMC5355121", "PMC3674704", "PMC3378722", "PMC3727245", "PMC3941038", "PMC2751283", "PMC6767327", "PMC3553682", "PMC1029622", "PMC4511425", "PMC3248259", "PMC4385537", "PMC4195667", "PMC4757974", "PMC3330749", "PMC9891445", "PMC2679896", "PMC2810802", "PMC3871508", "PMC4171106", "PMC2820245", "PMC10810687", "PMC8540141", "PMC3997354", "PMC11088557", "PMC4130425", "PMC4615534", "PMC4468641", "PMC3735354", "PMC6734474", "PMC6542686", "PMC5753622", "PMC9931738", "PMC4300289", "PMC5612381", "PMC5543069", "PMC4229256", "PMC2883666", "PMC11603417", "PMC1365155", "PMC6231319", "PMC3682424", "PMC2715837", "PMC5526237", "PMC3621246", "PMC4190075", "PMC5983535", "PMC4519823", "PMC5508045", "PMC5346878", "PMC6328871", "PMC4272010", "PMC7215378", "PMC3890033", "PMC3641305", "PMC8841435", "PMC8137991", 
"PMC3873034", "PMC4043918", "PMC5619051", "PMC11685162", "PMC11509751", "PMC3130093", "PMC11860030", "PMC3098751", "PMC5519037", "PMC11221861", "PMC4087845", "PMC8238023", "PMC4872310", "PMC9601332", "PMC5798599", "PMC3182303", "PMC3611944", "PMC10967865", "PMC10645035", "PMC3348126", "PMC5316146", "PMC10377184", "PMC2014902", "PMC3264276", "PMC10838100", "PMC10607223", "PMC5469860", "PMC11134291", "PMC6313513", "PMC6927671", "PMC3525178", "PMC9961245", "PMC10864595", "PMC1873375", "PMC8533258", "PMC6562943", "PMC3544007", "PMC4833150", "PMC5645220", "PMC6586010", "PMC8513493", "PMC9314634", "PMC6400024", "PMC10196221", "PMC11887348", "PMC11244643", "PMC9256318", "PMC11703455", "PMC5079351", "PMC11393095", "PMC4915265", "PMC8530979", "PMC7398416", "PMC8822703", "PMC3092713", "PMC4456129", "PMC3780966", "PMC3608305", "PMC4224698", "PMC11481807", "PMC11887086", "PMC10815823", "PMC6448146", "PMC2014166", "PMC10163902", "PMC3749354", "PMC5883590", "PMC8742641", "PMC4965653", "PMC3604156", "PMC4702374", "PMC5505550", "PMC3114195", "PMC4356257", "PMC5727754", "PMC4995153", "PMC2959002", "PMC442471", "PMC4454552", "PMC3030919", "PMC11111788", "PMC4137828", "PMC4916778", "PMC11773121", "PMC8673616", "PMC6347826", "PMC2859392", "PMC3352974", "PMC2288721", "PMC4682920", "PMC4581326", "PMC4365300", "PMC2480976", "PMC4169411", "PMC6613715", "PMC6745302", "PMC8724172", "PMC6262886", "PMC4479596", "PMC2168111", "PMC4636889", "PMC4594699", "PMC5904201", "PMC7089776", "PMC5604555", "PMC6092108", "PMC3749570", "PMC4631197", "PMC6479273", "PMC6942309", "PMC4183989", "PMC4368615", "PMC3525665", "PMC4820801", "PMC9298338", "PMC6472479", "PMC3931261", "PMC4017364", "PMC11152251", "PMC4500334", "PMC4168388", "PMC11677811", "PMC3673300", "PMC10583240", "PMC3214266", "PMC5282793", "PMC3775655", "PMC3818406", "PMC2949522", "PMC524175", "PMC4862932", "PMC3137420", "PMC2830598", "PMC3818912", "PMC4803610", "PMC9582748", "PMC4034115", "PMC3867202", "PMC3690108", "PMC9297921", "PMC9537548", 
"PMC5589489", "PMC3055694", "PMC4012056", "PMC2903324", "PMC2686066", "PMC10152845", "PMC3667657", "PMC4308646", "PMC6411694", "PMC5009007", "PMC10834390", "PMC3403289", "PMC10909096", "PMC2992873", "PMC8445626", "PMC6969041", "PMC3894627", "PMC6786370", "PMC5833535", "PMC6505090", "PMC11359404", "PMC8672325", "PMC6980920", "PMC10666731", "PMC7963143", "PMC5382092", "PMC4461653", "PMC2860533", "PMC5018246", "PMC4479153", "PMC6451710", "PMC4301945", "PMC4557249", "PMC4157963", "PMC6995013", "PMC11608742", "PMC10668502", "PMC4480925", "PMC11573879", "PMC1975838", "PMC4231027", "PMC5818817", "PMC6037621", "PMC6801039", "PMC4943245", "PMC2014539", "PMC11555502", "PMC2995295", "PMC5006145", "PMC4055378", "PMC4484512", "PMC11418302", "PMC10278212", "PMC4057281", "PMC6219441", "PMC5598801", "PMC11754044", "PMC3984266", "PMC10778798", "PMC4274707", "PMC5651309", "PMC11252221", "PMC4265416", "PMC11475898", "PMC4752391", "PMC3624039", "PMC11401437", "PMC2652833", "PMC3774043", "PMC7431691", "PMC7039325", "PMC6086578", "PMC7655626", "PMC5521342", "PMC3910846", "PMC6851426", "PMC3139013", "PMC11773116", "PMC8458697", "PMC10527451", "PMC5241185", "PMC4151614", "PMC1874262", "PMC3461952", "PMC3818518", "PMC3653303", "PMC5411211", "PMC4356640", "PMC4672523", "PMC3414671", "PMC3485381", "PMC5877743", "PMC3481266", "PMC8295171", "PMC4800352", "PMC4693492", "PMC10501538", "PMC4154311", "PMC10309098", "PMC6813860", "PMC8953705", "PMC4366347", "PMC9925376", "PMC10917709", "PMC1365072", "PMC6014560", "PMC4292894", "PMC8426351", "PMC6612264", "PMC7319006", "PMC10502099", "PMC3726442", "PMC4500328", "PMC5711571", "PMC8940650", "PMC4345005", "PMC8604252", "PMC5319785", "PMC5233579", "PMC3658129", "PMC6493124", "PMC4892373", "PMC3401172", "PMC4527535", "PMC5293674", "PMC9080200", "PMC6591035", "PMC2684883", "PMC4243902", "PMC11652804", "PMC10883345", "PMC2668081", "PMC4324232", "PMC11159294", "PMC10982510", "PMC4737107", "PMC11148365", "PMC6510382", "PMC6216325", "PMC2518836", 
"PMC2830602", "PMC4694426", "PMC3394147", "PMC4335884", "PMC6486881", "PMC6461793", "PMC5903579", "PMC10349379", "PMC2647710", "PMC10557961", "PMC7115946", "PMC5370513", "PMC10409991", "PMC5298887", "PMC11995662", "PMC4836090", "PMC2791975", "PMC2726911", "PMC3164277", "PMC4111883", "PMC11315837", "PMC3246196", "PMC11531276", "PMC2750008", "PMC11884701", "PMC9830790", "PMC2662935", "PMC6941886", "PMC5887212", "PMC11240873", "PMC10970167", "PMC7793629", "PMC5898372", "PMC4038142", "PMC11208962", "PMC8880478", "PMC5716599", "PMC6298606", "PMC4661296", "PMC2888980", "PMC1884506", "PMC4542662", "PMC3579261", "PMC4872428", "PMC3984158", "PMC2935997", "PMC9810307", "PMC6989102", "PMC5975540", "PMC5299197", "PMC3571021", "PMC3582836", "PMC3376437", "PMC3513646", "PMC1087660", "PMC5065384", "PMC5176308", "PMC11668066", "PMC10990950", "PMC8163522", "PMC4425504", "PMC7883889", "PMC8505487", "PMC4502741", "PMC3555056", "PMC3675749", "PMC7292295", "PMC4184528", "PMC5727167", "PMC3444290", "PMC5562097", "PMC4441275", "PMC5591096", "PMC10684410", "PMC4298011", "PMC7393710", "PMC3584248", "PMC4105486", "PMC5711795", "PMC3753270", "PMC5520553", "PMC2014233", "PMC9301121", "PMC6920759", "PMC1885008", "PMC4116670", "PMC4220988", "PMC4846779", "PMC3329222", "PMC4445755", "PMC6132901", "PMC7193447", "PMC3454958", "PMC2896457", "PMC3628804", "PMC1995596", "PMC3508798", "PMC10972729", "PMC4794377", "PMC4692529", "PMC3922978", "PMC1365132", "PMC2886925", "PMC5138058", "PMC5425333", "PMC7115450", "PMC6654446", "PMC6006403", "PMC4304713", "PMC3006662", "PMC2949912", "PMC5700347", "PMC5461999", "PMC5768901", "PMC6089815", "PMC4762902", "PMC4690185", "PMC5438821", "PMC5189722", "PMC9809306", "PMC2644687", "PMC4631185", "PMC5534241", "PMC5373543", "PMC4100708", "PMC1237155", "PMC8373649", "PMC4693577", "PMC9328121", "PMC2737687", "PMC3249179", "PMC5377478", "PMC5531276", "PMC8890732", "PMC8108700", "PMC6387687", "PMC2683977", "PMC6054772", "PMC3237821", "PMC4345081", "PMC6518412", 
"PMC7086280", "PMC5440888", "PMC2922203", "PMC3712827", "PMC4612590", "PMC2596476", "PMC6773496", "PMC11246114", "PMC3038469", "PMC4969350", "PMC11763628", "PMC5734971", "PMC4719145", "PMC3208318", "PMC5763654", "PMC4573240", "PMC8184575", "PMC4444267", "PMC4168390", "PMC9584256", "PMC3020258", "PMC2901912", "PMC5135610", "PMC5932771", "PMC4473094", "PMC2896826", "PMC3985268", "PMC7351433", "PMC4760888", "PMC5287983", "PMC2709885", "PMC2364770", "PMC5057355", "PMC3988537", "PMC5817388", "PMC3656883", "PMC11435314", "PMC3448899", "PMC1884346", "PMC5423974", "PMC11102648", "PMC4257570", "PMC2906637", "PMC10381361", "PMC4828529", "PMC9890192", "PMC4503705", "PMC1884285", "PMC6587626", "PMC2641037", "PMC4797547", "PMC3988270", "PMC8505452", "PMC10876746", "PMC6760244", "PMC4425056", "PMC3846997", "PMC4296254", "PMC5449482", "PMC11755583", "PMC5161051", "PMC11049954", "PMC11558073", "PMC4805204", "PMC5789875", "PMC6800829", "PMC10908252", "PMC4551162", "PMC10495004", "PMC4595504", "PMC5943457", "PMC11246689", "PMC5744175", "PMC4618180", "PMC8222836", "PMC5298566", "PMC4541975", "PMC10951231", "PMC3055457", "PMC1978168", "PMC10526247", "PMC6493076", "PMC2650539", "PMC4707035", "PMC4151246", "PMC4413900", "PMC4454285", "PMC4930967", "PMC10787143", "PMC4116556", "PMC2754599", "PMC9306465", "PMC11317398", "PMC3415853", "PMC3852421", "PMC11158672", "PMC3575609", "PMC11891766", "PMC2794921", "PMC5546852", "PMC7221122", "PMC10914946", "PMC2586993", "PMC4574839", "PMC8204702", "PMC4982759", "PMC2950972", "PMC7235792", "PMC3521860", "PMC11730665", "PMC5611711", "PMC9515473", "PMC2291379", "PMC4498287", "PMC3360546", "PMC5945500", "PMC6361127", "PMC3943570", "PMC6426691", "PMC11158323", "PMC5800559", "PMC6151284", "PMC6542461", "PMC5220536", "PMC5610780", "PMC3100585", "PMC4405819", "PMC3912955", "PMC10782740", "PMC5903239", "PMC7302666", "PMC11271148", "PMC9841299", "PMC3195031", "PMC6081148", "PMC3756535", "PMC6357360", "PMC5538305", "PMC4706412", "PMC2865873", "PMC5264271", 
"PMC4498982", "PMC10483403", "PMC4177494", "PMC2976715", "PMC11011338", "PMC3633658", "PMC2679107", "PMC5465325", "PMC7375952", "PMC2896566", "PMC3652476", "PMC556232", "PMC11106956", "PMC11236688", "PMC8132880", "PMC3282030", "PMC4169706", "PMC2757655", "PMC3909010", "PMC4896103", "PMC4243881", "PMC5392306", "PMC4575538", "PMC8975736", "PMC3419350", "PMC5509475", "PMC2925052", "PMC7497848", "PMC3947488", "PMC2276142", "PMC6373376", "PMC3461592", "PMC6501809", "PMC10154044", "PMC4002970", "PMC5028170", "PMC4208722", "PMC9610285", "PMC2556451", "PMC5029084", "PMC2564574", "PMC5485718", "PMC5003027", "PMC5604731", "PMC2908290", "PMC5901893", "PMC4865408", "PMC8917764", "PMC10349800", "PMC10499425", "PMC11703419", "PMC4236071", "PMC5980466", "PMC6411020", "PMC5101708", "PMC11628867", "PMC11102100", "PMC4731723", "PMC8571740", "PMC7649675", "PMC6595468", "PMC5763318", "PMC4949007", "PMC5323433", "PMC4703773", "PMC1401654", "PMC5875353", "PMC4541974", "PMC8973308", "PMC11933031", "PMC11720188", "PMC4038024", "PMC5808057", "PMC1884959", "PMC6493375", "PMC5145728", "PMC3992925", "PMC5807179", "PMC3499361", "PMC3760447", "PMC4931969", "PMC3049596", "PMC4110085", "PMC5007158", "PMC6475679", "PMC4947669", "PMC5875925", "PMC11310823", "PMC1884261", "PMC3172251", "PMC6171340", "PMC7245057", "PMC7340566", "PMC7388522", "PMC3766937", "PMC10099095", "PMC6125540", "PMC4220464", "PMC11269678", "PMC5346875", "PMC8106923", "PMC4113831", "PMC10244018", "PMC5346034", "PMC3320544", "PMC1963422", "PMC3530397", "PMC6246957", "PMC4590670", "PMC9974434", "PMC3612775", "PMC1885108", "PMC6714673", "PMC5659294", "PMC8578190", "PMC5427244", "PMC3991683", "PMC4115247", "PMC5412025", "PMC10038974", "PMC8915292", "PMC10230242", "PMC11404698", "PMC6742943", "PMC7497238", "PMC8472669", "PMC4855508", "PMC5152628", "PMC5651327", "PMC9028965", "PMC3597465", "PMC5478306", "PMC6631257", "PMC3833422", "PMC4119242", "PMC2792638", "PMC6046506", "PMC3468617", "PMC6489578", "PMC11314417", "PMC11347466", 
"PMC4470685", "PMC3116045", "PMC11809887", "PMC4833149", "PMC5726942", "PMC2749505", "PMC11852071", "PMC5510236", "PMC4598210", "PMC1251635", "PMC10463210", "PMC4469933", "PMC10747255", "PMC10582663", "PMC10957942", "PMC3570048", "PMC6071997", "PMC4406866", "PMC10275785", "PMC4297489", "PMC10565537", "PMC3865618", "PMC6855320", "PMC11022290", "PMC5749387", "PMC3978988", "PMC9322346", "PMC8599229", "PMC3598593", "PMC3610685", "PMC5599305", "PMC9657232", "PMC5402961", "PMC5524513", "PMC4221105", "PMC2704695", "PMC2910688", "PMC5316454", "PMC5249113", "PMC9934922", "PMC3959225", "PMC4735961", "PMC5533497", "PMC5492788", "PMC4922322", "PMC3805522", "PMC10139129", "PMC5963414", "PMC4854407", "PMC4039203", "PMC3734608", "PMC4343187", "PMC10327396", "PMC8263746", "PMC2748889", "PMC8081740", "PMC4108472", "PMC2042888", "PMC3093079", "PMC5829963", "PMC5949564", "PMC5656562", "PMC4640545", "PMC3071070", "PMC3383686", "PMC8767566", "PMC4999337", "PMC4271081", "PMC5395152", "PMC4872305", "PMC2794198", "PMC3478502", "PMC5558527", "PMC3066089", "PMC4764353", "PMC3471928", "PMC10145266", "PMC10599059", "PMC5899062", "PMC9552901", "PMC3137047", "PMC7305826", "PMC9481373", "PMC4375579", "PMC4010098", "PMC7217737", "PMC1365130", "PMC3476140", "PMC3755037", "PMC2981241", "PMC3834132", "PMC4931885", "PMC3148255", "PMC2767285", "PMC3901533", "PMC3548029", "PMC4651007", "PMC11140026", "PMC9413960", "PMC4282597", "PMC4484731", "PMC9450009", "PMC5903234", "PMC11269006", "PMC5795999", "PMC1769026", "PMC8100460", "PMC4375304", "PMC4332701", "PMC10880264", "PMC8441053", "PMC4701680", "PMC4412845", "PMC2732914", "PMC9701885", "PMC3779247", "PMC5632935", "PMC4364852", "PMC4002408", "PMC2919241", "PMC1762324", "PMC9321338", "PMC2858245", "PMC3245828", "PMC2599947", "PMC5421731", "PMC3260990", "PMC5087931", "PMC3637851", "PMC8359222", "PMC3100476", "PMC9532634", "PMC11508189", "PMC6960206", "PMC5558529", "PMC4000411", "PMC10648962", "PMC10769478", "PMC3131846", "PMC11023817", "PMC3698861", 
"PMC4667947", "PMC4544820", "PMC2675161", "PMC4025175", "PMC5606007", "PMC7039663", "PMC4921119", "PMC3143437", "PMC5309133", "PMC11458732", "PMC6370172", "PMC5192124", "PMC4788379", "PMC2000640", "PMC10085626", "PMC5061780", "PMC5908896", "PMC4631186", "PMC3248257", "PMC4814312", "PMC4240933", "PMC10478012", "PMC3244642", "PMC6128165", "PMC4209173", "PMC4585967", "PMC11584383", "PMC4503165", "PMC4778608", "PMC11012255", "PMC7613628", "PMC4972156", "PMC9373641", "PMC5391994", "PMC4876172", "PMC7292331", "PMC9820795", "PMC3048137", "PMC2000718", "PMC11507373", "PMC3425006", "PMC4505931", "PMC8800862", "PMC4806848", "PMC5546927", "PMC3672984", "PMC11862786", "PMC5346382", "PMC3061841", "PMC11000398", "PMC11141156", "PMC5655282", "PMC4764723", "PMC7423195", "PMC10452379", "PMC5342450", "PMC7197488", "PMC1364713", "PMC6759913", "PMC3753327", "PMC8141066", "PMC2014382", "PMC3686783", "PMC5048209", "PMC4928097", "PMC4735517", "PMC6021962", "PMC2885152", "PMC10529681", "PMC4892230", "PMC2570505", "PMC4462564", "PMC6939828", "PMC2042718", "PMC10825484", "PMC4338734", "PMC5404990", "PMC6631360", "PMC7028104", "PMC4615595", "PMC1767618", "PMC6891932", "PMC3213989", "PMC3680019", "PMC11094496", "PMC5817390", "PMC5944577", "PMC4943390", "PMC11140815", "PMC11605493", "PMC3462355", "PMC1746721", "PMC3925114", "PMC3895354", "PMC3125052", "PMC4669157", "PMC5098919", "PMC11520374", "PMC3029819", "PMC9031832", "PMC3107291", "PMC5908314", "PMC4600600", "PMC3506814", "PMC6049926", "PMC5412267", "PMC5355968", "PMC539815", "PMC3640375", "PMC6408006", "PMC5051541", "PMC2660379", "PMC4323272", "PMC4104334", "PMC4976849", "PMC3180021", "PMC7303159", "PMC10532840", "PMC2760462", "PMC10337687", "PMC5548439", "PMC11264771", "PMC3550197", "PMC11943653", "PMC7347085", "PMC7214659", "PMC4722076", "PMC4155516", "PMC3958404", "PMC6752321", "PMC7427977", "PMC2957581", "PMC3080643", "PMC9819208", "PMC3225067", "PMC4932617", "PMC6375065", "PMC3175513", "PMC7718230", "PMC3537445", "PMC10758687", 
"PMC3858547", "PMC3370715", "PMC11095822", "PMC5167198", "PMC2976128", "PMC3734060", "PMC10852661", "PMC4201132", "PMC10974048", "PMC6423619", "PMC3621996", "PMC3910794", "PMC7999651", "PMC11120965", "PMC5378677", "PMC9468644", "PMC5600689", "PMC7308427", "PMC3034442", "PMC16264", "PMC3776990", "PMC1887589", "PMC2855513", "PMC5721751", "PMC2194758", "PMC2547143", "PMC5590735", "PMC2743299", "PMC4350512", "PMC6011347"]
\ No newline at end of file
diff --git a/src/load_data/load_clinical_variants.py b/src/load_data/load_clinical_variants.py
index 492abfc..310efed 100644
--- a/src/load_data/load_clinical_variants.py
+++ b/src/load_data/load_clinical_variants.py
@@ -7,6 +7,7 @@
 import pandas as pd
 import json
 
+from src.utils.file_paths import get_project_root
 """
 This file contains functions to load the clinical variants data from the PharmGKB API.
 The key function is get_pmid_list(), which loads the PMIDs from the variant annotations tsv file and saves them to a json file.
@@ -24,8 +25,7 @@ def download_and_extract_variant_annotations(override: bool = False) -> str:
     """
     url = "https://api.pharmgkb.org/v1/download/file/data/variantAnnotations.zip"
 
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    save_dir = os.path.join(base_dir, "saved_data")
+    save_dir = os.path.join(get_project_root(), "data")
     extract_dir = os.path.join(save_dir, "variantAnnotations")
 
     if os.path.exists(extract_dir):
@@ -58,9 +58,8 @@ def load_raw_variant_annotations(override: bool = False) -> pd.DataFrame:
     Returns:
         pd.DataFrame: The loaded variant annotations tsv file.
     """
-    base_dir = os.path.dirname(os.path.abspath(__file__))
     tsv_path = os.path.join(
-        base_dir, "saved_data", "variantAnnotations", "var_drug_ann.tsv"
+        get_project_root(), "data", "variantAnnotations", "var_drug_ann.tsv"
     )
 
     if not os.path.exists(tsv_path):
@@ -102,8 +101,7 @@ def load_unique_variants(save_results: bool = True) -> dict:
     If the json file already exists, it will be loaded from the file.
     NOTE: Don't think this function is needed anymore. get_pmid_list() is used instead.
     """
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    unique_variants_path = os.path.join(base_dir, "saved_data", "unique_variants.json")
+    unique_variants_path = os.path.join(get_project_root(), "data", "unique_variants.json")
     if os.path.exists(unique_variants_path):
         logger.info(f"Loading unique variants from {unique_variants_path}")
         with open(unique_variants_path, "r") as f:
@@ -125,8 +123,7 @@ def get_pmid_list(override: bool = False) -> list:
     """
     Loads the pmid list from the variant annotations tsv file.
     """
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    pmid_list_path = os.path.join(base_dir, "saved_data", "pmid_list.json")
+    pmid_list_path = os.path.join(get_project_root(), "data", "pmid_list.json")
     if os.path.exists(pmid_list_path):
         logger.info(f"Loading PMIDs from {pmid_list_path}")
         with open(pmid_list_path, "r") as f:
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000..10433a3
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1 @@
+from .file_paths import get_project_root
\ No newline at end of file
diff --git a/src/utils/file_paths.py b/src/utils/file_paths.py
new file mode 100644
index 0000000..1d37993
--- /dev/null
+++ b/src/utils/file_paths.py
@@ -0,0 +1,10 @@
+import os
+from pathlib import Path
+
+def get_project_root() -> Path:
+    """
+    Return the absolute path of the project root (the directory holding ``src``).
+    """
+    # This file lives at <root>/src/utils/file_paths.py, so climb three levels.
+    # resolve() first: a relative __file__ would make .parent collapse to ".".
+    return Path(__file__).resolve().parent.parent.parent

From 7531b2bee6c62742ebf825ce40a41a17615b2db2 Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 19 May 2025 17:02:47 -0700
Subject: [PATCH 11/15] chore: updated gitignore

---
 .gitignore | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index 662a2c9..be46693 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,9 +19,12 @@ __pycache__
 .env
 
 # data
-src/load_data/saved_data/
-src/fetch_articles/saved_data/downloaded_pmcids.json
-src/fetch_articles/saved_data/articles/
+data/articles/
+data/variantAnnotations/
+data/unique_pmcids.json
+data/pmid_list.json
+data/downloaded_pmcids.json
+
 *.zip
 *.tar.gz
 *.tar.bz2
@@ -32,8 +35,3 @@ src/fetch_articles/saved_data/articles/
 
 .DS_Store
 
-data/articles/
-data/variantAnnotations/
-data/unique_pmcids.json
-data/pmid_list.json
-data/downloaded_pmcids.json
\ No newline at end of file

From 6d521651759d04d4f1970d75e097e777e34fae1e Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 19 May 2025 17:04:47 -0700
Subject: [PATCH 12/15] docs: readme update

---
 data/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/data/README.md b/data/README.md
index 1d26479..5fef6c8 100644
--- a/data/README.md
+++ b/data/README.md
@@ -8,6 +8,7 @@ This directory contains the primary data files used by the AutoGKB project.
 
 - **variantAnnotations/** - Contains clinical variant annotations and related data:
   - `var_drug_ann.tsv` - Variant-drug annotations. This is what is used in this repo.
+  - This can be downloaded using download_and_extract_variant_annotations from the load_data module
 
 - **Support Files**:
   - `pmcid_mapping.json` - Maps between PMIDs and PMCIDs

From f4ce245d4742259541159ade095e529379b60dfb Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 19 May 2025 17:15:15 -0700
Subject: [PATCH 13/15] feat: load variants pipeline

---
 data/README.md                                |  2 +-
 pixi.toml                                     |  1 +
 src/benchmark/annotation.py                   |  2 +-
 src/fetch_articles/README.md                  |  8 ++--
 src/fetch_articles/pmcid_converter.py         |  2 +-
 src/{load_data => load_variants}/README.md    |  4 +-
 src/{load_data => load_variants}/__init__.py  |  2 +-
 .../load_clinical_variants.py                 | 44 ++++++++-----------
 8 files changed, 29 insertions(+), 36 deletions(-)
 rename src/{load_data => load_variants}/README.md (91%)
 rename src/{load_data => load_variants}/__init__.py (58%)
 rename src/{load_data => load_variants}/load_clinical_variants.py (77%)

diff --git a/data/README.md b/data/README.md
index 5fef6c8..659b32f 100644
--- a/data/README.md
+++ b/data/README.md
@@ -8,7 +8,7 @@ This directory contains the primary data files used by the AutoGKB project.
 
 - **variantAnnotations/** - Contains clinical variant annotations and related data:
   - `var_drug_ann.tsv` - Variant-drug annotations. This is what is used in this repo.
-  - This can be downloaded using download_and_extract_variant_annotations from the load_data module
+  - This can be downloaded using download_and_extract_variant_annotations from the load_variants module
 
 - **Support Files**:
   - `pmcid_mapping.json` - Maps between PMIDs and PMCIDs
diff --git a/pixi.toml b/pixi.toml
index 6c5d7fd..4bb20c4 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -12,6 +12,7 @@ platforms = ["osx-arm64"]
 version = "0.1.0"
 
 [tasks]
+download-variants = "python -m src.load_variants.load_clinical_variants"
 update-download-map = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
 download-articles = "python -m src.fetch_articles.article_downloader"
 
diff --git a/src/benchmark/annotation.py b/src/benchmark/annotation.py
index 87b0da0..65ab710 100644
--- a/src/benchmark/annotation.py
+++ b/src/benchmark/annotation.py
@@ -1,5 +1,5 @@
 from pydantic import BaseModel
-from src.load_data import load_raw_variant_annotations
+from src.load_variants import load_raw_variant_annotations
 
 """
 Denotes a class for a variant annotation (row in var_drug_ann.tsv)
diff --git a/src/fetch_articles/README.md b/src/fetch_articles/README.md
index dbade90..19503ef 100644
--- a/src/fetch_articles/README.md
+++ b/src/fetch_articles/README.md
@@ -4,8 +4,8 @@
 Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues.
 
 ## Process Overview
-1. Download the zip of variants from pharmgkb (handled in load_data module)
-2. Get a PMID list from the variants tsv (column PMID) (handled in load_data module)
+1. Download the zip of variants from pharmgkb (handled in load_variants module)
+2. Get a PMID list from the variants tsv (column PMID) (handled in load_variants module)
 3. Convert the PMID to PMCID 
 4. Fetch the content from the PMCID
 
@@ -54,7 +54,7 @@ Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall
 
 ```python
 from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid
-from src.load_data import get_pmid_list
+from src.load_variants import get_pmid_list
 import os
 from dotenv import load_dotenv
 
@@ -131,7 +131,7 @@ To run the complete pipeline (convert PMIDs to PMCIDs and download articles):
 # Full pipeline from PMIDs to downloaded articles
 from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid
 from src.fetch_articles.article_downloader import download_articles
-from src.load_data import get_pmid_list
+from src.load_variants import get_pmid_list
 import os
 from dotenv import load_dotenv
 
diff --git a/src/fetch_articles/pmcid_converter.py b/src/fetch_articles/pmcid_converter.py
index 10b9347..e24fc00 100644
--- a/src/fetch_articles/pmcid_converter.py
+++ b/src/fetch_articles/pmcid_converter.py
@@ -5,7 +5,7 @@
 from tqdm import tqdm
 from dotenv import load_dotenv
 import os
-from src.load_data import get_pmid_list
+from src.load_variants import get_pmid_list
 import json
 from src.utils.file_paths import get_project_root
 
diff --git a/src/load_data/README.md b/src/load_variants/README.md
similarity index 91%
rename from src/load_data/README.md
rename to src/load_variants/README.md
index 311203f..0b6ca40 100644
--- a/src/load_data/README.md
+++ b/src/load_variants/README.md
@@ -6,7 +6,7 @@ This module handles the loading and preprocessing of PharmGKB clinical variants
 
 1. **`download_and_extract_variant_annotations(override: bool = False)`**
    - Downloads and extracts the variant annotations ZIP file from PharmGKB
-   - Saves data to `saved_data/variantAnnotations/`
+   - Saves data to `data/variantAnnotations/`
    - Can override existing downloads if needed
 
 2. **`load_raw_variant_annotations(override: bool = False)`**
@@ -21,7 +21,7 @@ This module handles the loading and preprocessing of PharmGKB clinical variants
 4. **`get_pmid_list(override: bool = False)`**
    - Main function to extract PMIDs from the variant annotations
    - Returns a list of unique PMIDs
-   - Caches results in `saved_data/pmid_list.json`
+   - Caches results in `data/pmid_list.json`
    - Used as input for PMCID conversion
 
 The module handles all data downloading, extraction, and preprocessing steps needed to get the PMID list for subsequent steps in the pipeline.
diff --git a/src/load_data/__init__.py b/src/load_variants/__init__.py
similarity index 58%
rename from src/load_data/__init__.py
rename to src/load_variants/__init__.py
index 90cbc32..4142345 100644
--- a/src/load_data/__init__.py
+++ b/src/load_variants/__init__.py
@@ -1 +1 @@
-from .load_clinical_variants import load_raw_variant_annotations, get_pmid_list
\ No newline at end of file
+from .load_clinical_variants import load_raw_variant_annotations, get_pmid_list, variant_annotations_pipeline
\ No newline at end of file
diff --git a/src/load_data/load_clinical_variants.py b/src/load_variants/load_clinical_variants.py
similarity index 77%
rename from src/load_data/load_clinical_variants.py
rename to src/load_variants/load_clinical_variants.py
index 310efed..dd868fe 100644
--- a/src/load_data/load_clinical_variants.py
+++ b/src/load_variants/load_clinical_variants.py
@@ -95,30 +95,6 @@ def unique_variants(df: pd.DataFrame) -> dict:
     return {col: df[col].unique().tolist() for col in df.columns}
 
 
-def load_unique_variants(save_results: bool = True) -> dict:
-    """
-    Loads the unique variants from the variant annotations tsv file and saves them to a json file.
-    If the json file already exists, it will be loaded from the file.
-    NOTE: Don't think this function is needed anymore. get_pmid_list() is used instead.
-    """
-    unique_variants_path = os.path.join(get_project_root(), "data", "unique_variants.json")
-    if os.path.exists(unique_variants_path):
-        logger.info(f"Loading unique variants from {unique_variants_path}")
-        with open(unique_variants_path, "r") as f:
-            unique_values_per_column = json.load(f)
-    else:
-        logger.info(
-            f"Unique variants not found at {unique_variants_path}. Loading from tsv file..."
-        )
-        df = load_raw_variant_annotations()
-        unique_values_per_column = unique_variants(df)
-        if save_results:
-            logger.info(f"Saving unique variants to {unique_variants_path}")
-            with open(unique_variants_path, "w") as f:
-                json.dump(unique_values_per_column, f)
-    return unique_values_per_column
-
-
 def get_pmid_list(override: bool = False) -> list:
     """
     Loads the pmid list from the variant annotations tsv file.
@@ -136,6 +112,22 @@ def get_pmid_list(override: bool = False) -> list:
             json.dump(pmid_list, f)
     return pmid_list
 
-if __name__ == "__main__":
+def variant_annotations_pipeline():
+    """
+    Loads the variant annotations tsv file and saves the unique PMIDs to a json file.
+    """
+    # Download and extract the variant annotations
+    logger.info("Downloading and extracting variant annotations...")
+    download_and_extract_variant_annotations()
+
+    # Load the variant annotations
+    logger.info("Loading variant annotations...")
+    df = load_raw_variant_annotations()
+
+    # Get the PMIDs
+    logger.info("Getting PMIDs...")
     pmid_list = get_pmid_list()
-    print(f"Number of unique PMIDs: {len(pmid_list)}")
+    logger.info(f"Number of unique PMIDs: {len(pmid_list)}")
+
+if __name__ == "__main__":
+    variant_annotations_pipeline()

From 74db6aa07aa808f5cc73399f4b5af2e311b3b0ec Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 19 May 2025 17:22:45 -0700
Subject: [PATCH 14/15] feat: diagram and black formatting

---
 README.MD                                   |  3 ++
 assets/annotations_diagram.svg              |  1 +
 src/benchmark/annotation.py                 |  5 ++-
 src/fetch_articles/article_downloader.py    | 26 ++++++++--------
 src/fetch_articles/pmcid_converter.py       | 34 +++++++++++----------
 src/load_variants/__init__.py               |  6 +++-
 src/load_variants/load_clinical_variants.py |  3 ++
 src/utils/__init__.py                       |  2 +-
 src/utils/file_paths.py                     |  1 +
 9 files changed, 48 insertions(+), 33 deletions(-)
 create mode 100644 assets/annotations_diagram.svg

diff --git a/README.MD b/README.MD
index c7ad26e..97f5c97 100644
--- a/README.MD
+++ b/README.MD
@@ -37,3 +37,6 @@ This repository contains Python scripts for running and building a Pharmacogenom
 |                  | Delegate annotation groupings to team members | |
 | New Article Fetching | Replicate PharGKB current workflow | |
 
+## System Overview
+![Annotations Diagram](assets/annotations_diagram.svg)
+
diff --git a/assets/annotations_diagram.svg b/assets/annotations_diagram.svg
new file mode 100644
index 0000000..ac373f0
--- /dev/null
+++ b/assets/annotations_diagram.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:lucid="lucid" width="1562" height="622"><g transform="translate(-1339 -399)" lucid:page-tab-id="0_0"><path d="M1000 0h2000v1500H1000z" fill="#fff"/><path d="M1340 406a6 6 0 0 1 6-6h1548a6 6 0 0 1 6 6v608a6 6 0 0 1-6 6H1346a6 6 0 0 1-6-6z" stroke="#000" stroke-width="2" fill="#fff"/><path d="M1420 716a6 6 0 0 1 6-6h183.67a6 6 0 0 1 6 6v88a6 6 0 0 1-6 6H1426a6 6 0 0 1-6-6z" stroke="#000" stroke-width="2" fill="#fff"/><use xlink:href="#a" transform="matrix(1,0,0,1,1425,715) translate(14.717664930555571 52.46527777777778)"/><use xlink:href="#b" transform="matrix(1,0,0,1,1425,715) translate(82.56705729166667 52.46527777777778)"/><path d="M1920 716a6 6 0 0 1 6-6h183.67a6 6 0 0 1 6 6v88a6 6 0 0 1-6 6H1926a6 6 0 0 1-6-6z" stroke="#000" stroke-width="2" fill="#fff"/><use xlink:href="#c" transform="matrix(1,0,0,1,1925,715) translate(7.832899305555557 52.46527777777778)"/><use xlink:href="#d" transform="matrix(1,0,0,1,1925,715) translate(133.26692708333331 52.46527777777778)"/><path d="M2200 546a6 6 0 0 1 6-6h148a6 6 0 0 1 6 6v108a6 6 0 0 1-6 6h-148a6 6 0 0 1-6-6z" stroke="#000" stroke-width="2" fill="#fff"/><use xlink:href="#e" transform="matrix(1,0,0,1,2212,552) translate(7.930555555555557 24.52777777777778)"/><use xlink:href="#f" transform="matrix(1,0,0,1,2212,552) translate(98.80512152777777 24.52777777777778)"/><use xlink:href="#g" transform="matrix(1,0,0,1,2212,552) translate(10.909071180555557 56.79340277777778)"/><use xlink:href="#h" transform="matrix(1,0,0,1,2212,552) translate(13.1171875 89.05902777777777)"/><use xlink:href="#i" transform="matrix(1,0,0,1,2212,552) translate(73.11067708333333 89.05902777777777)"/><use xlink:href="#j" transform="matrix(1,0,0,1,2212,552) translate(113.84418402777777 89.05902777777777)"/><path d="M2200 706a6 6 0 0 1 6-6h148a6 6 0 0 1 6 6v108a6 6 0 0 1-6 6h-148a6 6 0 0 1-6-6z" stroke="#000" stroke-width="2" fill="#fff"/><use xlink:href="#e" 
transform="matrix(1,0,0,1,2212,712) translate(7.930555555555557 24.52777777777778)"/><use xlink:href="#f" transform="matrix(1,0,0,1,2212,712) translate(98.80512152777777 24.52777777777778)"/><use xlink:href="#g" transform="matrix(1,0,0,1,2212,712) translate(10.909071180555557 56.79340277777778)"/><use xlink:href="#h" transform="matrix(1,0,0,1,2212,712) translate(10.860243055555557 89.05902777777777)"/><use xlink:href="#i" transform="matrix(1,0,0,1,2212,712) translate(70.85373263888889 89.05902777777777)"/><use xlink:href="#k" transform="matrix(1,0,0,1,2212,712) translate(111.58723958333333 89.05902777777777)"/><path d="M2200 866a6 6 0 0 1 6-6h148a6 6 0 0 1 6 6v108a6 6 0 0 1-6 6h-148a6 6 0 0 1-6-6z" stroke="#000" stroke-width="2" fill="#fff"/><use xlink:href="#e" transform="matrix(1,0,0,1,2212,872) translate(7.930555555555557 24.52777777777778)"/><use xlink:href="#f" transform="matrix(1,0,0,1,2212,872) translate(98.80512152777777 24.52777777777778)"/><use xlink:href="#g" transform="matrix(1,0,0,1,2212,872) translate(10.909071180555557 56.79340277777778)"/><use xlink:href="#h" transform="matrix(1,0,0,1,2212,872) translate(11.071831597222229 89.05902777777777)"/><use xlink:href="#i" transform="matrix(1,0,0,1,2212,872) translate(71.06532118055556 89.05902777777777)"/><use xlink:href="#l" transform="matrix(1,0,0,1,2212,872) translate(111.798828125 89.05902777777777)"/><path d="M2440 706a6 6 0 0 1 6-6h148a6 6 0 0 1 6 6v108a6 6 0 0 1-6 6h-148a6 6 0 0 1-6-6z" stroke="#000" stroke-width="2" fill="#fff"/><use xlink:href="#m" transform="matrix(1,0,0,1,2445,705) translate(11.338975694444436 31.84027777777778)"/><use xlink:href="#n" transform="matrix(1,0,0,1,2445,705) translate(34.43467881944444 64.10590277777777)"/><use xlink:href="#o" transform="matrix(1,0,0,1,2445,705) translate(29.779730902777786 96.37152777777777)"/><path d="M1680 716a6 6 0 0 1 6-6h163.67a6 6 0 0 1 6 6v88a6 6 0 0 1-6 6H1686a6 6 0 0 1-6-6z" stroke="#000" stroke-width="2" fill="#fff"/><use xlink:href="#p" 
transform="matrix(1,0,0,1,1685,715) translate(36.67296006944445 52.46527777777778)"/><path d="M2680 711a6 6 0 0 1 6-6h168a6 6 0 0 1 6 6v98a6 6 0 0 1-6 6h-168a6 6 0 0 1-6-6z" fill="none"/><use xlink:href="#q" transform="matrix(1,0,0,1,2680,705) translate(11.289062500000028 42.92643229166668)"/><use xlink:href="#r" transform="matrix(1,0,0,1,2680,705) translate(42.750651041666686 91.32486979166666)"/><path d="M1617.17 760h45.45" stroke="#3a414a" fill="none"/><path d="M1617.18 760.48h-.5v-.96h.5z" stroke="#3a414a" stroke-width=".05" fill="#3a414a"/><path d="M1677.38 760l-14.26 4.63v-9.26z" stroke="#3a414a" fill="#3a414a"/><path d="M1857.17 760h45.45" stroke="#3a414a" fill="none"/><path d="M1857.18 760.48h-.5v-.96h.5z" stroke="#3a414a" stroke-width=".05" fill="#3a414a"/><path d="M1917.38 760l-14.26 4.63v-9.26z" stroke="#3a414a" fill="#3a414a"/><path d="M2117.17 760h65.45" stroke="#3a414a" fill="none"/><path d="M2117.18 760.48h-.5v-.96h.5z" stroke="#3a414a" stroke-width=".05" fill="#3a414a"/><path d="M2197.38 760l-14.26 4.63v-9.26z" stroke="#3a414a" fill="#3a414a"/><path d="M2117.17 760h34.66a6 6 0 0 0 6-6V606a6 6 0 0 1 6-6h18.8" stroke="#3a414a" fill="none"/><path d="M2117.18 760.48h-.5v-.96h.5z" stroke="#3a414a" stroke-width=".05" fill="#3a414a"/><path d="M2197.38 600l-14.26 4.63v-9.26z" stroke="#3a414a" fill="#3a414a"/><path d="M2117.17 760h34.66a6 6 0 0 1 6 6v148a6 6 0 0 0 6 6h18.8" stroke="#3a414a" fill="none"/><path d="M2117.18 760.48h-.5v-.96h.5z" stroke="#3a414a" stroke-width=".05" fill="#3a414a"/><path d="M2197.38 920l-14.26 4.63v-9.26z" stroke="#3a414a" fill="#3a414a"/><path d="M2361.5 600h32.5a6 6 0 0 1 6 6v148a6 6 0 0 0 6 6h16.62" stroke="#3a414a" fill="none"/><path d="M2361.5 600.48h-.5v-.96h.5z" stroke="#3a414a" stroke-width=".05" fill="#3a414a"/><path d="M2437.38 760l-14.26 4.63v-9.26z" stroke="#3a414a" fill="#3a414a"/><path d="M2361.5 760h61.12" stroke="#3a414a" fill="none"/><path d="M2361.5 760.48h-.5v-.96h.5z" stroke="#3a414a" stroke-width=".05" 
fill="#3a414a"/><path d="M2437.38 760l-14.26 4.63v-9.26z" stroke="#3a414a" fill="#3a414a"/><path d="M2361.5 920h32.5a6 6 0 0 0 6-6V766a6 6 0 0 1 6-6h16.62" stroke="#3a414a" fill="none"/><path d="M2361.5 920.48h-.5v-.96h.5z" stroke="#3a414a" stroke-width=".05" fill="#3a414a"/><path d="M2437.38 760l-14.26 4.63v-9.26z" stroke="#3a414a" fill="#3a414a"/><path d="M2601.5 760h62.12" stroke="#3a414a" fill="none"/><path d="M2601.5 760.48h-.5v-.96h.5z" stroke="#3a414a" stroke-width=".05" fill="#3a414a"/><path d="M2678.38 760l-14.26 4.63v-9.26z" stroke="#3a414a" fill="#3a414a"/><path d="M1768.9 437.55a6 6 0 0 1 6-6h548a6 6 0 0 1 6 6v48a6 6 0 0 1-6 6h-548a6 6 0 0 1-6-6z" fill="none"/><g><use xlink:href="#s" transform="matrix(1,0,0,1,1768.8899799998853,431.5502956272559) translate(31.908094618055657 43.742404513888886)"/><use xlink:href="#t" transform="matrix(1,0,0,1,1768.8899799998853,431.5502956272559) translate(236.64225260416669 43.742404513888886)"/><use xlink:href="#u" transform="matrix(1,0,0,1,1768.8899799998853,431.5502956272559) translate(403.43261718749994 43.742404513888886)"/></g><defs><path d="M180 0v-1490h510c348 0 508 209 508 474 0 266-160 477-507 477H370V0H180zm190-706h312c236 0 327-133 327-310 0-176-91-307-329-307H370v617" id="v"/><path d="M471 26C259 26 90-98 90-318c0-256 228-303 435-329 202-27 287-16 287-108 0-139-79-219-234-219-161 0-248 86-283 164l-173-57c86-203 278-265 451-265 150 0 419 46 419 395V0H815v-152h-12C765-73 660 26 471 26zm31-159c199 0 310-134 310-271v-155c-30 35-226 55-295 64-131 17-246 59-246 186 0 116 97 176 231 176" id="w"/><path d="M158 418v-1536h174v179h20c37-59 106-193 324-193 279 0 474 222 474 576 0 356-194 580-473 580-213 0-288-135-325-197h-14v591H158zm492-555c209 0 317-186 317-421 0-232-105-413-317-413-206 0-314 166-314 413 0 249 111 421 314 421" id="x"/><path d="M628 24c-324 0-524-230-524-574 0-343 198-582 503-582 237 0 487 146 487 559v75H286c9 234 145 362 343 362 132 0 231-58 273-172l174 48C1024-91 857 24 628 
24zM287-650h624c-17-190-120-322-304-322-192 0-309 151-320 322" id="y"/><path d="M158 0v-1118h174v172h12c41-113 157-188 290-188 26 0 70 2 91 3v181c-11-2-60-10-108-10-161 0-279 109-279 260V0H158" id="z"/><g id="a"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#v"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,14.192708333333332,0)" xlink:href="#w"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,26.671006944444443,0)" xlink:href="#x"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,40.27777777777777,0)" xlink:href="#y"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,53.233506944444436,0)" xlink:href="#z"/></g><path d="M218-607c0-325 108-727 258-974h175c-155 315-250 682-250 974 0 265 78 531 250 886H476C306-10 218-315 218-607" id="A"/><path d="M180 0v-1490h270l367 940c28 72 75 218 110 339 35-117 81-264 110-339l362-940h271V0h-187c2-448-5-837 7-1287-157 497-311 829-483 1287H842C666-458 514-784 354-1284c12 438 5 843 7 1284H180" id="B"/><path d="M783 20c-382 0-661-292-661-764 0-473 279-766 661-766 302 0 548 182 601 489h-190c-42-204-217-313-411-313-268 0-476 208-476 590 0 381 209 588 476 588 195 0 369-110 411-313h190c-52 303-296 489-601 489" id="C"/><path d="M370-1490V0H180v-1490h190" id="D"/><path d="M645 0H180v-1490h484c435 0 692 280 692 742 0 466-257 748-711 748zM370-168h263c363 0 539-218 539-580 0-358-176-574-521-574H370v1154" id="E"/><path d="M96 279C273-86 346-350 346-607c0-292-95-659-250-974h175c150 246 258 650 258 974 0 297-91 602-258 886H96" id="F"/><g id="b"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#A"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,8.10546875,0)" xlink:href="#v"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,22.298177083333332,0)" xlink:href="#B"/><use 
transform="matrix(0.010850694444444444,0,0,0.010850694444444444,42.37196180555556,0)" xlink:href="#C"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,58.60460069444444,0)" xlink:href="#D"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,64.57248263888889,0)" xlink:href="#E"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,80.60980902777777,0)" xlink:href="#F"/></g><path d="M613 24c-304 0-509-231-509-576 0-350 205-580 509-580 305 0 511 230 511 580 0 345-206 576-511 576zm0-161c226 0 329-195 329-415 0-222-103-419-329-419-223 0-326 196-326 419 0 220 103 415 326 415" id="G"/><path d="M613 24c-304 0-509-231-509-576 0-350 205-580 509-580 216 0 392 114 453 309l-173 49c-33-115-133-197-280-197-223 0-326 196-326 419 0 220 103 415 326 415 150 0 252-85 285-206l172 49C1010-95 832 24 613 24" id="H"/><path d="M538 24C308 24 148-78 108-271l171-41c32 123 123 178 257 178 156 0 256-77 256-169 0-77-54-128-164-154l-186-44c-203-48-300-148-300-305 0-192 176-326 414-326 230 0 351 112 402 269l-163 42c-31-80-94-158-238-158-133 0-233 69-233 162 0 83 57 129 188 160l169 40c203 48 298 149 298 302 0 196-179 339-441 339" id="I"/><g id="c"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#v"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,14.192708333333332,0)" xlink:href="#z"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,22.189670138888886,0)" xlink:href="#y"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,35.14539930555555,0)" xlink:href="#x"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,48.752170138888886,0)" xlink:href="#z"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,56.749131944444436,0)" xlink:href="#G"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,70.07378472222221,0)" xlink:href="#H"/><use 
transform="matrix(0.010850694444444444,0,0,0.010850694444444444,82.76909722222221,0)" xlink:href="#y"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,95.72482638888889,0)" xlink:href="#I"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,107.45442708333333,0)" xlink:href="#I"/></g><path d="M98-1322v-168h1126v168H757V0H567v-1322H98" id="J"/><path d="M65 0l393-574-370-544h210c88 136 192 297 267 435 67-142 176-302 264-435h206L661-564 1053 0H844c-94-144-206-310-287-458C484-308 366-142 273 0H65" id="K"/><path d="M598-1118v154H368v674c0 100 37 144 132 144 23 0 62-6 92-12L629-6c-37 13-88 20-134 20-193 0-307-107-307-290v-688H20v-154h168v-266h180v266h230" id="L"/><g id="d"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#J"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,12.608506944444445,0)" xlink:href="#y"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,25.065104166666664,0)" xlink:href="#K"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,37.63020833333333,0)" xlink:href="#L"/></g><g id="e"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#v"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,14.192708333333332,0)" xlink:href="#z"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,22.189670138888886,0)" xlink:href="#G"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,35.514322916666664,0)" xlink:href="#H"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,48.209635416666664,0)" xlink:href="#y"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,61.16536458333333,0)" xlink:href="#I"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,72.89496527777777,0)" xlink:href="#I"/></g><path d="M678-1118v154H420V0H240v-964H20v-154h220v-149c0-194 155-293 318-293 85 0 141 18 168 30l-50 
154c-19-6-47-17-97-17-111 0-159 58-159 166v109h258" id="M"/><g id="f"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#M"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,7.573784722222222,0)" xlink:href="#G"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,20.8984375,0)" xlink:href="#z"/></g><path d="M52 0l541-1490h220L1361 0h-200l-149-416H398L254 0H52zm404-582h497c-103-290-145-390-251-756-108 377-145 460-246 756" id="N"/><path d="M338-670V0H158v-1118h173l1 207c72-158 192-221 342-221 226 0 378 139 378 422V0H872v-695c0-172-96-275-252-275-161 0-282 109-282 300" id="O"/><path d="M158 0v-1118h180V0H158zm91-1301c-68 0-125-53-125-119s57-119 125-119c69 0 126 53 126 119s-57 119-126 119" id="P"/><path d="M611 442c-248 0-391-105-460-228l146-94c47 65 117 165 314 165 178 0 307-82 307-266v-224h-17C863-141 792-18 576-18c-268 0-472-195-472-546 0-346 197-568 476-568 216 0 288 133 326 193h17v-179h175V29c0 289-215 413-487 413zm-5-620c203 0 314-146 314-390 0-237-108-403-314-403-213 0-319 180-319 403 0 230 109 390 319 390" id="Q"/><g id="g"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#N"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,15.33203125,0)" xlink:href="#O"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,28.461371527777775,0)" xlink:href="#O"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,41.59071180555556,0)" xlink:href="#G"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,54.91536458333333,0)" xlink:href="#L"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,62.29383680555555,0)" xlink:href="#w"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,74.77213541666667,0)" xlink:href="#L"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,82.04210069444444,0)" xlink:href="#P"/><use 
transform="matrix(0.010850694444444444,0,0,0.010850694444444444,87.42404513888889,0)" xlink:href="#O"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,100.55338541666666,0)" xlink:href="#Q"/></g><path d="M158 0v-1118h175l1 205c55-151 181-225 313-225 147 0 245 90 285 228 53-141 190-228 352-228 194 0 352 125 352 384V0h-181v-749c0-161-105-225-225-225-151 0-243 103-243 244V0H807v-767c0-124-93-207-219-207-131 0-250 92-250 270V0H158" id="R"/><g id="h"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#J"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,12.608506944444445,0)" xlink:href="#y"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,25.56423611111111,0)" xlink:href="#z"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,34.27734375,0)" xlink:href="#R"/></g><path d="M657 26c-323 0-524-166-541-416h195c15 169 171 246 346 246 202 0 356-106 356-265 5-203-294-238-475-293-239-73-380-191-380-389 0-252 224-419 512-419 294 0 499 171 508 396H992c-17-145-151-228-328-228-193 0-321 102-321 242 0 156 175 211 284 241l149 41c160 44 422 134 422 412 0 244-197 432-541 432" id="S"/><g id="i"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#S"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,14.2578125,0)" xlink:href="#y"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,27.213541666666664,0)" xlink:href="#L"/></g><path d="M653-1490V0H466v-1314h-10L96-1047v-204l324-239h233" id="T"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#T" id="j"/><path d="M154 0v-137l495-537c165-179 249-281 249-418 0-156-121-253-280-253-170 0-278 110-278 278H158c0-264 200-443 465-443 266 0 455 183 455 416 0 161-73 288-336 568L416-179v12h687V0H154" id="U"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#U" id="k"/><use 
transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#O" id="l"/><path d="M180 0v-1490h528c311 0 458 177 458 395 0 191-114 278-244 313v14c139 9 302 137 302 371 0 225-148 397-499 397H180zm190-168h350c232 0 321-102 321-231 0-149-120-277-311-277H370v508zm0-668h328c160 0 286-101 286-255 0-129-87-231-280-231H370v486" id="V"/><path d="M338-670V0H158v-1490h180v566c73-149 190-208 336-208 226 0 379 139 379 422V0H872v-695c0-172-96-275-252-275-161 0-282 109-282 300" id="W"/><path d="M158 0v-1490h180v865h22l478-493h223L593-638 1096 0H865L456-523 338-412V0H158" id="X"/><path d="M692-1560L212 224H46l480-1784h166" id="Y"/><g id="m"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#V"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,14.539930555555555,0)" xlink:href="#y"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,27.49565972222222,0)" xlink:href="#O"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,40.625,0)" xlink:href="#H"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,53.3203125,0)" xlink:href="#W"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,66.46050347222223,0)" xlink:href="#R"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,85.92664930555556,0)" xlink:href="#w"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,98.40494791666666,0)" xlink:href="#z"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,107.11805555555557,0)" xlink:href="#X"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,119.31423611111113,0)" xlink:href="#Y"/></g><g id="n"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#S"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,14.2578125,0)" xlink:href="#H"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,26.953125,0)" 
xlink:href="#G"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,40.27777777777778,0)" xlink:href="#z"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,48.990885416666664,0)" xlink:href="#P"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,54.37282986111111,0)" xlink:href="#O"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,67.50217013888889,0)" xlink:href="#Q"/></g><path d="M180 0v-1490h901v168H370v510h643v168H370V0H180" id="Z"/><path d="M537 14c-226 0-379-139-379-422v-710h180v695c0 172 97 275 253 275 160 0 281-109 281-300v-670h181V0H879v-209C806-46 684 14 537 14" id="aa"/><g id="o"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#Z"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,12.369791666666666,0)" xlink:href="#aa"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,25.50998263888889,0)" xlink:href="#O"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,38.639322916666664,0)" xlink:href="#H"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,51.334635416666664,0)" xlink:href="#L"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,58.60460069444444,0)" xlink:href="#P"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,63.986545138888886,0)" xlink:href="#G"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,77.31119791666666,0)" xlink:href="#O"/></g><path d="M180 0v-1490h510c348 0 508 194 508 460 0 198-88 351-276 417L1256 0h-220L726-579c-117 2-238 0-356 1V0H180zm190-747h312c235 0 327-108 327-283 0-177-92-293-329-293H370v576" id="ab"/><path d="M409 0L70-1118h191c89 329 165 560 243 925 75-353 154-601 240-925h192c85 325 161 564 235 922 77-354 157-598 244-922h191L1267 0h-179c-86-307-176-590-250-913C763-588 675-308 588 0H409" id="ac"/><path d="M57 0l534-763-501-727h220c122 187 296 416 391 605 94-191 272-419 
396-605h215L811-770 1340 0h-219C986-204 811-426 699-640 588-430 407-202 273 0H57" id="ad"/><path d="M180 0v-1490h190v1322h690V0H180" id="ae"/><g id="p"><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,0,0)" xlink:href="#ab"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,14.192708333333332,0)" xlink:href="#w"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,26.671006944444443,0)" xlink:href="#ac"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,44.85677083333333,0)" xlink:href="#ad"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,60.01519097222221,0)" xlink:href="#B"/><use transform="matrix(0.010850694444444444,0,0,0.010850694444444444,80.08897569444443,0)" xlink:href="#ae"/></g><path d="M449 0L49-1490h333c89 395 194 758 253 1185 65-426 176-790 269-1185h317c93 394 201 758 268 1181 60-425 164-788 253-1181h334L1675 0h-348c-90-376-204-711-266-1118C1004-714 886-373 798 0H449" id="af"/><path d="M628 22C291 22 81-210 81-554c0-346 210-578 547-578 336 0 547 232 547 578 0 344-211 576-547 576zm0-236c160 0 242-146 242-341 0-197-82-341-242-341S387-753 387-555c0 195 81 341 241 341" id="ag"/><path d="M128 0v-1118h290v195h12c54-169 184-238 367-200v268c-27-8-86-15-130-15-138 0-239 95-239 234V0H128" id="ah"/><path d="M128 0v-1490h300v802h17l366-430h347L740-633 1179 0H826L514-456l-86 97V0H128" id="ai"/><path d="M750-1118v229H516V0H217v-889H20v-229h197v-86c0-238 156-356 363-356 94 0 178 20 215 31l-54 226c-25-7-61-15-100-15-123-3-131 84-125 200h234" id="aj"/><path d="M428-1490V0H128v-1490h300" id="ak"/><path d="M361 0L31-1118h315c68 295 127 523 188 845 63-317 128-551 199-845h276c71 296 131 525 196 846 57-321 119-552 186-846h319L1378 0h-305c-73-261-148-486-204-771C813-484 740-261 666 0H361" id="al"/><g id="q"><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,0,0)" xlink:href="#af"/><use 
transform="matrix(0.01627604166666666,0,0,0.01627604166666666,32.92643229166666,0)" xlink:href="#ag"/><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,53.369140624999986,0)" xlink:href="#ah"/><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,67.46419270833331,0)" xlink:href="#ai"/><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,86.80013020833331,0)" xlink:href="#aj"/><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,100.06510416666664,0)" xlink:href="#ak"/><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,109.09830729166664,0)" xlink:href="#ag"/><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,129.0852864583333,0)" xlink:href="#al"/></g><path d="M681 22C323 22 94-145 85-462h297c10 151 131 226 296 226 163 0 271-77 271-190 0-102-95-149-257-189l-164-41c-252-61-403-188-403-405 0-269 236-449 560-449 330 0 546 183 551 445H941c-11-119-106-188-259-188-156 0-244 72-244 172-3 156 226 185 373 221 250 61 445 186 445 436 0 271-213 446-575 446" id="am"/><path d="M628 22C291 22 81-210 81-554c0-346 210-578 547-578 265 0 455 142 497 372l-279 52c-25-116-99-188-215-188-160 0-244 139-244 341 0 200 84 341 244 341 116 0 193-74 217-195l279 51C1085-123 896 22 628 22" id="an"/><path d="M633 22C291 22 81-200 81-553c0-345 210-579 537-579 292 0 525 185 525 567v84H378c7 183 109 277 260 277 105 0 184-46 216-132l272 51C1071-99 893 22 633 22zM380-669h474c-15-146-94-237-233-237-143 0-229 99-241 237" id="ao"/><g id="r"><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,0,0)" xlink:href="#am"/><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,21.826171874999993,0)" xlink:href="#an"/><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,41.438802083333314,0)" xlink:href="#ag"/><use transform="matrix(0.01627604166666666,0,0,0.01627604166666666,61.88151041666664,0)" xlink:href="#ah"/><use 
transform="matrix(0.01627604166666666,0,0,0.01627604166666666,74.6419270833333,0)" xlink:href="#ao"/></g><g id="s"><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,0,0)" xlink:href="#N"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,26.06445312499999,0)" xlink:href="#O"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,48.3843315972222,0)" xlink:href="#O"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,70.70421006944441,0)" xlink:href="#G"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,93.35611979166663,0)" xlink:href="#L"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,105.8995225694444,0)" xlink:href="#w"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,127.11263020833329,0)" xlink:href="#L"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,139.47157118055551,0)" xlink:href="#P"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,148.62087673611106,0)" xlink:href="#O"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,170.9407552083333,0)" xlink:href="#Q"/></g><path d="M338-1490V0H158v-1490h180" id="ap"/><path d="M577 24c-279 0-473-224-473-580 0-354 195-576 474-576 218 0 287 134 324 193h14v-551h180V0H922v-173h-20C865-111 790 24 577 24zm27-161c203 0 314-172 314-421 0-247-108-413-314-413-212 0-317 181-317 413 0 235 108 421 317 421" id="aq"/><g id="t"><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,0,0)" xlink:href="#S"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,24.238281249999993,0)" xlink:href="#y"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,46.263020833333314,0)" xlink:href="#ap"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,55.41232638888886,0)" xlink:href="#y"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,77.43706597222217,0)" xlink:href="#H"/><use 
transform="matrix(0.01844618055555555,0,0,0.01844618055555555,99.01909722222217,0)" xlink:href="#L"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,111.00911458333327,0)" xlink:href="#y"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,133.0338541666666,0)" xlink:href="#aq"/></g><g id="u"><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,0,0)" xlink:href="#v"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,24.127604166666657,0)" xlink:href="#w"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,45.34071180555554,0)" xlink:href="#x"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,68.4722222222222,0)" xlink:href="#y"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,90.49696180555553,0)" xlink:href="#z"/><use transform="matrix(0.01844618055555555,0,0,0.01844618055555555,104.71896701388886,0)" xlink:href="#I"/></g></defs></g></svg>
\ No newline at end of file
diff --git a/src/benchmark/annotation.py b/src/benchmark/annotation.py
index 65ab710..b552ae4 100644
--- a/src/benchmark/annotation.py
+++ b/src/benchmark/annotation.py
@@ -5,6 +5,7 @@
 Denotes a class for a variant annotation (row in var_drug_ann.tsv)
 """
 
+
 class VariantAnnotation(BaseModel):
     variant_annotation_id: str
     variant_haplotypes: str
@@ -35,12 +36,10 @@ class VariantAnnotation(BaseModel):
     multiple_phenotypes_or_diseases_and_or: str
     comparison_alleles_or_genotypes: str
     comparison_metabolizer_types: str
-    
 
-    
+
 """
 1. Load the ground truth variants
 2. Load the extracted variants
 3. Calculate the niave difference between an extracted variant and the ground truth variant on Variant Annotation ID
 """
-
diff --git a/src/fetch_articles/article_downloader.py b/src/fetch_articles/article_downloader.py
index c9179f7..543f01e 100644
--- a/src/fetch_articles/article_downloader.py
+++ b/src/fetch_articles/article_downloader.py
@@ -10,10 +10,10 @@
 def fetch_pmc_content(pmcid):
     """
     Fetch content for a single article from PubMed Central.
-    
+
     Args:
         pmcid (str): The PubMed Central ID to fetch
-        
+
     Returns:
         bytes or None: The article content in XML format or None if fetching failed
     """
@@ -33,16 +33,16 @@ def update_downloaded_pmcids() -> None:
     """
     project_root = get_project_root()
     downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json"
-    
+
     # Check for all the filenames in the data/articles directory
     articles_dir = project_root / "data" / "articles"
     os.makedirs(articles_dir, exist_ok=True)
-    
+
     article_pmcids = [f.split(".")[0] for f in os.listdir(articles_dir)]
     article_pmcids_mapping = {pmcid: f"{pmcid}.xml" for pmcid in article_pmcids}
 
     logger.info(f"Found {len(article_pmcids)} existing XML files in {articles_dir}")
-    
+
     # Add the new PMCIDs to the json file
     if os.path.exists(downloaded_pmcids_path):
         with open(downloaded_pmcids_path, "r") as f:
@@ -55,12 +55,12 @@ def update_downloaded_pmcids() -> None:
                 downloaded_pmcids = {}
     else:
         downloaded_pmcids = {}
-        
+
     downloaded_pmcids.update(article_pmcids_mapping)
-    
+
     with open(downloaded_pmcids_path, "w") as f:
         json.dump(downloaded_pmcids, f)
-        
+
     logger.info(
         f"Updated {downloaded_pmcids_path} with {len(article_pmcids)} new PMCIDs"
     )
@@ -81,7 +81,7 @@ def download_articles(pmcids: list[str]):
 
     # Load the downloaded PMCIDs from the json file
     downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json"
-    
+
     if os.path.exists(downloaded_pmcids_path):
         with open(downloaded_pmcids_path, "r") as f:
             downloaded_pmcids = json.load(f)
@@ -102,8 +102,10 @@ def download_articles(pmcids: list[str]):
         else:
             downloaded_pmcids[pmcid] = None
             logger.warning(f"No record found for PMCID {pmcid}")
-    
-    logger.info(f"Downloaded {len(new_pmcids)} new articles, total articles: {len(downloaded_pmcids)}")
+
+    logger.info(
+        f"Downloaded {len(new_pmcids)} new articles, total articles: {len(downloaded_pmcids)}"
+    )
 
     # Save the downloaded PMCIDs to a json file
     with open(downloaded_pmcids_path, "w") as f:
@@ -113,4 +115,4 @@ def download_articles(pmcids: list[str]):
 if __name__ == "__main__":
     update_downloaded_pmcids()
     pmcids = get_unique_pmcids()
-    download_articles(pmcids)
\ No newline at end of file
+    download_articles(pmcids)
diff --git a/src/fetch_articles/pmcid_converter.py b/src/fetch_articles/pmcid_converter.py
index e24fc00..417ec57 100644
--- a/src/fetch_articles/pmcid_converter.py
+++ b/src/fetch_articles/pmcid_converter.py
@@ -23,19 +23,16 @@
 from typing import List, Set, Dict, Optional
 
 
-
-
-
 def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]:
     """
     Load the saved PMCID mapping from the json file.
     """
     project_root = get_project_root()
     results_path = project_root / "data" / "pmcid_mapping.json"
-    
+
     # Create data directory if it doesn't exist
     os.makedirs(project_root / "data", exist_ok=True)
-    
+
     if os.path.exists(results_path):
         with open(results_path, "r") as f:
             existing_results = json.load(f)
@@ -51,7 +48,10 @@ def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]:
 
 
 def batch_pmid_to_pmcid(
-    pmids: List[str], email: str = os.getenv("NCBI_EMAIL"), batch_size: int = 100, delay: float = 0.4
+    pmids: List[str],
+    email: str = os.getenv("NCBI_EMAIL"),
+    batch_size: int = 100,
+    delay: float = 0.4,
 ) -> Dict[str, Optional[str]]:
     """
     Convert a list of PMIDs to PMCIDs using NCBI's ID Converter API.
@@ -120,10 +120,10 @@ def batch_pmid_to_pmcid(
     # Save updated results
     project_root = get_project_root()
     results_path = project_root / "data" / "pmcid_mapping.json"
-    
+
     # Create data directory if it doesn't exist
     os.makedirs(project_root / "data", exist_ok=True)
-    
+
     with open(results_path, "w") as f:
         json.dump(existing_results, f)
     logger.info(f"Updated PMCID mappings saved to {results_path}")
@@ -138,13 +138,13 @@ def get_unique_pmcids() -> List[str]:
     Currently function returns the pre-existing unique PMCIDs if they exist or regenerates the list from the mapping.
     """
     project_root = get_project_root()
-    
+
     # Load the unique PMCIDs if they've already been saved
     unique_pmcids_path = project_root / "data" / "unique_pmcids.json"
-    
+
     # Create data directory if it doesn't exist
     os.makedirs(project_root / "data", exist_ok=True)
-    
+
     if os.path.exists(unique_pmcids_path):
         with open(unique_pmcids_path, "r") as f:
             try:
@@ -161,14 +161,16 @@ def get_unique_pmcids() -> List[str]:
 
     # Load from pmcid_mapping.json if unique pmcids haven't been saved
     results_path = project_root / "data" / "pmcid_mapping.json"
-    
+
     if not os.path.exists(results_path):
-        logger.error(f"No PMCID mapping found at {results_path}. Cannot generate unique PMCIDs.")
+        logger.error(
+            f"No PMCID mapping found at {results_path}. Cannot generate unique PMCIDs."
+        )
         return []
-        
+
     with open(results_path, "r") as f:
         existing_results = json.load(f)
-    
+
     # Get the unique pmcids (remove None values)
     pmcids = [value for value in existing_results.values() if value is not None]
     pmcids = list(set(pmcids))
@@ -185,4 +187,4 @@ def get_unique_pmcids() -> List[str]:
     results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))
     logger.info(f"PMCID mapping complete. {len(results)} PMIDs mapped to PMCIDs.")
     pmcids = get_unique_pmcids()
-    logger.info(f"Number of unique PMCIDs: {len(pmcids)}")
\ No newline at end of file
+    logger.info(f"Number of unique PMCIDs: {len(pmcids)}")
diff --git a/src/load_variants/__init__.py b/src/load_variants/__init__.py
index 4142345..6c56850 100644
--- a/src/load_variants/__init__.py
+++ b/src/load_variants/__init__.py
@@ -1 +1,5 @@
-from .load_clinical_variants import load_raw_variant_annotations, get_pmid_list, variant_annotations_pipeline
\ No newline at end of file
+from .load_clinical_variants import (
+    load_raw_variant_annotations,
+    get_pmid_list,
+    variant_annotations_pipeline,
+)
diff --git a/src/load_variants/load_clinical_variants.py b/src/load_variants/load_clinical_variants.py
index dd868fe..c43b340 100644
--- a/src/load_variants/load_clinical_variants.py
+++ b/src/load_variants/load_clinical_variants.py
@@ -8,6 +8,7 @@
 import json
 
 from src.utils.file_paths import get_project_root
+
 """
 This file contains functions to load the clinical variants data from the PharmGKB API.
 The key function is get_pmid_list(), which loads the PMIDs from the variant annotations tsv file and saves them to a json file.
@@ -112,6 +113,7 @@ def get_pmid_list(override: bool = False) -> list:
             json.dump(pmid_list, f)
     return pmid_list
 
+
 def variant_annotations_pipeline():
     """
     Loads the variant annotations tsv file and saves the unique PMIDs to a json file.
@@ -129,5 +131,6 @@ def variant_annotations_pipeline():
     pmid_list = get_pmid_list()
     logger.info(f"Number of unique PMIDs: {len(pmid_list)}")
 
+
 if __name__ == "__main__":
     variant_annotations_pipeline()
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
index 10433a3..4deed16 100644
--- a/src/utils/__init__.py
+++ b/src/utils/__init__.py
@@ -1 +1 @@
-from .file_paths import get_project_root
\ No newline at end of file
+from .file_paths import get_project_root
diff --git a/src/utils/file_paths.py b/src/utils/file_paths.py
index 1d37993..229c652 100644
--- a/src/utils/file_paths.py
+++ b/src/utils/file_paths.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 
+
 def get_project_root() -> Path:
     """
     Return the project root directory.

From 9ac85cef6659c165e8b8b53362d4c5ededce7803 Mon Sep 17 00:00:00 2001
From: Shlok Natarajan <shlok.natarajan@gmail.com>
Date: Mon, 19 May 2025 17:24:31 -0700
Subject: [PATCH 15/15] chore: removed unused file

---
 src/benchmark/annotation.py | 45 -------------------------------------
 1 file changed, 45 deletions(-)
 delete mode 100644 src/benchmark/annotation.py

diff --git a/src/benchmark/annotation.py b/src/benchmark/annotation.py
deleted file mode 100644
index b552ae4..0000000
--- a/src/benchmark/annotation.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from pydantic import BaseModel
-from src.load_variants import load_raw_variant_annotations
-
-"""
-Denotes a class for a variant annotation (row in var_drug_ann.tsv)
-"""
-
-
-class VariantAnnotation(BaseModel):
-    variant_annotation_id: str
-    variant_haplotypes: str
-    gene: str
-    drug: str
-    pmid: str
-    phenotype_category: str
-    significance: str
-    notes: str
-    sentence: str
-    alleles: str
-    specialty_population: str
-    metabolizer_types: str
-    phenotype_category: str
-    significance: str
-    notes: str
-    sentence: str
-    alleles: str
-    specialty_population: str
-    metabolizer_types: str
-    is_plural: str
-    is_associated: str
-    direction_of_effect: str
-    pd_pk_terms: str
-    multiple_drugs_and_or: str
-    population_types: str
-    population_phenotypes_or_diseases: str
-    multiple_phenotypes_or_diseases_and_or: str
-    comparison_alleles_or_genotypes: str
-    comparison_metabolizer_types: str
-
-
-"""
-1. Load the ground truth variants
-2. Load the extracted variants
-3. Calculate the niave difference between an extracted variant and the ground truth variant on Variant Annotation ID
-"""