From 695255f63e5c40c23587295790e56fb011f04d03 Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Fri, 2 May 2025 17:14:19 -0700 Subject: [PATCH 01/15] feat: pixi commands, gitignore, progress tracker --- .gitignore | 11 ++++++++++- README.MD | 2 +- pixi.toml | 3 ++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 4122350..6c9c9f4 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,13 @@ __pycache__ # data src/load_data/saved_data/ src/fetch_articles/saved_data/downloaded_pmcids.json -src/fetch_articles/saved_data/articles/ \ No newline at end of file +src/fetch_articles/saved_data/articles/ +*.zip +*.tar.gz +*.tar.bz2 +*.tar.xz +*.tar.lzma +*.tar.lz +*.tar.lzo + +.DS_Store diff --git a/README.MD b/README.MD index c3bc5c9..135f8a7 100644 --- a/README.MD +++ b/README.MD @@ -20,5 +20,5 @@ This repository contains Python scripts for running and building a Pharmacogenom | Get a PMID list from the variants tsv (column PMID) | ✅ | | Convert the PMID to PMCID | ✅ | | Update to use non-official pmid to pmcid | | -| Fetch the content from the PMCID | | +| Fetch the content from the PMCID | ✅ | | Create pairing of annotations to article | | \ No newline at end of file diff --git a/pixi.toml b/pixi.toml index e6d800e..6c5d7fd 100644 --- a/pixi.toml +++ b/pixi.toml @@ -12,7 +12,8 @@ platforms = ["osx-arm64"] version = "0.1.0" [tasks] -update-downloaded-pmcids = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'" +update-download-map = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'" +download-articles = "python -m src.fetch_articles.article_downloader" [dependencies] seaborn = ">=0.13.2,<0.14" From 0d0282fffb3833f431e18cf59925fe6e04b1dfcb Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 5 May 2025 18:23:13 -0700 Subject: [PATCH 02/15] docs: goals breakdown --- README.MD | 5 ++++- 1 file 
changed, 4 insertions(+), 1 deletion(-) diff --git a/README.MD b/README.MD index 135f8a7..fc110e9 100644 --- a/README.MD +++ b/README.MD @@ -6,7 +6,10 @@ # AutoGKB - +Goals: +1. Continously fetch publications on pharmacogenomic relationships +2. Extract variants annotations from an article +3. Create a general benchmark for an extraction system ## Description From 0940450970e72014b29d3c1ba8dab947aec3df0b Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 5 May 2025 18:27:05 -0700 Subject: [PATCH 03/15] docs: update goals --- README.MD | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.MD b/README.MD index fc110e9..e3e07f0 100644 --- a/README.MD +++ b/README.MD @@ -8,8 +8,11 @@ Goals: 1. Continously fetch publications on pharmacogenomic relationships -2. Extract variants annotations from an article -3. Create a general benchmark for an extraction system +2. Extract drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc. +3. Create a general benchmark for an extraction system that can output a score for an extraction system +Given: Article, Ground Truth Variants (Manually extracted and recorded in var_drug_ann.tsv:) +Input: Extraction System or Extracted Variants +Output: Score ## Description From 1b29657924bc5a950f659e2bc0217d37c51815d9 Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 5 May 2025 18:28:31 -0700 Subject: [PATCH 04/15] docs: goals --- README.MD | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.MD b/README.MD index e3e07f0..2e35714 100644 --- a/README.MD +++ b/README.MD @@ -7,12 +7,13 @@ # AutoGKB Goals: -1. Continously fetch publications on pharmacogenomic relationships +1. Fetch annotated articles from variantAnnotations stored in PharmGKB API 2. Extract drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc. 3. 
Create a general benchmark for an extraction system that can output a score for an extraction system Given: Article, Ground Truth Variants (Manually extracted and recorded in var_drug_ann.tsv:) Input: Extraction System or Extracted Variants Output: Score +4. Continously fetch new pharmacogenomic articles ## Description From 31cbdc8ad7c0fe6c9616f8fa976a238b108461af Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 5 May 2025 18:29:42 -0700 Subject: [PATCH 05/15] docs: goals --- README.MD | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.MD b/README.MD index 2e35714..54f7b48 100644 --- a/README.MD +++ b/README.MD @@ -8,11 +8,11 @@ Goals: 1. Fetch annotated articles from variantAnnotations stored in PharmGKB API -2. Extract drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc. -3. Create a general benchmark for an extraction system that can output a score for an extraction system +2. Create a general benchmark for an extraction system that can output a score for an extraction system Given: Article, Ground Truth Variants (Manually extracted and recorded in var_drug_ann.tsv:) Input: Extraction System or Extracted Variants Output: Score +3. System for extracting drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc. 4. 
Continously fetch new pharmacogenomic articles ## Description From 3d273f5ecef3f07e44cc0f333c362f7b687a4dd9 Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Thu, 15 May 2025 14:53:57 -0700 Subject: [PATCH 06/15] feat: dataset folder and progress trackers --- README.MD | 15 ++++++-- src/benchmark/README.md | 4 +++ src/benchmark/__init__.py | 0 src/benchmark/annotation.py | 46 +++++++++++++++++++++++++ src/dataset/README.md | 8 +++++ src/dataset/__init__.py | 0 src/load_data/README.md | 2 +- src/load_data/__init__.py | 2 +- src/load_data/load_clinical_variants.py | 7 ++-- 9 files changed, 75 insertions(+), 9 deletions(-) create mode 100644 src/benchmark/README.md create mode 100644 src/benchmark/__init__.py create mode 100644 src/benchmark/annotation.py create mode 100644 src/dataset/README.md create mode 100644 src/dataset/__init__.py diff --git a/README.MD b/README.MD index 54f7b48..e326fd1 100644 --- a/README.MD +++ b/README.MD @@ -10,7 +10,7 @@ Goals: 1. Fetch annotated articles from variantAnnotations stored in PharmGKB API 2. Create a general benchmark for an extraction system that can output a score for an extraction system Given: Article, Ground Truth Variants (Manually extracted and recorded in var_drug_ann.tsv:) -Input: Extraction System or Extracted Variants +Input: Extracted Variants Output: Score 3. System for extracting drug related variants annotations from an article. Associations in which the variant affects a drug dose, response, metabolism, etc. 4. 
Continously fetch new pharmacogenomic articles @@ -26,6 +26,15 @@ This repository contains Python scripts for running and building a Pharmacogenom | Download the zip of variants from pharmgkb | ✅ | | Get a PMID list from the variants tsv (column PMID) | ✅ | | Convert the PMID to PMCID | ✅ | -| Update to use non-official pmid to pmcid | | +| Update to use non-official pmid to pmcid (aaron's method) | | | Fetch the content from the PMCID | ✅ | -| Create pairing of annotations to article | | \ No newline at end of file +| Create pairings of annotations to articles | | +| Create a niave score of number of matches | | +| Create group wise score | | +| Look into advanced scoring based on distance from truth per term | | + + +## Notes +### 5/15 +Current state of the repo: +- \ No newline at end of file diff --git a/src/benchmark/README.md b/src/benchmark/README.md new file mode 100644 index 0000000..d12e18e --- /dev/null +++ b/src/benchmark/README.md @@ -0,0 +1,4 @@ +# Benchmark + +## Functions +1. 
Calculate the niave difference between an extracted variant and the ground truth variant on Variant Annotation ID diff --git a/src/benchmark/__init__.py b/src/benchmark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/benchmark/annotation.py b/src/benchmark/annotation.py new file mode 100644 index 0000000..87b0da0 --- /dev/null +++ b/src/benchmark/annotation.py @@ -0,0 +1,46 @@ +from pydantic import BaseModel +from src.load_data import load_raw_variant_annotations + +""" +Denotes a class for a variant annotation (row in var_drug_ann.tsv) +""" + +class VariantAnnotation(BaseModel): + variant_annotation_id: str + variant_haplotypes: str + gene: str + drug: str + pmid: str + phenotype_category: str + significance: str + notes: str + sentence: str + alleles: str + specialty_population: str + metabolizer_types: str + phenotype_category: str + significance: str + notes: str + sentence: str + alleles: str + specialty_population: str + metabolizer_types: str + is_plural: str + is_associated: str + direction_of_effect: str + pd_pk_terms: str + multiple_drugs_and_or: str + population_types: str + population_phenotypes_or_diseases: str + multiple_phenotypes_or_diseases_and_or: str + comparison_alleles_or_genotypes: str + comparison_metabolizer_types: str + + + +""" +1. Load the ground truth variants +2. Load the extracted variants +3. Calculate the niave difference between an extracted variant and the ground truth variant on Variant Annotation ID +""" + diff --git a/src/dataset/README.md b/src/dataset/README.md new file mode 100644 index 0000000..8a30cd7 --- /dev/null +++ b/src/dataset/README.md @@ -0,0 +1,8 @@ +# Dataset + +## Goal +Convert the loaded files into a dataset where the annotations and raw text are paired with each other + +## Subgoals +1. Understand the formats of the annotations +2. 
Choose a format for the dataset diff --git a/src/dataset/__init__.py b/src/dataset/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/load_data/README.md b/src/load_data/README.md index 997987a..311203f 100644 --- a/src/load_data/README.md +++ b/src/load_data/README.md @@ -9,7 +9,7 @@ This module handles the loading and preprocessing of PharmGKB clinical variants - Saves data to `saved_data/variantAnnotations/` - Can override existing downloads if needed -2. **`load_variant_annotations_tsv(override: bool = False)`** +2. **`load_raw_variant_annotations(override: bool = False)`** - Loads the variant annotations TSV file into a pandas DataFrame - Automatically downloads data if not present - Returns the DataFrame containing variant-drug annotations diff --git a/src/load_data/__init__.py b/src/load_data/__init__.py index c2694d6..90cbc32 100644 --- a/src/load_data/__init__.py +++ b/src/load_data/__init__.py @@ -1 +1 @@ -from .load_clinical_variants import load_variant_annotations_tsv, get_pmid_list +from .load_clinical_variants import load_raw_variant_annotations, get_pmid_list \ No newline at end of file diff --git a/src/load_data/load_clinical_variants.py b/src/load_data/load_clinical_variants.py index 4176343..492abfc 100644 --- a/src/load_data/load_clinical_variants.py +++ b/src/load_data/load_clinical_variants.py @@ -49,7 +49,7 @@ def download_and_extract_variant_annotations(override: bool = False) -> str: return extract_dir -def load_variant_annotations_tsv(override: bool = False) -> pd.DataFrame: +def load_raw_variant_annotations(override: bool = False) -> pd.DataFrame: """ Loads the variant annotations tsv file. If the file does not exist, it will be downloaded and extracted. @@ -112,7 +112,7 @@ def load_unique_variants(save_results: bool = True) -> dict: logger.info( f"Unique variants not found at {unique_variants_path}. Loading from tsv file..." 
) - df = load_variant_annotations_tsv() + df = load_raw_variant_annotations() unique_values_per_column = unique_variants(df) if save_results: logger.info(f"Saving unique variants to {unique_variants_path}") @@ -132,14 +132,13 @@ def get_pmid_list(override: bool = False) -> list: with open(pmid_list_path, "r") as f: pmid_list = json.load(f) else: - df = load_variant_annotations_tsv(override) + df = load_raw_variant_annotations(override) pmid_list = df["PMID"].unique().tolist() logger.info(f"Saving PMIDs to {pmid_list_path}") with open(pmid_list_path, "w") as f: json.dump(pmid_list, f) return pmid_list - if __name__ == "__main__": pmid_list = get_pmid_list() print(f"Number of unique PMIDs: {len(pmid_list)}") From 1084188ccdd3eb5daffcc82ab858d7f6f2d85301 Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 19 May 2025 15:58:30 -0700 Subject: [PATCH 07/15] docs: progress tracker --- README.MD | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/README.MD b/README.MD index e326fd1..3ae321f 100644 --- a/README.MD +++ b/README.MD @@ -21,17 +21,20 @@ This repository contains Python scripts for running and building a Pharmacogenom ## Progress Tracker -| Task | Status | -| --- | --- | -| Download the zip of variants from pharmgkb | ✅ | -| Get a PMID list from the variants tsv (column PMID) | ✅ | -| Convert the PMID to PMCID | ✅ | -| Update to use non-official pmid to pmcid (aaron's method) | | -| Fetch the content from the PMCID | ✅ | -| Create pairings of annotations to articles | | -| Create a niave score of number of matches | | -| Create group wise score | | -| Look into advanced scoring based on distance from truth per term | | +| Category | Task | Status | +| --- | --- | --- | +| Initial Download | Download the zip of variants from pharmgkb | ✅ | +| | Get a PMID list from the variants tsv (column PMID) | ✅ | +| | Convert the PMID to PMCID | ✅ | +| | Update to use non-official pmid to pmcid (aaron's method) | | +| | Fetch 
the content from the PMCID | ✅ | +| Benchmark | Create pairings of annotations to articles | | +| | Create a niave score of number of matches | | +| | Create group wise score | | +| | Look into advanced scoring based on distance from truth per term | | +| Workflows | Integrate Aaron's current approach | | +| | Document on individual annotation meanings | | +| | Delegate annotation groupings to team members | | ## Notes From 96e099c046a30a9cdceb4bdd4abb3f1f3e1fd345 Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 19 May 2025 15:59:09 -0700 Subject: [PATCH 08/15] docs: cleaned up readme --- README.MD | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.MD b/README.MD index 3ae321f..7a6b7e5 100644 --- a/README.MD +++ b/README.MD @@ -36,8 +36,3 @@ This repository contains Python scripts for running and building a Pharmacogenom | | Document on individual annotation meanings | | | | Delegate annotation groupings to team members | | - -## Notes -### 5/15 -Current state of the repo: -- \ No newline at end of file From 50c9e6aa4548760f0c868ed33b6a5aedf912df1e Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 19 May 2025 16:00:02 -0700 Subject: [PATCH 09/15] docs: progress tracker --- README.MD | 1 + 1 file changed, 1 insertion(+) diff --git a/README.MD b/README.MD index 7a6b7e5..c7ad26e 100644 --- a/README.MD +++ b/README.MD @@ -35,4 +35,5 @@ This repository contains Python scripts for running and building a Pharmacogenom | Workflows | Integrate Aaron's current approach | | | | Document on individual annotation meanings | | | | Delegate annotation groupings to team members | | +| New Article Fetching | Replicate PharGKB current workflow | | From bf89b08b7f37c1eb7a54753d15c92d497aa85d42 Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 19 May 2025 17:02:22 -0700 Subject: [PATCH 10/15] feat: moved data to root --- .gitignore | 6 + data/README.md | 16 ++ .../saved_data => data}/pmcid_mapping.json | 0 src/fetch_articles/README.md | 156 
+++++++++++++++++- src/fetch_articles/article_downloader.py | 51 ++++-- src/fetch_articles/pmcid_converter.py | 53 ++++-- .../saved_data/unique_pmcids.json | 1 - src/load_data/load_clinical_variants.py | 13 +- src/utils/__init__.py | 1 + src/utils/file_paths.py | 10 ++ 10 files changed, 261 insertions(+), 46 deletions(-) create mode 100644 data/README.md rename {src/fetch_articles/saved_data => data}/pmcid_mapping.json (100%) delete mode 100644 src/fetch_articles/saved_data/unique_pmcids.json create mode 100644 src/utils/__init__.py create mode 100644 src/utils/file_paths.py diff --git a/.gitignore b/.gitignore index 6c9c9f4..662a2c9 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,9 @@ src/fetch_articles/saved_data/articles/ *.tar.lzo .DS_Store + +data/articles/ +data/variantAnnotations/ +data/unique_pmcids.json +data/pmid_list.json +data/downloaded_pmcids.json \ No newline at end of file diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..1d26479 --- /dev/null +++ b/data/README.md @@ -0,0 +1,16 @@ +# Data + +This directory contains the primary data files used by the AutoGKB project. + +## Directory Structure + +- **articles/** - Contains XML files of articles from PubMed Central (PMC), identified by their PMCID (e.g., PMC1234567.xml). These articles are used for text mining and information extraction. + +- **variantAnnotations/** - Contains clinical variant annotations and related data: + - `var_drug_ann.tsv` - Variant-drug annotations. This is what is used in this repo. 
+ +- **Support Files**: + - `pmcid_mapping.json` - Maps between PMIDs and PMCIDs + - `unique_pmcids.json` - List of unique PMCIDs in the dataset + - `pmid_list.json` - List of PMIDs in the dataset + - `downloaded_pmcids.json` - Tracking which PMCIDs have been downloaded \ No newline at end of file diff --git a/src/fetch_articles/saved_data/pmcid_mapping.json b/data/pmcid_mapping.json similarity index 100% rename from src/fetch_articles/saved_data/pmcid_mapping.json rename to data/pmcid_mapping.json diff --git a/src/fetch_articles/README.md b/src/fetch_articles/README.md index eed6e26..dbade90 100644 --- a/src/fetch_articles/README.md +++ b/src/fetch_articles/README.md @@ -1,6 +1,7 @@ # PubMed Document Fetching + ## Goal -Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues +Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues. ## Process Overview 1. Download the zip of variants from pharmgkb (handled in load_data module) @@ -8,6 +9,153 @@ Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall 3. Convert the PMID to PMCID 4. Fetch the content from the PMCID -## Saved Data -pmcid_mapping.json: Maps the PMID to the PMCID {"PMID": "PMCID" or Null, ..} -unique_pmcids.json: List of all the unique PMCIDs from pmcid_mapping.json (["PMCID1", "PMCID2", ...]) \ No newline at end of file +## Key Functions + +### PMCID Converter (`pmcid_converter.py`) + +- `batch_pmid_to_pmcid(pmids, email, batch_size, delay)`: Converts a list of PMIDs to PMCIDs using NCBI's ID Converter API. Processes PMIDs in batches and handles rate limiting. 
+ - Arguments: + - `pmids`: List of PMIDs (as strings) + - `email`: Your email for NCBI tool identification + - `batch_size`: Number of PMIDs per request (max: 200) + - `delay`: Seconds between requests (default: 0.4) + - Returns: Dict mapping each PMID to PMCID (or None if not available) + +- `get_unique_pmcids()`: Returns a list of unique PMCIDs from the PMCID mapping file. + +- `load_saved_pmcid_mapping()`: Loads previously saved PMCID mappings from disk. + +- `get_project_root()`: Returns the project root directory path. + +### Article Downloader (`article_downloader.py`) + +- `fetch_pmc_content(pmcid)`: Fetches a single article's content from PubMed Central. + - Arguments: + - `pmcid`: The PubMed Central ID to fetch + - Returns: Article content in XML format or None if fetching failed + +- `download_articles(pmcids)`: Downloads multiple articles from PubMed Central. + - Arguments: + - `pmcids`: List of PMCIDs to download + - Saves downloaded articles to `data/articles/` as XML files + - Tracks downloaded PMCIDs to avoid duplicating work + +- `update_downloaded_pmcids()`: Updates tracking of downloaded PMCIDs from files in `data/articles/` directory. 
+ +## Created Data +- `pmcid_mapping.json`: Maps the PMID to the PMCID `{"PMID": "PMCID" or Null, ..}` +- `unique_pmcids.json`: List of all the unique PMCIDs from pmcid_mapping.json `["PMCID1", "PMCID2", ...]` +- `downloaded_pmcids.json`: Maps PMCIDs to filenames or None if download failed `{"PMCID": "PMCID.xml" or null, ..}` +- `.xml`: Downloaded articles + +## Usage Examples + +### Convert PMIDs to PMCIDs + +```python +from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid +from src.load_data import get_pmid_list +import os +from dotenv import load_dotenv + +load_dotenv() # Load environment variables (NCBI_EMAIL) + +# Get list of PMIDs from variant data +pmid_list = get_pmid_list() + +# Convert PMIDs to PMCIDs +pmcid_mapping = batch_pmid_to_pmcid( + pmids=pmid_list, + email=os.getenv("NCBI_EMAIL"), + batch_size=100, + delay=0.4 +) + +print(f"Successfully mapped {len(pmcid_mapping)} PMIDs to PMCIDs") +``` + +### Download Articles Using PMCIDs + +```python +from src.fetch_articles.article_downloader import download_articles +from src.fetch_articles.pmcid_converter import get_unique_pmcids + +# Get unique PMCIDs from saved mapping +pmcids = get_unique_pmcids() + +# Download articles +download_articles(pmcids) +``` + +### Download a Single Article + +```python +from src.fetch_articles.article_downloader import fetch_pmc_content +from src.fetch_articles.pmcid_converter import get_project_root +import os +from pathlib import Path + +# Get project root +project_root = get_project_root() + +# Fetch a single article +pmcid = "PMC1234567" +content = fetch_pmc_content(pmcid) + +if content: + # Save the article content + articles_dir = project_root / "data" / "articles" + os.makedirs(articles_dir, exist_ok=True) + + with open(articles_dir / f"{pmcid}.xml", "w") as f: + f.write(content.decode("utf-8")) + print(f"Successfully downloaded article {pmcid}") +else: + print(f"Failed to download article {pmcid}") +``` + +### Update Downloaded PMCIDs + +```python +from 
src.fetch_articles.article_downloader import update_downloaded_pmcids + +# Update downloaded_pmcids.json with articles in data/articles/ +update_downloaded_pmcids() +``` + +## Full Pipeline Execution + +To run the complete pipeline (convert PMIDs to PMCIDs and download articles): + +```python +# Full pipeline from PMIDs to downloaded articles +from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid +from src.fetch_articles.article_downloader import download_articles +from src.load_data import get_pmid_list +import os +from dotenv import load_dotenv + +load_dotenv() + +# 1. Get PMIDs from variant data +pmid_list = get_pmid_list() + +# 2. Convert PMIDs to PMCIDs +pmcid_mapping = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL")) + +# 3. Extract only valid PMCIDs (not None) +valid_pmcids = [pmcid for pmcid in pmcid_mapping.values() if pmcid] + +# 4. Download articles +download_articles(valid_pmcids) +``` + +Alternatively, run the module scripts directly: + +```bash +# First convert PMIDs to PMCIDs +python -m src.fetch_articles.pmcid_converter + +# Then download articles +python -m src.fetch_articles.article_downloader +``` \ No newline at end of file diff --git a/src/fetch_articles/article_downloader.py b/src/fetch_articles/article_downloader.py index 13c5cf1..c9179f7 100644 --- a/src/fetch_articles/article_downloader.py +++ b/src/fetch_articles/article_downloader.py @@ -1,5 +1,6 @@ from loguru import logger from src.fetch_articles.pmcid_converter import get_unique_pmcids +from src.utils.file_paths import get_project_root from Bio import Entrez import os import json @@ -7,6 +8,15 @@ def fetch_pmc_content(pmcid): + """ + Fetch content for a single article from PubMed Central. 
+ + Args: + pmcid (str): The PubMed Central ID to fetch + + Returns: + bytes or None: The article content in XML format or None if fetching failed + """ try: handle = Entrez.efetch(db="pmc", id=pmcid, rettype="full", retmode="xml") record = handle.read() @@ -19,18 +29,20 @@ def fetch_pmc_content(pmcid): def update_downloaded_pmcids() -> None: """ - Update the downloaded_pmcids.json file with PMCIDs found in the saved_data/articles directory. + Update the downloaded_pmcids.json file with PMCIDs found in the data/articles directory. """ - base_dir = os.path.dirname(os.path.abspath(__file__)) - downloaded_pmcids_path = os.path.join( - base_dir, "saved_data", "downloaded_pmcids.json" - ) - # Check for all the filenames in the saved_data/articles directory - articles_dir = os.path.join(base_dir, "saved_data", "articles") + project_root = get_project_root() + downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json" + + # Check for all the filenames in the data/articles directory + articles_dir = project_root / "data" / "articles" + os.makedirs(articles_dir, exist_ok=True) + article_pmcids = [f.split(".")[0] for f in os.listdir(articles_dir)] article_pmcids_mapping = {pmcid: f"{pmcid}.xml" for pmcid in article_pmcids} logger.info(f"Found {len(article_pmcids)} existing XML files in {articles_dir}") + # Add the new PMCIDs to the json file if os.path.exists(downloaded_pmcids_path): with open(downloaded_pmcids_path, "r") as f: @@ -43,9 +55,12 @@ def update_downloaded_pmcids() -> None: downloaded_pmcids = {} else: downloaded_pmcids = {} + downloaded_pmcids.update(article_pmcids_mapping) + with open(downloaded_pmcids_path, "w") as f: json.dump(downloaded_pmcids, f) + logger.info( f"Updated {downloaded_pmcids_path} with {len(article_pmcids)} new PMCIDs" ) @@ -55,19 +70,18 @@ def download_articles(pmcids: list[str]): """ Download articles from PubMed Central using PMCIDs. Keeps track of the PMCIDs that have been downloaded and skips them. 
- Saves the downloaded articles to the saved_data/articles directory. + Saves the downloaded articles to the data/articles directory. Args: pmcids (list[str]): List of PMCIDs to download. """ - base_dir = os.path.dirname(os.path.abspath(__file__)) - saved_dir = os.path.join(base_dir, "saved_data", "articles") - os.makedirs(saved_dir, exist_ok=True) + project_root = get_project_root() + articles_dir = project_root / "data" / "articles" + os.makedirs(articles_dir, exist_ok=True) # Load the downloaded PMCIDs from the json file - downloaded_pmcids_path = os.path.join( - base_dir, "saved_data", "downloaded_pmcids.json" - ) + downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json" + if os.path.exists(downloaded_pmcids_path): with open(downloaded_pmcids_path, "r") as f: downloaded_pmcids = json.load(f) @@ -82,20 +96,21 @@ def download_articles(pmcids: list[str]): for pmcid in tqdm(new_pmcids): record = fetch_pmc_content(pmcid) if record: - with open(os.path.join(saved_dir, f"{pmcid}.xml"), "w") as f: + with open(articles_dir / f"{pmcid}.xml", "w") as f: f.write(record.decode("utf-8")) downloaded_pmcids[pmcid] = f"{pmcid}.xml" else: downloaded_pmcids[pmcid] = None logger.warning(f"No record found for PMCID {pmcid}") - logger.info(f"Downloaded {len(downloaded_pmcids)} articles") + + logger.info(f"Downloaded {len(new_pmcids)} new articles, total articles: {len(downloaded_pmcids)}") # Save the downloaded PMCIDs to a json file - with open(os.path.join(base_dir, "saved_data", "downloaded_pmcids.json"), "w") as f: + with open(downloaded_pmcids_path, "w") as f: json.dump(downloaded_pmcids, f) if __name__ == "__main__": update_downloaded_pmcids() pmcids = get_unique_pmcids() - download_articles(pmcids) + download_articles(pmcids) \ No newline at end of file diff --git a/src/fetch_articles/pmcid_converter.py b/src/fetch_articles/pmcid_converter.py index dc5a57c..10b9347 100644 --- a/src/fetch_articles/pmcid_converter.py +++ b/src/fetch_articles/pmcid_converter.py 
@@ -7,6 +7,7 @@ import os from src.load_data import get_pmid_list import json +from src.utils.file_paths import get_project_root load_dotenv() # Email for NCBI @@ -22,12 +23,19 @@ from typing import List, Set, Dict, Optional + + + def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]: """ Load the saved PMCID mapping from the json file. """ - base_dir = os.path.dirname(os.path.abspath(__file__)) - results_path = os.path.join(base_dir, "pmcid_mapping.json") + project_root = get_project_root() + results_path = project_root / "data" / "pmcid_mapping.json" + + # Create data directory if it doesn't exist + os.makedirs(project_root / "data", exist_ok=True) + if os.path.exists(results_path): with open(results_path, "r") as f: existing_results = json.load(f) @@ -43,7 +51,7 @@ def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]: def batch_pmid_to_pmcid( - pmids: List[str], email: str, batch_size: int = 100, delay: float = 0.4 + pmids: List[str], email: str = os.getenv("NCBI_EMAIL"), batch_size: int = 100, delay: float = 0.4 ) -> Dict[str, Optional[str]]: """ Convert a list of PMIDs to PMCIDs using NCBI's ID Converter API. @@ -110,8 +118,12 @@ def batch_pmid_to_pmcid( existing_results.update(results) # Save updated results - base_dir = os.path.dirname(os.path.abspath(__file__)) - results_path = os.path.join(base_dir, "pmcid_mapping.json") + project_root = get_project_root() + results_path = project_root / "data" / "pmcid_mapping.json" + + # Create data directory if it doesn't exist + os.makedirs(project_root / "data", exist_ok=True) + with open(results_path, "w") as f: json.dump(existing_results, f) logger.info(f"Updated PMCID mappings saved to {results_path}") @@ -125,9 +137,14 @@ def get_unique_pmcids() -> List[str]: NOTE: Could add functionality to check for new PMCIDs in mapping and update the unique_pmcids.json file Currently function returns the pre-existing unique PMCIDs if they exist or regenerates the list from the mapping. 
""" + project_root = get_project_root() + # Load the unique PMCIDs if they've already been saved - base_dir = os.path.dirname(os.path.abspath(__file__)) - unique_pmcids_path = os.path.join(base_dir, "saved_data", "unique_pmcids.json") + unique_pmcids_path = project_root / "data" / "unique_pmcids.json" + + # Create data directory if it doesn't exist + os.makedirs(project_root / "data", exist_ok=True) + if os.path.exists(unique_pmcids_path): with open(unique_pmcids_path, "r") as f: try: @@ -143,14 +160,20 @@ def get_unique_pmcids() -> List[str]: return pmcids # Load from pmcid_mapping.json if unique pmcids haven't been saved - results_path = os.path.join(base_dir, "saved_data", "pmcid_mapping.json") + results_path = project_root / "data" / "pmcid_mapping.json" + + if not os.path.exists(results_path): + logger.error(f"No PMCID mapping found at {results_path}. Cannot generate unique PMCIDs.") + return [] + with open(results_path, "r") as f: existing_results = json.load(f) - # get the unique pmcids - pmcids = list(set(existing_results.values())) + + # Get the unique pmcids (remove None values) + pmcids = [value for value in existing_results.values() if value is not None] + pmcids = list(set(pmcids)) # Save the unique pmcids to a json file - unique_pmcids_path = os.path.join(base_dir, "saved_data", "unique_pmcids.json") with open(unique_pmcids_path, "w") as f: json.dump(pmcids, f) logger.info(f"Unique PMCIDs saved to {unique_pmcids_path}") @@ -158,8 +181,8 @@ def get_unique_pmcids() -> List[str]: if __name__ == "__main__": - # pmid_list = get_pmid_list() - # results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL")) - # logger.info(f"PMCID mapping complete. {len(results)} PMIDs mapped to PMCIDs.") + pmid_list = get_pmid_list() + results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL")) + logger.info(f"PMCID mapping complete. 
{len(results)} PMIDs mapped to PMCIDs.") pmcids = get_unique_pmcids() - logger.info(f"Number of unique PMCIDs: {len(pmcids)}") + logger.info(f"Number of unique PMCIDs: {len(pmcids)}") \ No newline at end of file diff --git a/src/fetch_articles/saved_data/unique_pmcids.json b/src/fetch_articles/saved_data/unique_pmcids.json deleted file mode 100644 index c1a43e7..0000000 --- a/src/fetch_articles/saved_data/unique_pmcids.json +++ /dev/null @@ -1 +0,0 @@ -["PMC11850035", "PMC2812115", "PMC2911553", "PMC5583388", "PMC4526634", "PMC6247602", "PMC11758033", "PMC2884029", "PMC3778124", "PMC3044738", "PMC5700353", "PMC1884342", "PMC6289816", "PMC4609097", "PMC1364741", "PMC2857717", "PMC10898793", "PMC4909584", "PMC4630174", "PMC5391214", "PMC10675244", "PMC6647927", "PMC5074472", "PMC11667419", "PMC8185249", "PMC2352037", "PMC4280295", "PMC11512548", "PMC7377539", "PMC11258238", "PMC7499297", "PMC5859345", "PMC11921366", "PMC3872414", "PMC3797132", "PMC5373545", "PMC8455325", "PMC3786328", "PMC3081375", "PMC11825576", "PMC10418744", "PMC1563530", "PMC4032230", "PMC5886039", "PMC3068061", "PMC48077", "PMC4537319", "PMC4730664", "PMC4713720", "PMC3433845", "PMC11241034", "PMC6142943", "PMC7710914", "PMC5496343", "PMC2966433", "PMC6777349", "PMC3310336", "PMC4278770", "PMC5904126", "PMC4342329", "PMC3594083", "PMC6714829", "PMC5496345", "PMC2048549", "PMC3574284", "PMC3454425", "PMC3657889", "PMC3555061", "PMC10618485", "PMC3692386", "PMC3902809", "PMC9468554", "PMC5309131", "PMC2757009", "PMC2592852", "PMC2928561", "PMC2762391", "PMC4583245", "PMC6432766", "PMC7616417", "PMC5749368", "PMC4522133", "PMC3983993", "PMC2920450", "PMC4181635", "PMC3703617", "PMC4702321", "PMC2879959", "PMC4746878", "PMC11871410", "PMC7164646", "PMC5203947", "PMC4867099", "PMC3434304", "PMC6462825", "PMC4435089", "PMC2681284", "PMC1952551", "PMC5866313", "PMC3016221", "PMC3837290", "PMC10848431", "PMC3565812", "PMC3787223", "PMC4226857", "PMC10675623", "PMC9768477", "PMC2485247", "PMC4503374", 
"PMC4565152", "PMC4206345", "PMC1873971", "PMC2364178", "PMC3230303", "PMC11016593", "PMC3622803", "PMC11666798", "PMC4956330", "PMC5354739", "PMC4307337", "PMC3836273", "PMC2292110", "PMC2630264", "PMC4959996", "PMC11421434", "PMC11787782", "PMC3561425", "PMC2921956", "PMC5538123", "PMC6612579", "PMC3539557", "PMC8553963", "PMC9585281", "PMC3938989", "PMC11552228", "PMC9501307", "PMC11203291", "PMC3786570", "PMC5514947", "PMC10566653", "PMC3944214", "PMC4476880", "PMC2733171", "PMC3164274", "PMC5142600", "PMC7274090", "PMC10537526", "PMC6493603", "PMC3567337", "PMC6587209", "PMC7455128", "PMC5324942", "PMC2666924", "PMC11544447", "PMC3603284", "PMC11786019", "PMC4560372", "PMC3734199", "PMC4525256", "PMC4488893", "PMC3786668", "PMC11803932", "PMC4513254", "PMC6005582", "PMC5940523", "PMC6923423", "PMC2913479", "PMC8429954", "PMC5306247", "PMC3529147", "PMC3358293", "PMC6038204", "PMC5541380", "PMC11717999", "PMC10179231", "PMC2966981", "PMC10995391", "PMC4254688", "PMC8702453", "PMC11257390", "PMC11052159", null, "PMC10858860", "PMC4154892", "PMC4613195", "PMC4387236", "PMC4628029", "PMC1754569", "PMC3381232", "PMC3579501", "PMC4716887", "PMC11855146", "PMC3093392", "PMC10668244", "PMC4437521", "PMC9875006", "PMC10532907", "PMC4890827", "PMC3639978", "PMC4876188", "PMC4868001", "PMC4330076", "PMC8758337", "PMC4533232", "PMC4982581", "PMC2432487", "PMC11160041", "PMC4134280", "PMC11913886", "PMC5558541", "PMC4011617", "PMC7005197", "PMC7674153", "PMC5678480", "PMC2773991", "PMC11492722", "PMC4503103", "PMC11063049", "PMC11082567", "PMC5862636", "PMC3617060", "PMC12043259", "PMC3279522", "PMC8438567", "PMC5871545", "PMC3625373", "PMC2756088", "PMC7260086", "PMC10298263", "PMC4270923", "PMC4390701", "PMC6003833", "PMC3769669", "PMC11049768", "PMC1755496", "PMC2952572", "PMC9536193", "PMC4762905", "PMC1773505", "PMC5386607", "PMC3390407", "PMC9914414", "PMC6357964", "PMC3946972", "PMC4023787", "PMC6523194", "PMC5483245", "PMC2673121", "PMC5903228", "PMC3369131", 
"PMC11528939", "PMC11159193", "PMC2291274", "PMC4296935", "PMC3158597", "PMC3273458", "PMC6174029", "PMC4490522", "PMC5148898", "PMC6248022", "PMC5468510", "PMC4015881", "PMC2853591", "PMC3396003", "PMC2515139", "PMC3292264", "PMC3632552", "PMC2680291", "PMC4938133", "PMC5614982", "PMC4812555", "PMC2492917", "PMC4616511", "PMC6987567", "PMC9608913", "PMC3555879", "PMC2943151", "PMC4448076", "PMC2766479", "PMC4835128", "PMC3944116", "PMC10931982", "PMC4452656", "PMC10159199", "PMC2561120", "PMC4613221", "PMC10214567", "PMC9801627", "PMC3505921", "PMC7968507", "PMC2966859", "PMC5743122", "PMC4641035", "PMC3760990", "PMC11059713", "PMC4892970", "PMC2722908", "PMC3746708", "PMC4601717", "PMC4462610", "PMC1874463", "PMC11003701", "PMC4591203", "PMC1474035", "PMC3880259", "PMC4892378", "PMC9820603", "PMC3952719", "PMC5564514", "PMC7870766", "PMC5411458", "PMC10827494", "PMC11524821", "PMC8182957", "PMC6265082", "PMC5342670", "PMC3940150", "PMC2810514", "PMC3808494", "PMC2762405", "PMC11638344", "PMC2386778", "PMC4012347", "PMC5207665", "PMC4433569", "PMC4697903", "PMC5427048", "PMC3523080", "PMC6046471", "PMC7993015", "PMC6034060", "PMC10880038", "PMC4199712", "PMC6409308", "PMC5346037", "PMC4594719", "PMC1974827", "PMC5266160", "PMC3729209", "PMC8954661", "PMC5684285", "PMC3845218", "PMC3161212", "PMC5500390", "PMC4631184", "PMC2664151", "PMC3860742", "PMC4099069", "PMC6179259", "PMC10091789", "PMC4160394", "PMC6033076", "PMC7375060", "PMC8578201", "PMC4078496", "PMC3522814", "PMC5563830", "PMC4996314", "PMC11354576", "PMC3291838", "PMC5306492", "PMC3384479", "PMC3899768", "PMC5432414", "PMC3518380", "PMC4432150", "PMC5355121", "PMC3674704", "PMC3378722", "PMC3727245", "PMC3941038", "PMC2751283", "PMC6767327", "PMC3553682", "PMC1029622", "PMC4511425", "PMC3248259", "PMC4385537", "PMC4195667", "PMC4757974", "PMC3330749", "PMC9891445", "PMC2679896", "PMC2810802", "PMC3871508", "PMC4171106", "PMC2820245", "PMC10810687", "PMC8540141", "PMC3997354", "PMC11088557", 
"PMC4130425", "PMC4615534", "PMC4468641", "PMC3735354", "PMC6734474", "PMC6542686", "PMC5753622", "PMC9931738", "PMC4300289", "PMC5612381", "PMC5543069", "PMC4229256", "PMC2883666", "PMC11603417", "PMC1365155", "PMC6231319", "PMC3682424", "PMC2715837", "PMC5526237", "PMC3621246", "PMC4190075", "PMC5983535", "PMC4519823", "PMC5508045", "PMC5346878", "PMC6328871", "PMC4272010", "PMC7215378", "PMC3890033", "PMC3641305", "PMC8841435", "PMC8137991", "PMC3873034", "PMC4043918", "PMC5619051", "PMC11685162", "PMC11509751", "PMC3130093", "PMC11860030", "PMC3098751", "PMC5519037", "PMC11221861", "PMC4087845", "PMC8238023", "PMC4872310", "PMC9601332", "PMC5798599", "PMC3182303", "PMC3611944", "PMC10967865", "PMC10645035", "PMC3348126", "PMC5316146", "PMC10377184", "PMC2014902", "PMC3264276", "PMC10838100", "PMC10607223", "PMC5469860", "PMC11134291", "PMC6313513", "PMC6927671", "PMC3525178", "PMC9961245", "PMC10864595", "PMC1873375", "PMC8533258", "PMC6562943", "PMC3544007", "PMC4833150", "PMC5645220", "PMC6586010", "PMC8513493", "PMC9314634", "PMC6400024", "PMC10196221", "PMC11887348", "PMC11244643", "PMC9256318", "PMC11703455", "PMC5079351", "PMC11393095", "PMC4915265", "PMC8530979", "PMC7398416", "PMC8822703", "PMC3092713", "PMC4456129", "PMC3780966", "PMC3608305", "PMC4224698", "PMC11481807", "PMC11887086", "PMC10815823", "PMC6448146", "PMC2014166", "PMC10163902", "PMC3749354", "PMC5883590", "PMC8742641", "PMC4965653", "PMC3604156", "PMC4702374", "PMC5505550", "PMC3114195", "PMC4356257", "PMC5727754", "PMC4995153", "PMC2959002", "PMC442471", "PMC4454552", "PMC3030919", "PMC11111788", "PMC4137828", "PMC4916778", "PMC11773121", "PMC8673616", "PMC6347826", "PMC2859392", "PMC3352974", "PMC2288721", "PMC4682920", "PMC4581326", "PMC4365300", "PMC2480976", "PMC4169411", "PMC6613715", "PMC6745302", "PMC8724172", "PMC6262886", "PMC4479596", "PMC2168111", "PMC4636889", "PMC4594699", "PMC5904201", "PMC7089776", "PMC5604555", "PMC6092108", "PMC3749570", "PMC4631197", "PMC6479273", 
"PMC6942309", "PMC4183989", "PMC4368615", "PMC3525665", "PMC4820801", "PMC9298338", "PMC6472479", "PMC3931261", "PMC4017364", "PMC11152251", "PMC4500334", "PMC4168388", "PMC11677811", "PMC3673300", "PMC10583240", "PMC3214266", "PMC5282793", "PMC3775655", "PMC3818406", "PMC2949522", "PMC524175", "PMC4862932", "PMC3137420", "PMC2830598", "PMC3818912", "PMC4803610", "PMC9582748", "PMC4034115", "PMC3867202", "PMC3690108", "PMC9297921", "PMC9537548", "PMC5589489", "PMC3055694", "PMC4012056", "PMC2903324", "PMC2686066", "PMC10152845", "PMC3667657", "PMC4308646", "PMC6411694", "PMC5009007", "PMC10834390", "PMC3403289", "PMC10909096", "PMC2992873", "PMC8445626", "PMC6969041", "PMC3894627", "PMC6786370", "PMC5833535", "PMC6505090", "PMC11359404", "PMC8672325", "PMC6980920", "PMC10666731", "PMC7963143", "PMC5382092", "PMC4461653", "PMC2860533", "PMC5018246", "PMC4479153", "PMC6451710", "PMC4301945", "PMC4557249", "PMC4157963", "PMC6995013", "PMC11608742", "PMC10668502", "PMC4480925", "PMC11573879", "PMC1975838", "PMC4231027", "PMC5818817", "PMC6037621", "PMC6801039", "PMC4943245", "PMC2014539", "PMC11555502", "PMC2995295", "PMC5006145", "PMC4055378", "PMC4484512", "PMC11418302", "PMC10278212", "PMC4057281", "PMC6219441", "PMC5598801", "PMC11754044", "PMC3984266", "PMC10778798", "PMC4274707", "PMC5651309", "PMC11252221", "PMC4265416", "PMC11475898", "PMC4752391", "PMC3624039", "PMC11401437", "PMC2652833", "PMC3774043", "PMC7431691", "PMC7039325", "PMC6086578", "PMC7655626", "PMC5521342", "PMC3910846", "PMC6851426", "PMC3139013", "PMC11773116", "PMC8458697", "PMC10527451", "PMC5241185", "PMC4151614", "PMC1874262", "PMC3461952", "PMC3818518", "PMC3653303", "PMC5411211", "PMC4356640", "PMC4672523", "PMC3414671", "PMC3485381", "PMC5877743", "PMC3481266", "PMC8295171", "PMC4800352", "PMC4693492", "PMC10501538", "PMC4154311", "PMC10309098", "PMC6813860", "PMC8953705", "PMC4366347", "PMC9925376", "PMC10917709", "PMC1365072", "PMC6014560", "PMC4292894", "PMC8426351", "PMC6612264", 
"PMC7319006", "PMC10502099", "PMC3726442", "PMC4500328", "PMC5711571", "PMC8940650", "PMC4345005", "PMC8604252", "PMC5319785", "PMC5233579", "PMC3658129", "PMC6493124", "PMC4892373", "PMC3401172", "PMC4527535", "PMC5293674", "PMC9080200", "PMC6591035", "PMC2684883", "PMC4243902", "PMC11652804", "PMC10883345", "PMC2668081", "PMC4324232", "PMC11159294", "PMC10982510", "PMC4737107", "PMC11148365", "PMC6510382", "PMC6216325", "PMC2518836", "PMC2830602", "PMC4694426", "PMC3394147", "PMC4335884", "PMC6486881", "PMC6461793", "PMC5903579", "PMC10349379", "PMC2647710", "PMC10557961", "PMC7115946", "PMC5370513", "PMC10409991", "PMC5298887", "PMC11995662", "PMC4836090", "PMC2791975", "PMC2726911", "PMC3164277", "PMC4111883", "PMC11315837", "PMC3246196", "PMC11531276", "PMC2750008", "PMC11884701", "PMC9830790", "PMC2662935", "PMC6941886", "PMC5887212", "PMC11240873", "PMC10970167", "PMC7793629", "PMC5898372", "PMC4038142", "PMC11208962", "PMC8880478", "PMC5716599", "PMC6298606", "PMC4661296", "PMC2888980", "PMC1884506", "PMC4542662", "PMC3579261", "PMC4872428", "PMC3984158", "PMC2935997", "PMC9810307", "PMC6989102", "PMC5975540", "PMC5299197", "PMC3571021", "PMC3582836", "PMC3376437", "PMC3513646", "PMC1087660", "PMC5065384", "PMC5176308", "PMC11668066", "PMC10990950", "PMC8163522", "PMC4425504", "PMC7883889", "PMC8505487", "PMC4502741", "PMC3555056", "PMC3675749", "PMC7292295", "PMC4184528", "PMC5727167", "PMC3444290", "PMC5562097", "PMC4441275", "PMC5591096", "PMC10684410", "PMC4298011", "PMC7393710", "PMC3584248", "PMC4105486", "PMC5711795", "PMC3753270", "PMC5520553", "PMC2014233", "PMC9301121", "PMC6920759", "PMC1885008", "PMC4116670", "PMC4220988", "PMC4846779", "PMC3329222", "PMC4445755", "PMC6132901", "PMC7193447", "PMC3454958", "PMC2896457", "PMC3628804", "PMC1995596", "PMC3508798", "PMC10972729", "PMC4794377", "PMC4692529", "PMC3922978", "PMC1365132", "PMC2886925", "PMC5138058", "PMC5425333", "PMC7115450", "PMC6654446", "PMC6006403", "PMC4304713", "PMC3006662", 
"PMC2949912", "PMC5700347", "PMC5461999", "PMC5768901", "PMC6089815", "PMC4762902", "PMC4690185", "PMC5438821", "PMC5189722", "PMC9809306", "PMC2644687", "PMC4631185", "PMC5534241", "PMC5373543", "PMC4100708", "PMC1237155", "PMC8373649", "PMC4693577", "PMC9328121", "PMC2737687", "PMC3249179", "PMC5377478", "PMC5531276", "PMC8890732", "PMC8108700", "PMC6387687", "PMC2683977", "PMC6054772", "PMC3237821", "PMC4345081", "PMC6518412", "PMC7086280", "PMC5440888", "PMC2922203", "PMC3712827", "PMC4612590", "PMC2596476", "PMC6773496", "PMC11246114", "PMC3038469", "PMC4969350", "PMC11763628", "PMC5734971", "PMC4719145", "PMC3208318", "PMC5763654", "PMC4573240", "PMC8184575", "PMC4444267", "PMC4168390", "PMC9584256", "PMC3020258", "PMC2901912", "PMC5135610", "PMC5932771", "PMC4473094", "PMC2896826", "PMC3985268", "PMC7351433", "PMC4760888", "PMC5287983", "PMC2709885", "PMC2364770", "PMC5057355", "PMC3988537", "PMC5817388", "PMC3656883", "PMC11435314", "PMC3448899", "PMC1884346", "PMC5423974", "PMC11102648", "PMC4257570", "PMC2906637", "PMC10381361", "PMC4828529", "PMC9890192", "PMC4503705", "PMC1884285", "PMC6587626", "PMC2641037", "PMC4797547", "PMC3988270", "PMC8505452", "PMC10876746", "PMC6760244", "PMC4425056", "PMC3846997", "PMC4296254", "PMC5449482", "PMC11755583", "PMC5161051", "PMC11049954", "PMC11558073", "PMC4805204", "PMC5789875", "PMC6800829", "PMC10908252", "PMC4551162", "PMC10495004", "PMC4595504", "PMC5943457", "PMC11246689", "PMC5744175", "PMC4618180", "PMC8222836", "PMC5298566", "PMC4541975", "PMC10951231", "PMC3055457", "PMC1978168", "PMC10526247", "PMC6493076", "PMC2650539", "PMC4707035", "PMC4151246", "PMC4413900", "PMC4454285", "PMC4930967", "PMC10787143", "PMC4116556", "PMC2754599", "PMC9306465", "PMC11317398", "PMC3415853", "PMC3852421", "PMC11158672", "PMC3575609", "PMC11891766", "PMC2794921", "PMC5546852", "PMC7221122", "PMC10914946", "PMC2586993", "PMC4574839", "PMC8204702", "PMC4982759", "PMC2950972", "PMC7235792", "PMC3521860", "PMC11730665", 
"PMC5611711", "PMC9515473", "PMC2291379", "PMC4498287", "PMC3360546", "PMC5945500", "PMC6361127", "PMC3943570", "PMC6426691", "PMC11158323", "PMC5800559", "PMC6151284", "PMC6542461", "PMC5220536", "PMC5610780", "PMC3100585", "PMC4405819", "PMC3912955", "PMC10782740", "PMC5903239", "PMC7302666", "PMC11271148", "PMC9841299", "PMC3195031", "PMC6081148", "PMC3756535", "PMC6357360", "PMC5538305", "PMC4706412", "PMC2865873", "PMC5264271", "PMC4498982", "PMC10483403", "PMC4177494", "PMC2976715", "PMC11011338", "PMC3633658", "PMC2679107", "PMC5465325", "PMC7375952", "PMC2896566", "PMC3652476", "PMC556232", "PMC11106956", "PMC11236688", "PMC8132880", "PMC3282030", "PMC4169706", "PMC2757655", "PMC3909010", "PMC4896103", "PMC4243881", "PMC5392306", "PMC4575538", "PMC8975736", "PMC3419350", "PMC5509475", "PMC2925052", "PMC7497848", "PMC3947488", "PMC2276142", "PMC6373376", "PMC3461592", "PMC6501809", "PMC10154044", "PMC4002970", "PMC5028170", "PMC4208722", "PMC9610285", "PMC2556451", "PMC5029084", "PMC2564574", "PMC5485718", "PMC5003027", "PMC5604731", "PMC2908290", "PMC5901893", "PMC4865408", "PMC8917764", "PMC10349800", "PMC10499425", "PMC11703419", "PMC4236071", "PMC5980466", "PMC6411020", "PMC5101708", "PMC11628867", "PMC11102100", "PMC4731723", "PMC8571740", "PMC7649675", "PMC6595468", "PMC5763318", "PMC4949007", "PMC5323433", "PMC4703773", "PMC1401654", "PMC5875353", "PMC4541974", "PMC8973308", "PMC11933031", "PMC11720188", "PMC4038024", "PMC5808057", "PMC1884959", "PMC6493375", "PMC5145728", "PMC3992925", "PMC5807179", "PMC3499361", "PMC3760447", "PMC4931969", "PMC3049596", "PMC4110085", "PMC5007158", "PMC6475679", "PMC4947669", "PMC5875925", "PMC11310823", "PMC1884261", "PMC3172251", "PMC6171340", "PMC7245057", "PMC7340566", "PMC7388522", "PMC3766937", "PMC10099095", "PMC6125540", "PMC4220464", "PMC11269678", "PMC5346875", "PMC8106923", "PMC4113831", "PMC10244018", "PMC5346034", "PMC3320544", "PMC1963422", "PMC3530397", "PMC6246957", "PMC4590670", "PMC9974434", 
"PMC3612775", "PMC1885108", "PMC6714673", "PMC5659294", "PMC8578190", "PMC5427244", "PMC3991683", "PMC4115247", "PMC5412025", "PMC10038974", "PMC8915292", "PMC10230242", "PMC11404698", "PMC6742943", "PMC7497238", "PMC8472669", "PMC4855508", "PMC5152628", "PMC5651327", "PMC9028965", "PMC3597465", "PMC5478306", "PMC6631257", "PMC3833422", "PMC4119242", "PMC2792638", "PMC6046506", "PMC3468617", "PMC6489578", "PMC11314417", "PMC11347466", "PMC4470685", "PMC3116045", "PMC11809887", "PMC4833149", "PMC5726942", "PMC2749505", "PMC11852071", "PMC5510236", "PMC4598210", "PMC1251635", "PMC10463210", "PMC4469933", "PMC10747255", "PMC10582663", "PMC10957942", "PMC3570048", "PMC6071997", "PMC4406866", "PMC10275785", "PMC4297489", "PMC10565537", "PMC3865618", "PMC6855320", "PMC11022290", "PMC5749387", "PMC3978988", "PMC9322346", "PMC8599229", "PMC3598593", "PMC3610685", "PMC5599305", "PMC9657232", "PMC5402961", "PMC5524513", "PMC4221105", "PMC2704695", "PMC2910688", "PMC5316454", "PMC5249113", "PMC9934922", "PMC3959225", "PMC4735961", "PMC5533497", "PMC5492788", "PMC4922322", "PMC3805522", "PMC10139129", "PMC5963414", "PMC4854407", "PMC4039203", "PMC3734608", "PMC4343187", "PMC10327396", "PMC8263746", "PMC2748889", "PMC8081740", "PMC4108472", "PMC2042888", "PMC3093079", "PMC5829963", "PMC5949564", "PMC5656562", "PMC4640545", "PMC3071070", "PMC3383686", "PMC8767566", "PMC4999337", "PMC4271081", "PMC5395152", "PMC4872305", "PMC2794198", "PMC3478502", "PMC5558527", "PMC3066089", "PMC4764353", "PMC3471928", "PMC10145266", "PMC10599059", "PMC5899062", "PMC9552901", "PMC3137047", "PMC7305826", "PMC9481373", "PMC4375579", "PMC4010098", "PMC7217737", "PMC1365130", "PMC3476140", "PMC3755037", "PMC2981241", "PMC3834132", "PMC4931885", "PMC3148255", "PMC2767285", "PMC3901533", "PMC3548029", "PMC4651007", "PMC11140026", "PMC9413960", "PMC4282597", "PMC4484731", "PMC9450009", "PMC5903234", "PMC11269006", "PMC5795999", "PMC1769026", "PMC8100460", "PMC4375304", "PMC4332701", "PMC10880264", 
"PMC8441053", "PMC4701680", "PMC4412845", "PMC2732914", "PMC9701885", "PMC3779247", "PMC5632935", "PMC4364852", "PMC4002408", "PMC2919241", "PMC1762324", "PMC9321338", "PMC2858245", "PMC3245828", "PMC2599947", "PMC5421731", "PMC3260990", "PMC5087931", "PMC3637851", "PMC8359222", "PMC3100476", "PMC9532634", "PMC11508189", "PMC6960206", "PMC5558529", "PMC4000411", "PMC10648962", "PMC10769478", "PMC3131846", "PMC11023817", "PMC3698861", "PMC4667947", "PMC4544820", "PMC2675161", "PMC4025175", "PMC5606007", "PMC7039663", "PMC4921119", "PMC3143437", "PMC5309133", "PMC11458732", "PMC6370172", "PMC5192124", "PMC4788379", "PMC2000640", "PMC10085626", "PMC5061780", "PMC5908896", "PMC4631186", "PMC3248257", "PMC4814312", "PMC4240933", "PMC10478012", "PMC3244642", "PMC6128165", "PMC4209173", "PMC4585967", "PMC11584383", "PMC4503165", "PMC4778608", "PMC11012255", "PMC7613628", "PMC4972156", "PMC9373641", "PMC5391994", "PMC4876172", "PMC7292331", "PMC9820795", "PMC3048137", "PMC2000718", "PMC11507373", "PMC3425006", "PMC4505931", "PMC8800862", "PMC4806848", "PMC5546927", "PMC3672984", "PMC11862786", "PMC5346382", "PMC3061841", "PMC11000398", "PMC11141156", "PMC5655282", "PMC4764723", "PMC7423195", "PMC10452379", "PMC5342450", "PMC7197488", "PMC1364713", "PMC6759913", "PMC3753327", "PMC8141066", "PMC2014382", "PMC3686783", "PMC5048209", "PMC4928097", "PMC4735517", "PMC6021962", "PMC2885152", "PMC10529681", "PMC4892230", "PMC2570505", "PMC4462564", "PMC6939828", "PMC2042718", "PMC10825484", "PMC4338734", "PMC5404990", "PMC6631360", "PMC7028104", "PMC4615595", "PMC1767618", "PMC6891932", "PMC3213989", "PMC3680019", "PMC11094496", "PMC5817390", "PMC5944577", "PMC4943390", "PMC11140815", "PMC11605493", "PMC3462355", "PMC1746721", "PMC3925114", "PMC3895354", "PMC3125052", "PMC4669157", "PMC5098919", "PMC11520374", "PMC3029819", "PMC9031832", "PMC3107291", "PMC5908314", "PMC4600600", "PMC3506814", "PMC6049926", "PMC5412267", "PMC5355968", "PMC539815", "PMC3640375", "PMC6408006", 
"PMC5051541", "PMC2660379", "PMC4323272", "PMC4104334", "PMC4976849", "PMC3180021", "PMC7303159", "PMC10532840", "PMC2760462", "PMC10337687", "PMC5548439", "PMC11264771", "PMC3550197", "PMC11943653", "PMC7347085", "PMC7214659", "PMC4722076", "PMC4155516", "PMC3958404", "PMC6752321", "PMC7427977", "PMC2957581", "PMC3080643", "PMC9819208", "PMC3225067", "PMC4932617", "PMC6375065", "PMC3175513", "PMC7718230", "PMC3537445", "PMC10758687", "PMC3858547", "PMC3370715", "PMC11095822", "PMC5167198", "PMC2976128", "PMC3734060", "PMC10852661", "PMC4201132", "PMC10974048", "PMC6423619", "PMC3621996", "PMC3910794", "PMC7999651", "PMC11120965", "PMC5378677", "PMC9468644", "PMC5600689", "PMC7308427", "PMC3034442", "PMC16264", "PMC3776990", "PMC1887589", "PMC2855513", "PMC5721751", "PMC2194758", "PMC2547143", "PMC5590735", "PMC2743299", "PMC4350512", "PMC6011347"] \ No newline at end of file diff --git a/src/load_data/load_clinical_variants.py b/src/load_data/load_clinical_variants.py index 492abfc..310efed 100644 --- a/src/load_data/load_clinical_variants.py +++ b/src/load_data/load_clinical_variants.py @@ -7,6 +7,7 @@ import pandas as pd import json +from src.utils.file_paths import get_project_root """ This file contains functions to load the clinical variants data from the PharmGKB API. The key function is get_pmid_list(), which loads the PMIDs from the variant annotations tsv file and saves them to a json file. 
@@ -24,8 +25,7 @@ def download_and_extract_variant_annotations(override: bool = False) -> str: """ url = "https://api.pharmgkb.org/v1/download/file/data/variantAnnotations.zip" - base_dir = os.path.dirname(os.path.abspath(__file__)) - save_dir = os.path.join(base_dir, "saved_data") + save_dir = os.path.join(get_project_root(), "data") extract_dir = os.path.join(save_dir, "variantAnnotations") if os.path.exists(extract_dir): @@ -58,9 +58,8 @@ def load_raw_variant_annotations(override: bool = False) -> pd.DataFrame: Returns: pd.DataFrame: The loaded variant annotations tsv file. """ - base_dir = os.path.dirname(os.path.abspath(__file__)) tsv_path = os.path.join( - base_dir, "saved_data", "variantAnnotations", "var_drug_ann.tsv" + get_project_root(), "data", "variantAnnotations", "var_drug_ann.tsv" ) if not os.path.exists(tsv_path): @@ -102,8 +101,7 @@ def load_unique_variants(save_results: bool = True) -> dict: If the json file already exists, it will be loaded from the file. NOTE: Don't think this function is needed anymore. get_pmid_list() is used instead. """ - base_dir = os.path.dirname(os.path.abspath(__file__)) - unique_variants_path = os.path.join(base_dir, "saved_data", "unique_variants.json") + unique_variants_path = os.path.join(get_project_root(), "data", "unique_variants.json") if os.path.exists(unique_variants_path): logger.info(f"Loading unique variants from {unique_variants_path}") with open(unique_variants_path, "r") as f: @@ -125,8 +123,7 @@ def get_pmid_list(override: bool = False) -> list: """ Loads the pmid list from the variant annotations tsv file. 
""" - base_dir = os.path.dirname(os.path.abspath(__file__)) - pmid_list_path = os.path.join(base_dir, "saved_data", "pmid_list.json") + pmid_list_path = os.path.join(get_project_root(), "data", "pmid_list.json") if os.path.exists(pmid_list_path): logger.info(f"Loading PMIDs from {pmid_list_path}") with open(pmid_list_path, "r") as f: diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..10433a3 --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1 @@ +from .file_paths import get_project_root \ No newline at end of file diff --git a/src/utils/file_paths.py b/src/utils/file_paths.py new file mode 100644 index 0000000..1d37993 --- /dev/null +++ b/src/utils/file_paths.py @@ -0,0 +1,10 @@ +import os +from pathlib import Path + +def get_project_root() -> Path: + """ + Return the project root directory. + """ + # Assuming src is a top-level directory in the project + current_file = Path(__file__) + return current_file.parent.parent.parent From 7531b2bee6c62742ebf825ce40a41a17615b2db2 Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 19 May 2025 17:02:47 -0700 Subject: [PATCH 11/15] chore: updated gitignore --- .gitignore | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 662a2c9..be46693 100644 --- a/.gitignore +++ b/.gitignore @@ -19,9 +19,12 @@ __pycache__ .env # data -src/load_data/saved_data/ -src/fetch_articles/saved_data/downloaded_pmcids.json -src/fetch_articles/saved_data/articles/ +data/articles/ +data/variantAnnotations/ +data/unique_pmcids.json +data/pmid_list.json +data/downloaded_pmcids.json + *.zip *.tar.gz *.tar.bz2 @@ -32,8 +35,3 @@ src/fetch_articles/saved_data/articles/ .DS_Store -data/articles/ -data/variantAnnotations/ -data/unique_pmcids.json -data/pmid_list.json -data/downloaded_pmcids.json \ No newline at end of file From 6d521651759d04d4f1970d75e097e777e34fae1e Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 19 May 2025 17:04:47 
-0700 Subject: [PATCH 12/15] docs: readme update --- data/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/data/README.md b/data/README.md index 1d26479..5fef6c8 100644 --- a/data/README.md +++ b/data/README.md @@ -8,6 +8,7 @@ This directory contains the primary data files used by the AutoGKB project. - **variantAnnotations/** - Contains clinical variant annotations and related data: - `var_drug_ann.tsv` - Variant-drug annotations. This is what is used in this repo. + - This can be downloaded using download_and_extract_variant_annotations from the load_data module - **Support Files**: - `pmcid_mapping.json` - Maps between PMIDs and PMCIDs From f4ce245d4742259541159ade095e529379b60dfb Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 19 May 2025 17:15:15 -0700 Subject: [PATCH 13/15] feat: load variants pipeline --- data/README.md | 2 +- pixi.toml | 1 + src/benchmark/annotation.py | 2 +- src/fetch_articles/README.md | 8 ++-- src/fetch_articles/pmcid_converter.py | 2 +- src/{load_data => load_variants}/README.md | 4 +- src/{load_data => load_variants}/__init__.py | 2 +- .../load_clinical_variants.py | 44 ++++++++----------- 8 files changed, 29 insertions(+), 36 deletions(-) rename src/{load_data => load_variants}/README.md (91%) rename src/{load_data => load_variants}/__init__.py (58%) rename src/{load_data => load_variants}/load_clinical_variants.py (77%) diff --git a/data/README.md b/data/README.md index 5fef6c8..659b32f 100644 --- a/data/README.md +++ b/data/README.md @@ -8,7 +8,7 @@ This directory contains the primary data files used by the AutoGKB project. - **variantAnnotations/** - Contains clinical variant annotations and related data: - `var_drug_ann.tsv` - Variant-drug annotations. This is what is used in this repo. 
- - This can be downloaded using download_and_extract_variant_annotations from the load_data module + - This can be downloaded using download_and_extract_variant_annotations from the load_variants module - **Support Files**: - `pmcid_mapping.json` - Maps between PMIDs and PMCIDs diff --git a/pixi.toml b/pixi.toml index 6c5d7fd..4bb20c4 100644 --- a/pixi.toml +++ b/pixi.toml @@ -12,6 +12,7 @@ platforms = ["osx-arm64"] version = "0.1.0" [tasks] +download-variants = "python -m src.load_variants.load_clinical_variants" update-download-map = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'" download-articles = "python -m src.fetch_articles.article_downloader" diff --git a/src/benchmark/annotation.py b/src/benchmark/annotation.py index 87b0da0..65ab710 100644 --- a/src/benchmark/annotation.py +++ b/src/benchmark/annotation.py @@ -1,5 +1,5 @@ from pydantic import BaseModel -from src.load_data import load_raw_variant_annotations +from src.load_variants import load_raw_variant_annotations """ Denotes a class for a variant annotation (row in var_drug_ann.tsv) diff --git a/src/fetch_articles/README.md b/src/fetch_articles/README.md index dbade90..19503ef 100644 --- a/src/fetch_articles/README.md +++ b/src/fetch_articles/README.md @@ -4,8 +4,8 @@ Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues. ## Process Overview -1. Download the zip of variants from pharmgkb (handled in load_data module) -2. Get a PMID list from the variants tsv (column PMID) (handled in load_data module) +1. Download the zip of variants from pharmgkb (handled in load_variants module) +2. Get a PMID list from the variants tsv (column PMID) (handled in load_variants module) 3. Convert the PMID to PMCID 4. Fetch the content from the PMCID @@ -54,7 +54,7 @@ Given a PMID, fetch the paper from PubMed. 
Ignore papers where there are paywall ```python from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid -from src.load_data import get_pmid_list +from src.load_variants import get_pmid_list import os from dotenv import load_dotenv @@ -131,7 +131,7 @@ To run the complete pipeline (convert PMIDs to PMCIDs and download articles): # Full pipeline from PMIDs to downloaded articles from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid from src.fetch_articles.article_downloader import download_articles -from src.load_data import get_pmid_list +from src.load_variants import get_pmid_list import os from dotenv import load_dotenv diff --git a/src/fetch_articles/pmcid_converter.py b/src/fetch_articles/pmcid_converter.py index 10b9347..e24fc00 100644 --- a/src/fetch_articles/pmcid_converter.py +++ b/src/fetch_articles/pmcid_converter.py @@ -5,7 +5,7 @@ from tqdm import tqdm from dotenv import load_dotenv import os -from src.load_data import get_pmid_list +from src.load_variants import get_pmid_list import json from src.utils.file_paths import get_project_root diff --git a/src/load_data/README.md b/src/load_variants/README.md similarity index 91% rename from src/load_data/README.md rename to src/load_variants/README.md index 311203f..0b6ca40 100644 --- a/src/load_data/README.md +++ b/src/load_variants/README.md @@ -6,7 +6,7 @@ This module handles the loading and preprocessing of PharmGKB clinical variants 1. **`download_and_extract_variant_annotations(override: bool = False)`** - Downloads and extracts the variant annotations ZIP file from PharmGKB - - Saves data to `saved_data/variantAnnotations/` + - Saves data to `data/variantAnnotations/` - Can override existing downloads if needed 2. **`load_raw_variant_annotations(override: bool = False)`** @@ -21,7 +21,7 @@ This module handles the loading and preprocessing of PharmGKB clinical variants 4. 
**`get_pmid_list(override: bool = False)`** - Main function to extract PMIDs from the variant annotations - Returns a list of unique PMIDs - - Caches results in `saved_data/pmid_list.json` + - Caches results in `data/pmid_list.json` - Used as input for PMCID conversion The module handles all data downloading, extraction, and preprocessing steps needed to get the PMID list for subsequent steps in the pipeline. diff --git a/src/load_data/__init__.py b/src/load_variants/__init__.py similarity index 58% rename from src/load_data/__init__.py rename to src/load_variants/__init__.py index 90cbc32..4142345 100644 --- a/src/load_data/__init__.py +++ b/src/load_variants/__init__.py @@ -1 +1 @@ -from .load_clinical_variants import load_raw_variant_annotations, get_pmid_list \ No newline at end of file +from .load_clinical_variants import load_raw_variant_annotations, get_pmid_list, variant_annotations_pipeline \ No newline at end of file diff --git a/src/load_data/load_clinical_variants.py b/src/load_variants/load_clinical_variants.py similarity index 77% rename from src/load_data/load_clinical_variants.py rename to src/load_variants/load_clinical_variants.py index 310efed..dd868fe 100644 --- a/src/load_data/load_clinical_variants.py +++ b/src/load_variants/load_clinical_variants.py @@ -95,30 +95,6 @@ def unique_variants(df: pd.DataFrame) -> dict: return {col: df[col].unique().tolist() for col in df.columns} -def load_unique_variants(save_results: bool = True) -> dict: - """ - Loads the unique variants from the variant annotations tsv file and saves them to a json file. - If the json file already exists, it will be loaded from the file. - NOTE: Don't think this function is needed anymore. get_pmid_list() is used instead. 
- """ - unique_variants_path = os.path.join(get_project_root(), "data", "unique_variants.json") - if os.path.exists(unique_variants_path): - logger.info(f"Loading unique variants from {unique_variants_path}") - with open(unique_variants_path, "r") as f: - unique_values_per_column = json.load(f) - else: - logger.info( - f"Unique variants not found at {unique_variants_path}. Loading from tsv file..." - ) - df = load_raw_variant_annotations() - unique_values_per_column = unique_variants(df) - if save_results: - logger.info(f"Saving unique variants to {unique_variants_path}") - with open(unique_variants_path, "w") as f: - json.dump(unique_values_per_column, f) - return unique_values_per_column - - def get_pmid_list(override: bool = False) -> list: """ Loads the pmid list from the variant annotations tsv file. @@ -136,6 +112,22 @@ def get_pmid_list(override: bool = False) -> list: json.dump(pmid_list, f) return pmid_list -if __name__ == "__main__": +def variant_annotations_pipeline(): + """ + Loads the variant annotations tsv file and saves the unique PMIDs to a json file. 
+ """ + # Download and extract the variant annotations + logger.info("Downloading and extracting variant annotations...") + download_and_extract_variant_annotations() + + # Load the variant annotations + logger.info("Loading variant annotations...") + df = load_raw_variant_annotations() + + # Get the PMIDs + logger.info("Getting PMIDs...") pmid_list = get_pmid_list() - print(f"Number of unique PMIDs: {len(pmid_list)}") + logger.info(f"Number of unique PMIDs: {len(pmid_list)}") + +if __name__ == "__main__": + variant_annotations_pipeline() From 74db6aa07aa808f5cc73399f4b5af2e311b3b0ec Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 19 May 2025 17:22:45 -0700 Subject: [PATCH 14/15] feat: diagram and black formatting --- README.MD | 3 ++ assets/annotations_diagram.svg | 1 + src/benchmark/annotation.py | 5 ++- src/fetch_articles/article_downloader.py | 26 ++++++++-------- src/fetch_articles/pmcid_converter.py | 34 +++++++++++---------- src/load_variants/__init__.py | 6 +++- src/load_variants/load_clinical_variants.py | 3 ++ src/utils/__init__.py | 2 +- src/utils/file_paths.py | 1 + 9 files changed, 48 insertions(+), 33 deletions(-) create mode 100644 assets/annotations_diagram.svg diff --git a/README.MD b/README.MD index c7ad26e..97f5c97 100644 --- a/README.MD +++ b/README.MD @@ -37,3 +37,6 @@ This repository contains Python scripts for running and building a Pharmacogenom | | Delegate annotation groupings to team members | | | New Article Fetching | Replicate PharGKB current workflow | | +## System Overview +![Annotations Diagram](assets/annotations_diagram.svg) + diff --git a/assets/annotations_diagram.svg b/assets/annotations_diagram.svg new file mode 100644 index 0000000..ac373f0 --- /dev/null +++ b/assets/annotations_diagram.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/src/benchmark/annotation.py b/src/benchmark/annotation.py index 65ab710..b552ae4 100644 --- a/src/benchmark/annotation.py +++ b/src/benchmark/annotation.py @@ -5,6 +5,7 @@ 
Denotes a class for a variant annotation (row in var_drug_ann.tsv) """ + class VariantAnnotation(BaseModel): variant_annotation_id: str variant_haplotypes: str @@ -35,12 +36,10 @@ class VariantAnnotation(BaseModel): multiple_phenotypes_or_diseases_and_or: str comparison_alleles_or_genotypes: str comparison_metabolizer_types: str - - + """ 1. Load the ground truth variants 2. Load the extracted variants 3. Calculate the niave difference between an extracted variant and the ground truth variant on Variant Annotation ID """ - diff --git a/src/fetch_articles/article_downloader.py b/src/fetch_articles/article_downloader.py index c9179f7..543f01e 100644 --- a/src/fetch_articles/article_downloader.py +++ b/src/fetch_articles/article_downloader.py @@ -10,10 +10,10 @@ def fetch_pmc_content(pmcid): """ Fetch content for a single article from PubMed Central. - + Args: pmcid (str): The PubMed Central ID to fetch - + Returns: bytes or None: The article content in XML format or None if fetching failed """ @@ -33,16 +33,16 @@ def update_downloaded_pmcids() -> None: """ project_root = get_project_root() downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json" - + # Check for all the filenames in the data/articles directory articles_dir = project_root / "data" / "articles" os.makedirs(articles_dir, exist_ok=True) - + article_pmcids = [f.split(".")[0] for f in os.listdir(articles_dir)] article_pmcids_mapping = {pmcid: f"{pmcid}.xml" for pmcid in article_pmcids} logger.info(f"Found {len(article_pmcids)} existing XML files in {articles_dir}") - + # Add the new PMCIDs to the json file if os.path.exists(downloaded_pmcids_path): with open(downloaded_pmcids_path, "r") as f: @@ -55,12 +55,12 @@ def update_downloaded_pmcids() -> None: downloaded_pmcids = {} else: downloaded_pmcids = {} - + downloaded_pmcids.update(article_pmcids_mapping) - + with open(downloaded_pmcids_path, "w") as f: json.dump(downloaded_pmcids, f) - + logger.info( f"Updated {downloaded_pmcids_path} with 
{len(article_pmcids)} new PMCIDs" ) @@ -81,7 +81,7 @@ def download_articles(pmcids: list[str]): # Load the downloaded PMCIDs from the json file downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json" - + if os.path.exists(downloaded_pmcids_path): with open(downloaded_pmcids_path, "r") as f: downloaded_pmcids = json.load(f) @@ -102,8 +102,10 @@ def download_articles(pmcids: list[str]): else: downloaded_pmcids[pmcid] = None logger.warning(f"No record found for PMCID {pmcid}") - - logger.info(f"Downloaded {len(new_pmcids)} new articles, total articles: {len(downloaded_pmcids)}") + + logger.info( + f"Downloaded {len(new_pmcids)} new articles, total articles: {len(downloaded_pmcids)}" + ) # Save the downloaded PMCIDs to a json file with open(downloaded_pmcids_path, "w") as f: @@ -113,4 +115,4 @@ def download_articles(pmcids: list[str]): if __name__ == "__main__": update_downloaded_pmcids() pmcids = get_unique_pmcids() - download_articles(pmcids) \ No newline at end of file + download_articles(pmcids) diff --git a/src/fetch_articles/pmcid_converter.py b/src/fetch_articles/pmcid_converter.py index e24fc00..417ec57 100644 --- a/src/fetch_articles/pmcid_converter.py +++ b/src/fetch_articles/pmcid_converter.py @@ -23,19 +23,16 @@ from typing import List, Set, Dict, Optional - - - def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]: """ Load the saved PMCID mapping from the json file. 
""" project_root = get_project_root() results_path = project_root / "data" / "pmcid_mapping.json" - + # Create data directory if it doesn't exist os.makedirs(project_root / "data", exist_ok=True) - + if os.path.exists(results_path): with open(results_path, "r") as f: existing_results = json.load(f) @@ -51,7 +48,10 @@ def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]: def batch_pmid_to_pmcid( - pmids: List[str], email: str = os.getenv("NCBI_EMAIL"), batch_size: int = 100, delay: float = 0.4 + pmids: List[str], + email: str = os.getenv("NCBI_EMAIL"), + batch_size: int = 100, + delay: float = 0.4, ) -> Dict[str, Optional[str]]: """ Convert a list of PMIDs to PMCIDs using NCBI's ID Converter API. @@ -120,10 +120,10 @@ def batch_pmid_to_pmcid( # Save updated results project_root = get_project_root() results_path = project_root / "data" / "pmcid_mapping.json" - + # Create data directory if it doesn't exist os.makedirs(project_root / "data", exist_ok=True) - + with open(results_path, "w") as f: json.dump(existing_results, f) logger.info(f"Updated PMCID mappings saved to {results_path}") @@ -138,13 +138,13 @@ def get_unique_pmcids() -> List[str]: Currently function returns the pre-existing unique PMCIDs if they exist or regenerates the list from the mapping. """ project_root = get_project_root() - + # Load the unique PMCIDs if they've already been saved unique_pmcids_path = project_root / "data" / "unique_pmcids.json" - + # Create data directory if it doesn't exist os.makedirs(project_root / "data", exist_ok=True) - + if os.path.exists(unique_pmcids_path): with open(unique_pmcids_path, "r") as f: try: @@ -161,14 +161,16 @@ def get_unique_pmcids() -> List[str]: # Load from pmcid_mapping.json if unique pmcids haven't been saved results_path = project_root / "data" / "pmcid_mapping.json" - + if not os.path.exists(results_path): - logger.error(f"No PMCID mapping found at {results_path}. 
Cannot generate unique PMCIDs.") + logger.error( + f"No PMCID mapping found at {results_path}. Cannot generate unique PMCIDs." + ) return [] - + with open(results_path, "r") as f: existing_results = json.load(f) - + # Get the unique pmcids (remove None values) pmcids = [value for value in existing_results.values() if value is not None] pmcids = list(set(pmcids)) @@ -185,4 +187,4 @@ def get_unique_pmcids() -> List[str]: results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL")) logger.info(f"PMCID mapping complete. {len(results)} PMIDs mapped to PMCIDs.") pmcids = get_unique_pmcids() - logger.info(f"Number of unique PMCIDs: {len(pmcids)}") \ No newline at end of file + logger.info(f"Number of unique PMCIDs: {len(pmcids)}") diff --git a/src/load_variants/__init__.py b/src/load_variants/__init__.py index 4142345..6c56850 100644 --- a/src/load_variants/__init__.py +++ b/src/load_variants/__init__.py @@ -1 +1,5 @@ -from .load_clinical_variants import load_raw_variant_annotations, get_pmid_list, variant_annotations_pipeline \ No newline at end of file +from .load_clinical_variants import ( + load_raw_variant_annotations, + get_pmid_list, + variant_annotations_pipeline, +) diff --git a/src/load_variants/load_clinical_variants.py b/src/load_variants/load_clinical_variants.py index dd868fe..c43b340 100644 --- a/src/load_variants/load_clinical_variants.py +++ b/src/load_variants/load_clinical_variants.py @@ -8,6 +8,7 @@ import json from src.utils.file_paths import get_project_root + """ This file contains functions to load the clinical variants data from the PharmGKB API. The key function is get_pmid_list(), which loads the PMIDs from the variant annotations tsv file and saves them to a json file. @@ -112,6 +113,7 @@ def get_pmid_list(override: bool = False) -> list: json.dump(pmid_list, f) return pmid_list + def variant_annotations_pipeline(): """ Loads the variant annotations tsv file and saves the unique PMIDs to a json file. 
@@ -129,5 +131,6 @@ def variant_annotations_pipeline(): pmid_list = get_pmid_list() logger.info(f"Number of unique PMIDs: {len(pmid_list)}") + if __name__ == "__main__": variant_annotations_pipeline() diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 10433a3..4deed16 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -1 +1 @@ -from .file_paths import get_project_root \ No newline at end of file +from .file_paths import get_project_root diff --git a/src/utils/file_paths.py b/src/utils/file_paths.py index 1d37993..229c652 100644 --- a/src/utils/file_paths.py +++ b/src/utils/file_paths.py @@ -1,6 +1,7 @@ import os from pathlib import Path + def get_project_root() -> Path: """ Return the project root directory. From 9ac85cef6659c165e8b8b53362d4c5ededce7803 Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Mon, 19 May 2025 17:24:31 -0700 Subject: [PATCH 15/15] chore: removed unused file --- src/benchmark/annotation.py | 45 ------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 src/benchmark/annotation.py diff --git a/src/benchmark/annotation.py b/src/benchmark/annotation.py deleted file mode 100644 index b552ae4..0000000 --- a/src/benchmark/annotation.py +++ /dev/null @@ -1,45 +0,0 @@ -from pydantic import BaseModel -from src.load_variants import load_raw_variant_annotations - -""" -Denotes a class for a variant annotation (row in var_drug_ann.tsv) -""" - - -class VariantAnnotation(BaseModel): - variant_annotation_id: str - variant_haplotypes: str - gene: str - drug: str - pmid: str - phenotype_category: str - significance: str - notes: str - sentence: str - alleles: str - specialty_population: str - metabolizer_types: str - phenotype_category: str - significance: str - notes: str - sentence: str - alleles: str - specialty_population: str - metabolizer_types: str - is_plural: str - is_associated: str - direction_of_effect: str - pd_pk_terms: str - multiple_drugs_and_or: str - population_types: str - 
population_phenotypes_or_diseases: str - multiple_phenotypes_or_diseases_and_or: str - comparison_alleles_or_genotypes: str - comparison_metabolizer_types: str - - -""" -1. Load the ground truth variants -2. Load the extracted variants -3. Calculate the niave difference between an extracted variant and the ground truth variant on Variant Annotation ID -"""