diff --git a/.gitignore b/.gitignore
index 4122350..be46693 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,19 @@ __pycache__
.env
# data
-src/load_data/saved_data/
-src/fetch_articles/saved_data/downloaded_pmcids.json
-src/fetch_articles/saved_data/articles/
\ No newline at end of file
+data/articles/
+data/variantAnnotations/
+data/unique_pmcids.json
+data/pmid_list.json
+data/downloaded_pmcids.json
+
+*.zip
+*.tar.gz
+*.tar.bz2
+*.tar.xz
+*.tar.lzma
+*.tar.lz
+*.tar.lzo
+
+.DS_Store
+
diff --git a/README.MD b/README.MD
index c3bc5c9..97f5c97 100644
--- a/README.MD
+++ b/README.MD
@@ -6,7 +6,14 @@
# AutoGKB
-
+Goals:
+1. Fetch annotated articles from variantAnnotations stored in PharmGKB API
+2. Create a general benchmark that outputs a score for an extraction system
+Given: Article, Ground Truth Variants (manually extracted and recorded in var_drug_ann.tsv)
+Input: Extracted Variants
+Output: Score
+3. System for extracting drug-related variant annotations from an article, i.e., associations in which the variant affects a drug dose, response, metabolism, etc.
+4. Continuously fetch new pharmacogenomic articles
## Description
@@ -14,11 +21,22 @@ This repository contains Python scripts for running and building a Pharmacogenom
## Progress Tracker
-| Task | Status |
-| --- | --- |
-| Download the zip of variants from pharmgkb | ✅ |
-| Get a PMID list from the variants tsv (column PMID) | ✅ |
-| Convert the PMID to PMCID | ✅ |
-| Update to use non-official pmid to pmcid | |
-| Fetch the content from the PMCID | |
-| Create pairing of annotations to article | |
\ No newline at end of file
+| Category | Task | Status |
+| --- | --- | --- |
+| Initial Download | Download the zip of variants from pharmgkb | ✅ |
+| | Get a PMID list from the variants tsv (column PMID) | ✅ |
+| | Convert the PMID to PMCID | ✅ |
+| | Update to use non-official PMID to PMCID conversion (Aaron's method) | |
+| | Fetch the content from the PMCID | ✅ |
+| Benchmark | Create pairings of annotations to articles | |
+| | Create a naive score of number of matches | |
+| | Create group wise score | |
+| | Look into advanced scoring based on distance from truth per term | |
+| Workflows | Integrate Aaron's current approach | |
+| | Document on individual annotation meanings | |
+| | Delegate annotation groupings to team members | |
+| New Article Fetching | Replicate PharmGKB's current workflow | |
+
+## System Overview
+
+
diff --git a/assets/annotations_diagram.svg b/assets/annotations_diagram.svg
new file mode 100644
index 0000000..ac373f0
--- /dev/null
+++ b/assets/annotations_diagram.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..659b32f
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,17 @@
+# Data
+
+This directory contains the primary data files used by the AutoGKB project.
+
+## Directory Structure
+
+- **articles/** - Contains XML files of articles from PubMed Central (PMC), identified by their PMCID (e.g., PMC1234567.xml). These articles are used for text mining and information extraction.
+
+- **variantAnnotations/** - Contains clinical variant annotations and related data:
+ - `var_drug_ann.tsv` - Variant-drug annotations. This is what is used in this repo.
+ - This can be downloaded using download_and_extract_variant_annotations from the load_variants module
+
+- **Support Files**:
+ - `pmcid_mapping.json` - Maps between PMIDs and PMCIDs
+ - `unique_pmcids.json` - List of unique PMCIDs in the dataset
+ - `pmid_list.json` - List of PMIDs in the dataset
+ - `downloaded_pmcids.json` - Tracking which PMCIDs have been downloaded
\ No newline at end of file
diff --git a/src/fetch_articles/saved_data/pmcid_mapping.json b/data/pmcid_mapping.json
similarity index 100%
rename from src/fetch_articles/saved_data/pmcid_mapping.json
rename to data/pmcid_mapping.json
diff --git a/pixi.toml b/pixi.toml
index e6d800e..4bb20c4 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -12,7 +12,9 @@ platforms = ["osx-arm64"]
version = "0.1.0"
[tasks]
-update-downloaded-pmcids = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
+download-variants = "python -m src.load_variants.load_clinical_variants"
+update-download-map = "python -c 'from src.fetch_articles.article_downloader import update_downloaded_pmcids; update_downloaded_pmcids()'"
+download-articles = "python -m src.fetch_articles.article_downloader"
[dependencies]
seaborn = ">=0.13.2,<0.14"
diff --git a/src/benchmark/README.md b/src/benchmark/README.md
new file mode 100644
index 0000000..d12e18e
--- /dev/null
+++ b/src/benchmark/README.md
@@ -0,0 +1,4 @@
+# Benchmark
+
+## Functions
+1. Calculate the naive difference between an extracted variant and the ground truth variant on Variant Annotation ID
diff --git a/src/benchmark/__init__.py b/src/benchmark/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/dataset/README.md b/src/dataset/README.md
new file mode 100644
index 0000000..8a30cd7
--- /dev/null
+++ b/src/dataset/README.md
@@ -0,0 +1,8 @@
+# Dataset
+
+## Goal
+Convert the loaded files into a dataset where the annotations and raw text are paired with each other
+
+## Subgoals
+1. Understand the formats of the annotations
+2. Choose a format for the dataset
diff --git a/src/dataset/__init__.py b/src/dataset/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/fetch_articles/README.md b/src/fetch_articles/README.md
index eed6e26..19503ef 100644
--- a/src/fetch_articles/README.md
+++ b/src/fetch_articles/README.md
@@ -1,13 +1,161 @@
# PubMed Document Fetching
+
## Goal
-Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues
+Given a PMID, fetch the paper from PubMed. Ignore papers where there are paywall issues.
## Process Overview
-1. Download the zip of variants from pharmgkb (handled in load_data module)
-2. Get a PMID list from the variants tsv (column PMID) (handled in load_data module)
+1. Download the zip of variants from pharmgkb (handled in load_variants module)
+2. Get a PMID list from the variants tsv (column PMID) (handled in load_variants module)
3. Convert the PMID to PMCID
4. Fetch the content from the PMCID
-## Saved Data
-pmcid_mapping.json: Maps the PMID to the PMCID {"PMID": "PMCID" or Null, ..}
-unique_pmcids.json: List of all the unique PMCIDs from pmcid_mapping.json (["PMCID1", "PMCID2", ...])
\ No newline at end of file
+## Key Functions
+
+### PMCID Converter (`pmcid_converter.py`)
+
+- `batch_pmid_to_pmcid(pmids, email, batch_size, delay)`: Converts a list of PMIDs to PMCIDs using NCBI's ID Converter API. Processes PMIDs in batches and handles rate limiting.
+ - Arguments:
+ - `pmids`: List of PMIDs (as strings)
+ - `email`: Your email for NCBI tool identification
+ - `batch_size`: Number of PMIDs per request (max: 200)
+ - `delay`: Seconds between requests (default: 0.4)
+ - Returns: Dict mapping each PMID to PMCID (or None if not available)
+
+- `get_unique_pmcids()`: Returns a list of unique PMCIDs from the PMCID mapping file.
+
+- `load_saved_pmcid_mapping()`: Loads previously saved PMCID mappings from disk.
+
+- `get_project_root()`: Returns the project root directory path.
+
+### Article Downloader (`article_downloader.py`)
+
+- `fetch_pmc_content(pmcid)`: Fetches a single article's content from PubMed Central.
+ - Arguments:
+ - `pmcid`: The PubMed Central ID to fetch
+ - Returns: Article content in XML format or None if fetching failed
+
+- `download_articles(pmcids)`: Downloads multiple articles from PubMed Central.
+ - Arguments:
+ - `pmcids`: List of PMCIDs to download
+ - Saves downloaded articles to `data/articles/` as XML files
+ - Tracks downloaded PMCIDs to avoid duplicating work
+
+- `update_downloaded_pmcids()`: Updates tracking of downloaded PMCIDs from files in `data/articles/` directory.
+
+## Created Data
+- `pmcid_mapping.json`: Maps the PMID to the PMCID `{"PMID": "PMCID" or null, ..}`
+- `unique_pmcids.json`: List of all the unique PMCIDs from pmcid_mapping.json `["PMCID1", "PMCID2", ...]`
+- `downloaded_pmcids.json`: Maps PMCIDs to filenames or None if download failed `{"PMCID": "PMCID.xml" or null, ..}`
+- `<PMCID>.xml`: Downloaded articles, one XML file per PMCID, saved in `data/articles/`
+
+## Usage Examples
+
+### Convert PMIDs to PMCIDs
+
+```python
+from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid
+from src.load_variants import get_pmid_list
+import os
+from dotenv import load_dotenv
+
+load_dotenv() # Load environment variables (NCBI_EMAIL)
+
+# Get list of PMIDs from variant data
+pmid_list = get_pmid_list()
+
+# Convert PMIDs to PMCIDs
+pmcid_mapping = batch_pmid_to_pmcid(
+ pmids=pmid_list,
+ email=os.getenv("NCBI_EMAIL"),
+ batch_size=100,
+ delay=0.4
+)
+
+print(f"Successfully mapped {len(pmcid_mapping)} PMIDs to PMCIDs")
+```
+
+### Download Articles Using PMCIDs
+
+```python
+from src.fetch_articles.article_downloader import download_articles
+from src.fetch_articles.pmcid_converter import get_unique_pmcids
+
+# Get unique PMCIDs from saved mapping
+pmcids = get_unique_pmcids()
+
+# Download articles
+download_articles(pmcids)
+```
+
+### Download a Single Article
+
+```python
+from src.fetch_articles.article_downloader import fetch_pmc_content
+from src.fetch_articles.pmcid_converter import get_project_root
+import os
+from pathlib import Path
+
+# Get project root
+project_root = get_project_root()
+
+# Fetch a single article
+pmcid = "PMC1234567"
+content = fetch_pmc_content(pmcid)
+
+if content:
+ # Save the article content
+ articles_dir = project_root / "data" / "articles"
+ os.makedirs(articles_dir, exist_ok=True)
+
+ with open(articles_dir / f"{pmcid}.xml", "w") as f:
+ f.write(content.decode("utf-8"))
+ print(f"Successfully downloaded article {pmcid}")
+else:
+ print(f"Failed to download article {pmcid}")
+```
+
+### Update Downloaded PMCIDs
+
+```python
+from src.fetch_articles.article_downloader import update_downloaded_pmcids
+
+# Update downloaded_pmcids.json with articles in data/articles/
+update_downloaded_pmcids()
+```
+
+## Full Pipeline Execution
+
+To run the complete pipeline (convert PMIDs to PMCIDs and download articles):
+
+```python
+# Full pipeline from PMIDs to downloaded articles
+from src.fetch_articles.pmcid_converter import batch_pmid_to_pmcid
+from src.fetch_articles.article_downloader import download_articles
+from src.load_variants import get_pmid_list
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# 1. Get PMIDs from variant data
+pmid_list = get_pmid_list()
+
+# 2. Convert PMIDs to PMCIDs
+pmcid_mapping = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))
+
+# 3. Extract only valid PMCIDs (not None)
+valid_pmcids = [pmcid for pmcid in pmcid_mapping.values() if pmcid]
+
+# 4. Download articles
+download_articles(valid_pmcids)
+```
+
+Alternatively, run the module scripts directly:
+
+```bash
+# First convert PMIDs to PMCIDs
+python -m src.fetch_articles.pmcid_converter
+
+# Then download articles
+python -m src.fetch_articles.article_downloader
+```
\ No newline at end of file
diff --git a/src/fetch_articles/article_downloader.py b/src/fetch_articles/article_downloader.py
index 13c5cf1..543f01e 100644
--- a/src/fetch_articles/article_downloader.py
+++ b/src/fetch_articles/article_downloader.py
@@ -1,5 +1,6 @@
from loguru import logger
from src.fetch_articles.pmcid_converter import get_unique_pmcids
+from src.utils.file_paths import get_project_root
from Bio import Entrez
import os
import json
@@ -7,6 +8,15 @@
def fetch_pmc_content(pmcid):
+ """
+ Fetch content for a single article from PubMed Central.
+
+ Args:
+ pmcid (str): The PubMed Central ID to fetch
+
+ Returns:
+ bytes or None: The article content in XML format or None if fetching failed
+ """
try:
handle = Entrez.efetch(db="pmc", id=pmcid, rettype="full", retmode="xml")
record = handle.read()
@@ -19,18 +29,20 @@ def fetch_pmc_content(pmcid):
def update_downloaded_pmcids() -> None:
"""
- Update the downloaded_pmcids.json file with PMCIDs found in the saved_data/articles directory.
+ Update the downloaded_pmcids.json file with PMCIDs found in the data/articles directory.
"""
- base_dir = os.path.dirname(os.path.abspath(__file__))
- downloaded_pmcids_path = os.path.join(
- base_dir, "saved_data", "downloaded_pmcids.json"
- )
- # Check for all the filenames in the saved_data/articles directory
- articles_dir = os.path.join(base_dir, "saved_data", "articles")
+ project_root = get_project_root()
+ downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json"
+
+ # Check for all the filenames in the data/articles directory
+ articles_dir = project_root / "data" / "articles"
+ os.makedirs(articles_dir, exist_ok=True)
+
article_pmcids = [f.split(".")[0] for f in os.listdir(articles_dir)]
article_pmcids_mapping = {pmcid: f"{pmcid}.xml" for pmcid in article_pmcids}
logger.info(f"Found {len(article_pmcids)} existing XML files in {articles_dir}")
+
# Add the new PMCIDs to the json file
if os.path.exists(downloaded_pmcids_path):
with open(downloaded_pmcids_path, "r") as f:
@@ -43,9 +55,12 @@ def update_downloaded_pmcids() -> None:
downloaded_pmcids = {}
else:
downloaded_pmcids = {}
+
downloaded_pmcids.update(article_pmcids_mapping)
+
with open(downloaded_pmcids_path, "w") as f:
json.dump(downloaded_pmcids, f)
+
logger.info(
f"Updated {downloaded_pmcids_path} with {len(article_pmcids)} new PMCIDs"
)
@@ -55,19 +70,18 @@ def download_articles(pmcids: list[str]):
"""
Download articles from PubMed Central using PMCIDs.
Keeps track of the PMCIDs that have been downloaded and skips them.
- Saves the downloaded articles to the saved_data/articles directory.
+ Saves the downloaded articles to the data/articles directory.
Args:
pmcids (list[str]): List of PMCIDs to download.
"""
- base_dir = os.path.dirname(os.path.abspath(__file__))
- saved_dir = os.path.join(base_dir, "saved_data", "articles")
- os.makedirs(saved_dir, exist_ok=True)
+ project_root = get_project_root()
+ articles_dir = project_root / "data" / "articles"
+ os.makedirs(articles_dir, exist_ok=True)
# Load the downloaded PMCIDs from the json file
- downloaded_pmcids_path = os.path.join(
- base_dir, "saved_data", "downloaded_pmcids.json"
- )
+ downloaded_pmcids_path = project_root / "data" / "downloaded_pmcids.json"
+
if os.path.exists(downloaded_pmcids_path):
with open(downloaded_pmcids_path, "r") as f:
downloaded_pmcids = json.load(f)
@@ -82,16 +96,19 @@ def download_articles(pmcids: list[str]):
for pmcid in tqdm(new_pmcids):
record = fetch_pmc_content(pmcid)
if record:
- with open(os.path.join(saved_dir, f"{pmcid}.xml"), "w") as f:
+ with open(articles_dir / f"{pmcid}.xml", "w") as f:
f.write(record.decode("utf-8"))
downloaded_pmcids[pmcid] = f"{pmcid}.xml"
else:
downloaded_pmcids[pmcid] = None
logger.warning(f"No record found for PMCID {pmcid}")
- logger.info(f"Downloaded {len(downloaded_pmcids)} articles")
+
+ logger.info(
+ f"Downloaded {len(new_pmcids)} new articles, total articles: {len(downloaded_pmcids)}"
+ )
# Save the downloaded PMCIDs to a json file
- with open(os.path.join(base_dir, "saved_data", "downloaded_pmcids.json"), "w") as f:
+ with open(downloaded_pmcids_path, "w") as f:
json.dump(downloaded_pmcids, f)
diff --git a/src/fetch_articles/pmcid_converter.py b/src/fetch_articles/pmcid_converter.py
index dc5a57c..417ec57 100644
--- a/src/fetch_articles/pmcid_converter.py
+++ b/src/fetch_articles/pmcid_converter.py
@@ -5,8 +5,9 @@
from tqdm import tqdm
from dotenv import load_dotenv
import os
-from src.load_data import get_pmid_list
+from src.load_variants import get_pmid_list
import json
+from src.utils.file_paths import get_project_root
load_dotenv()
# Email for NCBI
@@ -26,8 +27,12 @@ def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]:
"""
Load the saved PMCID mapping from the json file.
"""
- base_dir = os.path.dirname(os.path.abspath(__file__))
- results_path = os.path.join(base_dir, "pmcid_mapping.json")
+ project_root = get_project_root()
+ results_path = project_root / "data" / "pmcid_mapping.json"
+
+ # Create data directory if it doesn't exist
+ os.makedirs(project_root / "data", exist_ok=True)
+
if os.path.exists(results_path):
with open(results_path, "r") as f:
existing_results = json.load(f)
@@ -43,7 +48,10 @@ def load_saved_pmcid_mapping() -> Dict[str, Optional[str]]:
def batch_pmid_to_pmcid(
- pmids: List[str], email: str, batch_size: int = 100, delay: float = 0.4
+ pmids: List[str],
+ email: str = os.getenv("NCBI_EMAIL"),
+ batch_size: int = 100,
+ delay: float = 0.4,
) -> Dict[str, Optional[str]]:
"""
Convert a list of PMIDs to PMCIDs using NCBI's ID Converter API.
@@ -110,8 +118,12 @@ def batch_pmid_to_pmcid(
existing_results.update(results)
# Save updated results
- base_dir = os.path.dirname(os.path.abspath(__file__))
- results_path = os.path.join(base_dir, "pmcid_mapping.json")
+ project_root = get_project_root()
+ results_path = project_root / "data" / "pmcid_mapping.json"
+
+ # Create data directory if it doesn't exist
+ os.makedirs(project_root / "data", exist_ok=True)
+
with open(results_path, "w") as f:
json.dump(existing_results, f)
logger.info(f"Updated PMCID mappings saved to {results_path}")
@@ -125,9 +137,14 @@ def get_unique_pmcids() -> List[str]:
NOTE: Could add functionality to check for new PMCIDs in mapping and update the unique_pmcids.json file
Currently function returns the pre-existing unique PMCIDs if they exist or regenerates the list from the mapping.
"""
+ project_root = get_project_root()
+
# Load the unique PMCIDs if they've already been saved
- base_dir = os.path.dirname(os.path.abspath(__file__))
- unique_pmcids_path = os.path.join(base_dir, "saved_data", "unique_pmcids.json")
+ unique_pmcids_path = project_root / "data" / "unique_pmcids.json"
+
+ # Create data directory if it doesn't exist
+ os.makedirs(project_root / "data", exist_ok=True)
+
if os.path.exists(unique_pmcids_path):
with open(unique_pmcids_path, "r") as f:
try:
@@ -143,14 +160,22 @@ def get_unique_pmcids() -> List[str]:
return pmcids
# Load from pmcid_mapping.json if unique pmcids haven't been saved
- results_path = os.path.join(base_dir, "saved_data", "pmcid_mapping.json")
+ results_path = project_root / "data" / "pmcid_mapping.json"
+
+ if not os.path.exists(results_path):
+ logger.error(
+ f"No PMCID mapping found at {results_path}. Cannot generate unique PMCIDs."
+ )
+ return []
+
with open(results_path, "r") as f:
existing_results = json.load(f)
- # get the unique pmcids
- pmcids = list(set(existing_results.values()))
+
+ # Get the unique pmcids (remove None values)
+ pmcids = [value for value in existing_results.values() if value is not None]
+ pmcids = list(set(pmcids))
# Save the unique pmcids to a json file
- unique_pmcids_path = os.path.join(base_dir, "saved_data", "unique_pmcids.json")
with open(unique_pmcids_path, "w") as f:
json.dump(pmcids, f)
logger.info(f"Unique PMCIDs saved to {unique_pmcids_path}")
@@ -158,8 +183,8 @@ def get_unique_pmcids() -> List[str]:
if __name__ == "__main__":
- # pmid_list = get_pmid_list()
- # results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))
- # logger.info(f"PMCID mapping complete. {len(results)} PMIDs mapped to PMCIDs.")
+ pmid_list = get_pmid_list()
+ results = batch_pmid_to_pmcid(pmid_list, os.getenv("NCBI_EMAIL"))
+ logger.info(f"PMCID mapping complete. {len(results)} PMIDs mapped to PMCIDs.")
pmcids = get_unique_pmcids()
logger.info(f"Number of unique PMCIDs: {len(pmcids)}")
diff --git a/src/fetch_articles/saved_data/unique_pmcids.json b/src/fetch_articles/saved_data/unique_pmcids.json
deleted file mode 100644
index c1a43e7..0000000
--- a/src/fetch_articles/saved_data/unique_pmcids.json
+++ /dev/null
@@ -1 +0,0 @@
-["PMC11850035", "PMC2812115", "PMC2911553", "PMC5583388", "PMC4526634", "PMC6247602", "PMC11758033", "PMC2884029", "PMC3778124", "PMC3044738", "PMC5700353", "PMC1884342", "PMC6289816", "PMC4609097", "PMC1364741", "PMC2857717", "PMC10898793", "PMC4909584", "PMC4630174", "PMC5391214", "PMC10675244", "PMC6647927", "PMC5074472", "PMC11667419", "PMC8185249", "PMC2352037", "PMC4280295", "PMC11512548", "PMC7377539", "PMC11258238", "PMC7499297", "PMC5859345", "PMC11921366", "PMC3872414", "PMC3797132", "PMC5373545", "PMC8455325", "PMC3786328", "PMC3081375", "PMC11825576", "PMC10418744", "PMC1563530", "PMC4032230", "PMC5886039", "PMC3068061", "PMC48077", "PMC4537319", "PMC4730664", "PMC4713720", "PMC3433845", "PMC11241034", "PMC6142943", "PMC7710914", "PMC5496343", "PMC2966433", "PMC6777349", "PMC3310336", "PMC4278770", "PMC5904126", "PMC4342329", "PMC3594083", "PMC6714829", "PMC5496345", "PMC2048549", "PMC3574284", "PMC3454425", "PMC3657889", "PMC3555061", "PMC10618485", "PMC3692386", "PMC3902809", "PMC9468554", "PMC5309131", "PMC2757009", "PMC2592852", "PMC2928561", "PMC2762391", "PMC4583245", "PMC6432766", "PMC7616417", "PMC5749368", "PMC4522133", "PMC3983993", "PMC2920450", "PMC4181635", "PMC3703617", "PMC4702321", "PMC2879959", "PMC4746878", "PMC11871410", "PMC7164646", "PMC5203947", "PMC4867099", "PMC3434304", "PMC6462825", "PMC4435089", "PMC2681284", "PMC1952551", "PMC5866313", "PMC3016221", "PMC3837290", "PMC10848431", "PMC3565812", "PMC3787223", "PMC4226857", "PMC10675623", "PMC9768477", "PMC2485247", "PMC4503374", "PMC4565152", "PMC4206345", "PMC1873971", "PMC2364178", "PMC3230303", "PMC11016593", "PMC3622803", "PMC11666798", "PMC4956330", "PMC5354739", "PMC4307337", "PMC3836273", "PMC2292110", "PMC2630264", "PMC4959996", "PMC11421434", "PMC11787782", "PMC3561425", "PMC2921956", "PMC5538123", "PMC6612579", "PMC3539557", "PMC8553963", "PMC9585281", "PMC3938989", "PMC11552228", "PMC9501307", "PMC11203291", "PMC3786570", "PMC5514947", "PMC10566653", "PMC3944214", 
"PMC4476880", "PMC2733171", "PMC3164274", "PMC5142600", "PMC7274090", "PMC10537526", "PMC6493603", "PMC3567337", "PMC6587209", "PMC7455128", "PMC5324942", "PMC2666924", "PMC11544447", "PMC3603284", "PMC11786019", "PMC4560372", "PMC3734199", "PMC4525256", "PMC4488893", "PMC3786668", "PMC11803932", "PMC4513254", "PMC6005582", "PMC5940523", "PMC6923423", "PMC2913479", "PMC8429954", "PMC5306247", "PMC3529147", "PMC3358293", "PMC6038204", "PMC5541380", "PMC11717999", "PMC10179231", "PMC2966981", "PMC10995391", "PMC4254688", "PMC8702453", "PMC11257390", "PMC11052159", null, "PMC10858860", "PMC4154892", "PMC4613195", "PMC4387236", "PMC4628029", "PMC1754569", "PMC3381232", "PMC3579501", "PMC4716887", "PMC11855146", "PMC3093392", "PMC10668244", "PMC4437521", "PMC9875006", "PMC10532907", "PMC4890827", "PMC3639978", "PMC4876188", "PMC4868001", "PMC4330076", "PMC8758337", "PMC4533232", "PMC4982581", "PMC2432487", "PMC11160041", "PMC4134280", "PMC11913886", "PMC5558541", "PMC4011617", "PMC7005197", "PMC7674153", "PMC5678480", "PMC2773991", "PMC11492722", "PMC4503103", "PMC11063049", "PMC11082567", "PMC5862636", "PMC3617060", "PMC12043259", "PMC3279522", "PMC8438567", "PMC5871545", "PMC3625373", "PMC2756088", "PMC7260086", "PMC10298263", "PMC4270923", "PMC4390701", "PMC6003833", "PMC3769669", "PMC11049768", "PMC1755496", "PMC2952572", "PMC9536193", "PMC4762905", "PMC1773505", "PMC5386607", "PMC3390407", "PMC9914414", "PMC6357964", "PMC3946972", "PMC4023787", "PMC6523194", "PMC5483245", "PMC2673121", "PMC5903228", "PMC3369131", "PMC11528939", "PMC11159193", "PMC2291274", "PMC4296935", "PMC3158597", "PMC3273458", "PMC6174029", "PMC4490522", "PMC5148898", "PMC6248022", "PMC5468510", "PMC4015881", "PMC2853591", "PMC3396003", "PMC2515139", "PMC3292264", "PMC3632552", "PMC2680291", "PMC4938133", "PMC5614982", "PMC4812555", "PMC2492917", "PMC4616511", "PMC6987567", "PMC9608913", "PMC3555879", "PMC2943151", "PMC4448076", "PMC2766479", "PMC4835128", "PMC3944116", "PMC10931982", 
"PMC4452656", "PMC10159199", "PMC2561120", "PMC4613221", "PMC10214567", "PMC9801627", "PMC3505921", "PMC7968507", "PMC2966859", "PMC5743122", "PMC4641035", "PMC3760990", "PMC11059713", "PMC4892970", "PMC2722908", "PMC3746708", "PMC4601717", "PMC4462610", "PMC1874463", "PMC11003701", "PMC4591203", "PMC1474035", "PMC3880259", "PMC4892378", "PMC9820603", "PMC3952719", "PMC5564514", "PMC7870766", "PMC5411458", "PMC10827494", "PMC11524821", "PMC8182957", "PMC6265082", "PMC5342670", "PMC3940150", "PMC2810514", "PMC3808494", "PMC2762405", "PMC11638344", "PMC2386778", "PMC4012347", "PMC5207665", "PMC4433569", "PMC4697903", "PMC5427048", "PMC3523080", "PMC6046471", "PMC7993015", "PMC6034060", "PMC10880038", "PMC4199712", "PMC6409308", "PMC5346037", "PMC4594719", "PMC1974827", "PMC5266160", "PMC3729209", "PMC8954661", "PMC5684285", "PMC3845218", "PMC3161212", "PMC5500390", "PMC4631184", "PMC2664151", "PMC3860742", "PMC4099069", "PMC6179259", "PMC10091789", "PMC4160394", "PMC6033076", "PMC7375060", "PMC8578201", "PMC4078496", "PMC3522814", "PMC5563830", "PMC4996314", "PMC11354576", "PMC3291838", "PMC5306492", "PMC3384479", "PMC3899768", "PMC5432414", "PMC3518380", "PMC4432150", "PMC5355121", "PMC3674704", "PMC3378722", "PMC3727245", "PMC3941038", "PMC2751283", "PMC6767327", "PMC3553682", "PMC1029622", "PMC4511425", "PMC3248259", "PMC4385537", "PMC4195667", "PMC4757974", "PMC3330749", "PMC9891445", "PMC2679896", "PMC2810802", "PMC3871508", "PMC4171106", "PMC2820245", "PMC10810687", "PMC8540141", "PMC3997354", "PMC11088557", "PMC4130425", "PMC4615534", "PMC4468641", "PMC3735354", "PMC6734474", "PMC6542686", "PMC5753622", "PMC9931738", "PMC4300289", "PMC5612381", "PMC5543069", "PMC4229256", "PMC2883666", "PMC11603417", "PMC1365155", "PMC6231319", "PMC3682424", "PMC2715837", "PMC5526237", "PMC3621246", "PMC4190075", "PMC5983535", "PMC4519823", "PMC5508045", "PMC5346878", "PMC6328871", "PMC4272010", "PMC7215378", "PMC3890033", "PMC3641305", "PMC8841435", "PMC8137991", 
"PMC3873034", "PMC4043918", "PMC5619051", "PMC11685162", "PMC11509751", "PMC3130093", "PMC11860030", "PMC3098751", "PMC5519037", "PMC11221861", "PMC4087845", "PMC8238023", "PMC4872310", "PMC9601332", "PMC5798599", "PMC3182303", "PMC3611944", "PMC10967865", "PMC10645035", "PMC3348126", "PMC5316146", "PMC10377184", "PMC2014902", "PMC3264276", "PMC10838100", "PMC10607223", "PMC5469860", "PMC11134291", "PMC6313513", "PMC6927671", "PMC3525178", "PMC9961245", "PMC10864595", "PMC1873375", "PMC8533258", "PMC6562943", "PMC3544007", "PMC4833150", "PMC5645220", "PMC6586010", "PMC8513493", "PMC9314634", "PMC6400024", "PMC10196221", "PMC11887348", "PMC11244643", "PMC9256318", "PMC11703455", "PMC5079351", "PMC11393095", "PMC4915265", "PMC8530979", "PMC7398416", "PMC8822703", "PMC3092713", "PMC4456129", "PMC3780966", "PMC3608305", "PMC4224698", "PMC11481807", "PMC11887086", "PMC10815823", "PMC6448146", "PMC2014166", "PMC10163902", "PMC3749354", "PMC5883590", "PMC8742641", "PMC4965653", "PMC3604156", "PMC4702374", "PMC5505550", "PMC3114195", "PMC4356257", "PMC5727754", "PMC4995153", "PMC2959002", "PMC442471", "PMC4454552", "PMC3030919", "PMC11111788", "PMC4137828", "PMC4916778", "PMC11773121", "PMC8673616", "PMC6347826", "PMC2859392", "PMC3352974", "PMC2288721", "PMC4682920", "PMC4581326", "PMC4365300", "PMC2480976", "PMC4169411", "PMC6613715", "PMC6745302", "PMC8724172", "PMC6262886", "PMC4479596", "PMC2168111", "PMC4636889", "PMC4594699", "PMC5904201", "PMC7089776", "PMC5604555", "PMC6092108", "PMC3749570", "PMC4631197", "PMC6479273", "PMC6942309", "PMC4183989", "PMC4368615", "PMC3525665", "PMC4820801", "PMC9298338", "PMC6472479", "PMC3931261", "PMC4017364", "PMC11152251", "PMC4500334", "PMC4168388", "PMC11677811", "PMC3673300", "PMC10583240", "PMC3214266", "PMC5282793", "PMC3775655", "PMC3818406", "PMC2949522", "PMC524175", "PMC4862932", "PMC3137420", "PMC2830598", "PMC3818912", "PMC4803610", "PMC9582748", "PMC4034115", "PMC3867202", "PMC3690108", "PMC9297921", "PMC9537548", 
"PMC5589489", "PMC3055694", "PMC4012056", "PMC2903324", "PMC2686066", "PMC10152845", "PMC3667657", "PMC4308646", "PMC6411694", "PMC5009007", "PMC10834390", "PMC3403289", "PMC10909096", "PMC2992873", "PMC8445626", "PMC6969041", "PMC3894627", "PMC6786370", "PMC5833535", "PMC6505090", "PMC11359404", "PMC8672325", "PMC6980920", "PMC10666731", "PMC7963143", "PMC5382092", "PMC4461653", "PMC2860533", "PMC5018246", "PMC4479153", "PMC6451710", "PMC4301945", "PMC4557249", "PMC4157963", "PMC6995013", "PMC11608742", "PMC10668502", "PMC4480925", "PMC11573879", "PMC1975838", "PMC4231027", "PMC5818817", "PMC6037621", "PMC6801039", "PMC4943245", "PMC2014539", "PMC11555502", "PMC2995295", "PMC5006145", "PMC4055378", "PMC4484512", "PMC11418302", "PMC10278212", "PMC4057281", "PMC6219441", "PMC5598801", "PMC11754044", "PMC3984266", "PMC10778798", "PMC4274707", "PMC5651309", "PMC11252221", "PMC4265416", "PMC11475898", "PMC4752391", "PMC3624039", "PMC11401437", "PMC2652833", "PMC3774043", "PMC7431691", "PMC7039325", "PMC6086578", "PMC7655626", "PMC5521342", "PMC3910846", "PMC6851426", "PMC3139013", "PMC11773116", "PMC8458697", "PMC10527451", "PMC5241185", "PMC4151614", "PMC1874262", "PMC3461952", "PMC3818518", "PMC3653303", "PMC5411211", "PMC4356640", "PMC4672523", "PMC3414671", "PMC3485381", "PMC5877743", "PMC3481266", "PMC8295171", "PMC4800352", "PMC4693492", "PMC10501538", "PMC4154311", "PMC10309098", "PMC6813860", "PMC8953705", "PMC4366347", "PMC9925376", "PMC10917709", "PMC1365072", "PMC6014560", "PMC4292894", "PMC8426351", "PMC6612264", "PMC7319006", "PMC10502099", "PMC3726442", "PMC4500328", "PMC5711571", "PMC8940650", "PMC4345005", "PMC8604252", "PMC5319785", "PMC5233579", "PMC3658129", "PMC6493124", "PMC4892373", "PMC3401172", "PMC4527535", "PMC5293674", "PMC9080200", "PMC6591035", "PMC2684883", "PMC4243902", "PMC11652804", "PMC10883345", "PMC2668081", "PMC4324232", "PMC11159294", "PMC10982510", "PMC4737107", "PMC11148365", "PMC6510382", "PMC6216325", "PMC2518836", 
"PMC2830602", "PMC4694426", "PMC3394147", "PMC4335884", "PMC6486881", "PMC6461793", "PMC5903579", "PMC10349379", "PMC2647710", "PMC10557961", "PMC7115946", "PMC5370513", "PMC10409991", "PMC5298887", "PMC11995662", "PMC4836090", "PMC2791975", "PMC2726911", "PMC3164277", "PMC4111883", "PMC11315837", "PMC3246196", "PMC11531276", "PMC2750008", "PMC11884701", "PMC9830790", "PMC2662935", "PMC6941886", "PMC5887212", "PMC11240873", "PMC10970167", "PMC7793629", "PMC5898372", "PMC4038142", "PMC11208962", "PMC8880478", "PMC5716599", "PMC6298606", "PMC4661296", "PMC2888980", "PMC1884506", "PMC4542662", "PMC3579261", "PMC4872428", "PMC3984158", "PMC2935997", "PMC9810307", "PMC6989102", "PMC5975540", "PMC5299197", "PMC3571021", "PMC3582836", "PMC3376437", "PMC3513646", "PMC1087660", "PMC5065384", "PMC5176308", "PMC11668066", "PMC10990950", "PMC8163522", "PMC4425504", "PMC7883889", "PMC8505487", "PMC4502741", "PMC3555056", "PMC3675749", "PMC7292295", "PMC4184528", "PMC5727167", "PMC3444290", "PMC5562097", "PMC4441275", "PMC5591096", "PMC10684410", "PMC4298011", "PMC7393710", "PMC3584248", "PMC4105486", "PMC5711795", "PMC3753270", "PMC5520553", "PMC2014233", "PMC9301121", "PMC6920759", "PMC1885008", "PMC4116670", "PMC4220988", "PMC4846779", "PMC3329222", "PMC4445755", "PMC6132901", "PMC7193447", "PMC3454958", "PMC2896457", "PMC3628804", "PMC1995596", "PMC3508798", "PMC10972729", "PMC4794377", "PMC4692529", "PMC3922978", "PMC1365132", "PMC2886925", "PMC5138058", "PMC5425333", "PMC7115450", "PMC6654446", "PMC6006403", "PMC4304713", "PMC3006662", "PMC2949912", "PMC5700347", "PMC5461999", "PMC5768901", "PMC6089815", "PMC4762902", "PMC4690185", "PMC5438821", "PMC5189722", "PMC9809306", "PMC2644687", "PMC4631185", "PMC5534241", "PMC5373543", "PMC4100708", "PMC1237155", "PMC8373649", "PMC4693577", "PMC9328121", "PMC2737687", "PMC3249179", "PMC5377478", "PMC5531276", "PMC8890732", "PMC8108700", "PMC6387687", "PMC2683977", "PMC6054772", "PMC3237821", "PMC4345081", "PMC6518412", 
"PMC7086280", "PMC5440888", "PMC2922203", "PMC3712827", "PMC4612590", "PMC2596476", "PMC6773496", "PMC11246114", "PMC3038469", "PMC4969350", "PMC11763628", "PMC5734971", "PMC4719145", "PMC3208318", "PMC5763654", "PMC4573240", "PMC8184575", "PMC4444267", "PMC4168390", "PMC9584256", "PMC3020258", "PMC2901912", "PMC5135610", "PMC5932771", "PMC4473094", "PMC2896826", "PMC3985268", "PMC7351433", "PMC4760888", "PMC5287983", "PMC2709885", "PMC2364770", "PMC5057355", "PMC3988537", "PMC5817388", "PMC3656883", "PMC11435314", "PMC3448899", "PMC1884346", "PMC5423974", "PMC11102648", "PMC4257570", "PMC2906637", "PMC10381361", "PMC4828529", "PMC9890192", "PMC4503705", "PMC1884285", "PMC6587626", "PMC2641037", "PMC4797547", "PMC3988270", "PMC8505452", "PMC10876746", "PMC6760244", "PMC4425056", "PMC3846997", "PMC4296254", "PMC5449482", "PMC11755583", "PMC5161051", "PMC11049954", "PMC11558073", "PMC4805204", "PMC5789875", "PMC6800829", "PMC10908252", "PMC4551162", "PMC10495004", "PMC4595504", "PMC5943457", "PMC11246689", "PMC5744175", "PMC4618180", "PMC8222836", "PMC5298566", "PMC4541975", "PMC10951231", "PMC3055457", "PMC1978168", "PMC10526247", "PMC6493076", "PMC2650539", "PMC4707035", "PMC4151246", "PMC4413900", "PMC4454285", "PMC4930967", "PMC10787143", "PMC4116556", "PMC2754599", "PMC9306465", "PMC11317398", "PMC3415853", "PMC3852421", "PMC11158672", "PMC3575609", "PMC11891766", "PMC2794921", "PMC5546852", "PMC7221122", "PMC10914946", "PMC2586993", "PMC4574839", "PMC8204702", "PMC4982759", "PMC2950972", "PMC7235792", "PMC3521860", "PMC11730665", "PMC5611711", "PMC9515473", "PMC2291379", "PMC4498287", "PMC3360546", "PMC5945500", "PMC6361127", "PMC3943570", "PMC6426691", "PMC11158323", "PMC5800559", "PMC6151284", "PMC6542461", "PMC5220536", "PMC5610780", "PMC3100585", "PMC4405819", "PMC3912955", "PMC10782740", "PMC5903239", "PMC7302666", "PMC11271148", "PMC9841299", "PMC3195031", "PMC6081148", "PMC3756535", "PMC6357360", "PMC5538305", "PMC4706412", "PMC2865873", "PMC5264271", 
"PMC4498982", "PMC10483403", "PMC4177494", "PMC2976715", "PMC11011338", "PMC3633658", "PMC2679107", "PMC5465325", "PMC7375952", "PMC2896566", "PMC3652476", "PMC556232", "PMC11106956", "PMC11236688", "PMC8132880", "PMC3282030", "PMC4169706", "PMC2757655", "PMC3909010", "PMC4896103", "PMC4243881", "PMC5392306", "PMC4575538", "PMC8975736", "PMC3419350", "PMC5509475", "PMC2925052", "PMC7497848", "PMC3947488", "PMC2276142", "PMC6373376", "PMC3461592", "PMC6501809", "PMC10154044", "PMC4002970", "PMC5028170", "PMC4208722", "PMC9610285", "PMC2556451", "PMC5029084", "PMC2564574", "PMC5485718", "PMC5003027", "PMC5604731", "PMC2908290", "PMC5901893", "PMC4865408", "PMC8917764", "PMC10349800", "PMC10499425", "PMC11703419", "PMC4236071", "PMC5980466", "PMC6411020", "PMC5101708", "PMC11628867", "PMC11102100", "PMC4731723", "PMC8571740", "PMC7649675", "PMC6595468", "PMC5763318", "PMC4949007", "PMC5323433", "PMC4703773", "PMC1401654", "PMC5875353", "PMC4541974", "PMC8973308", "PMC11933031", "PMC11720188", "PMC4038024", "PMC5808057", "PMC1884959", "PMC6493375", "PMC5145728", "PMC3992925", "PMC5807179", "PMC3499361", "PMC3760447", "PMC4931969", "PMC3049596", "PMC4110085", "PMC5007158", "PMC6475679", "PMC4947669", "PMC5875925", "PMC11310823", "PMC1884261", "PMC3172251", "PMC6171340", "PMC7245057", "PMC7340566", "PMC7388522", "PMC3766937", "PMC10099095", "PMC6125540", "PMC4220464", "PMC11269678", "PMC5346875", "PMC8106923", "PMC4113831", "PMC10244018", "PMC5346034", "PMC3320544", "PMC1963422", "PMC3530397", "PMC6246957", "PMC4590670", "PMC9974434", "PMC3612775", "PMC1885108", "PMC6714673", "PMC5659294", "PMC8578190", "PMC5427244", "PMC3991683", "PMC4115247", "PMC5412025", "PMC10038974", "PMC8915292", "PMC10230242", "PMC11404698", "PMC6742943", "PMC7497238", "PMC8472669", "PMC4855508", "PMC5152628", "PMC5651327", "PMC9028965", "PMC3597465", "PMC5478306", "PMC6631257", "PMC3833422", "PMC4119242", "PMC2792638", "PMC6046506", "PMC3468617", "PMC6489578", "PMC11314417", "PMC11347466", 
"PMC4470685", "PMC3116045", "PMC11809887", "PMC4833149", "PMC5726942", "PMC2749505", "PMC11852071", "PMC5510236", "PMC4598210", "PMC1251635", "PMC10463210", "PMC4469933", "PMC10747255", "PMC10582663", "PMC10957942", "PMC3570048", "PMC6071997", "PMC4406866", "PMC10275785", "PMC4297489", "PMC10565537", "PMC3865618", "PMC6855320", "PMC11022290", "PMC5749387", "PMC3978988", "PMC9322346", "PMC8599229", "PMC3598593", "PMC3610685", "PMC5599305", "PMC9657232", "PMC5402961", "PMC5524513", "PMC4221105", "PMC2704695", "PMC2910688", "PMC5316454", "PMC5249113", "PMC9934922", "PMC3959225", "PMC4735961", "PMC5533497", "PMC5492788", "PMC4922322", "PMC3805522", "PMC10139129", "PMC5963414", "PMC4854407", "PMC4039203", "PMC3734608", "PMC4343187", "PMC10327396", "PMC8263746", "PMC2748889", "PMC8081740", "PMC4108472", "PMC2042888", "PMC3093079", "PMC5829963", "PMC5949564", "PMC5656562", "PMC4640545", "PMC3071070", "PMC3383686", "PMC8767566", "PMC4999337", "PMC4271081", "PMC5395152", "PMC4872305", "PMC2794198", "PMC3478502", "PMC5558527", "PMC3066089", "PMC4764353", "PMC3471928", "PMC10145266", "PMC10599059", "PMC5899062", "PMC9552901", "PMC3137047", "PMC7305826", "PMC9481373", "PMC4375579", "PMC4010098", "PMC7217737", "PMC1365130", "PMC3476140", "PMC3755037", "PMC2981241", "PMC3834132", "PMC4931885", "PMC3148255", "PMC2767285", "PMC3901533", "PMC3548029", "PMC4651007", "PMC11140026", "PMC9413960", "PMC4282597", "PMC4484731", "PMC9450009", "PMC5903234", "PMC11269006", "PMC5795999", "PMC1769026", "PMC8100460", "PMC4375304", "PMC4332701", "PMC10880264", "PMC8441053", "PMC4701680", "PMC4412845", "PMC2732914", "PMC9701885", "PMC3779247", "PMC5632935", "PMC4364852", "PMC4002408", "PMC2919241", "PMC1762324", "PMC9321338", "PMC2858245", "PMC3245828", "PMC2599947", "PMC5421731", "PMC3260990", "PMC5087931", "PMC3637851", "PMC8359222", "PMC3100476", "PMC9532634", "PMC11508189", "PMC6960206", "PMC5558529", "PMC4000411", "PMC10648962", "PMC10769478", "PMC3131846", "PMC11023817", "PMC3698861", 
"PMC4667947", "PMC4544820", "PMC2675161", "PMC4025175", "PMC5606007", "PMC7039663", "PMC4921119", "PMC3143437", "PMC5309133", "PMC11458732", "PMC6370172", "PMC5192124", "PMC4788379", "PMC2000640", "PMC10085626", "PMC5061780", "PMC5908896", "PMC4631186", "PMC3248257", "PMC4814312", "PMC4240933", "PMC10478012", "PMC3244642", "PMC6128165", "PMC4209173", "PMC4585967", "PMC11584383", "PMC4503165", "PMC4778608", "PMC11012255", "PMC7613628", "PMC4972156", "PMC9373641", "PMC5391994", "PMC4876172", "PMC7292331", "PMC9820795", "PMC3048137", "PMC2000718", "PMC11507373", "PMC3425006", "PMC4505931", "PMC8800862", "PMC4806848", "PMC5546927", "PMC3672984", "PMC11862786", "PMC5346382", "PMC3061841", "PMC11000398", "PMC11141156", "PMC5655282", "PMC4764723", "PMC7423195", "PMC10452379", "PMC5342450", "PMC7197488", "PMC1364713", "PMC6759913", "PMC3753327", "PMC8141066", "PMC2014382", "PMC3686783", "PMC5048209", "PMC4928097", "PMC4735517", "PMC6021962", "PMC2885152", "PMC10529681", "PMC4892230", "PMC2570505", "PMC4462564", "PMC6939828", "PMC2042718", "PMC10825484", "PMC4338734", "PMC5404990", "PMC6631360", "PMC7028104", "PMC4615595", "PMC1767618", "PMC6891932", "PMC3213989", "PMC3680019", "PMC11094496", "PMC5817390", "PMC5944577", "PMC4943390", "PMC11140815", "PMC11605493", "PMC3462355", "PMC1746721", "PMC3925114", "PMC3895354", "PMC3125052", "PMC4669157", "PMC5098919", "PMC11520374", "PMC3029819", "PMC9031832", "PMC3107291", "PMC5908314", "PMC4600600", "PMC3506814", "PMC6049926", "PMC5412267", "PMC5355968", "PMC539815", "PMC3640375", "PMC6408006", "PMC5051541", "PMC2660379", "PMC4323272", "PMC4104334", "PMC4976849", "PMC3180021", "PMC7303159", "PMC10532840", "PMC2760462", "PMC10337687", "PMC5548439", "PMC11264771", "PMC3550197", "PMC11943653", "PMC7347085", "PMC7214659", "PMC4722076", "PMC4155516", "PMC3958404", "PMC6752321", "PMC7427977", "PMC2957581", "PMC3080643", "PMC9819208", "PMC3225067", "PMC4932617", "PMC6375065", "PMC3175513", "PMC7718230", "PMC3537445", "PMC10758687", 
"PMC3858547", "PMC3370715", "PMC11095822", "PMC5167198", "PMC2976128", "PMC3734060", "PMC10852661", "PMC4201132", "PMC10974048", "PMC6423619", "PMC3621996", "PMC3910794", "PMC7999651", "PMC11120965", "PMC5378677", "PMC9468644", "PMC5600689", "PMC7308427", "PMC3034442", "PMC16264", "PMC3776990", "PMC1887589", "PMC2855513", "PMC5721751", "PMC2194758", "PMC2547143", "PMC5590735", "PMC2743299", "PMC4350512", "PMC6011347"]
\ No newline at end of file
diff --git a/src/load_data/__init__.py b/src/load_data/__init__.py
deleted file mode 100644
index c2694d6..0000000
--- a/src/load_data/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .load_clinical_variants import load_variant_annotations_tsv, get_pmid_list
diff --git a/src/load_data/README.md b/src/load_variants/README.md
similarity index 85%
rename from src/load_data/README.md
rename to src/load_variants/README.md
index 997987a..0b6ca40 100644
--- a/src/load_data/README.md
+++ b/src/load_variants/README.md
@@ -6,10 +6,10 @@ This module handles the loading and preprocessing of PharmGKB clinical variants
1. **`download_and_extract_variant_annotations(override: bool = False)`**
- Downloads and extracts the variant annotations ZIP file from PharmGKB
- - Saves data to `saved_data/variantAnnotations/`
+ - Saves data to `data/variantAnnotations/`
- Can override existing downloads if needed
-2. **`load_variant_annotations_tsv(override: bool = False)`**
+2. **`load_raw_variant_annotations(override: bool = False)`**
- Loads the variant annotations TSV file into a pandas DataFrame
- Automatically downloads data if not present
- Returns the DataFrame containing variant-drug annotations
@@ -21,7 +21,7 @@ This module handles the loading and preprocessing of PharmGKB clinical variants
4. **`get_pmid_list(override: bool = False)`**
- Main function to extract PMIDs from the variant annotations
- Returns a list of unique PMIDs
- - Caches results in `saved_data/pmid_list.json`
+ - Caches results in `data/pmid_list.json`
- Used as input for PMCID conversion
The module handles all data downloading, extraction, and preprocessing steps needed to get the PMID list for subsequent steps in the pipeline.
diff --git a/src/load_variants/__init__.py b/src/load_variants/__init__.py
new file mode 100644
index 0000000..6c56850
--- /dev/null
+++ b/src/load_variants/__init__.py
@@ -0,0 +1,5 @@
+from .load_clinical_variants import (
+ load_raw_variant_annotations,
+ get_pmid_list,
+ variant_annotations_pipeline,
+)
diff --git a/src/load_data/load_clinical_variants.py b/src/load_variants/load_clinical_variants.py
similarity index 67%
rename from src/load_data/load_clinical_variants.py
rename to src/load_variants/load_clinical_variants.py
index 4176343..c43b340 100644
--- a/src/load_data/load_clinical_variants.py
+++ b/src/load_variants/load_clinical_variants.py
@@ -7,6 +7,8 @@
import pandas as pd
import json
+from src.utils.file_paths import get_project_root
+
"""
This file contains functions to load the clinical variants data from the PharmGKB API.
The key function is get_pmid_list(), which loads the PMIDs from the variant annotations tsv file and saves them to a json file.
@@ -24,8 +26,7 @@ def download_and_extract_variant_annotations(override: bool = False) -> str:
"""
url = "https://api.pharmgkb.org/v1/download/file/data/variantAnnotations.zip"
- base_dir = os.path.dirname(os.path.abspath(__file__))
- save_dir = os.path.join(base_dir, "saved_data")
+ save_dir = os.path.join(get_project_root(), "data")
extract_dir = os.path.join(save_dir, "variantAnnotations")
if os.path.exists(extract_dir):
@@ -49,7 +50,7 @@ def download_and_extract_variant_annotations(override: bool = False) -> str:
return extract_dir
-def load_variant_annotations_tsv(override: bool = False) -> pd.DataFrame:
+def load_raw_variant_annotations(override: bool = False) -> pd.DataFrame:
"""
Loads the variant annotations tsv file.
If the file does not exist, it will be downloaded and extracted.
@@ -58,9 +59,8 @@ def load_variant_annotations_tsv(override: bool = False) -> pd.DataFrame:
Returns:
pd.DataFrame: The loaded variant annotations tsv file.
"""
- base_dir = os.path.dirname(os.path.abspath(__file__))
tsv_path = os.path.join(
- base_dir, "saved_data", "variantAnnotations", "var_drug_ann.tsv"
+ get_project_root(), "data", "variantAnnotations", "var_drug_ann.tsv"
)
if not os.path.exists(tsv_path):
@@ -96,43 +96,17 @@ def unique_variants(df: pd.DataFrame) -> dict:
return {col: df[col].unique().tolist() for col in df.columns}
-def load_unique_variants(save_results: bool = True) -> dict:
- """
- Loads the unique variants from the variant annotations tsv file and saves them to a json file.
- If the json file already exists, it will be loaded from the file.
- NOTE: Don't think this function is needed anymore. get_pmid_list() is used instead.
- """
- base_dir = os.path.dirname(os.path.abspath(__file__))
- unique_variants_path = os.path.join(base_dir, "saved_data", "unique_variants.json")
- if os.path.exists(unique_variants_path):
- logger.info(f"Loading unique variants from {unique_variants_path}")
- with open(unique_variants_path, "r") as f:
- unique_values_per_column = json.load(f)
- else:
- logger.info(
- f"Unique variants not found at {unique_variants_path}. Loading from tsv file..."
- )
- df = load_variant_annotations_tsv()
- unique_values_per_column = unique_variants(df)
- if save_results:
- logger.info(f"Saving unique variants to {unique_variants_path}")
- with open(unique_variants_path, "w") as f:
- json.dump(unique_values_per_column, f)
- return unique_values_per_column
-
-
def get_pmid_list(override: bool = False) -> list:
"""
Loads the pmid list from the variant annotations tsv file.
"""
- base_dir = os.path.dirname(os.path.abspath(__file__))
- pmid_list_path = os.path.join(base_dir, "saved_data", "pmid_list.json")
+ pmid_list_path = os.path.join(get_project_root(), "data", "pmid_list.json")
if os.path.exists(pmid_list_path):
logger.info(f"Loading PMIDs from {pmid_list_path}")
with open(pmid_list_path, "r") as f:
pmid_list = json.load(f)
else:
- df = load_variant_annotations_tsv(override)
+ df = load_raw_variant_annotations(override)
pmid_list = df["PMID"].unique().tolist()
logger.info(f"Saving PMIDs to {pmid_list_path}")
with open(pmid_list_path, "w") as f:
@@ -140,6 +114,23 @@ def get_pmid_list(override: bool = False) -> list:
return pmid_list
-if __name__ == "__main__":
+def variant_annotations_pipeline():
+    """
+    Run the variant-annotation steps in order: download/extract, load the TSV, then get the PMID list (get_pmid_list caches it in data/pmid_list.json) and log its size.
+    """
+    # Download and extract the variant annotations
+    logger.info("Downloading and extracting variant annotations...")
+    download_and_extract_variant_annotations()
+
+    # Load the variant annotations (the returned DataFrame is not used here)
+    logger.info("Loading variant annotations...")
+    load_raw_variant_annotations()
+
+    # Get the PMIDs
+    logger.info("Getting PMIDs...")
     pmid_list = get_pmid_list()
-    print(f"Number of unique PMIDs: {len(pmid_list)}")
+    logger.info(f"Number of unique PMIDs: {len(pmid_list)}")
+
+
+if __name__ == "__main__":
+ variant_annotations_pipeline()
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000..4deed16
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1 @@
+from .file_paths import get_project_root
diff --git a/src/utils/file_paths.py b/src/utils/file_paths.py
new file mode 100644
index 0000000..229c652
--- /dev/null
+++ b/src/utils/file_paths.py
@@ -0,0 +1,11 @@
+import os
+from pathlib import Path
+
+
+def get_project_root() -> Path:
+    """
+    Return the absolute path of the project root (the directory containing ``src``).
+    """
+    # This file is <root>/src/utils/file_paths.py, so the root is three levels up.
+    # resolve() first: a relative __file__ would otherwise collapse to ".".
+    return Path(__file__).resolve().parent.parent.parent