From 548af994e27e6b402ba98cbac54a40fddda864fd Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Thu, 23 Oct 2025 13:58:02 -0700 Subject: [PATCH 1/5] feat: initial benchmark scaffold --- src/benchmark/__init__.py | 0 src/benchmark/main.py | 45 +++++++++++++++++++++++++++++++++++++++ src/utils.py | 10 +++++++-- 3 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 src/benchmark/__init__.py create mode 100644 src/benchmark/main.py diff --git a/src/benchmark/__init__.py b/src/benchmark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/benchmark/main.py b/src/benchmark/main.py new file mode 100644 index 0000000..acf69f2 --- /dev/null +++ b/src/benchmark/main.py @@ -0,0 +1,45 @@ +from typing import List +from src.utils import get_pmcid_annotation + +class Benchmark: + def __init__(self): + pass + + def load_data(self): + pass + + def get_var_drug_ann_score(self, var_drug_ann: List[dict]): + return 1.0 + + def get_var_pheno_ann_score(self, var_pheno_ann: List[dict]): + return 1.0 + + def get_var_fa_ann_score(self, var_fa_ann: List[dict]): + return 1.0 + + def get_study_parameters_score(self, study_parameters: List[dict]): + return 1.0 + + def calculate_total_score(self, var_drug_ann: List[dict], var_pheno_ann: List[dict], var_fa_ann: List[dict], study_parameters: List[dict]): + # Return average of all scores + scores = [self.get_var_drug_ann_score(var_drug_ann), self.get_var_pheno_ann_score(var_pheno_ann), self.get_var_fa_ann_score(var_fa_ann), self.get_study_parameters_score(study_parameters)] + return sum(scores) / len(scores) + + def run(self, pmcid: str): + pmcid_annotation = get_pmcid_annotation(pmcid) + + var_drug_ann = pmcid_annotation.get("varDrugAnn", []) + var_pheno_ann = pmcid_annotation.get("varPhenoAnn", []) + var_fa_ann = pmcid_annotation.get("varFaAnn", []) + study_parameters = pmcid_annotation.get("studyParameters", []) + + total_score = self.calculate_total_score(var_drug_ann, var_pheno_ann, var_fa_ann, study_parameters) + print(f"Total score for pmcid {pmcid}: {total_score}") + return total_score + + + + +if __name__ == "__main__": + benchmark = Benchmark() + benchmark.run("PMC123456") \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 5076921..b990aea 100644 --- a/src/utils.py +++ b/src/utils.py @@ -5,10 +5,16 @@ from termcolor import colored from src.article_parser import MarkdownParser from pydantic import BaseModel, ValidationError +from pathlib import Path _true_variant_cache: Optional[dict] = None +def get_pmcid_annotation(pmcid: str, annotations_by_pmcid: Path = Path("data/annotations_by_pmcid.json")) -> dict: + with open(annotations_by_pmcid, "r") as f: + annotations_by_pmcid = json.load(f) + return annotations_by_pmcid.get(pmcid, {}) + def extractVariantsRegex(text): # Note, seems to extract a ton of variants, not just the ones that are being studied # Think it might only be applicable to rsIDs @@ -79,7 +85,7 @@ def compare_lists( return true_positives, true_negatives, false_positives, false_negatives -def get_true_variants(pmcid: str) -> List[str]: +def get_true_variants(pmcid: str, annotations_by_pmcid: Path) -> List[str]: """ Get the actual annotated variants for a given PMCID. Uses module-level caching to load the JSON file only once. @@ -88,7 +94,7 @@ def get_true_variants(pmcid: str) -> List[str]: if _true_variant_cache is None: try: - with open("data/benchmark/true_variant_list.json", "r") as f: + with open(annotations_by_pmcid, "r") as f: _true_variant_cache = json.load(f) except FileNotFoundError: logger.error( From b60619c3fee096a0ff8cbfb8dd3f482293cbf710 Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Thu, 23 Oct 2025 14:03:12 -0700 Subject: [PATCH 2/5] chore: cleanup --- src/benchmark/__init__.py | 1 + src/benchmark/main.py | 9 ++------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/benchmark/__init__.py b/src/benchmark/__init__.py index e69de29..55c6967 100644 --- a/src/benchmark/__init__.py +++ b/src/benchmark/__init__.py @@ -0,0 +1 @@ +from .main import Benchmark \ No newline at end of file diff --git a/src/benchmark/main.py b/src/benchmark/main.py index acf69f2..333a180 100644 --- a/src/benchmark/main.py +++ b/src/benchmark/main.py @@ -1,13 +1,10 @@ from typing import List from src.utils import get_pmcid_annotation -class Benchmark: +class AnnotationBenchmark: def __init__(self): pass - def load_data(self): - pass - def get_var_drug_ann_score(self, var_drug_ann: List[dict]): return 1.0 @@ -38,8 +35,6 @@ def run(self, pmcid: str): return total_score - - if __name__ == "__main__": - benchmark = Benchmark() + benchmark = AnnotationBenchmark() benchmark.run("PMC123456") \ No newline at end of file From f0ec044af3f8db1c9ed562fa65115af698d9329c Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Thu, 23 Oct 2025 14:03:40 -0700 Subject: [PATCH 3/5] chore: black format --- src/benchmark/__init__.py | 2 +- src/benchmark/main.py | 32 +++++++++++++++++++++++--------- src/utils.py | 5 ++++- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/src/benchmark/__init__.py b/src/benchmark/__init__.py index 55c6967..45c96dd 100644 --- a/src/benchmark/__init__.py +++ b/src/benchmark/__init__.py @@ -1 +1 @@ -from .main import Benchmark \ No newline at end of file +from .main import Benchmark diff --git a/src/benchmark/main.py b/src/benchmark/main.py index 333a180..2c7e845 100644 --- a/src/benchmark/main.py +++ b/src/benchmark/main.py @@ -1,6 +1,7 @@ from typing import List from src.utils import get_pmcid_annotation + class AnnotationBenchmark: def __init__(self): pass @@ -10,31 +11,44 @@ def get_var_drug_ann_score(self, var_drug_ann: List[dict]): def get_var_pheno_ann_score(self, var_pheno_ann: List[dict]): return 1.0 - + def get_var_fa_ann_score(self, var_fa_ann: List[dict]): return 1.0 - + def get_study_parameters_score(self, study_parameters: List[dict]): return 1.0 - - def calculate_total_score(self, var_drug_ann: List[dict], var_pheno_ann: List[dict], var_fa_ann: List[dict], study_parameters: List[dict]): + + def calculate_total_score( + self, + var_drug_ann: List[dict], + var_pheno_ann: List[dict], + var_fa_ann: List[dict], + study_parameters: List[dict], + ): # Return average of all scores - scores = [self.get_var_drug_ann_score(var_drug_ann), self.get_var_pheno_ann_score(var_pheno_ann), self.get_var_fa_ann_score(var_fa_ann), self.get_study_parameters_score(study_parameters)] + scores = [ + self.get_var_drug_ann_score(var_drug_ann), + self.get_var_pheno_ann_score(var_pheno_ann), + self.get_var_fa_ann_score(var_fa_ann), + self.get_study_parameters_score(study_parameters), + ] return sum(scores) / len(scores) def run(self, pmcid: str): pmcid_annotation = get_pmcid_annotation(pmcid) - + var_drug_ann = pmcid_annotation.get("varDrugAnn", []) var_pheno_ann = pmcid_annotation.get("varPhenoAnn", []) var_fa_ann = pmcid_annotation.get("varFaAnn", []) study_parameters = pmcid_annotation.get("studyParameters", []) - - total_score = self.calculate_total_score(var_drug_ann, var_pheno_ann, var_fa_ann, study_parameters) + + total_score = self.calculate_total_score( + var_drug_ann, var_pheno_ann, var_fa_ann, study_parameters + ) print(f"Total score for pmcid {pmcid}: {total_score}") return total_score if __name__ == "__main__": benchmark = AnnotationBenchmark() - benchmark.run("PMC123456") \ No newline at end of file + benchmark.run("PMC123456") diff --git a/src/utils.py b/src/utils.py index b990aea..5f0ea3d 100644 --- a/src/utils.py +++ b/src/utils.py @@ -10,11 +10,14 @@ _true_variant_cache: Optional[dict] = None -def get_pmcid_annotation(pmcid: str, annotations_by_pmcid: Path = Path("data/annotations_by_pmcid.json")) -> dict: +def get_pmcid_annotation( + pmcid: str, annotations_by_pmcid: Path = Path("data/annotations_by_pmcid.json") +) -> dict: with open(annotations_by_pmcid, "r") as f: annotations_by_pmcid = json.load(f) return annotations_by_pmcid.get(pmcid, {}) + def extractVariantsRegex(text): # Note, seems to extract a ton of variants, not just the ones that are being studied # Think it might only be applicable to rsIDs From 7902b4be4f526b36cc6e0898c37a87a78455bd6d Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Thu, 23 Oct 2025 14:10:42 -0700 Subject: [PATCH 4/5] feat: run all method --- src/benchmark/__init__.py | 1 - .../{main.py => annotation_benchmark.py} | 17 ++++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) rename src/benchmark/{main.py => annotation_benchmark.py} (76%) diff --git a/src/benchmark/__init__.py b/src/benchmark/__init__.py index 45c96dd..e69de29 100644 --- a/src/benchmark/__init__.py +++ b/src/benchmark/__init__.py @@ -1 +0,0 @@ -from .main import Benchmark diff --git a/src/benchmark/main.py b/src/benchmark/annotation_benchmark.py similarity index 76% rename from src/benchmark/main.py rename to src/benchmark/annotation_benchmark.py index 2c7e845..ed0376e 100644 --- a/src/benchmark/main.py +++ b/src/benchmark/annotation_benchmark.py @@ -1,7 +1,6 @@ from typing import List from src.utils import get_pmcid_annotation - class AnnotationBenchmark: def __init__(self): pass @@ -45,10 +44,22 @@ def run(self, pmcid: str): total_score = self.calculate_total_score( var_drug_ann, var_pheno_ann, var_fa_ann, study_parameters ) - print(f"Total score for pmcid {pmcid}: {total_score}") + print(f"Score for pmcid {pmcid}: {total_score}") return total_score + def run_all(self): + benchmark_pmcids = [] + with open("persistent_data/benchmark_pmcids.txt", "r") as f: + benchmark_pmcids = f.read().splitlines() + scores = [] + for pmcid in benchmark_pmcids: + scores.append(self.run(pmcid)) + + overall_score = sum(scores) / len(scores) + print(f"Average score: {overall_score}") + return overall_score + if __name__ == "__main__": benchmark = AnnotationBenchmark() - benchmark.run("PMC123456") + benchmark.run_all() From ab00b8a0a16d27279c308584c9cb0ebc6efdfe3a Mon Sep 17 00:00:00 2001 From: Shlok Natarajan Date: Thu, 23 Oct 2025 14:39:52 -0700 Subject: [PATCH 5/5] chore: black --- src/benchmark/annotation_benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/benchmark/annotation_benchmark.py b/src/benchmark/annotation_benchmark.py index ed0376e..0f54681 100644 --- a/src/benchmark/annotation_benchmark.py +++ b/src/benchmark/annotation_benchmark.py @@ -1,6 +1,7 @@ from typing import List from src.utils import get_pmcid_annotation + class AnnotationBenchmark: def __init__(self): pass