From e5bb251fe4db49ba07e7c303156e652ff28005f3 Mon Sep 17 00:00:00 2001
From: Vincent Yip <realsharkguy@gmail.com>
Date: Fri, 31 Oct 2025 14:08:51 -0700
Subject: [PATCH 1/2] added phenotype benchmark

---
 persistent_data/benchmark_annotations.json | 248 ++++++++++++++++++--
 src/benchmark/annotation_benchmark.py      |   8 +-
 src/benchmark/pheno_benchmark.py           | 256 +++++++++++++++++++++
 3 files changed, 494 insertions(+), 18 deletions(-)
 create mode 100644 src/benchmark/pheno_benchmark.py

diff --git a/persistent_data/benchmark_annotations.json b/persistent_data/benchmark_annotations.json
index d5b63bc..4d521ea 100644
--- a/persistent_data/benchmark_annotations.json
+++ b/persistent_data/benchmark_annotations.json
@@ -230,7 +230,7 @@
         "Population types": null,
         "Population Phenotypes or diseases": null,
         "Multiple phenotypes or diseases And/or": null,
-        "Comparison Allele(s) or Genotype(s)": "*1/*1 + *1/*2 + *1/*3",
+        "Comparison Allele(s) or Genotype(s)": "CYP2C9*1/*1 + CYP2C9*1/*2 + CYP2C9*1/*3",
         "Comparison Metabolizer types": "normal metabolizer and intermediate metabolizer",
         "PMID_norm": "37490620",
         "Variant Annotation ID_norm": "1452196120"
@@ -244,8 +244,37 @@
         "Phenotype Category": "Toxicity",
         "Significance": "no",
         "Notes": "effect is described as \"Statin switch\" as measured by changes in prescription refills in the medical record, and attributed to muscle side effects (therefore tagged as toxicity). \"No significant association was observed with atorvastatin and rosuvastatin.\" \"The sample sizes for fluvastatin and pravastatin were relatively small\"\"No statistically significant association was found for fluvastatin\"",
-        "Sentence": "Genotypes CC + CT is not associated with increased risk of discontinuation when treated with atorvastatin, fluvastatin or rosuvastatin as compared to genotype TT.",
-        "Alleles": "CC + CT",
+        "Sentence": "Genotype CC is not associated with increased risk of discontinuation when treated with atorvastatin, fluvastatin or rosuvastatin as compared to genotype TT.",
+        "Alleles": "CC",
+        "Specialty Population": null,
+        "Metabolizer types": null,
+        "isPlural": "Is",
+        "Is/Is Not associated": "Not associated with",
+        "Direction of effect": "increased",
+        "Side effect/efficacy/other": "risk of",
+        "Phenotype": "Side Effect:Discontinuation",
+        "Multiple phenotypes And/or": null,
+        "When treated with/exposed to/when assayed with": "when treated with",
+        "Multiple drugs And/or": "or",
+        "Population types": null,
+        "Population Phenotypes or diseases": null,
+        "Multiple phenotypes or diseases And/or": null,
+        "Comparison Allele(s) or Genotype(s)": "TT",
+        "Comparison Metabolizer types": null,
+        "PMID_norm": "37490620",
+        "Variant Annotation ID_norm": "1452196040"
+      },
+      {
+        "Variant Annotation ID": 1452196040,
+        "Variant/Haplotypes": "rs4149056",
+        "Gene": "SLCO1B1",
+        "Drug(s)": "atorvastatin, fluvastatin, rosuvastatin",
+        "PMID": 37490620,
+        "Phenotype Category": "Toxicity",
+        "Significance": "no",
+        "Notes": "effect is described as \"Statin switch\" as measured by changes in prescription refills in the medical record, and attributed to muscle side effects (therefore tagged as toxicity). \"No significant association was observed with atorvastatin and rosuvastatin.\" \"The sample sizes for fluvastatin and pravastatin were relatively small\"\"No statistically significant association was found for fluvastatin\"",
+        "Sentence": "Genotype CT is not associated with increased risk of discontinuation when treated with atorvastatin, fluvastatin or rosuvastatin as compared to genotype TT.",
+        "Alleles": "CT",
         "Specialty Population": null,
         "Metabolizer types": null,
         "isPlural": "Is",
@@ -273,8 +302,37 @@
         "Phenotype Category": "Toxicity",
         "Significance": "no",
         "Notes": "effect is described as \"Statin switch\" as measured by changes in prescription refills in the medical record, and attributed to muscle side effects (therefore tagged as toxicity). \"No significant association was observed with atorvastatin and rosuvastatin.\" \"The sample sizes for fluvastatin and pravastatin were relatively small\"\"No statistically significant association was found for fluvastatin\" Alleles complemented to plus chromosomal strand.",
-        "Sentence": "Genotypes GT + TT is not associated with increased risk of discontinuation when treated with atorvastatin, fluvastatin or rosuvastatin as compared to genotype GG.",
-        "Alleles": "GT + TT",
+        "Sentence": "Genotype GT is not associated with increased risk of discontinuation when treated with atorvastatin, fluvastatin or rosuvastatin as compared to genotype GG.",
+        "Alleles": "GT",
+        "Specialty Population": null,
+        "Metabolizer types": null,
+        "isPlural": "Is",
+        "Is/Is Not associated": "Not associated with",
+        "Direction of effect": "increased",
+        "Side effect/efficacy/other": "risk of",
+        "Phenotype": "Side Effect:Discontinuation",
+        "Multiple phenotypes And/or": null,
+        "When treated with/exposed to/when assayed with": "when treated with",
+        "Multiple drugs And/or": "or",
+        "Population types": null,
+        "Population Phenotypes or diseases": null,
+        "Multiple phenotypes or diseases And/or": null,
+        "Comparison Allele(s) or Genotype(s)": "GG",
+        "Comparison Metabolizer types": null,
+        "PMID_norm": "37490620",
+        "Variant Annotation ID_norm": "1452196080"
+      },
+      {
+        "Variant Annotation ID": 1452196080,
+        "Variant/Haplotypes": "rs2231142",
+        "Gene": "ABCG2",
+        "Drug(s)": "atorvastatin, fluvastatin, rosuvastatin",
+        "PMID": 37490620,
+        "Phenotype Category": "Toxicity",
+        "Significance": "no",
+        "Notes": "effect is described as \"Statin switch\" as measured by changes in prescription refills in the medical record, and attributed to muscle side effects (therefore tagged as toxicity). \"No significant association was observed with atorvastatin and rosuvastatin.\" \"The sample sizes for fluvastatin and pravastatin were relatively small\"\"No statistically significant association was found for fluvastatin\" Alleles complemented to plus chromosomal strand.",
+        "Sentence": "Genotype TT is not associated with increased risk of discontinuation when treated with atorvastatin, fluvastatin or rosuvastatin as compared to genotype GG.",
+        "Alleles": "TT",
         "Specialty Population": null,
         "Metabolizer types": null,
         "isPlural": "Is",
@@ -1393,7 +1451,7 @@
         "Significance": "yes",
         "Notes": null,
         "Sentence": "CYP2B6 *9 is associated with increased likelihood of discontinuation when treated with efavirenz in people with HIV as compared to CYP2B6 *1/*1.",
-        "Alleles": "*9",
+        "Alleles": "CYP2B6 *9",
         "Specialty Population": null,
         "Metabolizer types": null,
         "isPlural": "Is",
@@ -1407,7 +1465,7 @@
         "Population types": "in people with",
         "Population Phenotypes or diseases": "Disease:HIV infectious disease",
         "Multiple phenotypes or diseases And/or": null,
-        "Comparison Allele(s) or Genotype(s)": "*1/*1",
+        "Comparison Allele(s) or Genotype(s)": "CYP2B6*1/CYP2B6*1",
         "Comparison Metabolizer types": null,
         "PMID_norm": "26715213",
         "Variant Annotation ID_norm": "1448993783"
@@ -1508,11 +1566,11 @@
         "Phenotype Category": "Toxicity",
         "Significance": "yes",
         "Notes": null,
-        "Sentence": "Genotype AA are associated with increased risk of Central Nervous System Diseases when treated with efavirenz in people with HIV Infections as compared to genotype GG.",
-        "Alleles": "AA",
+        "Sentence": "Genotype TT is associated with increased risk of Central Nervous System Diseases when treated with efavirenz in people with HIV Infections as compared to genotypes CT + CC.",
+        "Alleles": "TT",
         "Specialty Population": null,
         "Metabolizer types": null,
-        "isPlural": "Are",
+        "isPlural": "Is",
         "Is/Is Not associated": "Associated with",
         "Direction of effect": "increased",
         "Side effect/efficacy/other": "risk of",
@@ -1523,7 +1581,7 @@
         "Population types": "in people with",
         "Population Phenotypes or diseases": "Disease:HIV infectious disease",
         "Multiple phenotypes or diseases And/or": null,
-        "Comparison Allele(s) or Genotype(s)": "GG",
+        "Comparison Allele(s) or Genotype(s)": "CT + CC",
         "Comparison Metabolizer types": null,
         "PMID_norm": "26715213",
         "Variant Annotation ID_norm": "1448993831"
@@ -2789,6 +2847,32 @@
         "PMID_norm": "40297930",
         "Variant Annotation ID_norm": "1454052260"
       },
+      {
+        "Variant Annotation ID": 1454052260,
+        "Variant/Haplotypes": "SLCO1B1*1b/*15",
+        "Gene": "SLCO1B1",
+        "Drug(s)": "simvastatin acid",
+        "PMID": 40297930,
+        "Phenotype Category": "Metabolism/PK",
+        "Significance": "yes",
+        "Notes": "\"Similarly, the SLCO1B1*1b/*15 genotype was associated with higher simvastatin acid levels than SLCO1B1*1a/*1a (0.58 vs. 0.16 ng/mL, p < 0.001). \"",
+        "Sentence": "Patients with the SLCO1B1*1b/*15 diplotype are associated with increased simvastatin acid levels when treated with simvastatin in patients with dyslipidemia or coronary artery disease.",
+        "Alleles": "SLCO1B1*1b/*15",
+        "Specialty Population": null,
+        "Metabolizer types": null,
+        "isPlural": "Is",
+        "Is/Is Not associated": "Associated with",
+        "Direction of effect": "increased",
+        "PD/PK terms": "levels of",
+        "Multiple drugs And/or": null,
+        "Population types": "in patients with",
+        "Population Phenotypes or diseases": "Other:Cardiovascular Disease, Other:Dyslipidaemia",
+        "Multiple phenotypes or diseases And/or": "or",
+        "Comparison Allele(s) or Genotype(s)": "SLCO1B1*1a/*1a",
+        "Comparison Metabolizer types": null,
+        "PMID_norm": "40297930",
+        "Variant Annotation ID_norm": "1454052260"
+      },
       {
         "Variant Annotation ID": 1454052282,
         "Variant/Haplotypes": "rs2306283",
@@ -2798,11 +2882,11 @@
         "Phenotype Category": "Metabolism/PK",
         "Significance": "yes",
         "Notes": "\"Additionally, SLCO1B1 rs2306283 was associated with significantly higher SVA levels in patients carrying the G allele (AG+GG genotype) at the same dose (3.63 vs. 1.59\u2009ng/mL, p\u2009=\u20090.04). \"",
-        "Sentence": "Genotypes AG + GG is associated with increased concentrations of simvastatin acid in people with Cardiovascular Disease or Dyslipidaemia as compared to genotype AA.",
+        "Sentence": "Genotypes AG + GG are associated with increased concentrations of simvastatin acid in people with Cardiovascular Disease or Dyslipidaemia as compared to genotype AA.",
         "Alleles": "AG + GG",
         "Specialty Population": null,
         "Metabolizer types": null,
-        "isPlural": "Is",
+        "isPlural": "Are",
         "Is/Is Not associated": "Associated with",
         "Direction of effect": "increased",
         "PD/PK terms": "concentrations of",
@@ -3712,6 +3796,32 @@
         "Comparison Metabolizer types": null,
         "PMID_norm": "30336686",
         "Variant Annotation ID_norm": "1451227020"
+      },
+      {
+        "Variant Annotation ID": 1451227020,
+        "Variant/Haplotypes": "?",
+        "Gene": "SLCO1B1",
+        "Drug(s)": "simvastatin",
+        "PMID": 30336686,
+        "Phenotype Category": "Efficacy",
+        "Significance": "yes",
+        "Notes": "Patients carrying 388G allele alone is not significantly associated with greater TC and LDL-C reduction in response to simvastatin after 4 weeks of treatment. no significant associations were found between the 521T>C and 388A>G polymorphisms and the lipid-lowering effects of simvastatin treatment after 8 weeks.",
+        "Sentence": "Genotypes AA are associated with increased response to simvastatin in people with Hypercholesterolemia as compared to genotype AG + GG.",
+        "Alleles": "AA",
+        "Specialty Population": null,
+        "Metabolizer types": null,
+        "isPlural": "Are",
+        "Is/Is Not associated": "Associated with",
+        "Direction of effect": "increased",
+        "PD/PK terms": "response to",
+        "Multiple drugs And/or": null,
+        "Population types": "in people with",
+        "Population Phenotypes or diseases": "Other:Hypercholesterolemia",
+        "Multiple phenotypes or diseases And/or": null,
+        "Comparison Allele(s) or Genotype(s)": "AG + GG",
+        "Comparison Metabolizer types": null,
+        "PMID_norm": "30336686",
+        "Variant Annotation ID_norm": "1451227020"
       }
     ],
     "var_pheno_ann": [],
@@ -6915,7 +7025,33 @@
         "Phenotype Category": "Efficacy",
         "Significance": "no",
         "Notes": "No significant difference in median CD4 T cell counts of each genotype at different time points was seen in efavirenz group (p = 0.818, 0.838, 0.783, 0.753 and 0.587 for baseline, weeks 12, 24, 36 and 48 of ART, respectively).",
-        "Sentence": "Genotype TT is not associated with response to efavirenz in people with HIV Infections and Tuberculosis as compared to genotypes GG + GT.",
+        "Sentence": "Genotype TT is not associated with response to efavirenz in people with HIV Infections and Tuberculosis as compared to genotype GG.",
+        "Alleles": "TT",
+        "Specialty Population": null,
+        "Metabolizer types": null,
+        "isPlural": "Is",
+        "Is/Is Not associated": "Not associated with",
+        "Direction of effect": null,
+        "PD/PK terms": "response to",
+        "Multiple drugs And/or": null,
+        "Population types": "in people with",
+        "Population Phenotypes or diseases": "Disease:HIV infectious disease, Disease:Tuberculosis",
+        "Multiple phenotypes or diseases And/or": "and",
+        "Comparison Allele(s) or Genotype(s)": "GG",
+        "Comparison Metabolizer types": null,
+        "PMID_norm": "20338069",
+        "Variant Annotation ID_norm": "1448993666"
+      },
+      {
+        "Variant Annotation ID": 1448993666,
+        "Variant/Haplotypes": "rs3745274",
+        "Gene": "CYP2B6",
+        "Drug(s)": "efavirenz",
+        "PMID": 20338069,
+        "Phenotype Category": "Efficacy",
+        "Significance": "no",
+        "Notes": "No significant difference in median CD4 T cell counts of each genotype at different time points was seen in efavirenz group (p = 0.818, 0.838, 0.783, 0.753 and 0.587 for baseline, weeks 12, 24, 36 and 48 of ART, respectively).",
+        "Sentence": "Genotype TT is not associated with response to efavirenz in people with HIV Infections and Tuberculosis as compared to genotype GT.",
         "Alleles": "TT",
         "Specialty Population": null,
         "Metabolizer types": null,
@@ -6927,7 +7063,7 @@
         "Population types": "in people with",
         "Population Phenotypes or diseases": "Disease:HIV infectious disease, Disease:Tuberculosis",
         "Multiple phenotypes or diseases And/or": "and",
-        "Comparison Allele(s) or Genotype(s)": "GG + GT",
+        "Comparison Allele(s) or Genotype(s)": "GT",
         "Comparison Metabolizer types": null,
         "PMID_norm": "20338069",
         "Variant Annotation ID_norm": "1448993666"
@@ -6941,7 +7077,7 @@
         "Phenotype Category": "Metabolism/PK",
         "Significance": "yes",
         "Notes": "Association with significantly increased efavirenz plasma levels. Was significant at week 6, 12 of treatment and 1 month after rifampicin discontinuation.",
-        "Sentence": "Genotype TT is associated with decreased metabolism of efavirenz in people with HIV Infections and Tuberculosis as compared to genotypes GG + GT.",
+        "Sentence": "Genotype TT is associated with decreased metabolism of efavirenz in people with HIV Infections and Tuberculosis as compared to genotype GG.",
         "Alleles": "TT",
         "Specialty Population": null,
         "Metabolizer types": null,
@@ -6953,10 +7089,88 @@
         "Population types": "in people with",
         "Population Phenotypes or diseases": "Disease:HIV infectious disease, Disease:Tuberculosis",
         "Multiple phenotypes or diseases And/or": "and",
-        "Comparison Allele(s) or Genotype(s)": "GG + GT",
+        "Comparison Allele(s) or Genotype(s)": "GG",
         "Comparison Metabolizer types": null,
         "PMID_norm": "20338069",
         "Variant Annotation ID_norm": "1184988061"
+      },
+      {
+        "Variant Annotation ID": 1184988061,
+        "Variant/Haplotypes": "rs3745274",
+        "Gene": "CYP2B6",
+        "Drug(s)": "efavirenz",
+        "PMID": 20338069,
+        "Phenotype Category": "Metabolism/PK",
+        "Significance": "yes",
+        "Notes": "Association with significantly increased efavirenz plasma levels. Was significant at week 6, 12 of treatment and 1 month after rifampicin discontinuation.",
+        "Sentence": "Genotype TT is associated with decreased metabolism of efavirenz in people with HIV Infections and Tuberculosis as compared to genotype GT.",
+        "Alleles": "TT",
+        "Specialty Population": null,
+        "Metabolizer types": null,
+        "isPlural": "Is",
+        "Is/Is Not associated": "Associated with",
+        "Direction of effect": "decreased",
+        "PD/PK terms": "metabolism of",
+        "Multiple drugs And/or": null,
+        "Population types": "in people with",
+        "Population Phenotypes or diseases": "Disease:HIV infectious disease, Disease:Tuberculosis",
+        "Multiple phenotypes or diseases And/or": "and",
+        "Comparison Allele(s) or Genotype(s)": "GT",
+        "Comparison Metabolizer types": null,
+        "PMID_norm": "20338069",
+        "Variant Annotation ID_norm": "1184988061"
+      },
+      {
+        "Variant Annotation ID": null,
+        "Variant/Haplotypes": "rs3745274",
+        "Gene": "CYP2B6",
+        "Drug(s)": "efavirenz, nevirapine",
+        "PMID": null,
+        "Phenotype Category": "Efficacy",
+        "Significance": "yes",
+        "Notes": "\"After 12 weeks of both drug regimens, there was a trend towards higher percentage of patients with CYP2B6-TT genotype who achieved HIV-1 RNA levels <50 copies/mL compared to those with GT or GG genotypes.\"",
+        "Sentence": "Patients with genotype TT show a trend towards increased likelihood of achieving HIV-1 RNA levels <50 copies/mL when treated with efavirenz and nevirapine as compared to genotype GT.",
+        "Alleles": "TT",
+        "Specialty Population": null,
+        "Metabolizer types": null,
+        "isPlural": "show",
+        "Is/Is Not associated": "show a trend towards",
+        "Direction of effect": "increased",
+        "PD/PK terms": "likelihood of",
+        "Multiple drugs And/or": null,
+        "Population types": null,
+        "Population Phenotypes or diseases": null,
+        "Multiple phenotypes or diseases And/or": null,
+        "Comparison Allele(s) or Genotype(s)": "GT",
+        "Comparison Metabolizer types": null,
+        "PMID_norm": null,
+        "Variant Annotation ID_norm": null
+      },
+      {
+        "Variant Annotation ID": null,
+        "Variant/Haplotypes": "rs3745274",
+        "Gene": "CYP2B6",
+        "Drug(s)": "efavirenz, nevirapine",
+        "PMID": null,
+        "Phenotype Category": "Efficacy",
+        "Significance": "yes",
+        "Notes": "\"After 12 weeks of both drug regimens, there was a trend towards higher percentage of patients with CYP2B6-TT genotype who achieved HIV-1 RNA levels <50 copies/mL compared to those with GT or GG genotypes.\"",
+        "Sentence": "Patients with genotype TT show a trend towards increased likelihood of achieving HIV-1 RNA levels <50 copies/mL when treated with efavirenz and nevirapine as compared to genotype GG.",
+        "Alleles": "TT",
+        "Specialty Population": null,
+        "Metabolizer types": null,
+        "isPlural": "show",
+        "Is/Is Not associated": "show a trend towards",
+        "Direction of effect": "increased",
+        "PD/PK terms": "likelihood of",
+        "Multiple drugs And/or": null,
+        "Population types": null,
+        "Population Phenotypes or diseases": null,
+        "Multiple phenotypes or diseases And/or": null,
+        "Comparison Allele(s) or Genotype(s)": "GG",
+        "Comparison Metabolizer types": null,
+        "PMID_norm": null,
+        "Variant Annotation ID_norm": null
       }
     ],
     "var_pheno_ann": [],
diff --git a/src/benchmark/annotation_benchmark.py b/src/benchmark/annotation_benchmark.py
index 0f54681..1b93da9 100644
--- a/src/benchmark/annotation_benchmark.py
+++ b/src/benchmark/annotation_benchmark.py
@@ -1,5 +1,7 @@
 from typing import List
+import json
 from src.utils import get_pmcid_annotation
+from src.benchmark.pheno_benchmark import evaluate_phenotype_annotations
 
 
 class AnnotationBenchmark:
@@ -10,7 +12,11 @@ def get_var_drug_ann_score(self, var_drug_ann: List[dict]):
         return 1.0
 
     def get_var_pheno_ann_score(self, var_pheno_ann: List[dict]):
-        return 1.0
+        try:
+            result = evaluate_phenotype_annotations(var_pheno_ann)
+            return result / 100.0
+        except Exception:
+            return 1.0
 
     def get_var_fa_ann_score(self, var_fa_ann: List[dict]):
         return 1.0
diff --git a/src/benchmark/pheno_benchmark.py b/src/benchmark/pheno_benchmark.py
new file mode 100644
index 0000000..3821209
--- /dev/null
+++ b/src/benchmark/pheno_benchmark.py
@@ -0,0 +1,256 @@
+from typing import List, Dict, Any, Tuple, Set
+from dataclasses import dataclass
+import re
+
+
+class PhenotypeAnnotationBenchmark:
+    """Benchmark for evaluating phenotype annotation predictions."""
+
+    # Fields to compare (excluding metadata fields)
+    CORE_FIELDS = [
+        "Variant/Haplotypes",
+        "Gene",
+        "Drug(s)",
+        "Phenotype Category",
+        "Alleles",
+        "Is/Is Not associated",
+        "Direction of effect",
+        "Phenotype",
+        "When treated with/exposed to/when assayed with",
+        "Comparison Allele(s) or Genotype(s)",
+    ]
+
+    # Fields with weighted importance
+    FIELD_WEIGHTS = {
+        "Phenotype": 2.0,
+        "Drug(s)": 1.5,
+        "Direction of effect": 2.0,
+        "Alleles": 1.5,
+        "Is/Is Not associated": 1.0,
+        "Variant/Haplotypes": 1.0,
+        "Gene": 1.0,    
+        "Phenotype Category": 0.5,
+        "When treated with/exposed to/when assayed with": 0.5,
+        "Comparison Allele(s) or Genotype(s)": 1.0,
+    }
+
+    def __init__(self, matching_threshold: float = 0.7):
+        """
+        Initialize benchmark.
+
+        Args:
+            matching_threshold: Minimum score to consider a match (0-1)
+        """
+        self.matching_threshold = matching_threshold
+
+    def _normalize_value(self, value: Any) -> str:
+        """Normalize a field value for comparison."""
+        if value is None:
+            return ""
+
+        # Convert to string and normalize
+        s = str(value).lower().strip()
+
+        # Remove extra whitespace
+        s = re.sub(r'\s+', ' ', s)
+
+        # Remove punctuation variations
+        s = re.sub(r'[,;]+', '', s)
+
+        return s
+
+    def _compare_field(self, pred_value: Any, gt_value: Any) -> float:
+        """
+        Compare two field values and return similarity score (0-1).
+
+        Args:
+            pred_value: Predicted value
+            gt_value: Ground truth value
+
+        Returns:
+            Similarity score between 0 and 1
+        """
+        pred_norm = self._normalize_value(pred_value)
+        ground_truth_norm = self._normalize_value(gt_value)
+
+        # Both empty or None
+        if not pred_norm and not ground_truth_norm:
+            return 1.0
+
+        # One empty, one not
+        if not pred_norm or not ground_truth_norm:
+            return 0.0
+
+        # Exact match
+        if pred_norm == ground_truth_norm:
+            return 1.0
+
+        # Check if one contains the other (useful for partial matches)
+        if pred_norm in ground_truth_norm or ground_truth_norm in pred_norm:
+            return 0.8
+
+        #The Jaccard index is particularly useful when the presence or absence of elements 
+        # in the sets is more important than their frequency or order.
+        # could be used to help check for multiple entries put in one annotation?
+        pred_tokens = set(pred_norm.split())
+        gt_tokens = set(ground_truth_norm.split())
+
+        if pred_tokens and gt_tokens:
+            intersection = len(pred_tokens & gt_tokens)
+            union = len(pred_tokens | gt_tokens)
+            jaccard = intersection / union if union > 0 else 0.0
+            return jaccard
+
+        return 0.0
+
+    def _compare_annotations(self, pred: Dict[str, Any], gt: Dict[str, Any]) -> float:
+        """
+        Compare a predicted annotation with a ground truth annotation.
+
+        Args:
+            pred: Predicted annotation
+            gt: Ground truth annotation
+
+        Returns:
+            Float ranging from 0 - 1 denoting similarity
+        """
+        field_scores = {}
+        weighted_sum = 0.0
+        total_weight = 0.0
+
+        for field in self.CORE_FIELDS:
+            weight = self.FIELD_WEIGHTS.get(field, 1.0)
+            similarity = self._compare_field(pred.get(field), gt.get(field))
+
+            field_scores[field] = similarity
+            weighted_sum += similarity * weight
+            total_weight += weight
+
+        # Calculate weighted average
+        matching_score = weighted_sum / total_weight
+
+        return matching_score
+
+    def _find_best_matches(
+        self,
+        predictions: List[Dict[str, Any]],
+        ground_truths: List[Dict[str, Any]]
+    ) -> List[Tuple[int, int, float]]:
+        """
+        Find best matches between predictions and ground truths.
+
+        Returns:
+            List of (pred_idx, gt_idx, score) tuples sorted by score descending
+        """
+        matches = []
+
+        for pred_idx, pred in enumerate(predictions):
+            for gt_idx, gt in enumerate(ground_truths):
+                match_score = self._compare_annotations(pred, gt)
+                if match_score >= self.matching_threshold:
+                    matches.append((pred_idx, gt_idx, match_score))
+
+        # Sort by score descending
+        matches.sort(key=lambda x: x[2], reverse=True)
+
+        return matches
+
+    def evaluate(
+        self,
+        samples: List[Any]
+    ) -> float:
+        """
+        Evaluate predictions against ground truths and return similarity score.
+
+        Handles both single annotation pairs and lists of annotations.
+
+        Args:
+            samples: List with exactly 2 items:
+                    - [ground_truth_dict, prediction_dict] for single comparison
+                    - [ground_truth_list, prediction_list] for multiple comparisons
+
+        Returns:
+            Similarity score between 0 and 1
+
+        Raises:
+            ValueError: If samples is not a two-item list of two dicts or two lists.
+        """
+        if not isinstance(samples, list) or len(samples) != 2:
+            raise ValueError("Expected a list with exactly two items: [ground_truth, prediction].")
+
+        gt, pred = samples[0], samples[1]
+
+        # Normalize to lists
+        if isinstance(gt, dict) and isinstance(pred, dict):
+            gt_list = [gt]
+            pred_list = [pred]
+        elif isinstance(gt, list) and isinstance(pred, list):
+            gt_list = gt
+            pred_list = pred
+        else:
+            raise ValueError("Both items must be either dicts or lists: [ground_truth, prediction].")
+
+        if not gt_list or not pred_list:
+            return 0.0
+
+        # Find all potential matches
+        all_matches = self._find_best_matches(pred_list, gt_list)
+
+        # Greedily assign matches one-to-one, best-scoring pairs first
+        matched_preds: Set[int] = set()
+        matched_gts: Set[int] = set()
+        match_scores = []
+
+        for pred_idx, gt_idx, score in all_matches:
+            # Each prediction and each ground truth is used at most once; otherwise
+            # duplicate predictions of one ground truth would push the score above 1.
+            if pred_idx not in matched_preds and gt_idx not in matched_gts:
+                matched_preds.add(pred_idx)
+                matched_gts.add(gt_idx)
+                match_scores.append(score)
+
+        # Calculate average similarity across all ground truths
+        # Matched GTs contribute their match score; unmatched GTs contribute 0
+        total_score = sum(match_scores)
+        total_possible = len(gt_list)
+
+        return total_score / total_possible
+
+
+def evaluate_phenotype_annotations(
+    samples: List[Any],
+    matching_threshold: float = 0.7
+) -> float:
+    """
+    Benchmark phenotype annotations and return an aggregate similarity score.
+
+    Handles both single annotation pairs and lists of annotations.
+
+    Args:
+        samples: List with exactly 2 items:
+                - [ground_truth_dict, prediction_dict] for single comparison
+                - [ground_truth_list, prediction_list] for multiple comparisons
+        matching_threshold: Minimum similarity score to consider a match (0-1)
+
+    Returns:
+        Similarity score between 0-100 representing how well prediction(s)
+        match ground truth(s) across all fields.
+
+    Examples:
+        # Single annotation pair
+        >>> ground_truth = {"Phenotype": "sensitivity", "Drug(s)": "etoposide", ...}
+        >>> prediction = {"Phenotype": "sensitivity", "Drug(s)": "etoposide", ...}
+        >>> score = evaluate_phenotype_annotations([ground_truth, prediction])
+        >>> print(f"Model Score: {score:.1f}/100")
+
+        # Multiple annotations
+        >>> ground_truths = [gt1, gt2, gt3]
+        >>> predictions = [pred1, pred2]
+        >>> score = evaluate_phenotype_annotations([ground_truths, predictions])
+        >>> print(f"Model Score: {score:.1f}/100")
+    """
+    benchmark = PhenotypeAnnotationBenchmark(matching_threshold=matching_threshold)
+    similarity = benchmark.evaluate(samples)
+
+    # Return as 0-100 scale
+    return similarity * 100

From ad4953d0f8d6b778e218cc7f230170333bcb7df2 Mon Sep 17 00:00:00 2001
From: Vincent Yip <realsharkguy@gmail.com>
Date: Fri, 31 Oct 2025 14:28:18 -0700
Subject: [PATCH 2/2] added ground truth retrieval

---
 src/benchmark/annotation_benchmark.py | 32 ++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/benchmark/annotation_benchmark.py b/src/benchmark/annotation_benchmark.py
index 1b93da9..96336a4 100644
--- a/src/benchmark/annotation_benchmark.py
+++ b/src/benchmark/annotation_benchmark.py
@@ -11,12 +11,31 @@ def __init__(self):
     def get_var_drug_ann_score(self, var_drug_ann: List[dict]):
         return 1.0
 
-    def get_var_pheno_ann_score(self, var_pheno_ann: List[dict]):
+    def get_var_pheno_ann_score(self, var_pheno_ann: List[dict], pmcid: str):
+        # Load ground truth annotations
+        with open("persistent_data/benchmark_annotations.json", "r") as f:
+            ground_truth_data = json.load(f)
+
+        # Get ground truth for this PMCID
+        if pmcid not in ground_truth_data:
+            return 0.0
+
+        ground_truth_pheno_ann = ground_truth_data[pmcid].get("var_pheno_ann", [])
+
+        # If both are empty, perfect score
+        if not var_pheno_ann and not ground_truth_pheno_ann:
+            return 1.0
+
+        # If one is empty but not the other, score is 0
+        if not var_pheno_ann or not ground_truth_pheno_ann:
+            return 0.0
+
+        # Compare: [ground_truth, prediction]
         try:
-            result = evaluate_phenotype_annotations(var_pheno_ann)
-            return result / 100.0
+            score = evaluate_phenotype_annotations([ground_truth_pheno_ann, var_pheno_ann])
+            return score / 100.0
         except Exception:
-            return 1.0
+            return 0.0
 
     def get_var_fa_ann_score(self, var_fa_ann: List[dict]):
         return 1.0
@@ -30,11 +49,12 @@ def calculate_total_score(
         var_pheno_ann: List[dict],
         var_fa_ann: List[dict],
         study_parameters: List[dict],
+        pmcid: str,
     ):
         # Return average of all scores
         scores = [
             self.get_var_drug_ann_score(var_drug_ann),
-            self.get_var_pheno_ann_score(var_pheno_ann),
+            self.get_var_pheno_ann_score(var_pheno_ann, pmcid),
             self.get_var_fa_ann_score(var_fa_ann),
             self.get_study_parameters_score(study_parameters),
         ]
@@ -49,7 +69,7 @@ def run(self, pmcid: str):
         study_parameters = pmcid_annotation.get("studyParameters", [])
 
         total_score = self.calculate_total_score(
-            var_drug_ann, var_pheno_ann, var_fa_ann, study_parameters
+            var_drug_ann, var_pheno_ann, var_fa_ann, study_parameters, pmcid
         )
         print(f"Score for pmcid {pmcid}: {total_score}")
         return total_score