diff --git a/persistent_data/benchmark_analysis_summary.md b/persistent_data/benchmark_analysis_summary.md new file mode 100644 index 0000000..4b8da11 --- /dev/null +++ b/persistent_data/benchmark_analysis_summary.md @@ -0,0 +1,115 @@ +# Benchmark Analysis Summary +**Generated from 5 sample articles** +**Analysis Date:** 2025-11-24 17:09:34 + +## Overall Performance (Excluding 'one_empty' entries) + +| Benchmark | Average Score | Valid Examples | +|-----------|--------------|----------------| +| DRUG | 0.715 | 2 | +| FA | 0.375 | 1 | +| PHENO | 0.554 | 3 | +| STUDY_PARAMETERS | 0.677 | 5 | + +## Key Findings + +### 1. Most Problematic Fields + +**Study Parameters:** +- **Study Cases**: Appears 5 times with average score 0.364 +- **Characteristics**: Appears 5 times with average score 0.374 +- **Biogeographical Groups**: Appears 5 times with average score 0.000 +- **Study Type**: Appears 4 times with average score 0.200 +- **Ratio Stat Type**: Appears 4 times with average score 0.333 + +**Pheno:** +- **Alleles**: Appears 3 times with average score 0.280 +- **Phenotype**: Appears 3 times with average score 0.223 +- **Comparison Allele(s) or Genotype(s)**: Appears 3 times with average score 0.074 +- **Variant/Haplotypes**: Appears 2 times with average score 0.458 +- **Phenotype Category**: Appears 2 times with average score 0.500 + +**Drug:** +- **PMID**: Appears 2 times with average score 0.000 +- **Population types**: Appears 2 times with average score 0.450 +- **Population Phenotypes or diseases**: Appears 2 times with average score 0.577 +- **Comparison Allele(s) or Genotype(s)**: Appears 2 times with average score 0.000 + +**Fa:** +- **Variant/Haplotypes**: Appears 1 times with average score 0.000 +- **Drug(s)**: Appears 1 times with average score 0.000 +- **Phenotype Category**: Appears 1 times with average score 0.000 +- **Alleles**: Appears 1 times with average score 0.000 +- **Specialty Population**: Appears 1 times with average score 0.000 + +### 2. 
Missing Value Patterns + +**Predictions Missing Critical Fields:** +- study_parameters: Frequency in Cases - Missing in 17 cases +- study_parameters: Allele of Frequency in Cases - Missing in 17 cases +- study_parameters: Frequency in Controls - Missing in 17 cases +- study_parameters: Allele of Frequency in Controls - Missing in 17 cases +- study_parameters: Study Controls - Missing in 15 cases +- study_parameters: Ratio Stat - Missing in 14 cases +- study_parameters: Confidence Interval Start - Missing in 14 cases +- study_parameters: Confidence Interval Stop - Missing in 14 cases +- study_parameters: Study Type - Missing in 3 cases +- study_parameters: P Value - Missing in 3 cases + +**Ground Truth Missing but Predictions Provide:** +- study_parameters: Study Type - 14 times +- study_parameters: Ratio Stat Type - 10 times +- pheno: Alleles - 7 times +- pheno: Comparison Allele(s) or Genotype(s) - 7 times +- drug: Comparison Allele(s) or Genotype(s) - 1 times +- fa: Drug(s) - 1 times +- fa: Specialty Population - 1 times +- fa: When treated with/exposed to/when assayed with - 1 times +- fa: Multiple drugs And/or - 1 times +- fa: Cell type - 1 times + +### 3. Common Mismatch Patterns + +1. **Numeric Value Mismatches** (Study Cases/Controls): + - Frequent mismatches in population counts + - Suggests LLM may be extracting different population counts or misinterpreting study design + +2. **Semantic Similarity Issues** (Characteristics, Phenotype): + - Similar meaning but different wording causing lower scores + - May need improved embeddings or similarity thresholds + +3. **Format/Standardization Issues**: + - Minor differences in formatting causing score reductions + - Case sensitivity, punctuation, spacing differences + +4. **Missing vs Present**: + - Many fields where GT is None but predictions provide values (especially Study Type) + - Many fields where GT has values but predictions are None (especially statistical fields) + +### 4. 
Dependency Issues + +- 1x: Invalid star allele format: (DPYD*2A) + +## Recommendations + +1. **Improve Statistical Field Extraction**: + - Ratio Stat, Confidence Intervals, and P Values are frequently missing + - Consider adding explicit prompts for statistical measures + +2. **Study Type Handling**: + - LLM often provides Study Type when GT is None - this may be acceptable + - Consider whether this should be penalized or if it's useful additional information + +3. **Biogeographical Groups**: + - LLM consistently outputs "Unknown" - needs better extraction or different handling + +4. **Variant/Haplotypes Matching**: + - Improved with variant expansion/normalization (pheno benchmark fixed) + - Still some cases where variants don't match - may need further refinement + +5. **PMID Extraction**: + - LLM never extracts PMID - consider if this should be provided as input or if extraction is needed + +6. **Semantic Similarity Thresholds**: + - Many fields with similar meanings score low due to wording differences + - May need to adjust similarity thresholds or improve embeddings \ No newline at end of file diff --git a/persistent_data/benchmark_results_20251124_170826.json b/persistent_data/benchmark_results_20251124_170826.json new file mode 100644 index 0000000..fc3546d --- /dev/null +++ b/persistent_data/benchmark_results_20251124_170826.json @@ -0,0 +1,4052 @@ +{ + "timestamp": "20251124_170826", + "num_examples": 5, + "results": [ + { + "pmcid": "PMC10275785", + "title": "Effect of NLRP3 inflammasome genes polymorphism on disease susceptibility and response to TNF-\u03b1 inhibitors in Iraqi patients with rheumatoid arthritis", + "benchmarks": { + "pheno": { + "overall_score": 0.0, + "total_samples": 0, + "status": "one_empty" + }, + "drug": { + "total_samples": 1, + "field_scores": { + "Variant/Haplotypes": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Gene": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "PMID": { + "mean_score": 0.0, + "scores": [ + 
0.0 + ] + }, + "Phenotype Category": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Significance": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Alleles": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Specialty Population": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Metabolizer types": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "isPlural": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Is/Is Not associated": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Direction of effect": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "PD/PK terms": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Multiple drugs And/or": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Population types": { + "mean_score": 0.6666666666666666, + "scores": [ + 0.6666666666666666 + ] + }, + "Population Phenotypes or diseases": { + "mean_score": 0.8148148148148148, + "scores": [ + 0.8148148148148148 + ] + }, + "Multiple phenotypes or diseases And/or": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Comparison Allele(s) or Genotype(s)": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Comparison Metabolizer types": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Drug(s)": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + } + }, + "overall_score": 0.8674463937621832, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Variant/Haplotypes": 1.0, + "Gene": 1.0, + "PMID": 0.0, + "Phenotype Category": 1.0, + "Significance": 1.0, + "Alleles": 1.0, + "Specialty Population": 1.0, + "Metabolizer types": 1.0, + "isPlural": 1.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "PD/PK terms": 1.0, + "Multiple drugs And/or": 1.0, + "Population types": 0.6666666666666666, + "Population Phenotypes or diseases": 0.8148148148148148, + "Multiple phenotypes or diseases And/or": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0, + "Comparison Metabolizer types": 1.0, + "Drug(s)": 1.0 + }, 
+ "field_values": { + "Variant/Haplotypes": { + "ground_truth": "rs2043211", + "prediction": "rs2043211" + }, + "Gene": { + "ground_truth": "CARD8", + "prediction": "CARD8" + }, + "PMID": { + "ground_truth": 37332933, + "prediction": null + }, + "Phenotype Category": { + "ground_truth": "Efficacy", + "prediction": "efficacy" + }, + "Significance": { + "ground_truth": "yes", + "prediction": "yes" + }, + "Alleles": { + "ground_truth": "TT", + "prediction": "TT" + }, + "Specialty Population": { + "ground_truth": null, + "prediction": null + }, + "Metabolizer types": { + "ground_truth": null, + "prediction": null + }, + "isPlural": { + "ground_truth": "Is", + "prediction": "Is" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "decreased", + "prediction": "decreased" + }, + "PD/PK terms": { + "ground_truth": "response to", + "prediction": "response to" + }, + "Multiple drugs And/or": { + "ground_truth": "or", + "prediction": "or" + }, + "Population types": { + "ground_truth": "in people with", + "prediction": "in patients with" + }, + "Population Phenotypes or diseases": { + "ground_truth": "Other:Rheumatoid arthritis", + "prediction": "Disease:Rheumatoid Arthritis" + }, + "Multiple phenotypes or diseases And/or": { + "ground_truth": null, + "prediction": null + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": "AA + AT", + "prediction": null + }, + "Comparison Metabolizer types": { + "ground_truth": null, + "prediction": null + }, + "Drug(s)": { + "ground_truth": "etanercept, infliximab", + "prediction": "Etanercept, Infliximab" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + } + ] + }, + "fa": { + "overall_score": 0.0, + "total_samples": 0, + "status": "one_empty" + }, + "study_parameters": { + "total_samples": 2, + "field_scores": { + "Study Type": { + 
"mean_score": 0.0, + "scores": [ + 0.0, + 0.0 + ] + }, + "Study Cases": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0 + ] + }, + "Study Controls": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0 + ] + }, + "Characteristics": { + "mean_score": 0.4387472283813747, + "scores": [ + 0.5365853658536586, + 0.3409090909090909 + ] + }, + "Characteristics Type": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "P Value": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Ratio Stat Type": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0 + ] + }, + "Ratio Stat": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0 + ] + }, + "Confidence Interval Start": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0 + ] + }, + "Confidence Interval Stop": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0 + ] + }, + "Biogeographical Groups": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0 + ] + } + }, + "overall_score": 0.429249815225425, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 0.0, + "Study Controls": 0.0, + "Characteristics": 0.5365853658536586, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 1.0, + "Ratio Stat Type": 0.0, + "Ratio Stat": 0.0, + "Confidence Interval Start": 0.0, + "Confidence Interval Stop": 0.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452143368, + "prediction": null + }, 
+ "Variant Annotation ID": { + "ground_truth": 1452143360, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "case/control" + }, + "Study Cases": { + "ground_truth": 66.0, + "prediction": 100.0 + }, + "Study Controls": { + "ground_truth": 33.0, + "prediction": 100.0 + }, + "Characteristics": { + "ground_truth": "Cases = responders, controls = non-responders", + "prediction": "Patients classified into responders and non-responders based on EULAR criteria" + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "< 0.0001", + "prediction": "< 0.0001" + }, + "Ratio Stat Type": { + "ground_truth": "OR", + "prediction": "Unknown" + }, + "Ratio Stat": { + "ground_truth": 0.183, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": 0.063, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": 0.531, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Near Eastern", + "prediction": "Unknown" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 1, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 0.0, + "Study Controls": 0.0, + "Characteristics": 0.3409090909090909, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 1.0, + "Ratio Stat Type": 0.0, + "Ratio Stat": 0.0, + 
"Confidence Interval Start": 0.0, + "Confidence Interval Stop": 0.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452143420, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452143400, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "case/control" + }, + "Study Cases": { + "ground_truth": 66.0, + "prediction": 100.0 + }, + "Study Controls": { + "ground_truth": 33.0, + "prediction": 100.0 + }, + "Characteristics": { + "ground_truth": "Cases = responders, controls = non-responders", + "prediction": "Patients with NLRP3 (rs4612666) TT genotype" + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "< 0.0001", + "prediction": "< 0.0001" + }, + "Ratio Stat Type": { + "ground_truth": "OR", + "prediction": "Unknown" + }, + "Ratio Stat": { + "ground_truth": 0.131, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": 0.047, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": 0.36, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Near Eastern", + "prediction": "Unknown" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + } + ] + } + } + }, + { + "pmcid": "PMC10399933", + "title": "Real-world pharmacogenetics of statin intolerance: effects of SLCO1B1, ABCG2, and CYP2C9 variants", + "benchmarks": { + "pheno": { + "total_samples": 2, + "field_scores": { + "Variant/Haplotypes": { + "mean_score": 0.0, + 
"scores": [ + 0.0, + 0.0 + ] + }, + "Gene": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Drug(s)": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Phenotype Category": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Alleles": { + "mean_score": 0.6000000000000001, + "scores": [ + 0.8, + 0.4 + ] + }, + "Is/Is Not associated": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Direction of effect": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Phenotype": { + "mean_score": 0.26666666666666666, + "scores": [ + 0.26666666666666666, + 0.26666666666666666 + ] + }, + "When treated with/exposed to/when assayed with": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0 + ] + }, + "Comparison Allele(s) or Genotype(s)": { + "mean_score": 0.2222222222222222, + "scores": [ + 0.2222222222222222, + 0.2222222222222222 + ] + } + }, + "overall_score": 0.6796296296296296, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Variant/Haplotypes": 0.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 1.0, + "Alleles": 0.8, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.26666666666666666, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.2222222222222222 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "rs4149056", + "prediction": "SLCO1B1" + }, + "Gene": { + "ground_truth": "SLCO1B1", + "prediction": "SLCO1B1" + }, + "Drug(s)": { + "ground_truth": "simvastatin", + "prediction": "simvastatin" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + "prediction": "toxicity" + }, + "Alleles": { + "ground_truth": "CC", + "prediction": "C/C" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Side 
Effect:Discontinuation", + "prediction": "statin intolerance" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": "TT", + "prediction": "SLCO1B1 c.521T/T" + } + }, + "dependency_issues": [] + }, + { + "sample_id": 1, + "field_scores": { + "Variant/Haplotypes": 0.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 1.0, + "Alleles": 0.4, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.26666666666666666, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.2222222222222222 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "rs4149056", + "prediction": "SLCO1B1" + }, + "Gene": { + "ground_truth": "SLCO1B1", + "prediction": "SLCO1B1" + }, + "Drug(s)": { + "ground_truth": "pravastatin", + "prediction": "pravastatin" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + "prediction": "toxicity" + }, + "Alleles": { + "ground_truth": "CC + CT", + "prediction": "C/T" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Side Effect:Discontinuation", + "prediction": "statin intolerance" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": "TT", + "prediction": "SLCO1B1 c.521T/T" + } + }, + "dependency_issues": [] + } + ] + }, + "drug": { + "overall_score": 0.0, + "total_samples": 0, + "status": "one_empty" + }, + "fa": { + "overall_score": 0.0, + "total_samples": 0, + "status": "one_empty" + }, + "study_parameters": { + "total_samples": 3, + "field_scores": { + "Study Type": { + "mean_score": 0.0, 
+ "scores": [ + 0.0, + 0.0, + 0.0 + ] + }, + "Study Cases": { + "mean_score": 0.3333333333333333, + "scores": [ + 1.0, + 0.0, + 0.0 + ] + }, + "Study Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Characteristics": { + "mean_score": 0.3166802832244009, + "scores": [ + 0.40625, + 0.4, + 0.1437908496732026 + ] + }, + "Characteristics Type": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "P Value": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Ratio Stat Type": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Ratio Stat": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Confidence Interval Start": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Confidence Interval Stop": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Biogeographical Groups": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0, + 0.0 + ] + } + }, + "overall_score": 0.7766675744371823, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 1.0, + "Study Controls": 1.0, + "Characteristics": 0.40625, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 1.0, + "Ratio Stat Type": 1.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + 
"Study Parameters ID": { + "ground_truth": 1452195960, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452195947, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "cohort" + }, + "Study Cases": { + "ground_truth": 92.0, + "prediction": 92 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Statin switch", + "prediction": "Pravastatin intolerance defined by statin switching" + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.047", + "prediction": "= 0.047" + }, + "Ratio Stat Type": { + "ground_truth": "HR", + "prediction": "HR" + }, + "Ratio Stat": { + "ground_truth": 2.11, + "prediction": 2.11 + }, + "Confidence Interval Start": { + "ground_truth": 1.01, + "prediction": 1.01 + }, + "Confidence Interval Stop": { + "ground_truth": 4.39, + "prediction": 4.39 + }, + "Biogeographical Groups": { + "ground_truth": "European", + "prediction": "Unknown" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 1, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 0.0, + "Study Controls": 1.0, + "Characteristics": 0.4, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 1.0, + "Ratio Stat Type": 1.0, + "Ratio Stat": 1.0, + "Confidence 
Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452195946, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452195940, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "cohort" + }, + "Study Cases": { + "ground_truth": 916.0, + "prediction": 2042 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Statin switch", + "prediction": "Statin switchers with CK measurement for simvastatin" + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.011", + "prediction": "= 0.011" + }, + "Ratio Stat Type": { + "ground_truth": "HR", + "prediction": "HR" + }, + "Ratio Stat": { + "ground_truth": 5.44, + "prediction": 5.44 + }, + "Confidence Interval Start": { + "ground_truth": 1.49, + "prediction": 1.49 + }, + "Confidence Interval Stop": { + "ground_truth": 19.9, + "prediction": 19.9 + }, + "Biogeographical Groups": { + "ground_truth": "European", + "prediction": "Unknown" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 2, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 0.0, + "Study Controls": 1.0, + "Characteristics": 0.1437908496732026, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in 
Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 1.0, + "Ratio Stat Type": 1.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452195945, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452195940, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "cohort" + }, + "Study Cases": { + "ground_truth": 916.0, + "prediction": 2042 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Statin switch", + "prediction": "Patients initiating statin therapy; including simvastatin users, heterozygous or homozygous SLCO1B1 c.521T>C carriers, and pravastatin users" + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.025", + "prediction": "= 0.025" + }, + "Ratio Stat Type": { + "ground_truth": "HR", + "prediction": "HR" + }, + "Ratio Stat": { + "ground_truth": 1.88, + "prediction": 1.88 + }, + "Confidence Interval Start": { + "ground_truth": 1.08, + "prediction": 1.08 + }, + "Confidence Interval Stop": { + "ground_truth": 3.25, + "prediction": 3.25 + }, + "Biogeographical Groups": { + "ground_truth": "European", + "prediction": "Unknown" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + } + ] + } + } + }, + { + "pmcid": "PMC10786722", + "title": "Integrating rare genetic variants into DPYD 
pharmacogenetic testing may help preventing fluoropyrimidine-induced toxicity", + "benchmarks": { + "pheno": { + "overall_score": 0.0, + "total_samples": 0, + "status": "one_empty" + }, + "drug": { + "overall_score": 0.0, + "total_samples": 0, + "status": "one_empty" + }, + "fa": { + "total_samples": 1, + "field_scores": { + "Variant/Haplotypes": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Gene": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Drug(s)": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "PMID": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Phenotype Category": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Significance": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Alleles": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Specialty Population": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Assay type": { + "mean_score": 0.18867924528301888, + "scores": [ + 0.18867924528301888 + ] + }, + "Metabolizer types": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "isPlural": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Is/Is Not associated": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Direction of effect": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Functional terms": { + "mean_score": 0.4444444444444444, + "scores": [ + 0.4444444444444444 + ] + }, + "Gene/gene product": { + "mean_score": 0.8571428571428571, + "scores": [ + 0.8571428571428571 + ] + }, + "When treated with/exposed to/when assayed with": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Multiple drugs And/or": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Cell type": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Comparison Allele(s) or Genotype(s)": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Comparison Metabolizer types": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + } + }, + "overall_score": 0.37451332734351606, + "detailed_results": [ + { + "sample_id": 
0, + "field_scores": { + "Variant/Haplotypes": 0.0, + "Gene": 1.0, + "Drug(s)": 0.0, + "PMID": 1.0, + "Phenotype Category": 0.0, + "Significance": 1.0, + "Alleles": 0.0, + "Specialty Population": 0.0, + "Assay type": 0.18867924528301888, + "Metabolizer types": 1.0, + "isPlural": 1.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 0.0, + "Functional terms": 0.4444444444444444, + "Gene/gene product": 0.8571428571428571, + "When treated with/exposed to/when assayed with": 0.0, + "Multiple drugs And/or": 0.0, + "Cell type": 0.0, + "Comparison Allele(s) or Genotype(s)": 0.0, + "Comparison Metabolizer types": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "rs56038477", + "prediction": "c.1905+1G>A (DPYD*2A)" + }, + "Gene": { + "ground_truth": "DPYD", + "prediction": "DPYD" + }, + "Drug(s)": { + "ground_truth": null, + "prediction": "5-fluorouracil" + }, + "PMID": { + "ground_truth": 38216550, + "prediction": 38216550 + }, + "Phenotype Category": { + "ground_truth": "Metabolism/PK", + "prediction": "toxicity" + }, + "Significance": { + "ground_truth": "yes", + "prediction": "yes" + }, + "Alleles": { + "ground_truth": "CT + TT", + "prediction": "" + }, + "Specialty Population": { + "ground_truth": null, + "prediction": "" + }, + "Assay type": { + "ground_truth": "plasma dihydrouracil/uracil", + "prediction": "Next Generation Sequencing" + }, + "Metabolizer types": { + "ground_truth": null, + "prediction": null + }, + "isPlural": { + "ground_truth": "Is", + "prediction": "Is" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "decreased", + "prediction": "increased" + }, + "Functional terms": { + "ground_truth": "activity of", + "prediction": "risk of" + }, + "Gene/gene product": { + "ground_truth": "DPYD", + "prediction": "DPD" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": null, + "prediction": "" + }, + 
"Multiple drugs And/or": { + "ground_truth": null, + "prediction": "" + }, + "Cell type": { + "ground_truth": null, + "prediction": "plasma samples" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": "CC", + "prediction": "wild-type DPYD" + }, + "Comparison Metabolizer types": { + "ground_truth": null, + "prediction": "" + } + }, + "dependency_issues": [ + "Invalid star allele format: (DPYD*2A)" + ], + "penalty_info": { + "total_penalty": 0.05, + "penalized_fields": { + "Variant/Haplotypes": { + "original_score": 0.0, + "penalized_score": 0.0, + "penalty_percentage": 5.0 + } + }, + "issues_by_field": { + "Variant/Haplotypes": [ + "Invalid star allele format: (DPYD*2A)" + ] + } + } + } + ] + }, + "study_parameters": { + "total_samples": 3, + "field_scores": { + "Study Type": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Study Cases": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0, + 0.0 + ] + }, + "Study Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Characteristics": { + "mean_score": 0.2678406727383393, + "scores": [ + 0.35294117647058826, + 0.24719101123595505, + 0.2033898305084746 + ] + }, + "Characteristics Type": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "P Value": { + "mean_score": 0.3333333333333333, + "scores": [ + 1.0, + 0.0, + 0.0 + ] + }, + "Ratio Stat Type": { + "mean_score": 0.6666666666666666, + "scores": [ + 0.0, + 1.0, + 1.0 + ] + }, + "Ratio Stat": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Confidence Interval Start": { + "mean_score": 1.0, + 
"scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Confidence Interval Stop": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0 + ] + }, + "Biogeographical Groups": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0, + 0.0 + ] + } + }, + "overall_score": 0.7511893781825559, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 1.0, + "Study Cases": 0.0, + "Study Controls": 1.0, + "Characteristics": 0.35294117647058826, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 1.0, + "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452352360, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452352325, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": null + }, + "Study Cases": { + "ground_truth": 1055.0, + "prediction": 180 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "plasma dihydrouracil/uracil", + "prediction": "patients who may exhibit low DPD activity" + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "< 0.01", + "prediction": "< 0.01" + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "Unknown" + }, + "Ratio Stat": { + 
"ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Unknown", + "prediction": "European" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 1, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 1.0, + "Study Cases": 0.0, + "Study Controls": 1.0, + "Characteristics": 0.24719101123595505, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.0, + "Ratio Stat Type": 1.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452352264, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452352260, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": null + }, + "Study Cases": { + "ground_truth": 855.0, + "prediction": 628 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "plasma dihydrouracil/uracil", + "prediction": "patients with DPD deficiency identified by [U] and UH2/U ratio" + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + 
"ground_truth": "< 0.05", + "prediction": null + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": null + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Unknown", + "prediction": "European" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 2, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 1.0, + "Study Cases": 0.0, + "Study Controls": 1.0, + "Characteristics": 0.2033898305084746, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.0, + "Ratio Stat Type": 1.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452352324, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452352320, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": null + }, + "Study Cases": { + "ground_truth": 1265.0, + "prediction": 2928 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "plasma dihydrouracil/uracil", + "prediction": "All patients (non-deficient DPD)" + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": 
null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "< 0.01", + "prediction": null + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": null + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Unknown", + "prediction": "European" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + } + ] + } + } + }, + { + "pmcid": "PMC10880264", + "title": "Pharmacogenetic Factors Influence Escitalopram Pharmacokinetics and Adverse Events in Youth with a Family History of Bipolar Disorder: A Preliminary Study", + "benchmarks": { + "pheno": { + "total_samples": 8, + "field_scores": { + "Variant/Haplotypes": { + "mean_score": 0.375, + "scores": [ + 1.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "Gene": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Drug(s)": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Phenotype Category": { + "mean_score": 0.5, + "scores": [ + 1.0, + 0.0, + 0.0, + 1.0, + 1.0, + 1.0, + 0.0, + 0.0 + ] + }, + "Alleles": { + "mean_score": 0.016666666666666666, + "scores": [ + 0.0, + 0.0, + 0.0, + 0.13333333333333333, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "Is/Is Not associated": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Direction of effect": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Phenotype": { + "mean_score": 0.14675468869338915, + "scores": [ + 0.1320754716981132, + 0.13793103448275862, + 0.13793103448275862, 
+ 0.22608695652173913, + 0.1320754716981132, + 0.1320754716981132, + 0.13793103448275862, + 0.13793103448275862 + ] + }, + "When treated with/exposed to/when assayed with": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Comparison Allele(s) or Genotype(s)": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + }, + "overall_score": 0.5786257814488982, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Variant/Haplotypes": 1.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 1.0, + "Alleles": 0.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.1320754716981132, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "CYP2D6", + "prediction": "CYP2D6" + }, + "Gene": { + "ground_truth": "CYP2D6", + "prediction": "CYP2D6" + }, + "Drug(s)": { + "ground_truth": "escitalopram", + "prediction": "escitalopram" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + "prediction": "toxicity" + }, + "Alleles": { + "ground_truth": null, + "prediction": "IM, PM" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Side Effect:Psychomotor Agitation, Side Effect:Hyperkinesis, Side Effect:Impulse control disorder", + "prediction": "akathisia" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": null, + "prediction": "CYP2D6 NM" + } + }, + "dependency_issues": [] + }, + { + "sample_id": 1, + "field_scores": { + "Variant/Haplotypes": 1.0, + "Gene": 1.0, + "Drug(s)": 1.0, + 
"Phenotype Category": 0.0, + "Alleles": 0.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.13793103448275862, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "CYP2C19", + "prediction": "CYP2C19" + }, + "Gene": { + "ground_truth": "CYP2C19", + "prediction": "CYP2C19" + }, + "Drug(s)": { + "ground_truth": "escitalopram", + "prediction": "escitalopram" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + "prediction": "metabolism/PK" + }, + "Alleles": { + "ground_truth": null, + "prediction": "IM, NM, RM" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Side Effect:suicidal ideation, Side Effect:adverse events, Side Effect:deliberate self-harm", + "prediction": "serum escitalopram levels" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": null, + "prediction": "CYP2C19 PM" + } + }, + "dependency_issues": [] + }, + { + "sample_id": 2, + "field_scores": { + "Variant/Haplotypes": 1.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 0.0, + "Alleles": 0.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.13793103448275862, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "intermediate", + "prediction": "RM" + }, + "Gene": { + "ground_truth": "CYP2C19", + "prediction": "CYP2C19" + }, + "Drug(s)": { + "ground_truth": "escitalopram", + "prediction": "escitalopram" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + 
"prediction": "metabolism/PK" + }, + "Alleles": { + "ground_truth": null, + "prediction": "IM, NM, RM" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Side Effect:suicidal ideation, Side Effect:adverse events, Side Effect:deliberate self-harm", + "prediction": "serum escitalopram levels" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": null, + "prediction": "CYP2C19 PM" + } + }, + "dependency_issues": [] + }, + { + "sample_id": 3, + "field_scores": { + "Variant/Haplotypes": 0.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 1.0, + "Alleles": 0.13333333333333333, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.22608695652173913, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "rs6311", + "prediction": "HTR2A" + }, + "Gene": { + "ground_truth": "HTR2A", + "prediction": "HTR2A" + }, + "Drug(s)": { + "ground_truth": "escitalopram", + "prediction": "escitalopram" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + "prediction": "toxicity" + }, + "Alleles": { + "ground_truth": "CT + TT", + "prediction": "A/G, A/A" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Side Effect:suicidal ideation, Side Effect:adverse events, Side Effect:deliberate self-harm", + "prediction": "self-injury, suicidality" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when 
treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": "CC", + "prediction": "HTR2A G/G" + } + }, + "dependency_issues": [] + }, + { + "sample_id": 4, + "field_scores": { + "Variant/Haplotypes": 0.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 1.0, + "Alleles": 0.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.1320754716981132, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "CYP2D6", + "prediction": "IM" + }, + "Gene": { + "ground_truth": "CYP2D6", + "prediction": "CYP2D6" + }, + "Drug(s)": { + "ground_truth": "escitalopram", + "prediction": "escitalopram" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + "prediction": "toxicity" + }, + "Alleles": { + "ground_truth": null, + "prediction": "IM, PM" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Side Effect:Psychomotor Agitation, Side Effect:Hyperkinesis, Side Effect:Impulse control disorder", + "prediction": "akathisia" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": null, + "prediction": "CYP2D6 NM" + } + }, + "dependency_issues": [] + }, + { + "sample_id": 5, + "field_scores": { + "Variant/Haplotypes": 0.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 1.0, + "Alleles": 0.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.1320754716981132, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + 
"ground_truth": "CYP2D6", + "prediction": "PM" + }, + "Gene": { + "ground_truth": "CYP2D6", + "prediction": "CYP2D6" + }, + "Drug(s)": { + "ground_truth": "escitalopram", + "prediction": "escitalopram" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + "prediction": "toxicity" + }, + "Alleles": { + "ground_truth": null, + "prediction": "IM, PM" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Side Effect:Psychomotor Agitation, Side Effect:Hyperkinesis, Side Effect:Impulse control disorder", + "prediction": "akathisia" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": null, + "prediction": "CYP2D6 NM" + } + }, + "dependency_issues": [] + }, + { + "sample_id": 6, + "field_scores": { + "Variant/Haplotypes": 0.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 0.0, + "Alleles": 0.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.13793103448275862, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "CYP2C19", + "prediction": "IM" + }, + "Gene": { + "ground_truth": "CYP2C19", + "prediction": "CYP2C19" + }, + "Drug(s)": { + "ground_truth": "escitalopram", + "prediction": "escitalopram" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + "prediction": "metabolism/PK" + }, + "Alleles": { + "ground_truth": null, + "prediction": "IM, NM, RM" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + 
"ground_truth": "Side Effect:suicidal ideation, Side Effect:adverse events, Side Effect:deliberate self-harm", + "prediction": "serum escitalopram levels" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": null, + "prediction": "CYP2C19 PM" + } + }, + "dependency_issues": [] + }, + { + "sample_id": 7, + "field_scores": { + "Variant/Haplotypes": 0.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 0.0, + "Alleles": 0.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "Phenotype": 0.13793103448275862, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "CYP2C19", + "prediction": "NM" + }, + "Gene": { + "ground_truth": "CYP2C19", + "prediction": "CYP2C19" + }, + "Drug(s)": { + "ground_truth": "escitalopram", + "prediction": "escitalopram" + }, + "Phenotype Category": { + "ground_truth": "Toxicity", + "prediction": "metabolism/PK" + }, + "Alleles": { + "ground_truth": null, + "prediction": "IM, NM, RM" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Side Effect:suicidal ideation, Side Effect:adverse events, Side Effect:deliberate self-harm", + "prediction": "serum escitalopram levels" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": null, + "prediction": "CYP2C19 PM" + } + }, + "dependency_issues": [] + } + ] + }, + "drug": { + "total_samples": 1, + "field_scores": { + "Variant/Haplotypes": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + 
"Gene": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "PMID": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Phenotype Category": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Significance": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Alleles": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Specialty Population": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Metabolizer types": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "isPlural": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Is/Is Not associated": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Direction of effect": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "PD/PK terms": { + "mean_score": 0.13333333333333333, + "scores": [ + 0.13333333333333333 + ] + }, + "Multiple drugs And/or": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Population types": { + "mean_score": 0.23376623376623376, + "scores": [ + 0.23376623376623376 + ] + }, + "Population Phenotypes or diseases": { + "mean_score": 0.3384615384615385, + "scores": [ + 0.3384615384615385 + ] + }, + "Multiple phenotypes or diseases And/or": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Comparison Allele(s) or Genotype(s)": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Comparison Metabolizer types": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Drug(s)": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + } + }, + "overall_score": 0.5634505845032161, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Variant/Haplotypes": 1.0, + "Gene": 1.0, + "PMID": 0.0, + "Phenotype Category": 1.0, + "Significance": 1.0, + "Alleles": 0.0, + "Specialty Population": 1.0, + "Metabolizer types": 1.0, + "isPlural": 1.0, + "Is/Is Not associated": 1.0, + "Direction of effect": 1.0, + "PD/PK terms": 0.13333333333333333, + "Multiple drugs And/or": 0.0, + "Population types": 0.23376623376623376, + "Population Phenotypes or diseases": 
0.3384615384615385, + "Multiple phenotypes or diseases And/or": 0.0, + "Comparison Allele(s) or Genotype(s)": 0.0, + "Comparison Metabolizer types": 0.0, + "Drug(s)": 1.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "CYP2C19", + "prediction": "CYP2C19" + }, + "Gene": { + "ground_truth": "CYP2C19", + "prediction": "CYP2C19" + }, + "PMID": { + "ground_truth": 38377518, + "prediction": null + }, + "Phenotype Category": { + "ground_truth": "Metabolism/PK", + "prediction": "metabolism/PK" + }, + "Significance": { + "ground_truth": "yes", + "prediction": "yes" + }, + "Alleles": { + "ground_truth": null, + "prediction": "" + }, + "Specialty Population": { + "ground_truth": "Pediatric", + "prediction": "Pediatric" + }, + "Metabolizer types": { + "ground_truth": "intermediate metabolizer", + "prediction": "intermediate metabolizer" + }, + "isPlural": { + "ground_truth": "Is", + "prediction": "Is" + }, + "Is/Is Not associated": { + "ground_truth": "Associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "increased", + "prediction": "increased" + }, + "PD/PK terms": { + "ground_truth": "dose-adjusted trough concentrations of", + "prediction": "AUC0\u201324" + }, + "Multiple drugs And/or": { + "ground_truth": null, + "prediction": "" + }, + "Population types": { + "ground_truth": "in children with", + "prediction": "in youth with a first-degree relative with bipolar I disorder" + }, + "Population Phenotypes or diseases": { + "ground_truth": "Other:Depression, Other:Anxiety Disorders", + "prediction": "Disease:bipolar disorder" + }, + "Multiple phenotypes or diseases And/or": { + "ground_truth": "or", + "prediction": "" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": null, + "prediction": "" + }, + "Comparison Metabolizer types": { + "ground_truth": "normal metabolizer", + "prediction": "" + }, + "Drug(s)": { + "ground_truth": "escitalopram", + "prediction": "escitalopram" + } + }, + 
"dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + } + ] + }, + "fa": { + "overall_score": 0.0, + "total_samples": 0, + "status": "one_empty" + }, + "study_parameters": { + "total_samples": 4, + "field_scores": { + "Study Type": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "Study Cases": { + "mean_score": 0.725, + "scores": [ + 1.0, + 1.0, + 0.9, + 0.0 + ] + }, + "Study Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Characteristics": { + "mean_score": 0.6511951223597594, + "scores": [ + 0.6387832699619772, + 0.6387832699619772, + 0.6311787072243346, + 0.6960352422907489 + ] + }, + "Characteristics Type": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "P Value": { + "mean_score": 0.625, + "scores": [ + 1.0, + 0.5, + 0.5, + 0.5 + ] + }, + "Ratio Stat Type": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "Ratio Stat": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Confidence Interval Start": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Confidence Interval Stop": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Biogeographical Groups": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + }, + "overall_score": 0.7334130081573172, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + 
"Study Type": 0.0, + "Study Cases": 1.0, + "Study Controls": 1.0, + "Characteristics": 0.6387832699619772, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 1.0, + "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452390361, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452390352, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "clinical trial" + }, + "Study Cases": { + "ground_truth": 66.0, + "prediction": 66 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Outpatient youth aged 12\u201317 years who had at least one first-degree relative with bipolar I disorder; TEASAP subscale Self-injury, suicidality and harm to others", + "prediction": "Youth aged 12-17 years with a first-degree relative with bipolar I disorder treated with escitalopram." 
+ }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.017", + "prediction": "= 0.017" + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "Unknown" + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Multiple groups, Non-Hispanic Caucasian n=44, Hispanic Caucasian n=7, Black or African n = 7, Multiple or other n=8", + "prediction": "Unknown" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 1, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 1.0, + "Study Controls": 1.0, + "Characteristics": 0.6387832699619772, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.5, + "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452390309, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452390301, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "clinical trial" + }, + "Study Cases": { + 
"ground_truth": 66.0, + "prediction": 66 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Outpatient youth aged 12\u201317 years who had at least one first-degree relative with bipolar I disorder; TEASAP subscale Self-injury, suicidality and harm to others", + "prediction": "Youth aged 12-17 years with a first-degree relative with bipolar I disorder treated with escitalopram." + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.09", + "prediction": "= 0.025" + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "HR" + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Multiple groups, Non-Hispanic Caucasian n=44, Hispanic Caucasian n=7, Black or African n = 7, Multiple or other n=8", + "prediction": "Unknown" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 2, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 0.9, + "Study Controls": 1.0, + "Characteristics": 0.6311787072243346, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.5, 
+ "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452390397, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452390389, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "clinical trial" + }, + "Study Cases": { + "ground_truth": 64.0, + "prediction": 66 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Outpatient youth aged 12\u201317 years who had at least one first-degree relative with bipolar I disorder; TEASAP subscale Akathisia, hyperkinesis and somatic anxiety", + "prediction": "Youth aged 12-17 years with a first-degree relative with bipolar I disorder treated with escitalopram." + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.017", + "prediction": "= 0.013" + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "HR" + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Multiple groups, Non-Hispanic Caucasian n=44, Hispanic Caucasian n=7, Black or African n = 7, Multiple or other n=8", + "prediction": "Unknown" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + 
"penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 3, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 0.0, + "Study Controls": 1.0, + "Characteristics": 0.6960352422907489, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.5, + "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452390351, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452390346, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "clinical trial" + }, + "Study Cases": { + "ground_truth": 48.0, + "prediction": 66 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Outpatient youth aged 12\u201317 years who had at least one first-degree relative with bipolar I disorder; dose-normalized AUC0\u201324", + "prediction": "Youth aged 12-17 years with a first-degree relative with bipolar I disorder treated with escitalopram." 
+ }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.025", + "prediction": "= 0.015" + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "Unknown" + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Multiple groups, Non-Hispanic Caucasian n=33, Hispanic Caucasian n=6, Black or African n = 6, Multiple or other n=6", + "prediction": "Unknown" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + } + ] + } + } + }, + { + "pmcid": "PMC10946077", + "title": "Individual Irinotecan Therapy Under the Guidance of Pre-Treated UGT1A1*6 Genotyping in Gastric Cancer", + "benchmarks": { + "pheno": { + "total_samples": 1, + "field_scores": { + "Variant/Haplotypes": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Gene": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Drug(s)": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Phenotype Category": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Alleles": { + "mean_score": 0.2222222222222222, + "scores": [ + 0.2222222222222222 + ] + }, + "Is/Is Not associated": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Direction of effect": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + }, + "Phenotype": { + "mean_score": 0.2549019607843137, + "scores": [ + 
0.2549019607843137 + ] + }, + "When treated with/exposed to/when assayed with": { + "mean_score": 1.0, + "scores": [ + 1.0 + ] + }, + "Comparison Allele(s) or Genotype(s)": { + "mean_score": 0.0, + "scores": [ + 0.0 + ] + } + }, + "overall_score": 0.40359477124183013, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Variant/Haplotypes": 1.0, + "Gene": 1.0, + "Drug(s)": 1.0, + "Phenotype Category": 0.0, + "Alleles": 0.2222222222222222, + "Is/Is Not associated": 0.0, + "Direction of effect": 0.0, + "Phenotype": 0.2549019607843137, + "When treated with/exposed to/when assayed with": 1.0, + "Comparison Allele(s) or Genotype(s)": 0.0 + }, + "field_values": { + "Variant/Haplotypes": { + "ground_truth": "UGT1A1*6", + "prediction": "UGT1A1*6" + }, + "Gene": { + "ground_truth": "UGT1A1", + "prediction": "UGT1A1" + }, + "Drug(s)": { + "ground_truth": "irinotecan", + "prediction": "irinotecan" + }, + "Phenotype Category": { + "ground_truth": "Efficacy", + "prediction": "toxicity" + }, + "Alleles": { + "ground_truth": "*6 + *28", + "prediction": "AA, GA, GG" + }, + "Is/Is Not associated": { + "ground_truth": "Not associated with", + "prediction": "Associated with" + }, + "Direction of effect": { + "ground_truth": "decreased", + "prediction": "increased" + }, + "Phenotype": { + "ground_truth": "Efficacy:Overall survival, Efficacy:Progression-free survival", + "prediction": "delayed diarrhea, leukopenia, neutropenia" + }, + "When treated with/exposed to/when assayed with": { + "ground_truth": "when treated with", + "prediction": "when treated with" + }, + "Comparison Allele(s) or Genotype(s)": { + "ground_truth": "*1", + "prediction": null + } + }, + "dependency_issues": [] + } + ] + }, + "drug": { + "overall_score": 0.0, + "total_samples": 0, + "status": "one_empty" + }, + "fa": { + "overall_score": 0.0, + "total_samples": 0, + "status": "one_empty" + }, + "study_parameters": { + "total_samples": 5, + "field_scores": { + "Study Type": { + "mean_score": 0.0, 
+ "scores": [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "Study Cases": { + "mean_score": 0.76, + "scores": [ + 1.0, + 1.0, + 0.9, + 0.9, + 0.0 + ] + }, + "Study Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Characteristics": { + "mean_score": 0.19383119568381332, + "scores": [ + 0.1038961038961039, + 0.17857142857142858, + 0.25, + 0.21238938053097345, + 0.22429906542056074 + ] + }, + "Characteristics Type": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Cases": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Allele of Frequency in Controls": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "P Value": { + "mean_score": 0.49000000000000005, + "scores": [ + 0.95, + 0.5, + 0.5, + 0.5, + 0.0 + ] + }, + "Ratio Stat Type": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "Ratio Stat": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Confidence Interval Start": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Confidence Interval Stop": { + "mean_score": 1.0, + "scores": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ] + }, + "Biogeographical Groups": { + "mean_score": 0.0, + "scores": [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + }, + "overall_score": 0.6962554130455876, + "detailed_results": [ + { + "sample_id": 0, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 1.0, + "Study Controls": 1.0, + "Characteristics": 0.1038961038961039, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, 
+ "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.95, + "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452426880, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452426860, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "retrospective" + }, + "Study Cases": { + "ground_truth": 110.0, + "prediction": 110 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "OS for *6", + "prediction": "Patients treated with irinotecan, differentiated by UGT1A1 genotype." + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.703", + "prediction": "= 0.6821" + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "HR" + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Unknown", + "prediction": "East Asian" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 1, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 1.0, + "Study 
Controls": 1.0, + "Characteristics": 0.17857142857142858, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.5, + "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452426891, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452426882, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "retrospective" + }, + "Study Cases": { + "ground_truth": 110.0, + "prediction": 110 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Diarrhea grade 3-4", + "prediction": "Patients with advanced gastric cancer receiving irinotecan treatment based on UGT1A1 genotype." + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.136", + "prediction": "= 0.5249" + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "HR" + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Unknown", + "prediction": "East Asian" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + 
"issues_by_field": {} + } + }, + { + "sample_id": 2, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 0.9, + "Study Controls": 1.0, + "Characteristics": 0.25, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.5, + "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452426968, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452426961, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "retrospective" + }, + "Study Cases": { + "ground_truth": 110.0, + "prediction": 107 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Leukopenia grade 3-4", + "prediction": "Patients with UGT1A1 genotype receiving irinotecan treatment and analysis of adverse events." 
+ }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.003", + "prediction": "< 0.003" + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "Unknown" + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Unknown", + "prediction": "East Asian" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 3, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 0.9, + "Study Controls": 1.0, + "Characteristics": 0.21238938053097345, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.5, + "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452426980, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452426961, + "prediction": null + }, + "Study Type": { + "ground_truth": null, + "prediction": "retrospective" + }, + "Study Cases": { + "ground_truth": 110.0, + "prediction": 107 + }, + "Study Controls": { + "ground_truth": null, + 
"prediction": null + }, + "Characteristics": { + "ground_truth": "Neutropenia grade 3-4", + "prediction": "Patients with UGT1A1 genotype receiving irinotecan treatment and analysis of adverse events." + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "< 0.001", + "prediction": "< 0.000" + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "Unknown" + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Unknown", + "prediction": "East Asian" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + }, + { + "sample_id": 4, + "field_scores": { + "Study Parameters ID": 0.0, + "Variant Annotation ID": 0.0, + "Study Type": 0.0, + "Study Cases": 0.0, + "Study Controls": 1.0, + "Characteristics": 0.22429906542056074, + "Characteristics Type": 1.0, + "Frequency in Cases": 1.0, + "Allele of Frequency in Cases": 1.0, + "Frequency in Controls": 1.0, + "Allele of Frequency in Controls": 1.0, + "P Value": 0.0, + "Ratio Stat Type": 0.0, + "Ratio Stat": 1.0, + "Confidence Interval Start": 1.0, + "Confidence Interval Stop": 1.0, + "Biogeographical Groups": 0.0 + }, + "field_values": { + "Study Parameters ID": { + "ground_truth": 1452426889, + "prediction": null + }, + "Variant Annotation ID": { + "ground_truth": 1452426882, + "prediction": null + }, + 
"Study Type": { + "ground_truth": null, + "prediction": "retrospective" + }, + "Study Cases": { + "ground_truth": 110.0, + "prediction": 5 + }, + "Study Controls": { + "ground_truth": null, + "prediction": null + }, + "Characteristics": { + "ground_truth": "Neutopenia grade 3-4", + "prediction": "Patients with UGT1A1*6 AA genotype switched to paclitaxel due to severe adverse events." + }, + "Characteristics Type": { + "ground_truth": "Study Cohort", + "prediction": "Study Cohort" + }, + "Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Cases": { + "ground_truth": null, + "prediction": null + }, + "Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "Allele of Frequency in Controls": { + "ground_truth": null, + "prediction": null + }, + "P Value": { + "ground_truth": "= 0.136", + "prediction": null + }, + "Ratio Stat Type": { + "ground_truth": null, + "prediction": "Unknown" + }, + "Ratio Stat": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Start": { + "ground_truth": null, + "prediction": null + }, + "Confidence Interval Stop": { + "ground_truth": null, + "prediction": null + }, + "Biogeographical Groups": { + "ground_truth": "Unknown", + "prediction": "East Asian" + } + }, + "dependency_issues": [], + "penalty_info": { + "total_penalty": 0.0, + "penalized_fields": {}, + "issues_by_field": {} + } + } + ] + } + } + } + ] +} \ No newline at end of file diff --git a/src/benchmark/annotation_benchmark.py b/src/benchmark/annotation_benchmark.py index 862d223..772e860 100644 --- a/src/benchmark/annotation_benchmark.py +++ b/src/benchmark/annotation_benchmark.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional import json from src.utils import get_pmcid_annotation from src.benchmark.pheno_benchmark import evaluate_phenotype_annotations @@ -17,31 +17,49 @@ def get_var_drug_ann_score(self, var_drug_ann: List[dict]): except Exception: return 1.0 
- def get_var_pheno_ann_score(self, var_pheno_ann: List[dict], pmcid: str): - # Load ground truth annotations - with open("persistent_data/benchmark_annotations.json", "r") as f: - ground_truth_data = json.load(f) - - # Get ground truth for this PMCID - if pmcid not in ground_truth_data: - return 0.0 - - ground_truth_pheno_ann = ground_truth_data[pmcid].get("var_pheno_ann", []) + def get_var_pheno_ann_score( + self, + var_pheno_ann: List[dict], + pmcid: Optional[str] = None, + ground_truth_pheno_ann: Optional[List[dict]] = None, + ): + """ + Get phenotype annotation score. + + Args: + var_pheno_ann: Prediction annotations + pmcid: PMCID for loading ground truth (for backward compatibility) + ground_truth_pheno_ann: Ground truth annotations (preferred, passed directly) + """ + # If ground truth is provided directly, use it (preferred) + if ground_truth_pheno_ann is not None: + gt_list = ground_truth_pheno_ann + elif pmcid is not None: + # Fallback: load from file (for backward compatibility) + with open("persistent_data/benchmark_annotations.json", "r") as f: + ground_truth_data = json.load(f) + + if pmcid not in ground_truth_data: + return 0.0 + + gt_list = ground_truth_data[pmcid].get("var_pheno_ann", []) + else: + raise ValueError( + "Either ground_truth_pheno_ann or pmcid must be provided" + ) # If both are empty, perfect score - if not var_pheno_ann and not ground_truth_pheno_ann: + if not var_pheno_ann and not gt_list: return 1.0 # If one is empty but not the other, score is 0 - if not var_pheno_ann or not ground_truth_pheno_ann: + if not var_pheno_ann or not gt_list: return 0.0 # Compare: [ground_truth, prediction] try: - score = evaluate_phenotype_annotations( - [ground_truth_pheno_ann, var_pheno_ann] - ) - return score / 100.0 + result = evaluate_phenotype_annotations([gt_list, var_pheno_ann]) + return float(result.get("overall_score", 0.0)) except Exception: return 0.0 diff --git a/src/benchmark/drug_benchmark.py b/src/benchmark/drug_benchmark.py index 
09a4a25..fa73bd4 100644 --- a/src/benchmark/drug_benchmark.py +++ b/src/benchmark/drug_benchmark.py @@ -2,27 +2,59 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Dict, List, Any, Optional, Tuple from difflib import SequenceMatcher -import numpy as np import re -from sentence_transformers import SentenceTransformer - - -_model: Optional[SentenceTransformer] = None - - -def _get_model() -> SentenceTransformer: - global _model - if _model is None: - _model = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO") - return _model - - -def evaluate_drug_annotations(samples: List[Dict[str, Any]]) -> Dict[str, Any]: +from src.benchmark.shared_utils import ( + exact_match, + semantic_similarity, + category_equal, + variant_substring_match, + compute_weighted_score, + parse_variant_list, + normalize_variant, +) + + +def validate_drug_dependencies(annotation: Dict[str, Any]) -> List[str]: + """Validate field dependencies for drug annotations.""" + issues: List[str] = [] + + # Direction of effect requires Is/Is Not associated = "Associated with" + direction = annotation.get("Direction of effect") + association = annotation.get("Is/Is Not associated") + if direction and association != "Associated with": + issues.append("Direction of effect requires 'Associated with' status") + + # Comparison Allele(s) requires Variant/Haplotypes + comparison_alleles = annotation.get("Comparison Allele(s) or Genotype(s)") + variants = annotation.get("Variant/Haplotypes") + if comparison_alleles and not variants: + issues.append( + "Variant/Haplotypes required when Comparison Allele(s) is specified" + ) + + # Multiple drugs And/or should be consistent with Drug(s) presence + multiple_drugs_op = annotation.get("Multiple drugs And/or") + drugs = annotation.get("Drug(s)") + if multiple_drugs_op and not drugs: + issues.append("Drug(s) field should be present when Multiple drugs And/or is specified") + + return issues + + +def evaluate_drug_annotations( + samples: List[Dict[str, Any]], + 
field_weights: Optional[Dict[str, float]] = None, +) -> Dict[str, Any]: """ Parallel benchmark for drug entries. Input is a list with exactly two dicts: - samples[0] = ground truth annotation dict - samples[1] = prediction annotation dict + + Args: + samples: [ground_truth_dict, prediction_dict] + field_weights: Optional dict mapping field names to weights for weighted scoring. + If None, all fields are weighted equally (unweighted mean). """ if not isinstance(samples, list) or len(samples) != 2: @@ -35,19 +67,6 @@ def evaluate_drug_annotations(samples: List[Dict[str, Any]]) -> Dict[str, Any]: "Both items must be dicts: [ground_truth_dict, prediction_dict]." ) - # Variant expansion and alignment (mirroring FA) - def parse_variant_list(variants_text: Optional[str]) -> List[str]: - if not variants_text: - return [] - tokens = re.split(r"[,;|\s]+(?:\+\s*)?", variants_text) - return [t.strip() for t in tokens if t and t.strip()] - - def normalize_variant(variant: str) -> str: - v = variant.strip() - if v.lower().startswith("rs"): - return v.lower() - return re.sub(r"\s+", "", v) - def expand_annotations_by_variant( annotations: List[Dict[str, Any]], ) -> List[Dict[str, Any]]: @@ -122,8 +141,6 @@ def align_by_variant( "detailed_results": [], } - model = _get_model() - def normalize_drug_name(name: str) -> str: # lowercase, strip, collapse whitespace, standardize separators n = name.lower().strip() @@ -153,37 +170,7 @@ def parse_drug_list(value: Optional[str]) -> List[str]: unique.append(t) return unique - def exact_match(gt_val: Any, pred_val: Any) -> float: - if gt_val is None and pred_val is None: - return 1.0 - if gt_val is None or pred_val is None: - return 0.0 - return ( - 1.0 if str(gt_val).strip().lower() == str(pred_val).strip().lower() else 0.0 - ) - - def semantic_similarity(gt_val: Any, pred_val: Any) -> float: - if gt_val is None and pred_val is None: - return 1.0 - if gt_val is None or pred_val is None: - return 0.0 - gt_str = str(gt_val).strip() - pred_str 
= str(pred_val).strip() - if gt_str == pred_str: - return 1.0 - try: - embeddings = model.encode([gt_str, pred_str]) - gt_embedding = embeddings[0] - pred_embedding = embeddings[1] - similarity = float( - np.dot(gt_embedding, pred_embedding) - / (np.linalg.norm(gt_embedding) * np.linalg.norm(pred_embedding)) - ) - return similarity - except Exception: - return SequenceMatcher(None, gt_str.lower(), pred_str.lower()).ratio() - - def variant_substring_match(gt_val: Any, pred_val: Any) -> float: + def variant_substring_match_with_phenotype_groups(gt_val: Any, pred_val: Any) -> float: """Return 1.0 if GT substring appears in prediction (case-insensitive). Also accept canonical gene-phenotype group labels via category equality. """ @@ -209,28 +196,6 @@ def variant_substring_match(gt_val: Any, pred_val: Any) -> float: return 1.0 if not pred_str else 0.0 return 1.0 if gt_str in pred_str else 0.0 - def parse_allele_tokens(text: Optional[str]) -> List[str]: - if not text: - return [] - # split on '+' and whitespace and commas/semicolons - parts = re.split(r"[+/,;\s]+", str(text)) - tokens = [p.strip().lower() for p in parts if p and p.strip()] - return tokens - - def alleles_set_coverage(gt_val: Any, pred_val: Any) -> float: - """Order-insensitive coverage for allele/group fields. - Scores fraction of GT tokens present in Pred tokens (1.0 if both empty). - """ - gt_tokens = parse_allele_tokens(gt_val) - pred_tokens = parse_allele_tokens(pred_val) - if not gt_tokens and not pred_tokens: - return 1.0 - if not gt_tokens or not pred_tokens: - return 0.0 - pred_set = set(pred_tokens) - covered = sum(1 for t in gt_tokens if t in pred_set) - return covered / len(gt_tokens) - def drugs_coverage(gt: Dict[str, Any], pred: Dict[str, Any]) -> float: """Operator-aware coverage for Drug(s). Uses `Multiple drugs And/or` to decide coverage rule. Defaults to 'or' if missing. 
@@ -275,22 +240,13 @@ def token_match(g: str, p: str) -> bool: return max(frac, 1.0 if any(covered) else 0.0) # Map evaluators to drug schema fields; Drug(s) handled separately below. - def category_equal(a: Any, b: Any) -> float: - a_norm = re.sub(r"\s+", " ", str(a).strip().lower()) if a is not None else None - b_norm = re.sub(r"\s+", " ", str(b).strip().lower()) if b is not None else None - if a_norm is None and b_norm is None: - return 1.0 - if a_norm is None or b_norm is None: - return 0.0 - return 1.0 if a_norm == b_norm else 0.0 - field_evaluators = { - "Variant/Haplotypes": variant_substring_match, + "Variant/Haplotypes": variant_substring_match_with_phenotype_groups, "Gene": semantic_similarity, "PMID": exact_match, "Phenotype Category": category_equal, "Significance": category_equal, - "Alleles": alleles_set_coverage, + "Alleles": semantic_similarity, # Changed to semantic similarity "Specialty Population": semantic_similarity, "Metabolizer types": semantic_similarity, "isPlural": category_equal, @@ -301,7 +257,7 @@ def category_equal(a: Any, b: Any) -> float: "Population types": semantic_similarity, "Population Phenotypes or diseases": semantic_similarity, "Multiple phenotypes or diseases And/or": category_equal, - "Comparison Allele(s) or Genotype(s)": alleles_set_coverage, + "Comparison Allele(s) or Genotype(s)": semantic_similarity, # Changed to semantic similarity "Comparison Metabolizer types": semantic_similarity, } @@ -331,12 +287,66 @@ def category_equal(a: Any, b: Any) -> float: results["detailed_results"] = [] for i, (g, p) in enumerate(zip(gt_list, pred_list)): - sample_result: Dict[str, Any] = {"sample_id": i, "field_scores": {}} + sample_result: Dict[str, Any] = {"sample_id": i, "field_scores": {}, "field_values": {}} for field, evaluator in field_evaluators.items(): sample_result["field_scores"][field] = evaluator(g.get(field), p.get(field)) + # Store actual values for display + sample_result["field_values"][field] = { + "ground_truth": 
g.get(field), + "prediction": p.get(field) + } sample_result["field_scores"]["Drug(s)"] = drugs_coverage(g, p) - # No dependency penalties wired yet for drug entries; can be added later if needed - sample_result["dependency_issues"] = [] + # Also store Drug(s) values + sample_result["field_values"]["Drug(s)"] = { + "ground_truth": g.get("Drug(s)"), + "prediction": p.get("Drug(s)") + } + + # Dependency validation + dependency_issues = validate_drug_dependencies(p) + sample_result["dependency_issues"] = dependency_issues + + # Track penalty information + penalty_info = { + 'total_penalty': 0.0, + 'penalized_fields': {}, + 'issues_by_field': {} + } + + if dependency_issues: + penalty_per_issue = 0.05 + total_penalty = min(len(dependency_issues) * penalty_per_issue, 0.3) + penalty_info['total_penalty'] = total_penalty + fields_to_penalize = set() + for issue in dependency_issues: + affected_fields = [] + if "Direction" in issue or "Associated" in issue: + affected_fields = ["Direction of effect", "Is/Is Not associated"] + elif "Variant" in issue or "Comparison" in issue: + affected_fields = ["Variant/Haplotypes", "Comparison Allele(s) or Genotype(s)"] + elif "Drug" in issue or "Multiple drugs" in issue: + affected_fields = ["Drug(s)"] + else: + affected_fields = list(sample_result["field_scores"].keys()) + + for field in affected_fields: + fields_to_penalize.add(field) + if field not in penalty_info['issues_by_field']: + penalty_info['issues_by_field'][field] = [] + penalty_info['issues_by_field'][field].append(issue) + + for field in fields_to_penalize: + if field in sample_result["field_scores"]: + original_score = sample_result["field_scores"][field] + penalized_score = original_score * (1 - total_penalty) + sample_result["field_scores"][field] = penalized_score + penalty_info['penalized_fields'][field] = { + 'original_score': original_score, + 'penalized_score': penalized_score, + 'penalty_percentage': total_penalty * 100 + } + + sample_result['penalty_info'] = 
penalty_info results["detailed_results"].append(sample_result) for field in list(field_evaluators.keys()) + ["Drug(s)"]: @@ -346,8 +356,7 @@ def category_equal(a: Any, b: Any) -> float: "scores": field_scores, } - field_means = [v["mean_score"] for v in results["field_scores"].values()] - results["overall_score"] = ( - sum(field_means) / len(field_means) if field_means else 0.0 - ) + # Compute overall score with optional field weights + field_mean_scores = {k: v["mean_score"] for k, v in results["field_scores"].items()} + results["overall_score"] = compute_weighted_score(field_mean_scores, field_weights) return results diff --git a/src/benchmark/fa_benchmark.py b/src/benchmark/fa_benchmark.py index 77388df..1933387 100644 --- a/src/benchmark/fa_benchmark.py +++ b/src/benchmark/fa_benchmark.py @@ -1,18 +1,13 @@ from typing import Dict, List, Any, Optional, Tuple from difflib import SequenceMatcher -import numpy as np import re -from sentence_transformers import SentenceTransformer - - -_model: Optional[SentenceTransformer] = None - - -def _get_model() -> SentenceTransformer: - global _model - if _model is None: - _model = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO") - return _model +from src.benchmark.shared_utils import ( + exact_match, + semantic_similarity, + category_equal, + variant_substring_match, + compute_weighted_score, +) def parse_variant_list(variants_text: Optional[str]) -> List[str]: @@ -131,7 +126,7 @@ def evaluate_fa_from_articles( "status": "no_overlap_after_alignment", } - results = _evaluate_functional_analysis_pairs(aligned_gt, aligned_pred, None) + results = _evaluate_functional_analysis_pairs(aligned_gt, aligned_pred, None, None) results["aligned_variants"] = display results["status"] = "ok" return results @@ -221,7 +216,10 @@ def validate_all_dependencies( return issues -def evaluate_functional_analysis(samples: List[Dict[str, Any]]) -> Dict[str, Any]: +def evaluate_functional_analysis( + samples: List[Dict[str, Any]], + 
field_weights: Optional[Dict[str, float]] = None, +) -> Dict[str, Any]: """ Evaluate FA when provided a list with exactly two dicts: - samples[0] = ground truth annotation dict @@ -229,6 +227,8 @@ def evaluate_functional_analysis(samples: List[Dict[str, Any]]) -> Dict[str, Any Args: samples: [ground_truth_dict, prediction_dict] + field_weights: Optional dict mapping field names to weights for weighted scoring. + If None, all fields are weighted equally (unweighted mean). Returns: Dict with overall and per-field scores. @@ -246,46 +246,15 @@ def evaluate_functional_analysis(samples: List[Dict[str, Any]]) -> Dict[str, Any gt_list: List[Dict[str, Any]] = [gt] pred_list: List[Dict[str, Any]] = [pred] - return _evaluate_functional_analysis_pairs(gt_list, pred_list, None) + return _evaluate_functional_analysis_pairs(gt_list, pred_list, None, field_weights) def _evaluate_functional_analysis_pairs( gt_list: List[Dict[str, Any]], pred_list: List[Dict[str, Any]], study_parameters: Optional[List[Dict[str, Any]]], + field_weights: Optional[Dict[str, float]] = None, ) -> Dict[str, Any]: - model = _get_model() - - def exact_match(gt_val: Any, pred_val: Any) -> float: - if gt_val is None and pred_val is None: - return 1.0 - if gt_val is None or pred_val is None: - return 0.0 - return ( - 1.0 if str(gt_val).strip().lower() == str(pred_val).strip().lower() else 0.0 - ) - - def semantic_similarity(gt_val: Any, pred_val: Any) -> float: - if gt_val is None and pred_val is None: - return 1.0 - if gt_val is None or pred_val is None: - return 0.0 - gt_str = str(gt_val).strip() - pred_str = str(pred_val).strip() - if gt_str == pred_str: - return 1.0 - try: - embeddings = model.encode([gt_str, pred_str]) - gt_embedding = embeddings[0] - pred_embedding = embeddings[1] - similarity = float( - np.dot(gt_embedding, pred_embedding) - / (np.linalg.norm(gt_embedding) * np.linalg.norm(pred_embedding)) - ) - return similarity - except Exception: - return SequenceMatcher(None, gt_str.lower(), 
pred_str.lower()).ratio() - def variant_coverage(gt_variants: str, pred_variants: str) -> float: if not gt_variants or not pred_variants: return 1.0 if not gt_variants and not pred_variants else 0.0 @@ -336,63 +305,24 @@ def is_star_allele(variant: str) -> bool: covered_count += 1 return covered_count / len(gt_list_filtered) - def variant_substring_match(gt_val: Any, pred_val: Any) -> float: - if gt_val is None and pred_val is None: - return 1.0 - if gt_val is None or pred_val is None: - return 0.0 - gt_str = str(gt_val).strip().lower() - pred_str = str(pred_val).strip().lower() - if not gt_str: - return 1.0 if not pred_str else 0.0 - return 1.0 if gt_str in pred_str else 0.0 - field_evaluators = { "Variant/Haplotypes": variant_substring_match, "Gene": semantic_similarity, "Drug(s)": semantic_similarity, "PMID": exact_match, - "Phenotype Category": lambda gt, pred: ( - 1.0 - if (gt and pred and gt.lower().strip() == pred.lower().strip()) - else (1.0 if not gt and not pred else 0.0) - ), - "Significance": lambda gt, pred: ( - 1.0 - if (gt and pred and gt.lower().strip() == pred.lower().strip()) - else (1.0 if not gt and not pred else 0.0) - ), + "Phenotype Category": category_equal, + "Significance": category_equal, "Alleles": semantic_similarity, "Specialty Population": semantic_similarity, "Assay type": semantic_similarity, "Metabolizer types": semantic_similarity, - "isPlural": lambda gt, pred: ( - 1.0 - if (gt and pred and gt.lower().strip() == pred.lower().strip()) - else (1.0 if not gt and not pred else 0.0) - ), - "Is/Is Not associated": lambda gt, pred: ( - 1.0 - if (gt and pred and gt.lower().strip() == pred.lower().strip()) - else (1.0 if not gt and not pred else 0.0) - ), - "Direction of effect": lambda gt, pred: ( - 1.0 - if (gt and pred and gt.lower().strip() == pred.lower().strip()) - else (1.0 if not gt and not pred else 0.0) - ), + "isPlural": category_equal, + "Is/Is Not associated": category_equal, + "Direction of effect": category_equal, 
"Functional terms": semantic_similarity, "Gene/gene product": semantic_similarity, - "When treated with/exposed to/when assayed with": lambda gt, pred: ( - 1.0 - if (gt and pred and gt.lower().strip() == pred.lower().strip()) - else (1.0 if not gt and not pred else 0.0) - ), - "Multiple drugs And/or": lambda gt, pred: ( - 1.0 - if (gt and pred and gt.lower().strip() == pred.lower().strip()) - else (1.0 if not gt and not pred else 0.0) - ), + "When treated with/exposed to/when assayed with": category_equal, + "Multiple drugs And/or": category_equal, "Cell type": semantic_similarity, "Comparison Allele(s) or Genotype(s)": semantic_similarity, "Comparison Metabolizer types": semantic_similarity, @@ -415,39 +345,64 @@ def variant_substring_match(gt_val: Any, pred_val: Any) -> float: results["detailed_results"] = [] for i, (gt, pred) in enumerate(zip(gt_list, pred_list)): - sample_result: Dict[str, Any] = {"sample_id": i, "field_scores": {}} + sample_result: Dict[str, Any] = {"sample_id": i, "field_scores": {}, "field_values": {}} for field, evaluator in field_evaluators.items(): sample_result["field_scores"][field] = evaluator( gt.get(field), pred.get(field) ) + # Store actual values for display + sample_result["field_values"][field] = { + "ground_truth": gt.get(field), + "prediction": pred.get(field) + } dependency_issues = validate_all_dependencies(pred, study_parameters) sample_result["dependency_issues"] = dependency_issues + + # Track penalty information + penalty_info = { + 'total_penalty': 0.0, + 'penalized_fields': {}, + 'issues_by_field': {} + } + if dependency_issues: penalty_per_issue = 0.05 total_penalty = min(len(dependency_issues) * penalty_per_issue, 0.3) + penalty_info['total_penalty'] = total_penalty fields_to_penalize = set() for issue in dependency_issues: + affected_fields = [] if "Gene" in issue or "gene" in issue: - fields_to_penalize.update(["Gene", "Gene/gene product"]) + affected_fields = ["Gene", "Gene/gene product"] elif "Variant" in issue or 
"variant" in issue: - fields_to_penalize.update( - ["Variant/Haplotypes", "Comparison Allele(s) or Genotype(s)"] - ) + affected_fields = ["Variant/Haplotypes", "Comparison Allele(s) or Genotype(s)"] elif "Direction" in issue or "Associated" in issue: - fields_to_penalize.update( - ["Direction of effect", "Is/Is Not associated"] - ) + affected_fields = ["Direction of effect", "Is/Is Not associated"] elif "Functional" in issue: - fields_to_penalize.update(["Functional terms", "Gene/gene product"]) + affected_fields = ["Functional terms", "Gene/gene product"] elif "rsID" in issue or "star allele" in issue: - fields_to_penalize.add("Variant/Haplotypes") + affected_fields = ["Variant/Haplotypes"] else: - fields_to_penalize.update(sample_result["field_scores"].keys()) + affected_fields = list(sample_result["field_scores"].keys()) + + for field in affected_fields: + fields_to_penalize.add(field) + if field not in penalty_info['issues_by_field']: + penalty_info['issues_by_field'][field] = [] + penalty_info['issues_by_field'][field].append(issue) + for field in fields_to_penalize: - original_score = sample_result["field_scores"][field] - sample_result["field_scores"][field] = original_score * ( - 1 - total_penalty - ) + if field in sample_result["field_scores"]: + original_score = sample_result["field_scores"][field] + penalized_score = original_score * (1 - total_penalty) + sample_result["field_scores"][field] = penalized_score + penalty_info['penalized_fields'][field] = { + 'original_score': original_score, + 'penalized_score': penalized_score, + 'penalty_percentage': total_penalty * 100 + } + + sample_result['penalty_info'] = penalty_info results["detailed_results"].append(sample_result) for field in list(field_evaluators.keys()): @@ -457,8 +412,6 @@ def variant_substring_match(gt_val: Any, pred_val: Any) -> float: "scores": field_scores, } - field_means = [v["mean_score"] for v in results["field_scores"].values()] - results["overall_score"] = ( - sum(field_means) / 
len(field_means) if field_means else 0.0 - ) + field_mean_scores = {k: v["mean_score"] for k, v in results["field_scores"].items()} + results["overall_score"] = compute_weighted_score(field_mean_scores, field_weights) return results diff --git a/src/benchmark/pheno_benchmark.py b/src/benchmark/pheno_benchmark.py index b1a301f..f27659c 100644 --- a/src/benchmark/pheno_benchmark.py +++ b/src/benchmark/pheno_benchmark.py @@ -1,6 +1,118 @@ -from typing import List, Dict, Any, Tuple, Set -from dataclasses import dataclass +from typing import List, Dict, Any, Tuple, Set, Optional import re +from src.benchmark.shared_utils import ( + semantic_similarity, + category_equal, + variant_substring_match, + compute_weighted_score, + parse_variant_list, + normalize_variant, +) + + +def expand_pheno_annotations_by_variant( + annotations: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Expand annotations with multiple variants into separate records.""" + expanded: List[Dict[str, Any]] = [] + for ann in annotations: + variants_field = ann.get("Variant/Haplotypes") + tokens = parse_variant_list(variants_field) + if len(tokens) <= 1: + expanded.append(ann) + continue + for tok in tokens: + new_ann = dict(ann) + new_ann["Variant/Haplotypes"] = normalize_variant(tok) + new_ann["_expanded_from_multi_variant"] = True + expanded.append(new_ann) + return expanded + + +def align_pheno_annotations_by_variant( + ground_truth_list: List[Dict[str, Any]], + predictions_list: List[Dict[str, Any]], +) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """ + Align pheno annotations by variant string with robust matching: + 1) Expand multi-variant records to one per variant + 2) Prefer rsID intersection; fallback to normalized substring containment + 3) If no variant match, fallback to Gene + Drug(s) matching + Returns aligned (gt_list, pred_list) + """ + rs_re = re.compile(r"rs\d+", re.IGNORECASE) + + gt_expanded = expand_pheno_annotations_by_variant(ground_truth_list or []) + 
pred_expanded = expand_pheno_annotations_by_variant(predictions_list or []) + + pred_index: List[Tuple[set, str, Dict[str, Any]]] = [] + for rec in pred_expanded: + raw = (rec.get("Variant/Haplotypes") or "").strip() + raw_norm = normalize_variant(raw).lower() + rsids = set(m.group(0).lower() for m in rs_re.finditer(raw)) + pred_index.append((rsids, raw_norm, rec)) + + aligned_gt: List[Dict[str, Any]] = [] + aligned_pred: List[Dict[str, Any]] = [] + matched_pred_indices: Set[int] = set() + + # First pass: try variant-based matching + for gt_rec in gt_expanded: + gt_raw = (gt_rec.get("Variant/Haplotypes") or "").strip() + gt_norm = normalize_variant(gt_raw).lower() + gt_rs = set(m.group(0).lower() for m in rs_re.finditer(gt_raw)) + + match_idx = None + match_rec = None + + if gt_rs: + for idx, (rsids, raw_norm, pred_rec) in enumerate(pred_index): + if idx not in matched_pred_indices and rsids & gt_rs: + match_idx = idx + match_rec = pred_rec + break + if match_idx is None and gt_norm: + for idx, (rsids, raw_norm, pred_rec) in enumerate(pred_index): + if idx not in matched_pred_indices and (gt_norm in raw_norm or raw_norm in gt_norm): + match_idx = idx + match_rec = pred_rec + break + + if match_idx is not None: + aligned_gt.append(gt_rec) + aligned_pred.append(match_rec) + matched_pred_indices.add(match_idx) + + # Second pass: for unmatched GT records, try Gene + Drug(s) matching + for gt_idx, gt_rec in enumerate(gt_expanded): + if gt_idx < len(aligned_gt) and aligned_gt[gt_idx] == gt_rec: + continue # Already matched + + gt_gene = str(gt_rec.get("Gene", "")).strip().lower() + gt_drug = str(gt_rec.get("Drug(s)", "")).strip().lower() + + if not gt_gene and not gt_drug: + continue + + # Try to find match by Gene + Drug + for idx, (_, _, pred_rec) in enumerate(pred_index): + if idx in matched_pred_indices: + continue + + pred_gene = str(pred_rec.get("Gene", "")).strip().lower() + pred_drug = str(pred_rec.get("Drug(s)", "")).strip().lower() + + # Match if Gene matches 
and (Drug matches or both are empty) + gene_match = gt_gene and pred_gene and gt_gene == pred_gene + drug_match = (not gt_drug and not pred_drug) or (gt_drug and pred_drug and gt_drug == pred_drug) + + if gene_match and drug_match: + aligned_gt.append(gt_rec) + aligned_pred.append(pred_rec) + matched_pred_indices.add(idx) + break + + return aligned_gt, aligned_pred class PhenotypeAnnotationBenchmark: @@ -20,8 +132,8 @@ class PhenotypeAnnotationBenchmark: "Comparison Allele(s) or Genotype(s)", ] - # Fields with weighted importance - FIELD_WEIGHTS = { + # Default field weights (can be overridden via parameter) + DEFAULT_FIELD_WEIGHTS = { "Phenotype": 2.0, "Drug(s)": 1.5, "Direction of effect": 2.0, @@ -34,7 +146,7 @@ class PhenotypeAnnotationBenchmark: "Comparison Allele(s) or Genotype(s)": 1.0, } - def __init__(self, matching_threshold: float = 0.7): + def __init__(self, matching_threshold: float = 0.3): """ Initialize benchmark. @@ -43,119 +155,83 @@ def __init__(self, matching_threshold: float = 0.7): """ self.matching_threshold = matching_threshold - def _normalize_value(self, value: Any) -> str: - """Normalize a field value for comparison.""" - if value is None: - return "" - - # Convert to string and normalize - s = str(value).lower().strip() - - # Remove extra whitespace - s = re.sub(r"\s+", " ", s) - - # Remove punctuation variations - s = re.sub(r"[,;]+", "", s) - - return s - - def _compare_field(self, pred_value: Any, gt_value: Any) -> float: - """ - Compare two field values and return similarity score (0-1). 
- - Args: - pred_value: Predicted value - gt_value: Ground truth value - - Returns: - Similarity score between 0 and 1 - """ - pred_norm = self._normalize_value(pred_value) - ground_truth_norm = self._normalize_value(gt_value) - - # Both empty or None - if not pred_norm and not ground_truth_norm: - return 1.0 - - # One empty, one not - if not pred_norm or not ground_truth_norm: - return 0.0 - - # Exact match - if pred_norm == ground_truth_norm: - return 1.0 - - # Check if one contains the other (useful for partial matches) - if pred_norm in ground_truth_norm or ground_truth_norm in pred_norm: - return 0.8 - - # The Jaccard index is particularly useful when the presence or absence of elements - # in the sets is more important than their frequency or order. - # could be used to help check for multiple entries put in one annotation? - pred_tokens = set(pred_norm.split()) - gt_tokens = set(ground_truth_norm.split()) - - if pred_tokens and gt_tokens: - intersection = len(pred_tokens & gt_tokens) - union = len(pred_tokens | gt_tokens) - jaccard = intersection / union if union > 0 else 0.0 - return jaccard - - return 0.0 - - def _compare_annotations(self, pred: Dict[str, Any], gt: Dict[str, Any]) -> float: + def _get_field_evaluator(self, field: str): + """Get the appropriate evaluator function for a field.""" + # Map fields to evaluators using shared utilities + field_evaluators = { + "Variant/Haplotypes": variant_substring_match, + "Gene": semantic_similarity, + "Drug(s)": semantic_similarity, + "Phenotype Category": category_equal, + "Alleles": semantic_similarity, + "Is/Is Not associated": category_equal, + "Direction of effect": category_equal, + "Phenotype": semantic_similarity, + "When treated with/exposed to/when assayed with": semantic_similarity, + "Comparison Allele(s) or Genotype(s)": semantic_similarity, + } + return field_evaluators.get(field, semantic_similarity) + + def _compare_annotations( + self, pred: Dict[str, Any], gt: Dict[str, Any], field_weights: 
Dict[str, float] + ) -> Tuple[float, Dict[str, float]]: """ Compare a predicted annotation with a ground truth annotation. Args: pred: Predicted annotation gt: Ground truth annotation + field_weights: Field weights for scoring Returns: - Float ranging from 0 - 1 denoting similarity + Tuple of (matching_score, field_scores_dict) """ field_scores = {} - weighted_sum = 0.0 - total_weight = 0.0 for field in self.CORE_FIELDS: - weight = self.FIELD_WEIGHTS.get(field, 1.0) - similarity = self._compare_field(pred.get(field), gt.get(field)) - + evaluator = self._get_field_evaluator(field) + similarity = evaluator(pred.get(field), gt.get(field)) field_scores[field] = similarity - weighted_sum += similarity * weight - total_weight += weight # Calculate weighted average - matching_score = weighted_sum / total_weight + matching_score = compute_weighted_score(field_scores, field_weights) - return matching_score + return matching_score, field_scores def _find_best_matches( - self, predictions: List[Dict[str, Any]], ground_truths: List[Dict[str, Any]] - ) -> List[Tuple[int, int, float]]: + self, + predictions: List[Dict[str, Any]], + ground_truths: List[Dict[str, Any]], + field_weights: Dict[str, float], + ) -> List[Tuple[int, int, float, Dict[str, float]]]: """ Find best matches between predictions and ground truths. 
Returns: - List of (pred_idx, gt_idx, score) tuples sorted by score descending + List of (pred_idx, gt_idx, score, field_scores) tuples sorted by score descending """ matches = [] for pred_idx, pred in enumerate(predictions): for gt_idx, gt in enumerate(ground_truths): - match_score = self._compare_annotations(pred, gt) + match_score, field_scores = self._compare_annotations( + pred, gt, field_weights + ) if match_score >= self.matching_threshold: - matches.append((pred_idx, gt_idx, match_score)) + matches.append((pred_idx, gt_idx, match_score, field_scores)) # Sort by score descending matches.sort(key=lambda x: x[2], reverse=True) return matches - def evaluate(self, samples: List[Any]) -> float: + def evaluate( + self, + samples: List[Any], + field_weights: Optional[Dict[str, float]] = None, + ) -> Dict[str, Any]: """ - Evaluate predictions against ground truths and return similarity score. + Evaluate predictions against ground truths and return detailed results. Handles both single annotation pairs and lists of annotations. @@ -163,9 +239,11 @@ def evaluate(self, samples: List[Any]) -> float: samples: List with exactly 2 items: - [ground_truth_dict, prediction_dict] for single comparison - [ground_truth_list, prediction_list] for multiple comparisons + field_weights: Optional dict mapping field names to weights for weighted scoring. + If None, uses DEFAULT_FIELD_WEIGHTS. 
Returns: - Similarity score between 0 and 1 + Dict with field_scores, overall_score (0-1 scale), detailed_results, total_samples """ if not isinstance(samples, list) or len(samples) != 2: raise ValueError( @@ -189,66 +267,142 @@ def evaluate(self, samples: List[Any]) -> float: ) if not gt_list or not pred_list: - return 0.0 - - # Find all potential matches - all_matches = self._find_best_matches(pred_list, gt_list) + return { + "total_samples": 0, + "field_scores": {}, + "overall_score": 0.0, + "detailed_results": [], + } + + # Align by variant first (similar to FA/Drug benchmarks) + aligned_gt, aligned_pred = align_pheno_annotations_by_variant(gt_list, pred_list) + + if not aligned_gt: + return { + "total_samples": 0, + "field_scores": {}, + "overall_score": 0.0, + "detailed_results": [], + "status": "no_overlap_after_alignment", + } + + # Use provided field weights or default + weights = ( + field_weights if field_weights is not None else self.DEFAULT_FIELD_WEIGHTS + ) + + # Find all potential matches from aligned pairs + all_matches = self._find_best_matches(aligned_pred, aligned_gt, weights) # Greedily assign matches (allowing many-to-one mapping) matched_preds: Set[int] = set() - matched_gts: Set[int] = set() - match_scores = [] + matched_pairs: List[ + Tuple[Dict[str, Any], Dict[str, Any], float, Dict[str, float]] + ] = [] - for pred_idx, gt_idx, score in all_matches: + for pred_idx, gt_idx, score, field_scores in all_matches: # Allow multiple predictions to match same ground truth (many-to-one) # but each prediction can only match once (one-to-one from pred side) if pred_idx not in matched_preds: matched_preds.add(pred_idx) - matched_gts.add(gt_idx) - match_scores.append(score) + matched_pairs.append( + (aligned_gt[gt_idx], aligned_pred[pred_idx], score, field_scores) + ) + + # Build detailed results structure + results: Dict[str, Any] = { + "total_samples": len(matched_pairs), + "field_scores": {}, + "overall_score": 0.0, + "detailed_results": [], + } + + # 
Compute field scores for matched pairs + for field in self.CORE_FIELDS: + field_scores_list = [] + for gt, pred, _, field_scores_dict in matched_pairs: + field_scores_list.append(field_scores_dict.get(field, 0.0)) + + if field_scores_list: + results["field_scores"][field] = { + "mean_score": sum(field_scores_list) / len(field_scores_list), + "scores": field_scores_list, + } + else: + results["field_scores"][field] = { + "mean_score": 0.0, + "scores": [], + } + + # Build detailed results for each matched pair + for i, (gt, pred, match_score, field_scores_dict) in enumerate(matched_pairs): + sample_result: Dict[str, Any] = { + "sample_id": i, + "field_scores": field_scores_dict.copy(), + "field_values": {}, + "dependency_issues": [], # Placeholder for future dependency validation + } + + # Store actual values for display + for field in self.CORE_FIELDS: + sample_result["field_values"][field] = { + "ground_truth": gt.get(field), + "prediction": pred.get(field) + } + + results["detailed_results"].append(sample_result) + + # Recompute field scores from detailed results (after any penalties) + for field in self.CORE_FIELDS: + field_scores = [ + s["field_scores"].get(field, 0.0) for s in results["detailed_results"] + ] + if field_scores: + results["field_scores"][field] = { + "mean_score": sum(field_scores) / len(field_scores), + "scores": field_scores, + } - # Calculate average similarity across all ground truths - # Matched GTs contribute their match score - # Unmatched GTs contribute 0 - total_score = sum(match_scores) - total_possible = len(gt_list) + # Compute overall score with field weights + field_mean_scores = { + k: v["mean_score"] for k, v in results["field_scores"].items() + } + results["overall_score"] = compute_weighted_score(field_mean_scores, weights) - return total_score / total_possible + return results def evaluate_phenotype_annotations( - samples: List[Any], matching_threshold: float = 0.7 -) -> float: + samples: List[Any], + field_weights: 
Optional[Dict[str, float]] = None, + matching_threshold: float = 0.3, +) -> Dict[str, Any]: """ - Benchmark phenotype annotations and return an aggregate similarity score. - - Handles both single annotation pairs and lists of annotations. + Benchmark phenotype annotations and return detailed results. Args: samples: List with exactly 2 items: - [ground_truth_dict, prediction_dict] for single comparison - [ground_truth_list, prediction_list] for multiple comparisons + field_weights: Optional dict mapping field names to weights for weighted scoring. + If None, uses default weights. matching_threshold: Minimum similarity score to consider a match (0-1) Returns: - Similarity score between 0-100 representing how well prediction(s) - match ground truth(s) across all fields. + Dict with field_scores, overall_score (0-1 scale), detailed_results, total_samples Examples: # Single annotation pair >>> ground_truth = {"Phenotype": "sensitivity", "Drug(s)": "etoposide", ...} >>> prediction = {"Phenotype": "sensitivity", "Drug(s)": "etoposide", ...} - >>> score = benchmark_phenotype_annotations([ground_truth, prediction]) - >>> print(f"Model Score: {score:.1f}/100") + >>> result = evaluate_phenotype_annotations([ground_truth, prediction]) + >>> print(f"Overall Score: {result['overall_score']:.3f}") # Multiple annotations >>> ground_truths = [gt1, gt2, gt3] >>> predictions = [pred1, pred2] - >>> score = benchmark_phenotype_annotations([ground_truths, predictions]) - >>> print(f"Model Score: {score:.1f}/100") + >>> result = evaluate_phenotype_annotations([ground_truths, predictions]) + >>> print(f"Overall Score: {result['overall_score']:.3f}") """ benchmark = PhenotypeAnnotationBenchmark(matching_threshold=matching_threshold) - similarity = benchmark.evaluate(samples) - - # Return as 0-100 scale - return similarity * 100 + return benchmark.evaluate(samples, field_weights=field_weights) diff --git a/src/benchmark/run_benchmark_examples.py b/src/benchmark/run_benchmark_examples.py new 
def load_data_files():
    """Load the LLM prediction file and the ground-truth annotation file.

    Both files live under ``persistent_data/`` relative to the project root.

    Returns:
        Tuple ``(llm_data, gt_data)`` of dicts keyed by PMCID.
    """
    base_path = Path(__file__).parent.parent.parent / "persistent_data"

    llm_file = base_path / "combined_output_11_02_25.json"
    gt_file = base_path / "benchmark_annotations.json"

    print(f"Loading LLM predictions from: {llm_file}")
    with open(llm_file, "r") as f:
        llm_data = json.load(f)

    print(f"Loading ground truth from: {gt_file}")
    with open(gt_file, "r") as f:
        gt_data = json.load(f)

    return llm_data, gt_data


def find_common_pmcids(llm_data: Dict, gt_data: Dict, num_examples: int = 5) -> List[str]:
    """Return up to ``num_examples`` PMCIDs present in both data sets, sorted."""
    common = sorted(set(llm_data) & set(gt_data))

    print(f"\nFound {len(common)} common PMCIDs")
    print(f"Selecting first {min(num_examples, len(common))} examples")

    return common[:num_examples]


def run_benchmark(
    benchmark_func,
    gt_list: List[Dict[str, Any]],
    pred_list: List[Dict[str, Any]],
    benchmark_name: str,
    accepts_lists: bool = False,
) -> Optional[Dict[str, Any]]:
    """Run a single benchmark over one GT/prediction pair of annotation lists.

    Args:
        benchmark_func: Callable taking ``[ground_truth, prediction]``.
        gt_list: Ground-truth annotations (possibly empty).
        pred_list: Predicted annotations (possibly empty).
        benchmark_name: Human-readable name used in error reporting.
        accepts_lists: True when ``benchmark_func`` can align whole lists
            itself; otherwise only the first annotation of each side is compared.

    Returns:
        The benchmark's result dict, or a synthetic status dict when one or
        both sides are empty, or when the benchmark raised.
    """
    if not gt_list and not pred_list:
        # Nothing to extract and nothing extracted: count as a perfect match.
        return {
            "overall_score": 1.0,
            "total_samples": 0,
            "status": "both_empty",
        }

    if not gt_list or not pred_list:
        return {
            "overall_score": 0.0,
            "total_samples": 0,
            "status": "one_empty",
        }

    try:
        if accepts_lists:
            # Pheno and Study Parameters benchmarks align the lists themselves.
            return benchmark_func([gt_list, pred_list])
        # Drug/FA benchmarks expect single dicts; compare the first annotation
        # of each side (can be extended to compare all pairs).
        # NOTE: a previous emptiness re-check here was unreachable — both
        # lists are known non-empty at this point — and has been removed.
        return benchmark_func([gt_list[0], pred_list[0]])
    except Exception as e:
        print(f"  Error in {benchmark_name}: {e}")
        import traceback
        traceback.print_exc()
        return {
            "overall_score": 0.0,
            "total_samples": 0,
            "status": "error",
            "error": str(e),
        }


def create_aggregated_summary(bench_result: Dict[str, Any], bench_name: str) -> Dict[str, Any]:
    """Collapse a benchmark result into a compact summary without per-sample dumps.

    NOTE: this function was previously defined twice verbatim in this module
    (the second definition silently shadowed the first); the duplicate has
    been removed.

    Args:
        bench_result: Full result dict from one benchmark run.
        bench_name: Benchmark key (``"pheno"``, ``"drug"``, ``"fa"``,
            ``"study_parameters"``); controls which ID fields are hidden.

    Returns:
        Dict with overall score, per-field means, low-scoring fields (with the
        GT/prediction values that produced them), and any dependency issues /
        penalties aggregated across samples.
    """
    aggregated = {
        "overall_score": bench_result.get("overall_score", 0.0),
        "total_samples": bench_result.get("total_samples", 0),
    }

    status = bench_result.get("status")
    if status:
        aggregated["status"] = status

    # Per-field mean scores.
    field_scores = bench_result.get("field_scores", {})
    if field_scores:
        aggregated["field_scores"] = {
            field: {
                "mean_score": score_data.get("mean_score", 0.0),
                "scores": score_data.get("scores", []),
            }
            for field, score_data in field_scores.items()
        }

    detailed_results = bench_result.get("detailed_results", [])
    if detailed_results and field_scores:
        # ID fields are bookkeeping, not extraction quality; hide them for
        # study_parameters.
        excluded_from_display = set()
        if bench_name == "study_parameters":
            excluded_from_display = {'Study Parameters ID', 'Variant Annotation ID'}

        sorted_fields = sorted(
            field_scores.items(),
            key=lambda x: x[1].get("mean_score", 0.0),
            reverse=True,
        )
        low_scoring_fields = [
            f for f in sorted_fields
            if f[1].get("mean_score", 1.0) < 1.0
            and f[0] not in excluded_from_display
        ]

        if low_scoring_fields:
            aggregated["low_scoring_fields"] = {}
            for field, score_data in low_scoring_fields:
                field_info = {
                    "mean_score": score_data.get("mean_score", 0.0),
                    "scores": score_data.get("scores", []),
                }
                # Collect the concrete GT/prediction values from every sample.
                sample_values = [
                    dr.get("field_values", {})[field]
                    for dr in detailed_results
                    if field in dr.get("field_values", {})
                ]
                if sample_values:
                    field_info["sample_values"] = sample_values
                aggregated["low_scoring_fields"][field] = field_info

        # Aggregate dependency issues and penalty information across samples.
        all_issues = []
        all_penalties = []
        for dr in detailed_results:
            all_issues.extend(dr.get("dependency_issues", []))
            penalty_info = dr.get("penalty_info", {})
            if penalty_info and penalty_info.get("total_penalty", 0) > 0:
                all_penalties.append(penalty_info)

        if all_issues:
            aggregated["dependency_issues"] = list(set(all_issues))  # unique issues

        if all_penalties:
            aggregated["penalties"] = [
                {
                    "total_penalty": p.get("total_penalty", 0.0),
                    "penalized_fields": list(p.get("penalized_fields", {}).keys()),
                    "issues_count": len(p.get("issues_by_field", {})),
                }
                for p in all_penalties
            ]

    return aggregated


def evaluate_pmcid(
    pmcid: str, llm_data: Dict, gt_data: Dict
) -> Dict[str, Any]:
    """Run every benchmark type for a single PMCID and collect the results."""
    llm_entry = llm_data.get(pmcid, {})
    gt_entry = gt_data.get(pmcid, {})

    results = {
        "pmcid": pmcid,
        "title": gt_entry.get("title", llm_entry.get("title", "Unknown")),
        "benchmarks": {},
    }

    # Pheno benchmark (aligns lists itself).
    results["benchmarks"]["pheno"] = run_benchmark(
        evaluate_phenotype_annotations,
        gt_entry.get("var_pheno_ann", []),
        llm_entry.get("var_pheno_ann", []),
        "Pheno",
        accepts_lists=True,
    )

    # Drug benchmark (expects single dicts).
    results["benchmarks"]["drug"] = run_benchmark(
        evaluate_drug_annotations,
        gt_entry.get("var_drug_ann", []),
        llm_entry.get("var_drug_ann", []),
        "Drug",
        accepts_lists=False,
    )

    # FA benchmark (expects single dicts).
    results["benchmarks"]["fa"] = run_benchmark(
        evaluate_functional_analysis,
        gt_entry.get("var_fa_ann", []),
        llm_entry.get("var_fa_ann", []),
        "FA",
        accepts_lists=False,
    )

    # Study Parameters benchmark (handles lists via similarity-based alignment).
    results["benchmarks"]["study_parameters"] = run_benchmark(
        evaluate_study_parameters,
        gt_entry.get("study_parameters", []),
        llm_entry.get("study_parameters", []),
        "Study Parameters",
        accepts_lists=True,
    )

    return results
print_summary(results: Dict[str, Any]): + """Print a summary of results to console.""" + pmcid = results["pmcid"] + title = results["title"] + + print(f"\n{'='*60}") + print(f"=== {pmcid} ===") + print(f"Title: {title}") + print(f"{'='*60}") + + for bench_name, bench_result in results["benchmarks"].items(): + if bench_result is None: + continue + + overall = bench_result.get("overall_score", 0.0) + total_samples = bench_result.get("total_samples", 0) + status = bench_result.get("status") + + print(f"\n{bench_name.upper()} Benchmark:") + print(f" Overall Score: {overall:.3f}") + print(f" Aligned Samples: {total_samples}") + + if status: + print(f" Status: {status}") + + # Get detailed results early for value display + detailed_results = bench_result.get("detailed_results", []) + + # Print field score analysis + field_scores = bench_result.get("field_scores", {}) + if field_scores: + # Sort by mean_score + sorted_fields = sorted( + field_scores.items(), + key=lambda x: x[1].get("mean_score", 0.0), + reverse=True, + ) + + # Show top 3 performing fields + top_fields = sorted_fields[:3] + if top_fields: + top_fields_str = ", ".join( + [ + f"{field} ({score['mean_score']:.2f})" + for field, score in top_fields + ] + ) + print(f" Top Performing Fields: {top_fields_str}") + + # Show bottom performing fields (causing score reduction) + # For study_parameters, exclude ID fields from display + excluded_from_display = set() + if bench_name == "study_parameters": + excluded_from_display = {'Study Parameters ID', 'Variant Annotation ID'} + + bottom_fields = [ + f for f in sorted_fields + if f[1].get("mean_score", 1.0) < 1.0 + and f[0] not in excluded_from_display + ] + + if bottom_fields: + print(f" Low Scoring Fields (reducing overall score):") + for field, score_data in bottom_fields: + mean_score = score_data.get("mean_score", 0.0) + scores = score_data.get("scores", []) + # Show range if multiple samples + if len(scores) > 1: + min_score = min(scores) + max_score = 
max(scores) + print(f" • {field}: {mean_score:.3f} (range: {min_score:.3f}-{max_score:.3f})") + else: + print(f" • {field}: {mean_score:.3f}") + + # Show actual values for misaligned fields + if detailed_results: + # Get values from first sample (or aggregate if multiple) + field_values_list = [] + for dr in detailed_results: + field_values = dr.get("field_values", {}) + if field in field_values: + field_values_list.append(field_values[field]) + + if field_values_list: + # Show values from first sample, or aggregate if multiple + if len(field_values_list) == 1: + vals = field_values_list[0] + gt_val = vals.get("ground_truth") + pred_val = vals.get("prediction") + gt_str = str(gt_val) if gt_val is not None else "None" + pred_str = str(pred_val) if pred_val is not None else "None" + # Truncate long values + if len(gt_str) > 60: + gt_str = gt_str[:57] + "..." + if len(pred_str) > 60: + pred_str = pred_str[:57] + "..." + print(f" GT: {gt_str}") + print(f" Pred: {pred_str}") + else: + # Show values per sample when multiple samples exist + print(f" ({len(field_values_list)} samples)") + for sample_idx, vals in enumerate(field_values_list[:3]): # Show first 3 samples + gt_val = vals.get("ground_truth") + pred_val = vals.get("prediction") + gt_str = str(gt_val) if gt_val is not None else "None" + pred_str = str(pred_val) if pred_val is not None else "None" + # Truncate long values + if len(gt_str) > 50: + gt_str = gt_str[:47] + "..." + if len(pred_str) > 50: + pred_str = pred_str[:47] + "..." + print(f" Sample {sample_idx}: GT={gt_str}, Pred={pred_str}") + if len(field_values_list) > 3: + print(f" ... 
({len(field_values_list) - 3} more samples)") + + # Print dependency issues and penalty information if present + all_issues = [] + all_penalties = [] + + for dr in detailed_results: + issues = dr.get("dependency_issues", []) + all_issues.extend(issues) + + penalty_info = dr.get("penalty_info", {}) + if penalty_info and penalty_info.get("total_penalty", 0) > 0: + all_penalties.append(penalty_info) + + if all_issues: + unique_issues = list(set(all_issues))[:5] # Show first 5 unique issues + print(f" Dependency Issues: {len(all_issues)} total") + for issue in unique_issues: + print(f" - {issue}") + + # Print detailed penalty information + if all_penalties: + print(f" Penalties Applied: {len(all_penalties)} sample(s) penalized") + for i, penalty_info in enumerate(all_penalties[:2]): # Show first 2 samples + total_penalty = penalty_info.get("total_penalty", 0) + penalized_fields = penalty_info.get("penalized_fields", {}) + issues_by_field = penalty_info.get("issues_by_field", {}) + + if penalized_fields: + print(f" Sample {i+1}: {total_penalty*100:.1f}% penalty applied") + for field, penalty_data in list(penalized_fields.items())[:3]: # Show top 3 penalized fields + orig = penalty_data.get("original_score", 0) + penal = penalty_data.get("penalized_score", 0) + pct = penalty_data.get("penalty_percentage", 0) + field_issues = issues_by_field.get(field, []) + + print(f" • {field}:") + print(f" Score: {orig:.3f} → {penal:.3f} ({pct:.1f}% reduction)") + if field_issues: + for issue in field_issues[:1]: # Show first issue for this field + print(f" Reason: {issue}") + + # Show field-level score breakdown for better understanding + if field_scores and total_samples > 0: + # Calculate which fields contribute most to score reduction + perfect_fields = sum(1 for f, s in field_scores.items() if abs(s.get("mean_score", 0) - 1.0) < 0.001) + total_fields = len(field_scores) + if perfect_fields < total_fields: + imperfect_count = total_fields - perfect_fields + print(f" Field 
Performance: {perfect_fields}/{total_fields} fields perfect, {imperfect_count} fields with mismatches") + + +def main(num_examples: int = 5): + """Main function to run benchmark examples.""" + print("=" * 60) + print("Benchmark Evaluation Script") + print("=" * 60) + + # Load data + llm_data, gt_data = load_data_files() + + # Find common PMCIDs + common_pmcids = find_common_pmcids(llm_data, gt_data, num_examples) + + if not common_pmcids: + print("No common PMCIDs found!") + return + + # Evaluate each PMCID + all_results = [] + for pmcid in common_pmcids: + print(f"\nEvaluating {pmcid}...") + results = evaluate_pmcid(pmcid, llm_data, gt_data) + all_results.append(results) + print_summary(results) + + # Save results to file + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + # From src/benchmark/, go up 3 levels to project root, then to persistent_data + output_file = Path(__file__).parent.parent.parent / "persistent_data" / f"benchmark_results_{timestamp}.json" + + output_data = { + "timestamp": timestamp, + "num_examples": len(all_results), + "results": all_results, + } + + with open(output_file, "w") as f: + json.dump(output_data, f, indent=2) + + print(f"\n{'='*60}") + print(f"Results saved to: {output_file}") + print(f"{'='*60}") + + # Print overall summary + print("\nOverall Summary:") + for bench_name in ["pheno", "drug", "fa", "study_parameters"]: + scores = [] + for r in all_results: + bench_result = r["benchmarks"].get(bench_name) + if bench_result: + # Exclude entries with "one_empty" status from average + status = bench_result.get("status") + if status != "one_empty": + scores.append(bench_result.get("overall_score", 0.0)) + if scores: + avg_score = sum(scores) / len(scores) + print(f" {bench_name.upper()}: {avg_score:.3f} (avg across {len(scores)} examples, excluding 'one_empty')") + else: + print(f" {bench_name.upper()}: No valid scores (all had 'one_empty' status)") + + +if __name__ == "__main__": + import argparse + + parser = 
argparse.ArgumentParser(description="Run benchmark examples") + parser.add_argument( + "-n", + "--num-examples", + type=int, + default=5, + help="Number of examples to evaluate (default: 5)", + ) + + args = parser.parse_args() + main(num_examples=args.num_examples) + diff --git a/src/benchmark/shared_utils.py b/src/benchmark/shared_utils.py new file mode 100644 index 0000000..85b6177 --- /dev/null +++ b/src/benchmark/shared_utils.py @@ -0,0 +1,164 @@ +"""Shared utilities for benchmark evaluation functions.""" +from typing import Any, Optional, Dict, List +from difflib import SequenceMatcher +import numpy as np +import re +from sentence_transformers import SentenceTransformer + + +_model: Optional[SentenceTransformer] = None + + +def _get_model() -> SentenceTransformer: + """Get or initialize the PubMedBERT model.""" + global _model + if _model is None: + _model = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO") + return _model + + +def exact_match(gt_val: Any, pred_val: Any) -> float: + """Exact string match - case and whitespace insensitive.""" + if gt_val is None and pred_val is None: + return 1.0 + if gt_val is None or pred_val is None: + return 0.0 + return ( + 1.0 if str(gt_val).strip().lower() == str(pred_val).strip().lower() else 0.0 + ) + + +def semantic_similarity(gt_val: Any, pred_val: Any) -> float: + """Semantic similarity using PubMedBERT embeddings.""" + if gt_val is None and pred_val is None: + return 1.0 + if gt_val is None or pred_val is None: + return 0.0 + gt_str = str(gt_val).strip() + pred_str = str(pred_val).strip() + if gt_str == pred_str: + return 1.0 + try: + model = _get_model() + embeddings = model.encode([gt_str, pred_str]) + gt_embedding = embeddings[0] + pred_embedding = embeddings[1] + similarity = float( + np.dot(gt_embedding, pred_embedding) + / (np.linalg.norm(gt_embedding) * np.linalg.norm(pred_embedding)) + ) + return similarity + except Exception: + return SequenceMatcher(None, gt_str.lower(), pred_str.lower()).ratio() + 
def category_equal(a: Any, b: Any) -> float:
    """Categorical equality: 1.0 when both values normalise to the same string.

    Normalisation lowercases, strips, and collapses internal whitespace.
    Two None values match; a one-sided None does not.
    """
    if a is None and b is None:
        return 1.0
    if a is None or b is None:
        return 0.0

    def _canon(value: Any) -> str:
        return re.sub(r"\s+", " ", str(value).strip().lower())

    return 1.0 if _canon(a) == _canon(b) else 0.0


def parse_numeric(value: Any) -> Optional[float]:
    """Coerce a value to float, tolerating commas, '$', whitespace and scientific notation.

    Returns None when the value is missing or not parseable as a number.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value)
    if not isinstance(value, str):
        return None
    stripped = re.sub(r'[,\s$]', '', value.strip())
    try:
        return float(stripped)
    except ValueError:
        return None


def numeric_tolerance_match(
    gt_val: Any,
    pred_val: Any,
    exact_weight: float = 1.0,
    tolerance_5pct: float = 0.9,
    tolerance_10pct: float = 0.8,
) -> float:
    """Compare two numbers with graded tolerance bands.

    Exact equality scores ``exact_weight``; within 5% relative error scores
    ``tolerance_5pct``; within 10% scores ``tolerance_10pct``; otherwise 0.0.
    Both-None scores 1.0; one-sided None or an unequal zero scores 0.0.
    """
    gt_num = parse_numeric(gt_val)
    pred_num = parse_numeric(pred_val)

    if gt_num is None or pred_num is None:
        return 1.0 if gt_num is None and pred_num is None else 0.0

    # Zero needs special handling: relative error is undefined.
    if gt_num == 0 or pred_num == 0:
        return 1.0 if gt_num == pred_num else 0.0

    if gt_num == pred_num:
        return exact_weight

    rel_err = abs(gt_num - pred_num) / abs(gt_num)
    if rel_err <= 0.05:
        return tolerance_5pct
    if rel_err <= 0.10:
        return tolerance_10pct
    return 0.0


def parse_variant_list(variants_text: Optional[str]) -> List[str]:
    """Split a free-text variant list on commas, semicolons, pipes and whitespace."""
    if not variants_text:
        return []
    pieces = re.split(r"[,;|\s]+(?:\+\s*)?", variants_text)
    return [piece.strip() for piece in pieces if piece and piece.strip()]


def normalize_variant(variant: str) -> str:
    """Canonicalise a variant string: rsIDs are lowercased, others lose all whitespace."""
    trimmed = variant.strip()
    lowered = trimmed.lower()
    if lowered.startswith("rs"):
        return lowered
    return re.sub(r"\s+", "", trimmed)


def variant_substring_match(gt_val: Any, pred_val: Any) -> float:
    """1.0 when the normalised GT text occurs inside the normalised prediction text."""
    if gt_val is None and pred_val is None:
        return 1.0
    if gt_val is None or pred_val is None:
        return 0.0

    def _collapse(value: Any) -> str:
        return re.sub(r"\s+", " ", str(value).strip().lower())

    gt_text = _collapse(gt_val)
    pred_text = _collapse(pred_val)
    if not gt_text:
        return 1.0 if not pred_text else 0.0
    return 1.0 if gt_text in pred_text else 0.0


def compute_weighted_score(
    field_scores: Dict[str, float],
    field_weights: Optional[Dict[str, float]] = None,
) -> float:
    """Weighted (or plain) mean of per-field scores.

    Fields missing from ``field_weights`` default to weight 1.0; with no
    weights given this is a simple average. Empty input scores 0.0.
    """
    if not field_scores:
        return 0.0

    if field_weights is None:
        return sum(field_scores.values()) / len(field_scores)

    weighted_total = 0.0
    weight_total = 0.0
    for name, score in field_scores.items():
        weight = field_weights.get(name, 1.0)
        weighted_total += score * weight
        weight_total += weight

    return weighted_total / weight_total if weight_total > 0 else 0.0
def _compute_study_parameters_similarity(
    gt_rec: Dict[str, Any],
    pred_rec: Dict[str, Any],
) -> float:
    """
    Compute similarity score between a ground truth and prediction study parameter entry.

    Uses the same field evaluators as the actual evaluation, but excludes ID fields
    so that alignment still works when 'Variant Annotation ID' is null.
    """
    # numeric_tolerance_match's defaults are exactly (exact_weight=1.0,
    # tolerance_5pct=0.9, tolerance_10pct=0.8), so it is referenced directly
    # instead of being wrapped in identical lambdas.
    field_evaluators = {
        'Study Type': category_equal,
        'Study Cases': numeric_tolerance_match,
        'Study Controls': numeric_tolerance_match,
        'Characteristics': semantic_similarity,
        'Characteristics Type': category_equal,
        'Frequency in Cases': numeric_tolerance_match,
        'Allele of Frequency in Cases': semantic_similarity,
        'Frequency in Controls': numeric_tolerance_match,
        'Allele of Frequency in Controls': semantic_similarity,
        'P Value': p_value_match,
        'Ratio Stat Type': category_equal,
        'Ratio Stat': numeric_tolerance_match,
        'Confidence Interval Start': numeric_tolerance_match,
        'Confidence Interval Stop': numeric_tolerance_match,
        'Biogeographical Groups': category_equal,
    }

    scores = [
        evaluator(gt_rec.get(field), pred_rec.get(field))
        for field, evaluator in field_evaluators.items()
    ]
    # Mean of all field scores.
    return sum(scores) / len(scores) if scores else 0.0


def align_study_parameters_by_similarity(
    ground_truth_list: List[Dict[str, Any]],
    predictions_list: List[Dict[str, Any]],
    matching_threshold: float = 0.3,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Align study parameters into one-to-one (gt, pred) pairs.

    Records sharing a 'Variant Annotation ID' are paired first; the remainder
    are paired greedily by descending field similarity, keeping only pairs at
    or above ``matching_threshold``. Works even when IDs are null.
    """
    if not ground_truth_list or not predictions_list:
        return [], []

    aligned_gt: List[Dict[str, Any]] = []
    aligned_pred: List[Dict[str, Any]] = []
    matched_gt_indices: set = set()
    matched_pred_indices: set = set()

    # Index predictions by Variant Annotation ID (an ID may appear more than once).
    pred_by_id: Dict[Any, List[Tuple[int, Dict[str, Any]]]] = {}
    for idx, pred_rec in enumerate(predictions_list):
        variant_id = pred_rec.get('Variant Annotation ID')
        if variant_id is not None:
            pred_by_id.setdefault(variant_id, []).append((idx, pred_rec))

    # Pass 1: exact Variant Annotation ID matches.
    for gt_idx, gt_rec in enumerate(ground_truth_list):
        variant_id = gt_rec.get('Variant Annotation ID')
        if variant_id is not None and variant_id in pred_by_id:
            # Use the first still-unmatched prediction carrying this ID.
            for pred_idx, pred_rec in pred_by_id[variant_id]:
                if pred_idx not in matched_pred_indices:
                    aligned_gt.append(gt_rec)
                    aligned_pred.append(pred_rec)
                    matched_gt_indices.add(gt_idx)
                    matched_pred_indices.add(pred_idx)
                    break

    # Pass 2: similarity-based matching for whatever is left.
    remaining_gt = [
        (idx, rec) for idx, rec in enumerate(ground_truth_list)
        if idx not in matched_gt_indices
    ]
    remaining_pred = [
        (idx, rec) for idx, rec in enumerate(predictions_list)
        if idx not in matched_pred_indices
    ]

    if not remaining_gt or not remaining_pred:
        return aligned_gt, aligned_pred

    # All pairwise similarities at or above the threshold.
    similarity_scores: List[Tuple[int, int, float]] = []
    for gt_idx, gt_rec in remaining_gt:
        for pred_idx, pred_rec in remaining_pred:
            similarity = _compute_study_parameters_similarity(gt_rec, pred_rec)
            if similarity >= matching_threshold:
                similarity_scores.append((gt_idx, pred_idx, similarity))

    # Greedy one-to-one assignment, best pairs first.
    similarity_scores.sort(key=lambda x: x[2], reverse=True)
    for gt_idx, pred_idx, _score in similarity_scores:
        if gt_idx not in matched_gt_indices and pred_idx not in matched_pred_indices:
            aligned_gt.append(ground_truth_list[gt_idx])
            aligned_pred.append(predictions_list[pred_idx])
            matched_gt_indices.add(gt_idx)
            matched_pred_indices.add(pred_idx)

    return aligned_gt, aligned_pred


def align_study_parameters_by_variant_id(
    ground_truth_list: List[Dict[str, Any]],
    predictions_list: List[Dict[str, Any]],
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Align study parameters by Variant Annotation ID (legacy function, kept for compatibility).

    Falls back to similarity-based alignment when no ID matches are found.
    """
    aligned_gt: List[Dict[str, Any]] = []
    aligned_pred: List[Dict[str, Any]] = []

    # Prediction record by Variant Annotation ID (later records win on duplicates).
    pred_by_id: Dict[Any, Dict[str, Any]] = {}
    for pred_rec in predictions_list:
        variant_id = pred_rec.get('Variant Annotation ID')
        if variant_id is not None:
            pred_by_id[variant_id] = pred_rec

    # Pair each ground-truth record with the prediction sharing its ID.
    for gt_rec in ground_truth_list:
        variant_id = gt_rec.get('Variant Annotation ID')
        if variant_id is not None and variant_id in pred_by_id:
            aligned_gt.append(gt_rec)
            aligned_pred.append(pred_by_id[variant_id])

    if not aligned_gt:
        return align_study_parameters_by_similarity(ground_truth_list, predictions_list)

    return aligned_gt, aligned_pred
def parse_p_value(pval_str: Any) -> Tuple[Optional[str], Optional[float]]:
    """Split a P-value string into its comparison operator and numeric part.

    Returns (None, None) for missing or blank input. When no explicit
    operator is present, '=' is assumed.
    """
    if pval_str is None:
        return None, None
    text = str(pval_str).strip()
    if not text:
        return None, None

    # Operator may be <, >, =, ≤, ≥, <= or >=.
    found = re.search(r'([<>=≤≥]=?)', text)
    operator = found.group(1) if found else '='

    # Whatever remains after removing operators/whitespace should be the number.
    value = parse_numeric(re.sub(r'[<>=≤≥\s]', '', text))
    return operator, value


def p_value_match(gt_val: Any, pred_val: Any) -> float:
    """Score P-value agreement: half for the operator, half for the number."""
    gt_op, gt_num = parse_p_value(gt_val)
    pred_op, pred_num = parse_p_value(pred_val)

    if gt_op is None or pred_op is None:
        return 1.0 if gt_op is None and pred_op is None else 0.0

    # Treat ASCII and unicode comparison operators as equivalent.
    canonical = {'<=': '≤', '>=': '≥', '<': '<', '>': '>', '=': '='}
    operator_score = (
        1.0 if canonical.get(gt_op, gt_op) == canonical.get(pred_op, pred_op) else 0.0
    )
    value_score = numeric_tolerance_match(
        gt_num, pred_num, exact_weight=1.0, tolerance_5pct=0.9, tolerance_10pct=0.7
    )
    # Combined: 50% operator, 50% value.
    return 0.5 * operator_score + 0.5 * value_score


def validate_study_parameters_dependencies(
    annotation: Dict[str, Any],
    related_annotations: Optional[List[Dict[str, Any]]] = None,
) -> List[str]:
    """Check that the annotation's Variant Annotation ID exists in the related annotations.

    Returns a list of human-readable issue strings (empty when consistent or
    when either the ID or the related annotations are absent).
    """
    variant_id = annotation.get("Variant Annotation ID")
    if not variant_id or not related_annotations:
        return []
    if any(
        rel.get("Variant Annotation ID") == variant_id
        for rel in related_annotations
    ):
        return []
    return [
        f"Variant Annotation ID {variant_id} not found in related annotations"
    ]


def validate_statistical_consistency(annotation: Dict[str, Any]) -> List[str]:
    """Sanity-check P value, ratio statistic, confidence interval and frequencies.

    Returns a list of human-readable issue strings; empty when no
    inconsistency is detected.
    """
    issues: List[str] = []

    p_op, p_val = parse_p_value(annotation.get("P Value"))
    ratio_stat_type = annotation.get("Ratio Stat Type")
    ratio_stat_num = parse_numeric(annotation.get("Ratio Stat"))
    ci_start_num = parse_numeric(annotation.get("Confidence Interval Start"))
    ci_stop_num = parse_numeric(annotation.get("Confidence Interval Stop"))

    # A significant P value alongside a ratio statistic of exactly 1.0 is suspicious.
    if p_op and ratio_stat_type and ratio_stat_num is not None:
        if p_val is not None and p_val < 0.05 and ratio_stat_num == 1.0:
            issues.append(
                "P value is significant (< 0.05) but Ratio Stat equals 1.0 (may indicate inconsistency)"
            )

    # The confidence interval must be ordered and should bracket the ratio stat.
    if ci_start_num is not None and ci_stop_num is not None:
        if ci_start_num >= ci_stop_num:
            issues.append(
                f"Confidence Interval Start ({ci_start_num}) should be less than Stop ({ci_stop_num})"
            )
        if ratio_stat_num is not None and not (ci_start_num <= ratio_stat_num <= ci_stop_num):
            issues.append(
                f"Ratio Stat ({ratio_stat_num}) should be within Confidence Interval [{ci_start_num}, {ci_stop_num}]"
            )

    # Allele frequencies are proportions and must fall within [0, 1].
    freq_cases = parse_numeric(annotation.get("Frequency in Cases"))
    freq_controls = parse_numeric(annotation.get("Frequency in Controls"))
    study_cases = parse_numeric(annotation.get("Study Cases"))
    study_controls = parse_numeric(annotation.get("Study Controls"))

    if freq_cases is not None and study_cases is not None:
        if not 0 <= freq_cases <= 1:
            issues.append(
                f"Frequency in Cases ({freq_cases}) should be between 0 and 1"
            )

    if freq_controls is not None and study_controls is not None:
        if not 0 <= freq_controls <= 1:
            issues.append(
                f"Frequency in Controls ({freq_controls}) should be between 0 and 1"
            )

    return issues
def evaluate_study_parameters(
    samples: List[Dict[str, Any]],
    field_weights: Optional[Dict[str, float]] = None,
    related_annotations: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """
    Evaluate study parameters when provided a list with exactly two items:
    - samples[0] = ground truth (dict or list of dicts)
    - samples[1] = prediction (dict or list of dicts)

    Args:
        samples: [ground_truth, prediction] where each can be a dict or list of dicts.
        field_weights: Optional dict mapping field names to weights for weighted
            scoring. If None, all fields are weighted equally (unweighted mean).
        related_annotations: Optional list of related annotations for dependency
            validation of each prediction.

    Returns:
        Dict with 'total_samples', per-field 'field_scores' (ID fields excluded),
        'overall_score', and per-sample 'detailed_results' (field scores/values,
        dependency issues, and penalty bookkeeping).

    Raises:
        ValueError: If samples is not a two-item list, or either item is neither
            a dict nor a list of dicts.
    """
    if not isinstance(samples, list) or len(samples) != 2:
        raise ValueError("Expected a list with exactly two items: [ground_truth, prediction].")
    gt, pred = samples[0], samples[1]

    # Normalize both sides to lists of annotation dicts.
    if isinstance(gt, dict):
        gt_list_raw: List[Dict[str, Any]] = [gt]
    elif isinstance(gt, list):
        gt_list_raw = gt
    else:
        raise ValueError("Ground truth must be a dict or list of dicts.")

    if isinstance(pred, dict):
        pred_list_raw: List[Dict[str, Any]] = [pred]
    elif isinstance(pred, list):
        pred_list_raw = pred
    else:
        raise ValueError("Prediction must be a dict or list of dicts.")

    # Use similarity-based alignment since predictions often have null Variant Annotation ID.
    gt_list, pred_list = align_study_parameters_by_similarity(gt_list_raw, pred_list_raw)

    if not gt_list:
        return {'total_samples': 0, 'field_scores': {}, 'overall_score': 0.0, 'detailed_results': []}

    def _numeric(gt_val: Any, pred_val: Any) -> float:
        # Shared tolerance ladder for every numeric field:
        # exact match -> 1.0, within 5% -> 0.9, within 10% -> 0.8.
        return numeric_tolerance_match(
            gt_val, pred_val, exact_weight=1.0, tolerance_5pct=0.9, tolerance_10pct=0.8
        )

    # Map evaluators to study parameters schema fields.
    field_evaluators = {
        'Study Parameters ID': exact_match,
        'Variant Annotation ID': exact_match,
        'Study Type': category_equal,
        'Study Cases': _numeric,
        'Study Controls': _numeric,
        'Characteristics': semantic_similarity,
        'Characteristics Type': category_equal,
        'Frequency in Cases': _numeric,
        'Allele of Frequency in Cases': semantic_similarity,
        'Frequency in Controls': _numeric,
        'Allele of Frequency in Controls': semantic_similarity,
        'P Value': p_value_match,
        'Ratio Stat Type': category_equal,
        'Ratio Stat': _numeric,
        'Confidence Interval Start': _numeric,
        'Confidence Interval Stop': _numeric,
        'Biogeographical Groups': category_equal,
    }

    # ID fields are evaluated for detailed_results but excluded from the
    # aggregate field_scores and the overall score.
    excluded_fields = {'Study Parameters ID', 'Variant Annotation ID'}

    results: Dict[str, Any] = {'total_samples': len(gt_list), 'field_scores': {}, 'overall_score': 0.0}
    results['detailed_results'] = []

    for i, (g, p) in enumerate(zip(gt_list, pred_list)):
        sample_result: Dict[str, Any] = {'sample_id': i, 'field_scores': {}, 'field_values': {}}
        for field, evaluator in field_evaluators.items():
            sample_result['field_scores'][field] = evaluator(g.get(field), p.get(field))
            # Store actual values for display.
            sample_result['field_values'][field] = {
                'ground_truth': g.get(field),
                'prediction': p.get(field)
            }

        # Dependency validation runs on the prediction only.
        dependency_issues = []
        dependency_issues.extend(validate_study_parameters_dependencies(p, related_annotations))
        dependency_issues.extend(validate_statistical_consistency(p))
        sample_result['dependency_issues'] = dependency_issues

        # Track penalty information so reports can show what was penalized and why.
        penalty_info = {
            'total_penalty': 0.0,
            'penalized_fields': {},
            'issues_by_field': {}
        }

        if dependency_issues:
            # 5% penalty per issue, capped at a 30% total reduction.
            penalty_per_issue = 0.05
            total_penalty = min(len(dependency_issues) * penalty_per_issue, 0.3)
            penalty_info['total_penalty'] = total_penalty
            fields_to_penalize = set()
            for issue in dependency_issues:
                lowered = issue.lower()
                # Route each issue to the fields it implicates.
                if "Variant Annotation ID" in issue:
                    affected_fields = ["Variant Annotation ID"]
                # BUG FIX: the original tested `"P value" in issue.lower()`,
                # which can never match (the lowered text contains no capital
                # 'P'), so P-value-only issues fell through to the catch-all
                # branch and penalized every field.
                elif "p value" in lowered or "ratio stat" in lowered:
                    affected_fields = ["P Value", "Ratio Stat", "Ratio Stat Type"]
                elif "confidence interval" in lowered:
                    affected_fields = ["Confidence Interval Start", "Confidence Interval Stop", "Ratio Stat"]
                elif "frequency" in lowered:
                    affected_fields = [
                        "Frequency in Cases",
                        "Frequency in Controls",
                        "Study Cases",
                        "Study Controls",
                    ]
                else:
                    # Unrecognized issue text: penalize all fields.
                    affected_fields = list(sample_result['field_scores'].keys())

                for field in affected_fields:
                    fields_to_penalize.add(field)
                    penalty_info['issues_by_field'].setdefault(field, []).append(issue)

            for field in fields_to_penalize:
                if field in sample_result['field_scores']:
                    original_score = sample_result['field_scores'][field]
                    penalized_score = original_score * (1 - total_penalty)
                    sample_result['field_scores'][field] = penalized_score
                    penalty_info['penalized_fields'][field] = {
                        'original_score': original_score,
                        'penalized_score': penalized_score,
                        'penalty_percentage': total_penalty * 100
                    }

        sample_result['penalty_info'] = penalty_info
        results['detailed_results'].append(sample_result)

    # Aggregate per-field scores from the (possibly penalized) per-sample
    # results, excluding ID fields. NOTE: the original computed an identical
    # pre-penalty pass earlier and then overwrote it here; that redundant pass
    # (which ran every evaluator twice per sample) has been removed.
    for field in field_evaluators:
        if field not in excluded_fields:
            field_scores = [s['field_scores'][field] for s in results['detailed_results']]
            results['field_scores'][field] = {'mean_score': sum(field_scores) / len(field_scores), 'scores': field_scores}

    # Compute overall score with optional field weights (ID fields already excluded).
    field_mean_scores = {
        k: v['mean_score']
        for k, v in results['field_scores'].items()
    }
    results['overall_score'] = compute_weighted_score(field_mean_scores, field_weights)
    return results