DaneshjouLab · shloknatarajan · Aug 12, 2025 · Aug 5, 2025 · Aug 5, 2025 · Aug 5, 2025
diff --git a/data/annotations/PMC11730665.json b/data/annotations/PMC11730665.json
diff --git a/data/annotations/PMC4737107.json b/data/annotations/PMC4737107.json
diff --git a/data/annotations/PMC5712579.json b/data/annotations/PMC5712579.json
diff --git a/data/annotations/PMC5728534.json b/data/annotations/PMC5728534.json
diff --git a/data/annotations/PMC5749368.json b/data/annotations/PMC5749368.json
diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb
@@ -0,0 +1,16 @@
+{
+ "cells": [],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "default",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.13.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pixi.toml b/pixi.toml
@@ -20,6 +20,9 @@ setup-repo = "pixi install && pixi run download-data"
 copy-markdown = "python -m src.copy_markdown"
 annotation-pipeline = "python -m src.annotation_pipeline"
 test-citations = "python -m src.citations.one_shot_citations"
+study-parameters = "python -m src.study_parameters"
+variant-ontology = "python -m src.ontology.variant_ontology"
+drug-ontology = "python -m src.ontology.drug_ontology"
 
 [dependencies]
 seaborn = ">=0.13.2,<0.14"

diff --git a/src/annotation_table.py b/src/annotation_table.py
@@ -47,6 +47,7 @@ def __init__(self, pmcid: str, model: str = "gpt-4.1"):
         self.prompt = """
 What are all the pharmacogenomic relationships found in this paper?
 Output your response in markdown table format with nothing except the table. The columns should be Gene, Polymorphism, Relationship/Effect, and p-value.
+Make sure that every polymorphism gets its own row, even if they have the same effect/p-value.
 """
 
     def generate_table_json(self) -> AnnotationTable:
@@ -62,10 +63,12 @@ def generate_table_json(self) -> AnnotationTable:
 Please extract all pharmacogenomic relationships and format them as structured data with the following fields for each relationship:
 - gene: The gene name
 - polymorphism: The genetic polymorphism or variant
+- drug: The drug name if a drug is part of this relationship. If a drug is not part of this association, fill this field with "None".
 - relationship_effect: Description of the relationship or effect
-- p_value: The statistical p-value
+- p_value: The statistical p-value. If confidence intervals are provided, display that information here as well.
 
 Return the data as a JSON object with a 'relationships' array containing all the pharmacogenomic relationships found.
+Make sure that every polymorphism/relationship gets its own entry, even if they have the same effect/p-value.
 """
 
         response = self.generator.generate(

diff --git a/src/citations/line_citation_generator.py b/src/citations/line_citation_generator.py
@@ -899,7 +899,13 @@ def _score_sentence_for_study_param(
             Relevance score from 1-10
         """
         sentence_lower = sentence.lower()
-        parameter_lower = parameter_content.lower()
+
+        # Handle case where parameter_content is a list
+        if isinstance(parameter_content, list):
+            parameter_lower = " ".join(str(item) for item in parameter_content).lower()
+        else:
+            parameter_lower = str(parameter_content).lower()
+
         score = 0
 
         # Define keywords for each parameter type
@@ -1197,6 +1203,41 @@ def create_citation_generator(
         return LMCitationGenerator(pmcid, model)
 
 
+def process_annotation_file_with_citations(
+    pmcid: str, model: str = "local"
+) -> AnnotationTable:
+    """
+    Run the load -> cite -> save round trip for one paper's annotation file.
+
+    Args:
+        pmcid: PubMed Central ID whose annotation file is processed
+        model: Model name passed on to create_citation_generator
+
+    Returns:
+        The AnnotationTable with citations added, or the table exactly as
+        loaded (nothing is saved) when it contains no relationships
+    """
+    loaded = load_annotations_from_file(pmcid)
+
+    # Nothing to cite: return before building a generator or saving anything.
+    if not loaded.relationships:
+        logger.warning(f"No annotations found for {pmcid}")
+        return loaded
+
+    citation_generator = create_citation_generator(pmcid, model)
+    cited = citation_generator.add_citations_to_annotations(loaded)
+
+    # Persist the result through the sibling writer (same annotation file).
+    update_annotations_in_file(pmcid, cited)
+
+    # NOTE(review): this is logged even when the save above failed, because
+    # update_annotations_in_file reports errors only through the logger.
+    logger.info(
+        f"Successfully processed {len(cited.relationships)} annotations for {pmcid}"
+    )
+    return cited
+
+
 # Maintain backward compatibility
 def CitationGenerator(
     pmcid: str, model: str = "local", approach: str = None
@@ -1226,33 +1267,141 @@ def CitationGenerator(
         return create_citation_generator(pmcid, model)
 
 
+def update_annotations_in_file(
+    pmcid: str, updated_annotations: AnnotationTable
+) -> None:
+    """
+    Save updated annotations back to the JSON file in the new schema format.
+
+    Replaces data["annotations"]["relationships"] and keeps every other key.
+    Failures are logged, never raised; a missing file is not created.
+
+    Args:
+        pmcid: PubMed Central ID
+        updated_annotations: AnnotationTable with updated relationships
+    """
+    import json
+    import os
+
+    annotation_file = f"data/annotations/{pmcid}.json"
+
+    if not os.path.exists(annotation_file):
+        logger.error(f"Annotation file not found: {annotation_file}")
+        return
+
+    try:
+        # JSON is UTF-8; do not depend on the platform's default encoding.
+        with open(annotation_file, "r", encoding="utf-8") as f:
+            data = json.load(f)
+
+        # NOTE(review): only these six fields are persisted; confirm that
+        # AnnotationRelationship has no other field (e.g. drug) to keep.
+        data.setdefault("annotations", {})["relationships"] = [
+            {
+                "gene": rel.gene,
+                "polymorphism": rel.polymorphism,
+                "relationship_effect": rel.relationship_effect,
+                "p_value": rel.p_value,
+                "citations": rel.citations,
+                "p_value_citations": rel.p_value_citations,
+            }
+            for rel in updated_annotations.relationships
+        ]
+
+        # Serialize first so a failing dump cannot truncate the existing file.
+        payload = json.dumps(data, indent=4, ensure_ascii=False)
+        with open(annotation_file, "w", encoding="utf-8") as f:
+            f.write(payload)
+        logger.info(f"Updated annotations saved to {annotation_file}")
+    except Exception as e:
+        logger.error(f"Error updating annotations in {annotation_file}: {e}")
+
+
+def load_annotations_from_file(pmcid: str) -> AnnotationTable:
+    """
+    Load annotations from the new JSON schema format.
+
+    Entries under data["annotations"]["relationships"] become
+    AnnotationRelationship objects; failures are logged and give an empty table.
+
+    Args:
+        pmcid: PubMed Central ID
+
+    Returns:
+        AnnotationTable with relationships loaded from the file
+    """
+    import json
+    import os
+
+    annotation_file = f"data/annotations/{pmcid}.json"
+
+    if not os.path.exists(annotation_file):
+        logger.warning(f"Annotation file not found: {annotation_file}")
+        return AnnotationTable(relationships=[])
+
+    try:
+        # JSON is UTF-8; the platform default encoding may not be.
+        with open(annotation_file, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        if "annotations" not in data or "relationships" not in data["annotations"]:
+            logger.warning(f"No annotations found in file: {annotation_file}")
+            return AnnotationTable(relationships=[])
+
+        # Absent keys fall back to ""/[] so partially filled entries still load.
+        relationships = [
+            AnnotationRelationship(
+                gene=rel_data.get("gene", ""),
+                polymorphism=rel_data.get("polymorphism", ""),
+                relationship_effect=rel_data.get("relationship_effect", ""),
+                p_value=rel_data.get("p_value", ""),
+                citations=rel_data.get("citations", []),
+                p_value_citations=rel_data.get("p_value_citations", []),
+            )
+            for rel_data in data["annotations"]["relationships"]
+        ]
+        return AnnotationTable(relationships=relationships)
+    except Exception as e:
+        logger.error(f"Error loading annotations from {annotation_file}: {e}")
+        return AnnotationTable(relationships=[])
+
+
 def main():
     """
-    Test function for citation generator using PMC11730665 and a single sentence.
+    Test function for citation generator using PMC11730665 and loading annotations from file.
     """
     # Test parameters
     pmcid = "PMC11730665"
-    test_sentence = "Patients with the GG genotype had a trend toward lower efficacy of sitagliptin and higher efficacy of gliclazide, likely due to slower metabolism of gliclazide."
 
     # Create citation generator
-    generator = create_citation_generator(pmcid, model="gemini/gemini-2.5-flash-lite")
+    generator = create_citation_generator(pmcid, model="local")
 
-    # Create a mock annotation for testing
-    from src.annotation_table import AnnotationRelationship
+    # Load annotations from the updated schema file
+    annotations = load_annotations_from_file(pmcid)
 
-    test_annotation = AnnotationRelationship(
-        gene="CYP2C9",
-        polymorphism="rs1057910 GG",
-        relationship_effect="Patients with the GG genotype had a trend toward lower efficacy of sitagliptin and higher efficacy of gliclazide, likely due to slower metabolism of gliclazide.",
-        p_value=".464",
-        citations=[],
-    )
+    if not annotations.relationships:
+        logger.error(
+            "No annotations loaded from file. Creating a test annotation instead."
+        )
+        # Fallback to creating a mock annotation for testing
+        from src.annotation_table import AnnotationRelationship
+
+        test_annotation = AnnotationRelationship(
+            gene="CYP2C9",
+            polymorphism="rs1057910 GG",
+            relationship_effect="Patients with the GG genotype had a trend toward lower efficacy of sitagliptin and higher efficacy of gliclazide, likely due to slower metabolism of gliclazide.",
+            p_value=".464",
+            citations=[],
+        )
+        annotations = AnnotationTable(relationships=[test_annotation])
 
     print(f"Testing citation generator with PMCID: {pmcid}")
-    print(f"Test sentence: {test_sentence}")
-    print(f"Test annotation: {test_annotation.gene} {test_annotation.polymorphism}")
+    print(f"Loaded {len(annotations.relationships)} annotations from file")
     print("-" * 50)
 
+    # Test with first annotation
+    test_annotation = annotations.relationships[0]
+    print(f"Test annotation: {test_annotation.gene} {test_annotation.polymorphism}")
+
     # Get citations for the annotation
     citations = generator._get_top_citations_for_annotation(test_annotation, top_k=3)
 

diff --git a/src/deprecated/all_associations.py b/src/deprecated/all_associations.py
@@ -1,5 +1,5 @@
 from src.inference import Generator, Fuser
-from src.variants import QuotedStr
+from src.deprecated.variants import QuotedStr
 from src.prompts import GeneratorPrompt, ArticlePrompt
 from src.utils import get_article_text
 from loguru import logger

diff --git a/src/deprecated/all_variants.py b/src/deprecated/all_variants.py
@@ -1,5 +1,5 @@
 from src.inference import Generator
-from src.variants import Variant, VariantList
+from src.deprecated.variants import Variant, VariantList
 from src.prompts import GeneratorPrompt, PromptVariables
 from src.utils import get_article_text
 from loguru import logger

diff --git a/src/deprecated/association_types.py b/src/deprecated/association_types.py
@@ -2,7 +2,7 @@
 Given a list of variants and the article text, determine the type of association (drug, phenotype, functional association)
 """
 
-from src.variants import Variant
+from src.deprecated.variants import Variant
 from typing import List, Optional
 from src.prompts import PromptVariables, GeneratorPrompt, ParserPrompt
 from src.inference import Generator, Parser

diff --git a/src/deprecated/functional_annotation_extraction.py b/src/deprecated/functional_annotation_extraction.py
@@ -5,7 +5,11 @@
 from typing import List
 from loguru import logger
 from pydantic import BaseModel
-from src.variants import Variant, FunctionalAnnotation, FunctionalAnnotationList
+from src.deprecated.variants import (
+    Variant,
+    FunctionalAnnotation,
+    FunctionalAnnotationList,
+)
 from src.prompts import PromptVariables, GeneratorPrompt, ParserPrompt
 from src.inference import Generator, Parser
 from src.utils import get_article_text

diff --git a/src/deprecated/phenotype_annotation_extraction.py b/src/deprecated/phenotype_annotation_extraction.py
@@ -5,7 +5,11 @@
 from typing import List
 from loguru import logger
 from pydantic import BaseModel
-from src.variants import Variant, PhenotypeAnnotation, PhenotypeAnnotationList
+from src.deprecated.variants import (
+    Variant,
+    PhenotypeAnnotation,
+    PhenotypeAnnotationList,
+)
 from src.prompts import PromptVariables, GeneratorPrompt, ParserPrompt
 from src.inference import Generator, Parser
 from src.utils import get_article_text