Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 56 additions & 72 deletions data/annotations/PMC11730665.json

Large diffs are not rendered by default.

189 changes: 110 additions & 79 deletions data/annotations/PMC4737107.json

Large diffs are not rendered by default.

176 changes: 93 additions & 83 deletions data/annotations/PMC5712579.json

Large diffs are not rendered by default.

277 changes: 203 additions & 74 deletions data/annotations/PMC5728534.json

Large diffs are not rendered by default.

109 changes: 45 additions & 64 deletions data/annotations/PMC5749368.json

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions notebooks/test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"cells": [],
"metadata": {
"kernelspec": {
"display_name": "default",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
3 changes: 3 additions & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ setup-repo = "pixi install && pixi run download-data"
copy-markdown = "python -m src.copy_markdown"
annotation-pipeline = "python -m src.annotation_pipeline"
test-citations = "python -m src.citations.one_shot_citations"
study-parameters = "python -m src.study_parameters"
variant-ontology = "python -m src.ontology.variant_ontology"
drug-ontology = "python -m src.ontology.drug_ontology"

[dependencies]
seaborn = ">=0.13.2,<0.14"
Expand Down
5 changes: 4 additions & 1 deletion src/annotation_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(self, pmcid: str, model: str = "gpt-4.1"):
self.prompt = """
What are all the pharmacogenomic relationships found in this paper?
Output your response in markdown table format with nothing except the table. The columns should be Gene, Polymorphism, Relationship/Effect, and p-value.
Make sure that every polymorphism gets its own row, even if they have the same effect/p-value.
"""

def generate_table_json(self) -> AnnotationTable:
Expand All @@ -62,10 +63,12 @@ def generate_table_json(self) -> AnnotationTable:
Please extract all pharmacogenomic relationships and format them as structured data with the following fields for each relationship:
- gene: The gene name
- polymorphism: The genetic polymorphism or variant
- drug: The drug name if a drug is part of this relationship. If a drug is not part of this association, fill this field with "None".
- relationship_effect: Description of the relationship or effect
- p_value: The statistical p-value
- p_value: The statistical p-value. If confidence intervals are provided, display that information here as well.

Return the data as a JSON object with a 'relationships' array containing all the pharmacogenomic relationships found.
Make sure that every polymorphism/relationship gets its own entry, even if they have the same effect/p-value.
"""

response = self.generator.generate(
Expand Down
179 changes: 164 additions & 15 deletions src/citations/line_citation_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -899,7 +899,13 @@ def _score_sentence_for_study_param(
Relevance score from 1-10
"""
sentence_lower = sentence.lower()
parameter_lower = parameter_content.lower()

# Handle case where parameter_content is a list
if isinstance(parameter_content, list):
parameter_lower = " ".join(str(item) for item in parameter_content).lower()
else:
parameter_lower = str(parameter_content).lower()

score = 0

# Define keywords for each parameter type
Expand Down Expand Up @@ -1197,6 +1203,41 @@ def create_citation_generator(
return LMCitationGenerator(pmcid, model)


def process_annotation_file_with_citations(
    pmcid: str, model: str = "local"
) -> AnnotationTable:
    """
    Run the load -> cite -> save round trip for one article's annotations.

    The annotations stored for ``pmcid`` are loaded, passed through a citation
    generator built for the same ``pmcid``/``model``, and the result is handed
    to ``update_annotations_in_file``. When nothing is loaded, a warning is
    logged and the (empty) table is returned without creating a generator or
    touching the file.

    Args:
        pmcid: PubMed Central ID
        model: Model to use for citation generation

    Returns:
        AnnotationTable with citations added, or the loaded table unchanged
        when it contains no relationships
    """
    table = load_annotations_from_file(pmcid)

    if table.relationships:
        # Build the generator only once we know there is something to cite.
        cited = create_citation_generator(pmcid, model).add_citations_to_annotations(
            table
        )

        # Persist the enriched table back to the file it was loaded from.
        update_annotations_in_file(pmcid, cited)

        logger.info(
            f"Successfully processed {len(cited.relationships)} annotations for {pmcid}"
        )
        return cited

    logger.warning(f"No annotations found for {pmcid}")
    return table


# Maintain backward compatibility
def CitationGenerator(
pmcid: str, model: str = "local", approach: str = None
Expand Down Expand Up @@ -1226,33 +1267,141 @@ def CitationGenerator(
return create_citation_generator(pmcid, model)


def update_annotations_in_file(
    pmcid: str, updated_annotations: AnnotationTable
) -> None:
    """
    Save updated annotations back to the JSON file in the new schema format.

    The existing file is read, its ``annotations.relationships`` list is
    replaced with the relationships from ``updated_annotations``, and the whole
    document is written back; every other key already in the file is kept.
    A missing file or any error while reading/writing is logged, not raised.

    Args:
        pmcid: PubMed Central ID
        updated_annotations: AnnotationTable with updated relationships
    """
    import json
    import os

    annotation_file = f"data/annotations/{pmcid}.json"

    if not os.path.exists(annotation_file):
        logger.error(f"Annotation file not found: {annotation_file}")
        return

    try:
        # Pin UTF-8 explicitly: without it open() falls back to the locale
        # encoding, which is not UTF-8 on every platform.
        with open(annotation_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Update the relationships in the new schema format
        if "annotations" not in data:
            data["annotations"] = {}

        # NOTE(review): only these six fields are persisted. If
        # AnnotationRelationship carries additional fields, they are dropped
        # on this round trip -- confirm against the model definition.
        data["annotations"]["relationships"] = [
            {
                "gene": rel.gene,
                "polymorphism": rel.polymorphism,
                "relationship_effect": rel.relationship_effect,
                "p_value": rel.p_value,
                "citations": rel.citations,
                "p_value_citations": rel.p_value_citations,
            }
            for rel in updated_annotations.relationships
        ]

        # ensure_ascii=False emits raw non-ASCII characters, so the output
        # stream must be UTF-8 or the write can fail or produce an
        # unreadable file under a narrower locale encoding.
        with open(annotation_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)

        logger.info(f"Updated annotations saved to {annotation_file}")

    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # continue instead of propagating.
        logger.error(f"Error updating annotations in {annotation_file}: {e}")


def load_annotations_from_file(pmcid: str) -> AnnotationTable:
    """
    Load annotations from the new JSON schema format.

    Reads ``data/annotations/{pmcid}.json`` and converts each entry under
    ``annotations.relationships`` into an ``AnnotationRelationship``. Keys
    missing from an entry fall back to an empty string (or an empty list for
    the two citation fields). A missing file, a file without that section, or
    any error while reading/converting yields an empty table and a log entry
    instead of an exception.

    Args:
        pmcid: PubMed Central ID

    Returns:
        AnnotationTable with relationships loaded from the file
    """
    import json
    import os

    annotation_file = f"data/annotations/{pmcid}.json"

    if not os.path.exists(annotation_file):
        logger.warning(f"Annotation file not found: {annotation_file}")
        return AnnotationTable(relationships=[])

    try:
        # Decode as UTF-8 explicitly; relying on the locale default breaks on
        # files containing raw non-ASCII characters when the locale encoding
        # is something else.
        with open(annotation_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Extract relationships from the new schema format
        if "annotations" not in data or "relationships" not in data["annotations"]:
            logger.warning(f"No annotations found in file: {annotation_file}")
            return AnnotationTable(relationships=[])

        # Convert each dict to an AnnotationRelationship object. This stays
        # inside the try so a malformed entry is reported like any other
        # load failure.
        relationships = [
            AnnotationRelationship(
                gene=rel_data.get("gene", ""),
                polymorphism=rel_data.get("polymorphism", ""),
                relationship_effect=rel_data.get("relationship_effect", ""),
                p_value=rel_data.get("p_value", ""),
                citations=rel_data.get("citations", []),
                p_value_citations=rel_data.get("p_value_citations", []),
            )
            for rel_data in data["annotations"]["relationships"]
        ]
        return AnnotationTable(relationships=relationships)

    except Exception as e:
        # Best-effort loader: callers treat an empty table as "nothing to do".
        logger.error(f"Error loading annotations from {annotation_file}: {e}")
        return AnnotationTable(relationships=[])


def main():
"""
Test function for citation generator using PMC11730665 and a single sentence.
Test function for citation generator using PMC11730665 and loading annotations from file.
"""
# Test parameters
pmcid = "PMC11730665"
test_sentence = "Patients with the GG genotype had a trend toward lower efficacy of sitagliptin and higher efficacy of gliclazide, likely due to slower metabolism of gliclazide."

# Create citation generator
generator = create_citation_generator(pmcid, model="gemini/gemini-2.5-flash-lite")
generator = create_citation_generator(pmcid, model="local")

# Create a mock annotation for testing
from src.annotation_table import AnnotationRelationship
# Load annotations from the updated schema file
annotations = load_annotations_from_file(pmcid)

test_annotation = AnnotationRelationship(
gene="CYP2C9",
polymorphism="rs1057910 GG",
relationship_effect="Patients with the GG genotype had a trend toward lower efficacy of sitagliptin and higher efficacy of gliclazide, likely due to slower metabolism of gliclazide.",
p_value=".464",
citations=[],
)
if not annotations.relationships:
logger.error(
"No annotations loaded from file. Creating a test annotation instead."
)
# Fallback to creating a mock annotation for testing
from src.annotation_table import AnnotationRelationship

test_annotation = AnnotationRelationship(
gene="CYP2C9",
polymorphism="rs1057910 GG",
relationship_effect="Patients with the GG genotype had a trend toward lower efficacy of sitagliptin and higher efficacy of gliclazide, likely due to slower metabolism of gliclazide.",
p_value=".464",
citations=[],
)
annotations = AnnotationTable(relationships=[test_annotation])

print(f"Testing citation generator with PMCID: {pmcid}")
print(f"Test sentence: {test_sentence}")
print(f"Test annotation: {test_annotation.gene} {test_annotation.polymorphism}")
print(f"Loaded {len(annotations.relationships)} annotations from file")
print("-" * 50)

# Test with first annotation
test_annotation = annotations.relationships[0]
print(f"Test annotation: {test_annotation.gene} {test_annotation.polymorphism}")

# Get citations for the annotation
citations = generator._get_top_citations_for_annotation(test_annotation, top_k=3)

Expand Down
2 changes: 1 addition & 1 deletion src/deprecated/all_associations.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from src.inference import Generator, Fuser
from src.variants import QuotedStr
from src.deprecated.variants import QuotedStr
from src.prompts import GeneratorPrompt, ArticlePrompt
from src.utils import get_article_text
from loguru import logger
Expand Down
2 changes: 1 addition & 1 deletion src/deprecated/all_variants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from src.inference import Generator
from src.variants import Variant, VariantList
from src.deprecated.variants import Variant, VariantList
from src.prompts import GeneratorPrompt, PromptVariables
from src.utils import get_article_text
from loguru import logger
Expand Down
2 changes: 1 addition & 1 deletion src/deprecated/association_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Given a list of variants and the article text, determine the type of association (drug, phenotype, functional association)
"""

from src.variants import Variant
from src.deprecated.variants import Variant
from typing import List, Optional
from src.prompts import PromptVariables, GeneratorPrompt, ParserPrompt
from src.inference import Generator, Parser
Expand Down
6 changes: 5 additions & 1 deletion src/deprecated/functional_annotation_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
from typing import List
from loguru import logger
from pydantic import BaseModel
from src.variants import Variant, FunctionalAnnotation, FunctionalAnnotationList
from src.deprecated.variants import (
Variant,
FunctionalAnnotation,
FunctionalAnnotationList,
)
from src.prompts import PromptVariables, GeneratorPrompt, ParserPrompt
from src.inference import Generator, Parser
from src.utils import get_article_text
Expand Down
6 changes: 5 additions & 1 deletion src/deprecated/phenotype_annotation_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
from typing import List
from loguru import logger
from pydantic import BaseModel
from src.variants import Variant, PhenotypeAnnotation, PhenotypeAnnotationList
from src.deprecated.variants import (
Variant,
PhenotypeAnnotation,
PhenotypeAnnotationList,
)
from src.prompts import PromptVariables, GeneratorPrompt, ParserPrompt
from src.inference import Generator, Parser
from src.utils import get_article_text
Expand Down
Loading
Loading