diff --git a/src/deprecated/all_associations.py b/src/deprecated/all_associations.py deleted file mode 100644 index d32d9c4..0000000 --- a/src/deprecated/all_associations.py +++ /dev/null @@ -1,117 +0,0 @@ -from src.inference import Generator, Fuser -from src.deprecated.variants import QuotedStr -from src.prompts import GeneratorPrompt, ArticlePrompt -from src.utils import get_article_text -from loguru import logger -import json -from typing import List, Optional, Dict -from src.config import DEBUG -from pydantic import BaseModel -import enum -import os - - -class AssociationType(enum.Enum): - DRUG = "Drug Association" - PHENOTYPE = "Phenotype Association" - FUNCTIONAL = "Functional Analysis" - - -class VariantAssociation(BaseModel): - variant: QuotedStr - gene: QuotedStr | None = None - allele: QuotedStr | None = None - association_type: AssociationType - association_summary: str - - -class VariantAssociationList(BaseModel): - association_list: List[VariantAssociation] - - -VARIANT_LIST_KEY_QUESTION = """ -In this article, find all studied associations between genetic variants (ex. rs113993960, CYP1A1*1, etc.) and a drug, phenotype, or functional analysis result. -Include information on the gene group and allele (if present). -""" - -VARIANT_LIST_OUTPUT_QUEUES = """ -Your output format should be a list of associations with the following attributes: -Variant: The Variant / Haplotypes (ex. rs2909451, CYP2C19*1, CYP2C19*2, *1/*18, etc.) -Summary: One sentence summary of the association finding for this variant. -Gene: The gene group of the variant (ex. DPP4, CYP2C19, KCNJ11, etc.) -Allele: Specific allele or genotype if different from variant (ex. TT, *1/*18, del/del, etc.). -Association Type: The type of associations the variant has in the article from the options Drug, Phenotype, or Functional. One variant may have multiple association types. More information on how to determine this below. 
-Quotes: A direct quote from the article that mentions this specific variant and its found association. Output the exact text where this variant is discussed (ideally in the methodology, abstract, or results section). -More than one quote can be outputted if that would be helpful but try to keep the total number fewer than 3. - -For each term except for Summary make sure to keep track of and output the exact quotes where that information is found/can be deduced. - -To determine the Association Type: - -A variant has a Drug association when the article reports associations between the genetic variant and -pharmacological parameters or clinical drug response measures that specifically relate to: -- Pharmacokinetic/Pharmacodynamic Parameters -- Clinical phenotypes/adverse events (Drug toxicity, organ dysfunction, treatment response phenotypes, disease outcomes when treated with drugs) - -A variant has a Phenotype association when the article reports associations between genetic variants and adverse drug reactions, toxicities, or clinical outcomes that represent: -- Toxicity/Safety outcomes -- Clinical phenotypes/adverse events - -A variant has a Functional association when the article contains in vitro or mechanistic functional studies that directly measure how the variant affects: -- Enzyme/transporter activity (e.g., clearance, metabolism, transport) -- Binding affinity (e.g., protein-drug interactions) -- Functional properties (e.g., uptake rates, kinetic parameters like Km/Vmax) - -The key distinction is mechanistic functional studies typically get Functional associations vs clinical association studies get Phenotype and Drug associations but Functional. 
-Examples: -- "Cardiotoxicity when treated with anthracyclines" → Phenotype -- "Decreased clearance of methotrexate" → Drug -- "Decreased enzyme activity in cell culture" → Functional -- "Variant affects drug clearance/response" —> Drug -- "Variant affects adverse events/toxicity outcomes" —> Phenotype -- "Variant affects protein function in laboratory studies" —> Functional -""" - - -def get_all_associations(article_text: str) -> List[Dict]: - """ - Extract all variant associations from the article - """ - prompt = GeneratorPrompt( - input_prompt=ArticlePrompt( - article_text=article_text, - key_question=VARIANT_LIST_KEY_QUESTION, - output_queues=VARIANT_LIST_OUTPUT_QUEUES, - ), - output_format_structure=VariantAssociationList, - ).get_hydrated_prompt() - generator = Generator(model="gpt-4o", samples=5) - responses = generator.generate(prompt) - logger.info(f"Fusing {len(responses)} Responses") - - fuser = Fuser(model="gpt-4o", temperature=0.1) - fused_response = fuser.generate(responses, response_format=VariantAssociationList) - - return fused_response.association_list - - -def test_all_associations(): - """ - Output the extracted variant associations to a file - """ - pmcid = "PMC4737107" - article_text = get_article_text(pmcid) - logger.info(f"Got article text {pmcid}") - associations = get_all_associations(article_text) - logger.info("Extracted associations") - file_path = f"data/extractions/all_associations/{pmcid}.jsonl" - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w") as f: - json.dump( - [assoc.model_dump(mode="json") for assoc in associations], f, indent=4 - ) - logger.info(f"Saved to file {file_path}") - - -if __name__ == "__main__": - test_all_associations() diff --git a/src/deprecated/all_variants.py b/src/deprecated/all_variants.py deleted file mode 100644 index f94e1c5..0000000 --- a/src/deprecated/all_variants.py +++ /dev/null @@ -1,98 +0,0 @@ -from src.inference import Generator -from src.deprecated.variants 
import Variant, VariantList -from src.prompts import GeneratorPrompt, PromptVariables -from src.utils import get_article_text -from loguru import logger -import json -from typing import List, Optional -from src.config import DEBUG - -VARIANT_LIST_KEY_QUESTION = """ -From this article, note down ALL discussed variants/haplotypes (ex. rs113993960, CYP1A1*1, etc.). Include information on the gene group and allele (if present). -Make sure they variant has a studied association (likely discussed in the methodology or results section), not simply mentioned as background information. -""" - -VARIANT_LIST_OUTPUT_QUEUES = """Your output format should be a list of the variants with the following attributes: -Variant: The Variant / Haplotypes (ex. rs2909451, CYP2C19*1, CYP2C19*2, *1/*18, etc.) -Gene: The gene group of the variant (ex. DPP4, CYP2C19, KCNJ11, etc.) -Allele: Specific allele or genotype if different from variant (ex. TT, *1/*18, del/del, etc.) -Evidence: REQUIRED - A direct quote from the article that mentions this specific variant. Find the exact text where this variant is discussed in the methodology or results section. - -IMPORTANT: You MUST include the evidence field for every variant. Do not leave it empty or null. -""" - - -def extract_all_variants( - article_text: Optional[str] = None, - pmcid: Optional[str] = None, - model: str = "gpt-4o", - temperature: float = 0.1, -) -> List[Variant]: - """Extract a list of variants from an article. - Args: - article_text: The text of the article. - PMCID: The PMCID of the article. - - Returns: - A list of variants. 
- """ - article_text = get_article_text(pmcid=pmcid, article_text=article_text) - - if DEBUG: - logger.debug(f"Model: {model}, Temperature: {temperature}") - logger.debug(f"PMCID: {pmcid}") - - generator = Generator(model=model, temperature=temperature) - prompt_variables = PromptVariables( - article_text=article_text, - key_question=VARIANT_LIST_KEY_QUESTION, - output_queues=VARIANT_LIST_OUTPUT_QUEUES, - output_format_structure=VariantList, - ) - prompt_generator = GeneratorPrompt(prompt_variables) - hydrated_prompt = prompt_generator.hydrate_prompt() - logger.info(f"Extracting all variants") - output = generator.prompted_generate(hydrated_prompt) - if DEBUG: - logger.debug(f"Raw LLM output: {output}") - parsed_output = json.loads(output) - if DEBUG: - logger.debug(f"Parsed output: {parsed_output}") - variant_list = [ - Variant(**variant_data) for variant_data in parsed_output["variant_list"] - ] - logger.info(f"Found {len(variant_list)} variants") - return variant_list - - -def main( - pmcid: str, - model: str = "gpt-4o", - temperature: float = 0.1, - output: Optional[str] = None, -): - """Main function to demonstrate variant extraction functionality.""" - try: - # Extract variants - variants = extract_all_variants( - pmcid=pmcid, model=model, temperature=temperature - ) - - # Print results - print(f"Found {len(variants)} variants:") - for i, variant in enumerate(variants, 1): - print(f"{i}. 
Variant: {variant.variant_id}") - print(f" Gene: {variant.gene}") - print(f" Allele: {variant.allele}") - print(f" Evidence: {variant.evidence}") - print() - - # Save to file if output path specified - if output: - with open(output, "w") as f: - json.dump({"variants": variants}, f, indent=2) - print(f"Results saved to {output}") - - except Exception as e: - logger.error(f"Error extracting variants: {e}") - raise diff --git a/src/deprecated/association_types.py b/src/deprecated/association_types.py deleted file mode 100644 index 48f4320..0000000 --- a/src/deprecated/association_types.py +++ /dev/null @@ -1,182 +0,0 @@ -""" -Given a list of variants and the article text, determine the type of association (drug, phenotype, functional association) -""" - -from src.deprecated.variants import Variant -from typing import List, Optional -from src.prompts import PromptVariables, GeneratorPrompt, ParserPrompt -from src.inference import Generator, Parser -from pydantic import BaseModel -from src.utils import get_article_text -from loguru import logger -from src.config import DEBUG - - -class AssociationType(BaseModel): - """ - Variant Association Type - Members: - - variant: Variant - - drug_association: bool - - drug_association_explanation: str - - drug_association_quote: str - - phenotype_association: bool - - phenotype_association_explanation: str - - phenotype_association_quote: str - - functional_association: bool - - functional_association_explanation: str - - functional_association_quote: str - """ - - variant: Variant - association_type: List[str] - explanation: str - quotes: List[str] - - -class AssociationTypeList(BaseModel): - """ - List of association types for structured output. - """ - - association_types: List[AssociationType] - - -KEY_QUESTION = """ -For the following variants, determine what type of association(s) is being studied by the article. The options are Drug, Phenotype, and Functional. 
-Variants: {variants} - -A variant has a Drug association when the article reports associations between the genetic variant and -pharmacological parameters or clinical drug response measures that specifically relate to: -- Pharmacokinetic/Pharmacodynamic Parameters -- Clinical phenotypes/adverse events (Drug toxicity, organ dysfunction, treatment response phenotypes, disease outcomes when treated with drugs) - -A variant has a Phenotype association when the article reports associations between genetic variants and adverse drug reactions, toxicities, or clinical outcomes that represent: -- Toxicity/Safety outcomes -- Clinical phenotypes/adverse events - -A variant has a Functional association when the article contains in vitro or mechanistic functional studies that directly measure how the variant affects: -- Enzyme/transporter activity (e.g., clearance, metabolism, transport) -- Binding affinity (e.g., protein-drug interactions) -- Functional properties (e.g., uptake rates, kinetic parameters like Km/Vmax) - -The key distinction is mechanistic functional studies typically get Functional associations vs clinical association studies get Phenotype and Drug associations but Functional. -Examples: -- "Cardiotoxicity when treated with anthracyclines" → Phenotype -- "Decreased clearance of methotrexate" → Drug -- "Decreased enzyme activity in cell culture" → Functional -- "Variant affects drug clearance/response" —> Drug -- "Variant affects adverse events/toxicity outcomes" —> Phenotype -- "Variant affects protein function in laboratory studies" —> Functional - -""" - -OUTPUT_QUEUES = """ -Using this information, decide which out of the 3 annotations the variant should receive with a one sentence summary explanation for the decision along with a sentence/quote from the article that indicates why this is true. 
It is possible there is more than one Annotation/association per variant - -Variant Object: (variant) -Variant Drug Association: (Y/N) -Explanation: (Reason) -Quote:(Quote) - -Variant Phenotype Association: (Y/N) -Explanation: (Reason) -Quote:(Quote) - -Variant Functional Association: (Y/N) -Explanation: (Reason) -""" - - -def get_association_types( - variants: List[Variant], - article_text: Optional[str] = None, - pmcid: Optional[str] = None, -) -> Optional[List[AssociationType]]: - article_text = get_article_text(pmcid=pmcid, article_text=article_text) - variant_id_list = [variant.variant_id for variant in variants] - prompt_variables = PromptVariables( - article_text=article_text, - key_question=KEY_QUESTION.format(variants=variants), - output_queues=OUTPUT_QUEUES, - output_format_structure=AssociationTypeList, - ) - logger.info(f"Determining association type for variants {variant_id_list}") - prompt_generator = GeneratorPrompt(prompt_variables) - generator_prompt = prompt_generator.hydrate_prompt() - - # Step 1: Generate the analysis - generator = Generator(model="gpt-4o-mini", temperature=0.1) - response = generator.prompted_generate(generator_prompt) - - # Step 2: Parse the response into structured format - parser = Parser(model="gpt-4o-mini", temperature=0.1) - parser_prompt = ParserPrompt( - input_prompt=response, - output_format_structure=AssociationTypeList, - system_prompt=generator_prompt.system_prompt, - ) - parsed_response = parser.prompted_generate(parser_prompt.hydrate_prompt()) - - # Parse the string response into AssociationType objects - try: - import json - - parsed_data = json.loads(parsed_response) - - # Handle different response formats - if isinstance(parsed_data, dict) and "association_types" in parsed_data: - association_data = parsed_data["association_types"] - elif isinstance(parsed_data, list): - association_data = parsed_data - else: - association_data = [parsed_data] - - # Convert to AssociationType objects - return 
[AssociationType(**item) for item in association_data] - - except (json.JSONDecodeError, TypeError) as e: - logger.error(f"Failed to parse response for variants {variants}: {e}") - return None - - -def list_association_types(association_type: AssociationType) -> List[str]: - association_types = [] - if association_type.drug_association: - association_types.append("Drug") - if DEBUG: - logger.debug(f"Drug Association: {association_type.drug_association}") - logger.debug( - f"Drug Association Explanation: {association_type.drug_association_explanation}" - ) - logger.debug( - f"Drug Association Quote: {association_type.drug_association_quote}" - ) - if association_type.phenotype_association: - association_types.append("Phenotype") - if DEBUG: - logger.debug( - f"Phenotype Association: {association_type.phenotype_association}" - ) - logger.debug( - f"Phenotype Association Explanation: {association_type.phenotype_association_explanation}" - ) - logger.debug( - f"Phenotype Association Quote: {association_type.phenotype_association_quote}" - ) - if association_type.functional_association: - association_types.append("Functional") - if DEBUG: - logger.debug( - f"Functional Association: {association_type.functional_association}" - ) - logger.debug( - f"Functional Association Explanation: {association_type.functional_association_explanation}" - ) - logger.debug( - f"Functional Association Quote: {association_type.functional_association_quote}" - ) - logger.info( - f"Variant: {association_type.variant.variant_id} has association types: {association_types}" - ) - return association_types diff --git a/src/deprecated/drug_annotation.py b/src/deprecated/drug_annotation.py deleted file mode 100644 index 2b787c6..0000000 --- a/src/deprecated/drug_annotation.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -Extract detailed drug annotation information for variants with drug associations. 
-""" - -from typing import Optional, Dict -from loguru import logger -from pydantic import BaseModel -from src.deprecated.variants import QuotedStr, QuotedList -from src.deprecated.all_associations import ( - VariantAssociation, - get_all_associations, - AssociationType, -) -from src.prompts import GeneratorPrompt, PromptHydrator -from src.inference import Generator -from src.utils import get_article_text -from src.config import DEBUG -import json -import os - -""" -Terms: -- Drug(s): -- Phenotype Category -- Association Significane -- Sentence Summary (get examples) -- Specialty Populations -- Notes: 3-4 sentence summary of the results of the study in relation to these variant and the found association. - -Explain your reasoning step by step by including the term, a one sentence explanation, and an exact quote from the article that details where -""" - - -class DrugAnnotation(BaseModel): - associated_drugs: QuotedList - association_significance: QuotedStr - meatbolizer_info: Optional[QuotedStr] - specialty_populations: QuotedStr - sentence_summary: str - notes: Optional[str] - - -def get_association_background_prompt(variant_association: VariantAssociation): - background_prompt = "" - background_prompt += f"Variant ID: {variant_association.variant.content}\n" - background_prompt += ( - f"Association Summary: {variant_association.association_summary}\n" - ) - return background_prompt - - -""" -Old Terms -Term: Variant/Haplotypes -- Content: The specific genetic variant mentioned in the study -- Exampls: rs2909451, CYP2C19*1, CYP2C19*2, *1/*18 - -Term: Gene -- Content: HGNC symbol for the gene involved in the association. Typically the variants will be within the gene -boundaries, but occasionally this will not be true. E.g. the variant in the annotation may be upstream of the gene but -is reported to affect the gene's expression or otherwise associated with the gene. 
-- Exampls: DPP4, CYP2C19, KCNJ11 -""" - -KEY_QUESTION = """ -This article contains information on the following variant association: -{association_background} - -We are trying to complete a Drug Annotation report that is specifically interested in associations between genetic variants and -pharmacological parameters or clinical drug response measures. - -For this association, use the article the find the following additional information for us to get a complete undestanding of the findings: - -Term: Drug(s) -- Content: Nme(s) of the drug(s) associated with the variant as part of this association along with a one sentence -description of the results. Convert the drug names to their generic before outputting if possible but include the original term in parentheses. - -Term: Phenotype Category -- Content: Type of clinical outcome studied (EXACTLY ONE: "Efficacy", "Metabolism/PK", "Toxicity", "Dosage", "Other") -- Example: Efficacy - -Term: Metabolizer Info (Optional) -- Content: If the study describes a metabolism relationship, describe the CYP enzyme phenotype categories and how they were created/defined. -For example, if the study references a "poor metabolizer" define poor metabolizer as well as the reference metabolizer types. If -the study is not metabolism related, output None or ignore this term. - -Term: Significance -- Content: Was this association statistically significant? Describe the author's reported p-value or relevant statistical values. - -Term: Specialty Population -- Content: Was an age-specific population studied as part of this association? (EXACTLY ONE: "Pediatric", "Geriatric", "No", or "Unknown") - -Term: Sentence -- Content: One sentence summary of the association. 
Make sure to include the following information roughly by following this -rough format: "[Genotype/Allele/Variant] is [associated with/not associated with] [increased/decreased] [outcome] [drug context] [population context]" -- Example: "Genotype TT is associated with decreased response to sitagliptin in people with Diabetes Mellitus, Type 2." - -Term: Notes -- Content: Any additional key study details, methodology, or important context -- Example: "Patients with the rs2909451 TT genotype in the study group exhibited a median HbA1c improvement of 0.57..." -""" - -OUTPUT_QUEUES = """ -For each variant, extract all the above information and provide it in structured format - -For each variant, provide: -- All required fields filled with appropriate values or left empty if not applicable -- Ensure controlled vocabulary compliance for categorical fields -- Extract direct quotes from the article to support the annotations -""" - - -def get_drug_annotation(variant_association: VariantAssociation | Dict): - if isinstance(variant_association, dict): - variant_association = VariantAssociation(**variant_association) - prompt = GeneratorPrompt( - input_prompt=PromptHydrator( - prompt_template=KEY_QUESTION, - prompt_variables={ - "association_background": get_association_background_prompt( - variant_association - ), - }, - system_prompt=None, - output_format_structure=DrugAnnotation, - ), - output_format_structure=DrugAnnotation, - ).get_hydrated_prompt() - generator = Generator(model="gpt-4.1") - return generator.generate(prompt) - - -def test_drug_annotations(): - """ - Output the extracted variant associations to a file - """ - pmcid = "PMC11730665" - article_text = get_article_text(pmcid) - logger.info(f"Got article text {pmcid}") - associations = get_all_associations(article_text) - - # Save associations - file_path = f"data/extractions/{pmcid}/associations.jsonl" - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w") as f: - 
json.dump(associations, f, indent=4) - logger.info(f"Saved to file {file_path}") - - logger.info(f"Found {len(associations)} associations") - associations = [VariantAssociation(**association) for association in associations] - drug_annotations = [] - for association in associations: - if association.association_type == AssociationType.DRUG: - drug_annotation = get_drug_annotation(association) - drug_annotations.append(drug_annotation) - - logger.info(f"Got drug annotations for {len(drug_annotations)} associations") - file_path = f"data/extractions/{pmcid}/drug_annotation.jsonl" - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w") as f: - json.dump(drug_annotations, f, indent=4) - logger.info(f"Saved to file {file_path}") - - -if __name__ == "__main__": - test_drug_annotations() diff --git a/src/deprecated/functional_annotation.py b/src/deprecated/functional_annotation.py deleted file mode 100644 index b15abdf..0000000 --- a/src/deprecated/functional_annotation.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Extract detailed drug annotation information for variants with drug associations. -""" - -from typing import List, Optional, Dict -import os -from loguru import logger -from pydantic import BaseModel -from src.deprecated.variants import Variant, QuotedStr, QuotedList -from src.deprecated.all_associations import ( - VariantAssociation, - get_all_associations, - AssociationType, -) -from src.prompts import PromptHydrator, GeneratorPrompt -from src.inference import Generator -from src.utils import get_article_text -from src.config import DEBUG -import json - -""" -Terms: -- Drug(s): -- Phenotype Category -- Association Significane -- Sentence Summary (get examples) -- Specialty Populations -- Notes: 3-4 sentence summary of the results of the study in relation to these variant and the found association. 
- -Explain your reasoning step by step by including the term, a one sentence explanation, and an exact quote from the article that details where -""" - - -class FunctionalAnnotation(BaseModel): - associated_drugs: QuotedList - association_significance: QuotedStr - specialty_populations: QuotedStr - assay_type: QuotedStr - cell_type: QuotedStr - sentence_summary: str - notes: Optional[str] - - -def get_association_background_prompt(variant_association: VariantAssociation): - background_prompt = "" - background_prompt += f"Variant ID: {variant_association.variant.content}\n" - background_prompt += ( - f"Association Summary: {variant_association.association_summary}\n" - ) - return background_prompt - - -KEY_QUESTION = """ -This article contains information on the following variant association: -{association_background} - -We are interested in completing a Functional Annotation report that is specifically interested in associations between genetic variants -and in-vitro outcomes such as: -- Enzyme/transporter activity (e.g., clearance, metabolism, transport) -- Binding affinity (e.g., protein-drug interactions) -- Functional properties (e.g., uptake rates, kinetic parameters like Km/Vmax) - -Term: Drug(s) -- Content: Nme(s) of the drug(s) associated with the variant as part of this association along with a one sentence -description of the results. Convert the drug names to their generic before outputting if possible but include the original term in parentheses. - -Term: Phenotype Category -- Content: Type of clinical outcome studied (EXACTLY ONE: "Efficacy", "Metabolism/PK", "Toxicity", "Dosage", "Other: ") - -Term: Assay Type -- Content: Laboratory method or experimental system used to measure this association. -- Example: hydroxylation assay, crystal structure prediction, etc. - -Term: Cell Type -- Content: The cell type(s) used in the assay for this association. Include species context if available -- Example: insect microsomes, human hepatocytes, E. 
coli DH5alpha, etc. - -Term: Significance -- Content: Was this association statistically significant? Describe the author's reported p-value or relevant statistical values. - -Term: Sentence -- Content: One sentence summary of the association. Make sure to include the following information roughly by following this -rough format: "[Genotype/Allele/Variant] is [associated with/not associated with] [increased/decreased] [outcome] [drug context] [population context]" -- Example: "Genotype TT is associated with decreased response to sitagliptin in people with Diabetes Mellitus, Type 2." - -Term: Notes -- Content: Any additional key study details, methodology, or important context -- Example: "TPMT protein levels were comparable between TPMT*3C and TPMT*1 when expressed in yeast. Comparable results were seen in COS-1 cells. mRNA levels were comparable between *3C and *1 in yeast." -""" - -OUTPUT_QUEUES = """ -For each variant, extract all the above information and provide it in structured format - -For each variant, provide: -- All required fields filled with appropriate values or left empty if not applicable -- Ensure controlled vocabulary compliance for categorical fields -- Extract direct quotes from the article to support the annotations -""" - - -def get_functional_annotation(variant_association: VariantAssociation | Dict): - if isinstance(variant_association, dict): - variant_association = VariantAssociation(**variant_association) - prompt = GeneratorPrompt( - input_prompt=PromptHydrator( - prompt_template=KEY_QUESTION, - prompt_variables={ - "association_background": get_association_background_prompt( - variant_association - ), - }, - system_prompt=None, - output_format_structure=FunctionalAnnotation, - ), - output_format_structure=FunctionalAnnotation, - ).get_hydrated_prompt() - generator = Generator(model="gpt-4o") - return generator.generate(prompt) - - -def test_functional_annotations(): - """ - Output the extracted variant associations to a file - """ - 
pmcid = "PMC11730665" - article_text = get_article_text(pmcid) - logger.info(f"Got article text {pmcid}") - associations = get_all_associations(article_text) - - # Save associations - file_path = f"data/extractions/{pmcid}/associations.jsonl" - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w") as f: - json.dump(associations, f, indent=4) - logger.info(f"Saved to file {file_path}") - - logger.info(f"Found {len(associations)} associations") - associations = [VariantAssociation(**association) for association in associations] - functional_annotations = [] - for association in associations: - if association.association_type == AssociationType.FUNCTIONAL: - functional_annotation = get_functional_annotation(association) - functional_annotations.append(functional_annotation) - - logger.info(f"Got drug annotations for {len(functional_annotations)} associations") - file_path = f"data/extractions/{pmcid}/functional_annotation.jsonl" - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w") as f: - json.dump(functional_annotations, f, indent=4) - logger.info(f"Saved to file {file_path}") - - -if __name__ == "main": - test_functional_annotations() diff --git a/src/deprecated/functional_annotation_extraction.py b/src/deprecated/functional_annotation_extraction.py deleted file mode 100644 index 64fb642..0000000 --- a/src/deprecated/functional_annotation_extraction.py +++ /dev/null @@ -1,208 +0,0 @@ -""" -Extract detailed functional annotation information for variants with functional associations. 
-""" - -from typing import List -from loguru import logger -from pydantic import BaseModel -from src.deprecated.variants import ( - Variant, - FunctionalAnnotation, - FunctionalAnnotationList, -) -from src.prompts import PromptVariables, GeneratorPrompt, ParserPrompt -from src.inference import Generator, Parser -from src.utils import get_article_text -from src.config import DEBUG -import json -import time -import random - - -KEY_QUESTION = """ -For the following variants that have been identified as having functional associations, extract detailed mechanistic annotation information. - -Variants: {variants} - -Extract the following information for each variant: - -Term: Variant/Haplotypes -- Content: The specific genetic variant studied -- Example: CYP2C19*1, CYP2C19*17, rs72552763, CYP2B6*1, CYP2B6*6 - -Term: Gene -- Content: Gene symbol associated with the variant -- Example: CYP2C19, CYP2B6, SLC22A1 - -Term: Drug(s) -- Content: Substrate or compound used in the functional assay -- Example: normeperidine, bupropion, warfarin, voriconazole, ranitidine - -Term: Phenotype Category -- Content: Type of functional outcome measured (EXACTLY ONE: "Metabolism/PK", "Efficacy", or leave empty) -- Example: Metabolism/PK (for enzyme kinetics), Efficacy (for cellular response) - -Term: Significance -- Content: Statistical significance of functional differences (EXACTLY ONE: "yes", "no", "not stated") -- Example: yes (for significant activity differences), not stated (for descriptive studies) - -Term: Notes -- Content: Key experimental details, methodology, quantitative results -- Example: "Clearance was 26.57% of wild-type. CYP2C19 variants expressed in Sf21 insect cells..." 
- -Term: Sentence -- Content: Standardized description of the functional relationship -- Format: "[Variant] is associated with [increased/decreased] [functional outcome] [experimental context] as compared to [reference variant]" -- Example: "CYP2C19 *17/*17 is associated with increased formation of normeperidine as compared to CYP2C19 *1/*1 + *1/*17." - -Term: Alleles -- Content: Specific allele or genotype tested -- Example: *17/*17, *1/*1, del, A - -Term: Specialty Population -- Content: Age-specific populations (rarely applicable to functional studies, usually empty) - -Term: Assay type -- Content: Laboratory method or experimental system used -- Example: in human liver microsomes, hydroxylation assay, crystal structure prediction, Cells - -Term: Metabolizer types -- Content: Phenotype classification if applicable (rarely used in functional studies) -- Example: Usually empty - -Term: isPlural -- Content: Grammar helper for sentence construction (EXACTLY ONE: "Is", "Are") -- Example: Is - -Term: Is/Is Not associated -- Content: Direction of functional association (EXACTLY ONE: "Associated with", "Not associated with") - -Term: Direction of effect -- Content: Whether the variant increases or decreases function (EXACTLY ONE: "increased", "decreased") -- Example: increased (for enhanced activity), decreased (for reduced activity) - -Term: Functional terms -- Content: Specific functional outcome measured -- Example: formation of, activity of, clearance of, transport of, affinity to, catalytic activity of - -Term: Gene/gene product -- Content: Specific gene or protein being functionally assessed -- Example: CYP2C19, CYP2B6, CYP2C9 - -Term: When treated with/exposed to/when assayed with -- Content: Experimental substrate context -- Example: when assayed with, of, or leave empty - -Term: Multiple drugs And/or -- Content: Logical connector for multiple substrates (EXACTLY ONE: "and", "or", or leave empty) - -Term: Cell type -- Content: Cell line or tissue system used for 
the assay -- Example: in 293FT cells, expressed in COS-7 cells, Sf21 insect cells, in insect microsomes - -Term: Comparison Allele(s) or Genotype(s) -- Content: Reference variant for comparison -- Example: *1/*1 + *1/*17, *1, GAT - -Term: Comparison Metabolizer types -- Content: Reference metabolizer status (usually empty for functional studies) -""" - -OUTPUT_QUEUES = """ -For each variant, extract all the above information and provide it in structured format. Generate a unique Variant Annotation ID using timestamp + random numbers. - -For each variant, provide: -- All required fields filled with appropriate values or left empty if not applicable -- Ensure controlled vocabulary compliance for categorical fields -- Extract direct quotes from the article to support the annotations -""" - - -def extract_functional_annotations( - variants: List[Variant], article_text: str = None, pmcid: str = None -) -> List[FunctionalAnnotation]: - """ - Extract detailed functional annotation information for variants with functional associations. - Processes each variant individually for better control and cleaner extraction. 
- - Args: - variants: List of variants that have functional associations - article_text: The text of the article - pmcid: The PMCID of the article - - Returns: - List of FunctionalAnnotation objects with detailed information - """ - article_text = get_article_text(pmcid=pmcid, article_text=article_text) - variant_id_list = [variant.variant_id for variant in variants] - - logger.info( - f"Extracting functional annotations for {len(variants)} variants individually: {variant_id_list}" - ) - - all_annotations = [] - - for variant in variants: - logger.info(f"Processing variant: {variant.variant_id}") - - class SingleFunctionalAnnotation(BaseModel): - functional_annotation: FunctionalAnnotation - - prompt_variables = PromptVariables( - article_text=article_text, - key_question=KEY_QUESTION.format(variants=[variant]), - output_queues=OUTPUT_QUEUES, - output_format_structure=SingleFunctionalAnnotation, - ) - - prompt_generator = GeneratorPrompt(prompt_variables) - generator_prompt = prompt_generator.hydrate_prompt() - - generator = Generator(model="gpt-4o-mini", temperature=0.1) - response = generator.prompted_generate(generator_prompt) - - parser = Parser(model="gpt-4o-mini", temperature=0.1) - parser_prompt = ParserPrompt( - input_prompt=response, - output_format_structure=SingleFunctionalAnnotation, - system_prompt=generator_prompt.system_prompt, - ) - parsed_response = parser.prompted_generate(parser_prompt) - - try: - parsed_data = json.loads(parsed_response) - - if isinstance(parsed_data, dict) and "functional_annotation" in parsed_data: - annotation_data = parsed_data["functional_annotation"] - elif isinstance(parsed_data, dict): - annotation_data = parsed_data - else: - logger.warning( - f"Unexpected response format for variant {variant.variant_id}: {parsed_data}" - ) - continue - - if ( - "variant_annotation_id" not in annotation_data - or not annotation_data["variant_annotation_id"] - ): - annotation_data["variant_annotation_id"] = int( - str(int(time.time())) + 
str(random.randint(100000, 999999)) - ) - - annotation = FunctionalAnnotation(**annotation_data) - all_annotations.append(annotation) - logger.info( - f"Successfully extracted functional annotation for variant {variant.variant_id}" - ) - - except (json.JSONDecodeError, TypeError, ValueError) as e: - logger.error( - f"Failed to parse functional annotation response for variant {variant.variant_id}: {e}" - ) - continue - - logger.info( - f"Successfully extracted {len(all_annotations)} functional annotations from {len(variants)} variants" - ) - return all_annotations diff --git a/src/deprecated/phenotype_annotation.py b/src/deprecated/phenotype_annotation.py deleted file mode 100644 index 1ebbfdf..0000000 --- a/src/deprecated/phenotype_annotation.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Extract detailed drug annotation information for variants with drug associations. -""" - -from typing import List, Optional, Dict -from loguru import logger -from pydantic import BaseModel -from src.deprecated.variants import Variant, QuotedStr, QuotedList -from src.deprecated.all_associations import ( - VariantAssociation, - get_all_associations, - AssociationType, -) -from src.prompts import PromptHydrator, GeneratorPrompt -from src.inference import Generator, Parser -from src.utils import get_article_text -from src.config import DEBUG -import json -import os - -""" -Terms: -- Drug(s): -- Phenotype Category -- Association Significane -- Sentence Summary (get examples) -- Specialty Populations -- Notes: 3-4 sentence summary of the results of the study in relation to these variant and the found association. 
- -Explain your reasoning step by step by including the term, a one sentence explanation, and an exact quote from the article that details where -""" - - -class PhenotypeAnnotation(BaseModel): - associated_drugs: QuotedList - association_significance: QuotedStr - meatbolizer_info: Optional[QuotedStr] - specialty_populations: QuotedStr - sentence_summary: str - notes: Optional[str] - - -def get_association_background_prompt(variant_association: VariantAssociation): - background_prompt = "" - background_prompt += f"Variant ID: {variant_association.variant.content}\n" - background_prompt += ( - f"Association Summary: {variant_association.association_summary}\n" - ) - return background_prompt - - -KEY_QUESTION = """ -This article contains information on the following variant association: -{association_background} - -We are interested in completing a Phenotype Annotation report that is specifically interested in associations between genetic variants -and adverse drug reactions, toxicities, or clinical outcomes that represent: -- Toxicity/Safety outcomes -- Clinical phenotypes/adverse events - -Term: Drug(s) -- Content: Nme(s) of the drug(s) associated with the variant as part of this association along with a one sentence -description of the results. Convert the drug names to their generic before outputting if possible but include the original term in parentheses. - -Term: Phenotype Category -- Content: Type of clinical outcome studied (EXACTLY ONE: "Efficacy", "Metabolism/PK", "Toxicity", "Dosage", "Other") -- Example: Efficacy - -Term: Significance -- Content: Was this association statistically significant? Describe the author's reported p-value or relevant statistical values. - -Term: Specialty Population -- Content: Was an age-specific population studied as part of this association? (EXACTLY ONE: "Pediatric", "Geriatric", "No", or "Unknown") - -Term: Sentence -- Content: One sentence summary of the association. 
Make sure to include the following information roughly by following this -rough format: "[Genotype/Allele/Variant] is [associated with/not associated with] [increased/decreased] [outcome] [drug context] [population context]" -- Example: "HLA-B *35:08 is not associated with likelihood of Maculopapular Exanthema, severe cutaneous adverse reactions or Stevens-Johnson Syndrome when treated with lamotrigine in people with Epilepsy." - -Term: Notes -- Content: Any additional key study details, methodology, or important context -- Example: The allele was not significant when comparing allele frequency in cases of severe cutaneous adverse reactions (SCAR), Stevens-Johnson Syndrome (SJS) and Maculopapular Exanthema (MPE) (1/15) and controls (individuals without AEs who took lamotrigine) (0/50). The allele was significant when comparing between cases (1/15) and the general population (1/986)." -""" - -OUTPUT_QUEUES = """ -For each variant, extract all the above information and provide it in structured format - -For each variant, provide: -- All required fields filled with appropriate values or left empty if not applicable -- Ensure controlled vocabulary compliance for categorical fields -- Extract direct quotes from the article to support the annotations -""" - - -def get_phenotype_annotation(variant_association: VariantAssociation | Dict): - if isinstance(variant_association, dict): - variant_association = VariantAssociation(**variant_association) - prompt = GeneratorPrompt( - input_prompt=PromptHydrator( - prompt_template=KEY_QUESTION, - prompt_variables={ - "association_background": get_association_background_prompt( - variant_association - ), - }, - system_prompt=None, - output_format_structure=PhenotypeAnnotation, - ), - output_format_structure=PhenotypeAnnotation, - ).get_hydrated_prompt() - generator = Generator(model="gpt-4o") - return generator.generate(prompt) - - -def test_phenotype_annotations(): - """ - Output the extracted variant associations to a file - """ - 
pmcid = "PMC11730665" - article_text = get_article_text(pmcid) - logger.info(f"Got article text {pmcid}") - associations = get_all_associations(article_text) - - # Save associations - file_path = f"data/extractions/{pmcid}/associations.jsonl" - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w") as f: - json.dump(associations, f, indent=4) - logger.info(f"Saved to file {file_path}") - - logger.info(f"Found {len(associations)} associations") - associations = [VariantAssociation(**association) for association in associations] - phenotype_annotations = [] - for association in associations: - if association.association_type == AssociationType.PHENOTYPE: - phenotype_annotation = get_phenotype_annotation(association) - phenotype_annotations.append(phenotype_annotation) - - logger.info(f"Got drug annotations for {len(phenotype_annotations)} associations") - file_path = f"data/extractions/{pmcid}/phenotype_annotation.jsonl" - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w") as f: - json.dump(phenotype_annotations, f, indent=4) - logger.info(f"Saved to file {file_path}") - - -if __name__ == "main": - test_phenotype_annotations() diff --git a/src/deprecated/phenotype_annotation_extraction.py b/src/deprecated/phenotype_annotation_extraction.py deleted file mode 100644 index 72feaba..0000000 --- a/src/deprecated/phenotype_annotation_extraction.py +++ /dev/null @@ -1,214 +0,0 @@ -""" -Extract detailed phenotype annotation information for variants with phenotype associations. 
-""" - -from typing import List -from loguru import logger -from pydantic import BaseModel -from src.deprecated.variants import ( - Variant, - PhenotypeAnnotation, - PhenotypeAnnotationList, -) -from src.prompts import PromptVariables, GeneratorPrompt, ParserPrompt -from src.inference import Generator, Parser -from src.utils import get_article_text -from src.config import DEBUG -import json -import time -import random - - -KEY_QUESTION = """ -For the following variants that have been identified as having phenotype associations, extract detailed pharmacogenomic annotation information. - -Variants: {variants} - -Extract the following information for each variant: - -Term: Variant/Haplotypes -- Content: The specific genetic variant mentioned in the study -- Example: HLA-B*35:08, rs1801272, UGT1A1*28 - -Term: Gene -- Content: Gene symbol associated with the variant -- Example: HLA-B, CYP2A6, UGT1A1 - -Term: Drug(s) -- Content: Drug(s) that caused or were involved in the phenotype -- Example: lamotrigine, sacituzumab govitecan, empty for disease predisposition - -Term: Phenotype Category -- Content: Type of phenotype or outcome studied (EXACTLY ONE: "Toxicity", "Efficacy", "Metabolism/PK", "Dosage", "Other") -- Example: Toxicity - -Term: Significance -- Content: Whether the association was statistically significant (EXACTLY ONE: "yes", "no", "not stated") -- Example: yes - -Term: Notes -- Content: Key study details, statistics, methodology -- Example: "The allele was not significant when comparing allele frequency in cases..." - -Term: Sentence -- Content: Standardized description of the genetic-phenotype association -- Format: "[Variant] is [associated with/not associated with] [increased/decreased] [phenotype outcome] [drug context] [population context]" -- Example: "HLA-B *35:08 is not associated with likelihood of Maculopapular Exanthema, severe cutaneous adverse reactions or Stevens-Johnson Syndrome when treated with lamotrigine in people with Epilepsy." 
- -Term: Alleles -- Content: Specific allele or genotype if different from main variant field -- Example: *35:08, AA + AT, *1/*28 + *28/*28 - -Term: Specialty Population -- Content: Age-specific populations (EXACTLY ONE: "Pediatric", "Geriatric", or leave empty) - -Term: Metabolizer types -- Content: CYP enzyme phenotype when applicable -- Example: ultrarapid metabolizer, intermediate activity - -Term: isPlural -- Content: Grammar helper for sentence construction (EXACTLY ONE: "Is", "Are") -- Example: Is (for single allele), Are (for combined genotypes) - -Term: Is/Is Not associated -- Content: Direction of statistical association (EXACTLY ONE: "Associated with", "Not associated with") - -Term: Direction of effect -- Content: Whether the variant increases or decreases the phenotype (EXACTLY ONE: "increased", "decreased", or leave empty) - -Term: Side effect/efficacy/other -- Content: Specific outcome descriptor -- Example: likelihood of, risk of - -Term: Phenotype -- Content: Primary phenotype with standardized prefix -- Example: Side Effect:Maculopapular Exanthema, Disease:Epilepsy - -Term: Multiple phenotypes And/or -- Content: Logical connector for multiple phenotypes (EXACTLY ONE: "and", "or", or leave empty) - -Term: When treated with/exposed to/when assayed with -- Content: Drug administration context -- Example: when treated with, when exposed to - -Term: Multiple drugs And/or -- Content: Logical connector for multiple drugs (EXACTLY ONE: "and", "or", or leave empty) - -Term: Population types -- Content: Descriptor of study population -- Example: in people with - -Term: Population Phenotypes or diseases -- Content: Disease/condition context with standardized prefix -- Example: Disease:Epilepsy, Other:Diabetes Mellitus, Type 2 - -Term: Multiple phenotypes or diseases And/or -- Content: Logical connector for multiple conditions (EXACTLY ONE: "and", "or", or leave empty) - -Term: Comparison Allele(s) or Genotype(s) -- Content: Reference genotype used for 
comparison -- Example: *1/*1, C - -Term: Comparison Metabolizer types -- Content: Reference metabolizer status for comparison -- Example: normal metabolizer -""" - -OUTPUT_QUEUES = """ -For each variant, extract all the above information and provide it in structured format. Generate a unique Variant Annotation ID using timestamp + random numbers. - -For each variant, provide: -- All required fields filled with appropriate values or left empty if not applicable -- Ensure controlled vocabulary compliance for categorical fields -- Extract direct quotes from the article to support the annotations -""" - - -def extract_phenotype_annotations( - variants: List[Variant], article_text: str = None, pmcid: str = None -) -> List[PhenotypeAnnotation]: - """ - Extract detailed phenotype annotation information for variants with phenotype associations. - Processes each variant individually for better control and cleaner extraction. - - Args: - variants: List of variants that have phenotype associations - article_text: The text of the article - pmcid: The PMCID of the article - - Returns: - List of PhenotypeAnnotation objects with detailed information - """ - article_text = get_article_text(pmcid=pmcid, article_text=article_text) - variant_id_list = [variant.variant_id for variant in variants] - - logger.info( - f"Extracting phenotype annotations for {len(variants)} variants individually: {variant_id_list}" - ) - - all_annotations = [] - - for variant in variants: - logger.info(f"Processing variant: {variant.variant_id}") - - class SinglePhenotypeAnnotation(BaseModel): - phenotype_annotation: PhenotypeAnnotation - - prompt_variables = PromptVariables( - article_text=article_text, - key_question=KEY_QUESTION.format(variants=[variant]), - output_queues=OUTPUT_QUEUES, - output_format_structure=SinglePhenotypeAnnotation, - ) - - prompt_generator = GeneratorPrompt(prompt_variables) - generator_prompt = prompt_generator.hydrate_prompt() - - generator = Generator(model="gpt-4o-mini", 
temperature=0.1) - response = generator.prompted_generate(generator_prompt) - - parser = Parser(model="gpt-4o-mini", temperature=0.1) - parser_prompt = ParserPrompt( - input_prompt=response, - output_format_structure=SinglePhenotypeAnnotation, - system_prompt=generator_prompt.system_prompt, - ) - parsed_response = parser.prompted_generate(parser_prompt) - - try: - parsed_data = json.loads(parsed_response) - - if isinstance(parsed_data, dict) and "phenotype_annotation" in parsed_data: - annotation_data = parsed_data["phenotype_annotation"] - elif isinstance(parsed_data, dict): - annotation_data = parsed_data - else: - logger.warning( - f"Unexpected response format for variant {variant.variant_id}: {parsed_data}" - ) - continue - - if ( - "variant_annotation_id" not in annotation_data - or not annotation_data["variant_annotation_id"] - ): - annotation_data["variant_annotation_id"] = int( - str(int(time.time())) + str(random.randint(100000, 999999)) - ) - - annotation = PhenotypeAnnotation(**annotation_data) - all_annotations.append(annotation) - logger.info( - f"Successfully extracted phenotype annotation for variant {variant.variant_id}" - ) - - except (json.JSONDecodeError, TypeError, ValueError) as e: - logger.error( - f"Failed to parse phenotype annotation response for variant {variant.variant_id}: {e}" - ) - continue - - logger.info( - f"Successfully extracted {len(all_annotations)} phenotype annotations from {len(variants)} variants" - ) - return all_annotations diff --git a/src/deprecated/variant_association_pipeline.py b/src/deprecated/variant_association_pipeline.py deleted file mode 100644 index cef8588..0000000 --- a/src/deprecated/variant_association_pipeline.py +++ /dev/null @@ -1,218 +0,0 @@ -""" -For an article -1. Extract all variants -2. 
Determine the association type for each variant - -Final output: -- Dictionary of -{ -"drug_associations": List[Variant], -"phenotype_associations": List[Variant], -"functional_associations": List[Variant], -} -""" - -from typing import Dict, List, Optional -from loguru import logger -from src.deprecated.all_variants import extract_all_variants -from src.deprecated.association_types import ( - get_association_types, - AssociationType, -) -from src.deprecated.drug_annotation import extract_drug_annotations -from src.deprecated.phenotype_annotation_extraction import ( - extract_phenotype_annotations, -) -from src.deprecated.functional_annotation_extraction import ( - extract_functional_annotations, -) -from src.utils import get_article_text -from src.deprecated.variants import Variant - -from src.config import DEBUG - - -class VariantAssociationPipeline: - """Pipeline to extract variants and determine their association types from an article.""" - - def __init__(self, model: str = "gpt-4o-mini", temperature: float = 0.1): - self.model = model - self.temperature = temperature - - def process_article( - self, article_text: Optional[str] = None, pmcid: Optional[str] = None - ) -> Dict[str, List[Variant]]: - """ - Process an article to extract variants and determine their association types. 
- - Args: - article_text: The text of the article - pmcid: The PMCID of the article - - Returns: - Dictionary with lists of variants for each association type and detailed drug annotations - """ - # Get article text - article_text = get_article_text(pmcid=pmcid, article_text=article_text) - - # Step 1: Extract all variants - logger.info("Step 1: Extracting variants from article") - variants = extract_all_variants(article_text, pmcid) - logger.info( - f"Extracted {len(variants)} variants: {[v.variant_id for v in variants]}" - ) - - if not variants: - logger.warning("No variants found in article") - return { - "drug_associations": [], - "phenotype_associations": [], - "functional_associations": [], - "drug_annotations": [], - "phenotype_annotations": [], - "functional_annotations": [], - } - - # Step 2: Determine association types for all variants - logger.info("Step 2: Determining association types for variants") - association_types_result = get_association_types(variants, article_text, pmcid) - - if association_types_result is None: - logger.error("Failed to determine association types") - return { - "drug_associations": [], - "phenotype_associations": [], - "functional_associations": [], - "drug_annotations": [], - "phenotype_annotations": [], - "functional_annotations": [], - } - - # Step 3: Categorize variants by association type - logger.info("Step 3: Categorizing variants by association type") - result = self._categorize_variants(variants, association_types_result) - - drug_annotations = [] - phenotype_annotations = [] - functional_annotations = [] - - if result["drug_associations"]: - logger.info("Step 4a: Extracting detailed drug annotations") - drug_annotations = extract_drug_annotations( - result["drug_associations"], article_text, pmcid - ) - logger.info(f"Extracted {len(drug_annotations)} detailed drug annotations") - - if result["phenotype_associations"]: - logger.info("Step 4b: Extracting detailed phenotype annotations") - phenotype_annotations = 
extract_phenotype_annotations( - result["phenotype_associations"], article_text, pmcid - ) - logger.info( - f"Extracted {len(phenotype_annotations)} detailed phenotype annotations" - ) - - if result["functional_associations"]: - logger.info("Step 4c: Extracting detailed functional annotations") - functional_annotations = extract_functional_annotations( - result["functional_associations"], article_text, pmcid - ) - logger.info( - f"Extracted {len(functional_annotations)} detailed functional annotations" - ) - - result["drug_annotations"] = drug_annotations - result["phenotype_annotations"] = phenotype_annotations - result["functional_annotations"] = functional_annotations - - logger.info( - f"Final categorization: {len(result['drug_associations'])} drug, " - f"{len(result['phenotype_associations'])} phenotype, " - f"{len(result['functional_associations'])} functional associations, " - f"{len(result['drug_annotations'])} detailed drug annotations, " - f"{len(result['phenotype_annotations'])} detailed phenotype annotations, " - f"{len(result['functional_annotations'])} detailed functional annotations" - ) - - return result - - def _categorize_variants( - self, variants: List[Variant], association_types: List[AssociationType] - ) -> Dict[str, List[Variant]]: - """ - Categorize variants based on their association types. 
- - Args: - variants: List of variants - association_types: List of association type results - - Returns: - Dictionary with variants categorized by association type - """ - drug_associations = [] - phenotype_associations = [] - functional_associations = [] - - # Create a mapping from variant_id to association_type for easy lookup - variant_to_association = { - assoc.variant.variant_id: assoc for assoc in association_types - } - - for variant in variants: - association = variant_to_association.get(variant.variant_id) - - if association is None: - logger.warning( - f"No association type found for variant {variant.variant_id}" - ) - continue - - # Categorize based on association types - if association.drug_association: - drug_associations.append(variant) - if DEBUG: - logger.debug( - f"Variant {variant.variant_id} has drug association: {association.drug_association_explanation}" - ) - - if association.phenotype_association: - phenotype_associations.append(variant) - if DEBUG: - logger.debug( - f"Variant {variant.variant_id} has phenotype association: {association.phenotype_association_explanation}" - ) - - if association.functional_association: - functional_associations.append(variant) - if DEBUG: - logger.debug( - f"Variant {variant.variant_id} has functional association: {association.functional_association_explanation}" - ) - - return { - "drug_associations": drug_associations, - "phenotype_associations": phenotype_associations, - "functional_associations": functional_associations, - } - - -def run_variant_association_pipeline( - article_text: Optional[str] = None, - pmcid: Optional[str] = None, - model: str = "gpt-4o-mini", - temperature: float = 0.1, -) -> Dict[str, List]: - """ - Convenience function to run the variant association pipeline. 
- - Args: - article_text: The text of the article - pmcid: The PMCID of the article - model: The LLM model to use - temperature: The temperature for LLM generation - - Returns: - Dictionary with lists of variants for each association type and detailed drug annotations - """ - pipeline = VariantAssociationPipeline(model=model, temperature=temperature) - return pipeline.process_article(article_text=article_text, pmcid=pmcid) diff --git a/src/deprecated/variants.py b/src/deprecated/variants.py deleted file mode 100644 index ef7cc3e..0000000 --- a/src/deprecated/variants.py +++ /dev/null @@ -1,133 +0,0 @@ -from pydantic import BaseModel -from typing import List - - -class ExplainedStr(BaseModel): - content: str - explanation: str - - -class ExplainedList(BaseModel): - contents: List[str] - explanation: str - - -class Variant(BaseModel): - """Variant.""" - - variant_id: str - gene: str | None = None - allele: str | None = None - evidence: str | None = None - - -class VariantList(BaseModel): - """List of variants.""" - - variant_list: List[Variant] - - -class DrugAnnotation(BaseModel): - """Drug annotation with detailed pharmacogenomic information.""" - - variant_annotation_id: int - variant_haplotypes: str - gene: str | None = None - drugs: str - pmid: int - phenotype_category: str - significance: str - notes: str - sentence: str - alleles: str | None = None - specialty_population: str | None = None - metabolizer_types: str | None = None - is_plural: str | None = None - is_is_not_associated: str - direction_of_effect: str | None = None - side_effect_efficacy_other: str | None = None - phenotype: str | None = None - multiple_phenotypes_and_or: str | None = None - when_treated_with_exposed_to: str | None = None - multiple_drugs_and_or: str | None = None - population_types: str | None = None - population_phenotypes_or_diseases: str | None = None - multiple_phenotypes_or_diseases_and_or: str | None = None - comparison_alleles_or_genotypes: str | None = None - 
comparison_metabolizer_types: str | None = None - - -class DrugAnnotationList(BaseModel): - """List of drug annotations for structured output.""" - - drug_annotations: List[DrugAnnotation] - - -class PhenotypeAnnotation(BaseModel): - """Phenotype annotation with detailed pharmacogenomic information.""" - - variant_annotation_id: int - variant_haplotypes: str - gene: str | None = None - drugs: str | None = None - pmid: int - phenotype_category: str - significance: str - notes: str - sentence: str - alleles: str | None = None - specialty_population: str | None = None - metabolizer_types: str | None = None - is_plural: str | None = None - is_is_not_associated: str - direction_of_effect: str | None = None - side_effect_efficacy_other: str | None = None - phenotype: str | None = None - multiple_phenotypes_and_or: str | None = None - when_treated_with_exposed_to: str | None = None - multiple_drugs_and_or: str | None = None - population_types: str | None = None - population_phenotypes_or_diseases: str | None = None - multiple_phenotypes_or_diseases_and_or: str | None = None - comparison_alleles_or_genotypes: str | None = None - comparison_metabolizer_types: str | None = None - - -class PhenotypeAnnotationList(BaseModel): - """List of phenotype annotations for structured output.""" - - phenotype_annotations: List[PhenotypeAnnotation] - - -class FunctionalAnnotation(BaseModel): - """Functional annotation with detailed mechanistic information.""" - - variant_annotation_id: int - variant_haplotypes: str - gene: str | None = None - drugs: str | None = None - pmid: int - phenotype_category: str - significance: str - notes: str - sentence: str - alleles: str | None = None - specialty_population: str | None = None - assay_type: str | None = None - metabolizer_types: str | None = None - is_plural: str | None = None - is_is_not_associated: str - direction_of_effect: str | None = None - functional_terms: str | None = None - gene_gene_product: str | None = None - 
when_treated_with_exposed_to: str | None = None - multiple_drugs_and_or: str | None = None - cell_type: str | None = None - comparison_alleles_or_genotypes: str | None = None - comparison_metabolizer_types: str | None = None - - -class FunctionalAnnotationList(BaseModel): - """List of functional annotations for structured output.""" - - functional_annotations: List[FunctionalAnnotation] diff --git a/src/ontology/__init__.py b/src/ontology/__init__.py deleted file mode 100644 index 8cb3482..0000000 --- a/src/ontology/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -from .variant_ontology import ( - NormalizationResult, - BaseNormalizer, - RSIDNormalizer, - StarAlleleNormalizer, -) -from .drug_ontology import DrugNormalizer - -__all__ = [ - "NormalizationResult", - "BaseNormalizer", - "RSIDNormalizer", - "StarAlleleNormalizer", - "DrugNormalizer", -] - -from .variant_search import VariantLookup -from .drug_search import DrugLookup diff --git a/src/ontology/drug_ontology.py b/src/ontology/drug_ontology.py deleted file mode 100644 index c38ab1d..0000000 --- a/src/ontology/drug_ontology.py +++ /dev/null @@ -1,235 +0,0 @@ -from typing import Optional -from loguru import logger -from .variant_ontology import BaseNormalizer, NormalizationResult - -import requests - - -class DrugNormalizer(BaseNormalizer): - """Normalizes drug names, and connect to common ID's per use.""" - - def __init__(self): - super().__init__() - - self.register_handler(self.lookup_drug_pubchem) - - # TODO: insert logic to handle base generic instead of what we have - - self.register_handler(self.lookup_drug_pharmgkb) - self.register_handler(self.lookup_drug_rxnorm) - # register the pubchem first before I register the other. - - def name(self): - return "Drug Normalizer" - - def lookup_drug_pubchem(self, raw: str) -> Optional[NormalizationResult]: - """ - Normalize a raw drug name via PubChem, return structured result. 
- """ - query = raw.strip() - if not query: - logger.debug("Empty drug input, skipping.") - return None - - try: - # Step 1: Fetch CID - cid_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{query}/cids/JSON" - cid_resp = requests.get(cid_url, timeout=5) - cid_resp.raise_for_status() - cid_data = cid_resp.json() - cid_list = cid_data.get("IdentifierList", {}).get("CID", []) - if not cid_list: - logger.debug("No CID found for input: %s", query) - return None - cid = cid_list[0] - - # Step 2: Fetch chemical properties - prop_url = ( - f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/" - f"{cid}/property/IUPACName,MolecularFormula,CanonicalSMILES/JSON" - ) - prop_resp = requests.get(prop_url, timeout=5) - prop_resp.raise_for_status() - prop_data = prop_resp.json() - props = prop_data["PropertyTable"]["Properties"][0] - - return NormalizationResult( - raw_input=raw, - normalized_output=props.get("IUPACName", query), - entity_type="drug", - source="PubChem", - metadata={ - "cid": cid, - "molecular_formula": props.get("MolecularFormula"), - "canonical_smiles": props.get("CanonicalSMILES"), - }, - ) - - except requests.RequestException as exc: - logger.warning("Request failed for '%s': %s", raw, exc) - except Exception as exc: - logger.warning("Unexpected error for '%s': %s", raw, exc) - - return None - - def get_generic_from_brand_pubchem(self, raw: str) -> Optional[str]: - """ - Resolves a brand name to a generic (IUPAC) name using PubChem. - Returns the normalized_output from lookup_drug_pubchem, or None. - """ - result = self.lookup_drug_pubchem(raw) - if result: - return result.normalized_output - return None - - def lookup_drug_pharmgkb(self, raw: str) -> Optional[NormalizationResult]: - """ - Lookup drug info from PharmGKB using its REST API. - Returns all available metadata without filtering. 
- """ - query = raw.strip().lower() - if not query: - logger.debug("Empty drug input for PharmGKB lookup.") - return None - - try: - url = ( - "https://api.pharmgkb.org/v1/data/chemical" - f"?name={requests.utils.quote(query)}&view=max" - ) - headers = {"accept": "application/json"} - response = requests.get(url, headers=headers, timeout=5) - response.raise_for_status() - data = response.json() - - results = data.get("data", []) - if not results: - logger.debug("No PharmGKB chemical match found for: %s", query) - return None - - entry = results[0] # Always take the first match - - return NormalizationResult( - raw_input=raw, - normalized_output=entry.get("name", raw), - entity_type="drug", - source="PharmGKB", - metadata=entry, # Store the entire returned dictionary - ) - - except requests.RequestException as exc: - logger.warning(f"PharmGKB request failed for '{raw}': {exc}") - except Exception as exc: - logger.warning( - f"Unexpected error during PharmGKB lookup for '{raw}': {exc}" - ) - - return None - - def lookup_drug_rxnorm(self, raw: str) -> Optional[NormalizationResult]: - """ - Resolves a drug name (brand or generic) using the RxNorm API. - Returns a NormalizationResult with the generic name and RxNorm metadata. 
- """ - query = raw.strip() - if not query: - logger.debug("Empty drug input for RxNorm lookup.") - return None - - try: - # Step 1: Get RxCUI for input name - rxcui_url = f"https://rxnav.nlm.nih.gov/REST/rxcui.json?name={requests.utils.quote(query)}" - rxcui_resp = requests.get(rxcui_url, timeout=5) - rxcui_resp.raise_for_status() - rxcui_data = rxcui_resp.json() - rxcui_list = rxcui_data.get("idGroup", {}).get("rxnormId", []) - if not rxcui_list: - logger.debug("No RxCUI found for input: %s", query) - return None - rxcui = rxcui_list[0] - - # Step 2: Get related ingredient (generic) names from RxCUI - related_url = ( - f"https://rxnav.nlm.nih.gov/REST/rxcui/{rxcui}/related.json?tty=IN" - ) - related_resp = requests.get(related_url, timeout=5) - related_resp.raise_for_status() - related_data = related_resp.json() - - concepts = related_data.get("relatedGroup", {}).get("conceptGroup", []) - ingredients = [] - for group in concepts: - if group.get("tty") == "IN": - for concept in group.get("conceptProperties", []): - ingredients.append(concept.get("name")) - - if not ingredients: - logger.debug("No generic (IN) concept found for RxCUI: %s", rxcui) - return None - - return NormalizationResult( - raw_input=raw, - normalized_output=ingredients[0], # first generic match - entity_type="drug", - source="RxNorm", - metadata={"rxcui": rxcui, "generic_candidates": ingredients}, - ) - - except requests.RequestException as exc: - logger.warning(f"RxNorm request failed for '{raw}': {exc}") - except Exception as exc: - logger.warning(f"Unexpected error in RxNorm lookup for '{raw}': {exc}") - - return None - - -def test_lookup_pubchem(): - normalizer = DrugNormalizer() - drug = "Imatinib" - result = normalizer.lookup_drug_pubchem(drug) - - print(f"\n[PubChem] Input: {drug}") - if result is None: - print("❌ No result returned.") - else: - print("✅ Result:") - print(f" Raw: {result.raw_input}") - print(f" Normalized: {result.normalized_output}") - print(f" Source: {result.source}") 
- print(f" Entity Type: {result.entity_type}") - print(f" CID: {result.metadata.get('cid')}") - print(f" SMILES: {result.metadata.get('canonical_smiles')}") - assert isinstance(result, NormalizationResult) - assert result.source == "PubChem" - assert result.entity_type == "drug" - assert "canonical_smiles" in result.metadata - - -def test_lookup_pharmgkb(): - normalizer = DrugNormalizer() - drug = "Gleevec" # Brand name for Imatinib - print("TEST LOOKUP PHARMGKB") - generic = normalizer.get_generic_from_brand_pubchem("Gleevec") - print(generic) - result = normalizer.lookup_drug_pharmgkb(drug) - - print(f"\n[PharmGKB] Input: {drug}") - if result is None: - print("❌ No result returned.") - else: - print("✅ Result:") - print(f" Raw: {result.raw_input}") - print(f" Normalized: {result.normalized_output}") - print(f" Source: {result.source}") - print(f" Entity Type: {result.entity_type}") - print(f" PharmGKB ID: {result.metadata.get('id')}") - print(f" Brand Names: {result.metadata.get('brandNames')}") - assert isinstance(result, NormalizationResult) - assert result.source == "PharmGKB" - assert result.entity_type == "drug" - assert "id" in result.metadata - - -if __name__ == "__main__": - test_lookup_pharmgkb() - test_lookup_pubchem() diff --git a/src/ontology/term_lookup.py b/src/ontology/term_lookup.py deleted file mode 100644 index d3fa639..0000000 --- a/src/ontology/term_lookup.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Wrapper lookup for Variant and Drug Search -""" - -from src.ontology.variant_search import VariantLookup -from src.ontology.drug_search import DrugLookup -from typing import Optional, List -from src.ontology.variant_search import VariantSearchResult -from src.ontology.drug_search import DrugSearchResult -from enum import Enum - - -class TermType(Enum): - POLYMORPHISM = "polymorphism" - DRUG = "drug" - - -class TermLookup: - def __init__(self): - self.variant_search = VariantLookup() - self.drug_search = DrugLookup() - - def lookup_variant( - self, 
variant: str, threshold: float = 0.8, top_k: int = 1 - ) -> Optional[List[VariantSearchResult]]: - return self.variant_search.search(variant, threshold=threshold, top_k=top_k) - - def lookup_drug( - self, drug: str, threshold: float = 0.8, top_k: int = 1 - ) -> Optional[List[DrugSearchResult]]: - return self.drug_search.search(drug, threshold=threshold, top_k=top_k) - - def search( - self, term: str, term_type: TermType, threshold: float = 0.8, top_k: int = 1 - ) -> Optional[List[VariantSearchResult]] | Optional[List[DrugSearchResult]]: - if term_type == TermType.POLYMORPHISM: - return self.lookup_variant(term, threshold=threshold, top_k=top_k) - elif term_type == TermType.DRUG: - return self.lookup_drug(term, threshold=threshold, top_k=top_k) diff --git a/src/ontology/variant_ontology.py b/src/ontology/variant_ontology.py deleted file mode 100644 index 06d070f..0000000 --- a/src/ontology/variant_ontology.py +++ /dev/null @@ -1,377 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Callable, Dict, Optional, Any, List -from dataclasses import dataclass, field -from loguru import logger -from Bio import Entrez -import requests - - -@dataclass -class NormalizationResult: - raw_input: str - normalized_output: str - entity_type: str # e.g. "variant", "gene", "drug", etc. 
- source: str # where the normalized info came from - metadata: Dict[str, Any] = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict) -> "NormalizationResult": - return cls( - raw_input=data["raw_input"], - normalized_output=data["normalized_output"], - entity_type=data.get("entity_type", "unknown"), - source=data["source"], - metadata=data.get("metadata", {}), - ) - - -class BaseNormalizer(ABC): - def __init__(self): - self._handlers: list[Callable[[str], Optional[dict]]] = [] - - def register_handler(self, handler: Callable[[str], Optional[dict]]): - self._handlers.append(handler) - - def normalize(self, raw: str) -> Optional["NormalizationResult"]: - for handler in self._handlers: - try: - result = handler(raw) - if result: - return result # Assuming result is already a NormalizedEntity - except Exception as e: - logger.exception( - f"Handler '{handler.__name__}' failed on input: '{raw}'" - ) - return None - - @abstractmethod - def name(self) -> str: - pass - - -class RSIDNormalizer(BaseNormalizer): - def __init__(self, email: str, api_key: Optional[str] = None): - super().__init__() - Entrez.email = email - if api_key: - Entrez.api_key = api_key - - self.register_handler(self.lookup_dbsnp) - self.register_handler(self.lookup_pharmgkb_id) - - def name(self) -> str: - return "RSIDNormalizer" - - def lookup_dbsnp(self, raw: str) -> Optional[NormalizationResult]: - rsid = raw.lower().strip() - if not rsid.startswith("rs") or not rsid[2:].isdigit(): - return None - - try: - handle = Entrez.esummary(db="snp", id=rsid[2:], retmode="json") - response = handle.read() - handle.close() - - # Convert JSON string to Python dict - import json - - data = json.loads(response) - - record = data.get("result", {}).get(rsid[2:]) - if not record: - return None - - return NormalizationResult( - raw_input=raw, - normalized_output=rsid, - entity_type="variant", - source="dbSNP", - metadata=record, - ) - - except Exception: - logger.exception(f"lookup_dbsnp 
failed for {raw}") - return None - - def lookup_pharmgkb_id(self, raw: str) -> Optional[NormalizationResult]: - logger.debug(f"Looking up PharmGKB variant by symbol: {raw}") - - base_url = "https://api.pharmgkb.org/v1/data/variant" - params = {"symbol": raw.strip(), "view": "max"} - - try: - response = requests.get(base_url, params=params, timeout=10) - if response.status_code != 200: - logger.warning( - f"PharmGKB lookup failed ({response.status_code}) for {raw}" - ) - return None - - data = response.json() - records = data.get("data", []) - if not records: - logger.info(f"No PharmGKB variant match for symbol: {raw}") - return None - - variant = records[0] - - # Extract only required fields - normalized_output = variant.get("id") - entity_type = "variant" - source = "PharmGKB" - - # Remove known fields so everything else is dumped into metadata - metadata = {k: v for k, v in variant.items() if k not in {"id"}} - - return NormalizationResult( - raw_input=raw, - normalized_output=normalized_output, - entity_type=entity_type, - source=source, - metadata=metadata, - ) - - except Exception: - logger.exception(f"PharmGKB symbol lookup failed for {raw}") - return None - - -class StarAlleleNormalizer(BaseNormalizer): - API_URL = "https://clinicaltables.nlm.nih.gov/api/star_alleles/v3/search" - - def __init__(self): - super().__init__() - self.register_handler(self.lookup_star_allele) - - def name(self): - return "Star Allele Normalizer" - - def lookup_star_allele(self, raw: str) -> Optional[NormalizationResult]: - """ - Normalize a star allele (e.g., CYP2D6*4) using the Clinical Tables API. - Returns a NormalizationResult with detailed metadata. 
- """ - query = raw.strip() - if not query: - logger.debug("Empty star allele input, skipping.") - return None - - try: - alleles = self.fetch_star_alleles(query, max_results=1) - if not alleles: - logger.debug("No star allele found for input: %s", query) - return None - - allele_data = alleles[0] - - return NormalizationResult( - raw_input=raw, - normalized_output=allele_data.get("StarAlleleName", query), - entity_type="variant", - source="PharmVar/Clinical Tables", - metadata=allele_data, - ) - - except Exception as exc: - logger.warning("Star allele lookup failed for '%s': %s", raw, exc) - return None - - def fetch_star_alleles( - self, query: str, max_results: int = 50 - ) -> List[Dict[str, Any]]: - """ - Fetches all star allele records matching the query string from the PharmVar-backed Clinical Tables API. - Returns a list of dictionaries, one per allele, with all available fields populated. - """ - fields = [ - "StarAlleleName", - "GenBank", - "ProteinAffected", - "cDNANucleotideChanges", - "GeneNucleotideChange", - "XbaIHaplotype", - "RFLP", - "OtherNames", - "ProteinChange", - "InVivoEnzymeActivity", - "InVitroEnzymeActivity", - "References", - "ClinicalPhenotype", - "Notes", - ] - - params = {"terms": query, "count": max_results, "ef": ",".join(fields)} - - try: - response = requests.get(self.API_URL, params=params, timeout=10) - response.raise_for_status() - except Exception as e: - logger.error(f"API request failed: {e}") - return [] - - try: - total_count, allele_names, extra_fields, *_ = response.json() - except Exception as e: - logger.error(f"Failed to parse API response: {e}") - return [] - - results = [] - for i, allele in enumerate(allele_names): - allele_info = {"StarAlleleName": allele} - for field, values in extra_fields.items(): - allele_info[field] = values[i] if i < len(values) else None - results.append(allele_info) - - return results - - -def extract_variants_from_annotations(): - """ - Extract and normalize variants from annotation files. 
- This demonstrates the core functionality for mapping variants to normalized ontologies. - """ - import json - import os - import re - from typing import Set, List, Dict, Any - - # Initialize normalizers - rsid_normalizer = RSIDNormalizer(email="test@example.com") - star_normalizer = StarAlleleNormalizer() - - annotation_dir = "data/annotations" - if not os.path.exists(annotation_dir): - print(f"❌ Annotation directory not found: {annotation_dir}") - return - - variants_found: Set[str] = set() - normalized_results: List[Dict[str, Any]] = [] - - print("🔍 Scanning annotation files for variants...") - - # Scan all annotation files - for filename in os.listdir(annotation_dir): - if not filename.endswith(".json"): - continue - - filepath = os.path.join(annotation_dir, filename) - try: - with open(filepath, "r") as f: - data = json.load(f) - - # Extract polymorphisms from annotations - if "annotations" in data and "relationships" in data["annotations"]: - for relationship in data["annotations"]["relationships"]: - polymorphism = relationship.get("polymorphism", "") - - # Extract rsIDs (rs followed by numbers) - rsids = re.findall(r"rs\d+", polymorphism) - variants_found.update(rsids) - - # Extract star alleles (gene*number pattern) - star_alleles = re.findall(r"[A-Z0-9]+\*\d+", polymorphism) - variants_found.update(star_alleles) - - except Exception as e: - print(f"⚠️ Error processing {filename}: {e}") - - print(f"📊 Found {len(variants_found)} unique variants") - - # Normalize each variant - for variant in variants_found: - print(f"\n🧬 Processing variant: {variant}") - - result = None - normalizer_used = None - - # Try rsID normalization first - if variant.startswith("rs"): - result = rsid_normalizer.normalize(variant) - normalizer_used = "RSIDNormalizer" - - # Try star allele normalization if rsID didn't work - if not result and "*" in variant: - result = star_normalizer.normalize(variant) - normalizer_used = "StarAlleleNormalizer" - - if result: - print(f"✅ 
{normalizer_used} successful:") - print(f" Raw: {result.raw_input}") - print(f" Normalized: {result.normalized_output}") - print(f" Source: {result.source}") - print(f" Type: {result.entity_type}") - - normalized_results.append( - { - "raw_variant": variant, - "normalizer": normalizer_used, - "result": result, - } - ) - else: - print(f"❌ No normalization found for {variant}") - - print( - f"\n📈 Summary: {len(normalized_results)}/{len(variants_found)} variants successfully normalized" - ) - return normalized_results - - -def test_individual_normalizers(): - """Test each normalizer with sample data""" - print("\n" + "=" * 50) - print("🧪 TESTING INDIVIDUAL NORMALIZERS") - print("=" * 50) - - # Test RSIDNormalizer - print("\n🧬 Testing RSIDNormalizer:") - rsid_normalizer = RSIDNormalizer(email="test@example.com") - test_rsids = ["rs1799853", "rs1057910", "rs9923231"] - - for rsid in test_rsids: - print(f"\n Testing {rsid}:") - result = rsid_normalizer.normalize(rsid) - if result: - print(f" ✅ Found: {result.normalized_output} from {result.source}") - else: - print(f" ❌ Not found") - - # Test StarAlleleNormalizer - print("\n⭐ Testing StarAlleleNormalizer:") - star_normalizer = StarAlleleNormalizer() - test_alleles = ["CYP2D6*4", "CYP2C9*2", "CYP2C9*3"] - - for allele in test_alleles: - print(f"\n Testing {allele}:") - result = star_normalizer.normalize(allele) - if result: - print(f" ✅ Found: {result.normalized_output} from {result.source}") - if result.metadata: - activity = result.metadata.get("InVivoEnzymeActivity") - if activity: - print(f" 📊 Activity: {activity}") - else: - print(f" ❌ Not found") - - -if __name__ == "__main__": - pass - - print("🎯 AutoGKB Variant Ontology Normalization System") - print("=" * 60) - - # Test individual normalizers first - test_individual_normalizers() - - # Then demonstrate with real annotation data - print("\n" + "=" * 50) - print("📋 PROCESSING ANNOTATION DATA") - print("=" * 50) - - results = extract_variants_from_annotations() - - 
if results: - print(f"\n🎉 Successfully processed annotation data!") - print(f" Normalized {len(results)} variants") - else: - print("\n⚠️ No results from annotation processing") diff --git a/src/term_normalization/README.md b/src/term_normalization/README.md new file mode 100644 index 0000000..f8f5a72 --- /dev/null +++ b/src/term_normalization/README.md @@ -0,0 +1,173 @@ +# Term Normalization + +## Goal +Take incoming variant annotations and replace all terms with normalized identifiers that map to current entries in ClinPGx and PharmGKB. This ensures consistent terminology across pharmacogenomic data. + +## Overview + +The term normalization module provides automated lookup and normalization for: +- **Variants/Alleles**: rsIDs and star alleles +- **Drugs**: Drug names, generic names, and trade names + +## Architecture + +### Main Components + +1. **`term_lookup.py`**: Main entry point providing `TermLookup` class and `normalize_annotation()` function +2. **`variant_search.py`**: Handles variant/allele normalization via `VariantLookup` class +3. **`drug_search.py`**: Handles drug normalization via `DrugLookup` class +4. **`search_utils.py`**: Shared utilities for similarity matching + +## Usage + +### Normalizing an Annotation File + +```python +from pathlib import Path +from src.term_normalization.term_lookup import normalize_annotation + +input_path = Path("data/example_annotation.json") +output_path = Path("data/example_annotation_normalized.json") + +normalize_annotation(input_path, output_path) +``` + +This will: +1. Load the annotation JSON file +2. Normalize all `Variant/Haplotypes` and `Drug(s)` fields in annotation types: `var_pheno_ann`, `var_fa_ann`, `var_drug_ann` +3. Add normalized fields with `_normalized` suffix (e.g., `Variant/Haplotypes_normalized`) +4. 
Include a `term_mappings` section with details about each normalized term + +### Using TermLookup Directly + +```python +from src.term_normalization.term_lookup import TermLookup, TermType + +lookup = TermLookup() + +# Search for a variant +variant_results = lookup.search("rs12345", term_type=TermType.VARIANT, threshold=0.8, top_k=1) + +# Search for a drug +drug_results = lookup.search("aspirin", term_type=TermType.DRUG, threshold=0.8, top_k=1) +``` + +## Variant Normalization + +The `VariantLookup` class handles variant normalization with the following features: + +### Search Strategy + +1. **rsID Lookup** (for variants starting with "rs"): + - Queries PharmGKB API (`/v1/data/variant`) + - Searches local ClinPGx variant database (`data/term_lookup_info/variants.tsv`) + - Searches variant names and synonyms + +2. **Star Allele Lookup** (for variants like *1, *2): + - Queries PharmGKB API (`/v1/data/haplotype`) + - Searches local ClinPGx variant database + +### Return Format + +```python +VariantSearchResult( + raw_input="rs12345", + id="PA166154595", + normalized_term="rs12345", + url="https://www.clinpgx.org/variant/PA166154595", + score=1.0 +) +``` + +## Drug Normalization + +The `DrugLookup` class handles drug normalization with the following features: + +### Search Strategy + +1. **ClinPGx Lookup** (primary): + - Searches drug name in local database (`data/term_lookup_info/drugs.tsv`) + - Searches generic names and trade names + - Returns PharmGKB Accession IDs + +2. 
**RxNorm Lookup** (fallback): + - Queries RxNorm API when ClinPGx search yields no results + - Converts RxCUI to PharmGKB Accession ID using local mapping + - Provides broader drug name coverage + +### Return Format + +```python +DrugSearchResult( + raw_input="etoposide", + id="PA449552", + normalized_term="etoposide", + url="https://www.clinpgx.org/chemical/PA449552", + score=1.0 +) +``` + +## Data Requirements + +The module requires local TSV files in the `data/term_lookup_info/` directory: + +- `variants.tsv`: Variant names, IDs, and synonyms from ClinPGx +- `drugs.tsv`: Drug names, generic names, trade names, RxNorm IDs, and PharmGKB IDs + +## Configuration + +### Parameters + +- **`threshold`** (default: 0.8): Minimum similarity score for fuzzy matching (0.0-1.0) +- **`top_k`** (default: 1): Number of top results to return +- **`data_dir`** (default: "data"): Base directory for lookup TSV files + +### Similarity Matching + +The module uses string similarity (via `calc_similarity` in `search_utils.py`) to match input terms against database entries, allowing for: +- Typos and spelling variations +- Case insensitivity +- Partial matches + +## Output Format + +The `normalize_annotation()` function adds: + +1. **Normalized fields** in each annotation object: + - `Variant/Haplotypes_normalized`: PharmGKB variant ID + - `Drug(s)_normalized`: PharmGKB drug ID + +2. **Term mappings section** at the root level: +```json +{ + "term_mappings": { + "rs6539870": { + "raw_input": "rs6539870", + "id": "PA166154595", + "normalized_term": "rs6539870", + "url": "https://www.clinpgx.org/variant/PA166154595", + "score": 1.0 + }, + "etoposide": { + "raw_input": "etoposide", + "id": "PA449552", + "normalized_term": "etoposide", + "url": "https://www.clinpgx.org/chemical/PA449552", + "score": 1.0 + } + } +} +``` + +## Future Work + +### Gene Normalization +Currently not implemented.
Will require: +- HGNC gene symbol lookup +- Gene ID normalization +- Alias resolution + +### Phenotype Normalization +Currently not implemented. Will require: +- Ontology mapping (HPO, MeSH, etc.) +- Phenotype term standardization diff --git a/src/term_normalization/__init__.py b/src/term_normalization/__init__.py new file mode 100644 index 0000000..8235cfe --- /dev/null +++ b/src/term_normalization/__init__.py @@ -0,0 +1,2 @@ +from .variant_search import VariantLookup +from .drug_search import DrugLookup diff --git a/src/ontology/drug_search.py b/src/term_normalization/drug_search.py similarity index 89% rename from src/ontology/drug_search.py rename to src/term_normalization/drug_search.py index f90a426..a9088b7 100644 --- a/src/ontology/drug_search.py +++ b/src/term_normalization/drug_search.py @@ -1,7 +1,7 @@ from pydantic import BaseModel from typing import List, Optional, Any import requests -from src.ontology.search_utils import ( +from src.term_normalization.search_utils import ( calc_similarity, general_search, general_search_comma_list, @@ -14,10 +14,22 @@ class DrugSearchResult(BaseModel): raw_input: str id: str - name: str + normalized_term: str url: str score: float + def to_dict(self) -> dict: + """ + Return a plain-Python dict representation of the result that is safe for json.dump. + Supports both Pydantic v1 (dict) and v2 (model_dump). 
+ """ + try: + # Pydantic v2 + return self.model_dump() + except AttributeError: # pragma: no cover - v1 fallback + # Pydantic v1 + return self.dict() + # RxNorm Helpers def get_first_rxnorm_candidate(data): @@ -52,10 +64,14 @@ def rxnorm_search(drug_name: str) -> Optional[DrugSearchResult]: name = candidate["name"] score = calc_similarity(drug_name, name) return DrugSearchResult( - raw_input=drug_name, id=f"RXN{rxcui}", name=name, url=url, score=score + raw_input=drug_name, + id=f"RXN{rxcui}", + normalized_term=name, + url=url, + score=score, ) return DrugSearchResult( - raw_input=drug_name, id="", name="Not Found", url="", score=0 + raw_input=drug_name, id="", normalized_term="Not Found", url="", score=0 ) @@ -93,7 +109,7 @@ def _clinpgx_drug_name_search( DrugSearchResult( raw_input=self.raw_input, id=result["PharmGKB Accession Id"], - name=result["Name"], + normalized_term=result["Name"], url=f"https://www.clinpgx.org/chemical/{result['PharmGKB Accession Id']}", score=result["score"], ) @@ -131,7 +147,7 @@ def _clinpgx_drug_alternatives_search( DrugSearchResult( raw_input=self.raw_input, id=result["PharmGKB Accession Id"], - name=result["Name"], + normalized_term=result["Name"], url=f"https://www.clinpgx.org/chemical/{result['PharmGKB Accession Id']}", score=result["score"], ) @@ -187,7 +203,7 @@ def rxcui_to_pa_id(self, rxcui: str) -> Optional[List[DrugSearchResult]]: DrugSearchResult( raw_input=self.raw_input, id=result["PharmGKB Accession Id"], - name=result["Name"], + normalized_term=result["Name"], url=f"https://www.clinpgx.org/chemical/{result['PharmGKB Accession Id']}", score=result["score"], ) diff --git a/src/ontology/search_utils.py b/src/term_normalization/search_utils.py similarity index 100% rename from src/ontology/search_utils.py rename to src/term_normalization/search_utils.py diff --git a/src/term_normalization/term_lookup.py b/src/term_normalization/term_lookup.py new file mode 100644 index 0000000..6d4161c --- /dev/null +++ 
b/src/term_normalization/term_lookup.py @@ -0,0 +1,116 @@ +""" +Wrapper lookup for Variant and Drug Search +""" + +from src.term_normalization.variant_search import VariantLookup +from src.term_normalization.drug_search import DrugLookup +from typing import Optional, List +from src.term_normalization.variant_search import VariantSearchResult +from src.term_normalization.drug_search import DrugSearchResult +from enum import Enum +import shutil +import json +import os +from loguru import logger +from pathlib import Path + + +class TermType(Enum): + VARIANT = "variant" + DRUG = "drug" + + +class TermLookup: + def __init__(self): + self.variant_search = VariantLookup() + self.drug_search = DrugLookup() + + def lookup_variant( + self, variant: str, threshold: float = 0.8, top_k: int = 1 + ) -> Optional[List[VariantSearchResult]]: + return self.variant_search.search(variant, threshold=threshold, top_k=top_k) + + def lookup_drug( + self, drug: str, threshold: float = 0.8, top_k: int = 1 + ) -> Optional[List[DrugSearchResult]]: + return self.drug_search.search(drug, threshold=threshold, top_k=top_k) + + def search( + self, term: str, term_type: TermType, threshold: float = 0.8, top_k: int = 1 + ) -> Optional[List[VariantSearchResult]] | Optional[List[DrugSearchResult]]: + if term_type == TermType.VARIANT: + return self.lookup_variant(term, threshold=threshold, top_k=top_k) + elif term_type == TermType.DRUG: + return self.lookup_drug(term, threshold=threshold, top_k=top_k) + + +def normalize_annotation(input_annotation: Path, output_annotation: Path): + """ + Take a JSON file with a single annotation and normalize the terms using the TermLookup class. + Output a new JSON file with the normalized terms. 
+ + Args: + input_annotation (Path): Path to the raw annotation file + output_annotation (Path): Path to the output file + """ + # Load the annotations file + annotations = None + try: + with open(input_annotation, "r") as f: + annotations = json.load(f) + except Exception as e: + logger.error(f"Failed to load annotations file: {e}") + return + + # Initialize the TermLookup class + term_lookup = TermLookup() + + # Iterate through the annotations and normalize the terms + annotation_types = ["var_pheno_ann", "var_fa_ann", "var_drug_ann"] + saved_mappings = {} + + # Iterate through each annotation type + for ann_type in annotation_types: + if ann_type in annotations: + # Iterate through each annotation in the list + for annotation in annotations[ann_type]: + # Normalize Variant/Haplotypes if present + if ( + "Variant/Haplotypes" in annotation + and annotation["Variant/Haplotypes"] + ): + variant_term = annotation["Variant/Haplotypes"] + results = term_lookup.search( + variant_term, term_type=TermType.VARIANT + ) + if results: + saved_mappings[variant_term] = results[0].to_dict() + annotation["Variant/Haplotypes_normalized"] = results[0].id + + # Normalize Drug(s) if present + if "Drug(s)" in annotation and annotation["Drug(s)"]: + drug_term = annotation["Drug(s)"] + results = term_lookup.search(drug_term, term_type=TermType.DRUG) + if results: + saved_mappings[drug_term] = results[0].to_dict() + annotation["Drug(s)_normalized"] = results[0].id + + # Add saved mappings to annotations + annotations["term_mappings"] = saved_mappings + + # Save the normalized annotations to a file + try: + os.makedirs(output_annotation.parent, exist_ok=True) + with open(output_annotation, "w") as f: + json.dump(annotations, f, indent=4) + except Exception as e: + logger.error(f"Failed to save annotations file: {e}") + return + + logger.info(f"Successfully normalized annotations file: {output_annotation}") + + +if __name__ == "__main__": + input_annotation = 
Path("data/example_annotation.json") + output_annotation = Path("data/example_annotation_normalized.json") + normalize_annotation(input_annotation, output_annotation) diff --git a/src/ontology/variant_search.py b/src/term_normalization/variant_search.py similarity index 88% rename from src/ontology/variant_search.py rename to src/term_normalization/variant_search.py index 5fb9c33..8df7828 100644 --- a/src/ontology/variant_search.py +++ b/src/term_normalization/variant_search.py @@ -1,7 +1,7 @@ from pydantic import BaseModel from typing import List, Optional, Any import requests -from src.ontology.search_utils import ( +from src.term_normalization.search_utils import ( calc_similarity, general_search, general_search_comma_list, @@ -14,10 +14,22 @@ class VariantSearchResult(BaseModel): raw_input: str id: str - name: str + normalized_term: str url: str score: float + def to_dict(self) -> dict: + """ + Return a plain-Python dict representation of the result that is safe for json.dump. + Supports both Pydantic v1 (dict) and v2 (model_dump). 
+ """ + try: + # Pydantic v2 + return self.model_dump() + except AttributeError: # pragma: no cover - v1 fallback + # Pydantic v1 + return self.dict() + def pgkb_star_allele_search( star_allele: str, threshold: float = 0.8, top_k: int = 1 @@ -32,7 +44,7 @@ def pgkb_star_allele_search( VariantSearchResult( raw_input=star_allele, id=result["id"], - name=result["symbol"], + normalized_term=result["symbol"], url=f"https://www.clinpgx.org/haplotype/{result['id']}", score=score, ) @@ -54,7 +66,7 @@ def pgkb_rsid_search( VariantSearchResult( raw_input=rsid, id=result["id"], - name=result["symbol"], + normalized_term=result["symbol"], url=f"https://www.clinpgx.org/variant/{result['id']}", score=score, ) @@ -102,7 +114,7 @@ def _clinpgx_variant_search( VariantSearchResult( raw_input=variant, id=result["Variant ID"], - name=result["Variant Name"], + normalized_term=result["Variant Name"], url=f"https://www.clinpgx.org/variant/{result['Variant ID']}", score=result["score"], )