diff --git a/src/ontology_module/__init__.py b/src/ontology_module/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/ontology_module/drug_ontology.py b/src/ontology_module/drug_ontology.py
new file mode 100644
index 0000000..2ff477f
--- /dev/null
+++ b/src/ontology_module/drug_ontology.py
@@ -0,0 +1,286 @@
+"""Drug-name normalization backed by PubChem, PharmGKB and RxNorm.
+
+Usage: build a ``DrugNormalizer`` and call ``normalize(name)`` to try the
+registered handlers in order, or call one of the ``lookup_drug_*`` methods
+directly to query a single source.
+"""
+
+from typing import Optional
+import logging
+from .variant_ontology import BaseNormalizer, NormalizationResult
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+class DrugNormalizer(BaseNormalizer):
+    """Normalize drug names and connect them to commonly used identifiers.
+
+    The inherited ``normalize`` tries PubChem first and PharmGKB second.
+    ``lookup_drug_rxnorm`` can be called directly but is not registered as a
+    handler.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Registration order is lookup order: PubChem before PharmGKB.
+        self.register_handler(self.lookup_drug_pubchem)
+        # TODO: insert logic to handle the base generic name instead of what we have
+        self.register_handler(self.lookup_drug_pharmgkb)
+
+    def name(self) -> str:
+        """Return the display name of this normalizer."""
+        return "Drug Normalizer"
+
+    def lookup_drug_pubchem(self, raw: str) -> Optional[NormalizationResult]:
+        """Normalize a raw drug name via PubChem.
+
+        Resolves the name to its first PubChem CID, then fetches the IUPAC
+        name, molecular formula and canonical SMILES for that CID.
+
+        Returns:
+            A NormalizationResult whose ``normalized_output`` is the IUPAC
+            name (or the stripped input when PubChem returns none), or None
+            when the input is empty, no CID is found, or the lookup fails.
+        """
+        query = raw.strip()
+        if not query:
+            logger.debug("Empty drug input, skipping.")
+            return None
+
+        try:
+            # Step 1: Fetch CID. The name is a URL path segment, so it must
+            # be percent-encoded; a raw space, '/', '?' or '#' would otherwise
+            # change the meaning of the URL.
+            encoded_name = requests.utils.quote(query, safe="")
+            cid_url = (
+                "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
+                f"{encoded_name}/cids/JSON"
+            )
+            cid_resp = requests.get(cid_url, timeout=5)
+            cid_resp.raise_for_status()
+            cid_data = cid_resp.json()
+            cid_list = cid_data.get("IdentifierList", {}).get("CID", [])
+            if not cid_list:
+                logger.debug("No CID found for input: %s", query)
+                return None
+            cid = cid_list[0]
+
+            # Step 2: Fetch chemical properties
+            prop_url = (
+                f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/"
+                f"{cid}/property/IUPACName,MolecularFormula,CanonicalSMILES/JSON"
+            )
+            prop_resp = requests.get(prop_url, timeout=5)
+            prop_resp.raise_for_status()
+            prop_data = prop_resp.json()
+            props = prop_data["PropertyTable"]["Properties"][0]
+
+            return NormalizationResult(
+                raw_input=raw,
+                normalized_output=props.get("IUPACName", query),
+                entity_type="drug",
+                source="PubChem",
+                metadata={
+                    "cid": cid,
+                    "molecular_formula": props.get("MolecularFormula"),
+                    "canonical_smiles": props.get("CanonicalSMILES"),
+                },
+            )
+
+        except requests.RequestException as exc:
+            logger.warning("Request failed for '%s': %s", raw, exc)
+        except Exception as exc:
+            # Deliberate best effort: a malformed payload must not crash callers.
+            logger.warning("Unexpected error for '%s': %s", raw, exc)
+
+        return None
+
+    def get_generic_from_brand_pubchem(self, raw: str) -> Optional[str]:
+        """Resolve a drug name to the name PubChem reports for it.
+
+        The value is the ``normalized_output`` of ``lookup_drug_pubchem``, i.e.
+        the IUPAC name when PubChem provides one. That is a systematic
+        chemical name, not a generic (ingredient) name.
+
+        Returns:
+            The normalized name, or None when the PubChem lookup yields nothing.
+        """
+        result = self.lookup_drug_pubchem(raw)
+        if result:
+            return result.normalized_output
+        return None
+
+    def lookup_drug_pharmgkb(self, raw: str) -> Optional[NormalizationResult]:
+        """Look up drug info from PharmGKB using its REST API.
+
+        The first match is used and its whole record is stored, unfiltered, as
+        the result's metadata.
+
+        Returns:
+            A NormalizationResult for the first match, or None when the input
+            is empty, nothing matches, or the lookup fails.
+        """
+        query = raw.strip().lower()
+        if not query:
+            logger.debug("Empty drug input for PharmGKB lookup.")
+            return None
+
+        try:
+            url = (
+                "https://api.pharmgkb.org/v1/data/chemical"
+                f"?name={requests.utils.quote(query)}&view=max"
+            )
+            headers = {"accept": "application/json"}
+            response = requests.get(url, headers=headers, timeout=5)
+            response.raise_for_status()
+            data = response.json()
+
+            results = data.get("data", [])
+            if not results:
+                logger.debug("No PharmGKB chemical match found for: %s", query)
+                return None
+
+            entry = results[0]  # Always take the first match
+
+            return NormalizationResult(
+                raw_input=raw,
+                normalized_output=entry.get("name", raw),
+                entity_type="drug",
+                source="PharmGKB",
+                metadata=entry,  # Store the entire returned dictionary
+            )
+
+        except requests.RequestException as exc:
+            logger.warning("PharmGKB request failed for '%s': %s", raw, exc)
+        except Exception as exc:
+            logger.warning("Unexpected error during PharmGKB lookup for '%s': %s", raw, exc)
+
+        return None
+
+    def lookup_drug_rxnorm(self, raw: str) -> Optional[NormalizationResult]:
+        """Resolve a drug name (brand or generic) using the RxNorm API.
+
+        Returns:
+            A NormalizationResult whose ``normalized_output`` is the first
+            ingredient (generic) name and whose metadata holds the RxCUI and
+            every ingredient candidate, or None when the input is empty,
+            nothing is found, or the lookup fails.
+        """
+        query = raw.strip()
+        if not query:
+            logger.debug("Empty drug input for RxNorm lookup.")
+            return None
+
+        try:
+            # Step 1: Get RxCUI for input name
+            rxcui_url = f"https://rxnav.nlm.nih.gov/REST/rxcui.json?name={requests.utils.quote(query)}"
+            rxcui_resp = requests.get(rxcui_url, timeout=5)
+            rxcui_resp.raise_for_status()
+            rxcui_data = rxcui_resp.json()
+            rxcui_list = rxcui_data.get("idGroup", {}).get("rxnormId", [])
+            if not rxcui_list:
+                logger.debug("No RxCUI found for input: %s", query)
+                return None
+            rxcui = rxcui_list[0]
+
+            # Step 2: Get related ingredient (generic) names from RxCUI
+            related_url = f"https://rxnav.nlm.nih.gov/REST/rxcui/{rxcui}/related.json?tty=IN"
+            related_resp = requests.get(related_url, timeout=5)
+            related_resp.raise_for_status()
+            related_data = related_resp.json()
+
+            concepts = related_data.get("relatedGroup", {}).get("conceptGroup", [])
+            ingredients = []
+            for group in concepts:
+                if group.get("tty") != "IN":
+                    continue
+                for concept in group.get("conceptProperties", []):
+                    concept_name = concept.get("name")
+                    if concept_name:  # never offer None as a candidate
+                        ingredients.append(concept_name)
+
+            if not ingredients:
+                logger.debug("No generic (IN) concept found for RxCUI: %s", rxcui)
+                return None
+
+            return NormalizationResult(
+                raw_input=raw,
+                normalized_output=ingredients[0],  # first generic match
+                entity_type="drug",
+                source="RxNorm",
+                metadata={
+                    "rxcui": rxcui,
+                    "generic_candidates": ingredients,
+                },
+            )
+
+        except requests.RequestException as exc:
+            logger.warning("RxNorm request failed for '%s': %s", raw, exc)
+        except Exception as exc:
+            logger.warning("Unexpected error in RxNorm lookup for '%s': %s", raw, exc)
+
+        return None
+
+
+def test_lookup_pubchem():
+    """Live smoke test: look up "Imatinib" in PubChem (needs network access)."""
+    normalizer = DrugNormalizer()
+    drug = "Imatinib"
+    result = normalizer.lookup_drug_pubchem(drug)
+
+    print(f"\n[PubChem] Input: {drug}")
+    if result is None:
+        print("❌ No result returned.")
+    else:
+        print("✅ Result:")
+        print(f" Raw: {result.raw_input}")
+        print(f" Normalized: {result.normalized_output}")
+        print(f" Source: {result.source}")
+        print(f" Entity Type: {result.entity_type}")
+        print(f" CID: {result.metadata.get('cid')}")
+        print(f" SMILES: {result.metadata.get('canonical_smiles')}")
+        assert isinstance(result, NormalizationResult)
+        assert result.source == "PubChem"
+        assert result.entity_type == "drug"
+        assert "canonical_smiles" in result.metadata
+
+
+def test_lookup_pharmgkb():
+    """Live smoke test: look up "Gleevec" in PharmGKB (needs network access)."""
+    normalizer = DrugNormalizer()
+    drug = "Gleevec"  # Brand name for Imatinib
+    print("TEST LOOKUP PHARMGKB")
+    generic = normalizer.get_generic_from_brand_pubchem("Gleevec")
+    print(generic)
+    result = normalizer.lookup_drug_pharmgkb(drug)
+
+    print(f"\n[PharmGKB] Input: {drug}")
+    if result is None:
+        print("❌ No result returned.")
+    else:
+        print("✅ Result:")
+        print(f" Raw: {result.raw_input}")
+        print(f" Normalized: {result.normalized_output}")
+        print(f" Source: {result.source}")
+        print(f" Entity Type: {result.entity_type}")
+        print(f" PharmGKB ID: {result.metadata.get('id')}")
+        print(f" Brand Names: {result.metadata.get('brandNames')}")
+        assert isinstance(result, NormalizationResult)
+        assert result.source == "PharmGKB"
+        assert result.entity_type == "drug"
+        assert "id" in result.metadata
+
+
+if __name__ == "__main__":
+    test_lookup_pubchem()
+
+    test_lookup_pharmgkb()
+    normalizer = DrugNormalizer()
+    result = normalizer.lookup_drug_rxnorm("Gleevec")
+    # The lookup returns None on any failure, so guard before dereferencing.
+    if result is None:
+        print("No RxNorm result returned.")
+    else:
+        print(result.normalized_output)  # → "imatinib"
diff --git a/src/ontology_module/variant_ontology.py b/src/ontology_module/variant_ontology.py
new file mode 100644
index 0000000..de67242
--- /dev/null
+++ b/src/ontology_module/variant_ontology.py
@@ -0,0 +1,271 @@
+"""Shared normalization primitives and variant (rsID / star-allele) normalizers."""
+
+from abc import ABC, abstractmethod
+from typing import Callable, Dict, Optional, Any, List
+from dataclasses import dataclass, field
+import json
+import logging
+from Bio import Entrez
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class NormalizationResult:
+    """Outcome of normalizing one raw input against one source."""
+
+    raw_input: str
+    normalized_output: str
+    entity_type: str  # e.g. "variant", "gene", "drug", etc.
+    source: str  # where the normalized info came from
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "NormalizationResult":
+        """Build a result from a plain dict.
+
+        ``raw_input``, ``normalized_output`` and ``source`` are required keys
+        (KeyError otherwise); ``entity_type`` defaults to "unknown" and
+        ``metadata`` to an empty dict.
+        """
+        return cls(
+            raw_input=data["raw_input"],
+            normalized_output=data["normalized_output"],
+            entity_type=data.get("entity_type", "unknown"),
+            source=data["source"],
+            metadata=data.get("metadata", {}),
+        )
+
+
+# A handler maps a raw string to a result, or None when it cannot resolve it.
+Handler = Callable[[str], Optional[NormalizationResult]]
+
+
+class BaseNormalizer(ABC):
+    """Chain of lookup handlers that are tried in registration order."""
+
+    def __init__(self):
+        self._handlers: List[Handler] = []
+
+    def register_handler(self, handler: Handler):
+        """Append ``handler`` to the end of the lookup chain."""
+        self._handlers.append(handler)
+
+    def normalize(self, raw: str) -> Optional["NormalizationResult"]:
+        """Return the first truthy handler result for ``raw``, else None.
+
+        A handler that raises is logged and skipped so that the remaining
+        handlers still get a chance.
+        """
+        for handler in self._handlers:
+            try:
+                result = handler(raw)
+            except Exception:
+                # Not every callable has __name__ (e.g. functools.partial).
+                handler_name = getattr(handler, "__name__", repr(handler))
+                logger.exception(
+                    "Handler '%s' failed on input: '%s'", handler_name, raw
+                )
+                continue
+            if result:
+                return result
+        return None
+
+    @abstractmethod
+    def name(self) -> str:
+        """Return a human-readable name for this normalizer."""
+
+
+class RSIDNormalizer(BaseNormalizer):
+    """Normalize rsIDs via dbSNP (NCBI Entrez) first, then PharmGKB."""
+
+    def __init__(self, email: str, api_key: Optional[str] = None):
+        super().__init__()
+        # These are assigned on the Entrez module itself, not on this instance.
+        Entrez.email = email
+        if api_key:
+            Entrez.api_key = api_key
+
+        self.register_handler(self.lookup_dbsnp)
+        self.register_handler(self.lookup_pharmgkb_id)
+
+    def name(self) -> str:
+        return "RSIDNormalizer"
+
+    def lookup_dbsnp(self, raw: str) -> Optional[NormalizationResult]:
+        """Look up an rsID ("rs" followed by digits) in dbSNP.
+
+        Returns:
+            A NormalizationResult carrying the lower-cased rsID and the dbSNP
+            summary record as metadata, or None when the input is not an rsID,
+            dbSNP has no record for it, or the lookup fails (which is logged).
+        """
+        rsid = raw.lower().strip()
+        if not rsid.startswith("rs") or not rsid[2:].isdigit():
+            return None
+        snp_id = rsid[2:]  # dbSNP is queried with the numeric part only
+
+        try:
+            handle = Entrez.esummary(db="snp", id=snp_id, retmode="json")
+            try:
+                response = handle.read()
+            finally:
+                # Release the connection even when read() raises.
+                handle.close()
+
+            # Convert JSON string to Python dict
+            data = json.loads(response)
+
+            record = data.get("result", {}).get(snp_id)
+            if not record:
+                return None
+
+            return NormalizationResult(
+                raw_input=raw,
+                normalized_output=rsid,
+                entity_type="variant",
+                source="dbSNP",
+                metadata=record,
+            )
+
+        except Exception:
+            logger.exception("lookup_dbsnp failed for %s", raw)
+            return None
+
+    def lookup_pharmgkb_id(self, raw: str) -> Optional[NormalizationResult]:
+        """Look up a variant in PharmGKB by symbol.
+
+        Returns:
+            A NormalizationResult whose ``normalized_output`` is the id of the
+            first matching record and whose metadata is every other field of
+            that record, or None when the input is empty, nothing matches, the
+            record has no id, or the lookup fails.
+        """
+        symbol = raw.strip()
+        if not symbol:
+            logger.debug("Empty variant input for PharmGKB lookup.")
+            return None
+        logger.debug("Looking up PharmGKB variant by symbol: %s", raw)
+
+        base_url = "https://api.pharmgkb.org/v1/data/variant"
+        params = {
+            "symbol": symbol,
+            "view": "max",
+        }
+
+        try:
+            response = requests.get(base_url, params=params, timeout=10)
+            if response.status_code != 200:
+                logger.warning("PharmGKB lookup failed (%s) for %s", response.status_code, raw)
+                return None
+
+            data = response.json()
+            records = data.get("data", [])
+            if not records:
+                logger.info("No PharmGKB variant match for symbol: %s", raw)
+                return None
+
+            variant = records[0]
+            variant_id = variant.get("id")
+            if not variant_id:
+                # normalized_output is declared as str, so a record without an
+                # id cannot produce a valid result.
+                logger.warning("PharmGKB variant record without id for %s", raw)
+                return None
+
+            # Everything except the id is dumped into metadata.
+            metadata = {k: v for k, v in variant.items() if k != "id"}
+
+            return NormalizationResult(
+                raw_input=raw,
+                normalized_output=variant_id,
+                entity_type="variant",
+                source="PharmGKB",
+                metadata=metadata,
+            )
+
+        except Exception:
+            logger.exception("PharmGKB symbol lookup failed for %s", raw)
+            return None
+
+
+class StarAlleleNormalizer(BaseNormalizer):
+    """Fetch star-allele records from the Clinical Tables search API."""
+
+    API_URL = "https://clinicaltables.nlm.nih.gov/api/star_alleles/v3/search"
+
+    def __init__(self):
+        # The base initializer creates the handler list that the inherited
+        # normalize() iterates; skipping it makes normalize() raise
+        # AttributeError.
+        super().__init__()
+
+    def name(self) -> str:
+        return "Star Allele Normalizer"
+
+    def fetch_star_alleles(self, query: str, max_results: int = 50) -> List[Dict[str, Any]]:
+        """Fetch the star-allele records matching ``query``.
+
+        Args:
+            query: Search string, e.g. "CYP2D6*4".
+            max_results: Value sent as the API's ``count`` parameter.
+
+        Returns:
+            One dict per matched allele, keyed by field name; a field the API
+            returned no value for is None. Empty when the request fails or
+            the response cannot be parsed.
+        """
+        fields = [
+            "StarAlleleName", "GenBank", "ProteinAffected", "cDNANucleotideChanges",
+            "GeneNucleotideChange", "XbaIHaplotype", "RFLP", "OtherNames", "ProteinChange",
+            "InVivoEnzymeActivity", "InVitroEnzymeActivity", "References",
+            "ClinicalPhenotype", "Notes"
+        ]
+
+        params = {
+            "terms": query,
+            "count": max_results,
+            "ef": ",".join(fields)
+        }
+
+        try:
+            response = requests.get(self.API_URL, params=params, timeout=10)
+            response.raise_for_status()
+        except Exception as e:
+            logger.error(f"API request failed: {e}")
+            return []
+
+        try:
+            _total_count, allele_names, extra_fields, *_ = response.json()
+        except Exception as e:
+            logger.error(f"Failed to parse API response: {e}")
+            return []
+
+        # Defensive: treat a null in either slot as "nothing returned" rather
+        # than crashing in the loop below.
+        allele_names = allele_names or []
+        extra_fields = extra_fields or {}
+
+        results = []
+        for i, allele in enumerate(allele_names):
+            allele_info = {
+                "StarAlleleName": allele
+            }
+            # "field_name" avoids shadowing the imported dataclasses.field.
+            for field_name, values in extra_fields.items():
+                allele_info[field_name] = values[i] if values and i < len(values) else None
+            results.append(allele_info)
+
+        return results
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    normalizer = StarAlleleNormalizer()
+    data = normalizer.fetch_star_alleles("CYP2D6*4")
+
+    for record in data:
+        print("\n--- Star Allele Record ---")
+        for k, v in record.items():
+            print(f"{k}: {v}")