From 36ed54225d2806b93d76b0dbec034ec7ac7287fd Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Mon, 1 Dec 2025 14:31:26 +0000 Subject: [PATCH 01/12] ML cookbooks WIP --- cookbook/sepsis_cds_hooks.py | 166 +++++++++++++++++++ cookbook/sepsis_fhir_batch.py | 139 ++++++++++++++++ healthchain/io/containers/dataset.py | 8 +- healthchain/sandbox/sandboxclient.py | 77 +++++---- scripts/extract_mimic_demo_patients.py | 217 +++++++++++++++++++++++++ 5 files changed, 571 insertions(+), 36 deletions(-) create mode 100644 cookbook/sepsis_cds_hooks.py create mode 100644 cookbook/sepsis_fhir_batch.py create mode 100644 scripts/extract_mimic_demo_patients.py diff --git a/cookbook/sepsis_cds_hooks.py b/cookbook/sepsis_cds_hooks.py new file mode 100644 index 00000000..7afcdd75 --- /dev/null +++ b/cookbook/sepsis_cds_hooks.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +""" +Sepsis Risk Prediction via CDS Hooks + +Real-time sepsis alerts when clinician opens patient chart. +Uses pre-extracted MIMIC patient data for fast demos. + +Demo patients extracted from MIMIC-on-FHIR using: + python scripts/extract_mimic_demo_patients.py + +Requirements: +- pip install healthchain joblib xgboost + +Run: +- python sepsis_cds_hooks.py +""" + +from pathlib import Path + +import joblib +from dotenv import load_dotenv + +from healthchain.gateway import HealthChainAPI, CDSHooksService +from healthchain.io import Dataset +from healthchain.models import CDSRequest, CDSResponse +from healthchain.models.responses.cdsresponse import Card +from healthchain.pipeline import Pipeline + +load_dotenv() + +# Configuration +SCRIPT_DIR = Path(__file__).parent +MODEL_PATH = SCRIPT_DIR / "models" / "sepsis_model.pkl" +SCHEMA_PATH = ( + SCRIPT_DIR / ".." / "healthchain" / "configs" / "features" / "sepsis_vitals.yaml" +) +DEMO_PATIENTS_DIR = SCRIPT_DIR / "data" / "mimic_demo_patients" + +# Load model +model_data = joblib.load(MODEL_PATH) +model = model_data["model"] +feature_names = model_data["metadata"]["feature_names"] +threshold = model_data["metadata"]["metrics"].get("optimal_threshold", 0.5) + + +def create_pipeline() -> Pipeline[Dataset]: + """Build sepsis prediction pipeline.""" + pipeline = Pipeline[Dataset]() + + @pipeline.add_node + def impute_missing(dataset: Dataset) -> Dataset: + dataset.data = dataset.data.fillna(dataset.data.median(numeric_only=True)) + return dataset + + @pipeline.add_node + def run_inference(dataset: Dataset) -> Dataset: + features = dataset.data[feature_names] + probabilities = model.predict_proba(features)[:, 1] + dataset.metadata["probabilities"] = probabilities + return dataset + + return pipeline + + +def create_app(): + pipeline = create_pipeline() + cds = CDSHooksService() + + @cds.hook("patient-view", id="sepsis-risk") + def sepsis_alert(request: CDSRequest) -> CDSResponse: + prefetch = request.prefetch or {} + if not prefetch: + return CDSResponse(cards=[]) + + # Merge keyed prefetch into single bundle + # Format: {"patient": {...}, "heart_rate": {"entry": [...]}, ...} + entries = [] + for key, value in prefetch.items(): + if key == "patient": + entries.append({"resource": value}) + elif isinstance(value, dict) and "entry" in value: + entries.extend(value["entry"]) + + bundle = {"type": "collection", "entry": entries} + + # FHIR → Dataset → Prediction + dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH) + result = pipeline(dataset) + + # print("Result:") + # print(result.data.head(10)) + # print("Metadata:") + # print(result.metadata) + + probability = 
float(result.metadata["probabilities"][0]) + risk = ( + "high" if probability > 0.7 else "moderate" if probability > 0.4 else "low" + ) + + if risk in ["high", "moderate"]: + summary = f"⚠️ Sepsis Risk: {risk.upper()} ({probability:.0%})" + indicator = "critical" if risk == "high" else "warning" + detail = ( + "**AI Guidance:**\n" + f"- Predicted risk: **{risk.upper()}** ({probability:.0%})\n" + "- Recommend sepsis workup and early intervention." + ) + title = "Sepsis Alert (AI Prediction)" + source = { + "label": "HealthChain Sepsis Predictor", + "url": "https://www.sccm.org/SurvivingSepsisCampaign/Guidelines/Adult-Patients", + } + return CDSResponse( + cards=[ + Card( + summary=summary, + indicator=indicator, + detail=detail, + source=source, + title=title, + ) + ] + ) + + return CDSResponse(cards=[]) + + app = HealthChainAPI(title="Sepsis CDS Hooks") + app.register_service(cds, path="/cds") + return app + + +app = create_app() + + +if __name__ == "__main__": + import threading + import uvicorn + from time import sleep + from healthchain.sandbox import SandboxClient + + # Start server + def run_server(): + uvicorn.run(app, port=8000, log_level="warning") + + server = threading.Thread(target=run_server, daemon=True) + server.start() + sleep(2) + + # Test with pre-extracted demo patients (fast, realistic per-patient data) + client = SandboxClient( + url="http://localhost:8000/cds/cds-services/sepsis-risk", + workflow="patient-view", + ) + client.load_from_path(DEMO_PATIENTS_DIR) + responses = client.send_requests() + client.save_results(save_request=True, save_response=True, directory="./output/") + + print(f"\nProcessed {len(responses)} requests") + for i, resp in enumerate(responses): + cards = resp.get("cards", []) + if cards: + print(f" Patient {i+1}: {cards[0].get('summary', 'No alert')}") + else: + print(f" Patient {i+1}: Low risk (no alert)") + + server.join() diff --git a/cookbook/sepsis_fhir_batch.py b/cookbook/sepsis_fhir_batch.py new file mode 100644 index 00000000..252c455c --- /dev/null +++ b/cookbook/sepsis_fhir_batch.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +""" +Sepsis Batch Screening with FHIR Gateway + +Batch process patients and write RiskAssessment resources to FHIR server. +Demonstrates querying FHIR server and writing results back. 
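+
+Rough shape of the flow (all names as defined below in this file):
+
+    bundle = MimicOnFHIRLoader().load(...)      # demo stand-in for a server query
+    dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH)
+    result = pipeline(dataset)                  # impute missing values, predict
+    risk_assessments = result.to_risk_assessment(...)
+    for ra in risk_assessments:
+        gateway.create(ra, source="fhir")       # write back to the FHIR server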
+ +Requirements: +- pip install healthchain joblib xgboost python-dotenv + +Environment Variables: +- MEDPLUM_CLIENT_ID, MEDPLUM_CLIENT_SECRET, MEDPLUM_BASE_URL + +Run: +- python sepsis_fhir_batch.py +""" + +from pathlib import Path + +import joblib +from dotenv import load_dotenv + +from healthchain.gateway import HealthChainAPI, FHIRGateway +from healthchain.gateway.clients.fhir.base import FHIRAuthConfig +from healthchain.io import Dataset +from healthchain.pipeline import Pipeline + +load_dotenv() + +# Configuration +SCRIPT_DIR = Path(__file__).parent +MODEL_PATH = SCRIPT_DIR / "models" / "sepsis_model.pkl" +SCHEMA_PATH = "healthchain/configs/features/sepsis_vitals.yaml" + +# Load model +model_data = joblib.load(MODEL_PATH) +model = model_data["model"] +feature_names = model_data["metadata"]["feature_names"] +threshold = model_data["metadata"]["metrics"].get("optimal_threshold", 0.5) + +# FHIR Gateway +config = FHIRAuthConfig.from_env("MEDPLUM") +gateway = FHIRGateway() +gateway.add_source("fhir", config.to_connection_string()) + + +def create_pipeline() -> Pipeline[Dataset]: + """Build sepsis prediction pipeline.""" + pipeline = Pipeline[Dataset]() + + @pipeline.add_node + def impute_missing(dataset: Dataset) -> Dataset: + dataset.data = dataset.data.fillna(dataset.data.median(numeric_only=True)) + return dataset + + @pipeline.add_node + def run_inference(dataset: Dataset) -> Dataset: + features = dataset.data[feature_names] + probabilities = model.predict_proba(features)[:, 1] + predictions = (probabilities >= threshold).astype(int) + dataset.metadata["predictions"] = predictions + dataset.metadata["probabilities"] = probabilities + return dataset + + return pipeline + + +def run_batch_screening(): + """ + Run batch sepsis screening. + + In production: query FHIR server for ICU patients + For demo: load from MIMIC-on-FHIR + """ + from healthchain.sandbox.loaders import MimicOnFHIRLoader + + pipeline = create_pipeline() + + # Load data (production would use: gateway.search(Patient, {"location": "ICU"})) + loader = MimicOnFHIRLoader() + bundle = loader.load( + data_dir="../datasets/mimic-iv-clinical-database-demo-on-fhir-2.1.0/", + resource_types=[ + "MimicObservationChartevents", + "MimicObservationLabevents", + "MimicPatient", + ], + as_dict=True, + ) + + # FHIR → Dataset → Predictions → RiskAssessments + dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH) + result = pipeline(dataset) + + risk_assessments = result.to_risk_assessment( + result.metadata["predictions"], + result.metadata["probabilities"], + outcome_code="A41.9", + outcome_display="Sepsis", + model_name="XGBoost", + ) + + print(f"Processed {len(result)} patients") + high_risk = sum( + 1 + for ra in risk_assessments + if ra.prediction[0].qualitativeRisk.coding[0].code == "high" + ) + print(f"High risk: {high_risk}") + + # Write to FHIR server + for ra in risk_assessments: + gateway.create(ra, source="fhir") + print(f"Created RiskAssessment/{ra.id}") + + return risk_assessments + + +def create_app(): + """Expose batch endpoint via API.""" + app = HealthChainAPI(title="Sepsis Batch Screening") + app.register_gateway(gateway, path="/fhir") + return app + + +app = create_app() + + +if __name__ == "__main__": + import uvicorn + + # Run batch screening + print("=== Batch Sepsis Screening ===") + run_batch_screening() + + # Start API server + print("\n=== FHIR Gateway Server ===") + print("http://localhost:8000/fhir/") + uvicorn.run(app, port=8000) diff --git a/healthchain/io/containers/dataset.py 
b/healthchain/io/containers/dataset.py index 39740be5..99f7966f 100644 --- a/healthchain/io/containers/dataset.py +++ b/healthchain/io/containers/dataset.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, Iterator, List, Union, Optional @@ -23,6 +23,10 @@ class Dataset(DataContainer[pd.DataFrame]): """ A container for tabular data optimized for ML inference, lightweight wrapper around a pandas DataFrame. + Attributes: + data: The pandas DataFrame containing the dataset. + metadata: Dict for storing pipeline results (predictions, probabilities, etc.) + Methods: from_csv: Load Dataset from CSV. from_dict: Load Dataset from dict. @@ -31,6 +35,8 @@ class Dataset(DataContainer[pd.DataFrame]): to_risk_assessment: Convert predictions to FHIR RiskAssessment. """ + metadata: Dict[str, Any] = field(default_factory=dict) + def __post_init__(self): if not isinstance(self.data, pd.DataFrame): raise TypeError("data must be a pandas DataFrame") diff --git a/healthchain/sandbox/sandboxclient.py b/healthchain/sandbox/sandboxclient.py index da121501..c3f59576 100644 --- a/healthchain/sandbox/sandboxclient.py +++ b/healthchain/sandbox/sandboxclient.py @@ -464,7 +464,7 @@ def send_requests(self) -> List[Dict]: log.debug(f"Making POST request to: {self.url}") response = client.post( url=str(self.url), - json=request.model_dump(exclude_none=True), + json=request.model_dump(exclude_none=True, mode="json"), timeout=timeout, ) response.raise_for_status() @@ -472,7 +472,9 @@ def send_requests(self) -> List[Dict]: try: response_data = response.json() cds_response = CDSResponse(**response_data) - responses.append(cds_response.model_dump(exclude_none=True)) + responses.append( + cds_response.model_dump(mode="json", exclude_none=True) + ) except json.JSONDecodeError: log.error( f"Invalid JSON response from {self.url}. " @@ -507,51 +509,56 @@ def send_requests(self) -> List[Dict]: return responses - def save_results(self, directory: Union[str, Path] = "./output/") -> None: + def save_results( + self, + directory: Union[str, Path] = "./output/", + save_request: bool = True, + save_response: bool = True, + ) -> None: """ - Save request and response data to disk. + Save request and/or response data to disk. Args: directory: Directory to save data to (default: "./output/") + save_request: Whether to save request data (default: True) + save_response: Whether to save response data (default: True) """ - if not self.responses: + if not self.responses and save_response: raise RuntimeError( "No responses to save. 
Send requests first using send_requests()" ) save_dir = Path(directory) - request_path = ensure_directory_exists(save_dir / "requests") - - # Determine file extension based on protocol extension = "xml" if self.protocol == ApiProtocol.soap else "json" - # Save requests - if self.protocol == ApiProtocol.soap: - request_data = [request.model_dump_xml() for request in self.requests] - else: - request_data = [ - request.model_dump(exclude_none=True) for request in self.requests - ] - - save_data_to_directory( - request_data, - "request", - self.sandbox_id, - request_path, - extension, - ) - log.info(f"Saved request data at {request_path}/") - - # Save responses - response_path = ensure_directory_exists(save_dir / "responses") - save_data_to_directory( - self.responses, - "response", - self.sandbox_id, - response_path, - extension, - ) - log.info(f"Saved response data at {response_path}/") + if save_request: + request_path = ensure_directory_exists(save_dir / "requests") + if self.protocol == ApiProtocol.soap: + request_data = [request.model_dump_xml() for request in self.requests] + else: + request_data = [ + request.model_dump(mode="json", exclude_none=True) + for request in self.requests + ] + save_data_to_directory( + request_data, + "request", + self.sandbox_id, + request_path, + extension, + ) + log.info(f"Saved request data at {request_path}/") + + if save_response: + response_path = ensure_directory_exists(save_dir / "responses") + save_data_to_directory( + self.responses, + "response", + self.sandbox_id, + response_path, + extension, + ) + log.info(f"Saved response data at {response_path}/") def get_status(self) -> Dict[str, Any]: """ diff --git a/scripts/extract_mimic_demo_patients.py b/scripts/extract_mimic_demo_patients.py new file mode 100644 index 00000000..834df100 --- /dev/null +++ b/scripts/extract_mimic_demo_patients.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +""" +Extract Demo Patient Prefetch from MIMIC-on-FHIR + +Creates CDS Hooks prefetch files with only the observations needed for +sepsis prediction, keyed by feature name. Much smaller than full bundles! + +Customize: + - MIMIC_DIR: Path to your MIMIC-on-FHIR dataset + - MODEL_PATH: Path to your trained model pickle + - SCHEMA_PATH: Feature schema defining which observations to extract + - OUTPUT_DIR: Where to save extracted patient files + - NUM_PATIENTS_PER_RISK: How many patients to extract per risk level + +Run: + python scripts/extract_mimic_demo_patients.py + +Output format: + { + "patient": {...Patient resource...}, + "heart_rate": {"resourceType": "Bundle", "entry": [...]}, + "temperature": {"resourceType": "Bundle", "entry": [...]}, + ... + } +""" + +import json +from pathlib import Path + +import joblib +import yaml + +from healthchain.sandbox.loaders import MimicOnFHIRLoader +from healthchain.io import Dataset +from healthchain.pipeline import Pipeline + +import os + +try: + from dotenv import load_dotenv + + load_dotenv() +except ImportError: + print( + "Warning: dotenv not installed. Please manually set the MIMIC_FHIR_PATH environment variable." 
+ ) + + +# ============================================================================= +# CUSTOMIZE THESE +# ============================================================================= + +MIMIC_DIR = os.getenv("MIMIC_FHIR_PATH") +MODEL_PATH = "cookbook/models/sepsis_model.pkl" +SCHEMA_PATH = "healthchain/configs/features/sepsis_vitals.yaml" +OUTPUT_DIR = Path("cookbook/data/mimic_demo_patients") + +# Number of patients to extract per risk level (high/moderate/low) +NUM_PATIENTS_PER_RISK = 1 + +# ============================================================================= + + +def load_observation_codes(schema_path: str) -> dict: + """Load feature schema and extract observation codes.""" + with open(schema_path) as f: + schema = yaml.safe_load(f) + + codes = {} + for feature_name, config in schema["features"].items(): + if config.get("fhir_resource") == "Observation": + codes[config["code"]] = feature_name + return codes + + +def create_pipeline(model, feature_names) -> Pipeline[Dataset]: + """Build prediction pipeline.""" + pipeline = Pipeline[Dataset]() + + @pipeline.add_node + def impute_missing(dataset: Dataset) -> Dataset: + dataset.data = dataset.data.fillna(dataset.data.median(numeric_only=True)) + return dataset + + @pipeline.add_node + def run_inference(dataset: Dataset) -> Dataset: + features = dataset.data[feature_names] + probabilities = model.predict_proba(features)[:, 1] + dataset.metadata["probabilities"] = probabilities + return dataset + + return pipeline + + +def get_observation_code(resource: dict) -> str: + """Extract MIMIC code from Observation resource.""" + for coding in resource.get("code", {}).get("coding", []): + if "mimic" in coding.get("system", ""): + return coding.get("code", "") + return "" + + +def extract_patient_prefetch(bundle: dict, patient_ref: str, obs_codes: dict) -> dict: + """Extract keyed prefetch for a patient with only needed observations.""" + patient_id = patient_ref.split("/")[-1] + prefetch = {} + feature_obs = {name: [] for name in obs_codes.values()} + + for entry in bundle["entry"]: + resource = entry.get("resource", {}) + resource_type = resource.get("resourceType", "") + + if resource_type == "Patient" and resource.get("id") == patient_id: + prefetch["patient"] = resource + + elif resource_type == "Observation": + subject = resource.get("subject", {}) + if subject.get("reference", "").endswith(patient_id): + code = get_observation_code(resource) + if code in obs_codes: + feature_obs[obs_codes[code]].append(entry) + + for feature_name, entries in feature_obs.items(): + if entries: + prefetch[feature_name] = { + "resourceType": "Bundle", + "type": "searchset", + "entry": entries, + } + + return prefetch + + +def main(): + print("=" * 60) + print("MIMIC Demo Patient Extraction") + print("=" * 60) + + if MIMIC_DIR is None: + print("Error: MIMIC_FHIR_PATH environment variable is not set.") + return + + # Load configs + obs_codes = load_observation_codes(SCHEMA_PATH) + print(f"Features to extract: {list(obs_codes.values())}") + + model_data = joblib.load(MODEL_PATH) + model = model_data["model"] + feature_names = model_data["metadata"]["feature_names"] + print(f"Model features: {feature_names}") + + # Load MIMIC data + print("\nLoading MIMIC-on-FHIR...") + loader = MimicOnFHIRLoader() + bundle = loader.load( + data_dir=MIMIC_DIR, + resource_types=[ + "MimicObservationChartevents", + "MimicObservationLabevents", + "MimicPatient", + ], + as_dict=True, + ) + print(f"Loaded {len(bundle['entry']):,} resources") + + # Run predictions + 
print("\nExtracting features and predicting...") + dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH) + pipeline = create_pipeline(model, feature_names) + result = pipeline(dataset) + + df = result.data.copy() + df["probability"] = result.metadata["probabilities"] + df["risk"] = df["probability"].apply( + lambda p: "high" if p > 0.7 else "moderate" if p > 0.4 else "low" + ) + + print(f"\nRisk distribution ({len(df)} patients):") + print(df["risk"].value_counts().to_string()) + + # Select patients per risk level + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + patients_to_extract = [] + + for risk_level in ["high", "moderate", "low"]: + risk_patients = df[df["risk"] == risk_level] + for i in range(min(NUM_PATIENTS_PER_RISK, len(risk_patients))): + patient = risk_patients.iloc[i] + label = ( + f"{risk_level}_risk" + if NUM_PATIENTS_PER_RISK == 1 + else f"{risk_level}_risk_{i+1}" + ) + patients_to_extract.append((label, patient)) + + # Extract and save + print(f"\nExtracting to {OUTPUT_DIR}/") + for label, patient in patients_to_extract: + prefetch = extract_patient_prefetch(bundle, patient["patient_ref"], obs_codes) + + output_file = OUTPUT_DIR / f"{label}_patient.json" + with open(output_file, "w") as f: + json.dump(prefetch, f, indent=2, default=str) + + obs_count = sum( + len(v.get("entry", [])) for k, v in prefetch.items() if k != "patient" + ) + features_with_data = [k for k in prefetch if k != "patient"] + print( + f" {label}: {patient['probability']:.1%} risk, {obs_count} obs ({', '.join(features_with_data)})" + ) + + print("\nDone! Use these files with SandboxClient.load_from_path()") + + +if __name__ == "__main__": + main() From 4a00c9b5b89b85f2ae3f86d97b6b64ebc283252c Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 10:39:18 +0000 Subject: [PATCH 02/12] Working cookbooks --- cookbook/sepsis_cds_hooks.py | 12 +- cookbook/sepsis_fhir_batch.py | 175 +++++++++----- scripts/extract_mimic_demo_patients.py | 315 +++++++++++++++++-------- 3 files changed, 340 insertions(+), 162 deletions(-) mode change 100644 => 100755 scripts/extract_mimic_demo_patients.py diff --git a/cookbook/sepsis_cds_hooks.py b/cookbook/sepsis_cds_hooks.py index 7afcdd75..d80249af 100644 --- a/cookbook/sepsis_cds_hooks.py +++ b/cookbook/sepsis_cds_hooks.py @@ -2,17 +2,17 @@ """ Sepsis Risk Prediction via CDS Hooks -Real-time sepsis alerts when clinician opens patient chart. -Uses pre-extracted MIMIC patient data for fast demos. +Real-time sepsis alerts triggered when clinician opens a patient chart. +Uses pre-extracted MIMIC patient data for demos. 
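+
+Each demo file is a CDS Hooks prefetch keyed by feature name, roughly:
+
+    {"patient": {...}, "heart_rate": {"entry": [...]}, ...}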
 
 Demo patients extracted from MIMIC-on-FHIR using:
     python scripts/extract_mimic_demo_patients.py
 
 Requirements:
-- pip install healthchain joblib xgboost
+    pip install healthchain joblib xgboost python-dotenv
 
 Run:
-- python sepsis_cds_hooks.py
+    python cookbook/sepsis_cds_hooks.py
 """
 
 from pathlib import Path
@@ -89,8 +89,6 @@ def sepsis_alert(request: CDSRequest) -> CDSResponse:
 
         # print("Result:")
         # print(result.data.head(10))
-        # print("Metadata:")
-        # print(result.metadata)
 
         probability = float(result.metadata["probabilities"][0])
         risk = (
@@ -98,7 +96,7 @@ def sepsis_alert(request: CDSRequest) -> CDSResponse:
         )
 
         if risk in ["high", "moderate"]:
-            summary = f"⚠️ Sepsis Risk: {risk.upper()} ({probability:.0%})"
+            summary = f"Sepsis Risk: {risk.upper()} ({probability:.0%})"
             indicator = "critical" if risk == "high" else "warning"
             detail = (
                 "**AI Guidance:**\n"
diff --git a/cookbook/sepsis_fhir_batch.py b/cookbook/sepsis_fhir_batch.py
index 252c455c..a9ae5880 100644
--- a/cookbook/sepsis_fhir_batch.py
+++ b/cookbook/sepsis_fhir_batch.py
@@ -2,23 +2,26 @@
 """
 Sepsis Batch Screening with FHIR Gateway
 
-Batch process patients and write RiskAssessment resources to FHIR server.
-Demonstrates querying FHIR server and writing results back.
+Query patients from a FHIR server, run batch sepsis predictions, and write
+RiskAssessment resources back. Demonstrates real FHIR server integration.
 
-Requirements:
-- pip install healthchain joblib xgboost python-dotenv
-
-Environment Variables:
-- MEDPLUM_CLIENT_ID, MEDPLUM_CLIENT_SECRET, MEDPLUM_BASE_URL
+Setup:
+    1. Extract and upload demo patients:
+       python scripts/extract_mimic_demo_patients.py --minimal --upload
+    2. Update DEMO_PATIENT_IDS below with the server-assigned IDs
+    3. Set env vars: MEDPLUM_CLIENT_ID, MEDPLUM_CLIENT_SECRET, MEDPLUM_BASE_URL
 
 Run:
-- python sepsis_fhir_batch.py
+    python cookbook/sepsis_fhir_batch.py
 """
 
 from pathlib import Path
+from typing import List
 
 import joblib
 from dotenv import load_dotenv
+from fhir.resources.patient import Patient
+from fhir.resources.observation import Observation
 
 from healthchain.gateway import HealthChainAPI, FHIRGateway
 from healthchain.gateway.clients.fhir.base import FHIRAuthConfig
@@ -30,7 +33,9 @@
 # Configuration
 SCRIPT_DIR = Path(__file__).parent
 MODEL_PATH = SCRIPT_DIR / "models" / "sepsis_model.pkl"
-SCHEMA_PATH = "healthchain/configs/features/sepsis_vitals.yaml"
+SCHEMA_PATH = (
+    SCRIPT_DIR / ".." / "healthchain" / "configs" / "features" / "sepsis_vitals.yaml"
+)
 
 # Load model
 model_data = joblib.load(MODEL_PATH)
@@ -38,10 +43,21 @@
 feature_names = model_data["metadata"]["feature_names"]
 threshold = model_data["metadata"]["metrics"].get("optimal_threshold", 0.5)
 
-# FHIR Gateway
-config = FHIRAuthConfig.from_env("MEDPLUM")
-gateway = FHIRGateway()
-gateway.add_source("fhir", config.to_connection_string())
+# FHIR sources (configure via environment)
+MEDPLUM_URL = None
+EPIC_URL = None
+
+try:
+    config = FHIRAuthConfig.from_env("MEDPLUM")
+    MEDPLUM_URL = config.to_connection_string()
+except Exception:
+    pass
+
+try:
+    config = FHIRAuthConfig.from_env("EPIC")
+    EPIC_URL = config.to_connection_string()
+except Exception:
+    pass
 
 
 def create_pipeline() -> Pipeline[Dataset]:
@@ -65,75 +81,114 @@ def run_inference(dataset: Dataset) -> Dataset:
     return pipeline
 
 
-def run_batch_screening():
-    """
-    Run batch sepsis screening.
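+# Sketch of the per-patient flow implemented by screen_patient below:
+# search the source for the patient's Observations and Patient resource,
+# merge them into a collection Bundle, map to a Dataset via the feature
+# schema, run the pipeline, and write a RiskAssessment back to the source.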
+def screen_patient( + gateway: FHIRGateway, pipeline: Pipeline, patient_id: str, source: str +): + """Screen a single patient for sepsis risk.""" + # Query patient data from FHIR server + obs_bundle = gateway.search( + Observation, {"patient": patient_id, "_count": "100"}, source + ) + patient_bundle = gateway.search(Patient, {"_id": patient_id}, source) - In production: query FHIR server for ICU patients - For demo: load from MIMIC-on-FHIR - """ - from healthchain.sandbox.loaders import MimicOnFHIRLoader + # Merge into single bundle + entries = [] + if patient_bundle.entry: + entries.extend([e.model_dump() for e in patient_bundle.entry]) + if obs_bundle.entry: + entries.extend([e.model_dump() for e in obs_bundle.entry]) - pipeline = create_pipeline() + if not entries: + return None, "No data found" - # Load data (production would use: gateway.search(Patient, {"location": "ICU"})) - loader = MimicOnFHIRLoader() - bundle = loader.load( - data_dir="../datasets/mimic-iv-clinical-database-demo-on-fhir-2.1.0/", - resource_types=[ - "MimicObservationChartevents", - "MimicObservationLabevents", - "MimicPatient", - ], - as_dict=True, - ) + # FHIR → Dataset → Prediction + bundle = {"type": "collection", "entry": entries} + dataset = Dataset.from_fhir_bundle(bundle, schema=str(SCHEMA_PATH)) + + if len(dataset.data) == 0: + return None, "No matching features" - # FHIR → Dataset → Predictions → RiskAssessments - dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH) result = pipeline(dataset) + probability = float(result.metadata["probabilities"][0]) + risk = "high" if probability > 0.7 else "moderate" if probability > 0.4 else "low" + # Create and save RiskAssessment risk_assessments = result.to_risk_assessment( result.metadata["predictions"], result.metadata["probabilities"], outcome_code="A41.9", outcome_display="Sepsis", - model_name="XGBoost", - ) - - print(f"Processed {len(result)} patients") - high_risk = sum( - 1 - for ra in risk_assessments - if ra.prediction[0].qualitativeRisk.coding[0].code == "high" + model_name="sepsis_xgboost_v1", ) - print(f"High risk: {high_risk}") - # Write to FHIR server for ra in risk_assessments: - gateway.create(ra, source="fhir") - print(f"Created RiskAssessment/{ra.id}") + gateway.create(ra, source=source) - return risk_assessments + return risk_assessments[ + 0 + ] if risk_assessments else None, f"{risk.upper()} ({probability:.0%})" + + +def batch_screen(gateway: FHIRGateway, patient_ids: List[str], source: str = "medplum"): + """Screen multiple patients for sepsis risk.""" + pipeline = create_pipeline() + results = [] + + for patient_id in patient_ids: + try: + ra, status = screen_patient(gateway, pipeline, patient_id, source) + if ra: + results.append( + {"patient": patient_id, "status": status, "risk_assessment": ra.id} + ) + print(f" {patient_id}: {status} → RiskAssessment/{ra.id}") + else: + results.append({"patient": patient_id, "status": status}) + print(f" {patient_id}: {status}") + except Exception as e: + results.append({"patient": patient_id, "error": str(e)}) + print(f" {patient_id}: Error - {e}") + + return results def create_app(): - """Expose batch endpoint via API.""" + """Create FHIR gateway app with configured sources.""" + gateway = FHIRGateway() + + # Add configured sources + if MEDPLUM_URL: + gateway.add_source("medplum", MEDPLUM_URL) + print("✓ Medplum configured") + if EPIC_URL: + gateway.add_source("epic", EPIC_URL) + print("✓ Epic configured") + app = HealthChainAPI(title="Sepsis Batch Screening") app.register_gateway(gateway, 
path="/fhir") - return app - -app = create_app() + return app, gateway -if __name__ == "__main__": - import uvicorn +# Create app at module level +app, gateway = create_app() - # Run batch screening - print("=== Batch Sepsis Screening ===") - run_batch_screening() - # Start API server - print("\n=== FHIR Gateway Server ===") - print("http://localhost:8000/fhir/") - uvicorn.run(app, port=8000) +if __name__ == "__main__": + # Demo patient IDs from: python scripts/extract_mimic_demo_patients.py --minimal --upload + # (Update these with server-assigned IDs after upload) + DEMO_PATIENT_IDS = [ + "702e11e8-6d21-41dd-9b48-31715fdc0fb1", # high risk + "3b0da7e9-0379-455a-8d35-bedd3a6ee459", # moderate risk + "f490ceb4-6262-4f1e-8b72-5515e6c46741", # low risk + ] + + # Screen Medplum patients + if MEDPLUM_URL: + print("\n=== Screening patients from Medplum ===") + batch_screen(gateway, DEMO_PATIENT_IDS, source="medplum") + + # Demo Epic connectivity (data may not match sepsis features) + if EPIC_URL: + print("\n=== Epic Sandbox (demo connectivity) ===") + batch_screen(gateway, ["e0w0LEDCYtfckT6N.CkJKCw3"], source="epic") diff --git a/scripts/extract_mimic_demo_patients.py b/scripts/extract_mimic_demo_patients.py old mode 100644 new mode 100755 index 834df100..13526dfb --- a/scripts/extract_mimic_demo_patients.py +++ b/scripts/extract_mimic_demo_patients.py @@ -1,80 +1,76 @@ #!/usr/bin/env python3 """ -Extract Demo Patient Prefetch from MIMIC-on-FHIR - -Creates CDS Hooks prefetch files with only the observations needed for -sepsis prediction, keyed by feature name. Much smaller than full bundles! - -Customize: - - MIMIC_DIR: Path to your MIMIC-on-FHIR dataset - - MODEL_PATH: Path to your trained model pickle - - SCHEMA_PATH: Feature schema defining which observations to extract - - OUTPUT_DIR: Where to save extracted patient files - - NUM_PATIENTS_PER_RISK: How many patients to extract per risk level - -Run: - python scripts/extract_mimic_demo_patients.py - -Output format: - { - "patient": {...Patient resource...}, - "heart_rate": {"resourceType": "Bundle", "entry": [...]}, - "temperature": {"resourceType": "Bundle", "entry": [...]}, - ... - } +Extract Demo Patients from MIMIC-on-FHIR + +Extracts patient data for sepsis prediction demos. Creates small files with +only the observations needed for the model. + +Usage: + # For CDS Hooks demo (prefetch format) + python scripts/extract_mimic_demo_patients.py --minimal + + # For FHIR batch demo (upload to Medplum) + python scripts/extract_mimic_demo_patients.py --minimal --upload + +Output formats: + Default (prefetch for CDS Hooks): + {"patient": {...}, "heart_rate": {"entry": [...]}, ...} + + --bundle (for FHIR server upload): + {"resourceType": "Bundle", "type": "transaction", "entry": [...]} + +Requires: + - MIMIC_FHIR_PATH env var (or --mimic flag) + - MEDPLUM_* env vars (if using --upload) """ +import argparse import json +import os +import uuid from pathlib import Path import joblib import yaml -from healthchain.sandbox.loaders import MimicOnFHIRLoader from healthchain.io import Dataset from healthchain.pipeline import Pipeline - -import os +from healthchain.sandbox.loaders import MimicOnFHIRLoader try: from dotenv import load_dotenv load_dotenv() except ImportError: - print( - "Warning: dotenv not installed. Please manually set the MIMIC_FHIR_PATH environment variable." 
- ) - + pass # ============================================================================= -# CUSTOMIZE THESE +# CONFIGURATION # ============================================================================= -MIMIC_DIR = os.getenv("MIMIC_FHIR_PATH") -MODEL_PATH = "cookbook/models/sepsis_model.pkl" -SCHEMA_PATH = "healthchain/configs/features/sepsis_vitals.yaml" -OUTPUT_DIR = Path("cookbook/data/mimic_demo_patients") +DEFAULT_MODEL_PATH = "cookbook/models/sepsis_model.pkl" +DEFAULT_SCHEMA_PATH = "healthchain/configs/features/sepsis_vitals.yaml" +DEFAULT_OUTPUT_DIR = Path("cookbook/data/mimic_demo_patients") -# Number of patients to extract per risk level (high/moderate/low) -NUM_PATIENTS_PER_RISK = 1 +# ============================================================================= +# HELPER FUNCTIONS # ============================================================================= def load_observation_codes(schema_path: str) -> dict: - """Load feature schema and extract observation codes.""" + """Extract observation codes from feature schema.""" with open(schema_path) as f: schema = yaml.safe_load(f) - - codes = {} - for feature_name, config in schema["features"].items(): - if config.get("fhir_resource") == "Observation": - codes[config["code"]] = feature_name - return codes + return { + config["code"]: name + for name, config in schema["features"].items() + if config.get("fhir_resource") == "Observation" + } def create_pipeline(model, feature_names) -> Pipeline[Dataset]: - """Build prediction pipeline.""" + """Build prediction pipeline for risk stratification.""" pipeline = Pipeline[Dataset]() @pipeline.add_node @@ -85,44 +81,51 @@ def impute_missing(dataset: Dataset) -> Dataset: @pipeline.add_node def run_inference(dataset: Dataset) -> Dataset: features = dataset.data[feature_names] - probabilities = model.predict_proba(features)[:, 1] - dataset.metadata["probabilities"] = probabilities + dataset.metadata["probabilities"] = model.predict_proba(features)[:, 1] return dataset return pipeline def get_observation_code(resource: dict) -> str: - """Extract MIMIC code from Observation resource.""" + """Extract MIMIC code from Observation.""" for coding in resource.get("code", {}).get("coding", []): if "mimic" in coding.get("system", ""): return coding.get("code", "") return "" -def extract_patient_prefetch(bundle: dict, patient_ref: str, obs_codes: dict) -> dict: - """Extract keyed prefetch for a patient with only needed observations.""" +# ============================================================================= +# EXTRACTION FUNCTIONS +# ============================================================================= + + +def extract_patient_prefetch( + bundle: dict, patient_ref: str, obs_codes: dict, minimal: bool = False +) -> dict: + """Extract keyed prefetch for a patient (CDS Hooks format).""" patient_id = patient_ref.split("/")[-1] prefetch = {} feature_obs = {name: [] for name in obs_codes.values()} for entry in bundle["entry"]: resource = entry.get("resource", {}) - resource_type = resource.get("resourceType", "") + rtype = resource.get("resourceType", "") - if resource_type == "Patient" and resource.get("id") == patient_id: + if rtype == "Patient" and resource.get("id") == patient_id: prefetch["patient"] = resource - - elif resource_type == "Observation": - subject = resource.get("subject", {}) - if subject.get("reference", "").endswith(patient_id): + elif rtype == "Observation": + ref = resource.get("subject", {}).get("reference", "") + if ref.endswith(patient_id): code = 
get_observation_code(resource) if code in obs_codes: feature_obs[obs_codes[code]].append(entry) - for feature_name, entries in feature_obs.items(): + for name, entries in feature_obs.items(): if entries: - prefetch[feature_name] = { + if minimal: + entries = entries[-1:] # Keep only latest + prefetch[name] = { "resourceType": "Bundle", "type": "searchset", "entry": entries, @@ -131,29 +134,125 @@ def extract_patient_prefetch(bundle: dict, patient_ref: str, obs_codes: dict) -> return prefetch +def prefetch_to_bundle(prefetch: dict) -> dict: + """Convert prefetch to FHIR transaction Bundle (for server upload).""" + entries = [] + # Use urn:uuid references so Medplum properly links Observations to Patient. + patient_uuid = f"urn:uuid:{uuid.uuid4()}" + + # Patient + if "patient" in prefetch: + entries.append( + { + "fullUrl": patient_uuid, + "resource": prefetch["patient"].copy(), + "request": {"method": "POST", "url": "Patient"}, + } + ) + + # Observations (with updated subject reference) + for key, value in prefetch.items(): + if key == "patient" or not isinstance(value, dict): + continue + for entry in value.get("entry", []): + resource = entry.get("resource", {}) + if resource.get("resourceType") == "Observation": + obs = resource.copy() + obs["subject"] = {"reference": patient_uuid} + entries.append( + { + "fullUrl": f"urn:uuid:{uuid.uuid4()}", + "resource": obs, + "request": {"method": "POST", "url": "Observation"}, + } + ) + + return {"resourceType": "Bundle", "type": "transaction", "entry": entries} + + +def upload_bundle(gateway, bundle_data: dict) -> str: + """Upload bundle to Medplum, return server-assigned Patient ID.""" + from fhir.resources.bundle import Bundle as FHIRBundle + + response = gateway.transaction(FHIRBundle(**bundle_data), source="medplum") + + # Extract Patient ID from response + if response.entry: + for entry in response.entry: + if entry.response and entry.response.location: + loc = entry.response.location + if "Patient/" in loc: + return loc.split("Patient/")[1].split("/")[0] + return None + + +# ============================================================================= +# MAIN +# ============================================================================= + + def main(): + parser = argparse.ArgumentParser( + description="Extract demo patients from MIMIC-on-FHIR" + ) + parser.add_argument("--mimic", type=str, help="Path to MIMIC-on-FHIR dataset") + parser.add_argument( + "--model", type=str, default=DEFAULT_MODEL_PATH, help="Model pickle path" + ) + parser.add_argument( + "--schema", type=str, default=DEFAULT_SCHEMA_PATH, help="Feature schema YAML" + ) + parser.add_argument( + "--minimal", action="store_true", help="Keep only 1 obs per feature (~12KB)" + ) + parser.add_argument("--bundle", action="store_true", help="Output as FHIR Bundle") + parser.add_argument("--upload", action="store_true", help="Upload to Medplum") + parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT_DIR) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num-patients-per-risk", type=int, default=1) + args = parser.parse_args() + + mimic_dir = args.mimic or os.getenv("MIMIC_FHIR_PATH") + if not mimic_dir: + print("Error: Set MIMIC_FHIR_PATH or use --mimic") + return + + # --upload implies --bundle + if args.upload: + args.bundle = True + + # Set up FHIRGateway for upload + gateway = None + if args.upload: + from healthchain.gateway import FHIRGateway + from healthchain.gateway.clients.fhir.base import FHIRAuthConfig + + try: + config = 
FHIRAuthConfig.from_env("MEDPLUM") + gateway = FHIRGateway() + gateway.add_source("medplum", config.to_connection_string()) + print("✓ Medplum configured") + except Exception as e: + print(f"✗ Medplum failed: {e}") + return + print("=" * 60) - print("MIMIC Demo Patient Extraction") + print("MIMIC Demo Patient Extraction" + (" (MINIMAL)" if args.minimal else "")) print("=" * 60) - if MIMIC_DIR is None: - print("Error: MIMIC_FHIR_PATH environment variable is not set.") - return - - # Load configs - obs_codes = load_observation_codes(SCHEMA_PATH) - print(f"Features to extract: {list(obs_codes.values())}") + # Load schema and model + obs_codes = load_observation_codes(args.schema) + print(f"Features: {list(obs_codes.values())}") - model_data = joblib.load(MODEL_PATH) + model_data = joblib.load(args.model) model = model_data["model"] feature_names = model_data["metadata"]["feature_names"] - print(f"Model features: {feature_names}") # Load MIMIC data print("\nLoading MIMIC-on-FHIR...") loader = MimicOnFHIRLoader() bundle = loader.load( - data_dir=MIMIC_DIR, + data_dir=mimic_dir, resource_types=[ "MimicObservationChartevents", "MimicObservationLabevents", @@ -164,10 +263,9 @@ def main(): print(f"Loaded {len(bundle['entry']):,} resources") # Run predictions - print("\nExtracting features and predicting...") - dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH) - pipeline = create_pipeline(model, feature_names) - result = pipeline(dataset) + print("\nExtracting features...") + dataset = Dataset.from_fhir_bundle(bundle, schema=args.schema) + result = create_pipeline(model, feature_names)(dataset) df = result.data.copy() df["probability"] = result.metadata["probabilities"] @@ -178,39 +276,66 @@ def main(): print(f"\nRisk distribution ({len(df)} patients):") print(df["risk"].value_counts().to_string()) - # Select patients per risk level - OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - patients_to_extract = [] + # Extract patients + args.output.mkdir(parents=True, exist_ok=True) + print(f"\nExtracting to {args.output}/") for risk_level in ["high", "moderate", "low"]: - risk_patients = df[df["risk"] == risk_level] - for i in range(min(NUM_PATIENTS_PER_RISK, len(risk_patients))): - patient = risk_patients.iloc[i] + risk_df = df[df["risk"] == risk_level] + if len(risk_df) == 0: + continue + + risk_df = risk_df.sample( + n=min(args.num_patients_per_risk, len(risk_df)), random_state=args.seed + ) + + for i, (_, patient) in enumerate(risk_df.iterrows()): label = ( f"{risk_level}_risk" - if NUM_PATIENTS_PER_RISK == 1 + if args.num_patients_per_risk == 1 else f"{risk_level}_risk_{i+1}" ) - patients_to_extract.append((label, patient)) + prefetch = extract_patient_prefetch( + bundle, patient["patient_ref"], obs_codes, args.minimal + ) - # Extract and save - print(f"\nExtracting to {OUTPUT_DIR}/") - for label, patient in patients_to_extract: - prefetch = extract_patient_prefetch(bundle, patient["patient_ref"], obs_codes) + # Output format + if args.bundle: + output_data = prefetch_to_bundle(prefetch) + suffix = "_bundle.json" + else: + output_data = prefetch + suffix = "_patient.json" - output_file = OUTPUT_DIR / f"{label}_patient.json" - with open(output_file, "w") as f: - json.dump(prefetch, f, indent=2, default=str) + # Save file + with open(args.output / f"{label}{suffix}", "w") as f: + json.dump(output_data, f, indent=2, default=str) - obs_count = sum( - len(v.get("entry", [])) for k, v in prefetch.items() if k != "patient" - ) - features_with_data = [k for k in prefetch if k != "patient"] - 
print( - f" {label}: {patient['probability']:.1%} risk, {obs_count} obs ({', '.join(features_with_data)})" - ) + obs_count = sum( + len(v.get("entry", [])) for k, v in prefetch.items() if k != "patient" + ) + patient_id = patient["patient_ref"].split("/")[-1] + + # Upload if requested + status = "" + if args.upload and gateway: + server_id = upload_bundle(gateway, output_data) + status = ( + f" ✓ uploaded (ID: {server_id})" if server_id else " ✓ uploaded" + ) + + print( + f" {label}: {patient_id} ({patient['probability']:.1%}, {obs_count} obs){status}" + ) - print("\nDone! Use these files with SandboxClient.load_from_path()") + # Print next steps + print("\n" + "=" * 60) + if args.upload: + print("✓ Uploaded to Medplum! Update patient IDs in sepsis_fhir_batch.py") + elif args.bundle: + print("Re-run with --upload to upload to Medplum") + else: + print("CDS: client.load_from_path('cookbook/data/mimic_demo_patients/')") if __name__ == "__main__": From 0d041b6dc268bdb79d9470d30a6eaf6a8243a705 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 10:39:48 +0000 Subject: [PATCH 03/12] Add mimic demo patients --- .../mimic_demo_patients/high_risk_bundle.json | 413 ++++++++++++++++++ .../high_risk_patient.json | 390 +++++++++++++++++ .../mimic_demo_patients/low_risk_bundle.json | 413 ++++++++++++++++++ .../mimic_demo_patients/low_risk_patient.json | 407 +++++++++++++++++ .../moderate_risk_bundle.json | 413 ++++++++++++++++++ .../moderate_risk_patient.json | 408 +++++++++++++++++ 6 files changed, 2444 insertions(+) create mode 100644 cookbook/data/mimic_demo_patients/high_risk_bundle.json create mode 100644 cookbook/data/mimic_demo_patients/high_risk_patient.json create mode 100644 cookbook/data/mimic_demo_patients/low_risk_bundle.json create mode 100644 cookbook/data/mimic_demo_patients/low_risk_patient.json create mode 100644 cookbook/data/mimic_demo_patients/moderate_risk_bundle.json create mode 100644 cookbook/data/mimic_demo_patients/moderate_risk_patient.json diff --git a/cookbook/data/mimic_demo_patients/high_risk_bundle.json b/cookbook/data/mimic_demo_patients/high_risk_bundle.json new file mode 100644 index 00000000..ae2ca225 --- /dev/null +++ b/cookbook/data/mimic_demo_patients/high_risk_bundle.json @@ -0,0 +1,413 @@ +{ + "resourceType": "Bundle", + "type": "transaction", + "entry": [ + { + "fullUrl": "urn:uuid:f1f8064e-a37d-4c2b-8002-3efd94d43a26", + "resource": { + "id": "1cf9e585-806c-513b-80af-4ca565a28231", + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-patient" + ] + }, + "name": [ + { + "use": "official", + "family": "Patient_10015860" + } + ], + "gender": "male", + "birthDate": "2133-09-15", + "extension": [ + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2106-3", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "White" + } + }, + { + "url": "text", + "valueString": "White" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2186-5", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "Not Hispanic or Latino" + } + }, + { + "url": "text", + "valueString": "Not Hispanic or Latino" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex", + "valueCode": "M" + } + ], + "identifier": [ + { + "value": "10015860", + "system": 
"http://mimic.mit.edu/fhir/mimic/identifier/patient" + } + ], + "resourceType": "Patient", + "communication": [ + { + "language": { + "coding": [ + { + "code": "en", + "system": "urn:ietf:bcp:47" + } + ] + } + } + ], + "maritalStatus": { + "coding": [ + { + "code": "S", + "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus" + } + ] + }, + "managingOrganization": { + "reference": "Organization/ee172322-118b-5716-abbc-18e4c5437e15" + } + }, + "request": { + "method": "POST", + "url": "Patient" + } + }, + { + "fullUrl": "urn:uuid:7242a670-6b9d-4a67-a7a9-374658ac6b03", + "resource": { + "id": "1e00686c-4ed2-5acd-bc59-e8c305b95af7", + "code": { + "coding": [ + { + "code": "220045", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Heart Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2192-05-12T17:15:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f1f8064e-a37d-4c2b-8002-3efd94d43a26" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/36ad3455-d2af-514b-ac42-265954a07a0e" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "bpm", + "unit": "bpm", + "value": 108, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2192-05-12T16:49:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:c29b9720-0c2d-4c02-aeeb-7b515ce318b4", + "resource": { + "id": "8f10f571-8183-5b55-b659-107b69ab6fba", + "code": { + "coding": [ + { + "code": "223761", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Temperature Fahrenheit" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2192-05-12T09:46:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f1f8064e-a37d-4c2b-8002-3efd94d43a26" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/36ad3455-d2af-514b-ac42-265954a07a0e" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "\u00b0F", + "unit": "\u00b0F", + "value": 98.4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2192-05-12T09:46:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:4c37b18e-e4a6-4070-83af-c29224ba4fe1", + "resource": { + "id": "3a294f91-6d99-5a23-bc6b-c44d5f69c5db", + "code": { + "coding": [ + { + "code": "220210", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Respiratory Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2192-05-12T17:15:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f1f8064e-a37d-4c2b-8002-3efd94d43a26" + }, + "category": [ + { + "coding": [ + { + "code": "Respiratory", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": 
{ + "reference": "Encounter/36ad3455-d2af-514b-ac42-265954a07a0e" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "insp/min", + "unit": "insp/min", + "value": 17, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2192-05-12T16:49:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:4e3d2d87-f47a-4eb3-bb09-a707f13b5e1e", + "resource": { + "id": "ff849940-4858-59d3-9da8-da8d43aaa808", + "code": { + "coding": [ + { + "code": "51301", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "White Blood Cells" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2188-08-06T08:15:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f1f8064e-a37d-4c2b-8002-3efd94d43a26" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/5f29e513-36c0-5435-a7ab-e6119d10fcc1" + }, + "encounter": { + "reference": "Encounter/dcd2507e-f200-5bfa-a719-c49d94f17fce" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "ROUTINE" + } + ], + "identifier": [ + { + "value": "196686", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "K/uL", + "unit": "K/uL", + "value": 17.8, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "K/uL", + "unit": "K/uL", + "value": 4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "K/uL", + "unit": "K/uL", + "value": 11, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2188-08-06T06:57:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:9e170dfc-714e-4afc-889f-9d15d6f5f6b5", + "resource": { + "id": "5bdf7562-d8cd-5611-9177-4cbafa9b8b19", + "code": { + "coding": [ + { + "code": "50912", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "Creatinine" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2188-08-06T08:54:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f1f8064e-a37d-4c2b-8002-3efd94d43a26" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/2a72c2e8-48b9-5d28-92c6-a0cf35c1ca7c" + }, + "encounter": { + "reference": "Encounter/dcd2507e-f200-5bfa-a719-c49d94f17fce" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "ROUTINE" + } + ], + "identifier": [ + { + "value": "196668", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 2, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + 
"low": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 0.5, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 1.2, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2188-08-06T06:57:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + } + ] +} diff --git a/cookbook/data/mimic_demo_patients/high_risk_patient.json b/cookbook/data/mimic_demo_patients/high_risk_patient.json new file mode 100644 index 00000000..2d20486d --- /dev/null +++ b/cookbook/data/mimic_demo_patients/high_risk_patient.json @@ -0,0 +1,390 @@ +{ + "patient": { + "id": "f5efdf3f-5b53-5c9f-95a6-047275107c46", + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-patient" + ] + }, + "name": [ + { + "use": "official", + "family": "Patient_10002495" + } + ], + "gender": "male", + "birthDate": "2060-05-22", + "extension": [ + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "UNK", + "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor", + "display": "unknown" + } + }, + { + "url": "text", + "valueString": "unknown" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex", + "valueCode": "M" + } + ], + "identifier": [ + { + "value": "10002495", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/patient" + } + ], + "resourceType": "Patient", + "communication": [ + { + "language": { + "coding": [ + { + "code": "en", + "system": "urn:ietf:bcp:47" + } + ] + } + } + ], + "maritalStatus": { + "coding": [ + { + "code": "M", + "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus" + } + ] + }, + "managingOrganization": { + "reference": "Organization/ee172322-118b-5716-abbc-18e4c5437e15" + } + }, + "heart_rate": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "041a0657-63c0-5828-8301-6dd389649892", + "code": { + "coding": [ + { + "code": "220045", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Heart Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2141-05-23T21:50:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/f5efdf3f-5b53-5c9f-95a6-047275107c46" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/25e05468-7cbf-5a04-9209-79cb07703326" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "bpm", + "unit": "bpm", + "value": 113, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2141-05-23T17:55:00-04:00" + } + } + ] + }, + "temperature": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "69958710-586e-5a29-994c-0f93f9da43dd", + "code": { + "coding": [ + { + "code": "223761", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Temperature Fahrenheit" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2141-05-22T20:32:00-04:00", + "status": "final", + 
"subject": { + "reference": "Patient/f5efdf3f-5b53-5c9f-95a6-047275107c46" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/25e05468-7cbf-5a04-9209-79cb07703326" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "\u00b0F", + "unit": "\u00b0F", + "value": 98.7, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2141-05-22T20:32:00-04:00" + } + } + ] + }, + "respiratory_rate": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "9f0d5f36-58c1-5e3c-a645-9ac6eed4eeca", + "code": { + "coding": [ + { + "code": "220210", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Respiratory Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2141-05-23T21:50:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/f5efdf3f-5b53-5c9f-95a6-047275107c46" + }, + "category": [ + { + "coding": [ + { + "code": "Respiratory", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/25e05468-7cbf-5a04-9209-79cb07703326" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "insp/min", + "unit": "insp/min", + "value": 25, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2141-05-23T17:55:00-04:00" + } + } + ] + }, + "wbc": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "943aa613-4410-5792-8286-eb0a7637de4b", + "code": { + "coding": [ + { + "code": "51301", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "White Blood Cells" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2141-05-23T15:28:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/f5efdf3f-5b53-5c9f-95a6-047275107c46" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/122994a4-a976-501c-813d-994c1ab5742d" + }, + "encounter": { + "reference": "Encounter/3e802913-a3f3-573f-90b3-a85dffdec47b" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "ROUTINE" + } + ], + "identifier": [ + { + "value": "32427", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "K/uL", + "unit": "K/uL", + "value": 28.5, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "K/uL", + "unit": "K/uL", + "value": 4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "K/uL", + "unit": "K/uL", + "value": 10, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2141-05-23T14:53:00-04:00" + } + } + ] + }, + "creatinine": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": 
"9c9eab28-b23d-56ce-a2e0-e3fa0201cc9a", + "code": { + "coding": [ + { + "code": "50912", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "Creatinine" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2141-05-23T12:34:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/f5efdf3f-5b53-5c9f-95a6-047275107c46" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/c191d9dd-50e4-5d7c-bb05-cbe8b2e72772" + }, + "encounter": { + "reference": "Encounter/3e802913-a3f3-573f-90b3-a85dffdec47b" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "STAT" + } + ], + "identifier": [ + { + "value": "32407", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 1.6, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 0.5, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 1.2, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2141-05-23T10:52:00-04:00" + } + } + ] + } +} diff --git a/cookbook/data/mimic_demo_patients/low_risk_bundle.json b/cookbook/data/mimic_demo_patients/low_risk_bundle.json new file mode 100644 index 00000000..aa30f6ad --- /dev/null +++ b/cookbook/data/mimic_demo_patients/low_risk_bundle.json @@ -0,0 +1,413 @@ +{ + "resourceType": "Bundle", + "type": "transaction", + "entry": [ + { + "fullUrl": "urn:uuid:f6f0dd59-e75c-4562-9150-ea4a0b5321b3", + "resource": { + "id": "afa7c67f-82b9-5f51-bd04-8b7d7c4456c0", + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-patient" + ] + }, + "name": [ + { + "use": "official", + "family": "Patient_10016150" + } + ], + "gender": "male", + "birthDate": "2073-05-10", + "extension": [ + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2106-3", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "White" + } + }, + { + "url": "text", + "valueString": "White" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2186-5", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "Not Hispanic or Latino" + } + }, + { + "url": "text", + "valueString": "Not Hispanic or Latino" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex", + "valueCode": "M" + } + ], + "identifier": [ + { + "value": "10016150", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/patient" + } + ], + "resourceType": "Patient", + "communication": [ + { + "language": { + "coding": [ + { + "code": "en", + "system": "urn:ietf:bcp:47" + } + ] + } + } + ], + "maritalStatus": { + "coding": [ + { + "code": "S", + "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus" + } + ] + }, + "managingOrganization": { + "reference": 
"Organization/ee172322-118b-5716-abbc-18e4c5437e15" + } + }, + "request": { + "method": "POST", + "url": "Patient" + } + }, + { + "fullUrl": "urn:uuid:bc20d95c-18b8-4fa8-9095-f05abec6aa78", + "resource": { + "id": "867fe01b-3930-5adf-a45d-f666fecbe864", + "code": { + "coding": [ + { + "code": "220045", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Heart Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2142-05-10T16:59:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f6f0dd59-e75c-4562-9150-ea4a0b5321b3" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/13987fde-e7cc-5dfb-b5e8-cdf2b709a1d4" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "bpm", + "unit": "bpm", + "value": 71, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2142-05-10T16:59:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:e4ce72da-4ab9-4619-adb6-07bc027c728a", + "resource": { + "id": "93664731-abf1-57e1-a3fb-693fa9b07479", + "code": { + "coding": [ + { + "code": "223761", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Temperature Fahrenheit" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2142-05-10T16:59:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f6f0dd59-e75c-4562-9150-ea4a0b5321b3" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/13987fde-e7cc-5dfb-b5e8-cdf2b709a1d4" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "\u00b0F", + "unit": "\u00b0F", + "value": 98.4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2142-05-10T16:59:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:8e23c576-644b-4908-9271-1935d06aff54", + "resource": { + "id": "e9532f81-3f62-5af3-a095-d027492f7e01", + "code": { + "coding": [ + { + "code": "220210", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Respiratory Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2142-05-10T16:59:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f6f0dd59-e75c-4562-9150-ea4a0b5321b3" + }, + "category": [ + { + "coding": [ + { + "code": "Respiratory", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/13987fde-e7cc-5dfb-b5e8-cdf2b709a1d4" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "insp/min", + "unit": "insp/min", + "value": 33, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2142-05-10T16:59:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": 
"urn:uuid:46fe4954-70f4-4652-9d40-15c9598ce545", + "resource": { + "id": "759f2c85-3345-5d7a-8bbb-252d4d7ac5b0", + "code": { + "coding": [ + { + "code": "51301", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "White Blood Cells" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2142-05-10T16:09:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f6f0dd59-e75c-4562-9150-ea4a0b5321b3" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/b0afc9eb-baad-5d87-a7d8-d6eaf699cf96" + }, + "encounter": { + "reference": "Encounter/ef3c0803-f981-59f7-a022-0d1223377142" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "STAT" + } + ], + "identifier": [ + { + "value": "202620", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "K/uL", + "unit": "K/uL", + "value": 5.4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "K/uL", + "unit": "K/uL", + "value": 4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "K/uL", + "unit": "K/uL", + "value": 11, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2142-05-10T15:37:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:7d4a5e58-ad87-43d3-bc7c-b89df25c2308", + "resource": { + "id": "7461ac2f-33f1-508b-9eef-bf5c23dd9b8d", + "code": { + "coding": [ + { + "code": "50912", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "Creatinine" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2142-05-14T10:23:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:f6f0dd59-e75c-4562-9150-ea4a0b5321b3" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/d936d365-0a43-52e0-b440-26db28d3bbf0" + }, + "encounter": { + "reference": "Encounter/ef3c0803-f981-59f7-a022-0d1223377142" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "ROUTINE" + } + ], + "identifier": [ + { + "value": "202713", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 0.9, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 0.5, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 1.2, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2142-05-14T08:30:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + 
} + ] +} diff --git a/cookbook/data/mimic_demo_patients/low_risk_patient.json b/cookbook/data/mimic_demo_patients/low_risk_patient.json new file mode 100644 index 00000000..5cb18e13 --- /dev/null +++ b/cookbook/data/mimic_demo_patients/low_risk_patient.json @@ -0,0 +1,407 @@ +{ + "patient": { + "id": "5f3dcdb5-bd27-58f5-b990-859b6bcc2d73", + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-patient" + ] + }, + "name": [ + { + "use": "official", + "family": "Patient_10038999" + } + ], + "gender": "male", + "birthDate": "2086-05-22", + "extension": [ + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2106-3", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "White" + } + }, + { + "url": "text", + "valueString": "White" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2186-5", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "Not Hispanic or Latino" + } + }, + { + "url": "text", + "valueString": "Not Hispanic or Latino" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex", + "valueCode": "M" + } + ], + "identifier": [ + { + "value": "10038999", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/patient" + } + ], + "resourceType": "Patient", + "communication": [ + { + "language": { + "coding": [ + { + "code": "en", + "system": "urn:ietf:bcp:47" + } + ] + } + } + ], + "maritalStatus": { + "coding": [ + { + "code": "S", + "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus" + } + ] + }, + "managingOrganization": { + "reference": "Organization/ee172322-118b-5716-abbc-18e4c5437e15" + } + }, + "heart_rate": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "ccbb50b3-c1e2-5a78-8f40-fda91f209773", + "code": { + "coding": [ + { + "code": "220045", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Heart Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2131-05-22T22:38:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/5f3dcdb5-bd27-58f5-b990-859b6bcc2d73" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/24339e36-0b8e-5f30-91bc-d4b7d9919c3c" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "bpm", + "unit": "bpm", + "value": 110, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2131-05-22T22:38:00-04:00" + } + } + ] + }, + "temperature": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "3c53284d-5069-54bd-8496-3a739180babe", + "code": { + "coding": [ + { + "code": "223761", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Temperature Fahrenheit" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2131-05-22T22:38:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/5f3dcdb5-bd27-58f5-b990-859b6bcc2d73" + }, + 
"category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/24339e36-0b8e-5f30-91bc-d4b7d9919c3c" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "\u00b0F", + "unit": "\u00b0F", + "value": 98.8, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2131-05-22T22:38:00-04:00" + } + } + ] + }, + "respiratory_rate": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "9331ed77-b563-5abe-bf84-4ac7053b9fe9", + "code": { + "coding": [ + { + "code": "220210", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Respiratory Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2131-05-22T22:38:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/5f3dcdb5-bd27-58f5-b990-859b6bcc2d73" + }, + "category": [ + { + "coding": [ + { + "code": "Respiratory", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/24339e36-0b8e-5f30-91bc-d4b7d9919c3c" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "insp/min", + "unit": "insp/min", + "value": 20, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2131-05-22T22:38:00-04:00" + } + } + ] + }, + "wbc": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "0b9aaaa5-43c1-51a5-b1d2-a6156968513e", + "code": { + "coding": [ + { + "code": "51301", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "White Blood Cells" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2131-05-28T03:24:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/5f3dcdb5-bd27-58f5-b990-859b6bcc2d73" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/6025339e-8821-54a8-99dc-ae56d8c705d1" + }, + "encounter": { + "reference": "Encounter/7f95fc8e-1f36-54a6-96f9-798fd9c7e93b" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "STAT" + } + ], + "identifier": [ + { + "value": "455202", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "K/uL", + "unit": "K/uL", + "value": 8.6, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "K/uL", + "unit": "K/uL", + "value": 4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "K/uL", + "unit": "K/uL", + "value": 10, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2131-05-28T02:56:00-04:00" + } + } + ] + }, + "creatinine": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "e9ea65d0-b198-58c1-bcbe-e436150d6e4d", + "code": { + "coding": [ + { + "code": 
"50912", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "Creatinine" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2131-05-28T03:37:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/5f3dcdb5-bd27-58f5-b990-859b6bcc2d73" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/00d5feac-4fe1-5fdb-ac62-01bff201f55c" + }, + "encounter": { + "reference": "Encounter/7f95fc8e-1f36-54a6-96f9-798fd9c7e93b" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "STAT" + } + ], + "identifier": [ + { + "value": "455210", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 0.8, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 0.5, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 1.2, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2131-05-28T02:56:00-04:00" + } + } + ] + } +} diff --git a/cookbook/data/mimic_demo_patients/moderate_risk_bundle.json b/cookbook/data/mimic_demo_patients/moderate_risk_bundle.json new file mode 100644 index 00000000..869c0676 --- /dev/null +++ b/cookbook/data/mimic_demo_patients/moderate_risk_bundle.json @@ -0,0 +1,413 @@ +{ + "resourceType": "Bundle", + "type": "transaction", + "entry": [ + { + "fullUrl": "urn:uuid:a6f47e5c-2ee6-4a71-ae52-6cdb2edd5122", + "resource": { + "id": "72d56b49-a7ee-5b9a-a679-25d1c836d3c3", + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-patient" + ] + }, + "name": [ + { + "use": "official", + "family": "Patient_10018845" + } + ], + "gender": "male", + "birthDate": "2093-10-07", + "extension": [ + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2106-3", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "White" + } + }, + { + "url": "text", + "valueString": "White" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2186-5", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "Not Hispanic or Latino" + } + }, + { + "url": "text", + "valueString": "Not Hispanic or Latino" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex", + "valueCode": "M" + } + ], + "identifier": [ + { + "value": "10018845", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/patient" + } + ], + "resourceType": "Patient", + "communication": [ + { + "language": { + "coding": [ + { + "code": "en", + "system": "urn:ietf:bcp:47" + } + ] + } + } + ], + "maritalStatus": { + "coding": [ + { + "code": "M", + "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus" + } + ] + }, + "deceasedDateTime": "2184-11-22", + "managingOrganization": { + "reference": 
"Organization/ee172322-118b-5716-abbc-18e4c5437e15" + } + }, + "request": { + "method": "POST", + "url": "Patient" + } + }, + { + "fullUrl": "urn:uuid:7b9f0473-b12c-49ea-9cec-0af73dccb83c", + "resource": { + "id": "ff7c1328-fe32-5574-b842-144ba3ac8fb0", + "code": { + "coding": [ + { + "code": "220045", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Heart Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2184-10-08T04:31:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:a6f47e5c-2ee6-4a71-ae52-6cdb2edd5122" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/625b0a9e-a378-5e68-b8d6-10c655f7579d" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "bpm", + "unit": "bpm", + "value": 58, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2184-10-08T04:31:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:9eebab6a-909d-424a-baa9-0a155a54f13a", + "resource": { + "id": "4664e5eb-efaa-5062-a594-f20c0b10d901", + "code": { + "coding": [ + { + "code": "223761", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Temperature Fahrenheit" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2184-10-08T04:31:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:a6f47e5c-2ee6-4a71-ae52-6cdb2edd5122" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/625b0a9e-a378-5e68-b8d6-10c655f7579d" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "\u00b0F", + "unit": "\u00b0F", + "value": 98, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2184-10-08T04:31:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:f1358348-72a0-411b-9413-d9261ae6b92a", + "resource": { + "id": "1ac7341b-9efc-5101-a4bf-b5fa5de755dd", + "code": { + "coding": [ + { + "code": "220210", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Respiratory Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2184-10-08T04:31:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:a6f47e5c-2ee6-4a71-ae52-6cdb2edd5122" + }, + "category": [ + { + "coding": [ + { + "code": "Respiratory", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/625b0a9e-a378-5e68-b8d6-10c655f7579d" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "insp/min", + "unit": "insp/min", + "value": 13, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2184-10-08T04:31:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": 
"urn:uuid:b868d1ac-4537-43a1-aafa-3438e8c6a28d", + "resource": { + "id": "41599527-6e37-53ce-b710-8d1d071d28eb", + "code": { + "coding": [ + { + "code": "51301", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "White Blood Cells" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2184-10-08T02:08:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:a6f47e5c-2ee6-4a71-ae52-6cdb2edd5122" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/f3e2cbca-c799-5acd-85f1-4c3fd56f7dd9" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "STAT" + } + ], + "identifier": [ + { + "value": "222110", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "K/uL", + "unit": "K/uL", + "value": 5.9, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "K/uL", + "unit": "K/uL", + "value": 4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "K/uL", + "unit": "K/uL", + "value": 11, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2184-10-08T00:50:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + }, + { + "fullUrl": "urn:uuid:8375eaf0-8196-485b-84a2-7a200610d2e4", + "resource": { + "id": "164745e6-16e5-5ded-95c2-3094a9cc0ac6", + "code": { + "coding": [ + { + "code": "50912", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "Creatinine" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "note": [ + { + "text": "VERIFIED - CONSISTENT WITH OTHER DATA." 
+ } + ], + "issued": "2184-10-08T02:09:00-04:00", + "status": "final", + "subject": { + "reference": "urn:uuid:a6f47e5c-2ee6-4a71-ae52-6cdb2edd5122" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/7991a26f-45ad-5d40-b4b9-2a17467b13c9" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "STAT" + } + ], + "identifier": [ + { + "value": "222087", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 5, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 0.5, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 1.2, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2184-10-08T00:50:00-04:00" + }, + "request": { + "method": "POST", + "url": "Observation" + } + } + ] +} diff --git a/cookbook/data/mimic_demo_patients/moderate_risk_patient.json b/cookbook/data/mimic_demo_patients/moderate_risk_patient.json new file mode 100644 index 00000000..c8aaf1b3 --- /dev/null +++ b/cookbook/data/mimic_demo_patients/moderate_risk_patient.json @@ -0,0 +1,408 @@ +{ + "patient": { + "id": "22a3e422-663a-561c-b305-a0c04bf42235", + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-patient" + ] + }, + "name": [ + { + "use": "official", + "family": "Patient_10021666" + } + ], + "gender": "male", + "birthDate": "2085-03-12", + "extension": [ + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2106-3", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "White" + } + }, + { + "url": "text", + "valueString": "White" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", + "extension": [ + { + "url": "ombCategory", + "valueCoding": { + "code": "2186-5", + "system": "urn:oid:2.16.840.1.113883.6.238", + "display": "Not Hispanic or Latino" + } + }, + { + "url": "text", + "valueString": "Not Hispanic or Latino" + } + ] + }, + { + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex", + "valueCode": "M" + } + ], + "identifier": [ + { + "value": "10021666", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/patient" + } + ], + "resourceType": "Patient", + "communication": [ + { + "language": { + "coding": [ + { + "code": "en", + "system": "urn:ietf:bcp:47" + } + ] + } + } + ], + "maritalStatus": { + "coding": [ + { + "code": "M", + "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus" + } + ] + }, + "deceasedDateTime": "2172-04-19", + "managingOrganization": { + "reference": "Organization/ee172322-118b-5716-abbc-18e4c5437e15" + } + }, + "heart_rate": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "01093aef-0cf5-5af0-b5c1-92ca3d7deaf2", + "code": { + "coding": [ + { + "code": "220045", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Heart Rate" + } + ] + }, + "meta": { + "profile": [ + 
"http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2172-03-13T02:02:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/22a3e422-663a-561c-b305-a0c04bf42235" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/ffce7398-83de-5c56-833d-dfcb02d1abac" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "bpm", + "unit": "bpm", + "value": 70, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2172-03-13T01:56:00-04:00" + } + } + ] + }, + "temperature": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "476c79e7-cdba-5f17-8bee-f0f5bcbaa845", + "code": { + "coding": [ + { + "code": "223761", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Temperature Fahrenheit" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2172-03-13T02:02:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/22a3e422-663a-561c-b305-a0c04bf42235" + }, + "category": [ + { + "coding": [ + { + "code": "Routine Vital Signs", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/ffce7398-83de-5c56-833d-dfcb02d1abac" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "\u00b0F", + "unit": "\u00b0F", + "value": 99.4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2172-03-13T02:01:00-04:00" + } + } + ] + }, + "respiratory_rate": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "e22290e7-e08c-5e0d-9929-eba8ad24c97a", + "code": { + "coding": [ + { + "code": "220210", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-chartevents-d-items", + "display": "Respiratory Rate" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-chartevents" + ] + }, + "issued": "2172-03-13T02:02:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/22a3e422-663a-561c-b305-a0c04bf42235" + }, + "category": [ + { + "coding": [ + { + "code": "Respiratory", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-observation-category" + } + ] + } + ], + "encounter": { + "reference": "Encounter/ffce7398-83de-5c56-833d-dfcb02d1abac" + }, + "resourceType": "Observation", + "valueQuantity": { + "code": "insp/min", + "unit": "insp/min", + "value": 14, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "effectiveDateTime": "2172-03-13T01:56:00-04:00" + } + } + ] + }, + "wbc": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "b78e2882-469d-566f-bcfe-f47388cb72f0", + "code": { + "coding": [ + { + "code": "51301", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "White Blood Cells" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2172-03-15T13:18:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/22a3e422-663a-561c-b305-a0c04bf42235" + }, + "category": [ 
+ { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/3d1deb9c-3aa2-5bd7-a3c8-3f1766530dc2" + }, + "encounter": { + "reference": "Encounter/f96dcfb3-1c84-5040-b9b9-c227d21a21a1" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "ROUTINE" + } + ], + "identifier": [ + { + "value": "257293", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "K/uL", + "unit": "K/uL", + "value": 10.5, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "K/uL", + "unit": "K/uL", + "value": 4, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "K/uL", + "unit": "K/uL", + "value": 11, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2172-03-15T11:56:00-04:00" + } + } + ] + }, + "creatinine": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "id": "ab2d4a21-fd85-5263-b909-4d92d0c50dac", + "code": { + "coding": [ + { + "code": "50912", + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-d-labitems", + "display": "Creatinine" + } + ] + }, + "meta": { + "profile": [ + "http://mimic.mit.edu/fhir/mimic/StructureDefinition/mimic-observation-labevents" + ] + }, + "issued": "2172-03-15T14:02:00-04:00", + "status": "final", + "subject": { + "reference": "Patient/22a3e422-663a-561c-b305-a0c04bf42235" + }, + "category": [ + { + "coding": [ + { + "code": "laboratory", + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "display": "Laboratory" + } + ] + } + ], + "specimen": { + "reference": "Specimen/99911b05-1540-5236-9689-e9594cc8aeed" + }, + "encounter": { + "reference": "Encounter/f96dcfb3-1c84-5040-b9b9-c227d21a21a1" + }, + "extension": [ + { + "url": "http://mimic.mit.edu/fhir/mimic/StructureDefinition/lab-priority", + "valueString": "ROUTINE" + } + ], + "identifier": [ + { + "value": "257298", + "system": "http://mimic.mit.edu/fhir/mimic/identifier/observation-labevents" + } + ], + "resourceType": "Observation", + "valueQuantity": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 1.2, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "referenceRange": [ + { + "low": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 0.5, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + }, + "high": { + "code": "mg/dL", + "unit": "mg/dL", + "value": 1.2, + "system": "http://mimic.mit.edu/fhir/mimic/CodeSystem/mimic-units" + } + } + ], + "effectiveDateTime": "2172-03-15T11:56:00-04:00" + } + } + ] + } +} From a28d85627c4882ad88c7b18097031d15d68fba67 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 11:49:31 +0000 Subject: [PATCH 04/12] to_risk_assessment reads preds from metadata --- docs/reference/io/containers/dataset.md | 8 +++----- healthchain/io/containers/dataset.py | 23 +++++++++++++++-------- tests/io/test_dataset.py | 21 +++++++++------------ 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/docs/reference/io/containers/dataset.md b/docs/reference/io/containers/dataset.md index 731e5310..af2bd421 100644 --- a/docs/reference/io/containers/dataset.md +++ b/docs/reference/io/containers/dataset.md 
@@ -27,14 +27,12 @@ print("Columns:", dataset.columns) validation_result = dataset.validate(schema="path/to/schema.yaml") print("Validation Result:", validation_result) -# 4. Run inference using your ML model -predictions = model.predict(dataset.data) -probabilities = model.predict_proba(dataset.data)[:, 1] +# 4. Run inference using your ML model and store in metadata +dataset.metadata["predictions"] = model.predict(dataset.data) +dataset.metadata["probabilities"] = model.predict_proba(dataset.data)[:, 1] # 5. Convert predictions to FHIR RiskAssessment resources for downstream use risk_assessments = dataset.to_risk_assessment( - predictions=predictions, - probabilities=probabilities, outcome_code="A41.9", outcome_display="Sepsis, unspecified", model_name="SepsisRiskModel", diff --git a/healthchain/io/containers/dataset.py b/healthchain/io/containers/dataset.py index 99f7966f..20a4e36c 100644 --- a/healthchain/io/containers/dataset.py +++ b/healthchain/io/containers/dataset.py @@ -200,8 +200,6 @@ def _dtypes_compatible(self, actual: str, expected: str) -> bool: def to_risk_assessment( self, - predictions: np.ndarray, - probabilities: np.ndarray, outcome_code: str, outcome_display: str, outcome_system: str = "http://hl7.org/fhir/sid/icd-10", @@ -209,6 +207,8 @@ def to_risk_assessment( model_version: Optional[str] = None, high_threshold: float = 0.7, moderate_threshold: float = 0.4, + predictions: Optional[np.ndarray] = None, + probabilities: Optional[np.ndarray] = None, ) -> List[RiskAssessment]: """Convert model predictions to FHIR RiskAssessment resources. @@ -216,8 +216,6 @@ def to_risk_assessment( including in FHIR Bundles or sending to FHIR servers. Args: - predictions: Binary predictions array (0/1) - probabilities: Probability scores array (0-1) outcome_code: Code for the predicted outcome (e.g., "A41.9" for sepsis) outcome_display: Display text for the outcome (e.g., "Sepsis") outcome_system: Code system for the outcome (default: ICD-10) @@ -225,22 +223,31 @@ def to_risk_assessment( model_version: Version of the ML model (optional) high_threshold: Threshold for high risk (default: 0.7) moderate_threshold: Threshold for moderate risk (default: 0.4) + predictions: Binary predictions array (0/1). Defaults to metadata["predictions"] + probabilities: Probability scores array (0-1). Defaults to metadata["probabilities"] Returns: List of RiskAssessment resources, one per patient Example: - >>> predictions = np.array([0, 1, 0]) - >>> probabilities = np.array([0.15, 0.85, 0.32]) >>> risk_assessments = dataset.to_risk_assessment( - ... predictions, - ... probabilities, ... outcome_code="A41.9", ... outcome_display="Sepsis, unspecified", ... model_name="RandomForest", ... model_version="1.0" ... 
) """ + # Fall back to metadata if not provided + if predictions is None: + predictions = self.metadata.get("predictions") + if probabilities is None: + probabilities = self.metadata.get("probabilities") + + if predictions is None or probabilities is None: + raise ValueError( + "predictions and probabilities must be provided or available in metadata" + ) + if len(predictions) != len(self.data): raise ValueError( f"Predictions length ({len(predictions)}) must match " diff --git a/tests/io/test_dataset.py b/tests/io/test_dataset.py index be2e25f1..272f26b2 100644 --- a/tests/io/test_dataset.py +++ b/tests/io/test_dataset.py @@ -128,9 +128,9 @@ def test_dataset_to_risk_assessment_creates_resources_with_metadata(sample_datas probabilities = np.array([0.15, 0.85]) # Test with model metadata + sample_dataset.metadata["predictions"] = predictions + sample_dataset.metadata["probabilities"] = probabilities risks = sample_dataset.to_risk_assessment( - predictions, - probabilities, outcome_code="A41.9", outcome_display="Sepsis", model_name="RandomForest", @@ -183,10 +183,10 @@ def test_dataset_to_risk_assessment_categorizes_risk_levels( } ) dataset = Dataset(data) + dataset.metadata["predictions"] = np.array(predictions) + dataset.metadata["probabilities"] = np.array(probabilities) risks = dataset.to_risk_assessment( - np.array(predictions), - np.array(probabilities), outcome_code="A41.9", outcome_display="Sepsis", ) @@ -224,11 +224,11 @@ def test_dataset_to_risk_assessment_validation_errors( """Dataset.to_risk_assessment validates required columns and array lengths.""" data = pd.DataFrame(data_dict) dataset = Dataset(data) + dataset.metadata["predictions"] = np.array(predictions) + dataset.metadata["probabilities"] = np.array(probabilities) with pytest.raises(ValueError, match=expected_error): dataset.to_risk_assessment( - np.array(predictions), - np.array(probabilities), outcome_code="A41.9", outcome_display="Sepsis", ) @@ -284,11 +284,8 @@ def test_dataset_to_risk_assessment_validates_probability_length(): """Dataset.to_risk_assessment validates probabilities array length.""" data = pd.DataFrame({"patient_ref": ["Patient/1", "Patient/2"], "value": [1, 2]}) dataset = Dataset(data) - - predictions = np.array([0, 1]) - probabilities = np.array([0.15]) # Wrong length + dataset.metadata["predictions"] = np.array([0, 1]) + dataset.metadata["probabilities"] = np.array([0.15]) # Wrong length with pytest.raises(ValueError, match="Probabilities length .* must match"): - dataset.to_risk_assessment( - predictions, probabilities, outcome_code="A41.9", outcome_display="Sepsis" - ) + dataset.to_risk_assessment(outcome_code="A41.9", outcome_display="Sepsis") From dba972cce4f56dde56dd40ad7aa36ac5bb0f5434 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 11:49:50 +0000 Subject: [PATCH 05/12] Add prefetch to bundle helper --- healthchain/fhir/__init__.py | 2 ++ healthchain/fhir/readers.py | 28 ++++++++++++++++++++++++ tests/fhir/test_converters.py | 40 +++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/healthchain/fhir/__init__.py b/healthchain/fhir/__init__.py index 9193ccd4..b33116d2 100644 --- a/healthchain/fhir/__init__.py +++ b/healthchain/fhir/__init__.py @@ -23,6 +23,7 @@ from healthchain.fhir.readers import ( create_resource_from_dict, convert_prefetch_to_fhir_objects, + prefetch_to_bundle, read_content_attachment, ) @@ -74,6 +75,7 @@ # Conversions and readers "create_resource_from_dict", "convert_prefetch_to_fhir_objects", + "prefetch_to_bundle", 
"read_content_attachment", # Bundle operations "create_bundle", diff --git a/healthchain/fhir/readers.py b/healthchain/fhir/readers.py index 7d7bbd06..d55fe7be 100644 --- a/healthchain/fhir/readers.py +++ b/healthchain/fhir/readers.py @@ -37,6 +37,34 @@ def create_resource_from_dict( return None +def prefetch_to_bundle(prefetch: Dict[str, Any]) -> Dict[str, Any]: + """Flatten CDS Hooks prefetch into a collection Bundle dict. + + Converts the keyed prefetch format (used in CDS Hooks) into a flat bundle + suitable for Dataset.from_fhir_bundle(). + + Args: + prefetch: CDS Hooks prefetch dict with format: + {"patient": {...}, "observations": {"entry": [...]}, ...} + + Returns: + Bundle dict with type "collection" and flattened entries + + Example: + >>> prefetch = request.prefetch + >>> bundle = prefetch_to_bundle(prefetch) + >>> dataset = Dataset.from_fhir_bundle(bundle, schema=schema) + """ + entries = [] + for key, value in prefetch.items(): + if isinstance(value, dict): + if "entry" in value: # Searchset bundle + entries.extend(value["entry"]) + elif "resourceType" in value: # Single resource + entries.append({"resource": value}) + return {"type": "collection", "entry": entries} + + def convert_prefetch_to_fhir_objects( prefetch_dict: Dict[str, Any], ) -> Dict[str, Resource]: diff --git a/tests/fhir/test_converters.py b/tests/fhir/test_converters.py index 40e06725..aa16c20a 100644 --- a/tests/fhir/test_converters.py +++ b/tests/fhir/test_converters.py @@ -23,6 +23,7 @@ create_value_quantity_observation, create_condition, create_medication_statement, + prefetch_to_bundle, ) @@ -523,3 +524,42 @@ def test_bundle_to_dataframe_skips_unsupported_resources_gracefully(): # Should not raise error, just skip unsupported types df = bundle_to_dataframe(bundle, config=config) assert len(df) == 1 + + +def test_prefetch_to_bundle_flattens_cds_prefetch(): + """prefetch_to_bundle converts CDS Hooks prefetch to collection bundle.""" + prefetch = { + "patient": {"resourceType": "Patient", "id": "123", "gender": "male"}, + "heart_rate": { + "resourceType": "Bundle", + "type": "searchset", + "entry": [ + { + "resource": { + "resourceType": "Observation", + "code": {"coding": [{"code": "8867-4"}]}, + "valueQuantity": {"value": 85.0}, + } + } + ], + }, + } + + bundle = prefetch_to_bundle(prefetch) + + assert bundle["type"] == "collection" + assert len(bundle["entry"]) == 2 + # Patient should be wrapped in resource + patient_entry = next( + e + for e in bundle["entry"] + if e.get("resource", {}).get("resourceType") == "Patient" + ) + assert patient_entry["resource"]["id"] == "123" + + +def test_prefetch_to_bundle_handles_empty_prefetch(): + """prefetch_to_bundle handles empty prefetch gracefully.""" + bundle = prefetch_to_bundle({}) + assert bundle["type"] == "collection" + assert bundle["entry"] == [] From 020dd050db22fe3326c2747c55afbb684f716a44 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 11:50:26 +0000 Subject: [PATCH 06/12] Cleanup cookbooks --- cookbook/sepsis_cds_hooks.py | 16 +++------ cookbook/sepsis_fhir_batch.py | 64 ++++++++++++++++------------------- 2 files changed, 35 insertions(+), 45 deletions(-) diff --git a/cookbook/sepsis_cds_hooks.py b/cookbook/sepsis_cds_hooks.py index d80249af..ff70be67 100644 --- a/cookbook/sepsis_cds_hooks.py +++ b/cookbook/sepsis_cds_hooks.py @@ -21,6 +21,7 @@ from dotenv import load_dotenv from healthchain.gateway import HealthChainAPI, CDSHooksService +from healthchain.fhir import prefetch_to_bundle from healthchain.io import Dataset from 
healthchain.models import CDSRequest, CDSResponse from healthchain.models.responses.cdsresponse import Card @@ -72,16 +73,8 @@ def sepsis_alert(request: CDSRequest) -> CDSResponse: if not prefetch: return CDSResponse(cards=[]) - # Merge keyed prefetch into single bundle - # Format: {"patient": {...}, "heart_rate": {"entry": [...]}, ...} - entries = [] - for key, value in prefetch.items(): - if key == "patient": - entries.append({"resource": value}) - elif isinstance(value, dict) and "entry" in value: - entries.extend(value["entry"]) - - bundle = {"type": "collection", "entry": entries} + # Flatten keyed prefetch into single bundle + bundle = prefetch_to_bundle(prefetch) # FHIR → Dataset → Prediction dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH) @@ -124,6 +117,7 @@ def sepsis_alert(request: CDSRequest) -> CDSResponse: app = HealthChainAPI(title="Sepsis CDS Hooks") app.register_service(cds, path="/cds") + return app @@ -149,7 +143,7 @@ def run_server(): url="http://localhost:8000/cds/cds-services/sepsis-risk", workflow="patient-view", ) - client.load_from_path(DEMO_PATIENTS_DIR) + client.load_from_path(DEMO_PATIENTS_DIR, pattern="*_patient.json") responses = client.send_requests() client.save_results(save_request=True, save_response=True, directory="./output/") diff --git a/cookbook/sepsis_fhir_batch.py b/cookbook/sepsis_fhir_batch.py index a9ae5880..3c41dcb7 100644 --- a/cookbook/sepsis_fhir_batch.py +++ b/cookbook/sepsis_fhir_batch.py @@ -19,17 +19,22 @@ from typing import List import joblib +import logging from dotenv import load_dotenv from fhir.resources.patient import Patient from fhir.resources.observation import Observation +from fhir.resources.riskassessment import RiskAssessment from healthchain.gateway import HealthChainAPI, FHIRGateway from healthchain.gateway.clients.fhir.base import FHIRAuthConfig +from healthchain.fhir import merge_bundles from healthchain.io import Dataset from healthchain.pipeline import Pipeline load_dotenv() +logger = logging.getLogger(__name__) + # Configuration SCRIPT_DIR = Path(__file__).parent MODEL_PATH = SCRIPT_DIR / "models" / "sepsis_model.pkl" @@ -51,13 +56,19 @@ config = FHIRAuthConfig.from_env("MEDPLUM") MEDPLUM_URL = config.to_connection_string() except Exception: - pass + logger.warning("Failed to load Medplum config") try: config = FHIRAuthConfig.from_env("EPIC") EPIC_URL = config.to_connection_string() except Exception: - pass + logger.warning("Failed to load Epic config") + + +def get_risk_summary(ra: RiskAssessment) -> tuple[str, float]: + """Extract risk level and probability from a RiskAssessment.""" + pred = ra.prediction[0] + return pred.qualitativeRisk.coding[0].code, pred.probabilityDecimal def create_pipeline() -> Pipeline[Dataset]: @@ -83,8 +94,8 @@ def run_inference(dataset: Dataset) -> Dataset: def screen_patient( gateway: FHIRGateway, pipeline: Pipeline, patient_id: str, source: str -): - """Screen a single patient for sepsis risk.""" +) -> RiskAssessment | None: + """Screen a single patient for sepsis risk. 
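+    Queries the patient's Observations and Patient resource from the given
+    FHIR source, merges them into a bundle, runs the prediction pipeline,
+    and writes a RiskAssessment back to the same server. A usage sketch
+    (the patient ID here is illustrative):
+
+        >>> ra = screen_patient(gateway, pipeline, "patient-123", "medplum")
+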
Returns RiskAssessment or None.""" # Query patient data from FHIR server obs_bundle = gateway.search( Observation, {"patient": patient_id, "_count": "100"}, source @@ -92,30 +103,21 @@ def screen_patient( patient_bundle = gateway.search(Patient, {"_id": patient_id}, source) # Merge into single bundle - entries = [] - if patient_bundle.entry: - entries.extend([e.model_dump() for e in patient_bundle.entry]) - if obs_bundle.entry: - entries.extend([e.model_dump() for e in obs_bundle.entry]) + bundle = merge_bundles([patient_bundle, obs_bundle]) - if not entries: - return None, "No data found" + if not bundle.entry: + return None # FHIR → Dataset → Prediction - bundle = {"type": "collection", "entry": entries} dataset = Dataset.from_fhir_bundle(bundle, schema=str(SCHEMA_PATH)) if len(dataset.data) == 0: - return None, "No matching features" + return None result = pipeline(dataset) - probability = float(result.metadata["probabilities"][0]) - risk = "high" if probability > 0.7 else "moderate" if probability > 0.4 else "low" # Create and save RiskAssessment risk_assessments = result.to_risk_assessment( - result.metadata["predictions"], - result.metadata["probabilities"], outcome_code="A41.9", outcome_display="Sepsis", model_name="sepsis_xgboost_v1", @@ -124,33 +126,28 @@ def screen_patient( for ra in risk_assessments: gateway.create(ra, source=source) - return risk_assessments[ - 0 - ] if risk_assessments else None, f"{risk.upper()} ({probability:.0%})" + return risk_assessments[0] if risk_assessments else None -def batch_screen(gateway: FHIRGateway, patient_ids: List[str], source: str = "medplum"): +def batch_screen( + gateway: FHIRGateway, patient_ids: List[str], source: str = "medplum" +) -> None: """Screen multiple patients for sepsis risk.""" pipeline = create_pipeline() - results = [] for patient_id in patient_ids: try: - ra, status = screen_patient(gateway, pipeline, patient_id, source) + ra = screen_patient(gateway, pipeline, patient_id, source) if ra: - results.append( - {"patient": patient_id, "status": status, "risk_assessment": ra.id} + risk, prob = get_risk_summary(ra) + print( + f" {patient_id}: {risk.upper()} ({prob:.0%}) → RiskAssessment/{ra.id}" ) - print(f" {patient_id}: {status} → RiskAssessment/{ra.id}") else: - results.append({"patient": patient_id, "status": status}) - print(f" {patient_id}: {status}") + print(f" {patient_id}: No data") except Exception as e: - results.append({"patient": patient_id, "error": str(e)}) print(f" {patient_id}: Error - {e}") - return results - def create_app(): """Create FHIR gateway app with configured sources.""" @@ -159,10 +156,10 @@ def create_app(): # Add configured sources if MEDPLUM_URL: gateway.add_source("medplum", MEDPLUM_URL) - print("✓ Medplum configured") + logger.info("✓ Medplum configured") if EPIC_URL: gateway.add_source("epic", EPIC_URL) - print("✓ Epic configured") + logger.info("✓ Epic configured") app = HealthChainAPI(title="Sepsis Batch Screening") app.register_gateway(gateway, path="/fhir") @@ -170,7 +167,6 @@ def create_app(): return app, gateway -# Create app at module level app, gateway = create_app() From ad1fefaa8870fa8b09aa79d9ae4563ff97bdc49f Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 11:51:37 +0000 Subject: [PATCH 07/12] Make patient ids easier to copy and paste --- scripts/extract_mimic_demo_patients.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/scripts/extract_mimic_demo_patients.py b/scripts/extract_mimic_demo_patients.py index 
13526dfb..8e68e86e 100755 --- a/scripts/extract_mimic_demo_patients.py +++ b/scripts/extract_mimic_demo_patients.py @@ -280,6 +280,8 @@ def main(): args.output.mkdir(parents=True, exist_ok=True) print(f"\nExtracting to {args.output}/") + uploaded_ids = [] # Track server-assigned IDs for copy-paste output + for risk_level in ["high", "moderate", "low"]: risk_df = df[df["risk"] == risk_level] if len(risk_df) == 0: @@ -320,9 +322,11 @@ def main(): status = "" if args.upload and gateway: server_id = upload_bundle(gateway, output_data) - status = ( - f" ✓ uploaded (ID: {server_id})" if server_id else " ✓ uploaded" - ) + if server_id: + uploaded_ids.append((server_id, risk_level)) + status = f" ✓ uploaded (ID: {server_id})" + else: + status = " ✓ uploaded" print( f" {label}: {patient_id} ({patient['probability']:.1%}, {obs_count} obs){status}" @@ -331,7 +335,13 @@ def main(): # Print next steps print("\n" + "=" * 60) if args.upload: - print("✓ Uploaded to Medplum! Update patient IDs in sepsis_fhir_batch.py") + print("✓ Uploaded to Medplum!") + if uploaded_ids: + print("\nCopy this into sepsis_fhir_batch.py:\n") + print("DEMO_PATIENT_IDS = [") + for server_id, risk in uploaded_ids: + print(f' "{server_id}", # {risk} risk') + print("]") elif args.bundle: print("Re-run with --upload to upload to Medplum") else: From a1ba549c7278dca516ccce5025b183b1309c64ad Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 11:52:20 +0000 Subject: [PATCH 08/12] Move sepsis training scripts to scripts folder and add epic connection check --- scripts/check_epic_connection.py | 96 +++ scripts/sepsis_prediction_inference.py | 206 +++++ scripts/sepsis_prediction_training.py | 1039 ++++++++++++++++++++++++ 3 files changed, 1341 insertions(+) create mode 100644 scripts/check_epic_connection.py create mode 100644 scripts/sepsis_prediction_inference.py create mode 100644 scripts/sepsis_prediction_training.py diff --git a/scripts/check_epic_connection.py b/scripts/check_epic_connection.py new file mode 100644 index 00000000..e55ad023 --- /dev/null +++ b/scripts/check_epic_connection.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +""" +Quick Epic FHIR connection test. + +Run: python scripts/check_epic_connection.py +""" + +from dotenv import load_dotenv + +load_dotenv() + + +def main(): + print("=" * 50) + print("Epic FHIR Connection Test") + print("=" * 50) + + # 1. Load config + print("\n1. Loading config from environment...") + try: + from healthchain.gateway.clients.fhir.base import FHIRAuthConfig + + config = FHIRAuthConfig.from_env("EPIC") + print(f" ✓ client_id: {config.client_id[:8]}...") + print(f" ✓ token_url: {config.token_url}") + print(f" ✓ base_url: {config.base_url}") + print(f" ✓ use_jwt_assertion: {config.use_jwt_assertion}") + except Exception as e: + print(f" ✗ Failed to load config: {e}") + return False + + # 2. Test JWT creation + print("\n2. Creating JWT assertion...") + try: + oauth_config = config.to_oauth2_config() + from healthchain.gateway.clients.auth import OAuth2TokenManager + + manager = OAuth2TokenManager(oauth_config) + jwt = manager._create_jwt_assertion() + print(f" ✓ JWT created ({len(jwt)} chars)") + except Exception as e: + print(f" ✗ JWT creation failed: {e}") + return False + + # 3. Get access token + print("\n3. 
Requesting access token from Epic...") + try: + token = manager.get_access_token() + print(f" ✓ Token received: {token[:20]}...") + except Exception as e: + print(f" ✗ Token request failed: {e}") + print("\n Possible causes:") + print(" - App changes still propagating (wait 15-30 min)") + print(" - Public key not registered in Epic App Orchard") + print(" - App not in 'Ready for Sandbox' state") + return False + + # 4. Test FHIR endpoint + print("\n4. Testing FHIR endpoint (CapabilityStatement)...") + try: + from healthchain.gateway.clients.fhir.sync.client import FHIRClient + + client = FHIRClient(config) + caps = client.capabilities() + print(f" ✓ FHIR server: {caps.software.name if caps.software else 'Unknown'}") + print(f" ✓ FHIR version: {caps.fhirVersion}") + except Exception as e: + print(f" ✗ FHIR request failed: {e}") + return False + + # 5. Test patient read (optional) + print("\n5. Testing Patient read...") + test_patient_id = "e0w0LEDCYtfckT6N.CkJKCw3" # Epic sandbox patient + try: + from fhir.resources.patient import Patient + + patient = client.read(Patient, test_patient_id) + name = patient.name[0] if patient.name else None + print( + f" ✓ Patient: {name.given[0] if name and name.given else '?'} {name.family if name else '?'}" + ) + except Exception as e: + print(f" ⚠ Patient read failed: {e}") + print(" (This may be a permissions issue, not a connection issue)") + + print("\n" + "=" * 50) + print("✓ Epic connection working!") + print("=" * 50) + return True + + +if __name__ == "__main__": + import sys + + success = main() + sys.exit(0 if success else 1) diff --git a/scripts/sepsis_prediction_inference.py b/scripts/sepsis_prediction_inference.py new file mode 100644 index 00000000..33edb858 --- /dev/null +++ b/scripts/sepsis_prediction_inference.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +""" +Sepsis Prediction Inference Script + +Demonstrates how to load and use the trained sepsis prediction model. + +Requirements: +- pip install scikit-learn xgboost joblib pandas numpy + +Usage: +- python sepsis_prediction_inference.py +""" + +import pandas as pd +import numpy as np +from pathlib import Path +from typing import Dict, Union, Tuple +import joblib + + +def load_model(model_path: Union[str, Path]) -> Dict: + """ + Load trained sepsis prediction model. + + Args: + model_path: Path to saved model file + + Returns: + Dictionary containing model, scaler, and metadata + """ + print(f"Loading model from {model_path}...") + model_data = joblib.load(model_path) + + metadata = model_data["metadata"] + print(f" Model: {metadata['model_name']}") + print(f" Training date: {metadata['training_date']}") + print(f" Features: {', '.join(metadata['feature_names'])}") + print(f" Test F1-score: {metadata['metrics']['f1']:.4f}") + print(f" Test AUC-ROC: {metadata['metrics']['auc']:.4f}") + + if "optimal_threshold" in metadata["metrics"]: + print(f" Optimal threshold: {metadata['metrics']['optimal_threshold']:.4f}") + print(f" Optimal F1-score: {metadata['metrics']['optimal_f1']:.4f}") + + return model_data + + +def predict_sepsis( + model_data: Dict, patient_features: pd.DataFrame, use_optimal_threshold: bool = True +) -> Tuple[np.ndarray, np.ndarray]: + """ + Predict sepsis risk for patient(s). 
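+
+    Example (illustrative; assumes a model saved by the training script):
+        >>> model_data = load_model("models/sepsis_model.pkl")
+        >>> preds, probs = predict_sepsis(model_data, create_example_patients())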
+ + Args: + model_data: Dictionary containing model, scaler, and metadata + patient_features: DataFrame with patient features + use_optimal_threshold: Whether to use optimal threshold (default: True) + + Returns: + Tuple of (predictions, probabilities) + """ + model = model_data["model"] + scaler = model_data["scaler"] + metadata = model_data["metadata"] + feature_names = metadata["feature_names"] + + # Ensure features are in correct order + patient_features = patient_features[feature_names] + + # Apply scaling if Logistic Regression + if scaler is not None: + patient_features_scaled = scaler.transform(patient_features) + probabilities = model.predict_proba(patient_features_scaled)[:, 1] + else: + probabilities = model.predict_proba(patient_features)[:, 1] + + # Use optimal threshold if available and requested + if use_optimal_threshold and "optimal_threshold" in metadata["metrics"]: + threshold = metadata["metrics"]["optimal_threshold"] + else: + threshold = 0.5 + + predictions = (probabilities >= threshold).astype(int) + + return predictions, probabilities + + +def create_example_patients() -> pd.DataFrame: + """ + Create example patient data for demonstration. + + Returns: + DataFrame with example patient features + """ + # Example patient data + # Patient 1: Healthy patient (low risk) + # Patient 2: Moderate risk (some abnormal values) + # Patient 3: Low risk (normal values) + # Patient 4: High risk for sepsis (multiple severe abnormalities) + # Patient 5: Critical sepsis risk (severe multi-organ dysfunction) + patients = pd.DataFrame( + { + "heart_rate": [85, 110, 75, 130, 145], # beats/min (normal: 60-100) + "temperature": [ + 37.2, + 38.5, + 36.8, + 39.2, + 35.5, + ], # Celsius (normal: 36.5-37.5, hypothermia <36) + "respiratory_rate": [16, 24, 14, 30, 35], # breaths/min (normal: 12-20) + "wbc": [8.5, 15.2, 7.0, 18.5, 22.0], # x10^9/L (normal: 4-11) + "lactate": [ + 1.2, + 3.5, + 0.9, + 4.8, + 6.5, + ], # mmol/L (normal: <2, severe sepsis: >4) + "creatinine": [0.9, 1.8, 0.8, 2.5, 3.2], # mg/dL (normal: 0.6-1.2) + "age": [45, 68, 35, 72, 78], # years + "gender_encoded": [1, 0, 1, 1, 0], # 1=Male, 0=Female + } + ) + + return patients + + +def interpret_results( + predictions: np.ndarray, probabilities: np.ndarray, patient_features: pd.DataFrame +) -> None: + """ + Interpret and display prediction results. 
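+    Risk bands: probability >= 70% is HIGH, >= 40% is MODERATE, otherwise LOW.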
+ + Args: + predictions: Binary predictions (0=no sepsis, 1=sepsis) + probabilities: Probability scores + patient_features: Original patient features + """ + print("\n" + "=" * 80) + print("SEPSIS PREDICTION RESULTS") + print("=" * 80) + + for i in range(len(predictions)): + print(f"\nPatient {i+1}:") + print(f" Risk Score: {probabilities[i]:.2%}") + print(f" Prediction: {'SEPSIS RISK' if predictions[i] == 1 else 'Low Risk'}") + + # Show key vital signs + print(" Key Features:") + print(f" Heart Rate: {patient_features.iloc[i]['heart_rate']:.1f} bpm") + print(f" Temperature: {patient_features.iloc[i]['temperature']:.1f}°C") + print( + f" Respiratory Rate: {patient_features.iloc[i]['respiratory_rate']:.1f} /min" + ) + print(f" WBC: {patient_features.iloc[i]['wbc']:.1f} x10^9/L") + print(f" Lactate: {patient_features.iloc[i]['lactate']:.1f} mmol/L") + print(f" Creatinine: {patient_features.iloc[i]['creatinine']:.2f} mg/dL") + + # Risk interpretation + if probabilities[i] >= 0.7: + risk_level = "HIGH" + elif probabilities[i] >= 0.4: + risk_level = "MODERATE" + else: + risk_level = "LOW" + + print(f" Clinical Interpretation: {risk_level} RISK") + + print("\n" + "=" * 80) + + +def main(): + """Main inference pipeline.""" + # Model path (relative to script location) + script_dir = Path(__file__).parent + model_path = script_dir / "models" / "sepsis_model.pkl" + + print("=" * 80) + print("Sepsis Prediction Inference") + print("=" * 80 + "\n") + + # Load model + model_data = load_model(model_path) + + # Create example patients + print("\nCreating example patient data...") + patient_features = create_example_patients() + print(f"Number of patients: {len(patient_features)}") + + # Make predictions + print("\nMaking predictions...") + predictions, probabilities = predict_sepsis( + model_data, patient_features, use_optimal_threshold=True + ) + + # Interpret results + interpret_results(predictions, probabilities, patient_features) + + print("\n" + "=" * 80) + print("Inference complete!") + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/scripts/sepsis_prediction_training.py b/scripts/sepsis_prediction_training.py new file mode 100644 index 00000000..a0ea85ce --- /dev/null +++ b/scripts/sepsis_prediction_training.py @@ -0,0 +1,1039 @@ +#!/usr/bin/env python3 +""" +Sepsis Prediction Training Script + +Trains Random Forest, XGBoost, and Logistic Regression models for sepsis prediction +using MIMIC-IV clinical database data. 
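+
+Features are aggregated over the first 24 hours of each ICU stay; labels come
+from ICD-9 and ICD-10 sepsis diagnosis codes on the matching admission.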
+ +Requirements: +- pip install scikit-learn xgboost joblib pandas numpy + +Run: +- python sepsis_prediction_training.py +""" + +import pandas as pd +import numpy as np +from pathlib import Path +from datetime import datetime +from typing import Dict, Tuple, List, Any, Union + +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import ( + accuracy_score, + precision_score, + recall_score, + f1_score, + roc_auc_score, + precision_recall_curve, +) +import xgboost as xgb +import joblib + + +# MIMIC-IV ItemID mappings for features +CHARTEVENTS_ITEMIDS = { + "heart_rate": 220050, + "temperature_f": 223761, + "temperature_c": 223762, + "respiratory_rate": 220210, +} + +LABEVENTS_ITEMIDS = { + "wbc": [51300, 51301], # White Blood Cell Count + "lactate": 50813, + "creatinine": 50912, +} + +# Sepsis ICD-10 codes +SEPSIS_ICD10_CODES = [ + "A41.9", # Sepsis, unspecified organism + "A40", # Streptococcal sepsis (starts with) + "A41", # Other sepsis (starts with) + "R65.20", # Severe sepsis without shock + "R65.21", # Severe sepsis with shock + "R65.1", # SIRS (Systemic Inflammatory Response Syndrome) + "A41.0", # Sepsis due to Streptococcus, group A + "A41.1", # Sepsis due to Streptococcus, group B + "A41.2", # Sepsis due to other specified streptococci + "A41.3", # Sepsis due to Haemophilus influenzae + "A41.4", # Sepsis due to anaerobes + "A41.5", # Sepsis due to other Gram-negative organisms + "A41.50", # Sepsis due to unspecified Gram-negative organism + "A41.51", # Sepsis due to Escherichia coli + "A41.52", # Sepsis due to Pseudomonas + "A41.53", # Sepsis due to Serratia + "A41.59", # Sepsis due to other Gram-negative organisms + "A41.8", # Other specified sepsis + "A41.81", # Sepsis due to Enterococcus + "A41.89", # Other specified sepsis +] + +# Sepsis ICD-9 codes (for older data) +SEPSIS_ICD9_CODES = [ + "038", # Septicemia (starts with) + "99591", # Sepsis + "99592", # Severe sepsis + "78552", # Septic shock +] + + +def load_mimic_data(data_dir: str) -> Dict[str, pd.DataFrame]: + """ + Load all required MIMIC-IV CSV tables. + + Args: + data_dir: Path to MIMIC-IV dataset directory + + Returns: + Dictionary mapping table names to DataFrames + """ + data_dir = Path(data_dir) + + print("Loading MIMIC-IV data...") + + tables = { + "patients": pd.read_csv( + data_dir / "hosp" / "patients.csv.gz", compression="gzip", low_memory=False + ), + "admissions": pd.read_csv( + data_dir / "hosp" / "admissions.csv.gz", + compression="gzip", + low_memory=False, + ), + "icustays": pd.read_csv( + data_dir / "icu" / "icustays.csv.gz", compression="gzip", low_memory=False + ), + "chartevents": pd.read_csv( + data_dir / "icu" / "chartevents.csv.gz", + compression="gzip", + low_memory=False, + ), + "labevents": pd.read_csv( + data_dir / "hosp" / "labevents.csv.gz", compression="gzip", low_memory=False + ), + "diagnoses_icd": pd.read_csv( + data_dir / "hosp" / "diagnoses_icd.csv.gz", + compression="gzip", + low_memory=False, + ), + } + + print(f"Loaded {len(tables)} tables") + for name, df in tables.items(): + print(f" {name}: {len(df)} rows") + + return tables + + +def extract_chartevents_features( + chartevents: pd.DataFrame, icustays: pd.DataFrame +) -> pd.DataFrame: + """ + Extract 2-3 vital signs from chartevents table. 
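+    Specifically: heart_rate, temperature (Celsius; Fahrenheit readings are
+    converted), and respiratory_rate, each averaged over the first 24 hours of
+    the stay.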
+ + Args: + chartevents: Chart events DataFrame + icustays: ICU stays DataFrame + + Returns: + DataFrame with features per stay_id + """ + print("Extracting chartevents features...") + + # Filter to relevant itemids + relevant_itemids = list(CHARTEVENTS_ITEMIDS.values()) + chartevents_filtered = chartevents[ + chartevents["itemid"].isin(relevant_itemids) + ].copy() + + # Merge with icustays to get stay times + chartevents_merged = chartevents_filtered.merge( + icustays[["stay_id", "intime", "outtime"]], on="stay_id", how="inner" + ) + + # Convert charttime to datetime + chartevents_merged["charttime"] = pd.to_datetime(chartevents_merged["charttime"]) + chartevents_merged["intime"] = pd.to_datetime(chartevents_merged["intime"]) + + # Filter to first 24 hours of ICU stay + chartevents_merged = chartevents_merged[ + (chartevents_merged["charttime"] >= chartevents_merged["intime"]) + & ( + chartevents_merged["charttime"] + <= chartevents_merged["intime"] + pd.Timedelta(hours=24) + ) + ] + + # Extract numeric values + chartevents_merged["valuenum"] = pd.to_numeric( + chartevents_merged["valuenum"], errors="coerce" + ) + + # Aggregate by stay_id and itemid (take mean) + features = [] + + for stay_id in icustays["stay_id"].unique(): + stay_data = chartevents_merged[chartevents_merged["stay_id"] == stay_id] + + feature_row = {"stay_id": stay_id} + + # Heart Rate + hr_data = stay_data[stay_data["itemid"] == CHARTEVENTS_ITEMIDS["heart_rate"]][ + "valuenum" + ] + feature_row["heart_rate"] = hr_data.mean() if not hr_data.empty else np.nan + + # Temperature (prefer Celsius, convert Fahrenheit if needed) + temp_c = stay_data[stay_data["itemid"] == CHARTEVENTS_ITEMIDS["temperature_c"]][ + "valuenum" + ] + temp_f = stay_data[stay_data["itemid"] == CHARTEVENTS_ITEMIDS["temperature_f"]][ + "valuenum" + ] + + if not temp_c.empty: + feature_row["temperature"] = temp_c.mean() + elif not temp_f.empty: + # Convert Fahrenheit to Celsius + feature_row["temperature"] = (temp_f.mean() - 32) * 5 / 9 + else: + feature_row["temperature"] = np.nan + + # Respiratory Rate + rr_data = stay_data[ + stay_data["itemid"] == CHARTEVENTS_ITEMIDS["respiratory_rate"] + ]["valuenum"] + feature_row["respiratory_rate"] = ( + rr_data.mean() if not rr_data.empty else np.nan + ) + + features.append(feature_row) + + return pd.DataFrame(features) + + +def extract_labevents_features( + labevents: pd.DataFrame, icustays: pd.DataFrame +) -> pd.DataFrame: + """ + Extract 2-3 lab values from labevents table. 
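+    Specifically: wbc, lactate, and creatinine, linked to ICU stays via hadm_id
+    and averaged over the first 24 hours of the stay.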
+ + Args: + labevents: Lab events DataFrame + icustays: ICU stays DataFrame + + Returns: + DataFrame with features per stay_id + """ + print("Extracting labevents features...") + + # Get relevant itemids + relevant_itemids = [ + LABEVENTS_ITEMIDS["lactate"], + LABEVENTS_ITEMIDS["creatinine"], + ] + LABEVENTS_ITEMIDS["wbc"] + + labevents_filtered = labevents[labevents["itemid"].isin(relevant_itemids)].copy() + + # Merge with icustays via admissions + # First need to get hadm_id from icustays + icustays_with_hadm = icustays[["stay_id", "hadm_id", "intime"]].copy() + + # Labevents links via hadm_id, then we need to link to stay_id + labevents_merged = labevents_filtered.merge( + icustays_with_hadm, on="hadm_id", how="inner" + ) + + # Convert charttime to datetime + labevents_merged["charttime"] = pd.to_datetime(labevents_merged["charttime"]) + labevents_merged["intime"] = pd.to_datetime(labevents_merged["intime"]) + + # Filter to first 24 hours of ICU stay + labevents_merged = labevents_merged[ + (labevents_merged["charttime"] >= labevents_merged["intime"]) + & ( + labevents_merged["charttime"] + <= labevents_merged["intime"] + pd.Timedelta(hours=24) + ) + ] + + # Extract numeric values + labevents_merged["valuenum"] = pd.to_numeric( + labevents_merged["valuenum"], errors="coerce" + ) + + # Aggregate by stay_id and itemid + features = [] + + for stay_id in icustays["stay_id"].unique(): + stay_data = labevents_merged[labevents_merged["stay_id"] == stay_id] + + feature_row = {"stay_id": stay_id} + + # WBC (check both itemids) + wbc_data = stay_data[stay_data["itemid"].isin(LABEVENTS_ITEMIDS["wbc"])][ + "valuenum" + ] + feature_row["wbc"] = wbc_data.mean() if not wbc_data.empty else np.nan + + # Lactate + lactate_data = stay_data[stay_data["itemid"] == LABEVENTS_ITEMIDS["lactate"]][ + "valuenum" + ] + feature_row["lactate"] = ( + lactate_data.mean() if not lactate_data.empty else np.nan + ) + + # Creatinine + creatinine_data = stay_data[ + stay_data["itemid"] == LABEVENTS_ITEMIDS["creatinine"] + ]["valuenum"] + feature_row["creatinine"] = ( + creatinine_data.mean() if not creatinine_data.empty else np.nan + ) + + features.append(feature_row) + + return pd.DataFrame(features) + + +def extract_demographics( + patients: pd.DataFrame, admissions: pd.DataFrame, icustays: pd.DataFrame +) -> pd.DataFrame: + """ + Extract age and gender from patients table. + + Args: + patients: Patients DataFrame + admissions: Admissions DataFrame (not used, kept for compatibility) + icustays: ICU stays DataFrame + + Returns: + DataFrame with demographics per stay_id + """ + print("Extracting demographics...") + + # icustays already has subject_id, so merge directly with patients + icustays_with_patient = icustays[["stay_id", "subject_id"]].merge( + patients[["subject_id", "gender", "anchor_age"]], on="subject_id", how="left" + ) + + # Use anchor_age if available, otherwise calculate from anchor_year and anchor_age + # For demo data, anchor_age should be available + demographics = icustays_with_patient[["stay_id", "anchor_age", "gender"]].copy() + demographics.rename(columns={"anchor_age": "age"}, inplace=True) + + # Encode gender (M=1, F=0) + demographics["gender_encoded"] = (demographics["gender"] == "M").astype(int) + + return demographics[["stay_id", "age", "gender_encoded"]] + + +def extract_sepsis_labels( + diagnoses_icd: pd.DataFrame, icustays: pd.DataFrame +) -> pd.DataFrame: + """ + Extract sepsis labels from diagnoses_icd table. + Checks both ICD-9 and ICD-10 codes to maximize positive samples. 
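+    Short codes (e.g. "A41", "038") are treated as prefixes; a stay is labelled
+    positive if its admission (hadm_id) carries any matching diagnosis.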
+ + Args: + diagnoses_icd: Diagnoses ICD DataFrame + icustays: ICU stays DataFrame + + Returns: + DataFrame with sepsis labels per stay_id + """ + print("Extracting sepsis labels...") + + # Check what ICD versions are available + icd_versions = diagnoses_icd["icd_version"].unique() + print(f" Available ICD versions: {sorted(icd_versions)}") + + all_sepsis_diagnoses = [] + + # Check ICD-10 codes + if 10 in icd_versions: + diagnoses_icd10 = diagnoses_icd[diagnoses_icd["icd_version"] == 10].copy() + print(f" ICD-10 diagnoses: {len(diagnoses_icd10)} rows") + + sepsis_mask = pd.Series( + [False] * len(diagnoses_icd10), index=diagnoses_icd10.index + ) + + for code in SEPSIS_ICD10_CODES: + if "." not in code or code.endswith("."): + # Pattern match (e.g., "A40" matches "A40.x") + code_prefix = code.rstrip(".") + mask = diagnoses_icd10["icd_code"].str.startswith(code_prefix, na=False) + sepsis_mask |= mask + if mask.sum() > 0: + print( + f" Found {mask.sum()} ICD-10 diagnoses matching pattern '{code}'" + ) + else: + # Exact match + mask = diagnoses_icd10["icd_code"] == code + sepsis_mask |= mask + if mask.sum() > 0: + print( + f" Found {mask.sum()} ICD-10 diagnoses with exact code '{code}'" + ) + + sepsis_icd10 = diagnoses_icd10[sepsis_mask].copy() + if len(sepsis_icd10) > 0: + all_sepsis_diagnoses.append(sepsis_icd10) + print(f" Total ICD-10 sepsis diagnoses: {len(sepsis_icd10)}") + + # Check ICD-9 codes + if 9 in icd_versions: + diagnoses_icd9 = diagnoses_icd[diagnoses_icd["icd_version"] == 9].copy() + print(f" ICD-9 diagnoses: {len(diagnoses_icd9)} rows") + + sepsis_mask = pd.Series( + [False] * len(diagnoses_icd9), index=diagnoses_icd9.index + ) + + for code in SEPSIS_ICD9_CODES: + if len(code) <= 3 or code.endswith("."): + # Pattern match (e.g., "038" matches "038.x") + code_prefix = code.rstrip(".") + mask = diagnoses_icd9["icd_code"].str.startswith(code_prefix, na=False) + sepsis_mask |= mask + if mask.sum() > 0: + print( + f" Found {mask.sum()} ICD-9 diagnoses matching pattern '{code}'" + ) + else: + # Exact match + mask = diagnoses_icd9["icd_code"] == code + sepsis_mask |= mask + if mask.sum() > 0: + print( + f" Found {mask.sum()} ICD-9 diagnoses with exact code '{code}'" + ) + + sepsis_icd9 = diagnoses_icd9[sepsis_mask].copy() + if len(sepsis_icd9) > 0: + all_sepsis_diagnoses.append(sepsis_icd9) + print(f" Total ICD-9 sepsis diagnoses: {len(sepsis_icd9)}") + + # Combine all sepsis diagnoses + if all_sepsis_diagnoses: + sepsis_diagnoses = pd.concat(all_sepsis_diagnoses, ignore_index=True) + print(f" Total sepsis diagnoses (ICD-9 + ICD-10): {len(sepsis_diagnoses)}") + + if len(sepsis_diagnoses) > 0: + print( + f" Sample sepsis ICD codes: {sepsis_diagnoses['icd_code'].unique()[:15].tolist()}" + ) + print( + f" Unique hadm_id with sepsis: {sepsis_diagnoses['hadm_id'].nunique()}" + ) + else: + sepsis_diagnoses = pd.DataFrame(columns=diagnoses_icd.columns) + print(" No sepsis diagnoses found") + + # Merge with icustays to get stay_id + icustays_with_hadm = icustays[["stay_id", "hadm_id"]].copy() + + if len(sepsis_diagnoses) > 0: + sepsis_labels = icustays_with_hadm.merge( + sepsis_diagnoses[["hadm_id"]].drop_duplicates(), + on="hadm_id", + how="left", + indicator=True, + ) + else: + sepsis_labels = icustays_with_hadm.copy() + sepsis_labels["_merge"] = "left_only" + + # Create binary label (1 if sepsis, 0 otherwise) + sepsis_labels["sepsis"] = (sepsis_labels["_merge"] == "both").astype(int) + + sepsis_count = sepsis_labels["sepsis"].sum() + print( + f" ICU stays with sepsis: 
{sepsis_count}/{len(sepsis_labels)} ({sepsis_count/len(sepsis_labels)*100:.2f}%)" + ) + + return sepsis_labels[["stay_id", "sepsis"]] + + +def print_feature_summary(X: pd.DataFrame): + """Print feature statistics with FHIR mapping information. + + Args: + X: Feature matrix with actual data + """ + print("\n" + "=" * 120) + print("FEATURE SUMMARY: MIMIC-IV → Model → FHIR Mapping") + print("=" * 120) + + # Define FHIR mappings for each feature + fhir_mappings = { + "heart_rate": { + "mimic_table": "chartevents", + "mimic_itemid": "220050", + "fhir_resource": "Observation", + "fhir_code": "8867-4", + "fhir_system": "LOINC", + "fhir_display": "Heart rate", + }, + "temperature": { + "mimic_table": "chartevents", + "mimic_itemid": "223762/223761", + "fhir_resource": "Observation", + "fhir_code": "8310-5", + "fhir_system": "LOINC", + "fhir_display": "Body temperature", + }, + "respiratory_rate": { + "mimic_table": "chartevents", + "mimic_itemid": "220210", + "fhir_resource": "Observation", + "fhir_code": "9279-1", + "fhir_system": "LOINC", + "fhir_display": "Respiratory rate", + }, + "wbc": { + "mimic_table": "labevents", + "mimic_itemid": "51300/51301", + "fhir_resource": "Observation", + "fhir_code": "6690-2", + "fhir_system": "LOINC", + "fhir_display": "Leukocytes [#/volume] in Blood", + }, + "lactate": { + "mimic_table": "labevents", + "mimic_itemid": "50813", + "fhir_resource": "Observation", + "fhir_code": "2524-7", + "fhir_system": "LOINC", + "fhir_display": "Lactate [Moles/volume] in Blood", + }, + "creatinine": { + "mimic_table": "labevents", + "mimic_itemid": "50912", + "fhir_resource": "Observation", + "fhir_code": "2160-0", + "fhir_system": "LOINC", + "fhir_display": "Creatinine [Mass/volume] in Serum or Plasma", + }, + "age": { + "mimic_table": "patients", + "mimic_itemid": "anchor_age", + "fhir_resource": "Patient", + "fhir_code": "birthDate", + "fhir_system": "FHIR Core", + "fhir_display": "Patient birth date (calculate age)", + }, + "gender_encoded": { + "mimic_table": "patients", + "mimic_itemid": "gender", + "fhir_resource": "Patient", + "fhir_code": "gender", + "fhir_system": "FHIR Core", + "fhir_display": "Administrative Gender (M/F)", + }, + } + + print( + f"\n{'Feature':<20} {'Mean±SD':<20} {'MIMIC Source':<20} {'FHIR Resource':<20} {'FHIR Code (System)':<30}" + ) + print("-" * 120) + + for feature in X.columns: + mapping = fhir_mappings.get(feature, {}) + + # Calculate statistics + mean_val = X[feature].mean() + std_val = X[feature].std() + + # Format based on feature type + if feature == "gender_encoded": + stats = f"{mean_val:.2f} (M={X[feature].sum():.0f})" + else: + stats = f"{mean_val:.2f}±{std_val:.2f}" + + mimic_source = f"{mapping.get('mimic_table', 'N/A')} ({mapping.get('mimic_itemid', 'N/A')})" + fhir_resource = mapping.get("fhir_resource", "N/A") + fhir_code = ( + f"{mapping.get('fhir_code', 'N/A')} ({mapping.get('fhir_system', 'N/A')})" + ) + + print( + f"{feature:<20} {stats:<20} {mimic_source:<20} {fhir_resource:<20} {fhir_code:<30}" + ) + + print("\n" + "=" * 120) + print( + "Note: Statistics calculated from first 24 hours of ICU stay. Missing values imputed with median." + ) + print("=" * 120 + "\n") + + +def create_feature_matrix( + chartevents_features: pd.DataFrame, + labevents_features: pd.DataFrame, + demographics: pd.DataFrame, + sepsis_labels: pd.DataFrame, +) -> Tuple[pd.DataFrame, pd.Series]: + """ + Create feature matrix and labels from extracted features. 
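+    Output columns: heart_rate, temperature, respiratory_rate, wbc, lactate,
+    creatinine, age, gender_encoded; labels are the binary sepsis indicator.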
+ + Args: + chartevents_features: Chart events features + labevents_features: Lab events features + demographics: Demographics features + sepsis_labels: Sepsis labels + + Returns: + Tuple of (feature matrix, labels) + """ + print("Creating feature matrix...") + + # Merge all features on stay_id + features = ( + chartevents_features.merge(labevents_features, on="stay_id", how="outer") + .merge(demographics, on="stay_id", how="outer") + .merge(sepsis_labels, on="stay_id", how="inner") + ) + + # Select feature columns (exclude stay_id and sepsis) + feature_cols = [ + "heart_rate", + "temperature", + "respiratory_rate", + "wbc", + "lactate", + "creatinine", + "age", + "gender_encoded", + ] + + X = features[feature_cols].copy() + y = features["sepsis"].copy() + + print(f"Feature matrix shape: {X.shape}") + print(f"Sepsis cases: {y.sum()} ({y.sum() / len(y) * 100:.2f}%)") + + return X, y + + +def train_models(X_train: pd.DataFrame, y_train: pd.Series) -> Dict[str, Any]: + """ + Train all three models (Random Forest, XGBoost, Logistic Regression). + + Args: + X_train: Training features + y_train: Training labels + + Returns: + Dictionary of trained models + """ + print("\nTraining models...") + + models = {} + + # Check if we have any positive samples + positive_samples = y_train.sum() + total_samples = len(y_train) + positive_rate = positive_samples / total_samples if total_samples > 0 else 0.0 + + print( + f" Positive samples: {positive_samples}/{total_samples} ({positive_rate*100:.2f}%)" + ) + + # Random Forest - use class_weight to handle imbalance + print(" Training Random Forest...") + rf = RandomForestClassifier( + n_estimators=100, + random_state=42, + n_jobs=-1, + class_weight="balanced", # Automatically adjust for class imbalance + ) + rf.fit(X_train, y_train) + models["RandomForest"] = rf + + # XGBoost - handle case with no positive samples + print(" Training XGBoost...") + if positive_samples == 0: + # When there are no positive samples, set base_score to a small value + # and use scale_pos_weight to avoid errors + xgb_model = xgb.XGBClassifier( + random_state=42, + n_jobs=-1, + eval_metric="logloss", + base_score=0.01, # Small positive value instead of 0 + scale_pos_weight=1.0, + ) + else: + # Calculate scale_pos_weight for imbalanced data + scale_pos_weight = (total_samples - positive_samples) / positive_samples + xgb_model = xgb.XGBClassifier( + random_state=42, + n_jobs=-1, + eval_metric="logloss", + scale_pos_weight=scale_pos_weight, + ) + xgb_model.fit(X_train, y_train) + models["XGBoost"] = xgb_model + + # Logistic Regression (with scaling) - use class_weight to handle imbalance + print(" Training Logistic Regression...") + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train) + lr = LogisticRegression( + random_state=42, + max_iter=1000, + class_weight="balanced", # Automatically adjust for class imbalance + ) + lr.fit(X_train_scaled, y_train) + models["LogisticRegression"] = lr + models["scaler"] = scaler # Store scaler for later use + + return models + + +def evaluate_models( + models: Dict[str, Any], + X_test: pd.DataFrame, + y_test: pd.Series, + feature_names: List[str], +) -> Dict[str, Dict[str, float]]: + """ + Evaluate and compare all models. 
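+    Metrics are reported at the default 0.5 threshold, plus an F1-optimal
+    threshold derived from the precision-recall curve.
+
+    Example (illustrative, given trained models and a held-out split):
+        >>> results = evaluate_models(models, X_test, y_test, X.columns.tolist())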
+ + Args: + models: Dictionary of trained models + X_test: Test features + y_test: Test labels + feature_names: List of feature names + + Returns: + Dictionary of evaluation metrics for each model + """ + print("\nEvaluating models...") + print( + f"Test set: {len(y_test)} samples, {y_test.sum()} positive ({y_test.sum()/len(y_test)*100:.2f}%)" + ) + + results = {} + + for name, model in models.items(): + if name == "scaler": + continue + + # Get probability predictions + if name == "LogisticRegression": + X_test_scaled = models["scaler"].transform(X_test) + y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] + else: + y_pred_proba = model.predict_proba(X_test)[:, 1] + + # Use default threshold (0.5) for predictions + y_pred = (y_pred_proba >= 0.5).astype(int) + + # Calculate metrics with default threshold + metrics = { + "accuracy": accuracy_score(y_test, y_pred), + "precision": precision_score(y_test, y_pred, zero_division=0), + "recall": recall_score(y_test, y_pred, zero_division=0), + "f1": f1_score(y_test, y_pred, zero_division=0), + "auc": roc_auc_score(y_test, y_pred_proba) + if len(np.unique(y_test)) > 1 + else 0.0, + } + + # Try to find optimal threshold for F1 score + if len(np.unique(y_test)) > 1 and y_test.sum() > 0: + precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba) + f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10) + optimal_idx = np.argmax(f1_scores) + optimal_threshold = ( + thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5 + ) + optimal_f1 = f1_scores[optimal_idx] + + # Predictions with optimal threshold + y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int) + metrics["optimal_threshold"] = optimal_threshold + metrics["optimal_f1"] = optimal_f1 + metrics["optimal_precision"] = precision_score( + y_test, y_pred_optimal, zero_division=0 + ) + metrics["optimal_recall"] = recall_score( + y_test, y_pred_optimal, zero_division=0 + ) + else: + metrics["optimal_threshold"] = 0.5 + metrics["optimal_f1"] = 0.0 + metrics["optimal_precision"] = 0.0 + metrics["optimal_recall"] = 0.0 + + results[name] = metrics + + print(f"\n{name}:") + print( + f" Predictions: {y_pred.sum()} positive predicted (actual: {y_test.sum()})" + ) + print(f" Accuracy: {metrics['accuracy']:.4f}") + print(f" Precision: {metrics['precision']:.4f}") + print(f" Recall: {metrics['recall']:.4f}") + print(f" F1-score: {metrics['f1']:.4f}") + print(f" AUC-ROC: {metrics['auc']:.4f}") + if metrics["optimal_f1"] > 0: + print(f" Optimal threshold: {metrics['optimal_threshold']:.4f}") + print(f" Optimal F1-score: {metrics['optimal_f1']:.4f}") + print(f" Optimal Precision: {metrics['optimal_precision']:.4f}") + print(f" Optimal Recall: {metrics['optimal_recall']:.4f}") + + # Show feature importance for tree-based models + if hasattr(model, "feature_importances_"): + print("\n Top 5 Feature Importances:") + importances = model.feature_importances_ + indices = np.argsort(importances)[::-1][:5] + for idx in indices: + print(f" {feature_names[idx]}: {importances[idx]:.4f}") + + return results + + +def select_best_model( + models: Dict[str, Any], + results: Dict[str, Dict[str, float]], + metric: str = "f1", +) -> Tuple[str, Any, Dict[str, float]]: + """ + Select best model based on specified metric. 
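+    Prefers the threshold-optimised variant of a metric when present
+    (e.g. optimal_f1 over f1).
+
+    Example (illustrative; recall is often prioritised for sepsis screening):
+        >>> name, model, metrics = select_best_model(models, results, metric="recall")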
+ + Args: + models: Dictionary of trained models + results: Evaluation results + metric: Metric to optimize ("f1", "recall", "precision", "auc") + + Returns: + Tuple of (best model name, best model, best metrics) + """ + print(f"\nSelecting best model based on {metric}...") + + # Get the appropriate metric value (prefer optimal if available) + def get_metric_value(metrics, metric_name): + if metric_name == "f1": + return metrics.get("optimal_f1", metrics["f1"]) + elif metric_name == "recall": + return metrics.get("optimal_recall", metrics["recall"]) + elif metric_name == "precision": + return metrics.get("optimal_precision", metrics["precision"]) + elif metric_name == "auc": + return metrics.get("auc", 0.0) + else: + return metrics.get("optimal_f1", metrics["f1"]) + + best_name = max(results.keys(), key=lambda k: get_metric_value(results[k], metric)) + best_model = models[best_name] + best_metrics = results[best_name] + + best_value = get_metric_value(best_metrics, metric) + print(f"Best model: {best_name} ({metric}: {best_value:.4f})") + + return best_name, best_model, best_metrics + + +def save_model( + model: Any, + model_name: str, + feature_names: List[str], + metrics: Dict[str, float], + scaler: Any, + output_path: Union[str, Path], +) -> None: + """ + Save the best model with metadata. + + Args: + model: Trained model + model_name: Name of the model + feature_names: List of feature names + metrics: Evaluation metrics + scaler: StandardScaler (if Logistic Regression, None otherwise) + output_path: Path to save model + """ + print(f"\nSaving model to {output_path}...") + + # Create output directory if it doesn't exist + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Prepare metadata + metadata = { + "model_name": model_name, + "training_date": datetime.now().isoformat(), + "feature_names": feature_names, + "metrics": metrics, + "itemid_mappings": { + "chartevents": CHARTEVENTS_ITEMIDS, + "labevents": LABEVENTS_ITEMIDS, + }, + "sepsis_icd_codes": { + "icd10": SEPSIS_ICD10_CODES, + "icd9": SEPSIS_ICD9_CODES, + }, + } + + # Save model and metadata + model_data = { + "model": model, + "scaler": scaler, + "metadata": metadata, + } + + joblib.dump(model_data, output_path) + + print("Model saved successfully!") + + +def main(): + """Main training pipeline.""" + # Data directory + data_dir = "../datasets/mimic-iv-clinical-database-demo-2.2" + + # Output path (relative to script location) + script_dir = Path(__file__).parent + output_path = script_dir / "models" / "sepsis_model.pkl" + + print("=" * 60) + print("Sepsis Prediction Model Training") + print("=" * 60) + + # Load data + tables = load_mimic_data(data_dir) + + # Extract features + chartevents_features = extract_chartevents_features( + tables["chartevents"], tables["icustays"] + ) + labevents_features = extract_labevents_features( + tables["labevents"], tables["icustays"] + ) + demographics = extract_demographics( + tables["patients"], tables["admissions"], tables["icustays"] + ) + + # Extract labels + sepsis_labels = extract_sepsis_labels(tables["diagnoses_icd"], tables["icustays"]) + + # Create feature matrix + X, y = create_feature_matrix( + chartevents_features, + labevents_features, + demographics, + sepsis_labels, + ) + + # Handle missing values (impute with median) + print("\nHandling missing values...") + missing_before = X.isnull().sum().sum() + print(f" Missing values before imputation: {missing_before}") + X = X.fillna(X.median()) + + # Print feature summary with actual data 
statistics
+    print_feature_summary(X)
+
+    # Split data with careful stratification to ensure positive samples in both sets
+    print("\nSplitting data...")
+    if len(np.unique(y)) > 1 and y.sum() > 0:
+        # Use stratification to ensure positive samples in both train and test
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=42, stratify=y
+        )
+        print(
+            f" Training set: {len(X_train)} samples ({y_train.sum()} positive, {y_train.sum()/len(y_train)*100:.2f}%)"
+        )
+        print(
+            f" Test set: {len(X_test)} samples ({y_test.sum()} positive, {y_test.sum()/len(y_test)*100:.2f}%)"
+        )
+
+        # Warn if test set has no positive samples (shouldn't happen with stratify, but check anyway)
+        if y_test.sum() == 0:
+            print(
+                " WARNING: Test set has no positive samples! Consider using a different random seed."
+            )
+    else:
+        print(
+            " Warning: No positive samples or only one class. Skipping stratification."
+        )
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=42
+        )
+        print(f" Training set: {len(X_train)} samples")
+        print(f" Test set: {len(X_test)} samples")
+
+    # Apply oversampling to training data to balance classes
+    print("\nApplying oversampling to training data...")
+    try:
+        from imblearn.over_sampling import SMOTE
+
+        # Only apply SMOTE if we have positive samples
+        if y_train.sum() > 0 and len(np.unique(y_train)) > 1:
+            print(
+                f" Before oversampling: {len(X_train)} samples ({y_train.sum()} positive, {y_train.sum()/len(y_train)*100:.2f}%)"
+            )
+            # Ensure k_neighbors doesn't exceed available positive samples
+            k_neighbors = min(5, max(1, y_train.sum() - 1))
+            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
+            X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
+            print(
+                f" After oversampling: {len(X_train_resampled)} samples ({y_train_resampled.sum()} positive, {y_train_resampled.sum()/len(X_train_resampled)*100:.2f}%)"
+            )
+            # SMOTE returns more rows than the original split, so rebuild the
+            # training frame with a fresh index instead of slicing the shorter
+            # original index (which would raise a length-mismatch ValueError).
+            X_train = pd.DataFrame(X_train_resampled, columns=X_train.columns)
+            y_train = pd.Series(y_train_resampled)
+        else:
+            print(" Skipping oversampling: insufficient positive samples")
+    except (ImportError, ModuleNotFoundError) as e:
+        print(
+            " imbalanced-learn not installed. 
Install with: pip install imbalanced-learn" + ) + print(f" Error: {e}") + print(" Proceeding without oversampling...") + + # Train models + models = train_models(X_train, y_train) + + # Evaluate models + feature_names = X.columns.tolist() + results = evaluate_models(models, X_test, y_test, feature_names) + + # Select best model (can change metric: "f1", "recall", "precision", "auc") + # For sepsis prediction, recall (sensitivity) is often most important + best_name, best_model, best_metrics = select_best_model( + models, results, metric="f1" + ) + + # Save best model + scaler = models.get("scaler") + save_model( + best_model, + best_name, + feature_names, + best_metrics, + scaler, + output_path, + ) + + print("\n" + "=" * 60) + print("Training complete!") + print("=" * 60) + + +if __name__ == "__main__": + main() From 8d68b7eddb214091356e3519198eb8a311c49c98 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 12:20:44 +0000 Subject: [PATCH 09/12] Remove output directory from git tracking --- output/README.md | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 output/README.md diff --git a/output/README.md b/output/README.md deleted file mode 100644 index 2b90777e..00000000 --- a/output/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Output Directory - -This is where the requests and responses generated by sandbox runs will be saved. From b63660709221b3a99cfbe6e029d7270ee7f8dcc8 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 12:24:09 +0000 Subject: [PATCH 10/12] Add notebooks --- cookbook/sepsis_prediction_inference.py | 206 ----- cookbook/sepsis_prediction_training.py | 1039 ----------------------- notebooks/fhir_ml_workflow.ipynb | 567 +++++++++++++ 3 files changed, 567 insertions(+), 1245 deletions(-) delete mode 100644 cookbook/sepsis_prediction_inference.py delete mode 100644 cookbook/sepsis_prediction_training.py create mode 100644 notebooks/fhir_ml_workflow.ipynb diff --git a/cookbook/sepsis_prediction_inference.py b/cookbook/sepsis_prediction_inference.py deleted file mode 100644 index 33edb858..00000000 --- a/cookbook/sepsis_prediction_inference.py +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/env python3 -""" -Sepsis Prediction Inference Script - -Demonstrates how to load and use the trained sepsis prediction model. - -Requirements: -- pip install scikit-learn xgboost joblib pandas numpy - -Usage: -- python sepsis_prediction_inference.py -""" - -import pandas as pd -import numpy as np -from pathlib import Path -from typing import Dict, Union, Tuple -import joblib - - -def load_model(model_path: Union[str, Path]) -> Dict: - """ - Load trained sepsis prediction model. 
- - Args: - model_path: Path to saved model file - - Returns: - Dictionary containing model, scaler, and metadata - """ - print(f"Loading model from {model_path}...") - model_data = joblib.load(model_path) - - metadata = model_data["metadata"] - print(f" Model: {metadata['model_name']}") - print(f" Training date: {metadata['training_date']}") - print(f" Features: {', '.join(metadata['feature_names'])}") - print(f" Test F1-score: {metadata['metrics']['f1']:.4f}") - print(f" Test AUC-ROC: {metadata['metrics']['auc']:.4f}") - - if "optimal_threshold" in metadata["metrics"]: - print(f" Optimal threshold: {metadata['metrics']['optimal_threshold']:.4f}") - print(f" Optimal F1-score: {metadata['metrics']['optimal_f1']:.4f}") - - return model_data - - -def predict_sepsis( - model_data: Dict, patient_features: pd.DataFrame, use_optimal_threshold: bool = True -) -> Tuple[np.ndarray, np.ndarray]: - """ - Predict sepsis risk for patient(s). - - Args: - model_data: Dictionary containing model, scaler, and metadata - patient_features: DataFrame with patient features - use_optimal_threshold: Whether to use optimal threshold (default: True) - - Returns: - Tuple of (predictions, probabilities) - """ - model = model_data["model"] - scaler = model_data["scaler"] - metadata = model_data["metadata"] - feature_names = metadata["feature_names"] - - # Ensure features are in correct order - patient_features = patient_features[feature_names] - - # Apply scaling if Logistic Regression - if scaler is not None: - patient_features_scaled = scaler.transform(patient_features) - probabilities = model.predict_proba(patient_features_scaled)[:, 1] - else: - probabilities = model.predict_proba(patient_features)[:, 1] - - # Use optimal threshold if available and requested - if use_optimal_threshold and "optimal_threshold" in metadata["metrics"]: - threshold = metadata["metrics"]["optimal_threshold"] - else: - threshold = 0.5 - - predictions = (probabilities >= threshold).astype(int) - - return predictions, probabilities - - -def create_example_patients() -> pd.DataFrame: - """ - Create example patient data for demonstration. - - Returns: - DataFrame with example patient features - """ - # Example patient data - # Patient 1: Healthy patient (low risk) - # Patient 2: Moderate risk (some abnormal values) - # Patient 3: Low risk (normal values) - # Patient 4: High risk for sepsis (multiple severe abnormalities) - # Patient 5: Critical sepsis risk (severe multi-organ dysfunction) - patients = pd.DataFrame( - { - "heart_rate": [85, 110, 75, 130, 145], # beats/min (normal: 60-100) - "temperature": [ - 37.2, - 38.5, - 36.8, - 39.2, - 35.5, - ], # Celsius (normal: 36.5-37.5, hypothermia <36) - "respiratory_rate": [16, 24, 14, 30, 35], # breaths/min (normal: 12-20) - "wbc": [8.5, 15.2, 7.0, 18.5, 22.0], # x10^9/L (normal: 4-11) - "lactate": [ - 1.2, - 3.5, - 0.9, - 4.8, - 6.5, - ], # mmol/L (normal: <2, severe sepsis: >4) - "creatinine": [0.9, 1.8, 0.8, 2.5, 3.2], # mg/dL (normal: 0.6-1.2) - "age": [45, 68, 35, 72, 78], # years - "gender_encoded": [1, 0, 1, 1, 0], # 1=Male, 0=Female - } - ) - - return patients - - -def interpret_results( - predictions: np.ndarray, probabilities: np.ndarray, patient_features: pd.DataFrame -) -> None: - """ - Interpret and display prediction results. 
- - Args: - predictions: Binary predictions (0=no sepsis, 1=sepsis) - probabilities: Probability scores - patient_features: Original patient features - """ - print("\n" + "=" * 80) - print("SEPSIS PREDICTION RESULTS") - print("=" * 80) - - for i in range(len(predictions)): - print(f"\nPatient {i+1}:") - print(f" Risk Score: {probabilities[i]:.2%}") - print(f" Prediction: {'SEPSIS RISK' if predictions[i] == 1 else 'Low Risk'}") - - # Show key vital signs - print(" Key Features:") - print(f" Heart Rate: {patient_features.iloc[i]['heart_rate']:.1f} bpm") - print(f" Temperature: {patient_features.iloc[i]['temperature']:.1f}°C") - print( - f" Respiratory Rate: {patient_features.iloc[i]['respiratory_rate']:.1f} /min" - ) - print(f" WBC: {patient_features.iloc[i]['wbc']:.1f} x10^9/L") - print(f" Lactate: {patient_features.iloc[i]['lactate']:.1f} mmol/L") - print(f" Creatinine: {patient_features.iloc[i]['creatinine']:.2f} mg/dL") - - # Risk interpretation - if probabilities[i] >= 0.7: - risk_level = "HIGH" - elif probabilities[i] >= 0.4: - risk_level = "MODERATE" - else: - risk_level = "LOW" - - print(f" Clinical Interpretation: {risk_level} RISK") - - print("\n" + "=" * 80) - - -def main(): - """Main inference pipeline.""" - # Model path (relative to script location) - script_dir = Path(__file__).parent - model_path = script_dir / "models" / "sepsis_model.pkl" - - print("=" * 80) - print("Sepsis Prediction Inference") - print("=" * 80 + "\n") - - # Load model - model_data = load_model(model_path) - - # Create example patients - print("\nCreating example patient data...") - patient_features = create_example_patients() - print(f"Number of patients: {len(patient_features)}") - - # Make predictions - print("\nMaking predictions...") - predictions, probabilities = predict_sepsis( - model_data, patient_features, use_optimal_threshold=True - ) - - # Interpret results - interpret_results(predictions, probabilities, patient_features) - - print("\n" + "=" * 80) - print("Inference complete!") - print("=" * 80) - - -if __name__ == "__main__": - main() diff --git a/cookbook/sepsis_prediction_training.py b/cookbook/sepsis_prediction_training.py deleted file mode 100644 index a0ea85ce..00000000 --- a/cookbook/sepsis_prediction_training.py +++ /dev/null @@ -1,1039 +0,0 @@ -#!/usr/bin/env python3 -""" -Sepsis Prediction Training Script - -Trains Random Forest, XGBoost, and Logistic Regression models for sepsis prediction -using MIMIC-IV clinical database data. 
- -Requirements: -- pip install scikit-learn xgboost joblib pandas numpy - -Run: -- python sepsis_prediction_training.py -""" - -import pandas as pd -import numpy as np -from pathlib import Path -from datetime import datetime -from typing import Dict, Tuple, List, Any, Union - -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler -from sklearn.metrics import ( - accuracy_score, - precision_score, - recall_score, - f1_score, - roc_auc_score, - precision_recall_curve, -) -import xgboost as xgb -import joblib - - -# MIMIC-IV ItemID mappings for features -CHARTEVENTS_ITEMIDS = { - "heart_rate": 220050, - "temperature_f": 223761, - "temperature_c": 223762, - "respiratory_rate": 220210, -} - -LABEVENTS_ITEMIDS = { - "wbc": [51300, 51301], # White Blood Cell Count - "lactate": 50813, - "creatinine": 50912, -} - -# Sepsis ICD-10 codes -SEPSIS_ICD10_CODES = [ - "A41.9", # Sepsis, unspecified organism - "A40", # Streptococcal sepsis (starts with) - "A41", # Other sepsis (starts with) - "R65.20", # Severe sepsis without shock - "R65.21", # Severe sepsis with shock - "R65.1", # SIRS (Systemic Inflammatory Response Syndrome) - "A41.0", # Sepsis due to Streptococcus, group A - "A41.1", # Sepsis due to Streptococcus, group B - "A41.2", # Sepsis due to other specified streptococci - "A41.3", # Sepsis due to Haemophilus influenzae - "A41.4", # Sepsis due to anaerobes - "A41.5", # Sepsis due to other Gram-negative organisms - "A41.50", # Sepsis due to unspecified Gram-negative organism - "A41.51", # Sepsis due to Escherichia coli - "A41.52", # Sepsis due to Pseudomonas - "A41.53", # Sepsis due to Serratia - "A41.59", # Sepsis due to other Gram-negative organisms - "A41.8", # Other specified sepsis - "A41.81", # Sepsis due to Enterococcus - "A41.89", # Other specified sepsis -] - -# Sepsis ICD-9 codes (for older data) -SEPSIS_ICD9_CODES = [ - "038", # Septicemia (starts with) - "99591", # Sepsis - "99592", # Severe sepsis - "78552", # Septic shock -] - - -def load_mimic_data(data_dir: str) -> Dict[str, pd.DataFrame]: - """ - Load all required MIMIC-IV CSV tables. - - Args: - data_dir: Path to MIMIC-IV dataset directory - - Returns: - Dictionary mapping table names to DataFrames - """ - data_dir = Path(data_dir) - - print("Loading MIMIC-IV data...") - - tables = { - "patients": pd.read_csv( - data_dir / "hosp" / "patients.csv.gz", compression="gzip", low_memory=False - ), - "admissions": pd.read_csv( - data_dir / "hosp" / "admissions.csv.gz", - compression="gzip", - low_memory=False, - ), - "icustays": pd.read_csv( - data_dir / "icu" / "icustays.csv.gz", compression="gzip", low_memory=False - ), - "chartevents": pd.read_csv( - data_dir / "icu" / "chartevents.csv.gz", - compression="gzip", - low_memory=False, - ), - "labevents": pd.read_csv( - data_dir / "hosp" / "labevents.csv.gz", compression="gzip", low_memory=False - ), - "diagnoses_icd": pd.read_csv( - data_dir / "hosp" / "diagnoses_icd.csv.gz", - compression="gzip", - low_memory=False, - ), - } - - print(f"Loaded {len(tables)} tables") - for name, df in tables.items(): - print(f" {name}: {len(df)} rows") - - return tables - - -def extract_chartevents_features( - chartevents: pd.DataFrame, icustays: pd.DataFrame -) -> pd.DataFrame: - """ - Extract 2-3 vital signs from chartevents table. 
- - Args: - chartevents: Chart events DataFrame - icustays: ICU stays DataFrame - - Returns: - DataFrame with features per stay_id - """ - print("Extracting chartevents features...") - - # Filter to relevant itemids - relevant_itemids = list(CHARTEVENTS_ITEMIDS.values()) - chartevents_filtered = chartevents[ - chartevents["itemid"].isin(relevant_itemids) - ].copy() - - # Merge with icustays to get stay times - chartevents_merged = chartevents_filtered.merge( - icustays[["stay_id", "intime", "outtime"]], on="stay_id", how="inner" - ) - - # Convert charttime to datetime - chartevents_merged["charttime"] = pd.to_datetime(chartevents_merged["charttime"]) - chartevents_merged["intime"] = pd.to_datetime(chartevents_merged["intime"]) - - # Filter to first 24 hours of ICU stay - chartevents_merged = chartevents_merged[ - (chartevents_merged["charttime"] >= chartevents_merged["intime"]) - & ( - chartevents_merged["charttime"] - <= chartevents_merged["intime"] + pd.Timedelta(hours=24) - ) - ] - - # Extract numeric values - chartevents_merged["valuenum"] = pd.to_numeric( - chartevents_merged["valuenum"], errors="coerce" - ) - - # Aggregate by stay_id and itemid (take mean) - features = [] - - for stay_id in icustays["stay_id"].unique(): - stay_data = chartevents_merged[chartevents_merged["stay_id"] == stay_id] - - feature_row = {"stay_id": stay_id} - - # Heart Rate - hr_data = stay_data[stay_data["itemid"] == CHARTEVENTS_ITEMIDS["heart_rate"]][ - "valuenum" - ] - feature_row["heart_rate"] = hr_data.mean() if not hr_data.empty else np.nan - - # Temperature (prefer Celsius, convert Fahrenheit if needed) - temp_c = stay_data[stay_data["itemid"] == CHARTEVENTS_ITEMIDS["temperature_c"]][ - "valuenum" - ] - temp_f = stay_data[stay_data["itemid"] == CHARTEVENTS_ITEMIDS["temperature_f"]][ - "valuenum" - ] - - if not temp_c.empty: - feature_row["temperature"] = temp_c.mean() - elif not temp_f.empty: - # Convert Fahrenheit to Celsius - feature_row["temperature"] = (temp_f.mean() - 32) * 5 / 9 - else: - feature_row["temperature"] = np.nan - - # Respiratory Rate - rr_data = stay_data[ - stay_data["itemid"] == CHARTEVENTS_ITEMIDS["respiratory_rate"] - ]["valuenum"] - feature_row["respiratory_rate"] = ( - rr_data.mean() if not rr_data.empty else np.nan - ) - - features.append(feature_row) - - return pd.DataFrame(features) - - -def extract_labevents_features( - labevents: pd.DataFrame, icustays: pd.DataFrame -) -> pd.DataFrame: - """ - Extract 2-3 lab values from labevents table. 
- - Args: - labevents: Lab events DataFrame - icustays: ICU stays DataFrame - - Returns: - DataFrame with features per stay_id - """ - print("Extracting labevents features...") - - # Get relevant itemids - relevant_itemids = [ - LABEVENTS_ITEMIDS["lactate"], - LABEVENTS_ITEMIDS["creatinine"], - ] + LABEVENTS_ITEMIDS["wbc"] - - labevents_filtered = labevents[labevents["itemid"].isin(relevant_itemids)].copy() - - # Merge with icustays via admissions - # First need to get hadm_id from icustays - icustays_with_hadm = icustays[["stay_id", "hadm_id", "intime"]].copy() - - # Labevents links via hadm_id, then we need to link to stay_id - labevents_merged = labevents_filtered.merge( - icustays_with_hadm, on="hadm_id", how="inner" - ) - - # Convert charttime to datetime - labevents_merged["charttime"] = pd.to_datetime(labevents_merged["charttime"]) - labevents_merged["intime"] = pd.to_datetime(labevents_merged["intime"]) - - # Filter to first 24 hours of ICU stay - labevents_merged = labevents_merged[ - (labevents_merged["charttime"] >= labevents_merged["intime"]) - & ( - labevents_merged["charttime"] - <= labevents_merged["intime"] + pd.Timedelta(hours=24) - ) - ] - - # Extract numeric values - labevents_merged["valuenum"] = pd.to_numeric( - labevents_merged["valuenum"], errors="coerce" - ) - - # Aggregate by stay_id and itemid - features = [] - - for stay_id in icustays["stay_id"].unique(): - stay_data = labevents_merged[labevents_merged["stay_id"] == stay_id] - - feature_row = {"stay_id": stay_id} - - # WBC (check both itemids) - wbc_data = stay_data[stay_data["itemid"].isin(LABEVENTS_ITEMIDS["wbc"])][ - "valuenum" - ] - feature_row["wbc"] = wbc_data.mean() if not wbc_data.empty else np.nan - - # Lactate - lactate_data = stay_data[stay_data["itemid"] == LABEVENTS_ITEMIDS["lactate"]][ - "valuenum" - ] - feature_row["lactate"] = ( - lactate_data.mean() if not lactate_data.empty else np.nan - ) - - # Creatinine - creatinine_data = stay_data[ - stay_data["itemid"] == LABEVENTS_ITEMIDS["creatinine"] - ]["valuenum"] - feature_row["creatinine"] = ( - creatinine_data.mean() if not creatinine_data.empty else np.nan - ) - - features.append(feature_row) - - return pd.DataFrame(features) - - -def extract_demographics( - patients: pd.DataFrame, admissions: pd.DataFrame, icustays: pd.DataFrame -) -> pd.DataFrame: - """ - Extract age and gender from patients table. - - Args: - patients: Patients DataFrame - admissions: Admissions DataFrame (not used, kept for compatibility) - icustays: ICU stays DataFrame - - Returns: - DataFrame with demographics per stay_id - """ - print("Extracting demographics...") - - # icustays already has subject_id, so merge directly with patients - icustays_with_patient = icustays[["stay_id", "subject_id"]].merge( - patients[["subject_id", "gender", "anchor_age"]], on="subject_id", how="left" - ) - - # Use anchor_age if available, otherwise calculate from anchor_year and anchor_age - # For demo data, anchor_age should be available - demographics = icustays_with_patient[["stay_id", "anchor_age", "gender"]].copy() - demographics.rename(columns={"anchor_age": "age"}, inplace=True) - - # Encode gender (M=1, F=0) - demographics["gender_encoded"] = (demographics["gender"] == "M").astype(int) - - return demographics[["stay_id", "age", "gender_encoded"]] - - -def extract_sepsis_labels( - diagnoses_icd: pd.DataFrame, icustays: pd.DataFrame -) -> pd.DataFrame: - """ - Extract sepsis labels from diagnoses_icd table. - Checks both ICD-9 and ICD-10 codes to maximize positive samples. 
- - Args: - diagnoses_icd: Diagnoses ICD DataFrame - icustays: ICU stays DataFrame - - Returns: - DataFrame with sepsis labels per stay_id - """ - print("Extracting sepsis labels...") - - # Check what ICD versions are available - icd_versions = diagnoses_icd["icd_version"].unique() - print(f" Available ICD versions: {sorted(icd_versions)}") - - all_sepsis_diagnoses = [] - - # Check ICD-10 codes - if 10 in icd_versions: - diagnoses_icd10 = diagnoses_icd[diagnoses_icd["icd_version"] == 10].copy() - print(f" ICD-10 diagnoses: {len(diagnoses_icd10)} rows") - - sepsis_mask = pd.Series( - [False] * len(diagnoses_icd10), index=diagnoses_icd10.index - ) - - for code in SEPSIS_ICD10_CODES: - if "." not in code or code.endswith("."): - # Pattern match (e.g., "A40" matches "A40.x") - code_prefix = code.rstrip(".") - mask = diagnoses_icd10["icd_code"].str.startswith(code_prefix, na=False) - sepsis_mask |= mask - if mask.sum() > 0: - print( - f" Found {mask.sum()} ICD-10 diagnoses matching pattern '{code}'" - ) - else: - # Exact match - mask = diagnoses_icd10["icd_code"] == code - sepsis_mask |= mask - if mask.sum() > 0: - print( - f" Found {mask.sum()} ICD-10 diagnoses with exact code '{code}'" - ) - - sepsis_icd10 = diagnoses_icd10[sepsis_mask].copy() - if len(sepsis_icd10) > 0: - all_sepsis_diagnoses.append(sepsis_icd10) - print(f" Total ICD-10 sepsis diagnoses: {len(sepsis_icd10)}") - - # Check ICD-9 codes - if 9 in icd_versions: - diagnoses_icd9 = diagnoses_icd[diagnoses_icd["icd_version"] == 9].copy() - print(f" ICD-9 diagnoses: {len(diagnoses_icd9)} rows") - - sepsis_mask = pd.Series( - [False] * len(diagnoses_icd9), index=diagnoses_icd9.index - ) - - for code in SEPSIS_ICD9_CODES: - if len(code) <= 3 or code.endswith("."): - # Pattern match (e.g., "038" matches "038.x") - code_prefix = code.rstrip(".") - mask = diagnoses_icd9["icd_code"].str.startswith(code_prefix, na=False) - sepsis_mask |= mask - if mask.sum() > 0: - print( - f" Found {mask.sum()} ICD-9 diagnoses matching pattern '{code}'" - ) - else: - # Exact match - mask = diagnoses_icd9["icd_code"] == code - sepsis_mask |= mask - if mask.sum() > 0: - print( - f" Found {mask.sum()} ICD-9 diagnoses with exact code '{code}'" - ) - - sepsis_icd9 = diagnoses_icd9[sepsis_mask].copy() - if len(sepsis_icd9) > 0: - all_sepsis_diagnoses.append(sepsis_icd9) - print(f" Total ICD-9 sepsis diagnoses: {len(sepsis_icd9)}") - - # Combine all sepsis diagnoses - if all_sepsis_diagnoses: - sepsis_diagnoses = pd.concat(all_sepsis_diagnoses, ignore_index=True) - print(f" Total sepsis diagnoses (ICD-9 + ICD-10): {len(sepsis_diagnoses)}") - - if len(sepsis_diagnoses) > 0: - print( - f" Sample sepsis ICD codes: {sepsis_diagnoses['icd_code'].unique()[:15].tolist()}" - ) - print( - f" Unique hadm_id with sepsis: {sepsis_diagnoses['hadm_id'].nunique()}" - ) - else: - sepsis_diagnoses = pd.DataFrame(columns=diagnoses_icd.columns) - print(" No sepsis diagnoses found") - - # Merge with icustays to get stay_id - icustays_with_hadm = icustays[["stay_id", "hadm_id"]].copy() - - if len(sepsis_diagnoses) > 0: - sepsis_labels = icustays_with_hadm.merge( - sepsis_diagnoses[["hadm_id"]].drop_duplicates(), - on="hadm_id", - how="left", - indicator=True, - ) - else: - sepsis_labels = icustays_with_hadm.copy() - sepsis_labels["_merge"] = "left_only" - - # Create binary label (1 if sepsis, 0 otherwise) - sepsis_labels["sepsis"] = (sepsis_labels["_merge"] == "both").astype(int) - - sepsis_count = sepsis_labels["sepsis"].sum() - print( - f" ICU stays with sepsis: 
{sepsis_count}/{len(sepsis_labels)} ({sepsis_count/len(sepsis_labels)*100:.2f}%)" - ) - - return sepsis_labels[["stay_id", "sepsis"]] - - -def print_feature_summary(X: pd.DataFrame): - """Print feature statistics with FHIR mapping information. - - Args: - X: Feature matrix with actual data - """ - print("\n" + "=" * 120) - print("FEATURE SUMMARY: MIMIC-IV → Model → FHIR Mapping") - print("=" * 120) - - # Define FHIR mappings for each feature - fhir_mappings = { - "heart_rate": { - "mimic_table": "chartevents", - "mimic_itemid": "220050", - "fhir_resource": "Observation", - "fhir_code": "8867-4", - "fhir_system": "LOINC", - "fhir_display": "Heart rate", - }, - "temperature": { - "mimic_table": "chartevents", - "mimic_itemid": "223762/223761", - "fhir_resource": "Observation", - "fhir_code": "8310-5", - "fhir_system": "LOINC", - "fhir_display": "Body temperature", - }, - "respiratory_rate": { - "mimic_table": "chartevents", - "mimic_itemid": "220210", - "fhir_resource": "Observation", - "fhir_code": "9279-1", - "fhir_system": "LOINC", - "fhir_display": "Respiratory rate", - }, - "wbc": { - "mimic_table": "labevents", - "mimic_itemid": "51300/51301", - "fhir_resource": "Observation", - "fhir_code": "6690-2", - "fhir_system": "LOINC", - "fhir_display": "Leukocytes [#/volume] in Blood", - }, - "lactate": { - "mimic_table": "labevents", - "mimic_itemid": "50813", - "fhir_resource": "Observation", - "fhir_code": "2524-7", - "fhir_system": "LOINC", - "fhir_display": "Lactate [Moles/volume] in Blood", - }, - "creatinine": { - "mimic_table": "labevents", - "mimic_itemid": "50912", - "fhir_resource": "Observation", - "fhir_code": "2160-0", - "fhir_system": "LOINC", - "fhir_display": "Creatinine [Mass/volume] in Serum or Plasma", - }, - "age": { - "mimic_table": "patients", - "mimic_itemid": "anchor_age", - "fhir_resource": "Patient", - "fhir_code": "birthDate", - "fhir_system": "FHIR Core", - "fhir_display": "Patient birth date (calculate age)", - }, - "gender_encoded": { - "mimic_table": "patients", - "mimic_itemid": "gender", - "fhir_resource": "Patient", - "fhir_code": "gender", - "fhir_system": "FHIR Core", - "fhir_display": "Administrative Gender (M/F)", - }, - } - - print( - f"\n{'Feature':<20} {'Mean±SD':<20} {'MIMIC Source':<20} {'FHIR Resource':<20} {'FHIR Code (System)':<30}" - ) - print("-" * 120) - - for feature in X.columns: - mapping = fhir_mappings.get(feature, {}) - - # Calculate statistics - mean_val = X[feature].mean() - std_val = X[feature].std() - - # Format based on feature type - if feature == "gender_encoded": - stats = f"{mean_val:.2f} (M={X[feature].sum():.0f})" - else: - stats = f"{mean_val:.2f}±{std_val:.2f}" - - mimic_source = f"{mapping.get('mimic_table', 'N/A')} ({mapping.get('mimic_itemid', 'N/A')})" - fhir_resource = mapping.get("fhir_resource", "N/A") - fhir_code = ( - f"{mapping.get('fhir_code', 'N/A')} ({mapping.get('fhir_system', 'N/A')})" - ) - - print( - f"{feature:<20} {stats:<20} {mimic_source:<20} {fhir_resource:<20} {fhir_code:<30}" - ) - - print("\n" + "=" * 120) - print( - "Note: Statistics calculated from first 24 hours of ICU stay. Missing values imputed with median." - ) - print("=" * 120 + "\n") - - -def create_feature_matrix( - chartevents_features: pd.DataFrame, - labevents_features: pd.DataFrame, - demographics: pd.DataFrame, - sepsis_labels: pd.DataFrame, -) -> Tuple[pd.DataFrame, pd.Series]: - """ - Create feature matrix and labels from extracted features. 
- - Args: - chartevents_features: Chart events features - labevents_features: Lab events features - demographics: Demographics features - sepsis_labels: Sepsis labels - - Returns: - Tuple of (feature matrix, labels) - """ - print("Creating feature matrix...") - - # Merge all features on stay_id - features = ( - chartevents_features.merge(labevents_features, on="stay_id", how="outer") - .merge(demographics, on="stay_id", how="outer") - .merge(sepsis_labels, on="stay_id", how="inner") - ) - - # Select feature columns (exclude stay_id and sepsis) - feature_cols = [ - "heart_rate", - "temperature", - "respiratory_rate", - "wbc", - "lactate", - "creatinine", - "age", - "gender_encoded", - ] - - X = features[feature_cols].copy() - y = features["sepsis"].copy() - - print(f"Feature matrix shape: {X.shape}") - print(f"Sepsis cases: {y.sum()} ({y.sum() / len(y) * 100:.2f}%)") - - return X, y - - -def train_models(X_train: pd.DataFrame, y_train: pd.Series) -> Dict[str, Any]: - """ - Train all three models (Random Forest, XGBoost, Logistic Regression). - - Args: - X_train: Training features - y_train: Training labels - - Returns: - Dictionary of trained models - """ - print("\nTraining models...") - - models = {} - - # Check if we have any positive samples - positive_samples = y_train.sum() - total_samples = len(y_train) - positive_rate = positive_samples / total_samples if total_samples > 0 else 0.0 - - print( - f" Positive samples: {positive_samples}/{total_samples} ({positive_rate*100:.2f}%)" - ) - - # Random Forest - use class_weight to handle imbalance - print(" Training Random Forest...") - rf = RandomForestClassifier( - n_estimators=100, - random_state=42, - n_jobs=-1, - class_weight="balanced", # Automatically adjust for class imbalance - ) - rf.fit(X_train, y_train) - models["RandomForest"] = rf - - # XGBoost - handle case with no positive samples - print(" Training XGBoost...") - if positive_samples == 0: - # When there are no positive samples, set base_score to a small value - # and use scale_pos_weight to avoid errors - xgb_model = xgb.XGBClassifier( - random_state=42, - n_jobs=-1, - eval_metric="logloss", - base_score=0.01, # Small positive value instead of 0 - scale_pos_weight=1.0, - ) - else: - # Calculate scale_pos_weight for imbalanced data - scale_pos_weight = (total_samples - positive_samples) / positive_samples - xgb_model = xgb.XGBClassifier( - random_state=42, - n_jobs=-1, - eval_metric="logloss", - scale_pos_weight=scale_pos_weight, - ) - xgb_model.fit(X_train, y_train) - models["XGBoost"] = xgb_model - - # Logistic Regression (with scaling) - use class_weight to handle imbalance - print(" Training Logistic Regression...") - scaler = StandardScaler() - X_train_scaled = scaler.fit_transform(X_train) - lr = LogisticRegression( - random_state=42, - max_iter=1000, - class_weight="balanced", # Automatically adjust for class imbalance - ) - lr.fit(X_train_scaled, y_train) - models["LogisticRegression"] = lr - models["scaler"] = scaler # Store scaler for later use - - return models - - -def evaluate_models( - models: Dict[str, Any], - X_test: pd.DataFrame, - y_test: pd.Series, - feature_names: List[str], -) -> Dict[str, Dict[str, float]]: - """ - Evaluate and compare all models. 
- - Args: - models: Dictionary of trained models - X_test: Test features - y_test: Test labels - feature_names: List of feature names - - Returns: - Dictionary of evaluation metrics for each model - """ - print("\nEvaluating models...") - print( - f"Test set: {len(y_test)} samples, {y_test.sum()} positive ({y_test.sum()/len(y_test)*100:.2f}%)" - ) - - results = {} - - for name, model in models.items(): - if name == "scaler": - continue - - # Get probability predictions - if name == "LogisticRegression": - X_test_scaled = models["scaler"].transform(X_test) - y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] - else: - y_pred_proba = model.predict_proba(X_test)[:, 1] - - # Use default threshold (0.5) for predictions - y_pred = (y_pred_proba >= 0.5).astype(int) - - # Calculate metrics with default threshold - metrics = { - "accuracy": accuracy_score(y_test, y_pred), - "precision": precision_score(y_test, y_pred, zero_division=0), - "recall": recall_score(y_test, y_pred, zero_division=0), - "f1": f1_score(y_test, y_pred, zero_division=0), - "auc": roc_auc_score(y_test, y_pred_proba) - if len(np.unique(y_test)) > 1 - else 0.0, - } - - # Try to find optimal threshold for F1 score - if len(np.unique(y_test)) > 1 and y_test.sum() > 0: - precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba) - f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10) - optimal_idx = np.argmax(f1_scores) - optimal_threshold = ( - thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5 - ) - optimal_f1 = f1_scores[optimal_idx] - - # Predictions with optimal threshold - y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int) - metrics["optimal_threshold"] = optimal_threshold - metrics["optimal_f1"] = optimal_f1 - metrics["optimal_precision"] = precision_score( - y_test, y_pred_optimal, zero_division=0 - ) - metrics["optimal_recall"] = recall_score( - y_test, y_pred_optimal, zero_division=0 - ) - else: - metrics["optimal_threshold"] = 0.5 - metrics["optimal_f1"] = 0.0 - metrics["optimal_precision"] = 0.0 - metrics["optimal_recall"] = 0.0 - - results[name] = metrics - - print(f"\n{name}:") - print( - f" Predictions: {y_pred.sum()} positive predicted (actual: {y_test.sum()})" - ) - print(f" Accuracy: {metrics['accuracy']:.4f}") - print(f" Precision: {metrics['precision']:.4f}") - print(f" Recall: {metrics['recall']:.4f}") - print(f" F1-score: {metrics['f1']:.4f}") - print(f" AUC-ROC: {metrics['auc']:.4f}") - if metrics["optimal_f1"] > 0: - print(f" Optimal threshold: {metrics['optimal_threshold']:.4f}") - print(f" Optimal F1-score: {metrics['optimal_f1']:.4f}") - print(f" Optimal Precision: {metrics['optimal_precision']:.4f}") - print(f" Optimal Recall: {metrics['optimal_recall']:.4f}") - - # Show feature importance for tree-based models - if hasattr(model, "feature_importances_"): - print("\n Top 5 Feature Importances:") - importances = model.feature_importances_ - indices = np.argsort(importances)[::-1][:5] - for idx in indices: - print(f" {feature_names[idx]}: {importances[idx]:.4f}") - - return results - - -def select_best_model( - models: Dict[str, Any], - results: Dict[str, Dict[str, float]], - metric: str = "f1", -) -> Tuple[str, Any, Dict[str, float]]: - """ - Select best model based on specified metric. 
- - Args: - models: Dictionary of trained models - results: Evaluation results - metric: Metric to optimize ("f1", "recall", "precision", "auc") - - Returns: - Tuple of (best model name, best model, best metrics) - """ - print(f"\nSelecting best model based on {metric}...") - - # Get the appropriate metric value (prefer optimal if available) - def get_metric_value(metrics, metric_name): - if metric_name == "f1": - return metrics.get("optimal_f1", metrics["f1"]) - elif metric_name == "recall": - return metrics.get("optimal_recall", metrics["recall"]) - elif metric_name == "precision": - return metrics.get("optimal_precision", metrics["precision"]) - elif metric_name == "auc": - return metrics.get("auc", 0.0) - else: - return metrics.get("optimal_f1", metrics["f1"]) - - best_name = max(results.keys(), key=lambda k: get_metric_value(results[k], metric)) - best_model = models[best_name] - best_metrics = results[best_name] - - best_value = get_metric_value(best_metrics, metric) - print(f"Best model: {best_name} ({metric}: {best_value:.4f})") - - return best_name, best_model, best_metrics - - -def save_model( - model: Any, - model_name: str, - feature_names: List[str], - metrics: Dict[str, float], - scaler: Any, - output_path: Union[str, Path], -) -> None: - """ - Save the best model with metadata. - - Args: - model: Trained model - model_name: Name of the model - feature_names: List of feature names - metrics: Evaluation metrics - scaler: StandardScaler (if Logistic Regression, None otherwise) - output_path: Path to save model - """ - print(f"\nSaving model to {output_path}...") - - # Create output directory if it doesn't exist - output_path = Path(output_path) - output_path.parent.mkdir(parents=True, exist_ok=True) - - # Prepare metadata - metadata = { - "model_name": model_name, - "training_date": datetime.now().isoformat(), - "feature_names": feature_names, - "metrics": metrics, - "itemid_mappings": { - "chartevents": CHARTEVENTS_ITEMIDS, - "labevents": LABEVENTS_ITEMIDS, - }, - "sepsis_icd_codes": { - "icd10": SEPSIS_ICD10_CODES, - "icd9": SEPSIS_ICD9_CODES, - }, - } - - # Save model and metadata - model_data = { - "model": model, - "scaler": scaler, - "metadata": metadata, - } - - joblib.dump(model_data, output_path) - - print("Model saved successfully!") - - -def main(): - """Main training pipeline.""" - # Data directory - data_dir = "../datasets/mimic-iv-clinical-database-demo-2.2" - - # Output path (relative to script location) - script_dir = Path(__file__).parent - output_path = script_dir / "models" / "sepsis_model.pkl" - - print("=" * 60) - print("Sepsis Prediction Model Training") - print("=" * 60) - - # Load data - tables = load_mimic_data(data_dir) - - # Extract features - chartevents_features = extract_chartevents_features( - tables["chartevents"], tables["icustays"] - ) - labevents_features = extract_labevents_features( - tables["labevents"], tables["icustays"] - ) - demographics = extract_demographics( - tables["patients"], tables["admissions"], tables["icustays"] - ) - - # Extract labels - sepsis_labels = extract_sepsis_labels(tables["diagnoses_icd"], tables["icustays"]) - - # Create feature matrix - X, y = create_feature_matrix( - chartevents_features, - labevents_features, - demographics, - sepsis_labels, - ) - - # Handle missing values (impute with median) - print("\nHandling missing values...") - missing_before = X.isnull().sum().sum() - print(f" Missing values before imputation: {missing_before}") - X = X.fillna(X.median()) - - # Print feature summary with actual data 
statistics - print_feature_summary(X) - - # Split data with careful stratification to ensure positive samples in both sets - print("\nSplitting data...") - if len(np.unique(y)) > 1 and y.sum() > 0: - # Use stratification to ensure positive samples in both train and test - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42, stratify=y - ) - print( - f" Training set: {len(X_train)} samples ({y_train.sum()} positive, {y_train.sum()/len(y_train)*100:.2f}%)" - ) - print( - f" Test set: {len(X_test)} samples ({y_test.sum()} positive, {y_test.sum()/len(y_test)*100:.2f}%)" - ) - - # Warn if test set has no positive samples (shouldn't happen with stratify, but check anyway) - if y_test.sum() == 0: - print( - " WARNING: Test set has no positive samples! Consider using a different random seed." - ) - else: - print( - " Warning: No positive samples or only one class. Skipping stratification." - ) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 - ) - print(f" Training set: {len(X_train)} samples") - print(f" Test set: {len(X_test)} samples") - - # Apply oversampling to training data to balance classes - print("\nApplying oversampling to training data...") - try: - from imblearn.over_sampling import SMOTE - - # Only apply SMOTE if we have positive samples - if y_train.sum() > 0 and len(np.unique(y_train)) > 1: - print( - f" Before oversampling: {len(X_train)} samples ({y_train.sum()} positive, {y_train.sum()/len(y_train)*100:.2f}%)" - ) - # Ensure k_neighbors doesn't exceed available positive samples - k_neighbors = min(5, max(1, y_train.sum() - 1)) - smote = SMOTE(random_state=42, k_neighbors=k_neighbors) - X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train) - print( - f" After oversampling: {len(X_train_resampled)} samples ({y_train_resampled.sum()} positive, {y_train_resampled.sum()/len(X_train_resampled)*100:.2f}%)" - ) - X_train = pd.DataFrame( - X_train_resampled, - columns=X_train.columns, - index=X_train.index[: len(X_train_resampled)], - ) - y_train = pd.Series( - y_train_resampled, index=y_train.index[: len(y_train_resampled)] - ) - else: - print(" Skipping oversampling: insufficient positive samples") - except (ImportError, ModuleNotFoundError) as e: - print( - " imbalanced-learn not installed. 
Install with: pip install imbalanced-learn" - ) - print(f" Error: {e}") - print(" Proceeding without oversampling...") - - # Train models - models = train_models(X_train, y_train) - - # Evaluate models - feature_names = X.columns.tolist() - results = evaluate_models(models, X_test, y_test, feature_names) - - # Select best model (can change metric: "f1", "recall", "precision", "auc") - # For sepsis prediction, recall (sensitivity) is often most important - best_name, best_model, best_metrics = select_best_model( - models, results, metric="f1" - ) - - # Save best model - scaler = models.get("scaler") - save_model( - best_model, - best_name, - feature_names, - best_metrics, - scaler, - output_path, - ) - - print("\n" + "=" * 60) - print("Training complete!") - print("=" * 60) - - -if __name__ == "__main__": - main() diff --git a/notebooks/fhir_ml_workflow.ipynb b/notebooks/fhir_ml_workflow.ipynb new file mode 100644 index 00000000..5a73c1dd --- /dev/null +++ b/notebooks/fhir_ml_workflow.ipynb @@ -0,0 +1,567 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from healthchain.sandbox.loaders import MimicOnFHIRLoader\n", + "from healthchain.io import Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "MIMIC_DIR = \"../../datasets/mimic-iv-clinical-database-demo-on-fhir-2.1.0/\"\n", + "RESOURCES_TO_LOAD = [\"MimicObservationChartevents\", \"MimicObservationLabevents\", \"MimicPatient\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO: 2025-11-27 18:55:49,156 [healthchain.sandbox.loaders.mimic]: Loaded 668862 resources from MimicObservationChartevents.ndjson.gz\n", + "INFO: 2025-11-27 18:55:54,360 [healthchain.sandbox.loaders.mimic]: Loaded 107727 resources from MimicObservationLabevents.ndjson.gz\n", + "INFO: 2025-11-27 18:55:54,366 [healthchain.sandbox.loaders.mimic]: Loaded 100 resources from MimicPatient.ndjson.gz\n" + ] + } + ], + "source": [ + "# Load MIMIC data as single bundle dict (fast, no validation)\n", + "loader = MimicOnFHIRLoader()\n", + "bundle = loader.load(\n", + " data_dir=MIMIC_DIR,\n", + " resource_types=RESOURCES_TO_LOAD,\n", + " as_dict=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "776689" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(bundle[\"entry\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert to DataFrame using schema\n", + "tabular = Dataset.from_fhir_bundle(\n", + " bundle, \n", + " schema=\"../healthchain/configs/features/sepsis_vitals.yaml\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'patient_ref': 'object',\n", + " 'heart_rate': 'float64',\n", + " 'temperature': 'float64',\n", + " 'respiratory_rate': 'float64',\n", + " 'wbc': 'float64',\n", + " 'lactate': 'float64',\n", + " 'creatinine': 'float64',\n", + " 'age': 'int64',\n", + " 'gender_encoded': 'int64'}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tabular.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + 
{ + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(tabular)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patient_refheart_ratetemperaturerespiratory_ratewbclactatecreatinineagegender_encoded
0Patient/0a8eebfd-a352-522e-89f0-1d4a13abdebc96.50000098.96666720.7000005.81538531.00.466667520
1Patient/0c2243d2-987b-5cbd-8eb1-170a8064769385.37931098.13529412.1034487.73750066.50.594937571
2Patient/13df78e7-150e-5eb7-be5f-5f62b2baee87104.30769298.37500015.9615388.337500NaN0.966667661
3Patient/158f3a39-e3d7-5e7a-93aa-57af894aadd999.73913098.39000018.35869611.509091NaN0.628571400
4Patient/1ab119a5-aac8-5002-9d2f-b8ff6962338793.40298598.82424221.75373115.578571NaN0.646667341
5Patient/1bb918ba-e04e-5e7a-87ca-dbcbbb4c72c378.18181898.72500016.96153822.906250NaN0.756667380
6Patient/1cf9e585-806c-513b-80af-4ca565a28231104.66666798.81000019.38888912.520952NaN3.847321531
7Patient/22a3e422-663a-561c-b305-a0c04bf4223569.05000098.80000016.90000010.754545NaN1.353846871
8Patient/23069939-0c4c-517b-a3ec-baae0d4e398878.69444498.32500016.0555569.383333NaN0.860000521
9Patient/23f959c1-6ac2-562b-9cbe-c111f338e27b87.18452498.82709117.25149714.954054NaN0.642105511
\n", + "
" + ], + "text/plain": [ + " patient_ref heart_rate temperature \\\n", + "0 Patient/0a8eebfd-a352-522e-89f0-1d4a13abdebc 96.500000 98.966667 \n", + "1 Patient/0c2243d2-987b-5cbd-8eb1-170a80647693 85.379310 98.135294 \n", + "2 Patient/13df78e7-150e-5eb7-be5f-5f62b2baee87 104.307692 98.375000 \n", + "3 Patient/158f3a39-e3d7-5e7a-93aa-57af894aadd9 99.739130 98.390000 \n", + "4 Patient/1ab119a5-aac8-5002-9d2f-b8ff69623387 93.402985 98.824242 \n", + "5 Patient/1bb918ba-e04e-5e7a-87ca-dbcbbb4c72c3 78.181818 98.725000 \n", + "6 Patient/1cf9e585-806c-513b-80af-4ca565a28231 104.666667 98.810000 \n", + "7 Patient/22a3e422-663a-561c-b305-a0c04bf42235 69.050000 98.800000 \n", + "8 Patient/23069939-0c4c-517b-a3ec-baae0d4e3988 78.694444 98.325000 \n", + "9 Patient/23f959c1-6ac2-562b-9cbe-c111f338e27b 87.184524 98.827091 \n", + "\n", + " respiratory_rate wbc lactate creatinine age gender_encoded \n", + "0 20.700000 5.815385 31.0 0.466667 52 0 \n", + "1 12.103448 7.737500 66.5 0.594937 57 1 \n", + "2 15.961538 8.337500 NaN 0.966667 66 1 \n", + "3 18.358696 11.509091 NaN 0.628571 40 0 \n", + "4 21.753731 15.578571 NaN 0.646667 34 1 \n", + "5 16.961538 22.906250 NaN 0.756667 38 0 \n", + "6 19.388889 12.520952 NaN 3.847321 53 1 \n", + "7 16.900000 10.754545 NaN 1.353846 87 1 \n", + "8 16.055556 9.383333 NaN 0.860000 52 1 \n", + "9 17.251497 14.954054 NaN 0.642105 51 1 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = tabular.data\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "MODEL_PATH = '../cookbook/models/sepsis_model.pkl'\n", + "model_data = joblib.load(MODEL_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'model': XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric='logloss',\n", + " feature_types=None, feature_weights=None, gamma=None,\n", + " grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None, n_jobs=-1,\n", + " num_parallel_tree=None, ...),\n", + " 'scaler': StandardScaler(),\n", + " 'metadata': {'model_name': 'XGBoost',\n", + " 'training_date': '2025-11-22T13:52:14.144052',\n", + " 'feature_names': ['heart_rate',\n", + " 'temperature',\n", + " 'respiratory_rate',\n", + " 'wbc',\n", + " 'lactate',\n", + " 'creatinine',\n", + " 'age',\n", + " 'gender_encoded'],\n", + " 'metrics': {'accuracy': 0.8214285714285714,\n", + " 'precision': 0.5,\n", + " 'recall': 0.2,\n", + " 'f1': 0.2857142857142857,\n", + " 'auc': 0.7391304347826086,\n", + " 'optimal_threshold': 0.19611828,\n", + " 'optimal_f1': 0.5454545454049586,\n", + " 'optimal_precision': 0.5,\n", + " 'optimal_recall': 0.6},\n", + " 
'itemid_mappings': {'chartevents': {'heart_rate': 220050,\n", + " 'temperature_f': 223761,\n", + " 'temperature_c': 223762,\n", + " 'respiratory_rate': 220210},\n", + " 'labevents': {'wbc': [51300, 51301],\n", + " 'lactate': 50813,\n", + " 'creatinine': 50912}},\n", + " 'sepsis_icd_codes': {'icd10': ['A41.9',\n", + " 'A40',\n", + " 'A41',\n", + " 'R65.20',\n", + " 'R65.21',\n", + " 'R65.1',\n", + " 'A41.0',\n", + " 'A41.1',\n", + " 'A41.2',\n", + " 'A41.3',\n", + " 'A41.4',\n", + " 'A41.5',\n", + " 'A41.50',\n", + " 'A41.51',\n", + " 'A41.52',\n", + " 'A41.53',\n", + " 'A41.59',\n", + " 'A41.8',\n", + " 'A41.81',\n", + " 'A41.89'],\n", + " 'icd9': ['038', '99591', '99592', '78552']}}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "model = model_data[\"model\"]\n", + "patient_features = df[model_data[\"metadata\"][\"feature_names\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# ML inference\n", + "probabilities = model.predict_proba(patient_features)[:, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = model_data[\"metadata\"][\"metrics\"][\"optimal_threshold\"]\n", + "predictions = (probabilities >= threshold).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert back to FHIR\n", + "risk_assessments = tabular.to_risk_assessment(\n", + " outcome_code=\"A41.9\",\n", + " outcome_display=\"Sepsis\",\n", + " model_name=\"XGBoost\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(risk_assessments)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'resourceType': 'RiskAssessment',\n", + " 'id': 'hc-71012a5d-cf7f-436d-864b-327efe28b483',\n", + " 'status': 'final',\n", + " 'method': {'coding': [{'system': 'https://healthchain.github.io/ml-models',\n", + " 'code': 'XGBoost',\n", + " 'display': 'XGBoost'}]},\n", + " 'subject': {'reference': 'Patient/1bb918ba-e04e-5e7a-87ca-dbcbbb4c72c3'},\n", + " 'occurrenceDateTime': datetime.datetime(2025, 11, 27, 18, 56, 51, tzinfo=TzInfo(UTC)),\n", + " 'prediction': [{'outcome': {'coding': [{'system': 'http://hl7.org/fhir/sid/icd-10',\n", + " 'code': 'A41.9',\n", + " 'display': 'Sepsis'}]},\n", + " 'probabilityDecimal': 0.07619287073612213,\n", + " 'qualitativeRisk': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/risk-probability',\n", + " 'code': 'low',\n", + " 'display': 'Low'}]}}],\n", + " 'note': [{'text': 'ML prediction: Negative (probability: 7.62%, risk: low)'}]}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "risk_assessments[5].model_dump()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From d46d1678af924b2a8e78b313bbde640785a29776 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 12:33:15 +0000 Subject: [PATCH 11/12] gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8709beb5..f5547d53 100644 --- a/.gitignore +++ b/.gitignore @@ -167,7 +167,8 @@ scrap/ .ruff_cache/ .python-version .cursor/ -.private/ +.local/ +.keys/ .idea/ # Personal AI context (keep local) From ecbcff55b1a6eee8930709bd2237341e6f311d28 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Tue, 2 Dec 2025 20:22:43 +0000 Subject: [PATCH 12/12] Update cookbook docs --- docs/cookbook/index.md | 3 + docs/cookbook/ml_model_deployment.md | 539 ++++++++++++++++++++++++-- docs/index.md | 2 +- mkdocs.yml | 1 + scripts/sepsis_prediction_training.py | 7 +- 5 files changed, 506 insertions(+), 46 deletions(-) diff --git a/docs/cookbook/index.md b/docs/cookbook/index.md index e64b6d3c..165624b6 100644 --- a/docs/cookbook/index.md +++ b/docs/cookbook/index.md @@ -13,6 +13,9 @@ Dive into real-world, production-ready examples to learn how to build interopera ## 📚 How-To Guides +- 🔬 **[Deploy ML Models: Real-Time Alerts & Batch Screening](./ml_model_deployment.md)** + *Deploy the same ML model two ways: CDS Hooks for point-of-care sepsis alerts, and FHIR Gateway for population-level batch screening with RiskAssessment resources.* + - 🚦 **[Multi-Source Patient Data Aggregation](./multi_ehr_aggregation.md)** *Merge patient data from multiple FHIR sources (Epic, Cerner, etc.), deduplicate conditions, prove provenance, and robustly handle cross-vendor errors. Foundation for retrieval-augmented generation (RAG) and analytics workflows.* diff --git a/docs/cookbook/ml_model_deployment.md b/docs/cookbook/ml_model_deployment.md index 74fa87c1..9c878a15 100644 --- a/docs/cookbook/ml_model_deployment.md +++ b/docs/cookbook/ml_model_deployment.md @@ -1,64 +1,517 @@ -# Deploy ML Models as Healthcare APIs +# Deploy ML Models: Real-Time Alerts & Batch Screening -*This example is coming soon! 🚧* +You trained a model on CSVs. Now you need to deploy it against FHIR data from EHRs. This tutorial shows how to bridge that gap with two production patterns: **real-time CDS Hooks alerts** and **batch FHIR Gateway screening**—both using the same model and a simple YAML schema that maps FHIR resources to your training features. -
- ML Model Deployment Architecture -
+Check out the full working examples: -## Overview +- [Real-time CDS Hooks](https://github.com/dotimplement/HealthChain/tree/main/cookbook/sepsis_cds_hooks.py) +- [Batch FHIR Gateway](https://github.com/dotimplement/HealthChain/tree/main/cookbook/sepsis_fhir_batch.py) -This tutorial will demonstrate how to deploy any trained ML model as a production-ready healthcare API with FHIR input/output, multi-EHR connectivity, and comprehensive monitoring. +![](../assets/images/hc-use-cases-ml-deployment.png) -## What You'll Learn +## When to Use Each Pattern -- **Model serving architecture** - Deploy Hugging Face, scikit-learn, PyTorch, and custom models -- **FHIR-native endpoints** - Serve predictions with structured healthcare data formats -- **Multi-EHR integration** - Connect your model to live FHIR servers for real-time inference -- **Healthcare data validation** - Ensure type-safe input/output with Pydantic models -- **Production monitoring** - Track model performance, data drift, and API health -- **Scalable deployment** - Configure auto-scaling and load balancing for healthcare workloads +| Pattern | Trigger | Output | Best For | +|---------|---------|--------|----------| +| **CDS Hooks** | Clinician opens chart | Alert cards in EHR UI | Point-of-care decision support | +| **FHIR Gateway** | Scheduled job / API call | [RiskAssessment](https://www.hl7.org/fhir/riskassessment.html) resources | Population screening, quality measures | -## Architecture +Both patterns share the same trained model and feature extraction—only the integration layer differs. -The example will showcase: +## Setup -1. **Model Packaging** - Wrap any ML model with HealthChain's deployment framework -2. **FHIR Endpoint Creation** - Automatically generate OpenAPI-compliant healthcare APIs -3. **Real-time Inference** - Process FHIR resources and return structured predictions -4. **Multi-source Integration** - Connect to Epic, Cerner, and other FHIR systems -5. **Performance Monitoring** - Track latency, throughput, and prediction quality -6. **Security & Compliance** - Implement OAuth2, audit logging, and data governance +### Install Dependencies -## Use Cases +```bash +pip install healthchain joblib xgboost scikit-learn python-dotenv +``` -Perfect for: -- **Clinical Decision Support** - Deploy diagnostic or prognostic models in EHR workflows -- **Population Health** - Serve risk stratification models for large patient cohorts -- **Research Platforms** - Make trained models available to clinical researchers -- **AI-powered Applications** - Build healthcare apps with ML-driven features +### Train the Model (or Bring Your Own) -## Example Models +The cookbook includes a training script that builds an XGBoost classifier from MIMIC-IV data. 
From the project root: -We'll show deployment patterns for: -- **Clinical NLP models** - Named entity recognition, clinical coding, text classification -- **Diagnostic models** - Medical imaging analysis, lab result interpretation -- **Risk prediction models** - Readmission risk, mortality prediction, drug interactions -- **Recommendation systems** - Treatment recommendations, medication optimization +```bash +cd scripts +python sepsis_prediction_training.py +``` -## Prerequisites +This script: -- A trained ML model (any framework supported) -- Understanding of FHIR resources and healthcare data standards -- Python environment with HealthChain installed -- Basic knowledge of API deployment concepts +- Loads MIMIC-IV CSV tables (chartevents, labevents, patients, diagnoses) +- Extracts vitals features (heart rate, temperature, respiratory rate, WBC, lactate, creatinine, age, gender) +- Labels ICU stays with sepsis diagnoses (ICD-9/ICD-10) +- Trains Random Forest, XGBoost, and Logistic Regression models +- Saves the best model (by F1 score) to `scripts/models/sepsis_model.pkl` -## Coming Soon +After training, copy the model to the cookbook directory: -We're building comprehensive examples covering multiple model types and deployment scenarios! +```bash +cp scripts/models/sepsis_model.pkl cookbook/models/ +``` -In the meantime, explore our [Gateway documentation](../reference/gateway/gateway.md) to understand the deployment infrastructure. +!!! note "MIMIC-IV Demo Dataset" + + The training script uses the [MIMIC-IV Clinical Database Demo](https://physionet.org/content/mimic-iv-demo/2.2/) (~50MB, freely downloadable). Set the path: + + ```bash + export MIMIC_CSV_PATH=/path/to/mimic-iv-clinical-database-demo-2.2 + ``` + + *This is a quick-start workflow for demo purposes. Full MIMIC requires credentialed access. Most researchers use BigQuery or a PostgreSQL database. + +**Using your own model?** The pipeline is flexible—just save any scikit-learn-compatible model as a pickle with this structure: + +```python +import joblib + +model_data = { + "model": your_trained_model, # Must have .predict_proba() + "metadata": { + "feature_names": ["heart_rate", "temperature", ...], + "metrics": {"optimal_threshold": 0.5} + } +} +joblib.dump(model_data, "cookbook/models/sepsis_model.pkl") +``` + +The pipeline will work with any model that implements `predict_proba()` - XGBoost, Random Forest, LightGBM, or even PyTorch/TensorFlow models wrapped with a sklearn-compatible interface. + +### Prepare Demo Patient Data + +The two patterns have different data requirements: + +| Pattern | Data Source | What You Need | +|---------|-------------|---------------| +| **CDS Hooks** | Local JSON files | Download pre-extracted patients (quick start) | +| **FHIR Gateway** | FHIR server | Upload patients to Medplum and get server-assigned IDs | + +=== "CDS Hooks Only (Quick Start)" + + Download pre-extracted patient bundles—these are already in the repo if you cloned it: + + ```bash + mkdir -p cookbook/data/mimic_demo_patients + cd cookbook/data/mimic_demo_patients + wget https://github.com/dotimplement/HealthChain/raw/main/cookbook/data/mimic_demo_patients/high_risk_patient.json + wget https://github.com/dotimplement/HealthChain/raw/main/cookbook/data/mimic_demo_patients/moderate_risk_patient.json + wget https://github.com/dotimplement/HealthChain/raw/main/cookbook/data/mimic_demo_patients/low_risk_patient.json + ``` + + That's it! Skip to [Pattern 1: CDS Hooks](#pattern-1-real-time-cds-hooks-alerts). 
+ +=== "FHIR Gateway (Full Setup)" + + The batch screening pattern queries patients from a FHIR server. This tutorial uses [Medplum](https://www.medplum.com/) (a free, hosted FHIR server), but any FHIR R4-compliant API works - just swap the credentials. + + **1. Configure FHIR Credentials** + + Add Medplum credentials to your `.env` file. See [FHIR Sandbox Setup](./setup_fhir_sandboxes.md#medplum) for details: + + ```bash + MEDPLUM_BASE_URL=https://api.medplum.com/fhir/R4 + MEDPLUM_CLIENT_ID=your_client_id + MEDPLUM_CLIENT_SECRET=your_client_secret + MEDPLUM_TOKEN_URL=https://api.medplum.com/oauth2/token + MEDPLUM_SCOPE=openid + ``` + + **2. Extract and Upload Demo Patients** + + ```bash + # Set MIMIC-on-FHIR path (or use --mimic flag) + export MIMIC_FHIR_PATH=/path/to/mimic-iv-on-fhir + + # Extract and upload to Medplum + cd scripts + python extract_mimic_demo_patients.py --minimal --upload + ``` + + This script: + + - Loads patient data from [MIMIC-IV on FHIR](https://physionet.org/content/mimic-iv-demo/2.2/) + - Runs the sepsis model to find high/moderate/low risk patients + - Creates minimal FHIR bundles with only the observations needed + - Uploads them to your Medplum instance as transaction bundles + + **3. Copy Patient IDs** + + After upload, the script prints server-assigned patient IDs: + + ``` + ✓ Uploaded to Medplum! + + Copy this into sepsis_fhir_batch.py: + + DEMO_PATIENT_IDS = [ + "702e11e8-6d21-41dd-9b48-31715fdc0fb1", # high risk + "3b0da7e9-0379-455a-8d35-bedd3a6ee459", # moderate risk + "f490ceb4-6262-4f1e-8b72-5515e6c46741", # low risk + ] + ``` + + Copy these IDs into the `DEMO_PATIENT_IDS` list in `sepsis_fhir_batch.py`. + + !!! tip "Generate More Patients" + + The script has options for generating larger test sets: + + ```bash + python extract_mimic_demo_patients.py --help + + # Examples: + --num-patients-per-risk 5 # 5 patients per risk level (15 total) + --seed 123 # Different random sample + --minimal # Keep only latest observation per feature (~12KB each) + ``` + + !!! tip "Alternative: Manual Upload" + + If you prefer, run without `--upload` to generate bundle JSON files, then upload them manually via the [Medplum → Batch](https://app.medplum.com/batch) page. + +--- + +**Setup complete!** You should now have: + +- ✅ A trained model at `cookbook/models/sepsis_model.pkl` +- ✅ Demo patient data (local JSON or uploaded to Medplum) + +If using the **FHIR Gateway pattern**, also confirm: + +- ✅ FHIR credentials in `.env` +- ✅ Patient IDs copied into `DEMO_PATIENT_IDS` in `sepsis_fhir_batch.py` + +## The Shared Model Pipeline + +Both patterns reuse the same pipeline. 
Here's what you'll write: + +```python +def create_pipeline() -> Pipeline[Dataset]: + pipeline = Pipeline[Dataset]() + + @pipeline.add_node + def impute_missing(dataset: Dataset) -> Dataset: + dataset.data = dataset.data.fillna(dataset.data.median(numeric_only=True)) + return dataset + + @pipeline.add_node + def run_inference(dataset: Dataset) -> Dataset: + features = dataset.data[feature_names] + probabilities = model.predict_proba(features)[:, 1] + dataset.metadata["probabilities"] = probabilities + return dataset + + return pipeline +``` + +The pipeline operates on a `Dataset`, which you create from a FHIR bundle: + +```python +dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH) +``` + +**How does FHIR become a DataFrame?** The schema maps FHIR resources to your training features: + +```yaml +# sepsis_vitals.yaml (excerpt) +features: + heart_rate: + fhir_resource: Observation + code: "220045" # MIMIC chartevents code + wbc: + fhir_resource: Observation + code: "51301" # MIMIC labevents code + age: + fhir_resource: Patient + field: birthDate + transform: calculate_age +``` + +No FHIR parsing code needed—define the mapping once, use it everywhere. + +!!! tip "Explore Interactively" + + Step through the full flow in [notebooks/fhir_ml_workflow.ipynb](../../notebooks/fhir_ml_workflow.ipynb): FHIR bundle → Dataset → DataFrame → inference → RiskAssessment. + +Now let's see how this pipeline plugs into each deployment pattern. --- -**Want to be notified when this example is ready?** Join our [Discord community](https://discord.gg/UQC6uAepUz) for updates! +## Pattern 1: Real-Time CDS Hooks Alerts + +Use CDS Hooks when you need **instant alerts** during clinical workflows. The EHR triggers your service and pushes patient data via prefetch—no server queries needed. + +### How It Works + +``` +Clinician opens chart → EHR fires patient-view hook → Your service runs prediction → CDS card appears in EHR +``` + +### Set Up the CDS Hook Handler + +Create a [CDSHooksService](../reference/gateway/cdshooks.md) that listens for `patient-view` events: + +```python +from healthchain.gateway import CDSHooksService +from healthchain.fhir import prefetch_to_bundle +from healthchain.models import CDSRequest, CDSResponse +from healthchain.models.responses.cdsresponse import Card + +cds = CDSHooksService() + +@cds.hook("patient-view", id="sepsis-risk") +def sepsis_alert(request: CDSRequest) -> CDSResponse: + if not request.prefetch: + return CDSResponse(cards=[]) + + # FHIR prefetch → Dataset → Prediction + bundle = prefetch_to_bundle(request.prefetch) + dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH) + result = pipeline(dataset) + + # Generate alert card if risk is elevated + prob = float(result.metadata["probabilities"][0]) + risk = "high" if prob > 0.7 else "moderate" if prob > 0.4 else "low" + + if risk in ["high", "moderate"]: + return CDSResponse(cards=[ + Card( + summary=f"Sepsis Risk: {risk.upper()} ({prob:.0%})", + indicator="critical" if risk == "high" else "warning", + detail=f"Predicted sepsis risk: {risk.upper()}. 
Recommend workup.", + source={"label": "HealthChain Sepsis Predictor"}, + ) + ]) + + return CDSResponse(cards=[]) +``` + +### Build the Service + +Register with [HealthChainAPI](../reference/gateway/api.md): + +```python +app = HealthChainAPI(title="Sepsis CDS Hooks") +app.register_service(cds, path="/cds") +``` + +### Test with Sandbox Client + +The [SandboxClient](../reference/utilities/sandbox.md) simulates EHR requests using your demo patient files: + +```python +from healthchain.sandbox import SandboxClient + +client = SandboxClient( + url="http://localhost:8000/cds/cds-services/sepsis-risk", + workflow="patient-view", +) +client.load_from_path("data/mimic_demo_patients", pattern="*_patient.json") +responses = client.send_requests() +client.save_results(save_request=True, save_response=True, directory="./output/") +``` + +### Expected Output + +``` +Processed 3 requests + Patient 1: Sepsis Risk: HIGH (85%) + Patient 2: Sepsis Risk: MODERATE (52%) + Patient 3: Low risk (no alert) +``` + +??? example "Example CDS Response" + + ```json + { + "cards": [ + { + "summary": "Sepsis Risk: HIGH (85%)", + "indicator": "critical", + "source": { + "label": "HealthChain Sepsis Predictor", + "url": "https://www.sccm.org/SurvivingSepsisCampaign/Guidelines/Adult-Patients" + }, + "detail": "**AI Guidance:**\n- Predicted risk: **HIGH** (85%)\n- Recommend sepsis workup and early intervention.", + "title": "Sepsis Alert (AI Prediction)" + } + ] + } + ``` + +--- + +## Pattern 2: Batch FHIR Gateway Screening + +Use the FHIR Gateway when you need to **screen multiple patients** from a FHIR server. Unlike CDS Hooks (ephemeral alerts), this pattern **persists predictions back to the FHIR server** as RiskAssessment resources, making them available for dashboards, reports, and downstream workflows. + +### How It Works + +``` +Query patients from FHIR server → Run predictions → Write RiskAssessment back to FHIR server +``` + +### Set Up FHIR Gateway + +Configure the [FHIRGateway](../reference/gateway/fhir_gateway.md) with your FHIR source: + +```python +from fhir.resources.patient import Patient +from fhir.resources.observation import Observation +from healthchain.gateway import FHIRGateway +from healthchain.gateway.clients.fhir.base import FHIRAuthConfig +from healthchain.fhir import merge_bundles + +gateway = FHIRGateway() +config = FHIRAuthConfig.from_env("MEDPLUM") +gateway.add_source("medplum", config.to_connection_string()) +``` + +### Screen Individual Patients + +Query patient data, run prediction, and write back a [RiskAssessment](https://www.hl7.org/fhir/riskassessment.html) resource: + +```python +def screen_patient(gateway: FHIRGateway, patient_id: str, source: str): + # Query patient + observations from FHIR server + patient_bundle = gateway.search(Patient, {"_id": patient_id}, source) + obs_bundle = gateway.search(Observation, {"patient": patient_id}, source) + bundle = merge_bundles([patient_bundle, obs_bundle]) + + # FHIR → Dataset → Prediction + dataset = Dataset.from_fhir_bundle(bundle, schema=SCHEMA_PATH) + result = pipeline(dataset) + + # Convert to RiskAssessment and write back + for ra in result.to_risk_assessment( + outcome_code="A41.9", + outcome_display="Sepsis", + model_name="sepsis_xgboost_v1", + ): + gateway.create(ra, source=source) +``` + +### Batch Screen Multiple Patients + +Loop over patient IDs and screen each one: + +```python +for patient_id in patient_ids: + screen_patient(gateway, patient_id, source="medplum") +``` + +!!! 
note "Demo vs Production" + + This demo uses a fixed list of patient IDs. In production, you'd query for patients dynamically—for example, ICU admissions in the last hour: + + ```python + # Find patients with recent ICU encounters + encounters = gateway.search( + Encounter, + { + "class": "IMP", # inpatient + "location": "icu", + "date": "ge2024-01-01", + }, + source="ehr" + ) + patient_ids = [e.subject.reference.split("/")[1] for e in encounters] + ``` + +### Build the Service + +```python +app = HealthChainAPI(title="Sepsis Batch Screening") +app.register_gateway(gateway, path="/fhir") +``` + +### Expected Output + +After uploading demo patients to Medplum and running batch screening: + +``` +=== Screening patients from Medplum === + 702e11e8-6d21-41dd-9b48-31715fdc0fb1: HIGH (85%) → RiskAssessment/abc123 + 3b0da7e9-0379-455a-8d35-bedd3a6ee459: MODERATE (52%) → RiskAssessment/def456 + f490ceb4-6262-4f1e-8b72-5515e6c46741: LOW (15%) → RiskAssessment/ghi789 +``` +You should be able to see the RiskAssessment resources in the [Medplum console](https://app.medplum.com) (search for "RiskAssessment" in "Resource Type" search bar in top left corner) + +??? example "Example RiskAssessment Resource" + + ```json + { + "resourceType": "RiskAssessment", + "id": "abc123", + "status": "final", + "subject": { + "reference": "Patient/702e11e8-6d21-41dd-9b48-31715fdc0fb1" + }, + "method": { + "coding": [{ + "system": "https://healthchain.io/models", + "code": "sepsis_xgboost_v1", + "display": "Sepsis XGBoost Model v1" + }] + }, + "prediction": [{ + "outcome": { + "coding": [{ + "system": "http://hl7.org/fhir/sid/icd-10", + "code": "A41.9", + "display": "Sepsis" + }] + }, + "probabilityDecimal": 0.85, + "qualitativeRisk": { + "coding": [{ + "system": "http://terminology.hl7.org/CodeSystem/risk-probability", + "code": "high", + "display": "High likelihood" + }] + } + }] + } + ``` + +--- + +## What You've Built + +Two deployment patterns for the same ML model: + +| | CDS Hooks | FHIR Gateway | +|-|-----------|--------------| +| **Integration** | Event-driven (EHR pushes data) | Pull-based (service queries server) | +| **Latency** | Real-time (<1s) | Batch (seconds to minutes) | +| **Output** | CDS Cards (ephemeral alerts) | RiskAssessment (persisted resources) | +| **Scaling** | Per-patient on demand | Parallel/scheduled batch jobs | + +Both patterns: + +- **Share the same model** - Train once, deploy multiple ways +- **Use YAML feature schemas** - Declarative FHIR → features mapping +- **Handle FHIR natively** - No custom data wrangling per integration + +!!! info "Use Cases" + + **CDS Hooks (Real-time)** + + - Sepsis early warning alerts when opening ICU patient charts + - Drug interaction warnings during medication ordering + - Clinical guideline reminders triggered by diagnosis codes + + **FHIR Gateway (Batch)** + + - Nightly population health screening + - Quality measure calculation for reporting + - Research cohort identification + - Pre-visit risk stratification + +!!! 
tip "Next Steps" + + - **Train your own model**: Replace `sepsis_model.pkl` with your model; update the feature schema to match + - **Add more features**: Extend `sepsis_vitals.yaml` with lab values, medications, or other Observations + - **Add more FHIR sources**: The gateway supports multiple sources—see the cookbook script for Epic sandbox configuration, or the [FHIR Sandbox Setup guide](./setup_fhir_sandboxes.md) + - **Automate batch runs**: Schedule screening jobs with cron, Airflow, or cloud schedulers; or use [FHIR Subscriptions](https://www.hl7.org/fhir/subscription.html) to trigger on new ICU admissions ([PRs welcome!](https://github.com/dotimplement/HealthChain/pulls)) + - **Combine patterns**: Use batch screening to identify high-risk patients, then enable CDS + alerts for those patients diff --git a/docs/index.md b/docs/index.md index e314ef41..f3681305 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ HealthChain is an open-source Python toolkit that streamlines productionizing he
-- :material-tools:{ .lg .middle } __FHIR-native ML Pipelines__ +- :material-tools:{ .lg .middle } __FHIR-native Pipelines__ --- diff --git a/mkdocs.yml b/mkdocs.yml index 8692f0d3..00d9ebc0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -18,6 +18,7 @@ nav: - Multi-Source Data Integration: cookbook/multi_ehr_aggregation.md - Automated Clinical Coding: cookbook/clinical_coding.md - Discharge Summarizer: cookbook/discharge_summarizer.md + - ML Model Deployment: cookbook/ml_model_deployment.md - Docs: - Welcome: reference/index.md - Gateway: diff --git a/scripts/sepsis_prediction_training.py b/scripts/sepsis_prediction_training.py index a0ea85ce..16d630de 100644 --- a/scripts/sepsis_prediction_training.py +++ b/scripts/sepsis_prediction_training.py @@ -12,6 +12,7 @@ - python sepsis_prediction_training.py """ +import os import pandas as pd import numpy as np from pathlib import Path @@ -898,8 +899,10 @@ def save_model( def main(): """Main training pipeline.""" - # Data directory - data_dir = "../datasets/mimic-iv-clinical-database-demo-2.2" + # Data directory (set via MIMIC_CSV_PATH or use default) + data_dir = os.getenv( + "MIMIC_CSV_PATH", "../datasets/mimic-iv-clinical-database-demo-2.2" + ) # Output path (relative to script location) script_dir = Path(__file__).parent