From 055e86e4c1e1fdc2e55ad46fe22160aaafa65bec Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Thu, 30 Oct 2025 19:50:41 +0000 Subject: [PATCH 01/12] Add loaders wip and removed prefetch model --- docs/reference/utilities/sandbox.md | 2 + healthchain/fhir/__init__.py | 2 + healthchain/fhir/helpers.py | 56 +++ healthchain/io/adapters/cdsfhiradapter.py | 15 +- healthchain/models/__init__.py | 2 - healthchain/models/hooks/__init__.py | 2 - healthchain/models/hooks/prefetch.py | 33 -- healthchain/models/requests/cdsrequest.py | 4 +- healthchain/sandbox/__init__.py | 2 +- healthchain/sandbox/base.py | 9 +- healthchain/sandbox/datasets.py | 8 +- .../sandbox/generators/cdsdatagenerator.py | 11 +- healthchain/sandbox/loaders/__init__.py | 14 + healthchain/sandbox/loaders/mimic.py | 204 +++++++++++ healthchain/sandbox/loaders/synthea.py | 84 +++++ healthchain/sandbox/requestconstructors.py | 19 +- healthchain/sandbox/sandboxclient.py | 47 ++- tests/conftest.py | 19 +- .../generators/test_cds_data_generator.py | 40 +-- tests/sandbox/test_cds_sandbox.py | 5 +- tests/sandbox/test_mimic_loader.py | 318 ++++++++++++++++++ tests/sandbox/test_request_constructors.py | 28 +- tests/sandbox/test_sandbox_client.py | 15 +- 23 files changed, 788 insertions(+), 151 deletions(-) delete mode 100644 healthchain/models/hooks/prefetch.py create mode 100644 healthchain/sandbox/loaders/__init__.py create mode 100644 healthchain/sandbox/loaders/mimic.py create mode 100644 healthchain/sandbox/loaders/synthea.py create mode 100644 tests/sandbox/test_mimic_loader.py diff --git a/docs/reference/utilities/sandbox.md b/docs/reference/utilities/sandbox.md index b1d1a6a1..6cc3a2a5 100644 --- a/docs/reference/utilities/sandbox.md +++ b/docs/reference/utilities/sandbox.md @@ -142,6 +142,7 @@ print(status) The `@hc.sandbox` and `@hc.ehr` decorators with `ClinicalDecisionSupport` and `ClinicalDocumentation` base classes are deprecated. Use `SandboxClient` instead. **Before:** + ```python @hc.sandbox class TestCDS(ClinicalDecisionSupport): @@ -151,6 +152,7 @@ class TestCDS(ClinicalDecisionSupport): ``` **After:** + ```python client = SandboxClient( api_url="http://localhost:8000", diff --git a/healthchain/fhir/__init__.py b/healthchain/fhir/__init__.py index 96a85f40..2790ed89 100644 --- a/healthchain/fhir/__init__.py +++ b/healthchain/fhir/__init__.py @@ -11,6 +11,7 @@ create_document_reference, create_single_attachment, create_resource_from_dict, + convert_prefetch_to_fhir_objects, add_provenance_metadata, add_coding_to_codeable_concept, ) @@ -36,6 +37,7 @@ "create_document_reference", "create_single_attachment", "create_resource_from_dict", + "convert_prefetch_to_fhir_objects", # Resource modification "add_provenance_metadata", "add_coding_to_codeable_concept", diff --git a/healthchain/fhir/helpers.py b/healthchain/fhir/helpers.py index 20f8c106..d89ec14b 100644 --- a/healthchain/fhir/helpers.py +++ b/healthchain/fhir/helpers.py @@ -61,6 +61,62 @@ def create_resource_from_dict( return None +def convert_prefetch_to_fhir_objects( + prefetch_dict: Dict[str, Any], +) -> Dict[str, Resource]: + """Convert a dictionary of FHIR resource dicts to FHIR Resource objects. + + Takes a prefetch dictionary where values may be either dict representations of FHIR + resources or already instantiated FHIR Resource objects, and ensures all values are + FHIR Resource objects. 
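+    Values that fail to convert (a missing or unknown resourceType, a validation
+    error, or an unexpected type) are kept unchanged and a warning is logged, so
+    callers should not assume every returned value is a Resource.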
+ + Args: + prefetch_dict: Dictionary mapping keys to FHIR resource dicts or objects + + Returns: + Dict[str, Resource]: Dictionary with same keys but all values as FHIR Resource objects + + Example: + >>> prefetch = { + ... "patient": {"resourceType": "Patient", "id": "123"}, + ... "condition": Condition(id="456", ...) + ... } + >>> fhir_objects = convert_prefetch_to_fhir_objects(prefetch) + >>> isinstance(fhir_objects["patient"], Patient) # True + >>> isinstance(fhir_objects["condition"], Condition) # True + """ + from fhir.resources import get_fhir_model_class + + result: Dict[str, Resource] = {} + + for key, resource_data in prefetch_dict.items(): + if isinstance(resource_data, dict): + # Convert dict to FHIR Resource object + resource_type = resource_data.get("resourceType") + if resource_type: + try: + resource_class = get_fhir_model_class(resource_type) + result[key] = resource_class(**resource_data) + except Exception as e: + logger.warning( + f"Failed to convert {resource_type} to FHIR object: {e}" + ) + result[key] = resource_data + else: + logger.warning( + f"No resourceType found for key '{key}', keeping as dict" + ) + result[key] = resource_data + elif isinstance(resource_data, Resource): + # Already a FHIR object + result[key] = resource_data + else: + logger.warning(f"Unexpected type for key '{key}': {type(resource_data)}") + result[key] = resource_data + + return result + + def create_single_codeable_concept( code: str, display: Optional[str] = None, diff --git a/healthchain/io/adapters/cdsfhiradapter.py b/healthchain/io/adapters/cdsfhiradapter.py index 882071a3..7d3be0e7 100644 --- a/healthchain/io/adapters/cdsfhiradapter.py +++ b/healthchain/io/adapters/cdsfhiradapter.py @@ -7,8 +7,7 @@ from healthchain.io.base import BaseAdapter from healthchain.models.requests.cdsrequest import CDSRequest from healthchain.models.responses.cdsresponse import CDSResponse -from healthchain.fhir import read_content_attachment -from healthchain.models.hooks.prefetch import Prefetch +from healthchain.fhir import read_content_attachment, convert_prefetch_to_fhir_objects log = logging.getLogger(__name__) @@ -69,7 +68,6 @@ def parse( Raises: ValueError: If neither prefetch nor fhirServer is provided in cds_request - ValueError: If the prefetch data is invalid or cannot be processed NotImplementedError: If fhirServer is provided (FHIR server support not implemented) """ if cds_request.prefetch is None and cds_request.fhirServer is None: @@ -83,14 +81,13 @@ def parse( # Create an empty Document object doc = Document(data="") - # Validate the prefetch data - validated_prefetch = Prefetch(prefetch=cds_request.prefetch) - - # Set the prefetch resources - doc.fhir.prefetch_resources = validated_prefetch.prefetch + # Convert prefetch dict resources to FHIR objects + doc.fhir.prefetch_resources = convert_prefetch_to_fhir_objects( + cds_request.prefetch or {} + ) # Extract text content from DocumentReference resource if provided - document_resource = validated_prefetch.prefetch.get(prefetch_document_key) + document_resource = doc.fhir.prefetch_resources.get(prefetch_document_key) if not document_resource: log.warning( diff --git a/healthchain/models/__init__.py b/healthchain/models/__init__.py index 8b8caba2..13de5201 100644 --- a/healthchain/models/__init__.py +++ b/healthchain/models/__init__.py @@ -13,7 +13,6 @@ CDSServiceInformation, CdaResponse, ) -from .hooks import Prefetch __all__ = [ "CDSRequest", @@ -31,5 +30,4 @@ "CDSResponse", "CdaRequest", "CdaResponse", - "Prefetch", ] diff --git 
a/healthchain/models/hooks/__init__.py b/healthchain/models/hooks/__init__.py index e19b9e2b..62a0edcd 100644 --- a/healthchain/models/hooks/__init__.py +++ b/healthchain/models/hooks/__init__.py @@ -2,7 +2,6 @@ from .encounterdischarge import EncounterDischargeContext from .orderselect import OrderSelectContext from .ordersign import OrderSignContext -from .prefetch import Prefetch __all__ = [ @@ -10,5 +9,4 @@ "EncounterDischargeContext", "OrderSelectContext", "OrderSignContext", - "Prefetch", ] diff --git a/healthchain/models/hooks/prefetch.py b/healthchain/models/hooks/prefetch.py deleted file mode 100644 index 085c1678..00000000 --- a/healthchain/models/hooks/prefetch.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Dict, Any -from pydantic import BaseModel, field_validator -from fhir.resources.resource import Resource -from fhir.resources import get_fhir_model_class - - -class Prefetch(BaseModel): - prefetch: Dict[str, Any] - - @field_validator("prefetch") - @classmethod - def validate_fhir_resources(cls, v: Dict[str, Any]) -> Dict[str, Resource]: - if not v: - return v - - validated = {} - for key, resource_dict in v.items(): - if not isinstance(resource_dict, dict): - continue - - resource_type = resource_dict.get("resourceType") - if not resource_type: - continue - - try: - # Get the appropriate FHIR resource class - resource_class = get_fhir_model_class(resource_type) - # Convert the dict to a FHIR resource - validated[key] = resource_class.model_validate(resource_dict) - except Exception as e: - raise ValueError(f"Failed to validate FHIR resource {key}: {str(e)}") - - return validated diff --git a/healthchain/models/requests/cdsrequest.py b/healthchain/models/requests/cdsrequest.py index 99e08004..cbcf5da9 100644 --- a/healthchain/models/requests/cdsrequest.py +++ b/healthchain/models/requests/cdsrequest.py @@ -40,9 +40,7 @@ class CDSRequest(BaseModel): fhirAuthorization: Optional[FHIRAuthorization] = ( None # TODO: note this is required if fhirserver is given ) - prefetch: Optional[Dict[str, Any]] = ( - None # fhir resource is passed either thru prefetched template of fhir server - ) + prefetch: Optional[Dict[str, Any]] = None extension: Optional[List[Dict[str, Any]]] = None def model_dump(self, **kwargs): diff --git a/healthchain/sandbox/__init__.py b/healthchain/sandbox/__init__.py index 5c1c708c..af22abc3 100644 --- a/healthchain/sandbox/__init__.py +++ b/healthchain/sandbox/__init__.py @@ -3,8 +3,8 @@ from .sandboxclient import SandboxClient from .datasets import DatasetRegistry, DatasetLoader, list_available_datasets - # Import loaders to trigger auto-registration +from . import loaders # noqa: F401 __all__ = [ "SandboxClient", diff --git a/healthchain/sandbox/base.py b/healthchain/sandbox/base.py index 5db7a742..47e1d1d2 100644 --- a/healthchain/sandbox/base.py +++ b/healthchain/sandbox/base.py @@ -2,7 +2,6 @@ from typing import Dict from enum import Enum -from healthchain.models.hooks.prefetch import Prefetch from healthchain.sandbox.workflows import Workflow @@ -36,20 +35,20 @@ class DatasetLoader(ABC): """ Abstract base class for dataset loaders. - Subclasses should implement the load() method to return Prefetch data + Subclasses should implement the load() method to return data from their specific dataset source. """ @abstractmethod - def load(self, **kwargs) -> Prefetch: + def load(self, **kwargs) -> Dict: """ - Load dataset and return as Prefetch object. + Load dataset and return as dict of FHIR resources. 
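+        Keys identify prefetch entries or FHIR resource types; values may be
+        individual FHIR resources or Bundles of resources, depending on the loader.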
Args: **kwargs: Loader-specific parameters Returns: - Prefetch object containing FHIR resources + Dict containing FHIR resources Raises: FileNotFoundError: If dataset files are not found diff --git a/healthchain/sandbox/datasets.py b/healthchain/sandbox/datasets.py index 336cacf7..5e7ed58e 100644 --- a/healthchain/sandbox/datasets.py +++ b/healthchain/sandbox/datasets.py @@ -8,7 +8,6 @@ from typing import Any, Dict, List -from healthchain.models import Prefetch from healthchain.sandbox.base import DatasetLoader @@ -43,16 +42,17 @@ def register(cls, loader: DatasetLoader) -> None: log.debug(f"Registered dataset: {name}") @classmethod - def load(cls, name: str, **kwargs) -> Prefetch: + def load(cls, name: str, data_path: str, **kwargs) -> Dict: """ Load a dataset by name. Args: name: Name of the dataset to load + data_path: Path to the dataset files **kwargs: Dataset-specific parameters Returns: - Prefetch object containing FHIR resources + Dict containing FHIR resources Raises: KeyError: If dataset name is not registered @@ -65,7 +65,7 @@ def load(cls, name: str, **kwargs) -> Prefetch: loader = cls._datasets[name] log.info(f"Loading dataset: {name}") - return loader.load(**kwargs) + return loader.load(data_path=data_path, **kwargs) @classmethod def list_datasets(cls) -> List[str]: diff --git a/healthchain/sandbox/generators/cdsdatagenerator.py b/healthchain/sandbox/generators/cdsdatagenerator.py index f9d9742f..5f20f979 100644 --- a/healthchain/sandbox/generators/cdsdatagenerator.py +++ b/healthchain/sandbox/generators/cdsdatagenerator.py @@ -8,7 +8,6 @@ from fhir.resources.resource import Resource from healthchain.sandbox.generators.basegenerators import generator_registry -from healthchain.models import Prefetch from healthchain.fhir import create_document_reference from healthchain.sandbox.workflows import Workflow @@ -93,7 +92,7 @@ def generate_prefetch( free_text_path: Optional[str] = None, column_name: Optional[str] = None, random_seed: Optional[int] = None, - ) -> Prefetch: + ) -> Dict[str, Resource]: """ Generates CDS data based on the current workflow, constraints, and optional free text data. @@ -113,7 +112,7 @@ def generate_prefetch( reproducible results. If not provided, generation will be truly random. Returns: - Prefetch: A dictionary mapping resource types to generated FHIR resources. + Dict[str, Resource]: A dictionary mapping resource types to generated FHIR resources. The keys are lowercase resource type names (e.g. "patient", "encounter"). If free text is provided, includes a "document" key with a DocumentReference. 
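+
+            For example (a sketch, assuming the default patient-view mapping):
+
+            >>> generator.set_workflow(Workflow.patient_view)
+            >>> prefetch = generator.generate_prefetch(random_seed=42)
+            >>> sorted(prefetch.keys())
+            ['condition', 'encounter', 'patient']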
@@ -122,7 +121,7 @@ def generate_prefetch( FileNotFoundError: If the free_text_path is provided but file not found ValueError: If free_text_path provided without column_name """ - prefetch = Prefetch(prefetch={}) + prefetch = {} if self.workflow not in self.mappings.keys(): raise ValueError(f"Workflow {self.workflow} not found in mappings") @@ -134,7 +133,7 @@ def generate_prefetch( constraints=constraints, random_seed=random_seed ) - prefetch.prefetch[resource.__resource_type__.lower()] = resource + prefetch[resource.__resource_type__.lower()] = resource parsed_free_text = ( self.free_text_parser(free_text_path, column_name) @@ -142,7 +141,7 @@ def generate_prefetch( else None ) if parsed_free_text: - prefetch.prefetch["document"] = create_document_reference( + prefetch["document"] = create_document_reference( data=random.choice(parsed_free_text), content_type="text/plain", status="current", diff --git a/healthchain/sandbox/loaders/__init__.py b/healthchain/sandbox/loaders/__init__.py new file mode 100644 index 00000000..0086b08f --- /dev/null +++ b/healthchain/sandbox/loaders/__init__.py @@ -0,0 +1,14 @@ +""" +Dataset loaders package. + +Auto-registers all available dataset loaders on import. +""" + +from healthchain.sandbox.datasets import DatasetRegistry +from healthchain.sandbox.loaders.mimic import MimicOnFHIRLoader + +# Register loaders +DatasetRegistry.register(MimicOnFHIRLoader()) +# DatasetRegistry.register(SyntheaLoader()) # if implemented + +__all__ = ["MimicOnFHIRLoader"] diff --git a/healthchain/sandbox/loaders/mimic.py b/healthchain/sandbox/loaders/mimic.py new file mode 100644 index 00000000..fbfe503f --- /dev/null +++ b/healthchain/sandbox/loaders/mimic.py @@ -0,0 +1,204 @@ +""" +MIMIC-on-FHIR dataset loader. + +Loads patient data from the MIMIC-IV-on-FHIR dataset for testing and demos. +""" + +import logging +import random +from pathlib import Path +from typing import Dict, List, Optional + +from fhir.resources.R4B.bundle import Bundle + +from healthchain.sandbox.datasets import DatasetLoader + +log = logging.getLogger(__name__) + + +class MimicOnFHIRLoader(DatasetLoader): + """ + Loader for MIMIC-IV-on-FHIR dataset. + + This loader supports loading FHIR resources from the MIMIC-IV dataset + that has been converted to FHIR format. It can load specific patients, + sample random patients, or filter by resource types. + + Examples: + Load via SandboxClient: + >>> client = SandboxClient(...) + >>> client.load_from_registry( + ... "mimic-on-fhir", + ... data_path="./data/mimic-fhir", + ... resource_types=["MimicMedication", "MimicCondition"], + ... sample_size=10 + ... ) + """ + + @property + def name(self) -> str: + """Dataset name for registration.""" + return "mimic-on-fhir" + + @property + def description(self) -> str: + """Dataset description.""" + return ( + "MIMIC-IV-on-FHIR: Real de-identified clinical data from " + "Beth Israel Deaconess Medical Center in FHIR format" + ) + + def load( + self, + data_path: str, + resource_types: Optional[List[str]] = None, + sample_size: Optional[int] = None, + random_seed: Optional[int] = None, + **kwargs, + ) -> Dict: + """ + Load MIMIC-on-FHIR data as dict containing R4B Bundles. + + Args: + data_path: Path to MIMIC-on-FHIR data directory + resource_types: List of MIMIC resource types to load (e.g., ["MimicMedication", "MimicCondition"]). + These should match the MIMIC-on-FHIR filename format (without .ndjson.gz extension). + If None, raises ValueError. + sample_size: Number of resources to randomly sample per resource type. 
+ If None, loads all available resources. + random_seed: Random seed for reproducible sampling. + **kwargs: Additional parameters (reserved for future use) + + Returns: + Dict containing R4B Bundle resources grouped by FHIR resource type. + Each Bundle contains resources of the same type. + Example: {"MedicationStatement": Bundle(...), "Condition": Bundle(...)} + + Raises: + FileNotFoundError: If data path doesn't exist or resource files not found + ValueError: If resource_types is None or empty, or if resource validation fails + """ + data_path = Path(data_path) + if not data_path.exists(): + raise FileNotFoundError( + f"MIMIC-on-FHIR data not found at: {data_path}. " + "Please provide a valid data_path." + ) + + if not resource_types: + raise ValueError( + "resource_types parameter is required. " + "Provide a list of MIMIC resource types to load (e.g., ['MimicMedication', 'MimicCondition'])." + ) + + # Set random seed if provided + if random_seed is not None: + random.seed(random_seed) + + # Load resources and group by FHIR resource type + resources_by_type: Dict[str, List[Dict]] = {} + + for resource_type in resource_types: + try: + resources = self._load_resource_file( + data_path, resource_type, sample_size + ) + + # Group by FHIR resourceType (not filename) + for resource in resources: + fhir_type = resource["resourceType"] + if fhir_type not in resources_by_type: + resources_by_type[fhir_type] = [] + resources_by_type[fhir_type].append(resource) + + log.info( + f"Loaded {len(resources)} resources from {resource_type}.ndjson.gz" + ) + except FileNotFoundError as e: + log.error(f"Failed to load {resource_type}: {e}") + raise + except Exception as e: + log.error(f"Error loading {resource_type}: {e}") + raise ValueError(f"Failed to load {resource_type}: {e}") + + if not resources_by_type: + raise ValueError( + f"No valid resources loaded from specified resource types: {resource_types}" + ) + + bundles = {} + for fhir_type, resources in resources_by_type.items(): + bundles[fhir_type] = Bundle( + type="collection", + entry=[{"resource": resource} for resource in resources], + ) + + return bundles + + def _load_resource_file( + self, data_path: Path, resource_type: str, sample_size: Optional[int] = None + ) -> List[Dict]: + """ + Load resources from a single MIMIC-on-FHIR .ndjson.gz file. + + Args: + data_path: Path to MIMIC-on-FHIR data directory + resource_type: MIMIC resource type (e.g., "MimicMedication") + sample_size: Number of resources to randomly sample + + Returns: + List of resource dicts + + Raises: + FileNotFoundError: If the resource file doesn't exist + ValueError: If no valid resources found + """ + import gzip + import json + + # Construct file path + fhir_dir = data_path / "fhir" + file_path = fhir_dir / f"{resource_type}.ndjson.gz" + + if not file_path.exists(): + raise FileNotFoundError( + f"Resource file not found: {file_path}. 
" + f"Expected MIMIC-on-FHIR file at {fhir_dir}/{resource_type}.ndjson.gz" + ) + + # Read all resources from file as dicts + resources = [] + line_num = 0 + + with gzip.open(file_path, "rt") as f: + for line in f: + line_num += 1 + try: + data = json.loads(line) + + if not data.get("resourceType"): + log.warning( + f"Skipping line {line_num} in {resource_type}.ndjson.gz: " + "No resourceType field found" + ) + continue + + resources.append(data) + + except json.JSONDecodeError as e: + log.warning( + f"Skipping malformed JSON at line {line_num} in {resource_type}.ndjson.gz: {e}" + ) + continue + + if not resources: + raise ValueError( + f"No valid resources found in {file_path}. " + "File may be empty or contain only invalid resources." + ) + + # Apply random sampling if requested + if sample_size is not None and sample_size < len(resources): + resources = random.sample(resources, sample_size) + + return resources diff --git a/healthchain/sandbox/loaders/synthea.py b/healthchain/sandbox/loaders/synthea.py new file mode 100644 index 00000000..e68dd50b --- /dev/null +++ b/healthchain/sandbox/loaders/synthea.py @@ -0,0 +1,84 @@ +""" +Synthea dataset loader. + +Loads synthetic patient data generated by Synthea. +""" + +import logging +from pathlib import Path +from typing import Dict, List, Optional + +from healthchain.sandbox.datasets import DatasetLoader + +log = logging.getLogger(__name__) + + +class SyntheaLoader(DatasetLoader): + """ + Loader for Synthea-generated FHIR data. + + Synthea is an open-source synthetic patient generator that produces + realistic patient records in FHIR format. This loader supports loading + Synthea-generated FHIR bundles. + + Examples: + Load via SandboxClient: + >>> client = SandboxClient(...) + >>> client.load_from_registry( + ... "synthea", + ... data_path="./data/synthea/output/fhir", + ... sample_size=10 + ... ) + """ + + @property + def name(self) -> str: + """Dataset name for registration.""" + return "synthea" + + @property + def description(self) -> str: + """Dataset description.""" + return "Synthea: Synthetic patient data generated by SyntheaTM" + + def load( + self, + data_path: str, + patient_ids: Optional[List[str]] = None, + sample_size: Optional[int] = None, + random_seed: Optional[int] = None, + **kwargs, + ) -> Dict: + """ + Load Synthea FHIR data as dict. + + Args: + data_path: Path to Synthea FHIR output directory + patient_ids: Specific patient IDs to load + sample_size: Number of random patients to sample + random_seed: Random seed for reproducible sampling + **kwargs: Additional parameters (reserved for future use) + + Returns: + Dict containing FHIR resources + + Raises: + FileNotFoundError: If data path doesn't exist + NotImplementedError: Full implementation pending + """ + data_path = Path(data_path) + if not data_path.exists(): + raise FileNotFoundError( + f"Synthea data not found at: {data_path}. " + "Please provide a valid data_path." + ) + + # TODO: Implement Synthea FHIR bundle loading + # Synthea typically outputs individual FHIR bundles per patient + # Format: .json containing a Bundle resource + + raise NotImplementedError( + "Synthea loader is not yet fully implemented. " + "This is a placeholder for future functionality. " + "For now, please use load_free_text() with CdsDataGenerator for synthetic data." 
+ ) diff --git a/healthchain/sandbox/requestconstructors.py b/healthchain/sandbox/requestconstructors.py index 600ef8c5..5b0222f6 100644 --- a/healthchain/sandbox/requestconstructors.py +++ b/healthchain/sandbox/requestconstructors.py @@ -10,8 +10,7 @@ import pkgutil import xmltodict -from typing import Dict, Optional -from fhir.resources.resource import Resource +from typing import Any, Dict, Optional from healthchain.sandbox.base import BaseRequestConstructor, ApiProtocol from healthchain.sandbox.workflows import ( @@ -27,7 +26,6 @@ OrderSignContext, PatientViewContext, EncounterDischargeContext, - Prefetch, ) @@ -59,7 +57,7 @@ def __init__(self) -> None: @validate_workflow(UseCaseMapping.ClinicalDecisionSupport) def construct_request( self, - prefetch_data: Dict[str, Resource], + prefetch_data: Dict[str, Any], workflow: Workflow, context: Optional[Dict[str, str]] = {}, ) -> CDSRequest: @@ -67,7 +65,7 @@ def construct_request( Build a CDS Hooks request including context and prefetch data. Args: - prefetch_data (Dict[str, Resource]): Dictionary mapping prefetch template names to FHIR resource objects. + prefetch_data (Dict[str, Any]): Dict containing FHIR resource objects. workflow (Workflow): The name of the CDS Hooks workflow (e.g., Workflow.patient_view). context (Optional[Dict[str, str]]): Optional context values for initializing the workflow's context model. @@ -76,30 +74,25 @@ def construct_request( Raises: ValueError: If the workflow is not supported or lacks a defined context model. - TypeError: If prefetch_data is not an instance of Prefetch. Note: Only CDS workflows supported by UseCaseMapping.ClinicalDecisionSupport are valid. - The expected prefetch_data argument is a Prefetch object encapsulating FHIR resources. # TODO: Add FhirServer support in future. """ - log.debug(f"Constructing CDS request for {workflow.value} from {prefetch_data}") + log.debug(f"Constructing CDS request for {workflow.value}") context_model = self.context_mapping.get(workflow, None) if context_model is None: raise ValueError( f"Invalid workflow {workflow.value} or workflow model not implemented." 
) - if not isinstance(prefetch_data, Prefetch): - raise TypeError( - f"Prefetch data must be a Prefetch object, but got {type(prefetch_data)}" - ) + request = CDSRequest( hook=workflow.value, context=context_model(**context), - prefetch=prefetch_data.prefetch, + prefetch=prefetch_data, ) return request diff --git a/healthchain/sandbox/sandboxclient.py b/healthchain/sandbox/sandboxclient.py index 2587fcb0..2d595637 100644 --- a/healthchain/sandbox/sandboxclient.py +++ b/healthchain/sandbox/sandboxclient.py @@ -13,7 +13,7 @@ from typing import Any, Dict, List, Literal, Optional, Union from healthchain.sandbox.base import ApiProtocol -from healthchain.models import CDSRequest, CDSResponse, Prefetch +from healthchain.models import CDSRequest, CDSResponse from healthchain.models.responses.cdaresponse import CdaResponse from healthchain.sandbox.workflows import Workflow from healthchain.sandbox.utils import ensure_directory_exists, save_data_to_directory @@ -110,6 +110,8 @@ def __init__( def load_from_registry( self, source: str, + data_path: str, + workflow: Optional[Union[Workflow, str]] = None, **kwargs: Any, ) -> "SandboxClient": """ @@ -120,13 +122,17 @@ def load_from_registry( Args: source: Dataset name (e.g., "mimic-on-fhir", "synthea") - **kwargs: Dataset-specific parameters (e.g., sample_size, num_patients) + data_path: Path to the dataset files + workflow: CDS workflow type (e.g., "encounter-discharge", "patient-view"). + If not provided, uses the workflow set on client initialization. + **kwargs: Dataset-specific parameters (e.g., resource_types, sample_size) Returns: Self for method chaining Raises: - ValueError: If dataset not found in registry + ValueError: If dataset not found in registry or workflow not specified + FileNotFoundError: If data_path doesn't exist Examples: Discover available datasets: @@ -134,14 +140,23 @@ def load_from_registry( >>> print(list_available_datasets()) Load MIMIC dataset: - >>> client.load_from_registry("mimic-on-fhir", sample_size=10) + >>> client.load_from_registry( + ... "mimic-on-fhir", + ... data_path="./data/mimic-fhir", + ... workflow="patient-view", + ... resource_types=["MimicMedication"], + ... sample_size=10 + ... ) """ from healthchain.sandbox.datasets import DatasetRegistry log.info(f"Loading dataset from registry: {source}") try: - loaded_data = DatasetRegistry.load(source, **kwargs) - self._construct_request(loaded_data) + loaded_data = DatasetRegistry.load(source, data_path=data_path, **kwargs) + workflow_enum = ( + Workflow(workflow) if isinstance(workflow, str) else workflow + ) + self._construct_request(loaded_data, workflow_enum) log.info(f"Loaded {source} dataset with {len(self.request_data)} requests") except KeyError: raise ValueError( @@ -227,9 +242,7 @@ def load_from_path( json_data = json.load(f) try: - # Validate and load as Prefetch object - prefetch_data = Prefetch(**json_data) - + # Load as prefetch dict workflow_enum = ( Workflow(workflow) if isinstance(workflow, str) @@ -237,17 +250,17 @@ def load_from_path( ) if not workflow_enum: raise ValueError( - "Workflow must be specified when loading JSON Prefetch data. " + "Workflow must be specified when loading JSON prefetch data. " "Provide via 'workflow' parameter or set on client initialization." 
) - self._construct_request(prefetch_data, workflow_enum) - log.info(f"Loaded Prefetch data from {file_path.name}") + self._construct_request(json_data, workflow_enum) + log.info(f"Loaded prefetch data from {file_path.name}") except Exception as e: - log.error(f"Failed to parse {file_path} as Prefetch: {e}") + log.error(f"Failed to parse {file_path} as prefetch data: {e}") raise ValueError( - f"File {file_path} is not valid Prefetch format. " - f"Expected JSON with 'prefetch' key containing FHIR resources. " + f"File {file_path} is not valid prefetch format. " + f"Expected JSON with FHIR resources. " f"Error: {e}" ) else: @@ -318,13 +331,13 @@ def load_free_text( return self def _construct_request( - self, data: Union[Prefetch, Any], workflow: Optional[Workflow] = None + self, data: Union[Dict[str, Any], Any], workflow: Optional[Workflow] = None ) -> None: """ Convert data to request format and add to queue. Args: - data: Data to convert (Prefetch for CDS, DocumentReference for CDA) + data: Data to convert (Dict for CDS prefetch, string for CDA) workflow: Workflow to use for request construction """ workflow = workflow or self.workflow diff --git a/tests/conftest.py b/tests/conftest.py index be60e5a2..ee9e4b47 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,6 @@ import tempfile -from healthchain.models.hooks.prefetch import Prefetch from healthchain.models.requests.cdarequest import CdaRequest from healthchain.models.requests.cdsrequest import CDSRequest from healthchain.models.responses.cdaresponse import CdaResponse @@ -353,26 +352,24 @@ def test_empty_document(): @pytest.fixture def valid_prefetch_data(): - """Provides a `Prefetch` model object for CDS Hooks testing. + """Provides a dict of FHIR resources for CDS Hooks testing. Contains a single prefetch key "document" with a DocumentReference resource. Use this for testing services that consume CDS Hooks prefetch data. Example: def test_prefetch_handler(valid_prefetch_data): - request = CDSRequest(prefetch=valid_prefetch_data.prefetch) + request = CDSRequest(prefetch=valid_prefetch_data) # ... test logic Returns: - healthchain.models.hooks.prefetch.Prefetch: A Pydantic model representing valid prefetch data. + dict: A dictionary containing FHIR resources for prefetch data. 
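+        Values are already FHIR Resource objects (a DocumentReference built with
+        create_document_reference), so the dict can be passed straight to
+        CDSRequest(prefetch=...) without further conversion.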
""" - return Prefetch( - prefetch={ - "document": create_document_reference( - content_type="text/plain", data="Test document content" - ) - } - ) + return { + "document": create_document_reference( + content_type="text/plain", data="Test document content" + ) + } # ################################################# diff --git a/tests/sandbox/generators/test_cds_data_generator.py b/tests/sandbox/generators/test_cds_data_generator.py index eecea05a..1597ff67 100644 --- a/tests/sandbox/generators/test_cds_data_generator.py +++ b/tests/sandbox/generators/test_cds_data_generator.py @@ -16,14 +16,14 @@ def test_generator_orchestrator_encounter_discharge(): generator.set_workflow(workflow=workflow) generator.generate_prefetch() - assert len(generator.generated_data.prefetch) == 4 - assert generator.generated_data.prefetch["encounter"] is not None - assert isinstance(generator.generated_data.prefetch["encounter"], Encounter) - assert generator.generated_data.prefetch["condition"] is not None - assert isinstance(generator.generated_data.prefetch["condition"], Condition) - assert generator.generated_data.prefetch["procedure"] is not None - assert isinstance(generator.generated_data.prefetch["procedure"], Procedure) - assert generator.generated_data.prefetch["medicationrequest"] is not None + assert len(generator.generated_data) == 4 + assert generator.generated_data["encounter"] is not None + assert isinstance(generator.generated_data["encounter"], Encounter) + assert generator.generated_data["condition"] is not None + assert isinstance(generator.generated_data["condition"], Condition) + assert generator.generated_data["procedure"] is not None + assert isinstance(generator.generated_data["procedure"], Procedure) + assert generator.generated_data["medicationrequest"] is not None def test_generator_orchestrator_patient_view(): @@ -33,13 +33,13 @@ def test_generator_orchestrator_patient_view(): generator.set_workflow(workflow=workflow) generator.generate_prefetch() - assert len(generator.generated_data.prefetch) == 3 - assert generator.generated_data.prefetch["patient"] is not None - assert isinstance(generator.generated_data.prefetch["patient"], Patient) - assert generator.generated_data.prefetch["encounter"] is not None - assert isinstance(generator.generated_data.prefetch["encounter"], Encounter) - assert generator.generated_data.prefetch["condition"] is not None - assert isinstance(generator.generated_data.prefetch["condition"], Condition) + assert len(generator.generated_data) == 3 + assert generator.generated_data["patient"] is not None + assert isinstance(generator.generated_data["patient"], Patient) + assert generator.generated_data["encounter"] is not None + assert isinstance(generator.generated_data["encounter"], Encounter) + assert generator.generated_data["condition"] is not None + assert isinstance(generator.generated_data["condition"], Condition) @pytest.mark.skip() @@ -52,8 +52,8 @@ def test_generator_with_json(): free_text_path="use_cases/my_encounter_data.csv", column_name="free_text" ) - assert len(generator.generated_data.prefetch) == 4 - assert generator.generated_data.prefetch["patient"] is not None - assert generator.generated_data.prefetch["encounter"] is not None - assert generator.generated_data.prefetch["condition"] is not None - assert generator.generated_data.prefetch["document"] is not None + assert len(generator.generated_data) == 4 + assert generator.generated_data["patient"] is not None + assert generator.generated_data["encounter"] is not None + assert 
generator.generated_data["condition"] is not None + assert generator.generated_data["document"] is not None diff --git a/tests/sandbox/test_cds_sandbox.py b/tests/sandbox/test_cds_sandbox.py index 28450fb8..c2880f79 100644 --- a/tests/sandbox/test_cds_sandbox.py +++ b/tests/sandbox/test_cds_sandbox.py @@ -5,7 +5,6 @@ from healthchain.gateway.api import HealthChainAPI from healthchain.models.requests.cdsrequest import CDSRequest from healthchain.models.responses.cdsresponse import CDSResponse, Card -from healthchain.models.hooks.prefetch import Prefetch from healthchain.fhir import create_bundle, create_condition @@ -37,7 +36,7 @@ async def handle_patient_view(request: CDSRequest) -> CDSResponse: # Load test data test_bundle = create_bundle() - prefetch_data = Prefetch(prefetch={"patient": test_bundle}) + prefetch_data = {"patient": test_bundle} client._construct_request(prefetch_data, client.workflow) # Verify request was constructed @@ -88,7 +87,7 @@ def test_cdshooks_workflows(): patient_bundle.entry = [{"resource": condition}] # Load data into client - prefetch_data = Prefetch(prefetch={"patient": patient_bundle}) + prefetch_data = {"patient": patient_bundle} client._construct_request(prefetch_data, client.workflow) # Verify request was constructed diff --git a/tests/sandbox/test_mimic_loader.py b/tests/sandbox/test_mimic_loader.py new file mode 100644 index 00000000..a0e35980 --- /dev/null +++ b/tests/sandbox/test_mimic_loader.py @@ -0,0 +1,318 @@ +"""Tests for MIMIC-on-FHIR dataset loader.""" + +import gzip +import json +import tempfile +from pathlib import Path + +import pytest + +from healthchain.sandbox.loaders.mimic import MimicOnFHIRLoader + + +@pytest.fixture +def temp_mimic_data_dir(): + """Create temporary MIMIC-on-FHIR data directory structure.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_path = Path(tmpdir) + fhir_dir = data_path / "fhir" + fhir_dir.mkdir() + yield data_path + + +@pytest.fixture +def mock_medication_resources(): + """Sample MedicationStatement resources for testing.""" + return [ + { + "resourceType": "MedicationStatement", + "id": "med-1", + "status": "recorded", + "medication": { + "concept": { + "coding": [ + { + "system": "http://www.nlm.nih.gov/research/umls/rxnorm", + "code": "313782", + } + ] + } + }, + "subject": {"reference": "Patient/123"}, + }, + { + "resourceType": "MedicationStatement", + "id": "med-2", + "status": "recorded", + "medication": { + "concept": { + "coding": [ + { + "system": "http://www.nlm.nih.gov/research/umls/rxnorm", + "code": "197361", + } + ] + } + }, + "subject": {"reference": "Patient/456"}, + }, + ] + + +@pytest.fixture +def mock_condition_resources(): + """Sample Condition resources for testing.""" + return [ + { + "resourceType": "Condition", + "id": "cond-1", + "clinicalStatus": { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", + "code": "active", + } + ] + }, + "code": { + "coding": [{"system": "http://snomed.info/sct", "code": "44054006"}] + }, + "subject": {"reference": "Patient/123"}, + } + ] + + +def create_ndjson_gz_file(file_path: Path, resources: list): + """Helper to create gzipped NDJSON file.""" + with gzip.open(file_path, "wt") as f: + for resource in resources: + f.write(json.dumps(resource) + "\n") + + +def test_mimic_loader_requires_resource_types(temp_mimic_data_dir): + """MimicOnFHIRLoader raises ValueError when resource_types is None.""" + loader = MimicOnFHIRLoader() + + with pytest.raises(ValueError, match="resource_types parameter is required"): 
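        # resource_types is required, so load() should fail fast before reading any files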
+ loader.load(data_path=str(temp_mimic_data_dir)) + + +def test_mimic_loader_raises_error_for_missing_data_path(): + """MimicOnFHIRLoader raises FileNotFoundError when data path doesn't exist.""" + loader = MimicOnFHIRLoader() + + with pytest.raises(FileNotFoundError, match="MIMIC-on-FHIR data not found"): + loader.load(data_path="/nonexistent/path", resource_types=["MimicMedication"]) + + +def test_mimic_loader_raises_error_for_missing_resource_file(temp_mimic_data_dir): + """MimicOnFHIRLoader raises FileNotFoundError when resource file doesn't exist.""" + loader = MimicOnFHIRLoader() + + with pytest.raises(FileNotFoundError, match="Resource file not found"): + loader.load( + data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + ) + + +def test_mimic_loader_loads_single_resource_type( + temp_mimic_data_dir, mock_medication_resources +): + """MimicOnFHIRLoader loads and validates single resource type.""" + # Create mock data file + fhir_dir = temp_mimic_data_dir / "fhir" + create_ndjson_gz_file( + fhir_dir / "MimicMedication.ndjson.gz", mock_medication_resources + ) + + loader = MimicOnFHIRLoader() + result = loader.load( + data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + ) + + assert isinstance(result, dict) + assert "MedicationStatement" in result + # Result dict contains a Bundle + bundle = result["MedicationStatement"] + assert type(bundle).__name__ == "Bundle" + assert len(bundle.entry) == 2 + assert bundle.entry[0].resource.id == "med-1" + + +def test_mimic_loader_loads_multiple_resource_types( + temp_mimic_data_dir, mock_medication_resources, mock_condition_resources +): + """MimicOnFHIRLoader loads multiple resource types and groups by FHIR type.""" + fhir_dir = temp_mimic_data_dir / "fhir" + create_ndjson_gz_file( + fhir_dir / "MimicMedication.ndjson.gz", mock_medication_resources + ) + create_ndjson_gz_file( + fhir_dir / "MimicCondition.ndjson.gz", mock_condition_resources + ) + + loader = MimicOnFHIRLoader() + result = loader.load( + data_path=str(temp_mimic_data_dir), + resource_types=["MimicMedication", "MimicCondition"], + ) + + assert "MedicationStatement" in result + assert "Condition" in result + # Each result value is a Bundle + med_bundle = result["MedicationStatement"] + cond_bundle = result["Condition"] + assert len(med_bundle.entry) == 2 + assert len(cond_bundle.entry) == 1 + + +@pytest.mark.parametrize("sample_size,expected_count", [(1, 1), (2, 2)]) +def test_mimic_loader_sampling_behavior( + temp_mimic_data_dir, mock_medication_resources, sample_size, expected_count +): + """MimicOnFHIRLoader samples specified number of resources.""" + fhir_dir = temp_mimic_data_dir / "fhir" + create_ndjson_gz_file( + fhir_dir / "MimicMedication.ndjson.gz", mock_medication_resources + ) + + loader = MimicOnFHIRLoader() + result = loader.load( + data_path=str(temp_mimic_data_dir), + resource_types=["MimicMedication"], + sample_size=sample_size, + ) + + bundle = result["MedicationStatement"] + assert len(bundle.entry) == expected_count + + +def test_mimic_loader_deterministic_sampling_with_seed( + temp_mimic_data_dir, mock_medication_resources +): + """MimicOnFHIRLoader produces consistent results with random_seed.""" + fhir_dir = temp_mimic_data_dir / "fhir" + create_ndjson_gz_file( + fhir_dir / "MimicMedication.ndjson.gz", mock_medication_resources + ) + + loader = MimicOnFHIRLoader() + result1 = loader.load( + data_path=str(temp_mimic_data_dir), + resource_types=["MimicMedication"], + sample_size=1, + random_seed=42, + ) + result2 = 
loader.load( + data_path=str(temp_mimic_data_dir), + resource_types=["MimicMedication"], + sample_size=1, + random_seed=42, + ) + + bundle1 = result1["MedicationStatement"] + bundle2 = result2["MedicationStatement"] + assert bundle1.entry[0].resource.id == bundle2.entry[0].resource.id + + +def test_mimic_loader_handles_malformed_json(temp_mimic_data_dir): + """MimicOnFHIRLoader skips malformed JSON lines and continues processing.""" + fhir_dir = temp_mimic_data_dir / "fhir" + file_path = fhir_dir / "MimicMedication.ndjson.gz" + + # Create file with mix of valid and malformed JSON + with gzip.open(file_path, "wt") as f: + f.write('{"invalid json\n') # Malformed + f.write( + json.dumps( + { + "resourceType": "MedicationStatement", + "id": "med-1", + "status": "recorded", + "medication": { + "concept": { + "coding": [ + { + "system": "http://www.nlm.nih.gov/research/umls/rxnorm", + "code": "313782", + } + ] + } + }, + "subject": {"reference": "Patient/123"}, + } + ) + + "\n" + ) # Valid + + loader = MimicOnFHIRLoader() + result = loader.load( + data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + ) + + # Should load the valid resource despite malformed line + bundle = result["MedicationStatement"] + assert len(bundle.entry) == 1 + + +def test_mimic_loader_raises_error_for_invalid_fhir_resources(temp_mimic_data_dir): + """Loader validates FHIR resources and raises error for invalid data.""" + fhir_dir = temp_mimic_data_dir / "fhir" + file_path = fhir_dir / "MimicMedication.ndjson.gz" + + # Create file with invalid FHIR resource (missing required fields) + invalid_resources = [ + { + "resourceType": "MedicationStatement", + "id": "med-1", + }, # Missing required fields + ] + + with gzip.open(file_path, "wt") as f: + for resource in invalid_resources: + f.write(json.dumps(resource) + "\n") + + loader = MimicOnFHIRLoader() + + # FHIR validation now catches the invalid resource + with pytest.raises(Exception): + loader.load( + data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + ) + + +def test_mimic_loader_skips_resources_without_resource_type(temp_mimic_data_dir): + """MimicOnFHIRLoader skips resources missing resourceType field.""" + fhir_dir = temp_mimic_data_dir / "fhir" + file_path = fhir_dir / "MimicMedication.ndjson.gz" + + resources = [ + {"id": "med-1", "status": "recorded"}, # No resourceType + { + "resourceType": "MedicationStatement", + "id": "med-2", + "status": "recorded", + "medication": { + "concept": { + "coding": [ + { + "system": "http://www.nlm.nih.gov/research/umls/rxnorm", + "code": "313782", + } + ] + } + }, + "subject": {"reference": "Patient/123"}, + }, + ] + + create_ndjson_gz_file(file_path, resources) + + loader = MimicOnFHIRLoader() + result = loader.load( + data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + ) + + # Should only load the valid resource + bundle = result["MedicationStatement"] + assert len(bundle.entry) == 1 diff --git a/tests/sandbox/test_request_constructors.py b/tests/sandbox/test_request_constructors.py index cd9e4ae5..31243bd0 100644 --- a/tests/sandbox/test_request_constructors.py +++ b/tests/sandbox/test_request_constructors.py @@ -6,7 +6,6 @@ ClinDocRequestConstructor, ) from healthchain.sandbox.workflows import Workflow -from healthchain.models.hooks.prefetch import Prefetch from healthchain.sandbox.base import ApiProtocol from healthchain.fhir import create_bundle @@ -29,8 +28,8 @@ def test_cds_request_constructor_validation(): """Test validation of workflows in 
CdsRequestConstructor""" constructor = CdsRequestConstructor() - # Create a prefetch object - prefetch = Prefetch(prefetch={"patient": create_bundle()}) + # Create a prefetch dict + prefetch = {"patient": create_bundle()} # Test with valid workflow valid_workflow = Workflow.patient_view @@ -46,15 +45,16 @@ def test_cds_request_constructor_validation(): def test_cds_request_constructor_type_error(): - """Test type error handling in CdsRequestConstructor""" + """Test validation error handling in CdsRequestConstructor""" constructor = CdsRequestConstructor() - # Test with invalid prefetch data type - should raise TypeError - with pytest.raises(TypeError): - # Not a Prefetch object - invalid_prefetch = {"patient": create_bundle()} + # Test with invalid workflow - should raise ValueError + with pytest.raises(ValueError): + # Invalid workflow + invalid_workflow = MagicMock() + invalid_workflow.value = "invalid-workflow" constructor.construct_request( - prefetch_data=invalid_prefetch, workflow=Workflow.patient_view + prefetch_data={"patient": create_bundle()}, workflow=invalid_workflow ) @@ -62,9 +62,9 @@ def test_cds_request_construction(): """Test request construction in CdsRequestConstructor""" constructor = CdsRequestConstructor() - # Create a bundle and prefetch + # Create a bundle and prefetch dict bundle = create_bundle() - prefetch = Prefetch(prefetch={"patient": bundle}) + prefetch = {"patient": bundle} # Construct a request request = constructor.construct_request( @@ -76,7 +76,7 @@ def test_cds_request_construction(): # Verify request properties assert request.hook == "patient-view" assert request.context.patientId == "test-patient-123" - assert request.prefetch == prefetch.prefetch + assert request.prefetch == prefetch def test_clindoc_request_constructor_init(): @@ -185,7 +185,7 @@ def test_cds_request_construction_with_custom_context(): """CdsRequestConstructor includes custom context parameters in request.""" constructor = CdsRequestConstructor() bundle = create_bundle() - prefetch = Prefetch(prefetch={"patient": bundle}) + prefetch = {"patient": bundle} # Test with custom context custom_context = {"patientId": "patient-123", "encounterId": "encounter-456"} @@ -201,7 +201,7 @@ def test_cds_request_construction_with_custom_context(): def test_cds_request_validates_workflow_for_clinical_doc(): """CdsRequestConstructor rejects ClinicalDocumentation workflows.""" constructor = CdsRequestConstructor() - prefetch = Prefetch(prefetch={"patient": create_bundle()}) + prefetch = {"patient": create_bundle()} # Should reject sign-note workflows with pytest.raises(ValueError, match="Invalid workflow"): diff --git a/tests/sandbox/test_sandbox_client.py b/tests/sandbox/test_sandbox_client.py index e0498d79..c52c18a0 100644 --- a/tests/sandbox/test_sandbox_client.py +++ b/tests/sandbox/test_sandbox_client.py @@ -164,21 +164,21 @@ def test_load_from_path_json_without_workflow_fails(tmp_path): def test_load_from_path_invalid_json_prefetch(tmp_path): - """load_from_path rejects malformed JSON Prefetch data.""" - json_file = tmp_path / "invalid.json" + """load_from_path processes JSON data for prefetch.""" + json_file = tmp_path / "data.json" json_file.write_text('{"not_prefetch": "data"}') client = SandboxClient( api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" ) - with pytest.raises(ValueError, match="not valid Prefetch format"): - client.load_from_path(str(json_file)) + # Should load the JSON data without error since we're using plain dicts now + 
client.load_from_path(str(json_file)) + assert len(client.request_data) == 1 def test_save_results_distinguishes_protocols(tmp_path): """save_results uses correct file extension based on protocol.""" - from healthchain.models import Prefetch from healthchain.fhir import create_bundle from healthchain.sandbox.workflows import Workflow @@ -186,7 +186,7 @@ def test_save_results_distinguishes_protocols(tmp_path): rest_client = SandboxClient( api_url="http://localhost:8000", endpoint="/test", protocol="rest" ) - prefetch = Prefetch(prefetch={"patient": create_bundle()}) + prefetch = {"patient": create_bundle()} rest_client._construct_request(prefetch, Workflow.patient_view) rest_client.responses = [{"cards": []}] @@ -213,10 +213,9 @@ def test_save_results_distinguishes_protocols(tmp_path): def test_construct_request_requires_workflow_for_rest(): """_construct_request raises ValueError if workflow missing for REST protocol.""" client = SandboxClient(api_url="http://localhost:8000", endpoint="/test") - from healthchain.models import Prefetch from healthchain.fhir import create_bundle - prefetch = Prefetch(prefetch={"patient": create_bundle()}) + prefetch = {"patient": create_bundle()} with pytest.raises(ValueError, match="Workflow must be specified for REST"): client._construct_request(prefetch, None) From 93b2cd83a72cc6949dc774465d057c598fcdadd5 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Fri, 31 Oct 2025 16:25:36 +0000 Subject: [PATCH 02/12] Tidy sandboxclient API and tests --- cookbook/cds_discharge_summarizer_hf_chat.py | 2 +- cookbook/cds_discharge_summarizer_hf_trf.py | 2 +- cookbook/notereader_clinical_coding_fhir.py | 5 +- healthchain/gateway/fhir/aio.py | 12 +- healthchain/gateway/fhir/sync.py | 10 +- healthchain/sandbox/sandboxclient.py | 238 +++++++++---- scripts/healthchainapi_e2e_demo.py | 6 +- tests/gateway/test_fhir_gateway.py | 44 +-- tests/gateway/test_fhir_gateway_async.py | 39 +-- tests/sandbox/test_cds_sandbox.py | 4 +- tests/sandbox/test_clindoc_sandbox.py | 4 +- tests/sandbox/test_sandbox_client.py | 348 +++++++++++++++++-- 12 files changed, 544 insertions(+), 170 deletions(-) diff --git a/cookbook/cds_discharge_summarizer_hf_chat.py b/cookbook/cds_discharge_summarizer_hf_chat.py index ad0a15c0..3a6496d4 100644 --- a/cookbook/cds_discharge_summarizer_hf_chat.py +++ b/cookbook/cds_discharge_summarizer_hf_chat.py @@ -83,10 +83,10 @@ def start_api(): client = SandboxClient( api_url="http://localhost:8000", endpoint="/cds/cds-services/discharge-summarizer", + workflow="encounter-discharge", ) # Load discharge notes from CSV client.load_free_text( - workflow="encounter-discharge", csv_path="data/discharge_notes.csv", column_name="text", ) diff --git a/cookbook/cds_discharge_summarizer_hf_trf.py b/cookbook/cds_discharge_summarizer_hf_trf.py index 2cb49baa..635fafa5 100644 --- a/cookbook/cds_discharge_summarizer_hf_trf.py +++ b/cookbook/cds_discharge_summarizer_hf_trf.py @@ -56,10 +56,10 @@ def start_api(): client = SandboxClient( api_url="http://localhost:8000", endpoint="/cds/cds-services/discharge-summarizer", + workflow="encounter-discharge", ) # Load discharge notes from CSV client.load_free_text( - workflow="encounter-discharge", csv_path="data/discharge_notes.csv", column_name="text", ) diff --git a/cookbook/notereader_clinical_coding_fhir.py b/cookbook/notereader_clinical_coding_fhir.py index 76e9e8b3..52967a3d 100644 --- a/cookbook/notereader_clinical_coding_fhir.py +++ b/cookbook/notereader_clinical_coding_fhir.py @@ -132,7 +132,10 @@ def run_server(): # 
Create sandbox client for testing client = SandboxClient( - api_url="http://localhost:8000", endpoint="/notereader/fhir/", protocol="soap" + api_url="http://localhost:8000", + endpoint="/notereader/fhir/", + workflow="sign-note-inpatient", + protocol="soap", ) # Load clinical document from file client.load_from_path("./data/notereader_cda.xml") diff --git a/healthchain/gateway/fhir/aio.py b/healthchain/gateway/fhir/aio.py index de88d6c3..44c849bb 100644 --- a/healthchain/gateway/fhir/aio.py +++ b/healthchain/gateway/fhir/aio.py @@ -216,19 +216,23 @@ async def search( client_kwargs={"params": params}, ) - # Handle pagination if requested + # Handle pagination if requested if follow_pagination: all_entries = bundle.entry or [] page_count = 1 while bundle.link: - next_link = next((link for link in bundle.link if link.relation == "next"), None) + next_link = next( + (link for link in bundle.link if link.relation == "next"), None + ) if not next_link or (max_pages and page_count >= max_pages): break # Extract the relative URL from the next link - next_url = next_link.url.split("/")[-2:] # Get resource_type/_search part - next_params = dict(pair.split("=") for pair in next_link.url.split("?")[1].split("&")) + # next_url = next_link.url.split("/")[-2:] # Get resource_type/_search part + next_params = dict( + pair.split("=") for pair in next_link.url.split("?")[1].split("&") + ) bundle = await self._execute_with_client( "search", diff --git a/healthchain/gateway/fhir/sync.py b/healthchain/gateway/fhir/sync.py index ab0ebdb6..6a2bf464 100644 --- a/healthchain/gateway/fhir/sync.py +++ b/healthchain/gateway/fhir/sync.py @@ -280,13 +280,17 @@ def search( page_count = 1 while bundle.link: - next_link = next((link for link in bundle.link if link.relation == "next"), None) + next_link = next( + (link for link in bundle.link if link.relation == "next"), None + ) if not next_link or (max_pages and page_count >= max_pages): break # Extract the relative URL from the next link - next_url = next_link.url.split("/")[-2:] # Get resource_type/_search part - next_params = dict(pair.split("=") for pair in next_link.url.split("?")[1].split("&")) + # next_url = next_link.url.split("/")[-2:] # Get resource_type/_search part + next_params = dict( + pair.split("=") for pair in next_link.url.split("?")[1].split("&") + ) bundle = self._execute_with_client( "search", diff --git a/healthchain/sandbox/sandboxclient.py b/healthchain/sandbox/sandboxclient.py index 2d595637..752321a4 100644 --- a/healthchain/sandbox/sandboxclient.py +++ b/healthchain/sandbox/sandboxclient.py @@ -71,7 +71,7 @@ def __init__( self, api_url: str, endpoint: str, - workflow: Optional[Union[Workflow, str]] = None, + workflow: Union[Workflow, str], protocol: Literal["rest", "soap"] = "rest", timeout: float = 10.0, ): @@ -81,12 +81,12 @@ def __init__( Args: api_url: Base URL of the service (e.g., "http://localhost:8000") endpoint: Service endpoint path (e.g., "/cds/cds-services/my-service") - workflow: Optional workflow specification (auto-detected if not provided) + workflow: Workflow specification (required) - determines request type and validation protocol: Communication protocol - "rest" for CDS Hooks, "soap" for CDA timeout: Request timeout in seconds Raises: - ValueError: If api_url or endpoint is invalid + ValueError: If api_url, endpoint, or workflow-protocol combination is invalid """ try: self.api = httpx.URL(api_url) @@ -103,15 +103,44 @@ def __init__( self.responses: List[Dict] = [] self.sandbox_id = uuid.uuid4() + # Single validation point 
- fail fast on incompatible workflow-protocol + self._validate_workflow_protocol() + log.info( f"Initialized SandboxClient {self.sandbox_id} for {self.api}{self.endpoint}" ) + def _validate_workflow_protocol(self) -> None: + """ + Validate workflow is compatible with protocol. + + Raises: + ValueError: If workflow-protocol combination is invalid + """ + from healthchain.sandbox.workflows import UseCaseMapping + + if self.protocol == ApiProtocol.soap: + # SOAP only works with ClinicalDocumentation workflows + soap_workflows = UseCaseMapping.ClinicalDocumentation.allowed_workflows + if self.workflow.value not in soap_workflows: + raise ValueError( + f"Workflow '{self.workflow.value}' is not compatible with SOAP protocol. " + f"SOAP requires Clinical Documentation workflows: {soap_workflows}" + ) + + elif self.protocol == ApiProtocol.rest: + # REST only works with CDS workflows + rest_workflows = UseCaseMapping.ClinicalDecisionSupport.allowed_workflows + if self.workflow.value not in rest_workflows: + raise ValueError( + f"Workflow '{self.workflow.value}' is not compatible with REST protocol. " + f"REST requires CDS workflows: {rest_workflows}" + ) + def load_from_registry( self, source: str, data_path: str, - workflow: Optional[Union[Workflow, str]] = None, **kwargs: Any, ) -> "SandboxClient": """ @@ -123,15 +152,13 @@ def load_from_registry( Args: source: Dataset name (e.g., "mimic-on-fhir", "synthea") data_path: Path to the dataset files - workflow: CDS workflow type (e.g., "encounter-discharge", "patient-view"). - If not provided, uses the workflow set on client initialization. **kwargs: Dataset-specific parameters (e.g., resource_types, sample_size) Returns: Self for method chaining Raises: - ValueError: If dataset not found in registry or workflow not specified + ValueError: If dataset not found in registry FileNotFoundError: If data_path doesn't exist Examples: @@ -140,10 +167,15 @@ def load_from_registry( >>> print(list_available_datasets()) Load MIMIC dataset: + >>> client = SandboxClient( + ... api_url="http://localhost:8000", + ... endpoint="/cds/patient-view", + ... workflow="patient-view", + ... protocol="rest" + ... ) >>> client.load_from_registry( ... "mimic-on-fhir", ... data_path="./data/mimic-fhir", - ... workflow="patient-view", ... resource_types=["MimicMedication"], ... sample_size=10 ... ) @@ -153,10 +185,7 @@ def load_from_registry( log.info(f"Loading dataset from registry: {source}") try: loaded_data = DatasetRegistry.load(source, data_path=data_path, **kwargs) - workflow_enum = ( - Workflow(workflow) if isinstance(workflow, str) else workflow - ) - self._construct_request(loaded_data, workflow_enum) + self._construct_request(loaded_data) log.info(f"Loaded {source} dataset with {len(self.request_data)} requests") except KeyError: raise ValueError( @@ -169,7 +198,6 @@ def load_from_path( self, path: Union[str, Path], pattern: Optional[str] = None, - workflow: Optional[Union[Workflow, str]] = None, ) -> "SandboxClient": """ Load data from file system path. 
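+        XML files are loaded as CDA documents and JSON files as CDS prefetch data;
+        requests are constructed using the workflow set at client initialization.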
@@ -182,7 +210,6 @@ def load_from_path( Args: path: File path or directory path pattern: Glob pattern for filtering files in directory (e.g., "*.xml") - workflow: Optional workflow override (auto-detected from protocol if not provided) Returns: Self for method chaining @@ -197,9 +224,6 @@ def load_from_path( Load directory of CDA files: >>> client.load_from_path("./data/cda_files/", pattern="*.xml") - - Load with explicit workflow: - >>> client.load_from_path("./data/note.xml", workflow="sign-note-inpatient") """ path = Path(path) if not path.exists(): @@ -229,12 +253,7 @@ def load_from_path( if extension == ".xml": with open(file_path, "r") as f: xml_content = f.read() - workflow_enum = ( - Workflow(workflow) - if isinstance(workflow, str) - else workflow or self.workflow or Workflow.sign_note_inpatient - ) - self._construct_request(xml_content, workflow_enum) + self._construct_request(xml_content) log.info(f"Loaded CDA document from {file_path.name}") elif extension == ".json": @@ -242,18 +261,7 @@ def load_from_path( json_data = json.load(f) try: - # Load as prefetch dict - workflow_enum = ( - Workflow(workflow) - if isinstance(workflow, str) - else workflow or self.workflow - ) - if not workflow_enum: - raise ValueError( - "Workflow must be specified when loading JSON prefetch data. " - "Provide via 'workflow' parameter or set on client initialization." - ) - self._construct_request(json_data, workflow_enum) + self._construct_request(json_data) log.info(f"Loaded prefetch data from {file_path.name}") except Exception as e: @@ -275,7 +283,6 @@ def load_free_text( self, csv_path: str, column_name: str, - workflow: Union[Workflow, str], random_seed: Optional[int] = None, **kwargs: Any, ) -> "SandboxClient": @@ -283,13 +290,12 @@ def load_free_text( Generates a CDS prefetch from free text notes. Reads clinical notes from a CSV file and wraps it in FHIR DocumentReferences - in a CDS prefetch field for CDS Hooks workflows. Generates additional synthetic - FHIR resources as needed based on the specified workflow. + in a CDS prefetch field for CDS Hooks workflows. Hydrates it with generated + FHIR resources as needed based on the workflow set at client initialization. Args: csv_path: Path to CSV file containing clinical notes column_name: Name of the column containing the text - workflow: CDS workflow type (e.g., "encounter-discharge", "patient-view") random_seed: Seed for reproducible data generation **kwargs: Additional parameters for data generation @@ -298,23 +304,19 @@ def load_free_text( Raises: FileNotFoundError: If CSV file doesn't exist - ValueError: If workflow is invalid or column not found + ValueError: If column not found Examples: - Generate discharge summaries: >>> client.load_free_text( ... csv_path="./data/discharge_notes.csv", ... column_name="text", - ... workflow="encounter-discharge", ... random_seed=42 ... 
) """ from .generators import CdsDataGenerator - workflow_enum = Workflow(workflow) if isinstance(workflow, str) else workflow - generator = CdsDataGenerator() - generator.set_workflow(workflow_enum) + generator.set_workflow(self.workflow) prefetch_data = generator.generate_prefetch( random_seed=random_seed, @@ -323,51 +325,139 @@ def load_free_text( **kwargs, ) - self._construct_request(prefetch_data, workflow_enum) + self._construct_request(prefetch_data) log.info( - f"Generated {len(self.request_data)} requests from free text for workflow {workflow_enum.value}" + f"Generated {len(self.request_data)} requests from free text for workflow {self.workflow.value}" ) return self - def _construct_request( - self, data: Union[Dict[str, Any], Any], workflow: Optional[Workflow] = None - ) -> None: + def _construct_request(self, data: Union[Dict[str, Any], Any]) -> None: """ Convert data to request format and add to queue. Args: data: Data to convert (Dict for CDS prefetch, string for CDA) - workflow: Workflow to use for request construction """ - workflow = workflow or self.workflow - if self.protocol == ApiProtocol.rest: - if not workflow: - raise ValueError( - "Workflow must be specified for REST/CDS Hooks requests" - ) constructor = CdsRequestConstructor() - request = constructor.construct_request(data, workflow) + request = constructor.construct_request(data, self.workflow) elif self.protocol == ApiProtocol.soap: constructor = ClinDocRequestConstructor() - request = constructor.construct_request( - data, workflow or Workflow.sign_note_inpatient - ) + request = constructor.construct_request(data, self.workflow) else: raise ValueError(f"Unsupported protocol: {self.protocol}") self.request_data.append(request) + def clear_requests(self) -> "SandboxClient": + """ + Clear all queued requests. + + Useful when you want to start fresh without creating a new client instance. + + Returns: + Self for method chaining + """ + count = len(self.request_data) + self.request_data.clear() + log.info(f"Cleared {count} queued request(s)") + + return self + + def preview_requests(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: + """ + Get preview of queued requests without sending them. + + Provides a summary view of what requests are queued, useful for debugging + and verifying data was loaded correctly before sending. + + Args: + limit: Maximum number of requests to preview. If None, preview all. + + Returns: + List of request summary dictionaries containing metadata + + Examples: + >>> client.load_free_text(csv_path="data.csv", column_name="text", workflow="encounter-discharge") + >>> previews = client.preview_requests(limit=3) + >>> for p in previews: + ... print(f"Request {p['index']}: {p['type']} for {p['protocol']}") + """ + requests = self.request_data[:limit] if limit else self.request_data + previews = [] + + for idx, req in enumerate(requests): + preview = { + "index": idx, + "type": req.__class__.__name__, + "protocol": self.protocol.value + if hasattr(self.protocol, "value") + else str(self.protocol), + } + + # Add protocol-specific info + if self.protocol == ApiProtocol.rest and hasattr(req, "hook"): + preview["hook"] = req.hook + preview["hookInstance"] = getattr(req, "hookInstance", None) + elif self.protocol == ApiProtocol.soap: + preview["has_document"] = hasattr(req, "document") + + previews.append(preview) + + return previews + + def get_request_data( + self, format: Literal["raw", "dict", "json"] = "dict" + ) -> Union[List, str]: + """ + Get raw request data for inspection. 
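+        The queue holds the request models built at load time (CDSRequest for
+        REST, CdaRequest for SOAP); "dict" and "json" are serialized views of
+        the same objects.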
+ + Allows direct access to request data for debugging or custom processing. + + Args: + format: Return format - "raw" for list of request objects, + "dict" for list of dictionaries, "json" for JSON string + + Returns: + Request data in specified format + + Raises: + ValueError: If format is not one of "raw", "dict", or "json" + + Examples: + >>> client.load_from_path("data.xml") + >>> # Get as dictionaries + >>> dicts = client.get_request_data("dict") + >>> # Get as JSON string + >>> json_str = client.get_request_data("json") + >>> print(json_str) + """ + if format == "raw": + return self.request_data + elif format == "dict": + result = [] + for req in self.request_data: + if hasattr(req, "model_dump"): + result.append(req.model_dump(exclude_none=True)) + elif hasattr(req, "model_dump_xml"): + result.append({"document": req.model_dump_xml()}) + else: + result.append(req) + return result + elif format == "json": + return json.dumps(self.get_request_data("dict"), indent=2) + else: + raise ValueError( + f"Invalid format '{format}'. Must be 'raw', 'dict', or 'json'" + ) + def send_requests(self) -> List[Dict]: """ Send all queued requests to the service. Returns: List of response dictionaries - - Raises: - RuntimeError: If no requests are queued """ if not self.request_data: raise RuntimeError( @@ -441,9 +531,6 @@ def save_results(self, directory: Union[str, Path] = "./output/") -> None: Args: directory: Directory to save data to (default: "./output/") - - Raises: - RuntimeError: If no responses are available to save """ if not self.responses: raise RuntimeError( @@ -503,6 +590,23 @@ def get_status(self) -> Dict[str, Any]: "responses_received": len(self.responses), } + def __enter__(self) -> "SandboxClient": + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """ + Context manager exit - auto-save results if responses exist. + + Only saves if no exception occurred and responses were generated. 
+ """ + if self.responses and exc_type is None: + try: + self.save_results() + log.info("Auto-saved results on context exit") + except Exception as e: + log.warning(f"Failed to auto-save results: {e}") + def __repr__(self) -> str: """String representation of SandboxClient.""" return ( diff --git a/scripts/healthchainapi_e2e_demo.py b/scripts/healthchainapi_e2e_demo.py index 0b6870d0..de76a29f 100644 --- a/scripts/healthchainapi_e2e_demo.py +++ b/scripts/healthchainapi_e2e_demo.py @@ -57,6 +57,11 @@ from healthchain.sandbox import SandboxClient from healthchain.fhir import create_document_reference +from dotenv import load_dotenv + +load_dotenv() + + # Configuration CONFIG = { "server": { @@ -430,7 +435,6 @@ def create_sandboxes(): cds_client.load_free_text( csv_path=CONFIG["data"]["discharge_notes_path"], column_name="text", - workflow=CONFIG["workflows"]["cds"], ) print_success("Sandbox environments created") diff --git a/tests/gateway/test_fhir_gateway.py b/tests/gateway/test_fhir_gateway.py index 14cf289e..fea4207d 100644 --- a/tests/gateway/test_fhir_gateway.py +++ b/tests/gateway/test_fhir_gateway.py @@ -285,32 +285,26 @@ def test_search_with_empty_bundle(): ) assert result.entry is None + def test_search_with_pagination(fhir_gateway): """Gateway.search fetches all pages when pagination is enabled.""" # Create mock bundles for pagination page1 = Bundle( type="searchset", entry=[BundleEntry(resource=Patient(id="1"))], - link=[{"relation": "next", "url": "Patient?page=2"}] + link=[{"relation": "next", "url": "Patient?page=2"}], ) page2 = Bundle( type="searchset", entry=[BundleEntry(resource=Patient(id="2"))], - link=[{"relation": "next", "url": "Patient?page=3"}] - ) - page3 = Bundle( - type="searchset", - entry=[BundleEntry(resource=Patient(id="3"))] + link=[{"relation": "next", "url": "Patient?page=3"}], ) + page3 = Bundle(type="searchset", entry=[BundleEntry(resource=Patient(id="3"))]) with patch.object( fhir_gateway, "_execute_with_client", side_effect=[page1, page2, page3] ) as mock_execute: - result = fhir_gateway.search( - Patient, - {"name": "Smith"}, - follow_pagination=True - ) + result = fhir_gateway.search(Patient, {"name": "Smith"}, follow_pagination=True) assert mock_execute.call_count == 3 assert result.entry is not None @@ -324,22 +318,19 @@ def test_search_with_max_pages(fhir_gateway): page1 = Bundle( type="searchset", entry=[BundleEntry(resource=Patient(id="1"))], - link=[{"relation": "next", "url": "Patient?page=2"}] + link=[{"relation": "next", "url": "Patient?page=2"}], ) page2 = Bundle( type="searchset", entry=[BundleEntry(resource=Patient(id="2"))], - link=[{"relation": "next", "url": "Patient?page=3"}] + link=[{"relation": "next", "url": "Patient?page=3"}], ) with patch.object( fhir_gateway, "_execute_with_client", side_effect=[page1, page2] ) as mock_execute: result = fhir_gateway.search( - Patient, - {"name": "Smith"}, - follow_pagination=True, - max_pages=2 + Patient, {"name": "Smith"}, follow_pagination=True, max_pages=2 ) assert mock_execute.call_count == 2 @@ -354,17 +345,13 @@ def test_search_with_pagination_empty_next_link(fhir_gateway): bundle = Bundle( type="searchset", entry=[BundleEntry(resource=Patient(id="1"))], - link=[{"relation": "self", "url": "Patient?name=Smith"}] + link=[{"relation": "self", "url": "Patient?name=Smith"}], ) with patch.object( fhir_gateway, "_execute_with_client", return_value=bundle ) as mock_execute: - result = fhir_gateway.search( - Patient, - {"name": "Smith"}, - follow_pagination=True - ) + result = 
fhir_gateway.search(Patient, {"name": "Smith"}, follow_pagination=True) mock_execute.assert_called_once() assert result.entry is not None @@ -377,12 +364,9 @@ def test_search_with_pagination_and_provenance(fhir_gateway): page1 = Bundle( type="searchset", entry=[BundleEntry(resource=Patient(id="1"))], - link=[{"relation": "next", "url": "Patient?page=2"}] - ) - page2 = Bundle( - type="searchset", - entry=[BundleEntry(resource=Patient(id="2"))] + link=[{"relation": "next", "url": "Patient?page=2"}], ) + page2 = Bundle(type="searchset", entry=[BundleEntry(resource=Patient(id="2"))]) with patch.object( fhir_gateway, "_execute_with_client", side_effect=[page1, page2] @@ -393,7 +377,7 @@ def test_search_with_pagination_and_provenance(fhir_gateway): source="test_source", follow_pagination=True, add_provenance=True, - provenance_tag="aggregated" + provenance_tag="aggregated", ) assert mock_execute.call_count == 2 @@ -404,4 +388,4 @@ def test_search_with_pagination_and_provenance(fhir_gateway): for entry in result.entry: assert entry.resource.meta is not None assert entry.resource.meta.source == "urn:healthchain:source:test_source" - assert entry.resource.meta.tag[0].code == "aggregated" \ No newline at end of file + assert entry.resource.meta.tag[0].code == "aggregated" diff --git a/tests/gateway/test_fhir_gateway_async.py b/tests/gateway/test_fhir_gateway_async.py index f0e948f9..a64235fc 100644 --- a/tests/gateway/test_fhir_gateway_async.py +++ b/tests/gateway/test_fhir_gateway_async.py @@ -126,7 +126,6 @@ async def test_search_operation_with_parameters(fhir_gateway): assert result == mock_bundle - @pytest.mark.asyncio async def test_search_with_pagination(fhir_gateway): """AsyncFHIRGateway.search fetches all pages when pagination is enabled.""" @@ -134,25 +133,20 @@ async def test_search_with_pagination(fhir_gateway): page1 = Bundle( type="searchset", entry=[{"resource": Patient(id="1")}], - link=[{"relation": "next", "url": "Patient?page=2"}] + link=[{"relation": "next", "url": "Patient?page=2"}], ) page2 = Bundle( type="searchset", entry=[{"resource": Patient(id="2")}], - link=[{"relation": "next", "url": "Patient?page=3"}] - ) - page3 = Bundle( - type="searchset", - entry=[{"resource": Patient(id="3")}] + link=[{"relation": "next", "url": "Patient?page=3"}], ) + page3 = Bundle(type="searchset", entry=[{"resource": Patient(id="3")}]) with patch.object( fhir_gateway, "_execute_with_client", side_effect=[page1, page2, page3] ) as mock_execute: result = await fhir_gateway.search( - Patient, - {"name": "Smith"}, - follow_pagination=True + Patient, {"name": "Smith"}, follow_pagination=True ) assert mock_execute.call_count == 3 @@ -167,22 +161,19 @@ async def test_search_with_max_pages(fhir_gateway): page1 = Bundle( type="searchset", entry=[{"resource": Patient(id="1")}], - link=[{"relation": "next", "url": "Patient?page=2"}] + link=[{"relation": "next", "url": "Patient?page=2"}], ) page2 = Bundle( type="searchset", entry=[{"resource": Patient(id="2")}], - link=[{"relation": "next", "url": "Patient?page=3"}] + link=[{"relation": "next", "url": "Patient?page=3"}], ) with patch.object( fhir_gateway, "_execute_with_client", side_effect=[page1, page2] ) as mock_execute: result = await fhir_gateway.search( - Patient, - {"name": "Smith"}, - follow_pagination=True, - max_pages=2 + Patient, {"name": "Smith"}, follow_pagination=True, max_pages=2 ) assert mock_execute.call_count == 2 @@ -197,16 +188,14 @@ async def test_search_with_pagination_empty_next_link(fhir_gateway): bundle = Bundle( type="searchset", 
entry=[{"resource": Patient(id="1")}], - link=[{"relation": "self", "url": "Patient?name=Smith"}] + link=[{"relation": "self", "url": "Patient?name=Smith"}], ) with patch.object( fhir_gateway, "_execute_with_client", return_value=bundle ) as mock_execute: result = await fhir_gateway.search( - Patient, - {"name": "Smith"}, - follow_pagination=True + Patient, {"name": "Smith"}, follow_pagination=True ) mock_execute.assert_called_once() @@ -221,12 +210,9 @@ async def test_search_with_pagination_and_provenance(fhir_gateway): page1 = Bundle( type="searchset", entry=[{"resource": Patient(id="1")}], - link=[{"relation": "next", "url": "Patient?page=2"}] - ) - page2 = Bundle( - type="searchset", - entry=[{"resource": Patient(id="2")}] + link=[{"relation": "next", "url": "Patient?page=2"}], ) + page2 = Bundle(type="searchset", entry=[{"resource": Patient(id="2")}]) with patch.object( fhir_gateway, "_execute_with_client", side_effect=[page1, page2] @@ -237,7 +223,7 @@ async def test_search_with_pagination_and_provenance(fhir_gateway): source="test_source", follow_pagination=True, add_provenance=True, - provenance_tag="aggregated" + provenance_tag="aggregated", ) assert mock_execute.call_count == 2 @@ -250,6 +236,7 @@ async def test_search_with_pagination_and_provenance(fhir_gateway): assert entry.resource.meta.source == "urn:healthchain:source:test_source" assert entry.resource.meta.tag[0].code == "aggregated" + @pytest.mark.asyncio async def test_modify_context_for_existing_resource(fhir_gateway, test_patient): """Modify context manager fetches, yields, and updates existing resources.""" diff --git a/tests/sandbox/test_cds_sandbox.py b/tests/sandbox/test_cds_sandbox.py index c2880f79..3d472682 100644 --- a/tests/sandbox/test_cds_sandbox.py +++ b/tests/sandbox/test_cds_sandbox.py @@ -37,7 +37,7 @@ async def handle_patient_view(request: CDSRequest) -> CDSResponse: # Load test data test_bundle = create_bundle() prefetch_data = {"patient": test_bundle} - client._construct_request(prefetch_data, client.workflow) + client._construct_request(prefetch_data) # Verify request was constructed assert len(client.request_data) == 1 @@ -88,7 +88,7 @@ def test_cdshooks_workflows(): # Load data into client prefetch_data = {"patient": patient_bundle} - client._construct_request(prefetch_data, client.workflow) + client._construct_request(prefetch_data) # Verify request was constructed assert len(client.request_data) == 1 diff --git a/tests/sandbox/test_clindoc_sandbox.py b/tests/sandbox/test_clindoc_sandbox.py index 9d2b0fab..24ad9969 100644 --- a/tests/sandbox/test_clindoc_sandbox.py +++ b/tests/sandbox/test_clindoc_sandbox.py @@ -32,7 +32,7 @@ def process_document(cda_request: CdaRequest) -> CdaResponse: # Load test document test_document = "document" - client._construct_request(test_document, client.workflow) + client._construct_request(test_document) # Verify request was constructed assert len(client.request_data) == 1 @@ -82,7 +82,7 @@ def test_notereader_sandbox_workflow_execution(): # Load clinical document clinical_document = "Test content" - client._construct_request(clinical_document, client.workflow) + client._construct_request(clinical_document) # Verify request was constructed assert len(client.request_data) == 1 diff --git a/tests/sandbox/test_sandbox_client.py b/tests/sandbox/test_sandbox_client.py index c52c18a0..6734f323 100644 --- a/tests/sandbox/test_sandbox_client.py +++ b/tests/sandbox/test_sandbox_client.py @@ -1,15 +1,18 @@ import pytest import json +from unittest.mock import Mock, patch from 
healthchain.sandbox import SandboxClient def test_load_from_registry_unknown_dataset(): """load_from_registry raises ValueError for unknown datasets.""" - client = SandboxClient(api_url="http://localhost:8000", endpoint="/test") + client = SandboxClient( + api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" + ) with pytest.raises(ValueError, match="Unknown dataset"): - client.load_from_registry("nonexistent-dataset") + client.load_from_registry("nonexistent-dataset", data_path="/test") def test_load_from_path_single_xml_file(tmp_path): @@ -19,7 +22,10 @@ def test_load_from_path_single_xml_file(tmp_path): cda_file.write_text("Test CDA") client = SandboxClient( - api_url="http://localhost:8000", endpoint="/notereader/fhir/", protocol="soap" + api_url="http://localhost:8000", + endpoint="/notereader/fhir/", + workflow="sign-note-inpatient", + protocol="soap", ) result = client.load_from_path(str(cda_file)) @@ -36,7 +42,10 @@ def test_load_from_path_directory_with_pattern(tmp_path): (tmp_path / "other.txt").write_text("Not XML") client = SandboxClient( - api_url="http://localhost:8000", endpoint="/notereader/fhir/", protocol="soap" + api_url="http://localhost:8000", + endpoint="/notereader/fhir/", + workflow="sign-note-inpatient", + protocol="soap", ) client.load_from_path(str(tmp_path), pattern="*.xml") @@ -51,7 +60,10 @@ def test_load_from_path_directory_all_files(tmp_path): (tmp_path / "note2.xml").write_text("Note 2") client = SandboxClient( - api_url="http://localhost:8000", endpoint="/notereader/fhir/", protocol="soap" + api_url="http://localhost:8000", + endpoint="/notereader/fhir/", + workflow="sign-note-inpatient", + protocol="soap", ) client.load_from_path(str(tmp_path)) @@ -62,7 +74,10 @@ def test_load_from_path_directory_all_files(tmp_path): def test_load_from_path_error_handling(tmp_path): """load_from_path raises FileNotFoundError for nonexistent path.""" client = SandboxClient( - api_url="http://localhost:8000", endpoint="/notereader/fhir/", protocol="soap" + api_url="http://localhost:8000", + endpoint="/notereader/fhir/", + workflow="sign-note-inpatient", + protocol="soap", ) with pytest.raises(FileNotFoundError): @@ -78,12 +93,15 @@ def test_load_free_text_generates_data(tmp_path): csv_file = tmp_path / "test.csv" csv_file.write_text("text\nSample discharge note\n") - client = SandboxClient(api_url="http://localhost:8000", endpoint="/test") + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="encounter-discharge", + ) client.load_free_text( csv_path=str(csv_file), column_name="text", - workflow="encounter-discharge", random_seed=42, ) assert len(client.request_data) > 0 @@ -91,7 +109,9 @@ def test_load_free_text_generates_data(tmp_path): def test_send_requests_without_data(): """send_requests raises RuntimeError if no data is loaded.""" - client = SandboxClient(api_url="http://localhost:8000", endpoint="/test") + client = SandboxClient( + api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" + ) with pytest.raises(RuntimeError, match="No requests to send"): client.send_requests() @@ -99,7 +119,9 @@ def test_send_requests_without_data(): def test_save_results_without_responses(): """save_results raises RuntimeError if no responses available.""" - client = SandboxClient(api_url="http://localhost:8000", endpoint="/test") + client = SandboxClient( + api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" + ) with pytest.raises(RuntimeError, match="No responses to save"): 
client.save_results() @@ -124,7 +146,9 @@ def test_get_status(): def test_repr(): """__repr__ returns meaningful string representation.""" - client = SandboxClient(api_url="http://localhost:8000", endpoint="/test") + client = SandboxClient( + api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" + ) repr_str = repr(client) @@ -152,17 +176,6 @@ def test_load_from_path_json_prefetch_file(tmp_path): assert client.request_data[0].hook == "patient-view" -def test_load_from_path_json_without_workflow_fails(tmp_path): - """load_from_path requires workflow for JSON Prefetch files.""" - json_file = tmp_path / "prefetch.json" - json_file.write_text('{"prefetch": {}}') - - client = SandboxClient(api_url="http://localhost:8000", endpoint="/test") - - with pytest.raises(ValueError, match="Workflow must be specified"): - client.load_from_path(str(json_file)) - - def test_load_from_path_invalid_json_prefetch(tmp_path): """load_from_path processes JSON data for prefetch.""" json_file = tmp_path / "data.json" @@ -180,14 +193,16 @@ def test_load_from_path_invalid_json_prefetch(tmp_path): def test_save_results_distinguishes_protocols(tmp_path): """save_results uses correct file extension based on protocol.""" from healthchain.fhir import create_bundle - from healthchain.sandbox.workflows import Workflow # Test REST/JSON protocol rest_client = SandboxClient( - api_url="http://localhost:8000", endpoint="/test", protocol="rest" + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + protocol="rest", ) prefetch = {"patient": create_bundle()} - rest_client._construct_request(prefetch, Workflow.patient_view) + rest_client._construct_request(prefetch) rest_client.responses = [{"cards": []}] rest_dir = tmp_path / "rest" @@ -198,9 +213,12 @@ def test_save_results_distinguishes_protocols(tmp_path): # Test SOAP/XML protocol soap_client = SandboxClient( - api_url="http://localhost:8000", endpoint="/test", protocol="soap" + api_url="http://localhost:8000", + endpoint="/test", + workflow="sign-note-inpatient", + protocol="soap", ) - soap_client._construct_request("test", Workflow.sign_note_inpatient) + soap_client._construct_request("test") soap_client.responses = ["data"] soap_dir = tmp_path / "soap" @@ -210,12 +228,278 @@ def test_save_results_distinguishes_protocols(tmp_path): assert len(list(soap_dir.glob("**/*.json"))) == 0 -def test_construct_request_requires_workflow_for_rest(): - """_construct_request raises ValueError if workflow missing for REST protocol.""" - client = SandboxClient(api_url="http://localhost:8000", endpoint="/test") +@pytest.mark.parametrize( + "workflow,protocol,should_fail", + [ + ("patient-view", "rest", False), # Valid CDS workflow with REST + ("encounter-discharge", "rest", False), # Valid CDS workflow with REST + ("sign-note-inpatient", "soap", False), # Valid ClinDoc workflow with SOAP + ("patient-view", "soap", True), # CDS workflow with SOAP - invalid + ("sign-note-inpatient", "rest", True), # ClinDoc workflow with REST - invalid + ], +) +def test_workflow_protocol_validation(workflow, protocol, should_fail): + """SandboxClient validates workflow-protocol compatibility at initialization.""" + if should_fail: + with pytest.raises(ValueError, match="not compatible"): + SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow=workflow, + protocol=protocol, + ) + else: + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow=workflow, + protocol=protocol, + ) + assert 
client.workflow.value == workflow + + +def test_clear_requests(): + """clear_requests empties the request queue.""" + from healthchain.fhir import create_bundle + + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + ) + + # Load some data + prefetch = {"patient": create_bundle()} + client._construct_request(prefetch) + assert len(client.request_data) == 1 + + # Clear and verify + result = client.clear_requests() + assert result is client # Method chaining + assert len(client.request_data) == 0 + + +def test_preview_requests_provides_metadata(): + """preview_requests returns summary metadata without sending requests.""" from healthchain.fhir import create_bundle + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + ) + + # Load data prefetch = {"patient": create_bundle()} + client._construct_request(prefetch) + client._construct_request(prefetch) + + # Preview without sending + previews = client.preview_requests() + + assert len(previews) == 2 + assert previews[0]["index"] == 0 + assert previews[0]["type"] == "CDSRequest" + assert ( + previews[0]["protocol"] == "REST" + ) # Protocol is uppercase in actual implementation + assert previews[0]["hook"] == "patient-view" + + +def test_preview_requests_respects_limit(): + """preview_requests limits returned results when limit specified.""" + from healthchain.fhir import create_bundle + + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + ) + + # Load multiple requests + prefetch = {"patient": create_bundle()} + for _ in range(5): + client._construct_request(prefetch) + + previews = client.preview_requests(limit=2) + assert len(previews) == 2 + + +@pytest.mark.parametrize( + "format_type,check", + [ + ("raw", lambda data: isinstance(data, list)), + ("dict", lambda data: isinstance(data, list) and isinstance(data[0], dict)), + ("json", lambda data: isinstance(data, str) and json.loads(data)), + ], +) +def test_get_request_data_formats(format_type, check): + """get_request_data returns data in specified format.""" + from healthchain.fhir import create_bundle + + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + ) + + prefetch = {"patient": create_bundle()} + client._construct_request(prefetch) + + data = client.get_request_data(format=format_type) + + assert check(data) + + +def test_get_request_data_invalid_format(): + """get_request_data raises ValueError for invalid format.""" + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + ) + + with pytest.raises(ValueError, match="Invalid format"): + client.get_request_data(format="invalid") + + +def test_context_manager_auto_saves_on_success(tmp_path): + """Context manager auto-saves results when responses exist and no exception.""" + from healthchain.fhir import create_bundle + + with SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + ) as client: + prefetch = {"patient": create_bundle()} + client._construct_request(prefetch) + # Simulate responses + client.responses = [{"cards": []}] + + # Auto-save should have been triggered (saves to "./output/" by default) + + +def test_context_manager_no_save_without_responses(tmp_path): + """Context manager does not save if no responses generated.""" + with SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + 
workflow="patient-view", + ) as client: + # No requests or responses + pass + + # Should exit cleanly without trying to save + assert len(client.responses) == 0 + + +def test_context_manager_no_save_on_exception(): + """Context manager does not save if exception occurs.""" + with pytest.raises(RuntimeError): + with SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + ) as client: + client.responses = [{"cards": []}] + raise RuntimeError("Test exception") + + # Should exit without attempting save + + +@patch("httpx.Client") +def test_send_requests_rest_success(mock_client_class): + """send_requests successfully processes REST/CDS Hooks requests.""" + from healthchain.fhir import create_bundle + + # Mock successful response + mock_response = Mock() + mock_response.json.return_value = {"cards": [{"summary": "Test card"}]} + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.post.return_value = mock_response + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=None) + mock_client_class.return_value = mock_client + + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + ) + + prefetch = {"patient": create_bundle()} + client._construct_request(prefetch) + + responses = client.send_requests() + + assert len(responses) == 1 + assert responses[0]["cards"][0]["summary"] == "Test card" + assert mock_client.post.called + + +@patch("httpx.Client") +def test_send_requests_soap_success(mock_client_class): + """send_requests successfully processes SOAP/CDA requests.""" + # Mock successful response + mock_response = Mock() + mock_response.text = "Response" + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.post.return_value = mock_response + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=None) + mock_client_class.return_value = mock_client + + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="sign-note-inpatient", + protocol="soap", + ) + + client._construct_request("Test") + + responses = client.send_requests() + + assert len(responses) == 1 + # Response is processed by CdaResponse which may return empty dict if parsing fails + assert isinstance(responses[0], (str, dict)) + assert mock_client.post.called + + +@patch("httpx.Client") +def test_send_requests_handles_multiple_requests(mock_client_class): + """send_requests processes multiple queued requests sequentially.""" + from healthchain.fhir import create_bundle + + # Mock successful responses + mock_response = Mock() + mock_response.json.return_value = {"cards": []} + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.post.return_value = mock_response + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=None) + mock_client_class.return_value = mock_client + + client = SandboxClient( + api_url="http://localhost:8000", + endpoint="/test", + workflow="patient-view", + ) + + # Load multiple requests + prefetch = {"patient": create_bundle()} + client._construct_request(prefetch) + client._construct_request(prefetch) + client._construct_request(prefetch) + + responses = client.send_requests() - with pytest.raises(ValueError, match="Workflow must be specified for REST"): - client._construct_request(prefetch, None) + assert len(responses) == 3 + assert mock_client.post.call_count == 3 From 
0e1d43f85ef3a14fbc20ddf02b3e92582656b04e Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Fri, 31 Oct 2025 16:52:23 +0000 Subject: [PATCH 03/12] Consolidate url params and make synth gen more explicit --- healthchain/sandbox/datasets.py | 6 +- .../sandbox/generators/cdsdatagenerator.py | 100 ++++++------- healthchain/sandbox/loaders/mimic.py | 86 +++++++---- healthchain/sandbox/loaders/synthea.py | 14 +- healthchain/sandbox/sandboxclient.py | 135 +++++++----------- tests/sandbox/test_cds_sandbox.py | 6 +- tests/sandbox/test_clindoc_sandbox.py | 6 +- tests/sandbox/test_mimic_loader.py | 24 ++-- tests/sandbox/test_sandbox_client.py | 119 +++++++-------- 9 files changed, 229 insertions(+), 267 deletions(-) diff --git a/healthchain/sandbox/datasets.py b/healthchain/sandbox/datasets.py index 5e7ed58e..44ff9f4d 100644 --- a/healthchain/sandbox/datasets.py +++ b/healthchain/sandbox/datasets.py @@ -42,13 +42,13 @@ def register(cls, loader: DatasetLoader) -> None: log.debug(f"Registered dataset: {name}") @classmethod - def load(cls, name: str, data_path: str, **kwargs) -> Dict: + def load(cls, name: str, data_dir: str, **kwargs) -> Dict: """ Load a dataset by name. Args: name: Name of the dataset to load - data_path: Path to the dataset files + data_dir: Path to the directory containing dataset files **kwargs: Dataset-specific parameters Returns: @@ -65,7 +65,7 @@ def load(cls, name: str, data_path: str, **kwargs) -> Dict: loader = cls._datasets[name] log.info(f"Loading dataset: {name}") - return loader.load(data_path=data_path, **kwargs) + return loader.load(data_dir=data_dir, **kwargs) @classmethod def list_datasets(cls) -> List[str]: diff --git a/healthchain/sandbox/generators/cdsdatagenerator.py b/healthchain/sandbox/generators/cdsdatagenerator.py index 5f20f979..2837c315 100644 --- a/healthchain/sandbox/generators/cdsdatagenerator.py +++ b/healthchain/sandbox/generators/cdsdatagenerator.py @@ -18,24 +18,21 @@ # TODO: generate test context - move from hook models class CdsDataGenerator: """ - A class to generate CDS (Clinical Decision Support) data based on specified workflows and constraints. + Generates synthetic CDS (Clinical Decision Support) data for specified workflows. - This class provides functionality to generate synthetic FHIR resources for testing CDS systems. - It uses registered data generators to create resources like Patients, Encounters, Conditions etc. - based on configured workflows. It can also incorporate free text data from CSV files. + Uses registered generators to create FHIR resources (e.g., Patient, Encounter, Condition) according to workflow configuration. + Can optionally include free text data from a CSV file as DocumentReference. Attributes: - registry (dict): A registry mapping generator names to generator classes. - mappings (dict): A mapping of workflow names to lists of required generators. - generated_data (Dict[str, Resource]): The most recently generated FHIR resources. - workflow (str): The currently active workflow. + registry (dict): Maps generator names to classes. + mappings (dict): Maps workflows to required generators. + generated_data (Dict[str, Resource]): Most recently generated resources. + workflow (str): Currently active workflow. Example: >>> generator = CdsDataGenerator() >>> generator.set_workflow("encounter_discharge") - >>> data = generator.generate_prefetch( - ... random_seed=42 - ... 
) + >>> data = generator.generate_prefetch(random_seed=42) """ # TODO: Add ordering and logic so that patient/encounter IDs are passed to subsequent generators @@ -62,27 +59,25 @@ def __init__(self): def fetch_generator(self, generator_name: str) -> Callable: """ - Fetches a data generator class by its name from the registry. + Return the generator class by name from the registry. Args: - generator_name (str): The name of the data generator to fetch (e.g. "PatientGenerator", "EncounterGenerator") + generator_name (str): Name of the data generator. Returns: - Callable: The data generator class that can be used to generate FHIR resources. Returns None if generator not found. + Callable: Generator class, or None if not found. Example: - >>> generator = CdsDataGenerator() - >>> patient_gen = generator.fetch_generator("PatientGenerator") - >>> patient = patient_gen.generate() + >>> gen = CdsDataGenerator().fetch_generator("PatientGenerator") """ return self.registry.get(generator_name) def set_workflow(self, workflow: str) -> None: """ - Sets the current workflow to be used for data generation. + Set the current workflow name to use for data generation. - Parameters: - workflow (str): The name of the workflow to set. + Args: + workflow (str): Workflow name. """ self.workflow = workflow @@ -92,48 +87,38 @@ def generate_prefetch( free_text_path: Optional[str] = None, column_name: Optional[str] = None, random_seed: Optional[int] = None, + generate_resources: bool = True, ) -> Dict[str, Resource]: """ - Generates CDS data based on the current workflow, constraints, and optional free text data. - - This method generates FHIR resources according to the configured workflow mapping. For each - resource type in the workflow, it uses the corresponding generator to create a FHIR resource. - If free text data is provided via CSV, it will also generate a DocumentReference containing - randomly selected text from the CSV. + Generate prefetch FHIR resources and/or DocumentReference. Args: - constraints (Optional[list]): A list of constraints to apply to the data generation. - Each constraint should match the format expected by the individual generators. - free_text_path (Optional[str]): Path to a CSV file containing free text data to be - included as DocumentReferences. If provided, column_name must also be specified. - column_name (Optional[str]): The name of the column in the CSV file containing the - free text data to use. Required if free_text_path is provided. - random_seed (Optional[int]): Seed value for random number generation to ensure - reproducible results. If not provided, generation will be truly random. + constraints (Optional[list]): Constraints for resource generation. + free_text_path (Optional[str]): CSV file containing free text. + column_name (Optional[str]): CSV column for free text. + random_seed (Optional[int]): Random seed. + generate_resources (bool): If True, generate synthetic FHIR resources. Returns: - Dict[str, Resource]: A dictionary mapping resource types to generated FHIR resources. - The keys are lowercase resource type names (e.g. "patient", "encounter"). - If free text is provided, includes a "document" key with a DocumentReference. + Dict[str, Resource]: Generated resources keyed by resource type (lowercase), plus "document" if a free text entry is used. 
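+
+        Example (document-only generation; the CSV path is illustrative):
+            >>> generator = CdsDataGenerator()
+            >>> prefetch = generator.generate_prefetch(
+            ...     free_text_path="./data/notes.csv",
+            ...     column_name="text",
+            ...     generate_resources=False,
+            ... )
+            >>> list(prefetch)  # ['document'] - no synthetic resources generated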
Raises: - ValueError: If the configured workflow is not found in the mappings - FileNotFoundError: If the free_text_path is provided but file not found - ValueError: If free_text_path provided without column_name + ValueError: If workflow is not recognized, or column name is missing. + FileNotFoundError: If free_text_path does not exist. """ prefetch = {} - if self.workflow not in self.mappings.keys(): - raise ValueError(f"Workflow {self.workflow} not found in mappings") + if generate_resources: + if self.workflow not in self.mappings: + raise ValueError(f"Workflow {self.workflow} not found in mappings") - for resource in self.mappings[self.workflow]: - generator_name = resource["generator"] - generator = self.fetch_generator(generator_name) - resource = generator.generate( - constraints=constraints, random_seed=random_seed - ) - - prefetch[resource.__resource_type__.lower()] = resource + for resource in self.mappings[self.workflow]: + generator_name = resource["generator"] + generator = self.fetch_generator(generator_name) + resource = generator.generate( + constraints=constraints, random_seed=random_seed + ) + prefetch[resource.__resource_type__.lower()] = resource parsed_free_text = ( self.free_text_parser(free_text_path, column_name) @@ -155,26 +140,21 @@ def generate_prefetch( def free_text_parser(self, path_to_csv: str, column_name: str) -> List[str]: """ - Parse free text data from a CSV file. - - This method reads a CSV file and extracts text data from a specified column. The text data - can later be used to create DocumentReference resources. + Read a column of free text from a CSV file. Args: - path_to_csv (str): Path to the CSV file containing the free text data. - column_name (str): Name of the column in the CSV file to extract text from. + path_to_csv (str): Path to CSV file. + column_name (str): Column name to extract. Returns: - List[str]: List of text strings extracted from the specified column. + List[str]: Extracted text values. Raises: - FileNotFoundError: If the specified CSV file does not exist or is not a file. + FileNotFoundError: If CSV file does not exist. ValueError: If column_name is not provided. - Exception: If any other error occurs while reading/parsing the CSV file. """ text_data = [] - # Check that path_to_csv is a valid path with pathlib path = Path(path_to_csv) if not path.is_file(): raise FileNotFoundError( diff --git a/healthchain/sandbox/loaders/mimic.py b/healthchain/sandbox/loaders/mimic.py index fbfe503f..92395f48 100644 --- a/healthchain/sandbox/loaders/mimic.py +++ b/healthchain/sandbox/loaders/mimic.py @@ -29,7 +29,7 @@ class MimicOnFHIRLoader(DatasetLoader): >>> client = SandboxClient(...) >>> client.load_from_registry( ... "mimic-on-fhir", - ... data_path="./data/mimic-fhir", + ... data_dir="./data/mimic-fhir", ... resource_types=["MimicMedication", "MimicCondition"], ... sample_size=10 ... ) @@ -50,39 +50,54 @@ def description(self) -> str: def load( self, - data_path: str, + data_dir: str, resource_types: Optional[List[str]] = None, sample_size: Optional[int] = None, random_seed: Optional[int] = None, **kwargs, ) -> Dict: """ - Load MIMIC-on-FHIR data as dict containing R4B Bundles. + Load MIMIC-on-FHIR data as a dict of FHIR Bundles. Args: - data_path: Path to MIMIC-on-FHIR data directory - resource_types: List of MIMIC resource types to load (e.g., ["MimicMedication", "MimicCondition"]). - These should match the MIMIC-on-FHIR filename format (without .ndjson.gz extension). - If None, raises ValueError. 
- sample_size: Number of resources to randomly sample per resource type. - If None, loads all available resources. - random_seed: Random seed for reproducible sampling. - **kwargs: Additional parameters (reserved for future use) + data_dir: Path to root MIMIC-on-FHIR directory (expects a /fhir subdir with .ndjson.gz files) + resource_types: Resource type names to load (e.g., ["MimicMedication"]). Required. + sample_size: Number of resources to randomly sample per type (loads all if None) + random_seed: Seed for sampling + **kwargs: Reserved for future use Returns: - Dict containing R4B Bundle resources grouped by FHIR resource type. - Each Bundle contains resources of the same type. - Example: {"MedicationStatement": Bundle(...), "Condition": Bundle(...)} + Dict mapping resource type (e.g., "MedicationStatement") to FHIR R4B Bundle Raises: - FileNotFoundError: If data path doesn't exist or resource files not found - ValueError: If resource_types is None or empty, or if resource validation fails + FileNotFoundError: If directory or resource files not found + ValueError: If resource_types is None/empty or resources fail validation + + Example: + >>> loader = MimicOnFHIRLoader() + >>> loader.load(data_dir="./data/mimic-iv-fhir", resource_types=["MimicMedication"], sample_size=100) """ - data_path = Path(data_path) - if not data_path.exists(): + + data_dir = Path(data_dir) + if not data_dir.exists(): + raise FileNotFoundError( + f"MIMIC-on-FHIR data directory not found at: {data_dir}\n" + f"Please ensure the directory exists and contains a 'fhir' subdirectory with .ndjson.gz files.\n" + f"Expected structure: {data_dir}/fhir/MimicMedication.ndjson.gz, etc." + ) + + # Check if /fhir subdirectory exists + fhir_dir = data_dir / "fhir" + if not fhir_dir.exists(): raise FileNotFoundError( - f"MIMIC-on-FHIR data not found at: {data_path}. " - "Please provide a valid data_path." + f"MIMIC-on-FHIR 'fhir' subdirectory not found at: {fhir_dir}\n" + f"The loader expects data_dir to contain a 'fhir' subdirectory with .ndjson.gz resource files.\n" + f"Expected structure:\n" + f" {data_dir}/\n" + f" └── fhir/\n" + f" ├── MimicMedication.ndjson.gz\n" + f" ├── MimicCondition.ndjson.gz\n" + f" └── ... (other resource files)" ) if not resource_types: @@ -101,7 +116,7 @@ def load( for resource_type in resource_types: try: resources = self._load_resource_file( - data_path, resource_type, sample_size + data_dir, resource_type, sample_size ) # Group by FHIR resourceType (not filename) @@ -136,13 +151,13 @@ def load( return bundles def _load_resource_file( - self, data_path: Path, resource_type: str, sample_size: Optional[int] = None + self, data_dir: Path, resource_type: str, sample_size: Optional[int] = None ) -> List[Dict]: """ Load resources from a single MIMIC-on-FHIR .ndjson.gz file. Args: - data_path: Path to MIMIC-on-FHIR data directory + data_dir: Path to MIMIC-on-FHIR data directory resource_type: MIMIC resource type (e.g., "MimicMedication") sample_size: Number of resources to randomly sample @@ -156,16 +171,31 @@ def _load_resource_file( import gzip import json - # Construct file path - fhir_dir = data_path / "fhir" + # Construct file path - MIMIC-on-FHIR stores resources in /fhir subdirectory + fhir_dir = data_dir / "fhir" file_path = fhir_dir / f"{resource_type}.ndjson.gz" if not file_path.exists(): - raise FileNotFoundError( - f"Resource file not found: {file_path}. 
" - f"Expected MIMIC-on-FHIR file at {fhir_dir}/{resource_type}.ndjson.gz" + # Provide helpful error with available files + available_files = [] + if fhir_dir.exists(): + available_files = [f.stem for f in fhir_dir.glob("*.ndjson.gz")] + + error_msg = f"Resource file not found: {file_path}\n" + error_msg += ( + f"Expected MIMIC-on-FHIR file at {fhir_dir}/{resource_type}.ndjson.gz\n" ) + if available_files: + error_msg += f"\nAvailable resource files in {fhir_dir}:\n" + error_msg += "\n".join(f" - {f}" for f in available_files[:10]) + if len(available_files) > 10: + error_msg += f"\n ... and {len(available_files) - 10} more" + else: + error_msg += f"\nNo .ndjson.gz files found in {fhir_dir}" + + raise FileNotFoundError(error_msg) + # Read all resources from file as dicts resources = [] line_num = 0 diff --git a/healthchain/sandbox/loaders/synthea.py b/healthchain/sandbox/loaders/synthea.py index e68dd50b..56eafc17 100644 --- a/healthchain/sandbox/loaders/synthea.py +++ b/healthchain/sandbox/loaders/synthea.py @@ -26,7 +26,7 @@ class SyntheaLoader(DatasetLoader): >>> client = SandboxClient(...) >>> client.load_from_registry( ... "synthea", - ... data_path="./data/synthea/output/fhir", + ... data_dir="./data/synthea/output/fhir", ... sample_size=10 ... ) """ @@ -43,7 +43,7 @@ def description(self) -> str: def load( self, - data_path: str, + data_dir: str, patient_ids: Optional[List[str]] = None, sample_size: Optional[int] = None, random_seed: Optional[int] = None, @@ -53,7 +53,7 @@ def load( Load Synthea FHIR data as dict. Args: - data_path: Path to Synthea FHIR output directory + data_dir: Path to Synthea FHIR output directory patient_ids: Specific patient IDs to load sample_size: Number of random patients to sample random_seed: Random seed for reproducible sampling @@ -66,11 +66,11 @@ def load( FileNotFoundError: If data path doesn't exist NotImplementedError: Full implementation pending """ - data_path = Path(data_path) - if not data_path.exists(): + data_dir = Path(data_dir) + if not data_dir.exists(): raise FileNotFoundError( - f"Synthea data not found at: {data_path}. " - "Please provide a valid data_path." + f"Synthea data not found at: {data_dir}. " + "Please provide a valid data_dir." ) # TODO: Implement Synthea FHIR bundle loading diff --git a/healthchain/sandbox/sandboxclient.py b/healthchain/sandbox/sandboxclient.py index 752321a4..3a21a670 100644 --- a/healthchain/sandbox/sandboxclient.py +++ b/healthchain/sandbox/sandboxclient.py @@ -31,7 +31,7 @@ class SandboxClient: Simplified client for testing healthcare services with various data sources. This class provides an intuitive interface for: - - Loading test datasets (MIMIC-on-FHIR, Synthea, CSV) + - Loading test datasets (MIMIC-on-FHIR, Synthea) - Generating synthetic FHIR data - Sending requests to healthcare services - Managing request/response lifecycle @@ -39,16 +39,14 @@ class SandboxClient: Examples: Load from dataset registry: >>> client = SandboxClient( - ... api_url="http://localhost:8000", - ... endpoint="/cds/cds-services/my-service" + ... url="http://localhost:8000/cds/cds-services/my-service" ... ) >>> client.load_from_registry("mimic-on-fhir", sample_size=10) >>> responses = client.send_requests() Load CDA file from path: >>> client = SandboxClient( - ... api_url="http://localhost:8000", - ... endpoint="/notereader/fhir/", + ... url="http://localhost:8000/notereader/fhir/", ... protocol="soap" ... 
) >>> client.load_from_path("./data/clinical_note.xml") @@ -56,8 +54,7 @@ class SandboxClient: Generate data from free text: >>> client = SandboxClient( - ... api_url="http://localhost:8000", - ... endpoint="/cds/cds-services/discharge-summarizer" + ... url="http://localhost:8000/cds/cds-services/discharge-summarizer" ... ) >>> client.load_free_text( ... csv_path="./data/notes.csv", @@ -69,8 +66,7 @@ class SandboxClient: def __init__( self, - api_url: str, - endpoint: str, + url: str, workflow: Union[Workflow, str], protocol: Literal["rest", "soap"] = "rest", timeout: float = 10.0, @@ -79,21 +75,19 @@ def __init__( Initialize SandboxClient. Args: - api_url: Base URL of the service (e.g., "http://localhost:8000") - endpoint: Service endpoint path (e.g., "/cds/cds-services/my-service") + url: Full service URL (e.g., "http://localhost:8000/cds/cds-services/my-service") workflow: Workflow specification (required) - determines request type and validation protocol: Communication protocol - "rest" for CDS Hooks, "soap" for CDA timeout: Request timeout in seconds Raises: - ValueError: If api_url, endpoint, or workflow-protocol combination is invalid + ValueError: If url or workflow-protocol combination is invalid """ try: - self.api = httpx.URL(api_url) + self.url = httpx.URL(url) except Exception as e: - raise ValueError(f"Invalid API URL: {str(e)}") + raise ValueError(f"Invalid URL: {str(e)}") - self.endpoint = endpoint self.workflow = Workflow(workflow) if isinstance(workflow, str) else workflow self.protocol = ApiProtocol.soap if protocol == "soap" else ApiProtocol.rest self.timeout = timeout @@ -106,9 +100,7 @@ def __init__( # Single validation point - fail fast on incompatible workflow-protocol self._validate_workflow_protocol() - log.info( - f"Initialized SandboxClient {self.sandbox_id} for {self.api}{self.endpoint}" - ) + log.info(f"Initialized SandboxClient {self.sandbox_id} for {self.url}") def _validate_workflow_protocol(self) -> None: """ @@ -140,7 +132,7 @@ def _validate_workflow_protocol(self) -> None: def load_from_registry( self, source: str, - data_path: str, + data_dir: str, **kwargs: Any, ) -> "SandboxClient": """ @@ -151,7 +143,7 @@ def load_from_registry( Args: source: Dataset name (e.g., "mimic-on-fhir", "synthea") - data_path: Path to the dataset files + data_dir: Path to the dataset directory **kwargs: Dataset-specific parameters (e.g., resource_types, sample_size) Returns: @@ -159,23 +151,17 @@ def load_from_registry( Raises: ValueError: If dataset not found in registry - FileNotFoundError: If data_path doesn't exist + FileNotFoundError: If data_dir doesn't exist Examples: - Discover available datasets: - >>> from healthchain.sandbox import list_available_datasets - >>> print(list_available_datasets()) - Load MIMIC dataset: >>> client = SandboxClient( - ... api_url="http://localhost:8000", - ... endpoint="/cds/patient-view", + ... url="http://localhost:8000/cds/patient-view", ... workflow="patient-view", - ... protocol="rest" ... ) >>> client.load_from_registry( ... "mimic-on-fhir", - ... data_path="./data/mimic-fhir", + ... data_dir="./data/mimic-fhir", ... resource_types=["MimicMedication"], ... sample_size=10 ... 
) @@ -184,7 +170,7 @@ def load_from_registry( log.info(f"Loading dataset from registry: {source}") try: - loaded_data = DatasetRegistry.load(source, data_path=data_path, **kwargs) + loaded_data = DatasetRegistry.load(source, data_dir=data_dir, **kwargs) self._construct_request(loaded_data) log.info(f"Loaded {source} dataset with {len(self.request_data)} requests") except KeyError: @@ -200,31 +186,23 @@ def load_from_path( pattern: Optional[str] = None, ) -> "SandboxClient": """ - Load data from file system path. + Load data from a file or directory. - Supports loading single files or directories. File type is auto-detected - from extension and protocol: - - .xml files with SOAP protocol → CDA documents - - .json files with REST protocol → Pre-formatted Prefetch data + Supports single files or all matching files in a directory (with optional glob pattern). + For .xml (SOAP protocol) loads CDA; for .json (REST protocol) loads Prefetch. Args: - path: File path or directory path - pattern: Glob pattern for filtering files in directory (e.g., "*.xml") + path: File or directory path. + pattern: Glob pattern for files in directory (e.g., "*.xml"). Returns: - Self for method chaining + Self. Raises: - FileNotFoundError: If path doesn't exist - ValueError: If no matching files found or unsupported file type - - Examples: - Load single CDA file: - >>> client.load_from_path("./data/clinical_note.xml") - - Load directory of CDA files: - >>> client.load_from_path("./data/cda_files/", pattern="*.xml") + FileNotFoundError: If path does not exist. + ValueError: If no matching files are found or if path is not file/dir. """ + path = Path(path) if not path.exists(): raise FileNotFoundError(f"Path not found: {path}") @@ -283,35 +261,27 @@ def load_free_text( self, csv_path: str, column_name: str, + generate_synthetic: bool = True, random_seed: Optional[int] = None, **kwargs: Any, ) -> "SandboxClient": """ - Generates a CDS prefetch from free text notes. - - Reads clinical notes from a CSV file and wraps it in FHIR DocumentReferences - in a CDS prefetch field for CDS Hooks workflows. Hydrates it with generated - FHIR resources as needed based on the workflow set at client initialization. + Load free-text notes from a CSV column and create FHIR DocumentReferences for CDS prefetch. + Optionally include synthetic FHIR resources based on the current workflow. Args: - csv_path: Path to CSV file containing clinical notes - column_name: Name of the column containing the text - random_seed: Seed for reproducible data generation - **kwargs: Additional parameters for data generation + csv_path: Path to the CSV file + column_name: Name of the text column + generate_synthetic: Whether to add synthetic FHIR resources (default: True) + random_seed: Seed for reproducible results + **kwargs: Extra parameters for data generation Returns: - Self for method chaining + self Raises: - FileNotFoundError: If CSV file doesn't exist - ValueError: If column not found - - Examples: - >>> client.load_free_text( - ... csv_path="./data/discharge_notes.csv", - ... column_name="text", - ... random_seed=42 - ... 
) + FileNotFoundError: If the CSV file does not exist + ValueError: If the column is not found """ from .generators import CdsDataGenerator @@ -322,13 +292,20 @@ def load_free_text( random_seed=random_seed, free_text_path=csv_path, column_name=column_name, + generate_resources=generate_synthetic, **kwargs, ) self._construct_request(prefetch_data) - log.info( - f"Generated {len(self.request_data)} requests from free text for workflow {self.workflow.value}" - ) + + if generate_synthetic: + log.info( + f"Generated {len(self.request_data)} requests from free text with synthetic resources for workflow {self.workflow.value}" + ) + else: + log.info( + f"Generated {len(self.request_data)} requests from free text only (no synthetic resources)" + ) return self @@ -377,12 +354,6 @@ def preview_requests(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: Returns: List of request summary dictionaries containing metadata - - Examples: - >>> client.load_free_text(csv_path="data.csv", column_name="text", workflow="encounter-discharge") - >>> previews = client.preview_requests(limit=3) - >>> for p in previews: - ... print(f"Request {p['index']}: {p['type']} for {p['protocol']}") """ requests = self.request_data[:limit] if limit else self.request_data previews = [] @@ -464,8 +435,7 @@ def send_requests(self) -> List[Dict]: "No requests to send. Load data first using load_from_registry(), load_from_path(), or load_free_text()" ) - url = self.api.join(self.endpoint) - log.info(f"Sending {len(self.request_data)} requests to {url}") + log.info(f"Sending {len(self.request_data)} requests to {self.url}") with httpx.Client(follow_redirects=True) as client: responses: List[Dict] = [] @@ -476,7 +446,7 @@ def send_requests(self) -> List[Dict]: if self.protocol == ApiProtocol.soap: headers = {"Content-Type": "text/xml; charset=utf-8"} response = client.post( - url=str(url), + url=str(self.url), data=request.document, headers=headers, timeout=timeout, @@ -486,9 +456,9 @@ def send_requests(self) -> List[Dict]: responses.append(response_model.model_dump_xml()) else: # REST/CDS Hooks - log.debug(f"Making POST request to: {url}") + log.debug(f"Making POST request to: {self.url}") response = client.post( - url=str(url), + url=str(self.url), json=request.model_dump(exclude_none=True), timeout=timeout, ) @@ -580,8 +550,7 @@ def get_status(self) -> Dict[str, Any]: """ return { "sandbox_id": str(self.sandbox_id), - "api_url": str(self.api), - "endpoint": self.endpoint, + "url": str(self.url), "protocol": self.protocol.value if hasattr(self.protocol, "value") else str(self.protocol), @@ -610,7 +579,7 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None: def __repr__(self) -> str: """String representation of SandboxClient.""" return ( - f"SandboxClient(api_url='{self.api}', endpoint='{self.endpoint}', " + f"SandboxClient(url='{self.url}', " f"protocol='{self.protocol.value if hasattr(self.protocol, 'value') else self.protocol}', " f"requests={len(self.request_data)})" ) diff --git a/tests/sandbox/test_cds_sandbox.py b/tests/sandbox/test_cds_sandbox.py index 3d472682..405f7634 100644 --- a/tests/sandbox/test_cds_sandbox.py +++ b/tests/sandbox/test_cds_sandbox.py @@ -28,8 +28,7 @@ async def handle_patient_view(request: CDSRequest) -> CDSResponse: # Create SandboxClient client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/cds/cds-services/test-patient-view", + url="http://localhost:8000/cds/cds-services/test-patient-view", workflow="patient-view", protocol="rest", ) @@ -73,8 +72,7 @@ def 
test_cdshooks_workflows(): """Test CDSHooks sandbox with patient-view workflow""" # Create SandboxClient client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/cds/cds-services/patient-view", + url="http://localhost:8000/cds/cds-services/patient-view", workflow="patient-view", protocol="rest", ) diff --git a/tests/sandbox/test_clindoc_sandbox.py b/tests/sandbox/test_clindoc_sandbox.py index 24ad9969..2c4193dc 100644 --- a/tests/sandbox/test_clindoc_sandbox.py +++ b/tests/sandbox/test_clindoc_sandbox.py @@ -24,8 +24,7 @@ def process_document(cda_request: CdaRequest) -> CdaResponse: # Create SandboxClient for SOAP/CDA client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/notereader/fhir/", + url="http://localhost:8000/notereader/fhir/", workflow="sign-note-inpatient", protocol="soap", ) @@ -74,8 +73,7 @@ def test_notereader_sandbox_workflow_execution(): """Test executing a NoteReader workflow with SandboxClient""" # Create SandboxClient client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/notereader/fhir/", + url="http://localhost:8000/notereader/fhir/", workflow="sign-note-inpatient", protocol="soap", ) diff --git a/tests/sandbox/test_mimic_loader.py b/tests/sandbox/test_mimic_loader.py index a0e35980..1f746f38 100644 --- a/tests/sandbox/test_mimic_loader.py +++ b/tests/sandbox/test_mimic_loader.py @@ -94,15 +94,15 @@ def test_mimic_loader_requires_resource_types(temp_mimic_data_dir): loader = MimicOnFHIRLoader() with pytest.raises(ValueError, match="resource_types parameter is required"): - loader.load(data_path=str(temp_mimic_data_dir)) + loader.load(data_dir=str(temp_mimic_data_dir)) def test_mimic_loader_raises_error_for_missing_data_path(): """MimicOnFHIRLoader raises FileNotFoundError when data path doesn't exist.""" loader = MimicOnFHIRLoader() - with pytest.raises(FileNotFoundError, match="MIMIC-on-FHIR data not found"): - loader.load(data_path="/nonexistent/path", resource_types=["MimicMedication"]) + with pytest.raises(FileNotFoundError): + loader.load(data_dir="/nonexistent/path", resource_types=["MimicMedication"]) def test_mimic_loader_raises_error_for_missing_resource_file(temp_mimic_data_dir): @@ -111,7 +111,7 @@ def test_mimic_loader_raises_error_for_missing_resource_file(temp_mimic_data_dir with pytest.raises(FileNotFoundError, match="Resource file not found"): loader.load( - data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + data_dir=str(temp_mimic_data_dir), resource_types=["MimicMedication"] ) @@ -127,7 +127,7 @@ def test_mimic_loader_loads_single_resource_type( loader = MimicOnFHIRLoader() result = loader.load( - data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + data_dir=str(temp_mimic_data_dir), resource_types=["MimicMedication"] ) assert isinstance(result, dict) @@ -153,7 +153,7 @@ def test_mimic_loader_loads_multiple_resource_types( loader = MimicOnFHIRLoader() result = loader.load( - data_path=str(temp_mimic_data_dir), + data_dir=str(temp_mimic_data_dir), resource_types=["MimicMedication", "MimicCondition"], ) @@ -178,7 +178,7 @@ def test_mimic_loader_sampling_behavior( loader = MimicOnFHIRLoader() result = loader.load( - data_path=str(temp_mimic_data_dir), + data_dir=str(temp_mimic_data_dir), resource_types=["MimicMedication"], sample_size=sample_size, ) @@ -198,13 +198,13 @@ def test_mimic_loader_deterministic_sampling_with_seed( loader = MimicOnFHIRLoader() result1 = loader.load( - data_path=str(temp_mimic_data_dir), + data_dir=str(temp_mimic_data_dir), 
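For reference, the loader these tests exercise can also be called directly; a sketch using the renamed `data_dir` parameter (the directory path is illustrative):

```python
from healthchain.sandbox.loaders import MimicOnFHIRLoader

loader = MimicOnFHIRLoader()
bundles = loader.load(
    data_dir="./data/mimic-on-fhir",     # illustrative path
    resource_types=["MimicMedication"],  # required, as the tests above verify
    sample_size=2,
    random_seed=42,                      # deterministic sampling, tested below
)
# Each value in `bundles` is a FHIR Bundle grouping resources of one type.
```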
resource_types=["MimicMedication"], sample_size=1, random_seed=42, ) result2 = loader.load( - data_path=str(temp_mimic_data_dir), + data_dir=str(temp_mimic_data_dir), resource_types=["MimicMedication"], sample_size=1, random_seed=42, @@ -247,7 +247,7 @@ def test_mimic_loader_handles_malformed_json(temp_mimic_data_dir): loader = MimicOnFHIRLoader() result = loader.load( - data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + data_dir=str(temp_mimic_data_dir), resource_types=["MimicMedication"] ) # Should load the valid resource despite malformed line @@ -277,7 +277,7 @@ def test_mimic_loader_raises_error_for_invalid_fhir_resources(temp_mimic_data_di # FHIR validation now catches the invalid resource with pytest.raises(Exception): loader.load( - data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + data_dir=str(temp_mimic_data_dir), resource_types=["MimicMedication"] ) @@ -310,7 +310,7 @@ def test_mimic_loader_skips_resources_without_resource_type(temp_mimic_data_dir) loader = MimicOnFHIRLoader() result = loader.load( - data_path=str(temp_mimic_data_dir), resource_types=["MimicMedication"] + data_dir=str(temp_mimic_data_dir), resource_types=["MimicMedication"] ) # Should only load the valid resource diff --git a/tests/sandbox/test_sandbox_client.py b/tests/sandbox/test_sandbox_client.py index 6734f323..47f7d857 100644 --- a/tests/sandbox/test_sandbox_client.py +++ b/tests/sandbox/test_sandbox_client.py @@ -7,12 +7,10 @@ def test_load_from_registry_unknown_dataset(): """load_from_registry raises ValueError for unknown datasets.""" - client = SandboxClient( - api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" - ) + client = SandboxClient(url="http://localhost:8000/test", workflow="patient-view") with pytest.raises(ValueError, match="Unknown dataset"): - client.load_from_registry("nonexistent-dataset", data_path="/test") + client.load_from_registry("nonexistent-dataset", data_dir="/test") def test_load_from_path_single_xml_file(tmp_path): @@ -22,8 +20,7 @@ def test_load_from_path_single_xml_file(tmp_path): cda_file.write_text("Test CDA") client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/notereader/fhir/", + url="http://localhost:8000/notereader/fhir/", workflow="sign-note-inpatient", protocol="soap", ) @@ -42,8 +39,7 @@ def test_load_from_path_directory_with_pattern(tmp_path): (tmp_path / "other.txt").write_text("Not XML") client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/notereader/fhir/", + url="http://localhost:8000/notereader/fhir/", workflow="sign-note-inpatient", protocol="soap", ) @@ -60,8 +56,7 @@ def test_load_from_path_directory_all_files(tmp_path): (tmp_path / "note2.xml").write_text("Note 2") client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/notereader/fhir/", + url="http://localhost:8000/notereader/fhir/", workflow="sign-note-inpatient", protocol="soap", ) @@ -74,8 +69,7 @@ def test_load_from_path_directory_all_files(tmp_path): def test_load_from_path_error_handling(tmp_path): """load_from_path raises FileNotFoundError for nonexistent path.""" client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/notereader/fhir/", + url="http://localhost:8000/notereader/fhir/", workflow="sign-note-inpatient", protocol="soap", ) @@ -94,8 +88,7 @@ def test_load_free_text_generates_data(tmp_path): csv_file.write_text("text\nSample discharge note\n") client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + 
url="http://localhost:8000/test", workflow="encounter-discharge", ) @@ -107,21 +100,40 @@ def test_load_free_text_generates_data(tmp_path): assert len(client.request_data) > 0 -def test_send_requests_without_data(): - """send_requests raises RuntimeError if no data is loaded.""" +def test_load_free_text_without_synthetic_data(tmp_path): + """load_free_text can generate data without synthetic resources.""" + # Create test CSV + csv_file = tmp_path / "test.csv" + csv_file.write_text("text\nSample discharge note\nAnother note\n") + client = SandboxClient( - api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" + url="http://localhost:8000/test", + workflow="encounter-discharge", ) + client.load_free_text( + csv_path=str(csv_file), + column_name="text", + generate_synthetic=False, + random_seed=42, + ) + + assert len(client.request_data) > 0 + # Verify request was created (but without checking prefetch content details) + assert client.request_data[0].hook == "encounter-discharge" + + +def test_send_requests_without_data(): + """send_requests raises RuntimeError if no data is loaded.""" + client = SandboxClient(url="http://localhost:8000/test", workflow="patient-view") + with pytest.raises(RuntimeError, match="No requests to send"): client.send_requests() def test_save_results_without_responses(): """save_results raises RuntimeError if no responses available.""" - client = SandboxClient( - api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" - ) + client = SandboxClient(url="http://localhost:8000/test", workflow="patient-view") with pytest.raises(RuntimeError, match="No responses to save"): client.save_results() @@ -129,15 +141,12 @@ def test_save_results_without_responses(): def test_get_status(): """get_status returns client status information.""" - client = SandboxClient( - api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" - ) + client = SandboxClient(url="http://localhost:8000/test", workflow="patient-view") status = client.get_status() assert "sandbox_id" in status - assert status["api_url"] == "http://localhost:8000" - assert status["endpoint"] == "/test" + assert status["url"] == "http://localhost:8000/test" assert status["protocol"] == "REST" assert status["workflow"] == "patient-view" assert status["requests_queued"] == 0 @@ -146,15 +155,12 @@ def test_get_status(): def test_repr(): """__repr__ returns meaningful string representation.""" - client = SandboxClient( - api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" - ) + client = SandboxClient(url="http://localhost:8000/test", workflow="patient-view") repr_str = repr(client) assert "SandboxClient" in repr_str - assert "http://localhost:8000" in repr_str - assert "/test" in repr_str + assert "http://localhost:8000/test" in repr_str def test_load_from_path_json_prefetch_file(tmp_path): @@ -166,9 +172,7 @@ def test_load_from_path_json_prefetch_file(tmp_path): prefetch_data = {"prefetch": {"patient": create_bundle().model_dump()}} json_file.write_text(json.dumps(prefetch_data)) - client = SandboxClient( - api_url="http://localhost:8000", endpoint="/test", workflow="patient-view" - ) + client = SandboxClient(url="http://localhost:8000/test", workflow="patient-view") client.load_from_path(str(json_file)) @@ -181,9 +185,7 @@ def test_load_from_path_invalid_json_prefetch(tmp_path): json_file = tmp_path / "data.json" json_file.write_text('{"not_prefetch": "data"}') - client = SandboxClient( - api_url="http://localhost:8000", endpoint="/test", 
workflow="patient-view" - ) + client = SandboxClient(url="http://localhost:8000/test", workflow="patient-view") # Should load the JSON data without error since we're using plain dicts now client.load_from_path(str(json_file)) @@ -196,8 +198,7 @@ def test_save_results_distinguishes_protocols(tmp_path): # Test REST/JSON protocol rest_client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", protocol="rest", ) @@ -213,8 +214,7 @@ def test_save_results_distinguishes_protocols(tmp_path): # Test SOAP/XML protocol soap_client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="sign-note-inpatient", protocol="soap", ) @@ -243,15 +243,13 @@ def test_workflow_protocol_validation(workflow, protocol, should_fail): if should_fail: with pytest.raises(ValueError, match="not compatible"): SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow=workflow, protocol=protocol, ) else: client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow=workflow, protocol=protocol, ) @@ -263,8 +261,7 @@ def test_clear_requests(): from healthchain.fhir import create_bundle client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) @@ -284,8 +281,7 @@ def test_preview_requests_provides_metadata(): from healthchain.fhir import create_bundle client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) @@ -311,8 +307,7 @@ def test_preview_requests_respects_limit(): from healthchain.fhir import create_bundle client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) @@ -338,8 +333,7 @@ def test_get_request_data_formats(format_type, check): from healthchain.fhir import create_bundle client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) @@ -354,8 +348,7 @@ def test_get_request_data_formats(format_type, check): def test_get_request_data_invalid_format(): """get_request_data raises ValueError for invalid format.""" client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) @@ -368,8 +361,7 @@ def test_context_manager_auto_saves_on_success(tmp_path): from healthchain.fhir import create_bundle with SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) as client: prefetch = {"patient": create_bundle()} @@ -383,8 +375,7 @@ def test_context_manager_auto_saves_on_success(tmp_path): def test_context_manager_no_save_without_responses(tmp_path): """Context manager does not save if no responses generated.""" with SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) as client: # No requests or responses @@ -398,8 +389,7 @@ def test_context_manager_no_save_on_exception(): """Context manager does not save if exception occurs.""" with pytest.raises(RuntimeError): with SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) as client: 
client.responses = [{"cards": []}] @@ -425,8 +415,7 @@ def test_send_requests_rest_success(mock_client_class): mock_client_class.return_value = mock_client client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) @@ -455,8 +444,7 @@ def test_send_requests_soap_success(mock_client_class): mock_client_class.return_value = mock_client client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="sign-note-inpatient", protocol="soap", ) @@ -488,8 +476,7 @@ def test_send_requests_handles_multiple_requests(mock_client_class): mock_client_class.return_value = mock_client client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/test", + url="http://localhost:8000/test", workflow="patient-view", ) From 359e4c76d90058b907ae43cc0df75326b26cb968 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Fri, 31 Oct 2025 18:52:09 +0000 Subject: [PATCH 04/12] Add synthea dataloader --- healthchain/sandbox/loaders/__init__.py | 5 +- healthchain/sandbox/loaders/mimic.py | 4 +- healthchain/sandbox/loaders/synthea.py | 344 ++++++++++++++++++++++-- tests/sandbox/test_synthea_loader.py | 299 ++++++++++++++++++++ 4 files changed, 621 insertions(+), 31 deletions(-) create mode 100644 tests/sandbox/test_synthea_loader.py diff --git a/healthchain/sandbox/loaders/__init__.py b/healthchain/sandbox/loaders/__init__.py index 0086b08f..da09d135 100644 --- a/healthchain/sandbox/loaders/__init__.py +++ b/healthchain/sandbox/loaders/__init__.py @@ -6,9 +6,10 @@ from healthchain.sandbox.datasets import DatasetRegistry from healthchain.sandbox.loaders.mimic import MimicOnFHIRLoader +from healthchain.sandbox.loaders.synthea import SyntheaFHIRPatientLoader # Register loaders DatasetRegistry.register(MimicOnFHIRLoader()) -# DatasetRegistry.register(SyntheaLoader()) # if implemented +DatasetRegistry.register(SyntheaFHIRPatientLoader()) -__all__ = ["MimicOnFHIRLoader"] +__all__ = ["MimicOnFHIRLoader", "SyntheaFHIRPatientLoader"] diff --git a/healthchain/sandbox/loaders/mimic.py b/healthchain/sandbox/loaders/mimic.py index 92395f48..be79adc3 100644 --- a/healthchain/sandbox/loaders/mimic.py +++ b/healthchain/sandbox/loaders/mimic.py @@ -121,7 +121,7 @@ def load( # Group by FHIR resourceType (not filename) for resource in resources: - fhir_type = resource["resourceType"] + fhir_type = resource.get("resourceType") if fhir_type not in resources_by_type: resources_by_type[fhir_type] = [] resources_by_type[fhir_type].append(resource) @@ -143,7 +143,7 @@ def load( bundles = {} for fhir_type, resources in resources_by_type.items(): - bundles[fhir_type] = Bundle( + bundles[fhir_type.lower()] = Bundle( type="collection", entry=[{"resource": resource} for resource in resources], ) diff --git a/healthchain/sandbox/loaders/synthea.py b/healthchain/sandbox/loaders/synthea.py index 56eafc17..aded1aa9 100644 --- a/healthchain/sandbox/loaders/synthea.py +++ b/healthchain/sandbox/loaders/synthea.py @@ -4,81 +4,371 @@ Loads synthetic patient data generated by Synthea. """ +import json import logging +import random from pathlib import Path from typing import Dict, List, Optional +from fhir.resources.R4B.bundle import Bundle + from healthchain.sandbox.datasets import DatasetLoader log = logging.getLogger(__name__) -class SyntheaLoader(DatasetLoader): +class SyntheaFHIRPatientLoader(DatasetLoader): """ - Loader for Synthea-generated FHIR data. + Loader for Synthea-generated FHIR patient data. 
Synthea is an open-source synthetic patient generator that produces - realistic patient records in FHIR format. This loader supports loading - Synthea-generated FHIR bundles. + realistic patient records in FHIR format. This loader loads a single + patient's Bundle (typically containing 100-500 resources), which is + sufficient for quick demos and testing. + + The loader supports multiple ways to specify which patient file to load: + - By patient_id (UUID portion of filename) + - By patient_file (exact filename) + - Default: first .json file found Examples: - Load via SandboxClient: + Load by patient ID: >>> client = SandboxClient(...) >>> client.load_from_registry( - ... "synthea", - ... data_dir="./data/synthea/output/fhir", - ... sample_size=10 + ... "synthea-patient", + ... data_dir="./synthea_sample_data_fhir_latest", + ... patient_id="a969c177-a995-7b89-7b6d-885214dfa253", + ... resource_types=["Condition", "MedicationRequest"], + ... sample_size=5 + ... ) + + Load by filename: + >>> client.load_from_registry( + ... "synthea-patient", + ... data_dir="./synthea_sample_data_fhir_latest", + ... patient_file="Alton320_Gutkowski940_a969c177-a995-7b89-7b6d-885214dfa253.json" + ... ) + + Load first patient (quick demo): + >>> client.load_from_registry( + ... "synthea-patient", + ... data_dir="./synthea_sample_data_fhir_latest" ... ) """ @property def name(self) -> str: """Dataset name for registration.""" - return "synthea" + return "synthea-patient" @property def description(self) -> str: """Dataset description.""" - return "Synthea: Synthetic patient data generated by SyntheaTM" + return "Synthea: Synthetic FHIR patient data generated by SyntheaTM (single patient per load)" def load( self, data_dir: str, - patient_ids: Optional[List[str]] = None, + patient_id: Optional[str] = None, + patient_file: Optional[str] = None, + resource_types: Optional[List[str]] = None, sample_size: Optional[int] = None, random_seed: Optional[int] = None, **kwargs, - ) -> Dict: + ) -> Dict[str, Bundle]: """ - Load Synthea FHIR data as dict. + Load a single Synthea FHIR patient Bundle. Args: data_dir: Path to Synthea FHIR output directory - patient_ids: Specific patient IDs to load - sample_size: Number of random patients to sample + patient_id: Patient UUID (the ID portion of the filename after the name) + e.g., "a969c177-a995-7b89-7b6d-885214dfa253" + patient_file: Exact filename to load + e.g., "Alton320_Gutkowski940_a969c177-a995-7b89-7b6d-885214dfa253.json" + resource_types: FHIR resource types to include (e.g., ["Condition", "MedicationRequest"]) + If None, all resource types are included + sample_size: Number of resources to randomly sample per resource type + If None, all resources of each type are included random_seed: Random seed for reproducible sampling **kwargs: Additional parameters (reserved for future use) Returns: - Dict containing FHIR resources + Dict mapping resource type to FHIR R4B Bundle in prefetch format + e.g., {"Condition": Bundle(...), "MedicationStatement": Bundle(...)} Raises: - FileNotFoundError: If data path doesn't exist - NotImplementedError: Full implementation pending + FileNotFoundError: If data directory or patient file not found + ValueError: If patient file is not a valid FHIR Bundle or no resources found + + Example: + >>> loader = SyntheaFHIRPatientLoader() + >>> data = loader.load( + ... data_dir="./synthea_sample_data_fhir_latest", + ... patient_id="a969c177-a995-7b89-7b6d-885214dfa253", + ... resource_types=["Condition", "MedicationRequest"], + ... sample_size=3 + ... 
) + >>> # Returns: {"Condition": Bundle(...), "MedicationRequest": Bundle(...)} """ data_dir = Path(data_dir) if not data_dir.exists(): raise FileNotFoundError( - f"Synthea data not found at: {data_dir}. " - "Please provide a valid data_dir." + f"Synthea data directory not found at: {data_dir}\n" + "Please provide a valid data_dir containing Synthea FHIR patient files." ) - # TODO: Implement Synthea FHIR bundle loading - # Synthea typically outputs individual FHIR bundles per patient - # Format: .json containing a Bundle resource + # Find the patient file + patient_file_path = self._find_patient_file(data_dir, patient_id, patient_file) + + # Load and validate the Bundle + bundle_dict = self._load_bundle(patient_file_path) + + # Log patient information + self._log_patient_info(bundle_dict, patient_file_path.name) + + # Set random seed if provided + if random_seed is not None: + random.seed(random_seed) + + # Extract and group resources by type + resources_by_type = self._extract_resources(bundle_dict, resource_types) + + if not resources_by_type: + available_types = self._get_available_resource_types(bundle_dict) + if resource_types: + raise ValueError( + f"No resources found for requested types: {resource_types}\n" + f"Available resource types in this patient file: {available_types}" + ) + else: + raise ValueError( + f"No valid resources found in patient file: {patient_file_path.name}" + ) + + # Apply sampling if requested + if sample_size is not None: + resources_by_type = self._sample_resources(resources_by_type, sample_size) + + # Convert to Bundle objects + bundles = {} + for resource_type, resources in resources_by_type.items(): + bundles[resource_type.lower()] = Bundle( + type="collection", + entry=[{"resource": resource} for resource in resources], + ) + log.info( + f"Loaded {len(resources)} {resource_type} resource(s) from {patient_file_path.name}" + ) + + return bundles + + def _find_patient_file( + self, + data_dir: Path, + patient_id: Optional[str] = None, + patient_file: Optional[str] = None, + ) -> Path: + """ + Find the patient file to load based on provided parameters. + + Args: + data_dir: Directory containing patient files + patient_id: Patient UUID to search for + patient_file: Exact filename + + Returns: + Path to the patient file - raise NotImplementedError( - "Synthea loader is not yet fully implemented. " - "This is a placeholder for future functionality. " - "For now, please use load_free_text() with CdsDataGenerator for synthetic data." + Raises: + FileNotFoundError: If no matching file is found + ValueError: If multiple files match the patient_id + """ + # Option 1: Exact filename provided + if patient_file: + file_path = data_dir / patient_file + if not file_path.exists(): + raise FileNotFoundError( + f"Patient file not found: {file_path}\n" + f"Please check that the file exists in {data_dir}" + ) + return file_path + + # Option 2: Patient ID provided - search for matching file + if patient_id: + matching_files = list(data_dir.glob(f"*{patient_id}*.json")) + if not matching_files: + # List available files for helpful error message + available_files = list(data_dir.glob("*.json")) + error_msg = f"No patient file found with ID: {patient_id}\n" + if available_files: + error_msg += f"\nAvailable patient files in {data_dir}:\n" + error_msg += "\n".join(f" - {f.name}" for f in available_files[:5]) + if len(available_files) > 5: + error_msg += f"\n ... 
and {len(available_files) - 5} more" + else: + error_msg += f"\nNo .json files found in {data_dir}" + raise FileNotFoundError(error_msg) + + if len(matching_files) > 1: + raise ValueError( + f"Multiple patient files found with ID '{patient_id}':\n" + + "\n".join(f" - {f.name}" for f in matching_files) + + "\nPlease use patient_file parameter to specify the exact file." + ) + return matching_files[0] + + # Option 3: Default - use first .json file + json_files = list(data_dir.glob("*.json")) + if not json_files: + raise FileNotFoundError( + f"No patient files (.json) found in {data_dir}\n" + "Please ensure the directory contains Synthea FHIR patient files." + ) + + log.info( + f"No patient_id or patient_file specified, using first file: {json_files[0].name}" ) + return json_files[0] + + def _load_bundle(self, file_path: Path) -> Dict: + """ + Load and validate a Synthea FHIR Bundle from JSON file. + + Args: + file_path: Path to the patient Bundle JSON file + + Returns: + Bundle as dict + + Raises: + ValueError: If file is not a valid FHIR Bundle + """ + try: + with open(file_path, "r") as f: + bundle_dict = json.load(f) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in file {file_path.name}: {e}") + + # Validate it's a FHIR Bundle + if not isinstance(bundle_dict, dict): + raise ValueError( + f"File {file_path.name} does not contain a valid JSON object" + ) + + if bundle_dict.get("resourceType") != "Bundle": + raise ValueError( + f"File {file_path.name} is not a FHIR Bundle. " + f"Found resourceType: {bundle_dict.get('resourceType')}" + ) + + if "entry" not in bundle_dict: + raise ValueError(f"Bundle in {file_path.name} has no 'entry' field") + + return bundle_dict + + def _log_patient_info(self, bundle_dict: Dict, filename: str) -> None: + """ + Log information about the loaded patient. + + Args: + bundle_dict: Bundle dictionary + filename: Name of the patient file + """ + entries = bundle_dict.get("entry", []) + total_resources = len(entries) + + # Try to find Patient resource for additional info + patient_info = None + for entry in entries: + resource = entry.get("resource", {}) + if resource.get("resourceType") == "Patient": + patient_id = resource.get("id", "unknown") + name_list = resource.get("name", []) + if name_list: + name = name_list[0] + given = " ".join(name.get("given", [])) + family = name.get("family", "") + patient_info = f"{given} {family} (ID: {patient_id})" + break + + if patient_info: + log.info( + f"Loaded patient: {patient_info} from {filename} ({total_resources} resources)" + ) + else: + log.info(f"Loaded patient file {filename} ({total_resources} resources)") + + def _get_available_resource_types(self, bundle_dict: Dict) -> List[str]: + """ + Get list of available resource types in the bundle. + + Args: + bundle_dict: Bundle dictionary + + Returns: + List of unique resource types + """ + resource_types = set() + for entry in bundle_dict.get("entry", []): + resource = entry.get("resource", {}) + resource_type = resource.get("resourceType") + if resource_type: + resource_types.add(resource_type) + return sorted(resource_types) + + def _extract_resources( + self, bundle_dict: Dict, resource_types: Optional[List[str]] = None + ) -> Dict[str, List[Dict]]: + """ + Extract and group resources by type from the bundle. 
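`_load_bundle` accepts any JSON object of this minimal shape (values illustrative). Note that the prefetch dict ultimately returned by `load()` is keyed by the lower-cased resource type (via `resource_type.lower()`), e.g. `"condition"` rather than `"Condition"`:

```python
minimal_bundle = {
    "resourceType": "Bundle",  # anything else raises ValueError
    "type": "collection",
    "entry": [                 # a missing "entry" field also raises ValueError
        {"resource": {"resourceType": "Patient", "id": "example"}}
    ],
}
```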
+ + Args: + bundle_dict: Bundle dictionary + resource_types: Optional list of resource types to filter by + + Returns: + Dict mapping resource type to list of resource dicts + """ + resources_by_type: Dict[str, List[Dict]] = {} + + for entry in bundle_dict.get("entry", []): + resource = entry.get("resource", {}) + resource_type = resource.get("resourceType") + + if not resource_type: + log.warning("Skipping entry with no resourceType") + continue + + # Filter by resource types if specified + if resource_types and resource_type not in resource_types: + continue + + if resource_type not in resources_by_type: + resources_by_type[resource_type] = [] + resources_by_type[resource_type].append(resource) + + return resources_by_type + + def _sample_resources( + self, resources_by_type: Dict[str, List[Dict]], sample_size: int + ) -> Dict[str, List[Dict]]: + """ + Randomly sample resources from each type. + + Args: + resources_by_type: Dict mapping resource type to list of resources + sample_size: Number of resources to sample per type + + Returns: + Dict with sampled resources + """ + sampled = {} + for resource_type, resources in resources_by_type.items(): + if len(resources) <= sample_size: + sampled[resource_type] = resources + else: + sampled[resource_type] = random.sample(resources, sample_size) + log.info( + f"Sampled {sample_size} of {len(resources)} {resource_type} resources" + ) + return sampled diff --git a/tests/sandbox/test_synthea_loader.py b/tests/sandbox/test_synthea_loader.py new file mode 100644 index 00000000..049c4b46 --- /dev/null +++ b/tests/sandbox/test_synthea_loader.py @@ -0,0 +1,299 @@ +"""Tests for Synthea FHIR Patient dataset loader.""" + +import json +import tempfile +from pathlib import Path + +import pytest + +from healthchain.sandbox.loaders.synthea import SyntheaFHIRPatientLoader + + +@pytest.fixture +def temp_synthea_data_dir(): + """Create temporary Synthea data directory structure.""" + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + +@pytest.fixture +def mock_patient_bundle(): + """Sample Synthea patient Bundle with multiple resource types.""" + return { + "resourceType": "Bundle", + "type": "collection", + "entry": [ + { + "resource": { + "resourceType": "Patient", + "id": "a969c177-a995-7b89-7b6d-885214dfa253", + "name": [{"given": ["Alton"], "family": "Gutkowski"}], + "gender": "male", + "birthDate": "1980-01-01", + } + }, + { + "resource": { + "resourceType": "Condition", + "id": "cond-1", + "clinicalStatus": { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", + "code": "active", + } + ] + }, + "code": { + "coding": [ + {"system": "http://snomed.info/sct", "code": "44054006"} + ] + }, + "subject": { + "reference": "Patient/a969c177-a995-7b89-7b6d-885214dfa253" + }, + } + }, + { + "resource": { + "resourceType": "Condition", + "id": "cond-2", + "clinicalStatus": { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", + "code": "active", + } + ] + }, + "code": { + "coding": [ + {"system": "http://snomed.info/sct", "code": "38341003"} + ] + }, + "subject": { + "reference": "Patient/a969c177-a995-7b89-7b6d-885214dfa253" + }, + } + }, + { + "resource": { + "resourceType": "MedicationStatement", + "id": "med-1", + "status": "recorded", + "medication": { + "concept": { + "coding": [ + { + "system": "http://www.nlm.nih.gov/research/umls/rxnorm", + "code": "313782", + } + ] + } + }, + "subject": { + "reference": "Patient/a969c177-a995-7b89-7b6d-885214dfa253" + }, + } + 
}, + ], + } + + +def create_patient_file(data_dir: Path, filename: str, bundle: dict) -> Path: + """Helper to create patient Bundle JSON file.""" + file_path = data_dir / filename + with open(file_path, "w") as f: + json.dump(bundle, f) + return file_path + + +@pytest.mark.parametrize( + "patient_spec,filename", + [ + ( + {"patient_id": "a969c177-a995-7b89-7b6d-885214dfa253"}, + "Alton320_Gutkowski940_a969c177-a995-7b89-7b6d-885214dfa253.json", + ), + ( + { + "patient_file": "Alton320_Gutkowski940_a969c177-a995-7b89-7b6d-885214dfa253.json" + }, + "Alton320_Gutkowski940_a969c177-a995-7b89-7b6d-885214dfa253.json", + ), + ({}, "Patient1.json"), # Default: first file + ], +) +def test_synthea_loader_supports_multiple_file_specification_methods( + temp_synthea_data_dir, mock_patient_bundle, patient_spec, filename +): + """SyntheaFHIRPatientLoader supports patient_id, patient_file, and default loading.""" + create_patient_file(temp_synthea_data_dir, filename, mock_patient_bundle) + + loader = SyntheaFHIRPatientLoader() + result = loader.load(data_dir=str(temp_synthea_data_dir), **patient_spec) + + assert isinstance(result, dict) + assert "Patient" in result and "Condition" in result + # Returns Bundle objects + assert type(result["Patient"]).__name__ == "Bundle" + assert len(result["Patient"].entry) == 1 + assert len(result["Condition"].entry) == 2 + + +def test_synthea_loader_filters_and_groups_resources_by_type( + temp_synthea_data_dir, mock_patient_bundle +): + """SyntheaFHIRPatientLoader filters by resource_types and groups into separate Bundles.""" + filename = "Patient1.json" + create_patient_file(temp_synthea_data_dir, filename, mock_patient_bundle) + + loader = SyntheaFHIRPatientLoader() + result = loader.load( + data_dir=str(temp_synthea_data_dir), + resource_types=["Condition", "MedicationStatement"], + ) + + # Only requested types included + assert set(result.keys()) == {"Condition", "MedicationStatement"} + assert len(result["Condition"].entry) == 2 + assert len(result["MedicationStatement"].entry) == 1 + + +@pytest.mark.parametrize("sample_size,expected_count", [(1, 1), (2, 2)]) +def test_synthea_loader_sampling_behavior( + temp_synthea_data_dir, mock_patient_bundle, sample_size, expected_count +): + """SyntheaFHIRPatientLoader samples specified number of resources per type.""" + create_patient_file(temp_synthea_data_dir, "Patient1.json", mock_patient_bundle) + + loader = SyntheaFHIRPatientLoader() + result = loader.load( + data_dir=str(temp_synthea_data_dir), + resource_types=["Condition"], + sample_size=sample_size, + ) + + assert len(result["Condition"].entry) == expected_count + + +def test_synthea_loader_deterministic_sampling_with_seed( + temp_synthea_data_dir, mock_patient_bundle +): + """SyntheaFHIRPatientLoader produces consistent results with random_seed.""" + create_patient_file(temp_synthea_data_dir, "Patient1.json", mock_patient_bundle) + + loader = SyntheaFHIRPatientLoader() + result1 = loader.load( + data_dir=str(temp_synthea_data_dir), + resource_types=["Condition"], + sample_size=1, + random_seed=42, + ) + result2 = loader.load( + data_dir=str(temp_synthea_data_dir), + resource_types=["Condition"], + sample_size=1, + random_seed=42, + ) + + assert ( + result1["Condition"].entry[0].resource.id + == result2["Condition"].entry[0].resource.id + ) + + +@pytest.mark.parametrize( + "error_case,error_match", + [ + ({"data_dir": "/nonexistent"}, "Synthea data directory not found"), + ({"patient_id": "nonexistent-uuid"}, "No patient file found with ID"), + ({"patient_file": 
"nonexistent.json"}, "Patient file not found"), + ], +) +def test_synthea_loader_error_handling_for_missing_files( + temp_synthea_data_dir, mock_patient_bundle, error_case, error_match +): + """SyntheaFHIRPatientLoader raises clear errors for missing files and directories.""" + if "data_dir" not in error_case: + error_case["data_dir"] = str(temp_synthea_data_dir) + + loader = SyntheaFHIRPatientLoader() + with pytest.raises(FileNotFoundError, match=error_match): + loader.load(**error_case) + + +def test_synthea_loader_raises_error_for_multiple_matching_patient_ids( + temp_synthea_data_dir, mock_patient_bundle +): + """SyntheaFHIRPatientLoader raises ValueError when patient_id matches multiple files.""" + create_patient_file( + temp_synthea_data_dir, "Patient1_a969c177.json", mock_patient_bundle + ) + create_patient_file( + temp_synthea_data_dir, "Patient2_a969c177.json", mock_patient_bundle + ) + + loader = SyntheaFHIRPatientLoader() + with pytest.raises(ValueError, match="Multiple patient files found"): + loader.load(data_dir=str(temp_synthea_data_dir), patient_id="a969c177") + + +@pytest.mark.parametrize( + "invalid_bundle,error_match", + [ + ({"not": "a bundle"}, "is not a FHIR Bundle"), + ({"resourceType": "Patient"}, "is not a FHIR Bundle"), + ({"resourceType": "Bundle"}, "has no 'entry' field"), + ], +) +def test_synthea_loader_validates_bundle_structure( + temp_synthea_data_dir, invalid_bundle, error_match +): + """SyntheaFHIRPatientLoader validates Bundle structure and raises errors for invalid data.""" + create_patient_file(temp_synthea_data_dir, "Invalid.json", invalid_bundle) + + loader = SyntheaFHIRPatientLoader() + with pytest.raises(ValueError, match=error_match): + loader.load(data_dir=str(temp_synthea_data_dir)) + + +def test_synthea_loader_raises_error_for_nonexistent_resource_types( + temp_synthea_data_dir, mock_patient_bundle +): + """SyntheaFHIRPatientLoader raises error when requested resource_types don't exist.""" + create_patient_file(temp_synthea_data_dir, "Patient1.json", mock_patient_bundle) + + loader = SyntheaFHIRPatientLoader() + with pytest.raises(ValueError, match="No resources found for requested types"): + loader.load( + data_dir=str(temp_synthea_data_dir), + resource_types=["Observation", "Procedure"], # Not in bundle + ) + + +def test_synthea_loader_skips_resources_without_resource_type(temp_synthea_data_dir): + """SyntheaFHIRPatientLoader skips entries missing resourceType field.""" + invalid_bundle = { + "resourceType": "Bundle", + "type": "collection", + "entry": [ + {"resource": {"id": "no-type"}}, # Missing resourceType + { + "resource": { + "resourceType": "Patient", + "id": "patient-1", + "gender": "male", + "birthDate": "1980-01-01", + } + }, + ], + } + create_patient_file(temp_synthea_data_dir, "Patient1.json", invalid_bundle) + + loader = SyntheaFHIRPatientLoader() + result = loader.load(data_dir=str(temp_synthea_data_dir)) + + # Should only load valid Patient resource + assert "Patient" in result + assert len(result["Patient"].entry) == 1 From f24a2bd0a9405d407b94018a7b388faa42a0e214 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Fri, 31 Oct 2025 19:08:31 +0000 Subject: [PATCH 05/12] Add bundle counting helper --- healthchain/fhir/__init__.py | 2 ++ healthchain/fhir/bundle_helpers.py | 32 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/healthchain/fhir/__init__.py b/healthchain/fhir/__init__.py index 2790ed89..78022bf5 100644 --- a/healthchain/fhir/__init__.py +++ b/healthchain/fhir/__init__.py @@ -23,6 
+23,7 @@ set_resources, merge_bundles, extract_resources, + count_resources, ) __all__ = [ @@ -48,4 +49,5 @@ "set_resources", "merge_bundles", "extract_resources", + "count_resources", ] diff --git a/healthchain/fhir/bundle_helpers.py b/healthchain/fhir/bundle_helpers.py index 14ac08e4..3edbc7ed 100644 --- a/healthchain/fhir/bundle_helpers.py +++ b/healthchain/fhir/bundle_helpers.py @@ -256,3 +256,35 @@ def extract_resources( bundle.entry = remaining_entries return extracted + + +def count_resources(bundle: Bundle) -> dict[str, int]: + """Count resources by type in a bundle. + + Args: + bundle: The FHIR Bundle to analyze + + Returns: + Dictionary mapping resource type names to their counts. + Example: {"Condition": 2, "MedicationStatement": 1, "Patient": 1} + + Example: + >>> bundle = create_bundle() + >>> add_resource(bundle, create_condition(...)) + >>> add_resource(bundle, create_condition(...)) + >>> add_resource(bundle, create_medication_statement(...)) + >>> counts = count_resources(bundle) + >>> print(counts) + {'Condition': 2, 'MedicationStatement': 1} + """ + if not bundle or not bundle.entry: + return {} + + counts: dict[str, int] = {} + for entry in bundle.entry: + if entry.resource: + # Get the resource type from the class name + resource_type = entry.resource.__resource_type__ + counts[resource_type] = counts.get(resource_type, 0) + 1 + + return counts From be28c80e2d902ed6e561a5dce01d0eb9d83c96ec Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Fri, 31 Oct 2025 19:08:57 +0000 Subject: [PATCH 06/12] Update cookbook usage --- cookbook/cds_discharge_summarizer_hf_chat.py | 3 +-- cookbook/cds_discharge_summarizer_hf_trf.py | 3 +-- cookbook/notereader_clinical_coding_fhir.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cookbook/cds_discharge_summarizer_hf_chat.py b/cookbook/cds_discharge_summarizer_hf_chat.py index 3a6496d4..e1b6b528 100644 --- a/cookbook/cds_discharge_summarizer_hf_chat.py +++ b/cookbook/cds_discharge_summarizer_hf_chat.py @@ -81,8 +81,7 @@ def start_api(): # Create sandbox client and load test data client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/cds/cds-services/discharge-summarizer", + url="http://localhost:8000/cds/cds-services/discharge-summarizer", workflow="encounter-discharge", ) # Load discharge notes from CSV diff --git a/cookbook/cds_discharge_summarizer_hf_trf.py b/cookbook/cds_discharge_summarizer_hf_trf.py index 635fafa5..6d08332c 100644 --- a/cookbook/cds_discharge_summarizer_hf_trf.py +++ b/cookbook/cds_discharge_summarizer_hf_trf.py @@ -54,8 +54,7 @@ def start_api(): # Create sandbox client and load test data client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/cds/cds-services/discharge-summarizer", + url="http://localhost:8000/cds/cds-services/discharge-summarizer", workflow="encounter-discharge", ) # Load discharge notes from CSV diff --git a/cookbook/notereader_clinical_coding_fhir.py b/cookbook/notereader_clinical_coding_fhir.py index 52967a3d..24f1f343 100644 --- a/cookbook/notereader_clinical_coding_fhir.py +++ b/cookbook/notereader_clinical_coding_fhir.py @@ -132,8 +132,7 @@ def run_server(): # Create sandbox client for testing client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/notereader/fhir/", + url="http://localhost:8000/notereader/fhir/", workflow="sign-note-inpatient", protocol="soap", ) From 214134d7761e6035331d6229bf13878428744b8b Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Fri, 31 Oct 2025 19:09:36 +0000 Subject: [PATCH 
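`count_resources` pairs naturally with the loaders above for a quick sanity check of what was loaded; a sketch (the data directory is illustrative):

```python
from healthchain.fhir import count_resources
from healthchain.sandbox.loaders import SyntheaFHIRPatientLoader

bundles = SyntheaFHIRPatientLoader().load(data_dir="./data/synthea")
for key, bundle in bundles.items():
    print(key, count_resources(bundle))  # e.g. condition {'Condition': 2}
```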
07/12] Add error handling for empty responses --- healthchain/sandbox/sandboxclient.py | 47 ++++++++++++++++------------ scripts/healthchainapi_e2e_demo.py | 6 ++-- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/healthchain/sandbox/sandboxclient.py b/healthchain/sandbox/sandboxclient.py index 3a21a670..db7a26e9 100644 --- a/healthchain/sandbox/sandboxclient.py +++ b/healthchain/sandbox/sandboxclient.py @@ -93,7 +93,7 @@ def __init__( self.timeout = timeout # Request/response management - self.request_data: List[Union[CDSRequest, Any]] = [] + self.requests: List[Union[CDSRequest, Any]] = [] self.responses: List[Dict] = [] self.sandbox_id = uuid.uuid4() @@ -172,7 +172,7 @@ def load_from_registry( try: loaded_data = DatasetRegistry.load(source, data_dir=data_dir, **kwargs) self._construct_request(loaded_data) - log.info(f"Loaded {source} dataset with {len(self.request_data)} requests") + log.info(f"Loaded {source} dataset with {len(self.requests)} requests") except KeyError: raise ValueError( f"Unknown dataset: {source}. " @@ -253,7 +253,7 @@ def load_from_path( log.warning(f"Skipping unsupported file type: {file_path}") log.info( - f"Loaded {len(self.request_data)} requests from {len(files_to_load)} file(s)" + f"Loaded {len(self.requests)} requests from {len(files_to_load)} file(s)" ) return self @@ -300,11 +300,11 @@ def load_free_text( if generate_synthetic: log.info( - f"Generated {len(self.request_data)} requests from free text with synthetic resources for workflow {self.workflow.value}" + f"Generated {len(self.requests)} requests from free text with synthetic resources for workflow {self.workflow.value}" ) else: log.info( - f"Generated {len(self.request_data)} requests from free text only (no synthetic resources)" + f"Generated {len(self.requests)} requests from free text only (no synthetic resources)" ) return self @@ -325,7 +325,7 @@ def _construct_request(self, data: Union[Dict[str, Any], Any]) -> None: else: raise ValueError(f"Unsupported protocol: {self.protocol}") - self.request_data.append(request) + self.requests.append(request) def clear_requests(self) -> "SandboxClient": """ @@ -336,8 +336,8 @@ def clear_requests(self) -> "SandboxClient": Returns: Self for method chaining """ - count = len(self.request_data) - self.request_data.clear() + count = len(self.requests) + self.requests.clear() log.info(f"Cleared {count} queued request(s)") return self @@ -355,7 +355,7 @@ def preview_requests(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: Returns: List of request summary dictionaries containing metadata """ - requests = self.request_data[:limit] if limit else self.request_data + requests = self.requests[:limit] if limit else self.requests previews = [] for idx, req in enumerate(requests): @@ -405,10 +405,10 @@ def get_request_data( >>> print(json_str) """ if format == "raw": - return self.request_data + return self.requests elif format == "dict": result = [] - for req in self.request_data: + for req in self.requests: if hasattr(req, "model_dump"): result.append(req.model_dump(exclude_none=True)) elif hasattr(req, "model_dump_xml"): @@ -430,18 +430,18 @@ def send_requests(self) -> List[Dict]: Returns: List of response dictionaries """ - if not self.request_data: + if not self.requests: raise RuntimeError( "No requests to send. 
Load data first using load_from_registry(), load_from_path(), or load_free_text()"
             )
 
-        log.info(f"Sending {len(self.request_data)} requests to {self.url}")
+        log.info(f"Sending {len(self.requests)} requests to {self.url}")
 
         with httpx.Client(follow_redirects=True) as client:
             responses: List[Dict] = []
             timeout = httpx.Timeout(self.timeout, read=None)
 
-            for request in self.request_data:
+            for request in self.requests:
                 try:
                     if self.protocol == ApiProtocol.soap:
                         headers = {"Content-Type": "text/xml; charset=utf-8"}
@@ -463,12 +463,19 @@
                         response.raise_for_status()
-                        response_data = response.json()
                         try:
+                            response_data = response.json()
                             cds_response = CDSResponse(**response_data)
                             responses.append(cds_response.model_dump(exclude_none=True))
+                        except json.JSONDecodeError:
+                            log.error(
+                                f"Invalid JSON response from {self.url}. "
+                                f"Response preview: {response.text[:200]}"
+                            )
+                            responses.append({})
                         except Exception:
-                            # Fallback to raw response if parsing fails
+                            # Fallback to raw response if CDSResponse parsing fails
                             responses.append(response_data)
 
                 except httpx.HTTPStatusError as exc:
@@ -515,10 +522,10 @@ def save_results(self, directory: Union[str, Path] = "./output/") -> None:
 
         # Save requests
         if self.protocol == ApiProtocol.soap:
-            request_data = [request.model_dump_xml() for request in self.request_data]
+            request_data = [request.model_dump_xml() for request in self.requests]
         else:
             request_data = [
-                request.model_dump(exclude_none=True) for request in self.request_data
+                request.model_dump(exclude_none=True) for request in self.requests
             ]
 
         save_data_to_directory(
@@ -555,7 +562,7 @@ def get_status(self) -> Dict[str, Any]:
             if hasattr(self.protocol, "value")
             else str(self.protocol),
             "workflow": self.workflow.value if self.workflow else None,
-            "requests_queued": len(self.request_data),
+            "requests_queued": len(self.requests),
             "responses_received": len(self.responses),
         }
 
@@ -581,5 +588,5 @@ def __repr__(self) -> str:
         return (
             f"SandboxClient(url='{self.url}', "
             f"protocol='{self.protocol.value if hasattr(self.protocol, 'value') else self.protocol}', "
-            f"requests={len(self.request_data)})"
+            f"requests={len(self.requests)})"
         )
diff --git a/scripts/healthchainapi_e2e_demo.py b/scripts/healthchainapi_e2e_demo.py
index de76a29f..27483168 100644
--- a/scripts/healthchainapi_e2e_demo.py
+++ b/scripts/healthchainapi_e2e_demo.py
@@ -414,8 +414,7 @@ def create_sandboxes():
 
     # NoteReader Sandbox
     notereader_client = SandboxClient(
-        api_url=base_url,
-        endpoint="/notereader/fhir/",
+        url=base_url + "/notereader/fhir/",
         workflow=CONFIG["workflows"]["notereader"],
         protocol="soap",
     )
@@ -425,8 +424,7 @@
 
     # CDS Hooks Sandbox
     cds_client = SandboxClient(
-        api_url=base_url,
-        endpoint="/cds/cds-services/discharge-summary",
+        url=base_url + "/cds/cds-services/discharge-summary",
        workflow=CONFIG["workflows"]["cds"],
         protocol="rest",
    )

From 208b913ba3fcc71512a89734b509e374f7b68f46 Mon Sep 17 00:00:00 2001
From: jenniferjiangkells
Date: Fri, 31 Oct 2025 19:09:52 +0000
Subject: [PATCH 08/12] Update tests

---
 tests/fhir/test_bundle_resource_counting.py | 57 +++++++++++++++++++++
 tests/sandbox/test_cds_sandbox.py           |  6 +--
 tests/sandbox/test_clindoc_sandbox.py       |  4 +-
 tests/sandbox/test_mimic_loader.py          | 22 ++++----
 tests/sandbox/test_sandbox_client.py        | 22 ++++----
 tests/sandbox/test_synthea_loader.py        | 24 ++++-----
 6 files changed, 96 insertions(+), 39 deletions(-)
 create mode 100644 tests/fhir/test_bundle_resource_counting.py

diff 
--git a/tests/fhir/test_bundle_resource_counting.py b/tests/fhir/test_bundle_resource_counting.py new file mode 100644 index 00000000..821a0d32 --- /dev/null +++ b/tests/fhir/test_bundle_resource_counting.py @@ -0,0 +1,57 @@ +"""Tests for bundle resource counting functionality.""" + +from healthchain.fhir import ( + create_bundle, + add_resource, + create_condition, + create_medication_statement, + create_allergy_intolerance, + count_resources, +) + + +def test_count_resources_with_empty_bundle(empty_bundle): + """count_resources returns empty dict for empty bundle.""" + counts = count_resources(empty_bundle) + assert counts == {} + + +def test_count_resources_with_single_resource_type(empty_bundle): + """count_resources counts single resource type correctly.""" + add_resource(empty_bundle, create_condition("Patient/1", "123", "Test")) + add_resource(empty_bundle, create_condition("Patient/1", "456", "Test 2")) + + counts = count_resources(empty_bundle) + assert counts == {"Condition": 2} + + +def test_count_resources_with_mixed_resource_types(empty_bundle): + """count_resources counts multiple resource types correctly.""" + add_resource(empty_bundle, create_condition("Patient/1", "123", "Test")) + add_resource(empty_bundle, create_condition("Patient/1", "456", "Test 2")) + add_resource(empty_bundle, create_medication_statement("Patient/1", "789", "Med")) + add_resource( + empty_bundle, create_allergy_intolerance("Patient/1", "999", "Allergy") + ) + + counts = count_resources(empty_bundle) + assert counts == { + "Condition": 2, + "MedicationStatement": 1, + "AllergyIntolerance": 1, + } + + +def test_count_resources_with_none_bundle(): + """count_resources handles None bundle gracefully.""" + counts = count_resources(None) + assert counts == {} + + +def test_count_resources_with_bundle_no_entry(): + """count_resources handles bundle with None entry.""" + bundle = create_bundle() + bundle.entry = None + + counts = count_resources(bundle) + assert counts == {} diff --git a/tests/sandbox/test_cds_sandbox.py b/tests/sandbox/test_cds_sandbox.py index 405f7634..e997b120 100644 --- a/tests/sandbox/test_cds_sandbox.py +++ b/tests/sandbox/test_cds_sandbox.py @@ -39,8 +39,8 @@ async def handle_patient_view(request: CDSRequest) -> CDSResponse: client._construct_request(prefetch_data) # Verify request was constructed - assert len(client.request_data) == 1 - assert client.request_data[0].hook == "patient-view" + assert len(client.requests) == 1 + assert client.requests[0].hook == "patient-view" # Mock HTTP response with patch("httpx.Client") as mock_client_class: @@ -89,7 +89,7 @@ def test_cdshooks_workflows(): client._construct_request(prefetch_data) # Verify request was constructed - assert len(client.request_data) == 1 + assert len(client.requests) == 1 # Mock HTTP response with patch("httpx.Client") as mock_client_class: diff --git a/tests/sandbox/test_clindoc_sandbox.py b/tests/sandbox/test_clindoc_sandbox.py index 2c4193dc..e1ae80d0 100644 --- a/tests/sandbox/test_clindoc_sandbox.py +++ b/tests/sandbox/test_clindoc_sandbox.py @@ -34,7 +34,7 @@ def process_document(cda_request: CdaRequest) -> CdaResponse: client._construct_request(test_document) # Verify request was constructed - assert len(client.request_data) == 1 + assert len(client.requests) == 1 # Mock HTTP response with proper SOAP envelope structure with patch("httpx.Client") as mock_client_class: @@ -83,7 +83,7 @@ def test_notereader_sandbox_workflow_execution(): client._construct_request(clinical_document) # Verify request was constructed 
- assert len(client.request_data) == 1 + assert len(client.requests) == 1 # Mock HTTP response with proper SOAP envelope structure with patch("httpx.Client") as mock_client_class: diff --git a/tests/sandbox/test_mimic_loader.py b/tests/sandbox/test_mimic_loader.py index 1f746f38..0c2614e2 100644 --- a/tests/sandbox/test_mimic_loader.py +++ b/tests/sandbox/test_mimic_loader.py @@ -131,9 +131,9 @@ def test_mimic_loader_loads_single_resource_type( ) assert isinstance(result, dict) - assert "MedicationStatement" in result + assert "medicationstatement" in result # Result dict contains a Bundle - bundle = result["MedicationStatement"] + bundle = result["medicationstatement"] assert type(bundle).__name__ == "Bundle" assert len(bundle.entry) == 2 assert bundle.entry[0].resource.id == "med-1" @@ -157,11 +157,11 @@ def test_mimic_loader_loads_multiple_resource_types( resource_types=["MimicMedication", "MimicCondition"], ) - assert "MedicationStatement" in result - assert "Condition" in result + assert "medicationstatement" in result + assert "condition" in result # Each result value is a Bundle - med_bundle = result["MedicationStatement"] - cond_bundle = result["Condition"] + med_bundle = result["medicationstatement"] + cond_bundle = result["condition"] assert len(med_bundle.entry) == 2 assert len(cond_bundle.entry) == 1 @@ -183,7 +183,7 @@ def test_mimic_loader_sampling_behavior( sample_size=sample_size, ) - bundle = result["MedicationStatement"] + bundle = result["medicationstatement"] assert len(bundle.entry) == expected_count @@ -210,8 +210,8 @@ def test_mimic_loader_deterministic_sampling_with_seed( random_seed=42, ) - bundle1 = result1["MedicationStatement"] - bundle2 = result2["MedicationStatement"] + bundle1 = result1["medicationstatement"] + bundle2 = result2["medicationstatement"] assert bundle1.entry[0].resource.id == bundle2.entry[0].resource.id @@ -251,7 +251,7 @@ def test_mimic_loader_handles_malformed_json(temp_mimic_data_dir): ) # Should load the valid resource despite malformed line - bundle = result["MedicationStatement"] + bundle = result["medicationstatement"] assert len(bundle.entry) == 1 @@ -314,5 +314,5 @@ def test_mimic_loader_skips_resources_without_resource_type(temp_mimic_data_dir) ) # Should only load the valid resource - bundle = result["MedicationStatement"] + bundle = result["medicationstatement"] assert len(bundle.entry) == 1 diff --git a/tests/sandbox/test_sandbox_client.py b/tests/sandbox/test_sandbox_client.py index 47f7d857..5dc22fb7 100644 --- a/tests/sandbox/test_sandbox_client.py +++ b/tests/sandbox/test_sandbox_client.py @@ -28,7 +28,7 @@ def test_load_from_path_single_xml_file(tmp_path): result = client.load_from_path(str(cda_file)) assert result is client - assert len(client.request_data) == 1 + assert len(client.requests) == 1 def test_load_from_path_directory_with_pattern(tmp_path): @@ -46,7 +46,7 @@ def test_load_from_path_directory_with_pattern(tmp_path): client.load_from_path(str(tmp_path), pattern="*.xml") - assert len(client.request_data) == 2 + assert len(client.requests) == 2 def test_load_from_path_directory_all_files(tmp_path): @@ -63,7 +63,7 @@ def test_load_from_path_directory_all_files(tmp_path): client.load_from_path(str(tmp_path)) - assert len(client.request_data) == 2 + assert len(client.requests) == 2 def test_load_from_path_error_handling(tmp_path): @@ -97,7 +97,7 @@ def test_load_free_text_generates_data(tmp_path): column_name="text", random_seed=42, ) - assert len(client.request_data) > 0 + assert len(client.requests) > 0 def 
test_load_free_text_without_synthetic_data(tmp_path): @@ -118,9 +118,9 @@ def test_load_free_text_without_synthetic_data(tmp_path): random_seed=42, ) - assert len(client.request_data) > 0 + assert len(client.requests) > 0 # Verify request was created (but without checking prefetch content details) - assert client.request_data[0].hook == "encounter-discharge" + assert client.requests[0].hook == "encounter-discharge" def test_send_requests_without_data(): @@ -176,8 +176,8 @@ def test_load_from_path_json_prefetch_file(tmp_path): client.load_from_path(str(json_file)) - assert len(client.request_data) == 1 - assert client.request_data[0].hook == "patient-view" + assert len(client.requests) == 1 + assert client.requests[0].hook == "patient-view" def test_load_from_path_invalid_json_prefetch(tmp_path): @@ -189,7 +189,7 @@ def test_load_from_path_invalid_json_prefetch(tmp_path): # Should load the JSON data without error since we're using plain dicts now client.load_from_path(str(json_file)) - assert len(client.request_data) == 1 + assert len(client.requests) == 1 def test_save_results_distinguishes_protocols(tmp_path): @@ -268,12 +268,12 @@ def test_clear_requests(): # Load some data prefetch = {"patient": create_bundle()} client._construct_request(prefetch) - assert len(client.request_data) == 1 + assert len(client.requests) == 1 # Clear and verify result = client.clear_requests() assert result is client # Method chaining - assert len(client.request_data) == 0 + assert len(client.requests) == 0 def test_preview_requests_provides_metadata(): diff --git a/tests/sandbox/test_synthea_loader.py b/tests/sandbox/test_synthea_loader.py index 049c4b46..0910b91b 100644 --- a/tests/sandbox/test_synthea_loader.py +++ b/tests/sandbox/test_synthea_loader.py @@ -134,11 +134,11 @@ def test_synthea_loader_supports_multiple_file_specification_methods( result = loader.load(data_dir=str(temp_synthea_data_dir), **patient_spec) assert isinstance(result, dict) - assert "Patient" in result and "Condition" in result + assert "patient" in result and "condition" in result # Returns Bundle objects - assert type(result["Patient"]).__name__ == "Bundle" - assert len(result["Patient"].entry) == 1 - assert len(result["Condition"].entry) == 2 + assert type(result["patient"]).__name__ == "Bundle" + assert len(result["patient"].entry) == 1 + assert len(result["condition"].entry) == 2 def test_synthea_loader_filters_and_groups_resources_by_type( @@ -155,9 +155,9 @@ def test_synthea_loader_filters_and_groups_resources_by_type( ) # Only requested types included - assert set(result.keys()) == {"Condition", "MedicationStatement"} - assert len(result["Condition"].entry) == 2 - assert len(result["MedicationStatement"].entry) == 1 + assert set(result.keys()) == {"condition", "medicationstatement"} + assert len(result["condition"].entry) == 2 + assert len(result["medicationstatement"].entry) == 1 @pytest.mark.parametrize("sample_size,expected_count", [(1, 1), (2, 2)]) @@ -174,7 +174,7 @@ def test_synthea_loader_sampling_behavior( sample_size=sample_size, ) - assert len(result["Condition"].entry) == expected_count + assert len(result["condition"].entry) == expected_count def test_synthea_loader_deterministic_sampling_with_seed( @@ -198,8 +198,8 @@ def test_synthea_loader_deterministic_sampling_with_seed( ) assert ( - result1["Condition"].entry[0].resource.id - == result2["Condition"].entry[0].resource.id + result1["condition"].entry[0].resource.id + == result2["condition"].entry[0].resource.id ) @@ -295,5 +295,5 @@ def 
test_synthea_loader_skips_resources_without_resource_type(temp_synthea_data_ result = loader.load(data_dir=str(temp_synthea_data_dir)) # Should only load valid Patient resource - assert "Patient" in result - assert len(result["Patient"].entry) == 1 + assert "patient" in result + assert len(result["patient"].entry) == 1 From fb0ad668c3098d309648f661186a0b769b54c14e Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Fri, 31 Oct 2025 20:15:44 +0000 Subject: [PATCH 09/12] Update docs --- docs/cookbook/clinical_coding.md | 3 +- docs/cookbook/discharge_summarizer.md | 6 +- docs/quickstart.md | 12 +- docs/reference/utilities/data_generator.md | 6 +- docs/reference/utilities/sandbox.md | 267 +++++++++++++++++---- 5 files changed, 236 insertions(+), 58 deletions(-) diff --git a/docs/cookbook/clinical_coding.md b/docs/cookbook/clinical_coding.md index 9ea0b277..cd4bf482 100644 --- a/docs/cookbook/clinical_coding.md +++ b/docs/cookbook/clinical_coding.md @@ -224,8 +224,7 @@ from healthchain.sandbox import SandboxClient # Create sandbox client for SOAP/CDA testing client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/notereader/ProcessDocument", + url="http://localhost:8000/notereader/ProcessDocument", workflow="sign-note-inpatient", protocol="soap" ) diff --git a/docs/cookbook/discharge_summarizer.md b/docs/cookbook/discharge_summarizer.md index 96a9a7af..1af15122 100644 --- a/docs/cookbook/discharge_summarizer.md +++ b/docs/cookbook/discharge_summarizer.md @@ -159,16 +159,14 @@ from healthchain.sandbox import SandboxClient # Create sandbox client for testing client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/cds/cds-services/discharge-summarizer", + url="http://localhost:8000/cds/cds-services/discharge-summarizer", workflow="encounter-discharge" ) # Load discharge notes from CSV and generate FHIR data client.load_free_text( csv_path="data/discharge_notes.csv", - column_name="text", - workflow="encounter-discharge" + column_name="text" ) ``` diff --git a/docs/quickstart.md b/docs/quickstart.md index 2a93fe66..be40bbb1 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -185,15 +185,19 @@ Test your AI applications in realistic healthcare contexts with `SandboxClient` ```python from healthchain.sandbox import SandboxClient -# Create client and load test data +# Create client with service URL and workflow client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/cds/cds-services/my-service", + url="http://localhost:8000/cds/cds-services/my-service", workflow="encounter-discharge" ) # Load from datasets or files -client.load_from_registry("synthea", num_patients=5) +client.load_from_registry( + "synthea-patient", + data_dir="./data/synthea", + resource_types=["Condition", "DocumentReference"], + sample_size=3 +) responses = client.send_requests() ``` diff --git a/docs/reference/utilities/data_generator.md b/docs/reference/utilities/data_generator.md index a7324cf8..33958143 100644 --- a/docs/reference/utilities/data_generator.md +++ b/docs/reference/utilities/data_generator.md @@ -10,7 +10,7 @@ According to the [UK ONS synthetic data classification](https://www.ons.gov.uk/m ## CDS Data Generator -The `.generate_prefetch()` method will return a `Prefetch` model with the `prefetch` field populated with a dictionary of FHIR resources. Each key in the dictionary corresponds to a FHIR resource type, and the value is a list of FHIR resources of that type. 
For more information, check out the [CDS Hooks documentation](https://cds-hooks.org/specification/current/#providing-fhir-resources-to-a-cds-service).
+The `.generate_prefetch()` method will return a dictionary of resources. Each key in the dictionary corresponds to a FHIR resource type, and the value is a list of FHIR resources or a Bundle of that type. For more information, check out the [CDS Hooks documentation](https://cds-hooks.org/specification/current/#providing-fhir-resources-to-a-cds-service).
 
 For each workflow, a pre-configured list of FHIR resources is randomly generated and placed in the `prefetch` field of a `CDSRequest`.
 
@@ -33,8 +33,7 @@ You can use the data generator with `SandboxClient.load_free_text()` or standalo
 
     # Create client
     client = SandboxClient(
-        api_url="http://localhost:8000",
-        endpoint="/cds/cds-services/my-service",
+        url="http://localhost:8000/cds/cds-services/my-service",
        workflow="encounter-discharge"
     )
 
@@ -42,7 +41,6 @@ You can use the data generator with `SandboxClient.load_free_text()` or standalo
     client.load_free_text(
         csv_path="./data/discharge_notes.csv",
         column_name="text",
-        workflow="encounter-discharge",
         random_seed=42
     )
 
diff --git a/docs/reference/utilities/sandbox.md b/docs/reference/utilities/sandbox.md
index 6cc3a2a5..a64bcaf3 100644
--- a/docs/reference/utilities/sandbox.md
+++ b/docs/reference/utilities/sandbox.md
@@ -9,15 +9,19 @@ Test CDS Hooks workflows with synthetic data:
 ```python
 from healthchain.sandbox import SandboxClient
 
-# Create client
+# Create client with full service URL and workflow
 client = SandboxClient(
-    api_url="http://localhost:8000",
-    endpoint="/cds/cds-services/my-service",
+    url="http://localhost:8000/cds/cds-services/my-service",
     workflow="encounter-discharge"
 )
 
 # Load data and send requests
-client.load_from_registry("synthea", num_patients=5)
+client.load_from_registry(
+    "synthea-patient",
+    data_dir="./data/synthea",
+    resource_types=["Condition", "MedicationStatement"],
+    sample_size=5
+)
 responses = client.send_requests()
 ```
 
@@ -29,25 +33,36 @@ responses = client.send_requests()
 from healthchain.sandbox import SandboxClient
 
 client = SandboxClient(
-    api_url="http://localhost:8000",
-    endpoint="/cds/cds-services/my-service",
-    workflow="encounter-discharge",  # Optional, auto-detected if not provided
+    url="http://localhost:8000/cds/cds-services/my-service",
+    workflow="encounter-discharge",  # Required
     protocol="rest",  # "rest" for CDS Hooks, "soap" for CDA
     timeout=10.0
 )
 ```
 
+### Workflow-Protocol Compatibility
+
+The client validates workflow-protocol combinations at initialization:
+
+| Protocol | Compatible Workflows |
+|----------|---------------------|
+| **REST** | `patient-view`, `encounter-discharge`, `order-select`, `order-sign` |
+| **SOAP** | `sign-note-inpatient`, `sign-note-outpatient` |
+
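+For example, an incompatible pairing is rejected as soon as the client is constructed. A minimal sketch (the exact exception type raised by the validation is an assumption, not part of the documented API):
+
+```python
+from healthchain.sandbox import SandboxClient
+
+# A SOAP-only workflow combined with the REST protocol
+try:
+    SandboxClient(
+        url="http://localhost:8000/cds/cds-services/my-service",
+        workflow="sign-note-inpatient",
+        protocol="rest",
+    )
+except Exception as e:  # assumed to raise on validation failure
+    print(f"Rejected at init: {e}")
+```
+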
client.load_from_path("./data/cda_files/", pattern="*.xml") ``` @@ -65,38 +80,180 @@ client = SandboxClient( client.load_free_text( csv_path="./data/discharge_notes.csv", column_name="text", - workflow="encounter-discharge", + generate_synthetic=True, # Include synthetic FHIR resources random_seed=42 ) ``` +## Dataset Loaders + +HealthChain provides two pre-configured dataset loaders for testing with common FHIR testing datasets. Use `load_from_registry()` to access these datasets. + +### Overview + +| Dataset | Type | Use Case | File Format | +|---------|------|----------|-------------| +| **MIMIC-on-FHIR** | Real de-identified | Testing with realistic clinical patterns | `.ndjson.gz` per resource type | +| **Synthea** | Synthetic | Quick demos, single patient testing | `.json` Bundle per patient | + + +**When to use:** + +- **MIMIC**: Test with real-world data distributions and clinical patterns from a major hospital +- **Synthea**: Quick demos without downloading large datasets; ideal for single-patient workflows + +### MIMIC-on-FHIR Loader + +Real de-identified clinical data from Beth Israel Deaconess Medical Center in FHIR R4 format. + +**Directory Structure:** + +``` +data_dir/ +└── fhir/ + ├── MimicMedication.ndjson.gz + ├── MimicCondition.ndjson.gz + ├── MimicObservation.ndjson.gz + └── ... (other resource types) +``` + +**Usage:** + +=== "Basic" + ```python + client.load_from_registry( + "mimic-on-fhir", + data_dir="./data/mimic-iv-fhir", + resource_types=["MimicMedication", "MimicCondition"] + ) + ``` + +=== "With Sampling" + ```python + # Load random sample for faster testing + client.load_from_registry( + "mimic-on-fhir", + data_dir="./data/mimic-iv-fhir", + resource_types=["MimicMedication", "MimicObservation"], + sample_size=5, # 5 resources per type + random_seed=42 # Reproducible sampling + ) + ``` + +**Available Resource Types:** + +`MimicMedication`, `MimicCondition`, `MimicObservation`, `MimicProcedure`, `MimicEncounter`, `MimicPatient`, and more. Check your dataset's `/fhir` directory for available types. + +!!! note "Setup Requirements" + The full MIMIC-on-FHIR dataset requires credentialed PhysioNet access, but you can download the [demo dataset without credentials](https://physionet.org/content/mimic-iv-fhir-demo/2.1.0/) (100 patients). + +### Synthea Loader + +Synthetic patient data generated by Synthea, containing realistic FHIR Bundles (typically 100-500 resources per patient). + +**Directory Structure:** + +``` +data_dir/ +├── FirstName123_LastName456_uuid.json +├── FirstName789_LastName012_uuid.json +└── ... (one .json file per patient) +``` + +**Usage:** + +=== "First Patient (Quick Demo)" + ```python + # Automatically loads first .json file found + client.load_from_registry( + "synthea-patient", + data_dir="./synthea_sample_data_fhir_latest" + resource_type=["Condition"], # Finds all Condition resources, loads all if not specified + ) + ``` + +=== "By Patient ID" + ```python + client.load_from_registry( + "synthea-patient", + data_dir="./synthea_sample_data_fhir_latest", + patient_id="a969c177-a995-7b89-7b6d-885214dfa253", + resource_type=["Condition"], + ) + ``` + +=== "With Resource Filtering" + ```python + # Load specific resource types with sampling + client.load_from_registry( + "synthea-patient", + data_dir="./synthea_sample_data_fhir_latest", + resource_types=["Condition", "MedicationRequest", "Observation"], + sample_size=5, # 5 resources per type + random_seed=42, + ) + ``` + + +!!! 
tip "Getting Synthea Data" + Generate synthetic patients using [Synthea](https://github.com/synthetichealth/synthea) or [download sample data](https://synthea.mitre.org/downloads) from their releases. Each patient Bundle is self-contained with all clinical history. + +### Managing Requests + +```python +# Preview queued requests before sending +previews = client.preview_requests(limit=3) +for preview in previews: + print(f"Request {preview['index']}: {preview['type']}") + +# Get full request data for inspection +requests_dict = client.get_request_data(format="dict") +requests_json = client.get_request_data(format="json") +requests_raw = client.get_request_data(format="raw") + +# Clear queued requests to start fresh +client.clear_requests() +client.load_from_path("./different_data.xml") +``` + ### Sending Requests ```python # Send all queued requests responses = client.send_requests() -# Save results +# Save results to disk client.save_results("./output/") -# Get status +# Get client status status = client.get_status() print(status) +# { +# "sandbox_id": "...", +# "url": "http://localhost:8000/cds/...", +# "protocol": "rest", +# "workflow": "encounter-discharge", +# "requests_queued": 5, +# "responses_received": 5 +# } ``` -## Available Testing Scenarios +### Using Context Manager -**CDS Hooks** (REST protocol): +For automatic result saving on successful completion: -- `workflow`: "patient-view", "encounter-discharge", "order-select", etc. -- Load FHIR Prefetch data -- Test clinical decision support services - -**Clinical Documentation** (SOAP protocol): - -- `workflow`: "sign-note-inpatient", "sign-note-outpatient" -- Load CDA XML documents -- Test SOAP/CDA document processing +```python +with SandboxClient( + url="http://localhost:8000/cds/cds-services/my-service", + workflow="encounter-discharge" +) as client: + client.load_free_text( + csv_path="./data/notes.csv", + column_name="text" + ) + responses = client.send_requests() + # Results automatically saved to ./output/ on successful exit +``` ## Complete Examples @@ -106,14 +263,17 @@ print(status) # Initialize for CDS Hooks client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/cds/cds-services/discharge-summarizer", - workflow="encounter-discharge", - protocol="rest" + url="http://localhost:8000/cds/cds-services/sepsis-alert", + workflow="patient-view" ) # Load and send - client.load_from_registry("synthea", num_patients=3) + client.load_from_registry( + "mimic-on-fhir", + data_dir="./data/mimic-iv-fhir", + resource_types=["MimicConditionED", "MimicObservation"], + sample_size=10, + responses = client.send_requests() client.save_results("./output/") ``` @@ -124,18 +284,39 @@ print(status) # Initialize for SOAP/CDA client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/notereader/fhir/", + url="http://localhost:8000/notereader/ProcessDocument/", workflow="sign-note-inpatient", protocol="soap" ) - # Load CDA files + # Load CDA files from directory client.load_from_path("./data/cda_files/", pattern="*.xml") responses = client.send_requests() client.save_results("./output/") ``` +=== "Free Text CSV" + ```python + from healthchain.sandbox import SandboxClient + + # Initialize client + client = SandboxClient( + url="http://localhost:8000/cds/cds-services/my-service", + workflow="patient-view" + ) + + # Load text data + client.load_free_text( + csv_path="./data/clinical_notes.csv", + column_name="note_text", + generate_synthetic=True + ) + + # Send and save + responses = client.send_requests() + 
client.save_results("./output/") + ``` + ## Migration Guide !!! warning "Decorator Pattern Deprecated" @@ -155,16 +336,14 @@ class TestCDS(ClinicalDecisionSupport): ```python client = SandboxClient( - api_url="http://localhost:8000", - endpoint="/cds/cds-services/my-service", + url="http://localhost:8000/cds/cds-services/my-service", workflow="patient-view" ) -client.load_from_registry("synthea", num_patients=5) +client.load_from_registry( + "synthea-patient", + data_dir="./data/synthea", + resource_types=["Condition", "Observation"], + sample_size=10 +) responses = client.send_requests() ``` - -## Next Steps - -1. **Testing**: Use `SandboxClient` for local development and testing -2. **Production**: Migrate to [HealthChainAPI Gateway](../gateway/gateway.md) -3. **Protocols**: See [CDS Hooks](../gateway/cdshooks.md) and [SOAP/CDA](../gateway/soap_cda.md) From 5a126041f7a2f23ed5a4f5460b66d761309f9ff9 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Thu, 6 Nov 2025 16:40:14 +0000 Subject: [PATCH 10/12] Remove redundant logging --- healthchain/sandbox/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/healthchain/sandbox/datasets.py b/healthchain/sandbox/datasets.py index 44ff9f4d..17da56a6 100644 --- a/healthchain/sandbox/datasets.py +++ b/healthchain/sandbox/datasets.py @@ -64,7 +64,7 @@ def load(cls, name: str, data_dir: str, **kwargs) -> Dict: ) loader = cls._datasets[name] - log.info(f"Loading dataset: {name}") + return loader.load(data_dir=data_dir, **kwargs) @classmethod From 03533becbe21fae986b51933b602b5c954521397 Mon Sep 17 00:00:00 2001 From: jenniferjiangkells Date: Thu, 6 Nov 2025 16:41:28 +0000 Subject: [PATCH 11/12] Remove client.get_request_data raw format - direct access instead --- healthchain/sandbox/sandboxclient.py | 27 ++++++++++++++++----------- tests/sandbox/test_sandbox_client.py | 1 - 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/healthchain/sandbox/sandboxclient.py b/healthchain/sandbox/sandboxclient.py index db7a26e9..da121501 100644 --- a/healthchain/sandbox/sandboxclient.py +++ b/healthchain/sandbox/sandboxclient.py @@ -379,34 +379,38 @@ def preview_requests(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: return previews def get_request_data( - self, format: Literal["raw", "dict", "json"] = "dict" - ) -> Union[List, str]: + self, format: Literal["dict", "json"] = "dict" + ) -> Union[List[Dict], str]: """ - Get raw request data for inspection. + Get transformed request data for inspection. - Allows direct access to request data for debugging or custom processing. + Allows access to serialized request data for debugging or custom processing. 
+
+        For direct access to Pydantic models, use the `requests` attribute:
+            >>> for request in client.requests:
+            ...     print(request.model_dump())
 
         Args:
-            format: Return format - "raw" for list of request objects,
-                "dict" for list of dictionaries, "json" for JSON string
+            format: Return format - "dict" for list of dictionaries,
+                "json" for JSON string
 
         Returns:
             Request data in specified format
 
         Raises:
-            ValueError: If format is not one of "raw", "dict", or "json"
+            ValueError: If format is not "dict" or "json"
 
         Examples:
             >>> client.load_from_path("data.xml")
+            >>> # Access raw Pydantic models directly
+            >>> for request in client.requests:
+            ...     print(request.model_dump(exclude_none=True))
             >>> # Get as dictionaries
             >>> dicts = client.get_request_data("dict")
             >>> # Get as JSON string
             >>> json_str = client.get_request_data("json")
             >>> print(json_str)
         """
-        if format == "raw":
-            return self.requests
-        elif format == "dict":
+        if format == "dict":
             result = []
             for req in self.requests:
                 if hasattr(req, "model_dump"):
@@ -420,7 +424,8 @@ def get_request_data(
             return json.dumps(self.get_request_data("dict"), indent=2)
         else:
             raise ValueError(
-                f"Invalid format '{format}'. Must be 'raw', 'dict', or 'json'"
+                f"Invalid format '{format}'. Must be 'dict' or 'json'. "
+                "For raw Pydantic models, access the 'requests' attribute directly."
             )
 
     def send_requests(self) -> List[Dict]:
diff --git a/tests/sandbox/test_sandbox_client.py b/tests/sandbox/test_sandbox_client.py
index 5dc22fb7..d7a71614 100644
--- a/tests/sandbox/test_sandbox_client.py
+++ b/tests/sandbox/test_sandbox_client.py
@@ -323,7 +323,6 @@ def test_preview_requests_respects_limit():
 @pytest.mark.parametrize(
     "format_type,check",
     [
-        ("raw", lambda data: isinstance(data, list)),
         ("dict", lambda data: isinstance(data, list) and isinstance(data[0], dict)),
         ("json", lambda data: isinstance(data, str) and json.loads(data)),
     ],

From 672cd5314fcf8c8694bf7cfbed0073118ca4eecc Mon Sep 17 00:00:00 2001
From: jenniferjiangkells
Date: Thu, 6 Nov 2025 16:44:17 +0000
Subject: [PATCH 12/12] Update docs

---
 README.md                                  |  24 ++
 docs/cookbook/clinical_coding.md           |   4 +
 docs/cookbook/discharge_summarizer.md      |   4 +
 docs/quickstart.md                         | 188 +++++++------
 .../pipeline/integrations/integrations.md  |   2 +-
 docs/reference/pipeline/pipeline.md        | 103 ++++---
 docs/reference/utilities/sandbox.md        | 251 +++++++++++++-----
 7 files changed, 372 insertions(+), 204 deletions(-)

diff --git a/README.md b/README.md
index 26d704d4..9ef165b1 100644
--- a/README.md
+++ b/README.md
@@ -189,6 +189,30 @@ if __name__ == "__main__":
     uvicorn.run(app, port=8000)
 ```
 
+### Testing with Sandbox [[Docs](https://dotimplement.github.io/HealthChain/reference/utilities/sandbox)]
+
+```python
+from healthchain.sandbox import SandboxClient
+
+# Test CDS Hooks service with synthetic data
+client = SandboxClient(
+    url="http://localhost:8000/cds/cds-services/discharge-summary",
+    workflow="encounter-discharge"
+)
+
+# Load from test datasets
+client.load_from_registry(
+    "synthea-patient",
+    data_dir="./data/synthea",
+    resource_types=["Condition", "DocumentReference"],
+    sample_size=5
+)
+
+# Send requests and save results
+responses = client.send_requests()
+client.save_results("./output/")
+```
+
 ## Road Map
 
 - [ ] 🔍 Data provenance and audit trails tracking
diff --git a/docs/cookbook/clinical_coding.md b/docs/cookbook/clinical_coding.md
index cd4bf482..46b34a05 100644
--- a/docs/cookbook/clinical_coding.md
+++ b/docs/cookbook/clinical_coding.md
@@ -231,6 +231,10 @@ client = SandboxClient(
 # Load sample CDA 
document client.load_from_path("./data/notereader_cda.xml") + +# Inspect CDA document before sending +# for request in client.requests: +# print(request.document[:1000]) # View first 1000 chars of CDA XML ``` ## Run the Complete Example diff --git a/docs/cookbook/discharge_summarizer.md b/docs/cookbook/discharge_summarizer.md index 1af15122..897fd010 100644 --- a/docs/cookbook/discharge_summarizer.md +++ b/docs/cookbook/discharge_summarizer.md @@ -168,6 +168,10 @@ client.load_free_text( csv_path="data/discharge_notes.csv", column_name="text" ) + +# Inspect requests before sending to verify data +# for request in client.requests: +# print(request.prefetch.get('document')) # Get DocumentReference ``` !!! tip "Learn More About Test Data Generation" diff --git a/docs/quickstart.md b/docs/quickstart.md index be40bbb1..f09ac280 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -1,14 +1,18 @@ # Quickstart After [installing HealthChain](installation.md), get up to speed quickly with the core components before diving further into the [full documentation](reference/index.md)! +HealthChain has three main components: -HealthChain provides three core tools for healthcare AI integration: **Gateway** for connecting to multiple healthcare systems, **Pipelines** for FHIR-native AI workflows, and **InteropEngine** for healthcare data format conversion between FHIR, CDA, and HL7v2. +- **Gateway:** Connect to multiple healthcare systems with a single API. +- **Pipelines:** Easily build data processing pipelines for both clinical text and [FHIR](https://www.hl7.org/fhir/) data. +- **InteropEngine:** Seamlessly convert between data formats like [FHIR](https://www.hl7.org/fhir/), [HL7 CDA](https://www.hl7.org/implement/standards/product_brief.cfm?product_id=7), and [HL7v2](https://www.hl7.org/implement/standards/product_brief.cfm?product_id=185). -## Core Components -### HealthChainAPI Gateway 🔌 +## Core Components 🧩 -The HealthChainAPI provides a unified interface for connecting your AI models to multiple healthcare systems through a single API. Handle FHIR, CDS Hooks, and SOAP/CDA protocols with OAuth2 authentication. +### Gateway 🔌 + +The [**HealthChainAPI**](./reference/gateway/api.md) provides a unified interface for connecting your AI application and models to multiple healthcare systems through a single API. It automatically handles [FHIR API](https://www.hl7.org/fhir/http.html), [CDS Hooks](https://cds-hooks.org/), and [SOAP/CDA protocols](https://www.hl7.org/implement/standards/product_brief.cfm?product_id=7) with [OAuth2 authentication](https://oauth.net/2/). [(Full Documentation on Gateway)](./reference/gateway/gateway.md) @@ -41,51 +45,52 @@ app.register_gateway(fhir) ### Pipeline 🛠️ -HealthChain Pipelines provide a flexible way to build and manage processing pipelines for NLP and ML tasks that can easily integrate with electronic health record (EHR) systems. +HealthChain [**Pipelines**](./reference/pipeline/pipeline.md) provide a flexible way to build and manage processing pipelines for NLP and ML tasks that can easily integrate with electronic health record (EHR) systems. You can build pipelines with three different approaches: -#### 1. Build Your Own Pipeline with Inline Functions +#### 1. Quick Inline Functions + +For quick experiments, start by picking the right [**Container**](./reference/pipeline/data_container.md) when you initialize your pipeline (e.g. `Pipeline[Document]()` for clinical text). -This is the most flexible approach, ideal for quick experiments and prototyping. 
Initialize a pipeline type hinted with the container type you want to process, then add components to your pipeline with the `@add_node` decorator. +Containers make your pipeline FHIR-native by loading and transforming your data (free text, EHR resources, etc.) into structured FHIR-ready formats. Just add your processing functions with `@add_node`, compile with `.build()`, and your pipeline is ready to process FHIR data end-to-end. -Compile the pipeline with `.build()` to use it. +[(Full Documentation on Container)](./reference/pipeline/data_container.md) ```python from healthchain.pipeline import Pipeline from healthchain.io import Document +from healthchain.fhir import create_condition -nlp_pipeline = Pipeline[Document]() +pipeline = Pipeline[Document]() -@nlp_pipeline.add_node -def tokenize(doc: Document) -> Document: - doc.tokens = doc.text.split() - return doc +@pipeline.add_node +def extract_diabetes(doc: Document) -> Document: + """Adds a FHIR Condition for diabetes if mentioned in the text.""" + if "diabetes" in doc.text.lower(): + condition = create_condition( + code="73211009", + display="Diabetes mellitus", + ) + doc.fhir.problem_list.append(condition) -@nlp_pipeline.add_node -def pos_tag(doc: Document) -> Document: - doc.pos_tags = ["NOUN" if token[0].isupper() else "VERB" for token in doc.tokens] return doc -nlp = nlp_pipeline.build() - -doc = Document("Patient has a fracture of the left femur.") -doc = nlp(doc) +pipe = pipeline.build() -print(doc.tokens) -print(doc.pos_tags) +doc = Document("Patient has a history of diabetes.") +doc = pipe(doc) -# ['Patient', 'has', 'fracture', 'of', 'left', 'femur.'] -# ['NOUN', 'VERB', 'VERB', 'VERB', 'VERB', 'VERB'] +print(doc.fhir.problem_list) # FHIR Condition ``` -#### 2. Build Your Own Pipeline with Components, Models, and Connectors +#### 2. Build With Components and Adapters -Components are stateful - they're classes instead of functions. They can be useful for grouping related processing steps together, setting configurations, or wrapping specific model loading steps. +[**Components**](./reference/) are reusable, stateful classes that encapsulate specific processing logic, model loading, or configuration for your pipeline. Use them to organize complex workflows, handle model state, or integrate third-party libraries with minimal setup. -HealthChain comes with a few pre-built components, but you can also easily add your own. You can find more details on the [Components](./reference/pipeline/components/components.md) and [Integrations](./reference/pipeline/integrations/integrations.md) documentation pages. +HealthChain provides a set of ready-to-use [**NLP Integrations**](./reference/pipeline/integrations/integrations.md) for common clinical NLP and ML tasks, and you can easily implement your own. -Add components to your pipeline with the `.add_node()` method and compile with `.build()`. +[(Full Documentation on Components)](./reference/pipeline/components/components.md) ```python from healthchain.pipeline import Pipeline @@ -104,18 +109,14 @@ doc = Document("Patient presents with hypertension.") output = pipe(doc) ``` -Let's go one step further! You can use [Adapters](./reference/pipeline/adapters/adapters.md) to work directly with [CDA](https://www.hl7.org.uk/standards/hl7-standards/cda-clinical-document-architecture/) and [FHIR](https://hl7.org/fhir/) data received from healthcare system APIs. Adapters handle format conversion while keeping your pipeline pure ML processing. +You can process legacy healthcare data formats too. 
[**Adapters**](./reference/pipeline/adapters/adapters.md) convert between healthcare formats like [CDA](https://www.hl7.org/implement/standards/product_brief.cfm?product_id=7) and your pipeline — just parse, process, and format without worrying about low-level data conversion. + +[(Full Documentation on Adapters)](./reference/pipeline/adapters/adapters.md) ```python -from healthchain.pipeline import Pipeline -from healthchain.pipeline.components import SpacyNLP from healthchain.io import CdaAdapter from healthchain.models import CdaRequest -pipeline = Pipeline() -pipeline.add_node(SpacyNLP.from_model_id("en_core_sci_sm")) -pipe = pipeline.build() - # Use adapter for format conversion adapter = CdaAdapter() cda_request = CdaRequest(document="") @@ -128,21 +129,14 @@ output = adapter.format(processed_doc) #### 3. Use Prebuilt Pipelines -Prebuilt pipelines are pre-configured collections of Components and Models optimized for specific healthcare AI use cases. They offer the highest level of abstraction and are the easiest way to get started. +Prebuilt pipelines are the fastest way to jump into healthcare AI with minimal setup: just load and run. Each pipeline bundles best-practice components and models for common clinical tasks (like coding or summarization) and handles all FHIR/CDA conversion for you. Easily customize or extend pipelines by adding/removing components, or swap models as needed. -For a full list of available prebuilt pipelines and details on how to configure and customize them, see the [Pipelines](./reference/pipeline/pipeline.md) documentation page. +[(Full Documentation on Pipelines)](./reference/pipeline/pipeline.md#prebuilt-) ```python from healthchain.pipeline import MedicalCodingPipeline from healthchain.models import CdaRequest -# Load from pre-built chain -chain = ChatPromptTemplate.from_template("Summarize: {text}") | ChatOpenAI() -pipeline = MedicalCodingPipeline.load(chain, source="langchain") - -# Or load from model ID -pipeline = MedicalCodingPipeline.from_model_id("facebook/bart-large-cnn", source="huggingface") - # Or load from local model pipeline = MedicalCodingPipeline.from_local_model("./path/to/model", source="spacy") @@ -152,7 +146,7 @@ output = pipeline.process_request(cda_request) ### Interoperability 🔄 -The HealthChain Interoperability module provides tools for converting between different healthcare data formats, including HL7 FHIR, HL7 CDA, and HL7v2 messages. +The HealthChain Interoperability module provides tools for converting between different healthcare data formats, including FHIR, CDA, and HL7v2 messages. [(Full Documentation on Interoperability Engine)](./reference/interop/interop.md) @@ -176,34 +170,88 @@ cda_document = engine.from_fhir(fhir_resources, dest_format=FormatType.CDA) ## Utilities ⚙️ -### Sandbox Testing +### Sandbox Client 🧪 -Test your AI applications in realistic healthcare contexts with `SandboxClient` for CDS Hooks and clinical documentation workflows. +Use [**SandboxClient**](./reference/utilities/sandbox.md) to quickly test your app against real-world EHR scenarios like CDS Hooks or Clinical Documentation Improvement (CDI) workflows. Load test datasets, send requests to your service, and validate responses in a few lines of code. 
[(Full Documentation on Sandbox)](./reference/utilities/sandbox.md) +#### Workflows + +A [**workflow**](./reference/utilities/sandbox.md#workflow-protocol-compatibility) represents a specific event in an EHR system that triggers your service (e.g., `patient-view` when opening a patient chart, `encounter-discharge` when discharging a patient). + +Workflows determine the request structure, required FHIR resources, and validation rules. Different workflows are compatible with different protocols: + +| Workflow Type | Protocol | Example Workflows | +|-------------------------------------|------------|--------------------------------------------------------| +| **CDS Hooks** | REST | `patient-view`, `order-select`, `order-sign`, `encounter-discharge` | +| **Clinical Documentation** | SOAP | `sign-note-inpatient`, `sign-note-outpatient` | + + +#### Available Dataset Loaders + +[**Dataset Loaders**](./reference/utilities/sandbox.md#dataset-loaders) are shortcuts for loading common clinical test datasets from file. Currently available: + +| Dataset Key | Description | FHIR Version | Source | Download Link | +|--------------------|---------------------------------------------|--------------|-----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------| +| `mimic-on-fhir` | **MIMIC-IV on FHIR Demo Dataset** | R4 | [PhysioNet Project](https://physionet.org/content/mimic-iv-fhir-demo/2.1.0/) | [Download ZIP](https://physionet.org/content/mimic-iv-fhir-demo/get-zip/2.1.0/) (49.5 MB) | +| `synthea-patient` | **Synthea FHIR Patient Records** | R4 | [Synthea Downloads](https://synthea.mitre.org/downloads) | [Download ZIP](https://arc.net/l/quote/hoquexhy) (100 Sample, 36 MB) | + + +```python +from healthchain.sandbox import list_available_datasets + +# See all registered datasets with descriptions +datasets = list_available_datasets() +print(datasets) +``` + +#### Basic Usage + ```python from healthchain.sandbox import SandboxClient -# Create client with service URL and workflow +# Initialize client with your service URL and workflow client = SandboxClient( - url="http://localhost:8000/cds/cds-services/my-service", - workflow="encounter-discharge" + url="http://localhost:8000/cds/patient-view", + workflow="patient-view" ) -# Load from datasets or files +# Load test data from a registered dataset client.load_from_registry( "synthea-patient", data_dir="./data/synthea", - resource_types=["Condition", "DocumentReference"], + resource_types=["Condition", "MedicationStatement"], sample_size=3 ) + +# Optionally inspect before sending +client.preview_requests() # See what will be sent +client.get_status() # Check client state + +# Send requests to your service responses = client.send_requests() ``` -### FHIR Helpers +For clinical documentation workflows using SOAP/CDA: + +```python +# Use context manager for automatic result saving +with SandboxClient( + url="http://localhost:8000/notereader/ProcessDocument", + workflow="sign-note-inpatient", + protocol="soap" +) as client: + client.load_from_path("./cookbook/data/notereader_cda.xml") + responses = client.send_requests() + # Results automatically saved to ./output/ on success +``` + +### FHIR Helpers 🔥 -The `fhir` module provides a set of helper functions for working with FHIR resources. +Use `healthchain.fhir` helpers to quickly create and manipulate FHIR resources (like `Condition`, `Observation`, etc.) 
in your code, ensuring they’re standards-compliant with minimal boilerplate. + +[(Full Documentation on FHIR Helpers)](./reference/utilities/fhir_helpers.md) ```python from healthchain.fhir import create_condition @@ -217,37 +265,5 @@ condition = create_condition( ) ``` -[(Full Documentation on FHIR Helpers)](./reference/utilities/fhir_helpers.md) - -### Data Generator - -You can use the data generator to generate synthetic FHIR data for testing. - -The `CdsDataGenerator` generates synthetic [FHIR](https://hl7.org/fhir/) data as [Pydantic](https://docs.pydantic.dev/) models suitable for different CDS workflows. Use it standalone or with `SandboxClient.load_free_text()` to include text-based data. - -[(Full Documentation on Data Generators)](./reference/utilities/data_generator.md) - -```python -from healthchain.sandbox.generators import CdsDataGenerator -from healthchain.sandbox.workflows import Workflow - -# Initialize data generator -data_generator = CdsDataGenerator() - -# Generate FHIR resources for specific workflow -data_generator.set_workflow(Workflow.encounter_discharge) -data = data_generator.generate_prefetch() - -print(data.model_dump()) - -# { -# "prefetch": { -# "encounter": { -# "resourceType": ... -# } -# } -# } -``` - ## Going further ✨ Check out our [Cookbook](cookbook/index.md) section for more worked examples! HealthChain is still in its early stages, so if you have any questions please feel free to reach us on [Github](https://github.com/dotimplement/HealthChain/discussions) or [Discord](https://discord.gg/UQC6uAepUz). diff --git a/docs/reference/pipeline/integrations/integrations.md b/docs/reference/pipeline/integrations/integrations.md index 390fe7d7..950d4376 100644 --- a/docs/reference/pipeline/integrations/integrations.md +++ b/docs/reference/pipeline/integrations/integrations.md @@ -1,4 +1,4 @@ -# HealthChain Integrations +# NLP Integrations This document provides an overview of the integration components available in the HealthChain package. These components allow you to easily incorporate popular NLP libraries into your HealthChain pipelines. diff --git a/docs/reference/pipeline/pipeline.md b/docs/reference/pipeline/pipeline.md index 1b7195ad..5fc99da9 100644 --- a/docs/reference/pipeline/pipeline.md +++ b/docs/reference/pipeline/pipeline.md @@ -1,12 +1,15 @@ # Pipeline -HealthChain pipelines enable FHIR-native workflows that integrate directly with EHR systems. Pipelines handle the complexities of healthcare data standards like [CDA (Clinical Document Architecture)](https://www.hl7.org.uk/standards/hl7-standards/cda-clinical-document-architecture/) and [FHIR (Fast Healthcare Interoperability Resources)](https://build.fhir.org/), allowing you to focus on building AI models while maintaining production-ready interoperability. +HealthChain pipelines help you quickly build data processing workflows that integrate seamlessly with EHR systems. They support healthcare formats like [FHIR](https://build.fhir.org/) out of the box and include built-in NLP to process free-text and structured clinical data—so you can focus on AI, not integration hassles. -You can either use prebuilt pipelines optimized for common clinical workflows, or build custom pipelines from scratch for specialized use cases. +Choose from prebuilt pipelines tailored to standard clinical workflows, or build custom pipelines for your own applications. Both approaches ensure production-ready interoperability and make it easy to adapt pipelines for any healthcare use case. 
## Prebuilt 📦
 
-HealthChain comes with a set of prebuilt pipelines that are out-of-the-box implementations of common healthcare data processing tasks:
+HealthChain comes with a set of end-to-end pipeline implementations of common healthcare data processing tasks.
+
+These prebuilt pipelines handle FHIR conversion, validation, and EHR integration for you. They work out-of-the-box with [**Adapters**](./adapters/adapters.md) and [**Gateways**](../gateway/gateway.md), supporting CDS Hooks, NoteReader CDI, and FHIR APIs. They're great for standing up a working service quickly and building more complex integrations on top.
+
 
 | Pipeline | Container | Use Case | Description | Example Application |
 |----------|-----------|----------|-------------|---------------------|
 
-Prebuilt pipelines are production-ready workflows that automatically handle FHIR conversion, validation, and formatting. They integrate seamlessly with EHR systems through [adapters](./adapters/adapters.md) and [gateways](../gateway/gateway.md), supporting standards like CDS Hooks and FHIR REST APIs.
-
-Load your models from Hugging Face, local files, or pipeline objects:
+Once loaded, a prebuilt pipeline receives request data from an EHR integration point and returns response data that is ready to send back:
 
 ```python
 from healthchain.pipeline import MedicalCodingPipeline
 from healthchain.models import CdaRequest
 
-# Load from Hugging Face
-pipeline = MedicalCodingPipeline.from_model_id(
-    'blaze999/Medical-NER', task="token-classification", source="huggingface"
-)
-# Load from local model files
-pipeline = MedicalCodingPipeline.from_local_model(
-    '/path/to/model', source="spacy"
-)
 # Load from a pipeline object
 pipeline = MedicalCodingPipeline.load(pipeline_object)
 
 # Simple end-to-end processing
 cda_request = CdaRequest(document="")
 cda_response = pipeline.process_request(cda_request)
-
-# Or manual adapter control for more granular control
-from healthchain.io import CdaAdapter
-adapter = CdaAdapter()
-doc = adapter.parse(cda_request)
-doc = pipeline(doc)
-# Access: doc.fhir.problem_list, doc.fhir.medication_list
-response = adapter.format(doc)
 ```
 
 ### Customizing Prebuilt Pipelines
 
-To customize a prebuilt pipeline, you can use the [pipeline management methods](#pipeline-management) to add, remove, and replace components. For example, you may want to change the model being used. [TODO]
+To customize a prebuilt pipeline, you can use the [pipeline management](#pipeline-management) methods to add, remove, and replace components.
 
 If you need more control and don't mind writing more code, you can subclass `BasePipeline` and implement your own pipeline logic.
 
 [(BasePipeline API Reference)](../../api/pipeline.md#healthchain.pipeline.base.BasePipeline)
 
-## Integrations
+## NLP Integrations
+
+HealthChain integrates directly with popular NLP libraries like spaCy, HuggingFace Transformers, and LangChain. Easily add advanced NLP models and components into your pipelines to power state-of-the-art healthcare AI workflows.
+
+[(Full Documentation on NLP Integrations)](./integrations/integrations.md)
 
-HealthChain offers powerful integrations with popular NLP libraries, enhancing its capabilities and allowing you to build more sophisticated pipelines. These integrations include components for spaCy, Hugging Face Transformers, and LangChain, enabling you to leverage state-of-the-art NLP models and techniques within your HealthChain workflows. 
+```python +from healthchain.pipeline import MedicalCodingPipeline -Integrations are covered in detail on the [Integrations](./integrations/integrations.md) homepage. +# Load from Hugging Face +pipeline = MedicalCodingPipeline.from_model_id( + 'blaze999/Medical-NER', task="token-classification", source="huggingface" +) +# Load from local model files +pipeline = MedicalCodingPipeline.from_local_model( + '/path/to/model', source="spacy" +) +``` ## Freestyle 🕺 -To build your own pipeline, you can start with an empty pipeline and add components to it. Initialize your pipeline with the appropriate container type, such as `Document` or `Tabular`. This is not essential, but it allows the pipeline to enforce type safety (If you don't specify the container type, it will be inferred from the first component added.) +[**Containers**](./data_container.md) are at the core of HealthChain pipelines: they define your data type and flow through each pipeline step, just like spaCy’s `Doc`. -You can see the full list of available containers at the [Container](./data_container.md) page. +Specify the container (e.g. `Document` or `Tabular`) when creating your pipeline (`Pipeline[Document]()`). Each node processes and returns the container, enabling smooth, type-safe, modular workflows and direct FHIR conversion. ```python from healthchain.pipeline import Pipeline from healthchain.io.containers import Document pipeline = Pipeline[Document]() - -# Or if you live dangerously -# pipeline = Pipeline() ``` To use a built pipeline, compile it by running `.build()`. This will return a compiled pipeline that you can run on your data. ```python +# Compile the pipeline to create a callable object pipe = pipeline.build() + +# Create a Document with your clinical text and run it through the pipeline doc = pipe(Document("Patient is diagnosed with diabetes")) -print(doc.entities) +# Print the extracted problem list items +print(doc.fhir.problem_list) ``` ### Adding Nodes @@ -181,26 +180,6 @@ pipeline.add_node(linker) [(BaseComponent API Reference)](../../api/component.md#healthchain.pipeline.components.base.BaseComponent) -### Working with Healthcare Data Formats 🔄 - -Adapters convert between healthcare formats (CDA, FHIR, CDS Hooks) and HealthChain's internal Document objects, enabling clean separation between ML processing and format handling. This allows your pipeline to work with any healthcare data source while maintaining FHIR-native outputs. - -```python -from healthchain.io import CdaAdapter, Document - -adapter = CdaAdapter() - -# Parse healthcare data into Document -doc = adapter.parse(cda_request) - -# Process with pure pipeline -processed_doc = pipeline(doc) - -# Convert back to healthcare format -response = adapter.format(processed_doc) -``` - -You can learn more about adapters at the [Adapters](./adapters/adapters.md) documentation page. ## Pipeline Management 🔨 @@ -299,3 +278,23 @@ print(pipeline.stages) # fhir_conversion: # - FHIRProblemListExtractor ``` +## Working with Healthcare Data Formats 🔄 + +Adapters let you easily convert between healthcare formats (CDA, FHIR, CDS Hooks) and HealthChain Documents. Keep your ML pipeline format-agnostic while always getting FHIR-ready outputs. 
+
+[(Full Documentation on Adapters)](./adapters/adapters.md)
+
+```python
+from healthchain.io import CdaAdapter, Document
+
+adapter = CdaAdapter()
+
+# Parse healthcare data into Document
+doc = adapter.parse(cda_request)
+
+# Process with pure pipeline
+processed_doc = pipeline(doc)
+
+# Convert back to healthcare format
+response = adapter.format(processed_doc)
+```
diff --git a/docs/reference/utilities/sandbox.md b/docs/reference/utilities/sandbox.md
index a64bcaf3..3c666bb8 100644
--- a/docs/reference/utilities/sandbox.md
+++ b/docs/reference/utilities/sandbox.md
@@ -85,28 +85,28 @@ The client validates workflow-protocol combinations at initialization:
 )
 ```
 
-## Dataset Loaders
+See [Data Generator](data_generator.md) for more details on the `generate_synthetic` field of `.load_free_text()`.
 
-HealthChain provides two pre-configured dataset loaders for testing with common FHIR testing datasets. Use `load_from_registry()` to access these datasets.
+## Dataset Registry
 
-### Overview
-
-| Dataset | Type | Use Case | File Format |
-|---------|------|----------|-------------|
-| **MIMIC-on-FHIR** | Real de-identified | Testing with realistic clinical patterns | `.ndjson.gz` per resource type |
-| **Synthea** | Synthetic | Quick demos, single patient testing | `.json` Bundle per patient |
+HealthChain provides two pre-configured loaders for common FHIR test datasets, for use with CDS Hooks workflows. Download the datasets and use `.load_from_registry()` to load from your local directory.
 
+### Overview
 
-**When to use:**
+| Dataset & Description | FHIR Version | Type | File Format | Source | Download Link |
+|------------------------------------------------------------------------|--------------|---------------------|-----------------------------|------------------------------------------------------------------------------------|---------------------------------------------------------|
+| **MIMIC-on-FHIR**: MIMIC-IV on FHIR Demo Dataset | R4 | Real de-identified | `.ndjson.gz` per resource type | [PhysioNet Project](https://physionet.org/content/mimic-iv-fhir-demo/2.1.0/) | [Download ZIP](https://physionet.org/content/mimic-iv-fhir-demo/get-zip/2.1.0/) |
+| **Synthea**: Synthea FHIR Patient Records (100 Sample) | R4 | Synthetic | `.json` Bundle per patient | [Synthea Downloads](https://synthea.mitre.org/downloads) | [Download ZIP](https://arc.net/l/quote/hoquexhy) |
 
-- **MIMIC**: Test with real-world data distributions and clinical patterns from a major hospital
-- **Synthea**: Quick demos without downloading large datasets; ideal for single-patient workflows
 
 ### MIMIC-on-FHIR Loader
 
-Real de-identified clinical data from Beth Israel Deaconess Medical Center in FHIR R4 format.
+Real-world, de-identified FHIR R4 data from Beth Israel Deaconess Medical Center. Suitable for testing with realistic data distributions and clinical patterns.
+
+!!! tip "Full Dataset"
+    The [MIMIC-on-FHIR demo dataset](https://physionet.org/content/mimic-iv-fhir-demo/2.1.0/) is open access and contains about 100 patients. Access to the [full dataset](https://physionet.org/content/mimic-iv-fhir/2.1/) requires PhysioNet credentialed access.
 
-**Directory Structure:**
+#### Directory Structure
 
 ```
 data_dir/
@@ -117,7 +117,7 @@ data_dir/
     └── ... (other resource types)
 ```
 
-**Usage:**
+#### Usage
 
 === "Basic"
     ```python
@@ -140,18 +140,14 @@ data_dir/
         random_seed=42  # Reproducible sampling
     )
     ```
 
-**Available Resource Types:**
-
-`MimicMedication`, `MimicCondition`, `MimicObservation`, `MimicProcedure`, `MimicEncounter`, `MimicPatient`, and more. 
Check your dataset's `/fhir` directory for available types. - -!!! note "Setup Requirements" - The full MIMIC-on-FHIR dataset requires credentialed PhysioNet access, but you can download the [demo dataset without credentials](https://physionet.org/content/mimic-iv-fhir-demo/2.1.0/) (100 patients). - ### Synthea Loader -Synthetic patient data generated by Synthea, containing realistic FHIR Bundles (typically 100-500 resources per patient). +Synthetic patient data generated by [Synthea](https://synthea.mitre.org), containing realistic FHIR Bundles (typically 100-500 resources per patient). Ideal for single-patient workflows that require diverse data scenarios. -**Directory Structure:** +!!! tip "Getting Synthea Data" + Generate synthetic patients using [Synthea](https://github.com/synthetichealth/synthea) or [download sample data](https://synthea.mitre.org/downloads) from their releases. Each patient Bundle is self-contained with all clinical history. + +#### Directory Structure ``` data_dir/ @@ -160,7 +156,7 @@ data_dir/ └── ... (one .json file per patient) ``` -**Usage:** +#### Usage === "First Patient (Quick Demo)" ```python @@ -188,34 +184,118 @@ data_dir/ client.load_from_registry( "synthea-patient", data_dir="./synthea_sample_data_fhir_latest", + patient_id="a969c177-a995-7b89-7b6d-885214dfa253", resource_types=["Condition", "MedicationRequest", "Observation"], sample_size=5, # 5 resources per type random_seed=42, ) ``` +### Request Inspection and Debugging -!!! tip "Getting Synthea Data" - Generate synthetic patients using [Synthea](https://github.com/synthetichealth/synthea) or [download sample data](https://synthea.mitre.org/downloads) from their releases. Each patient Bundle is self-contained with all clinical history. +Before sending requests to your service, you can inspect and verify the queued data using several debugging methods. These are particularly useful for troubleshooting data loading issues or verifying request structure. 
+ +#### Preview Requests -### Managing Requests +Get a high-level summary of queued requests without retrieving full payloads: ```python -# Preview queued requests before sending +# Preview all queued requests +previews = client.preview_requests() + +# Preview first 3 requests only previews = client.preview_requests(limit=3) -for preview in previews: - print(f"Request {preview['index']}: {preview['type']}") +print(previews) + +# [{'index': 0, 'type': 'CdaRequest', 'protocol': 'SOAP', 'has_document': True}] +``` + +#### Get Request Data + +Access the full request data in different formats for detailed inspection: + +```python +# Access raw Pydantic models directly +for request in client.requests: + print(f"Prefetch keys: {request.prefetch.keys()}") + print(request.model_dump()) -# Get full request data for inspection +# Get as list of dictionaries (for serialization) requests_dict = client.get_request_data(format="dict") +print(requests_dict[0].keys()) # See available fields + +# Get as JSON string (for saving or logging) requests_json = client.get_request_data(format="json") -requests_raw = client.get_request_data(format="raw") +with open("debug_requests.json", "w") as f: + f.write(requests_json) +``` + +#### Check Client Status + +Get the current state of your sandbox client: + +```python +status = client.get_status() +print(status) + +# { +# "sandbox_id": "550e8400-e29b-41d4-a716-446655440000", +# "url": "http://localhost:8000/cds/cds-services/my-service", +# "protocol": "rest", +# "workflow": "encounter-discharge", +# "requests_queued": 5, +# "responses_received": 0 +# } +``` + +#### Clear and Reload + +Reset the request queue to start fresh without creating a new client: -# Clear queued requests to start fresh +```python +# Clear all queued requests client.clear_requests() -client.load_from_path("./different_data.xml") + +# Load new data +client.load_from_path("./different_data.json") + +# Verify new queue +status = client.get_status() +print(f"New queue size: {status['requests_queued']}") ``` +??? 
example "Example Debugging Workflow" + ```python + from healthchain.sandbox import SandboxClient + + client = SandboxClient( + url="http://localhost:8000/cds/cds-services/discharge-summary", + workflow="encounter-discharge" + ) + + # Load data + client.load_free_text("data/notes.csv", column_name="text") + + # Debug before sending + print("=== Client Status ===") + print(client.get_status()) + + print("\n=== Request Previews ===") + for preview in client.preview_requests(limit=2): + print(f"Request {preview['index']}: {preview['type']}") + + print("\n=== Inspecting First Request ===") + first_request = client.requests[0] + print(f"Hook: {first_request.hook}") + print(f"Context: {first_request.context}") + print(f"Prefetch keys: {first_request.prefetch.keys()}") + print(f"Example DocumentReference: {first_request.prefetch['document'].model_dump()}") + + # If everything looks good, send + responses = client.send_requests() + ``` + + ### Sending Requests ```python @@ -238,42 +318,31 @@ print(status) # } ``` -### Using Context Manager - -For automatic result saving on successful completion: - -```python -with SandboxClient( - url="http://localhost:8000/cds/cds-services/my-service", - workflow="encounter-discharge" -) as client: - client.load_free_text( - csv_path="./data/notes.csv", - column_name="text" - ) - responses = client.send_requests() - # Results automatically saved to ./output/ on successful exit -``` - ## Complete Examples === "CDS Hooks Test" ```python from healthchain.sandbox import SandboxClient - # Initialize for CDS Hooks + # Initialize client for CDS Hooks workflow client = SandboxClient( url="http://localhost:8000/cds/cds-services/sepsis-alert", workflow="patient-view" ) - # Load and send + # Load MIMIC-on-FHIR data client.load_from_registry( "mimic-on-fhir", data_dir="./data/mimic-iv-fhir", - resource_types=["MimicConditionED", "MimicObservation"], - sample_size=10, + resource_types=["MimicConditionED"], + sample_size=5 + ) + + # Optional: Inspect before sending + # client.preview_requests() + # client.get_status() + # Send requests and save results responses = client.send_requests() client.save_results("./output/") ``` @@ -282,15 +351,20 @@ with SandboxClient( ```python from healthchain.sandbox import SandboxClient - # Initialize for SOAP/CDA + # Initialize client for SOAP/CDA workflow client = SandboxClient( url="http://localhost:8000/notereader/ProcessDocument/", workflow="sign-note-inpatient", protocol="soap" ) - # Load CDA files from directory + # Load CDA documents from directory client.load_from_path("./data/cda_files/", pattern="*.xml") + + # Optional: Inspect before sending + # client.preview_requests() + + # Send requests and save results responses = client.send_requests() client.save_results("./output/") ``` @@ -299,24 +373,71 @@ with SandboxClient( ```python from healthchain.sandbox import SandboxClient - # Initialize client + # Initialize client for CDS workflow client = SandboxClient( url="http://localhost:8000/cds/cds-services/my-service", - workflow="patient-view" + workflow="encounter-discharge" ) - # Load text data + # Load and generate FHIR from clinical notes client.load_free_text( - csv_path="./data/clinical_notes.csv", - column_name="note_text", - generate_synthetic=True + csv_path="./data/discharge_notes.csv", + column_name="text", + generate_synthetic=True # Adds synthetic data ) - # Send and save + # Optional: Inspect generated data + # requests = client.get_request_data(format="dict") + # print(requests[0]['prefetch'].keys()) + + # Send requests 
responses = client.send_requests() - client.save_results("./output/") ``` +## Advanced Usage + +`SandboxClient` supports method chaining and context manager patterns for more concise code. + +### Method Chaining + +All data loading methods return `self`, enabling fluent method chaining: + +```python +from healthchain.sandbox import SandboxClient + +# Chain initialization, loading, and sending +responses = ( + SandboxClient( + url="http://localhost:8000/cds/cds-services/my-service", + workflow="encounter-discharge" + ) + .load_from_registry( + "synthea-patient", + data_dir="./data/synthea", + sample_size=5 + ) + .send_requests() +) +``` + +### Context Manager + +Use the context manager for automatic result saving on successful completion: + +```python +# Auto-save results to ./output/ on successful exit +with SandboxClient( + url="http://localhost:8000/cds/cds-services/my-service", + workflow="encounter-discharge" +) as client: + client.load_free_text( + csv_path="./data/notes.csv", + column_name="text" + ) + responses = client.send_requests() + # Results automatically saved on successful exit +``` + ## Migration Guide !!! warning "Decorator Pattern Deprecated"