Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
36aecd3
Fix dy operation and domain wildcard replacement
gerrycampion Mar 6, 2026
fe8aa5d
Convert some OperationErrors to skippable DomainNotFound and KeyErrors
gerrycampion Mar 6, 2026
2f57346
skip the keyerror
gerrycampion Mar 6, 2026
2aa7655
fix for empty datasets. info instead of warning for bad file formats
gerrycampion Mar 12, 2026
f787d11
Merge branch 'main' into 1578-resolve-existing-rule-execution-and-rul…
gerrycampion Mar 12, 2026
4b76b1f
fix dask empty check
gerrycampion Mar 12, 2026
48b58df
update is_custom_domain to consider model domains. fixed some domain/…
gerrycampion Mar 13, 2026
33279fe
move sdtm-related utils to sdtm_utilities
gerrycampion Mar 13, 2026
7cc39ed
Generalize _replace_variable_wildcards
gerrycampion Mar 13, 2026
a87661d
Fixed distinct referenced datasets. Fix relrec merge on columns with …
gerrycampion Mar 13, 2026
0c2cdea
Fix more wildcard replacements
gerrycampion Mar 13, 2026
a6f77d3
Fix case where relrec merge produces no records
gerrycampion Mar 13, 2026
9ec1ec9
another wildcard replacement fix
gerrycampion Mar 13, 2026
c2e0c09
fix define xml variable metadata merge
gerrycampion Mar 13, 2026
ddb8238
minor tweak to domain handling in rule processor
gerrycampion Mar 16, 2026
14a6982
Merge branch 'main' into 1578-resolve-existing-rule-execution-and-rul…
gerrycampion Mar 16, 2026
992d4f6
fix distinct get_dataset and rename dataset_metadata.dataset_name
gerrycampion Mar 17, 2026
553f987
better define-xml itemgroupdef match handling
gerrycampion Mar 18, 2026
b004725
define metadata documentation
gerrycampion Mar 18, 2026
c3bf504
Added test for get_define_xml_variables_metadata
gerrycampion Mar 18, 2026
2c591a8
Merge branch 'main' into 1578-resolve-existing-rule-execution-and-rul…
SFJohnson24 Mar 20, 2026
ac21bf8
Merge branch 'main' into 1578-resolve-existing-rule-execution-and-rul…
SFJohnson24 Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cdisc_rules_engine/check_operators/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ def flatten_list(data, items):


vectorized_apply_regex = np.vectorize(apply_regex)
vectorized_is_complete_date = np.vectorize(is_complete_date)
vectorized_is_complete_date = np.vectorize(is_complete_date, otypes=[bool])
vectorized_compare_dates = np.vectorize(compare_dates)
vectorized_is_valid = np.vectorize(is_valid_date)
vectorized_is_valid_duration = np.vectorize(is_valid_duration)
Expand Down
29 changes: 13 additions & 16 deletions cdisc_rules_engine/dataset_builders/base_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import (
DefineXMLReaderFactory,
)
from cdisc_rules_engine.utilities.utils import (
get_corresponding_datasets,
from cdisc_rules_engine.utilities.sdtm_utilities import get_corresponding_datasets
from cdisc_rules_engine.utilities.sdtm_utilities import (
tag_source,
)
from typing import List, Iterable, Optional
Expand Down Expand Up @@ -155,22 +155,21 @@ def get_define_xml_item_group_metadata_for_domain(self, domain: str) -> List[dic

def get_define_xml_variables_metadata(self) -> List[dict]:
"""
Gets Define XML variables metadata.
Gets Define XML variables metadata. Name and Domain combos for these types of datasets:
| Name | Domain |
| ------ | ------ |
| AE | AE |
| QSPH | QS |
| RELREC | |
| SUPPDM | DM |
"""
define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader(
self.dataset_path, self.define_xml_path, self.data_service, self.cache
)
# If domain is not set and this is a SUPP domain, use rdomain
domain = self.dataset_metadata.domain
if not domain and getattr(self.dataset_metadata, "is_supp", False):
domain = getattr(self.dataset_metadata, "rdomain", None)
name = getattr(self.dataset_metadata, "name", None)
return define_xml_reader.extract_variables_metadata(
domain_name=domain, name=name
)
if not domain:
return []
return define_xml_reader.extract_variables_metadata(domain_name=domain)
domain = self.dataset_metadata.domain or self.dataset_metadata.rdomain
return define_xml_reader.extract_variables_metadata(
domain_name=domain, name=self.dataset_metadata.name
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be misunderstanding this, but previously this looked like a lookup using only the domain, and now we also require the dataset name. Could that cause split-dataset lookups to fail if the Define-XML is keyed by the base domain?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was to fix an issue with CORE-001081 (see before-report) where define metadata was not being extracted for RELREC. Here is what I would expect to see for Name/Domain when doing an itemgroupdef lookup.

| Name   | Domain |
| ------ | ------ |
| AE     | AE     |
| QSPH   | QS     |
| RELREC |        |
| SUPPDM | DM     |

I have now added further fixes so that RELREC only needs a name and not a domain. This corresponds with what is in the Define-XML specification:

Image

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the clarification. Do you think we should add a test for these Define-XML lookup cases, so that the expected matching behavior is clearly covered?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

)

def get_define_xml_value_level_metadata(self) -> List[dict]:
"""
Expand Down Expand Up @@ -204,10 +203,8 @@ def get_library_variables_metadata(self) -> DatasetInterface:
else:
domain = self.dataset_metadata.domain
variables: List[dict] = sdtm_utilities.get_variables_metadata_from_standard(
domain=self.dataset_metadata.unsplit_name,
library_metadata=self.library_metadata,
data_service=self.data_service,
dataset=self.get_dataset_contents(),
datasets=self.datasets,
dataset_metadata=self.dataset_metadata,
dataset_path=self.dataset_path,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
from cdisc_rules_engine.utilities.utils import (
from cdisc_rules_engine.utilities.sdtm_utilities import (
get_corresponding_datasets,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from jsonschema import validators, exceptions
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
from cdisc_rules_engine.models.dataset import DatasetInterface
from cdisc_rules_engine.utilities.utils import tag_source
from cdisc_rules_engine.utilities.sdtm_utilities import tag_source


class JsonSchemaCheckDatasetBuilder(BaseDatasetBuilder):
Expand Down
3 changes: 2 additions & 1 deletion cdisc_rules_engine/enums/execution_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ class ExecutionStatus(BaseEnum):
class SkippedReason(BaseEnum):
COLUMN_NOT_FOUND_IN_DATA = "Column not found in data"
DOMAIN_NOT_FOUND = "Domain not found"
SCHEMA_VALIDATION_IS_OFF = "Schema validation is off"
EMPTY_DATASET = "Empty dataset"
OUTSIDE_SCOPE = "Outside scope"
SCHEMA_VALIDATION_IS_OFF = "Schema validation is off"


class ExecutionError(BaseEnum):
Expand Down
4 changes: 2 additions & 2 deletions cdisc_rules_engine/models/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def generate_dataset_error_objects(self, message: str, results: pd.Series):
# get targets in the order they appear in rule.output_variables
target_names: List[str] = RuleProcessor.extract_target_names_from_rule(
self.rule,
self.dataset_metadata.domain_cleaned,
self.dataset_metadata.wildcard_replacement,
self.variable.dataset.columns.tolist(),
)
target_names = self._get_target_names_from_list_values(
Expand Down Expand Up @@ -242,7 +242,7 @@ def generate_targeted_error_object( # noqa: C901
),
targets=targets_list,
errors=errors_list,
message=message.replace("--", self.dataset_metadata.domain_cleaned or ""),
message=message.replace("--", self.dataset_metadata.wildcard_replacement),
)

def _generate_errors_by_target_presence(
Expand Down
4 changes: 4 additions & 0 deletions cdisc_rules_engine/models/dataset/dask_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ def __len__(self):

return self.length

@property
def empty(self):
return len(self) == 0

def __deepcopy__(self, memo):
pandas_df = self._data.compute()
fresh_dask_df = dd.from_pandas(pandas_df, npartitions=DEFAULT_NUM_PARTITIONS)
Expand Down
5 changes: 5 additions & 0 deletions cdisc_rules_engine/models/dataset_metadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dataclasses import dataclass
from typing import Union
from os.path import basename


@dataclass
Expand All @@ -17,3 +18,7 @@ class DatasetMetadata:
full_path: Union[str, None] = None
first_record: Union[dict, None] = None
original_path: Union[str, None] = None

@property
def data_service_identifier(self) -> str:
return basename(self.full_path) if self.full_path else self.filename
44 changes: 22 additions & 22 deletions cdisc_rules_engine/models/sdtm_dataset_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,35 @@ class SDTMDatasetMetadata(DatasetMetadata):

"""
Examples
| name | unsplit_name | is_supp | domain | rdomain | is_ap | ap_suffix | domain_is_custom | related_domain | related_domain_is_custom |
| -------- | ------------ | ------- | ------ | ------- | ----- | --------- | ----------------- | -------------- | ------------------------ |
| QS | QS | False | QS | None | False | | False | | |
| QSX | QS | False | QS | None | False | | False | | |
| QSXX | QS | False | QS | None | False | | False | | |
| SUPPQS | SUPPQS | True | None | QS | False | | False | QS | |
| SUPPQSX | SUPPQS | True | None | QS | False | | False | QS | |
| SUPPQSXX | SUPPQS | True | None | QS | False | | False | QS | |
| APQS | APQS | False | APQS | None | True | QS | False | QS | |
| APQSX | APQS | False | APQS | None | True | QS | False | QS | |
| APQSXX | APQS | False | APQS | None | True | QS | False | QS | |
| SQAPQS | SQAPQS | True | None | APQS | True | | False | QS | |
| SQAPQSX | SQAPQS | True | None | APQS | True | | False | QS | |
| SQAPQSXX | SQAPQS | True | None | APQS | True | | False | | |
| RELREC | RELREC | False | None | None | False | | False | | |
| XX | XX | False | XX | None | False | | True | | |
| SUPPXX | SUPPXX | True | None | XX | False | | False | XX | True |
| APXX | APXX | False | APXX | None | True | XX | False | XX | True |
| SQAPXX | SQAPXX | True | None | APXX | True | | False | XX | True |
| FA | FA | False | FA | None | False | | False | | |
| name | unsplit_name | is_supp | domain | wildcard_replacement | rdomain | is_ap | ap_suffix | domain_is_custom | related_domain | related_domain_is_custom |
| -------- | ------------ | ------- | ------ | -------------------- | ------- | ----- | --------- | ---------------- | -------------- | ------------------------ |
| QS | QS | False | QS | QS | None | False | | False | | |
| QSX | QS | False | QS | QS | None | False | | False | | |
| QSXX | QS | False | QS | QS | None | False | | False | | |
| SUPPQS | SUPPQS | True | None | | QS | False | | False | QS | |
| SUPPQSX | SUPPQS | True | None | | QS | False | | False | QS | |
| SUPPQSXX | SUPPQS | True | None | | QS | False | | False | QS | |
| APQS | APQS | False | APQS | QS | None | True | QS | False | QS | |
| APQSX | APQS | False | APQS | QS | None | True | QS | False | QS | |
| APQSXX | APQS | False | APQS | QS | None | True | QS | False | QS | |
| SQAPQS | SQAPQS | True | None | | APQS | True | | False | QS | |
| SQAPQSX | SQAPQS | True | None | | APQS | True | | False | QS | |
| SQAPQSXX | SQAPQS | True | None | | APQS | True | | False | | |
| RELREC | RELREC | False | None | | None | False | | False | | |
| XX | XX | False | XX | XX | None | False | | True | | |
| SUPPXX | SUPPXX | True | None | | XX | False | | False | XX | True |
| APXX | APXX | False | APXX | XX | None | True | XX | False | XX | True |
| SQAPXX | SQAPXX | True | None | | APXX | True | | False | XX | True |
| FA | FA | False | FA | FA | None | False | | False | | |
""" # noqa: E501 W291

@property
def domain(self) -> Union[str, None]:
return (self.first_record or {}).get("DOMAIN", None)

@property
def domain_cleaned(self) -> Union[str, None]:
return self.domain.replace("AP", "") if self.domain else None
def wildcard_replacement(self) -> Union[str, None]:
return self.ap_suffix or self.domain or ""

@property
def rdomain(self) -> Union[str, None]:
Expand Down
24 changes: 11 additions & 13 deletions cdisc_rules_engine/operations/base_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,10 @@ def _handle_grouped_result(self, result):
result = self._rename_grouping_columns(result)
grouping_columns = self._get_grouping_columns()
target_columns = grouping_columns + [self.params.operation_id]
target_columns = self._resolve_variable_name(target_columns, self.params.domain)
grouping_columns = self._resolve_variable_name(
target_columns = self._replace_variable_wildcard(
target_columns, self.params.domain
)
grouping_columns = self._replace_variable_wildcard(
grouping_columns, self.params.domain
)
result = result.reset_index()
Expand Down Expand Up @@ -225,13 +227,9 @@ def _expand_operation_results_in_grouping(self, grouping_list):
def _get_variables_metadata_from_standard(self) -> List[dict]:
# TODO: Update to handle other standard types: adam, cdash, etc.

# self.params.domain is unsplit_name
domain_for_library = self.params.domain
return sdtm_utilities.get_variables_metadata_from_standard(
domain=domain_for_library,
library_metadata=self.library_metadata,
data_service=self.data_service,
dataset=self.evaluation_dataset,
dataset_metadata=self.data_service.get_raw_dataset_metadata(
dataset_name=self.params.dataset_path, datasets=self.params.datasets
),
Expand All @@ -250,17 +248,15 @@ def get_allowed_variable_permissibility(self, variable_metadata: dict):
def _get_variable_names_list(self, domain, dataframe):
# get variables metadata from the standard model
variables_metadata: List[dict] = (
self._get_variables_metadata_from_standard_model(domain, dataframe)
self._get_variables_metadata_from_standard_model(dataframe)
)
# create a list of variable names in accordance to the "ordinal" key
variable_names_list = self._replace_variable_wildcards(
variables_metadata, domain
)
return list(OrderedDict.fromkeys(variable_names_list))

def _get_variables_metadata_from_standard_model(
self, domain, dataframe
) -> List[dict]:
def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]:
"""
Gets variables metadata for the given class and domain from cache.
The cache stores CDISC Library metadata.
Expand All @@ -287,7 +283,6 @@ def _get_variables_metadata_from_standard_model(
# TODO: Update to handle multiple standard types.

return sdtm_utilities.get_variables_metadata_from_standard_model(
domain=domain,
dataframe=dataframe,
datasets=self.params.datasets,
dataset_path=self.params.dataset_path,
Expand All @@ -300,10 +295,13 @@ def _get_variables_metadata_from_standard_model(

@staticmethod
def _replace_variable_wildcards(variables_metadata, domain):
return [var["name"].replace("--", domain) for var in variables_metadata]
return [
BaseOperation._replace_variable_wildcard(var["name"], domain)
for var in variables_metadata
]

@staticmethod
def _resolve_variable_name(variable_name, domain: str):
def _replace_variable_wildcard(variable_name, domain: str):
if isinstance(variable_name, list):
return [
var.replace("--", domain) if "--" in var else var
Expand Down
12 changes: 5 additions & 7 deletions cdisc_rules_engine/operations/day_data_validator.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
from cdisc_rules_engine.exceptions.custom_exceptions import DomainNotFoundError
from cdisc_rules_engine.operations.base_operation import BaseOperation
from datetime import datetime
import numpy as np
from cdisc_rules_engine.services import logger
from cdisc_rules_engine.utilities.utils import tag_source
from cdisc_rules_engine.utilities.sdtm_utilities import tag_source


class DayDataValidator(BaseOperation):
def _execute_operation(self):
logger.info(
f"trying to find '{self.params.target}' in the {self.evaluation_dataset['DOMAIN'].iloc[0]}."
)
dtc_value = self.evaluation_dataset[self.params.target].map(
self.parse_timestamp
)
Expand All @@ -18,8 +15,9 @@ def _execute_operation(self):
dataset for dataset in self.params.datasets if dataset.domain == "DM"
]
if not dm_datasets:
# Return none for all values if dm is not provided.
return [0] * len(self.evaluation_dataset[self.params.target])
raise DomainNotFoundError(
"Operation dy requires DM domain but Domain not found in datasets"
)
if len(dm_datasets) > 1:
dm_data = self.data_service.concat_split_datasets(
self.data_service.get_dataset, dm_datasets
Expand Down
6 changes: 3 additions & 3 deletions cdisc_rules_engine/operations/distinct.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,9 @@ def get_existing_column_names(group):

def _get_referenced_datasets(self):
referenced_datasets = {}
for dataset_meta in self.data_service.data:
dataset = self.data_service.get_dataset(dataset_meta.filename)
referenced_datasets[dataset_meta.name] = dataset
for dataset_metadata in self.data_service.get_datasets():
dataset = self.data_service.get_dataset(dataset_metadata.filename)
referenced_datasets[dataset_metadata.name] = dataset
return referenced_datasets

def _unique_values_for_column(self, column):
Expand Down
4 changes: 2 additions & 2 deletions cdisc_rules_engine/operations/domain_is_custom.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from cdisc_rules_engine.operations.base_operation import BaseOperation
from cdisc_rules_engine.utilities.sdtm_utilities import is_custom_domain


class DomainIsCustom(BaseOperation):
Expand All @@ -8,5 +9,4 @@ def _execute_operation(self):
given domain is in standard domains.
If no -> the domain is custom.
"""
standard_data: dict = self.library_metadata.standard_metadata
return self.params.domain not in standard_data.get("domains", {})
return is_custom_domain(self.library_metadata, self.params.domain)
4 changes: 3 additions & 1 deletion cdisc_rules_engine/operations/expected_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ def _execute_operation(self):

return list(
{
var["name"].replace("--", self.params.domain): None
BaseOperation._replace_variable_wildcard(
var["name"], self.params.domain
): None
for var in variables_metadata
if self.get_allowed_variable_permissibility(var) == EXPECTED
}.keys()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _get_model_filtered_variables(self):
key = self.params.key_name
val = self.params.key_value
model_variables: List[dict] = self._get_variables_metadata_from_standard_model(
self.params.domain, self.params.dataframe
self.params.dataframe
)
filtered_model = [var for var in model_variables if var.get(key) == val]
variable_names_list = self._replace_variable_wildcards(
Expand Down
9 changes: 6 additions & 3 deletions cdisc_rules_engine/operations/library_column_order.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ def _execute_operation(self):
variables_metadata: List[dict] = self._get_variables_metadata_from_standard()

# create a list of variable names in accordance to the "ordinal" key
variable_names_list = [
var["name"].replace("--", self.params.domain) for var in variables_metadata
]
variable_names_list = BaseOperation._replace_variable_wildcards(
variables_metadata,
self.data_service.get_raw_dataset_metadata(
dataset_name=self.params.dataset_path, datasets=self.params.datasets
).wildcard_replacement,
)
return list(OrderedDict.fromkeys(variable_names_list))
Loading
Loading