diff --git a/cdisc_rules_engine/constants/permissibility.py b/cdisc_rules_engine/constants/permissibility.py index 5ca672c5d..628bd1f01 100644 --- a/cdisc_rules_engine/constants/permissibility.py +++ b/cdisc_rules_engine/constants/permissibility.py @@ -2,3 +2,4 @@ EXPECTED = "Exp" PERMISSIBLE = "Perm" PERMISSIBILITY_KEY = "core" +PERMISSIBILITY_DEFAULT = PERMISSIBLE diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index 22f1fd57e..bd1538d09 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -1,8 +1,4 @@ from cdisc_rules_engine.models.operation_params import OperationParams -from cdisc_rules_engine.constants.permissibility import ( - PERMISSIBLE, - PERMISSIBILITY_KEY, -) from abc import abstractmethod from typing import List import pandas as pd @@ -237,14 +233,6 @@ def _get_variables_metadata_from_standard(self) -> List[dict]: dataset_path=self.params.dataset_path, ) - def get_allowed_variable_permissibility(self, variable_metadata: dict): - """ - Returns the permissibility value of a variable allowed in the current domain - """ - if PERMISSIBILITY_KEY in variable_metadata: - return variable_metadata[PERMISSIBILITY_KEY] - return PERMISSIBLE - def _get_variable_names_list(self, domain, dataframe): # get variables metadata from the standard model variables_metadata: List[dict] = ( diff --git a/cdisc_rules_engine/operations/expected_variables.py b/cdisc_rules_engine/operations/expected_variables.py index 4a6dba786..897bf4b97 100644 --- a/cdisc_rules_engine/operations/expected_variables.py +++ b/cdisc_rules_engine/operations/expected_variables.py @@ -1,9 +1,11 @@ -from cdisc_rules_engine.operations.base_operation import BaseOperation -from cdisc_rules_engine.constants.permissibility import EXPECTED -from typing import List +from cdisc_rules_engine.constants.permissibility import ( + EXPECTED, + PERMISSIBILITY_KEY, +) +from 
cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder -class ExpectedVariables(BaseOperation): +class ExpectedVariables(LibraryColumnOrder): def _execute_operation(self): """ Fetches required variables for a given domain from the CDISC library. @@ -17,16 +19,6 @@ def _execute_operation(self): The lists with column names are sorted in accordance to "ordinal" key of library metadata. """ - - # get variables metadata from the standard/model - variables_metadata: List[dict] = self._get_variables_metadata_from_standard() - - return list( - { - BaseOperation._replace_variable_wildcard( - var["name"], self.params.domain - ): None - for var in variables_metadata - if self.get_allowed_variable_permissibility(var) == EXPECTED - }.keys() - ) + self.params.key_name = PERMISSIBILITY_KEY + self.params.key_value = EXPECTED + return super()._execute_operation() diff --git a/cdisc_rules_engine/operations/get_dataset_filtered_variables.py b/cdisc_rules_engine/operations/get_dataset_filtered_variables.py index b54b211e9..88c8109e5 100644 --- a/cdisc_rules_engine/operations/get_dataset_filtered_variables.py +++ b/cdisc_rules_engine/operations/get_dataset_filtered_variables.py @@ -1,8 +1,7 @@ -from typing import List -from cdisc_rules_engine.operations.base_operation import BaseOperation +from cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder -class GetDatasetFilteredVariables(BaseOperation): +class GetDatasetFilteredVariables(LibraryColumnOrder): def _execute_operation(self): """ Filter variables from the dataset based on specified criteria. @@ -11,21 +10,7 @@ def _execute_operation(self): - key_name: The metadata key to filter by (e.g., "role", "type", etc.) - key_value: The value to match for the filter key (e.g., "Timing", "Identifier", etc.) 
""" - filter_key = self.params.key_name - filter_value = self.params.key_value - - # Get variables metadata from the standard model for the current domain - variables_metadata: List[dict] = self._get_variables_metadata_from_standard() - - # Filter variables based on the specified criteria - filtered_variables = [ - var for var in variables_metadata if var.get(filter_key) == filter_value - ] - - # Replace variable wildcards with actual domain names - variable_names_list = self._replace_variable_wildcards( - filtered_variables, self.params.domain - ) + variable_names_list = super()._execute_operation() # Get actual column names from the dataset that match our filtered list dataset_columns = self.params.dataframe.columns.tolist() diff --git a/cdisc_rules_engine/operations/library_column_order.py b/cdisc_rules_engine/operations/library_column_order.py index 7360de800..cd49138ca 100644 --- a/cdisc_rules_engine/operations/library_column_order.py +++ b/cdisc_rules_engine/operations/library_column_order.py @@ -1,6 +1,5 @@ from cdisc_rules_engine.operations.base_operation import BaseOperation from typing import List -from collections import OrderedDict class LibraryColumnOrder(BaseOperation): @@ -16,16 +15,28 @@ def _execute_operation(self): Length of Series is equal to the length of given dataframe. The lists with column names are sorted in accordance to "ordinal" key of library metadata. - """ - # get variables metadata , for custom domains from model; for non-custom from IG and model + If key_name and key_value are provided, filter variables based on specified criteria. + + Optional parameters: + - key_name: The metadata key to filter by (e.g., "role", "type", etc.) + - key_value: The value to match for the filter key (e.g., "Timing", "Identifier", etc.) 
+ """ + # Get variables metadata from the standard model for the current domain variables_metadata: List[dict] = self._get_variables_metadata_from_standard() - # create a list of variable names in accordance to the "ordinal" key - variable_names_list = BaseOperation._replace_variable_wildcards( - variables_metadata, - self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.dataset_path, datasets=self.params.datasets - ).wildcard_replacement, + # Filter variables based on the specified criteria + + if self.params.key_name: + variables_metadata = [ + var + for var in variables_metadata + if var.get(self.params.key_name) == self.params.key_value + ] + + # Replace variable wildcards with actual domain names + variable_names_list = self._replace_variable_wildcards( + variables_metadata, self.params.domain ) - return list(OrderedDict.fromkeys(variable_names_list)) + + return variable_names_list diff --git a/cdisc_rules_engine/operations/permissible_variables.py b/cdisc_rules_engine/operations/permissible_variables.py index 96cffd36c..3477eba51 100644 --- a/cdisc_rules_engine/operations/permissible_variables.py +++ b/cdisc_rules_engine/operations/permissible_variables.py @@ -1,9 +1,8 @@ -from cdisc_rules_engine.operations.base_operation import BaseOperation -from cdisc_rules_engine.constants.permissibility import PERMISSIBLE -from typing import List +from cdisc_rules_engine.constants.permissibility import PERMISSIBILITY_KEY, PERMISSIBLE +from cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder -class PermissibleVariables(BaseOperation): +class PermissibleVariables(LibraryColumnOrder): def _execute_operation(self): """ Fetches required variables for a given domain from the CDISC library. @@ -17,16 +16,6 @@ def _execute_operation(self): The lists with column names are sorted in accordance to "ordinal" key of library metadata. 
""" - - # get variables metadata from the standard model - variables_metadata: List[dict] = self._get_variables_metadata_from_standard() - - return list( - { - BaseOperation._replace_variable_wildcard( - var["name"], self.params.domain - ): None - for var in variables_metadata - if self.get_allowed_variable_permissibility(var) == PERMISSIBLE - }.keys() - ) + self.params.key_name = PERMISSIBILITY_KEY + self.params.key_value = PERMISSIBLE + return super()._execute_operation() diff --git a/cdisc_rules_engine/operations/required_variables.py b/cdisc_rules_engine/operations/required_variables.py index 8cc41d3be..efe72f370 100644 --- a/cdisc_rules_engine/operations/required_variables.py +++ b/cdisc_rules_engine/operations/required_variables.py @@ -1,9 +1,8 @@ -from typing import List -from cdisc_rules_engine.operations.base_operation import BaseOperation -from cdisc_rules_engine.constants.permissibility import REQUIRED +from cdisc_rules_engine.constants.permissibility import PERMISSIBILITY_KEY, REQUIRED +from cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder -class RequiredVariables(BaseOperation): +class RequiredVariables(LibraryColumnOrder): def _execute_operation(self): """ Fetches required variables for a given domain from the CDISC library. @@ -17,15 +16,6 @@ def _execute_operation(self): The lists with column names are sorted in accordance to "ordinal" key of library metadata. 
""" - - # get variables metadata from the standard model - variables_metadata: List[dict] = self._get_variables_metadata_from_standard() - return list( - { - BaseOperation._replace_variable_wildcard( - var["name"], self.params.domain - ): None - for var in variables_metadata - if self.get_allowed_variable_permissibility(var) == REQUIRED - }.keys() - ) + self.params.key_name = PERMISSIBILITY_KEY + self.params.key_value = REQUIRED + return super()._execute_operation() diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index c4e19972e..c00c0d6e2 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -25,6 +25,10 @@ SPECIAL_PURPOSE, SPECIAL_PURPOSE_MODEL, ) +from cdisc_rules_engine.constants.permissibility import ( + PERMISSIBILITY_DEFAULT, + PERMISSIBILITY_KEY, +) from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, @@ -224,6 +228,7 @@ def get_variables_metadata_from_standard( # noqa ) else: variables_metadata = ig_variables + set_default_variable_permissibility(variables_metadata) return variables_metadata @@ -403,6 +408,7 @@ def get_variables_metadata_from_standard_model( # noqa timing_metadata, ]: replace_variable_wildcards(var_list, original_domain, variables_metadata) + set_default_variable_permissibility(variables_metadata) return variables_metadata else: # First, try to get class metadata and check for classVariables @@ -424,6 +430,7 @@ def get_variables_metadata_from_standard_model( # noqa replace_variable_wildcards( class_variables, original_domain, variables_metadata ) + set_default_variable_permissibility(variables_metadata) return variables_metadata else: # Second, check if domain exists in model datasets @@ -446,6 +453,7 @@ def get_variables_metadata_from_standard_model( # noqa dataset_variables, original_domain, variables_metadata ) 
variables_metadata.sort(key=lambda item: int(item["ordinal"])) + set_default_variable_permissibility(variables_metadata) return variables_metadata # Third, fall back to standard datasets if IG_domain_details: @@ -465,6 +473,7 @@ def get_variables_metadata_from_standard_model( # noqa replace_variable_wildcards( dataset_variables, original_domain, variables_metadata ) + set_default_variable_permissibility(variables_metadata) return variables_metadata return None @@ -487,6 +496,12 @@ def replace_variable_wildcards(var_list, domain, target_list): target_list.append(var_copy) +def set_default_variable_permissibility(var_list): + for variable_metadata in var_list: + if PERMISSIBILITY_KEY not in variable_metadata: + variable_metadata[PERMISSIBILITY_KEY] = PERMISSIBILITY_DEFAULT + + def get_all_model_wildcard_variables(model_details: dict): return { classVariable["name"] diff --git a/resources/schema/rule-merged/Operations.json b/resources/schema/rule-merged/Operations.json index 562d87898..a93003e72 100644 --- a/resources/schema/rule-merged/Operations.json +++ b/resources/schema/rule-merged/Operations.json @@ -137,7 +137,7 @@ "properties": { "operator": { "const": "get_column_order_from_library", - "markdownDescription": "\nFetches column order for a given domain from the CDISC library. The lists with column names are sorted in accordance to \"ordinal\" key of library metadata.\n\nRule Type: Variable Metadata Check\n\n```yaml\nCheck:\n all:\n - name: variable_name\n operator: is_not_contained_by\n value: $ig_variables\nOperations:\n - id: $ig_variables\n operator: get_column_order_from_library\n```\n" + "markdownDescription": "\nFetches column order for a given domain from the CDISC library. 
The lists with column names are sorted in accordance to \"ordinal\" key of library metadata.\nOptionally filters variables based on specified metadata criteria.\n\nRule Type: Variable Metadata Check\n\n```yaml\nCheck:\n all:\n - name: variable_name\n operator: is_not_contained_by\n value: $ig_variables\nOperations:\n - id: $ig_variables\n operator: get_column_order_from_library\n key_name: \"core\" # role, core, etc\n key_value: \"Exp\" # Timing, Req, Exp, Perm, etc\n```\n" } }, "required": ["id", "operator"], @@ -187,7 +187,7 @@ "properties": { "operator": { "const": "get_dataset_filtered_variables", - "markdownDescription": "\nFilters variables from the dataset based on specified metadata criteria. Returns a list of variable names that exist in the dataset and match the filter criteria.\n\n```yaml\n- operation: get_dataset_filtered_variables\n id: $timing_variables\n key_name: \"role\"\n key_value: \"Timing\"\n```\n" + "markdownDescription": "\nFilters variables from the dataset based on specified metadata criteria. Returns a list of variable names that exist in the dataset and match the filter criteria.\n\n```yaml\n- operator: get_dataset_filtered_variables\n id: $timing_variables\n key_name: \"role\"\n key_value: \"Timing\"\n```\n" } }, "required": ["id", "operator", "key_name", "key_value"], @@ -197,7 +197,7 @@ "properties": { "operator": { "const": "label_referenced_variable_metadata", - "markdownDescription": "\nGenerates a dataframe where each record in the dataframe is the library ig variable metadata corresponding with the variable label found in the column provided in name. 
The metadata column names are prefixed with the string provided in `id`.\n\nInput\n\nTarget Dataset: SUPPLB\n\nProduct: sdtmig\n\nVersion: 3-4\n\nDataset:\n\n```\n{\n \"STUDYID\": [\"STUDY1\", \"STUDY1\", \"STUDY1\"],\n \"USUBJID\": [\"SUBJ1\", \"SUBJ1\", \"SUBJ1\"],\n \"QLABEL\": [\"Toxicity\", \"Viscosity\", \"Analysis Method\"]\n}\n```\n\nRule:\n\n```yaml\n- operator: label_referenced_variable_metadata\n id: $qlabel_referenced_variable_metadata\n name: \"QLABEL\"\n```\n\nOutput\n\n```\n{\n \"STUDYID\": [\"STUDY1\", \"STUDY1\", \"STUDY1\"],\n \"USUBJID\": [\"SUBJ1\", \"SUBJ1\", \"SUBJ1\"],\n \"QLABEL\": [\"Toxicity\", \"Viscosity\", \"Analysis Method\"],\n \"$qlabel_referenced_variable_metadata_name\": [\"LBTOX\", null, \"LBANMETH\"],\n \"$qlabel_referenced_variable_metadata_role\": [\n \"Variable Qualifier\",\n null,\n \"Record Qualifier\"\n ],\n \"$qlabel_referenced_variable_metadata_ordinal\": [44, null, 38],\n \"$qlabel_referenced_variable_metadata_label\": [\"Toxicity\", null, \"Analysis Method\"]\n}\n```\n" + "markdownDescription": "\nGenerates a dataframe where each record in the dataframe is the library ig variable metadata corresponding with the variable label found in the column provided in name. 
The metadata column names are prefixed with the string provided in `id`.\n\nInput\n\nTarget Dataset: SUPPLB\n\nProduct: sdtmig\n\nVersion: 3-4\n\nDataset:\n\n```\n{\n \"STUDYID\": [\"STUDY1\", \"STUDY1\", \"STUDY1\"],\n \"USUBJID\": [\"SUBJ1\", \"SUBJ1\", \"SUBJ1\"],\n \"QLABEL\": [\"Toxicity\", \"Viscosity\", \"Analysis Method\"]\n}\n```\n\nRule:\n\n```yaml\n- operator: label_referenced_variable_metadata\n id: $qlabel_referenced_variable_metadata\n name: \"QLABEL\"\n```\n\nOutput\n\n```\n{\n \"STUDYID\": [\"STUDY1\", \"STUDY1\", \"STUDY1\"],\n \"USUBJID\": [\"SUBJ1\", \"SUBJ1\", \"SUBJ1\"],\n \"QLABEL\": [\"Toxicity\", \"Viscosity\", \"Analysis Method\"],\n \"$qlabel_referenced_variable_metadata_name\": [\"LBTOX\", null, \"LBANMETH\"],\n \"$qlabel_referenced_variable_metadata_role\": [\n \"Variable Qualifier\",\n null,\n \"Record Qualifier\"\n ],\n \"$qlabel_referenced_variable_metadata_ordinal\": [44, null, 38],\n \"$qlabel_referenced_variable_metadata_core\": [\"Req\", \"Req\", \"Req\"],\n \"$qlabel_referenced_variable_metadata_label\": [\"Toxicity\", null, \"Analysis Method\"]\n}\n```\n" } }, "required": ["id", "operator", "name"], @@ -277,7 +277,7 @@ "properties": { "operator": { "const": "name_referenced_variable_metadata", - "markdownDescription": "\nGenerates a dataframe where each record in the dataframe is the library ig variable metadata corresponding with the variable name found in the column provided in name. 
The metadata column names are prefixed with the string provided in `id`.\n\nInput\n\nTarget Dataset: SUPPLB\n\nProduct: sdtmig\n\nVersion: 3-4\n\nDataset:\n\n```\n{\n \"STUDYID\": [\"STUDY1\", \"STUDY1\", \"STUDY1\"],\n \"USUBJID\": [\"SUBJ1\", \"SUBJ1\", \"SUBJ1\"],\n \"QNAM\": [\"Toxicity\", \"LBVISCOS\", \"Analysis Method\"]\n}\n```\n\nRule:\n\n```yaml\n- operator: name_referenced_variable_metadata\n id: $qnam_referenced_variable_metadata\n name: \"QNAM\"\n```\n\nOutput\n\n```\n{\n \"STUDYID\": [\"STUDY1\", \"STUDY1\", \"STUDY1\"],\n \"USUBJID\": [\"SUBJ1\", \"SUBJ1\", \"SUBJ1\"],\n \"QNAM\": [\"LBTOX\", \"LBVISCOS\", \"LBANMETH\"],\n \"$qnam_referenced_variable_metadata_name\": [\"LBTOX\", null, \"LBANMETH\"],\n \"$qnam_referenced_variable_metadata_role\": [\n \"Variable Qualifier\",\n null,\n \"Record Qualifier\"\n ],\n \"$qnam_referenced_variable_metadata_ordinal\": [44, null, 38],\n \"$qnam_referenced_variable_metadata_label\": [\"Toxicity\", null, \"Analysis Method\"]\n}\n```\n" + "markdownDescription": "\nGenerates a dataframe where each record in the dataframe is the library ig variable metadata corresponding with the variable name found in the column provided in name. 
The metadata column names are prefixed with the string provided in `id`.\n\nInput\n\nTarget Dataset: SUPPLB\n\nProduct: sdtmig\n\nVersion: 3-4\n\nDataset:\n\n```\n{\n \"STUDYID\": [\"STUDY1\", \"STUDY1\", \"STUDY1\"],\n \"USUBJID\": [\"SUBJ1\", \"SUBJ1\", \"SUBJ1\"],\n \"QNAM\": [\"Toxicity\", \"LBVISCOS\", \"Analysis Method\"]\n}\n```\n\nRule:\n\n```yaml\n- operator: name_referenced_variable_metadata\n id: $qnam_referenced_variable_metadata\n name: \"QNAM\"\n```\n\nOutput\n\n```\n{\n \"STUDYID\": [\"STUDY1\", \"STUDY1\", \"STUDY1\"],\n \"USUBJID\": [\"SUBJ1\", \"SUBJ1\", \"SUBJ1\"],\n \"QNAM\": [\"LBTOX\", \"LBVISCOS\", \"LBANMETH\"],\n \"$qnam_referenced_variable_metadata_name\": [\"LBTOX\", null, \"LBANMETH\"],\n \"$qnam_referenced_variable_metadata_role\": [\n \"Variable Qualifier\",\n null,\n \"Record Qualifier\"\n ],\n \"$qnam_referenced_variable_metadata_ordinal\": [44, null, 38],\n \"$qnam_referenced_variable_metadata_core\": [\"Req\", \"Req\", \"Req\"],\n \"$qnam_referenced_variable_metadata_label\": [\"Toxicity\", null, \"Analysis Method\"]\n}\n```\n" } }, "required": ["id", "operator", "name"], @@ -297,7 +297,7 @@ "properties": { "operator": { "const": "record_count", - "markdownDescription": "\nIf no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. Filter can have a wildcard `&` that when added to the end of the filter value will look for all instances of that prefix (see 4th example below). If group is provided, returns the number of rows matching each unique set of the grouping variables. 
These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables.\n\nIf both filter and group are provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter that also match each unique set of the grouping variables.\n\n**Wildcard Filtering:** Filter values ending with % will match any records where the column value starts with the specified prefix. For example, RACE% will match RACE1, RACE2, RACE3, etc. This is useful for matching related variables with numeric or alphabetic suffixes.\n\n**Regex Transformation:** If regex is provided along with group, the regex pattern will be applied to transform grouping column values before grouping. The regex is only applied to columns where the pattern matches the data type. For example, using regex `^\\d{4}-\\d{2}-\\d{2}` on a column containing `2022-01-14T08:00` will extract `2022-01-14` for grouping purposes.\n\nIf group is provided, group_aliases may also be provided to assign new grouping variable names so that results grouped by the values in one set of grouping variables can be merged onto a dataset according to the same grouping value(s) stored in different set of grouping variables. When both group and group_aliases are provided, columns are renamed according to corresponding list position (i.e., the 1st column in group is renamed to the 1st column in group_aliases, etc.). If there are more columns listed in group than in group_aliases, only the group columns with corresponding group_aliases columns will be renamed. 
If there are more columns listed in group_aliases than in group, the extra column names in group_aliases will be ignored.\n\nExample: return the number of records in a dataset.\n\n```yaml\n- operator: record_count\n id: $records_in_dataset\n```\n\nExample: return the number of records where STUDYID = \"CDISC01\" and FLAGVAR = \"Y\".\n\n```yaml\n- operator: record_count\n id: $flagged_cdisc01_records_in_dataset\n filter:\n STUDYID: \"CDISC01\"\n FLAGVAR: \"Y\"\n```\n\nExample: return the number of records grouped by USUBJID and timing variables, extracting only the date portion from datetime values.\n\n```yaml\n- operator: record_count\n id: $records_per_usubjid_date\n group:\n - USUBJID\n - --TESTCD\n - $TIMING_VARIABLES\n regex: \"^\\d{4}-\\d{2}-\\d{2}\"\n```\n\nExample: return the number of records where QNAM starts with \"RACE\" (matches RACE1, RACE2, RACE3, etc.) per USUBJID.\n\n```yaml\n- operation: record_count\n id: $race_records_in_dataset\n filter:\n QNAM: \"RACE&\"\n group:\n - \"USUBJID\"\n```\n\nExample: return the number of records grouped by USUBJID.\n\n```yaml\n- operator: record_count\n id: $records_per_usubjid\n group:\n - USUBJID\n```\n\nExample: return the number of records grouped by USUBJID where FLAGVAR = \"Y\".\n\n```yaml\n- operator: record_count\n id: $flagged_records_per_usubjid\n group:\n - USUBJID\n filter:\n FLAGVAR: \"Y\"\n```\n\nExample: return the number of records grouped by USUBJID and IDVARVAL where QNAM = \"TEST1\" and IDVAR = \"GROUPID\", renaming the IDVARVAL column to GROUPID for subsequent merging.\n\n```yaml\n- operator: record_count\n id: $test1_records_per_usubjid_groupid\n group:\n - USUBJID\n - IDVARVAL\n filter:\n QNAM: \"TEST1\"\n IDVAR: \"GROUPID\"\n group_aliases:\n - USUBJID\n - GROUPID\n```\n\nExample: Group the StudyIdentifier dataset by parent_id and merge the result back to the context dataset StudyVersion using StudyVersion.id == StudyIdentifier.parent_id\n\n```yaml\nScope:\n Entities:\n Include:\n - 
StudyVersion\nOperations:\n - domain: StudyIdentifier\n filter:\n parent_entity: \"StudyVersion\"\n parent_rel: \"studyIdentifiers\"\n rel_type: \"definition\"\n studyIdentifierScope.organizationType.code: \"C70793\"\n studyIdentifierScope.organizationType.codeSystem: \"http://www.cdisc.org\"\n group:\n - parent_id\n group_aliases:\n - id\n id: $num_sponsor_ids\n operator: record_count\n```\n" + "markdownDescription": "\nIf no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. Filter can have a wildcard `&` that when added to the end of the filter value will look for all instances of that prefix (see 4th example below). If group is provided, returns the number of rows matching each unique set of the grouping variables. These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables.\n\nIf both filter and group are provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter that also match each unique set of the grouping variables.\n\n**Wildcard Filtering:** Filter values ending with % will match any records where the column value starts with the specified prefix. For example, RACE% will match RACE1, RACE2, RACE3, etc. This is useful for matching related variables with numeric or alphabetic suffixes.\n\n**Regex Transformation:** If regex is provided along with group, the regex pattern will be applied to transform grouping column values before grouping. The regex is only applied to columns where the pattern matches the data type. 
For example, using regex `^\\d{4}-\\d{2}-\\d{2}` on a column containing `2022-01-14T08:00` will extract `2022-01-14` for grouping purposes.\n\nIf group is provided, group_aliases may also be provided to assign new grouping variable names so that results grouped by the values in one set of grouping variables can be merged onto a dataset according to the same grouping value(s) stored in different set of grouping variables. When both group and group_aliases are provided, columns are renamed according to corresponding list position (i.e., the 1st column in group is renamed to the 1st column in group_aliases, etc.). If there are more columns listed in group than in group_aliases, only the group columns with corresponding group_aliases columns will be renamed. If there are more columns listed in group_aliases than in group, the extra column names in group_aliases will be ignored.\n\nExample: return the number of records in a dataset.\n\n```yaml\n- operator: record_count\n id: $records_in_dataset\n```\n\nExample: return the number of records where STUDYID = \"CDISC01\" and FLAGVAR = \"Y\".\n\n```yaml\n- operator: record_count\n id: $flagged_cdisc01_records_in_dataset\n filter:\n STUDYID: \"CDISC01\"\n FLAGVAR: \"Y\"\n```\n\nExample: return the number of records grouped by USUBJID and timing variables, extracting only the date portion from datetime values.\n\n```yaml\n- operator: record_count\n id: $records_per_usubjid_date\n group:\n - USUBJID\n - --TESTCD\n - $TIMING_VARIABLES\n regex: \"^\\d{4}-\\d{2}-\\d{2}\"\n```\n\nExample: return the number of records where QNAM starts with \"RACE\" (matches RACE1, RACE2, RACE3, etc.) 
per USUBJID.\n\n```yaml\n- operator: record_count\n id: $race_records_in_dataset\n filter:\n QNAM: \"RACE&\"\n group:\n - \"USUBJID\"\n```\n\nExample: return the number of records grouped by USUBJID.\n\n```yaml\n- operator: record_count\n id: $records_per_usubjid\n group:\n - USUBJID\n```\n\nExample: return the number of records grouped by USUBJID where FLAGVAR = \"Y\".\n\n```yaml\n- operator: record_count\n id: $flagged_records_per_usubjid\n group:\n - USUBJID\n filter:\n FLAGVAR: \"Y\"\n```\n\nExample: return the number of records grouped by USUBJID and IDVARVAL where QNAM = \"TEST1\" and IDVAR = \"GROUPID\", renaming the IDVARVAL column to GROUPID for subsequent merging.\n\n```yaml\n- operator: record_count\n id: $test1_records_per_usubjid_groupid\n group:\n - USUBJID\n - IDVARVAL\n filter:\n QNAM: \"TEST1\"\n IDVAR: \"GROUPID\"\n group_aliases:\n - USUBJID\n - GROUPID\n```\n\nExample: Group the StudyIdentifier dataset by parent_id and merge the result back to the context dataset StudyVersion using StudyVersion.id == StudyIdentifier.parent_id\n\n```yaml\nScope:\n Entities:\n Include:\n - StudyVersion\nOperations:\n - domain: StudyIdentifier\n filter:\n parent_entity: \"StudyVersion\"\n parent_rel: \"studyIdentifiers\"\n rel_type: \"definition\"\n studyIdentifierScope.organizationType.code: \"C70793\"\n studyIdentifierScope.organizationType.codeSystem: \"http://www.cdisc.org\"\n group:\n - parent_id\n group_aliases:\n - id\n id: $num_sponsor_ids\n operator: record_count\n```\n" } }, "required": ["id", "operator"], diff --git a/resources/schema/rule/Operations.md b/resources/schema/rule/Operations.md index 7615f980b..0d9b24bac 100644 --- a/resources/schema/rule/Operations.md +++ b/resources/schema/rule/Operations.md @@ -674,6 +674,7 @@ Output: ### get_column_order_from_library Fetches column order for a given domain from the CDISC library. The lists with column names are sorted in accordance to "ordinal" key of library metadata. 
+Optionally filters variables based on specified metadata criteria. Rule Type: Variable Metadata Check @@ -686,6 +687,8 @@ Check: Operations: - id: $ig_variables operator: get_column_order_from_library + key_name: "core" # role, core, etc + key_value: "Exp" # Timing, Req, Exp, Perm, etc ``` ### get_model_column_order @@ -751,7 +754,7 @@ Output Filters variables from the dataset based on specified metadata criteria. Returns a list of variable names that exist in the dataset and match the filter criteria. ```yaml -- operation: get_dataset_filtered_variables +- operator: get_dataset_filtered_variables id: $timing_variables key_name: "role" key_value: "Timing" @@ -834,6 +837,7 @@ Output "Record Qualifier" ], "$qlabel_referenced_variable_metadata_ordinal": [44, null, 38], + "$qlabel_referenced_variable_metadata_core": ["Req", "Req", "Req"], "$qlabel_referenced_variable_metadata_label": ["Toxicity", null, "Analysis Method"] } ``` @@ -882,6 +886,7 @@ Output "Record Qualifier" ], "$qnam_referenced_variable_metadata_ordinal": [44, null, 38], + "$qnam_referenced_variable_metadata_core": ["Req", "Req", "Req"], "$qnam_referenced_variable_metadata_label": ["Toxicity", null, "Analysis Method"] } ``` @@ -1095,7 +1100,7 @@ Example: return the number of records grouped by USUBJID and timing variables, e Example: return the number of records where QNAM starts with "RACE" (matches RACE1, RACE2, RACE3, etc.) per USUBJID. 
```yaml -- operation: record_count +- operator: record_count id: $race_records_in_dataset filter: QNAM: "RACE&" diff --git a/tests/unit/test_operations/test_label_referenced_variable_metadata.py b/tests/unit/test_operations/test_label_referenced_variable_metadata.py index a70e9d8d5..58caa1c15 100644 --- a/tests/unit/test_operations/test_label_referenced_variable_metadata.py +++ b/tests/unit/test_operations/test_label_referenced_variable_metadata.py @@ -206,6 +206,7 @@ def mock_cached_method(*args, **kwargs): "$label_referenced_variable_name", "$label_referenced_variable_role", "$label_referenced_variable_ordinal", + "$label_referenced_variable_core", "$label_referenced_variable_label", ] diff --git a/tests/unit/test_operations/test_name_referenced_variable_metadata.py b/tests/unit/test_operations/test_name_referenced_variable_metadata.py index 0014bacdf..51ba41363 100644 --- a/tests/unit/test_operations/test_name_referenced_variable_metadata.py +++ b/tests/unit/test_operations/test_name_referenced_variable_metadata.py @@ -206,6 +206,7 @@ def mock_cached_method(*args, **kwargs): "$name_referenced_variable_name", "$name_referenced_variable_role", "$name_referenced_variable_ordinal", + "$name_referenced_variable_core", "$name_referenced_variable_label", ]