Changes from all commits
27 commits
9f5273b
#1558 WIP: csv readers
alexfurmenkov Feb 25, 2026
c791d7c
#1558 csv metadata reader and tables filtering
alexfurmenkov Feb 26, 2026
1490237
#1558 example files
alexfurmenkov Feb 26, 2026
83d2704
#1558 moved csv metadata reader
alexfurmenkov Feb 26, 2026
4cbb541
Merge branch 'main' into 1558-csv-data-reader
alexfurmenkov Feb 26, 2026
8098eb4
#1558 unit tests for dataset filtering
alexfurmenkov Feb 27, 2026
514b147
#1558 unit tests for dataset readers
alexfurmenkov Feb 27, 2026
155a236
#1558 regression and changes in metadata reader logic to preserve data
alexfurmenkov Feb 27, 2026
b3e8974
Merge branch 'main' into 1558-csv-data-reader
alexfurmenkov Mar 2, 2026
c351222
#1558 added envvar options for [-r, -er, -lr, -ss, -v, -s, -l, -dxp, …
alexfurmenkov Mar 5, 2026
21b0d3e
#1558 added envvar option for -ct and -dv parameters.
alexfurmenkov Mar 5, 2026
695e1da
Merge branch 'main' into 1558-csv-data-reader
RamilCDISC Mar 5, 2026
0fc3bb1
Merge branch 'main' into 1558-csv-data-reader
RamilCDISC Mar 5, 2026
d38fe58
#1558 error handling while reading CSV
alexfurmenkov Mar 6, 2026
43a083f
#1654 InvalidCSVFormat renamed to InvalidCSVFile
alexfurmenkov Mar 11, 2026
eb1cca9
#1558 added dotenv load from dataset path or path to datasets
alexfurmenkov Mar 14, 2026
cbca55e
Merge branch 'main' into 1558-csv-data-reader
alexfurmenkov Mar 14, 2026
6dbad2e
#1558 returned -s and -v as required
alexfurmenkov Mar 16, 2026
3ce61b5
#1558 PR fixes
alexfurmenkov Mar 20, 2026
b2ee46a
#1558 fixed csv tests
alexfurmenkov Mar 23, 2026
51b7fef
Merge branch 'refs/heads/main' into 1558-csv-data-reader
alexfurmenkov Mar 23, 2026
33c2bc4
#1558 -dp csv handling improved - errors on multiple tables.csv, fixe…
alexfurmenkov Mar 23, 2026
f3567fa
#1558 added cli arguments for tables.csv and variables.csv, .env paths.
alexfurmenkov Mar 24, 2026
14afc1b
#1558 added kwargs to dataset metadata readers. fixed README.md
alexfurmenkov Mar 24, 2026
2126df6
Merge branch 'refs/heads/main' into 1558-csv-data-reader
alexfurmenkov Mar 24, 2026
4876aef
#1558 added positional arguments to test data
alexfurmenkov Mar 24, 2026
d863b8a
Merge branch 'refs/heads/main' into 1558-csv-data-reader
alexfurmenkov Mar 25, 2026
42 changes: 27 additions & 15 deletions README.md
@@ -137,20 +137,23 @@ This will show the list of validation options.
```
-ca, --cache TEXT Relative path to cache files containing pre
loaded metadata and rules
-ps, --pool-size INTEGER Number of parallel processes for validation
-d, --data TEXT Path to directory containing data files
-ps, --pool-size INTEGER Number of parallel processes for validation
-dep, --dotenv-path Path to the .env file used to set environment variables.
-d, --data TEXT Path to directory containing data files.
DATA_DIR environment variable can be used to pass value.
-dp, --dataset-path TEXT Absolute path to dataset file. Can be specified multiple times.
-dxp, --define-xml-path TEXT Path to Define-XML
DATASET_PATH environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-dxp, --define-xml-path TEXT Path to Define-XML. DEFINE environment variable can be used to pass value.
-l, --log-level [info|debug|error|critical|disabled|warn]
Sets log level for engine logs, logs are
disabled by default
-rt, --report-template TEXT File path of report template to use for
excel output
-s, --standard TEXT CDISC standard to validate against
-s, --standard TEXT CDISC standard to validate against. STANDARD environment variable can be used to pass value.
[required]
-v, --version TEXT Standard version to validate against
-v, --version TEXT Standard version to validate against. VERSION environment variable can be used to pass value.
[required]
-ss, --substandard TEXT Substandard to validate against
-ss, --substandard TEXT Substandard to validate against. SUBSTANDARD environment variable can be used to pass value.
"SDTM", "SEND", "ADaM", or "CDASH"
[required for TIG]
-uc, --use-case TEXT Use Case for TIG Validation
@@ -161,7 +164,8 @@ This will show the list of validation options.
against, can provide more than one
NOTE: if a defineXML is provided, if it is version 2.1
engine will use the CT laid out in the define. If it is
version 2.0, -ct is expected to specify the CT package
version 2.0, -ct is expected to specify the CT package.
CONTROLLED_TERMINOLOGY_PACKAGE environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-o, --output TEXT Report output file destination and name. Path will be
relative to the validation execution directory
and should end in the desired output filename
@@ -204,27 +208,30 @@ This will show the list of validation options.
if both .env and -me <limit> are specified, the larger value will be used. If either sets the per_dataset_flag to true, it will be true.
If limit is set to 0, no maximum will be enforced.
No maximum is the default behavior.
-dv, --define-version TEXT Define-XML version used for validation
-dv, --define-version TEXT Define-XML version used for validation. DEFINE_VERSION environment variable can be used to pass value.
-dxp, --define-xml-path Path to define-xml file.
-vx, --validate-xml Enable XML validation (default 'y' to enable, otherwise disable).
--whodrug TEXT Path to directory with WHODrug dictionary
files
--meddra TEXT Path to directory with MedDRA dictionary
files
--loinc TEXT Path to directory with LOINC dictionary
--loinc TEXT Path to directory with LOINC dictionary
files
--medrt TEXT Path to directory with MEDRT dictionary
--medrt TEXT Path to directory with MEDRT dictionary
files
--unii TEXT Path to directory with UNII dictionary
--unii TEXT Path to directory with UNII dictionary
files
--snomed-version TEXT Version of snomed to use. (ex. 2024-09-01)
--snomed-url TEXT Base url of snomed api to use. (ex. https://snowstorm.snomedtools.org/snowstorm/snomed-ct)
--snomed-edition TEXT Edition of snomed to use. (ex. SNOMEDCT-US)
--snomed-version TEXT Version of snomed to use. (ex. 2024-09-01)
--snomed-url TEXT Base url of snomed api to use. (ex. https://snowstorm.snomedtools.org/snowstorm/snomed-ct)
--snomed-edition TEXT Edition of snomed to use. (ex. SNOMEDCT-US)
-r, --rules TEXT Specify rule core ID ex. CORE-000001. Can be specified multiple times.
RULES environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-er, --exclude-rules TEXT Specify rule core ID to exclude, ex. CORE-000001. Can be specified multiple times.
EXCLUDE_RULES environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-lr, --local-rules TEXT Specify relative path to directory or file containing
local rule yml and/or json rule files.
-cs, --custom-standard Adding this flag tells engine to use a custom standard specified with -s and -v
LOCAL_RULES environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-cs, --custom-standard Adding this flag tells engine to use a custom standard specified with -s and -v
that has been uploaded to the cache using update-cache
-cse, --custom-standard-encoding TEXT
Explicitly specify the file encoding to use
@@ -243,6 +250,11 @@ This will show the list of validation options.
-jcf, --jsonata-custom-functions Pair containing a variable name and a Path to directory containing a set of custom JSONata functions. Can be specified multiple times
-e, --encoding TEXT File encoding for reading datasets. If not specified, defaults to utf-8. Supported encodings: utf-8, utf-16, utf-32, cp1252, latin-1, etc.
-ft, --filetype TEXT File extension to filter datasets. Has higher priority than --dataset-path parameter.
-vcp, --variables-csv-path Path to variables.csv. Used when multiple dataset paths are provided and refer to different folders.
Not required if variables.txt exists in all -dp directories.
VARIABLES_CSV environment variable can be used to pass value.
-tcp, --tables-csv-path Path to tables.csv. Required when multiple dataset paths are provided and refer to different folders.
                                  TABLES_CSV environment variable can be used to pass value.
--help Show this message and exit.
```
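The environment-variable fallbacks listed in the help text above can be collected in a single .env file and passed with the -dep flag. A minimal sketch — the file name and all values here are hypothetical; only the variable names and separator rules come from the help text:

```
# Hypothetical .env passed via the -dep flag.
# Multi-value variables are separated by ':' on Unix and ';' on Windows.
STANDARD=sdtmig
VERSION=3-4
DATA_DIR=/data/study01/datasets
RULES=CORE-000001:CORE-000002
EXCLUDE_RULES=CORE-000100
CONTROLLED_TERMINOLOGY_PACKAGE=sdtmct-2023-12-15
```

Where a CLI flag and its environment variable are both set, the flag takes effect as documented per option above.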

1 change: 1 addition & 0 deletions cdisc_rules_engine/enums/dataformat_types.py
@@ -8,3 +8,4 @@ class DataFormatTypes(BaseEnum):
    USDM = "USDM"
    XLSX = "XLSX"
    XPT = "XPT"
    CSV = "CSV"
5 changes: 5 additions & 0 deletions cdisc_rules_engine/exceptions/custom_exceptions.py
@@ -82,6 +82,11 @@ class CTPackageNotFoundError(EngineError):
    description = "Controlled terminology package(s) not found"


class InvalidCSVFile(EngineError):
    code = 400
    description = "CSV data is malformed."


class NumberOfAttemptsExceeded(EngineError):
    pass

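The new InvalidCSVFile error is used to surface decode failures to the user with a hint about the -e flag. A minimal, self-contained sketch of that pattern — the class bodies and the helper function are simplified stand-ins, not the engine's actual API:

```python
class EngineError(Exception):
    code = 500


class InvalidCSVFile(EngineError):
    code = 400
    description = "CSV data is malformed."


def decode_csv_bytes(raw: bytes, encoding: str = "utf-8") -> str:
    # Re-raise low-level decode failures as the engine's domain error,
    # preserving the hint about the -e CLI flag.
    try:
        return raw.decode(encoding)
    except (UnicodeDecodeError, UnicodeError) as e:
        raise InvalidCSVFile(
            f"Failed to decode with {encoding} encoding: {e}. "
            "Please specify the correct encoding using the -e flag."
        )
```

Callers can then catch one exception type regardless of which low-level decoding error occurred.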
8 changes: 6 additions & 2 deletions cdisc_rules_engine/interfaces/data_reader_interface.py
@@ -11,8 +11,8 @@ def __init__(
        self, dataset_implementation=PandasDataset, encoding: str = DEFAULT_ENCODING
    ):
        """
        :param dataset_implementation DatasetInterface: The dataset type to return.
        :param encoding str: The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8).
        :param DatasetInterface dataset_implementation: The dataset type to return.
        :param str encoding: The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8).
        """
        self.dataset_implementation = dataset_implementation
        self.encoding = encoding
@@ -26,3 +26,7 @@ def read(self, data):

    def from_file(self, file_path):
        raise NotImplementedError

    def to_parquet(self, file_path) -> tuple[int, str]:
        """Returns number of rows and path to the parquet file"""
        raise NotImplementedError
2 changes: 2 additions & 0 deletions cdisc_rules_engine/models/validation_args.py
@@ -29,5 +29,7 @@
        "max_report_rows",
        "max_errors_per_rule",
        "encoding",
        "variables_csv_path",
        "tables_csv_path",
    ],
)
183 changes: 183 additions & 0 deletions cdisc_rules_engine/services/csv_metadata_reader.py
@@ -0,0 +1,183 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import Optional

import pandas as pd

from cdisc_rules_engine.constants import DEFAULT_ENCODING


class DatasetCSVMetadataReader:
    def __init__(
        self,
        file_path: str,
        file_name: str,
        encoding: str = DEFAULT_ENCODING,
        variables_csv_path: Optional[str] = None,
        tables_csv_path: Optional[str] = None,
        **kwargs,
    ):
        self.file_path = file_path
        self.file_name = file_name
        self.encoding = encoding
        self.variables_csv_path = (
            Path(variables_csv_path)
            if variables_csv_path
            else Path(self.file_path).parent / "variables.csv"
        )
        self.tables_csv_path = (
            Path(tables_csv_path)
            if tables_csv_path
            else Path(self.file_path).parent / "tables.csv"
        )

    def read(self) -> dict:
        dataset_name = Path(self.file_name).stem.lower()

        if not self.variables_csv_path.exists():
            logger = logging.getLogger("validator")
            logger.info("No variables file found for %s", dataset_name)
            variables_meta = {}
        else:
            variables_meta = self.__get_variable_metadata(
                dataset_name, self.variables_csv_path
            )

        metadata = {
            "dataset_name": dataset_name.upper(),
            "dataset_modification_date": datetime.fromtimestamp(
                Path(self.file_path).stat().st_mtime
            ).isoformat(),
            "adam_info": {
                "categorization_scheme": {},
                "w_indexes": {},
                "period": {},
                "selection_algorithm": {},
            },
        }
        metadata.update(variables_meta)
        metadata.update(self.__data_meta())
        metadata.update(self.__dataset_label())
        return metadata

    def __get_variable_metadata(
        self, dataset_name: str, variables_file_path: Path
    ) -> dict:
        logger = logging.getLogger("validator")
        try:
            meta_df = pd.read_csv(variables_file_path, encoding=self.encoding)
        except (UnicodeDecodeError, UnicodeError) as e:
            logger.error(
                f"Could not decode CSV file {variables_file_path} with {self.encoding} encoding: {e}. "
                f"Please specify the correct encoding using the -e flag."
            )
            return {}
        except Exception as e:
            logger.error("Error reading CSV file %s. %s", variables_file_path, e)
            return {}

        meta_df["dataset"] = meta_df["dataset"].apply(
            lambda x: Path(str(x)).stem.lower()
        )

        dataset_meta_df = meta_df[meta_df["dataset"] == dataset_name]

        if dataset_meta_df.empty:
            logger.info("No dataset metadata found for %s", dataset_name)
            return {}

        variable_names = dataset_meta_df["variable"].tolist()
        variable_labels = dataset_meta_df["label"].tolist()

        variable_name_to_label_map = dict(zip(variable_names, variable_labels))
        variable_name_to_data_type_map = dict(
            zip(variable_names, dataset_meta_df["type"])
        )
        variable_name_to_size_map = {
            var: (int(length) if pd.notna(length) else None)
            for var, length in zip(variable_names, dataset_meta_df["length"])
        }
        return {
            "variable_names": variable_names,
            "variable_labels": variable_labels,
            "variable_formats": [""] * len(variable_names),
            "variable_name_to_label_map": variable_name_to_label_map,
            "variable_name_to_data_type_map": variable_name_to_data_type_map,
            "variable_name_to_size_map": variable_name_to_size_map,
            "number_of_variables": len(variable_names),
        }

    def __dataset_label(self) -> dict:
        logger = logging.getLogger("validator")

        if not self.tables_csv_path.exists():
            return {}

        try:
            tables_df = pd.read_csv(self.tables_csv_path, encoding=self.encoding)
        except (UnicodeDecodeError, UnicodeError) as e:
            logger.error(
                f"\n Error reading CSV from: {self.tables_csv_path}"
                f"\n Failed to decode with {self.encoding} encoding: {e}"
                f"\n Please specify the correct encoding using the -e flag."
            )
            return {}
        except Exception as e:
            logger.error("Error reading CSV file %s. %s", self.tables_csv_path, e)
            return {}

        if "Filename" not in tables_df.columns or "Label" not in tables_df.columns:
            return {}

        tables_df["dataset"] = tables_df["Filename"].apply(
            lambda x: Path(str(x)).stem.lower()
        )

        current_dataset = Path(self.file_name).stem.lower()
        match = tables_df[tables_df["dataset"] == current_dataset]

        if match.empty:
            return {}

        return {"dataset_label": str(match.iloc[0]["Label"])}

    def __data_meta(self):
        logger = logging.getLogger("validator")
        result = {
            "dataset_length": 0,
            "first_record": {},
        }
        try:
            first_row_df = pd.read_csv(self.file_path, encoding=self.encoding, nrows=1)
        except (UnicodeDecodeError, UnicodeError) as e:
            logger.error(
                f"\n Error reading CSV from: {self.file_path}"
                f"\n Failed to decode with {self.encoding} encoding: {e}"
                f"\n Please specify the correct encoding using the -e flag."
            )
            return result
        except Exception as e:
            logger.error("Error reading CSV file %s. %s", self.file_path, e)
            return result

        if not first_row_df.empty:
            result["first_record"] = (
                first_row_df.iloc[0].fillna("").astype(str).to_dict()
            )

        try:
            with open(self.file_path, encoding=self.encoding) as f:
                result["dataset_length"] = max(
                    sum(1 for _ in f) - 1, 0
                )  # subtract header
        except (UnicodeDecodeError, UnicodeError) as e:
            logger.error(
                f"\n Error reading CSV from: {self.file_path}"
                f"\n Failed to decode with {self.encoding} encoding: {e}"
                f"\n Please specify the correct encoding using the -e flag."
            )
        except Exception as e:
            logger.error("Error reading CSV file %s. %s", self.file_path, e)

        return result
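The dataset_length computation in __data_meta above (physical line count minus one header row, floored at zero) and the first_record extraction can both be sketched with the standard library alone; the function names here are illustrative, not part of the engine:

```python
import csv
import io


def count_data_rows(csv_text: str) -> int:
    # Line count minus the header line, floored at zero so an
    # empty file never produces a negative length.
    lines = sum(1 for _ in io.StringIO(csv_text))
    return max(lines - 1, 0)


def first_record(csv_text: str) -> dict:
    # First data row as a column-to-string mapping; an empty dict
    # when the file has a header but no data rows.
    reader = csv.DictReader(io.StringIO(csv_text))
    for row in reader:
        return {k: ("" if v is None else str(v)) for k, v in row.items()}
    return {}
```

The flooring matters: a zero-byte file has no header line at all, so without `max(..., 0)` the count would come out as -1.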
54 changes: 54 additions & 0 deletions cdisc_rules_engine/services/data_readers/csv_reader.py
@@ -0,0 +1,54 @@
import tempfile

import pandas as pd

from cdisc_rules_engine.exceptions.custom_exceptions import InvalidCSVFile
from cdisc_rules_engine.interfaces import DataReaderInterface


class CSVReader(DataReaderInterface):
    def read(self, data):
        """
        Function for reading data from a specific file type and returning a
        pandas dataframe of the data.
        """
        raise NotImplementedError

    def from_file(self, file_path):
        try:
            with open(file_path, "r", encoding=self.encoding) as fp:
                data = pd.read_csv(fp, sep=",", header=0, index_col=False)
                return data
        except (UnicodeDecodeError, UnicodeError) as e:
            raise InvalidCSVFile(
                f"\n Error reading CSV from: {file_path}"
                f"\n Failed to decode with {self.encoding} encoding: {e}"
                f"\n Please specify the correct encoding using the -e flag."
            )
        except Exception as e:
            raise InvalidCSVFile(
                f"\n Error reading CSV from: {file_path}"
                f"\n {type(e).__name__}: {e}"
            )

    def to_parquet(self, file_path: str) -> tuple[int, str]:
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet")

        dataset = pd.read_csv(file_path, chunksize=20000, encoding=self.encoding)

        created = False
        num_rows = 0

        for chunk in dataset:
            num_rows += len(chunk)

            if not created:
                chunk.to_parquet(temp_file.name, engine="fastparquet")
                created = True
            else:
                chunk.to_parquet(temp_file.name, engine="fastparquet", append=True)

        if not created:
            empty_df = pd.read_csv(file_path, nrows=0, encoding=self.encoding)
            empty_df.to_parquet(temp_file.name, engine="fastparquet")

        return num_rows, temp_file.name
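to_parquet above streams the CSV in 20,000-row chunks so large files never sit in memory at once, and falls back to writing a header-only output when the CSV has no data rows. The same shape, sketched with the standard library standing in for pandas/fastparquet (all names here are hypothetical):

```python
import csv
import io
import itertools
import os
import tempfile


def convert_in_chunks(csv_text: str, chunk_size: int = 2) -> tuple[int, str]:
    """Returns (row count, output path). The output file is always
    created, even for a header-only CSV, so callers never receive
    a path to a nonexistent file."""
    out = tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".txt", newline=""
    )
    reader = csv.reader(io.StringIO(csv_text))
    header = next(reader, [])
    writer = csv.writer(out)
    writer.writerow(header)

    num_rows = 0
    while True:
        # Pull at most chunk_size rows at a time, mirroring chunksize=20000.
        chunk = list(itertools.islice(reader, chunk_size))
        if not chunk:
            break
        num_rows += len(chunk)
        writer.writerows(chunk)

    out.close()
    return num_rows, out.name
```

Writing the header up front is what makes the empty-input case safe: there is no "first chunk" branch that can be skipped entirely.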
Collaborator:

> This function always returns a file path, but the file is only written when the CSV is not empty. An empty CSV would return a path to a file that was never created, which may cause downstream errors.

Collaborator (author):

> Added empty parquet file creation for the case when the CSV is empty.
> Or should we raise a ValueError in this scenario? Should I also fix it in the XPT reader?
@@ -4,6 +4,7 @@
    DataReaderInterface,
    FactoryInterface,
)
from cdisc_rules_engine.services.data_readers.csv_reader import CSVReader
from cdisc_rules_engine.services.data_readers.xpt_reader import XPTReader
from cdisc_rules_engine.services.data_readers.dataset_json_reader import (
    DatasetJSONReader,
@@ -19,12 +20,13 @@


class DataReaderFactory(FactoryInterface):
    _reader_map = {
    _reader_map: dict[str, Type[DataReaderInterface]] = {
        DataFormatTypes.XPT.value: XPTReader,
        DataFormatTypes.PARQUET.value: ParquetReader,
        DataFormatTypes.JSON.value: DatasetJSONReader,
        DataFormatTypes.NDJSON.value: DatasetNDJSONReader,
        DataFormatTypes.USDM.value: JSONReader,
        DataFormatTypes.CSV.value: CSVReader,
    }

    def __init__(
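DataReaderFactory keys _reader_map by the DataFormatTypes value, so registering CSVReader is a one-line change. A reduced sketch of that registry-dispatch pattern, with stand-in classes rather than the engine's readers:

```python
# Stand-in reader classes; the engine's readers share DataReaderInterface.
class XPTReader: ...
class CSVReader: ...


_reader_map: dict[str, type] = {
    "XPT": XPTReader,
    "CSV": CSVReader,
}


def get_reader(file_path: str):
    # Dispatch on the upper-cased file extension; unknown formats
    # fail loudly instead of silently returning nothing.
    ext = file_path.rsplit(".", 1)[-1].upper()
    try:
        return _reader_map[ext]()
    except KeyError:
        raise ValueError(f"No reader registered for {ext!r} files")
```

Keeping the mapping as class data (as the factory does) means new formats like CSV only touch the dict, not the dispatch logic.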