From 968970fdd3ec18fe7381d0077ac0a7087633550a Mon Sep 17 00:00:00 2001
From: Ron Keizer <ron@insight-rx.com>
Date: Fri, 3 Apr 2026 12:08:08 -0700
Subject: [PATCH 1/5] feat: add check_nm_dataset() for automated NONMEM dataset
 validation

Adds a comprehensive validation function that runs 35+ checks on NONMEM
datasets and returns a summary table (check/description/result) without
throwing errors. Covers column naming, EVID/AMT/DV/MDV consistency,
TIME ordering, ADDL/II overlap detection, reserved names, covariate
missingness, and more.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 DESCRIPTION                            |   2 +-
 NAMESPACE                              |   1 +
 R/check_nm_dataset.R                   | 626 +++++++++++++++++++++++++
 man/check_nm_dataset.Rd                |  45 ++
 tests/testthat/test-check_nm_dataset.R | 232 +++++++++
 5 files changed, 905 insertions(+), 1 deletion(-)
 create mode 100644 R/check_nm_dataset.R
 create mode 100644 man/check_nm_dataset.Rd
 create mode 100644 tests/testthat/test-check_nm_dataset.R

diff --git a/DESCRIPTION b/DESCRIPTION
index d3add72..9550923 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: irxforge
 Title: Forging data for pharmacometric analyses
-Version: 0.0.0.9004
+Version: 0.0.0.9005
 Authors@R: c(
     person("Ron", "Keizer", email = "ron@insight-rx.com", role = c("cre", "aut")),
     person("Michael", "McCarthy", email = "michael.mccarthy@insight-rx.com", role = "ctb"),
diff --git a/NAMESPACE b/NAMESPACE
index 23045ba..c93d68c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add_nominal_timepoints)
+export(check_nm_dataset)
 export(create_demographics_table)
 export(download_nhanes_cache)
 export(get_nominal_timepoints)
diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R
new file mode 100644
index 0000000..555d824
--- /dev/null
+++ b/R/check_nm_dataset.R
@@ -0,0 +1,626 @@
+#' Run automated quality checks on a NONMEM dataset
+#'
+#' Performs a comprehensive set of checks on a NONMEM-style dataset and returns
+#' a summary table with the result of each check. No errors are thrown; all
+#' issues are reported as rows in the output table.
+#'
+#' @param data a data.frame (or tibble) containing the NONMEM dataset to check.
+#' @param exclusion_flag character name of the column used as an exclusion flag
+#'   (for `$DATA ACCEPT` / `IGNORE` in NONMEM). If `NULL` (default), the
+#'   exclusion-flag check is skipped.
+#' @param max_digits integer; threshold for flagging numbers with too many
+#'   significant digits (default `10`).
+#' @param max_id_digits integer; preferred maximum number of characters for the
+#'   ID variable (default `5`).
+#' @param extra_reserved character vector of additional reserved NONMEM names
+#'   to check for (e.g. PK parameters used in ADVAN/TRANS such as `"CL"`,
+#'   `"V"`, `"V1"`, `"K12"`).
+#' @param verbose logical; if `TRUE` (default), messages are printed for every
+#'   check as it is evaluated.
+#'
+#' @returns A data.frame with columns `check` (short test name), `description`
+#'   (human-readable explanation), and `result` (`"PASS"`, `"FAIL"`, or
+#'   `"NA"` when a check is not applicable).
+#'
+#' @export
+check_nm_dataset <- function(
+  data,
+  exclusion_flag = NULL,
+  max_digits = 10,
+  max_id_digits = 5,
+  extra_reserved = NULL,
+  verbose = TRUE
+) {
+
+  results <- data.frame(
+    check = character(0),
+    description = character(0),
+    result = character(0),
+    stringsAsFactors = FALSE
+  )
+
+  log_check <- function(check, description, result) {
+    if (verbose) {
+      symbol <- switch(result, PASS = "[PASS]", FAIL = "[FAIL]", "[ NA ]")
+      message(sprintf("%-8s %s", symbol, description))
+    }
+    results[nrow(results) + 1, ] <<- list(check, description, result)
+  }
+
+  has_col <- function(col) col %in% names(data)
+  cols <- toupper(names(data))
+
+  # ──────────────────────────────────────────────────────────────────────────
+
+  # Required columns
+  # ──────────────────────────────────────────────────────────────────────────
+  required <- c("ID", "TIME", "DV")
+  present <- required %in% cols
+  log_check(
+    "required_cols",
+    paste0("Minimum required columns present (", paste(required, collapse = ", "), ")"),
+    if (all(present)) "PASS" else "FAIL"
+  )
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Unique column names
+  # ──────────────────────────────────────────────────────────────────────────
+  log_check(
+    "unique_colnames",
+    "Column names are unique",
+    if (anyDuplicated(names(data)) == 0) "PASS" else "FAIL"
+  )
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # No special characters in column names
+  # ──────────────────────────────────────────────────────────────────────────
+  bad_names <- grep("[^A-Za-z0-9_]", names(data), value = TRUE)
+  log_check(
+    "colname_chars",
+    "Column names contain no special characters",
+    if (length(bad_names) == 0) "PASS" else "FAIL"
+  )
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Reserved NONMEM names
+  # ──────────────────────────────────────────────────────────────────────────
+  reserved <- c(
+    "L1", "L2", "DATE", "DAT1", "DAT2", "DAT3",
+    "PCMT", "CALL", "CONT", "NEWIND", "W",
+    "ETA", "THETA", "EPS", "ERR",
+    "IPRED", "IRES", "IWRES", "CWRES", "PHI"
+  )
+  if (!is.null(extra_reserved)) {
+    reserved <- unique(c(reserved, toupper(extra_reserved)))
+  }
+  used_reserved <- intersect(cols, reserved)
+  log_check(
+    "reserved_names",
+    "No reserved NONMEM variable names used as column names",
+    if (length(used_reserved) == 0) "PASS" else "FAIL"
+  )
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Character columns warning
+  # ──────────────────────────────────────────────────────────────────────────
+  char_cols <- names(data)[vapply(data, is.character, logical(1))]
+  log_check(
+    "char_columns",
+    "No character-type columns present (may cause NONMEM compilation failure)",
+    if (length(char_cols) == 0) "PASS" else "FAIL"
+  )
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # NA represented as "." not "NA"
+  # ──────────────────────────────────────────────────────────────────────────
+  has_na_string <- any(vapply(data, function(x) any(x == "NA", na.rm = TRUE), logical(1)))
+  log_check(
+    "na_coding",
+    "Missing values coded as '.' and not as string 'NA'",
+    if (!has_na_string) "PASS" else "FAIL"
+  )
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Commas inside character values (problematic for CSV)
+  # ──────────────────────────────────────────────────────────────────────────
+  has_commas <- any(vapply(
+    data[vapply(data, is.character, logical(1))],
+    function(x) any(grepl(",", x, fixed = TRUE), na.rm = TRUE),
+    logical(1)
+  ))
+  log_check(
+    "csv_commas",
+    "Character values do not contain commas (safe for CSV)",
+    if (!has_commas) "PASS" else "FAIL"
+  )
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Exclusion flag
+  # ──────────────────────────────────────────────────────────────────────────
+  if (!is.null(exclusion_flag)) {
+    if (has_col(exclusion_flag)) {
+      ef <- data[[exclusion_flag]]
+      ef_ok <- !any(is.na(ef)) && is.numeric(ef) && all(ef == round(ef))
+      log_check(
+        "exclusion_flag",
+        paste0("Exclusion flag '", exclusion_flag, "' is non-missing and integer"),
+        if (ef_ok) "PASS" else "FAIL"
+      )
+    } else {
+      log_check(
+        "exclusion_flag",
+        paste0("Exclusion flag '", exclusion_flag, "' exists in dataset"),
+        "FAIL"
+      )
+    }
+  } else {
+    log_check(
+      "exclusion_flag",
+      "Exclusion flag check (not specified)",
+      "NA"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # TIME checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("TIME")) {
+    time <- data$TIME
+    log_check(
+      "time_numeric",
+      "TIME is numeric",
+      if (is.numeric(time)) "PASS" else "FAIL"
+    )
+    if (is.numeric(time)) {
+      log_check(
+        "time_nonneg",
+        "TIME is non-negative",
+        if (all(time >= 0, na.rm = TRUE)) "PASS" else "FAIL"
+      )
+    }
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # TIME non-decreasing within ID (except EVID 3, 4)
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("TIME") && has_col("ID") && is.numeric(data$TIME)) {
+    time_ok <- TRUE
+    for (id in unique(data$ID)) {
+      idx <- data$ID == id
+      sub <- data[idx, ]
+      # exclude reset events (EVID 3, 4) from this check
+      if (has_col("EVID")) {
+        keep <- !(sub$EVID %in% c(3, 4))
+        t <- sub$TIME[keep]
+      } else {
+        t <- sub$TIME
+      }
+      if (length(t) > 1 && any(diff(t) < 0, na.rm = TRUE)) {
+        time_ok <- FALSE
+        break
+      }
+    }
+    log_check(
+      "time_order",
+      "TIME is non-decreasing within each ID (excluding EVID 3/4 reset events)",
+      if (time_ok) "PASS" else "FAIL"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # EVID checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("EVID")) {
+    evid <- data$EVID
+    log_check(
+      "evid_values",
+      "EVID values are in {0, 1, 2, 3, 4}",
+      if (all(evid %in% c(0, 1, 2, 3, 4), na.rm = TRUE)) "PASS" else "FAIL"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # CMT checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("CMT")) {
+    cmt <- data$CMT
+    log_check(
+      "cmt_integer",
+      "CMT values are integers",
+      if (is.numeric(cmt) && all(cmt == round(cmt), na.rm = TRUE)) "PASS" else "FAIL"
+    )
+    if (has_col("EVID")) {
+      # For non-EVID==3 records, CMT should be positive (allow negative for compartment emptying)
+      non_reset <- data$EVID != 3
+      cmt_sub <- cmt[non_reset]
+      cmt_sub <- cmt_sub[!is.na(cmt_sub)]
+      log_check(
+        "cmt_positive",
+        "CMT is non-zero for non-reset (EVID!=3) records (negative allowed for emptying)",
+        if (all(cmt_sub != 0)) "PASS" else "FAIL"
+      )
+    }
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # DV checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("DV")) {
+    dv <- data$DV
+    log_check(
+      "dv_numeric",
+      "DV is numeric",
+      if (is.numeric(dv)) "PASS" else "FAIL"
+    )
+    if (has_col("EVID")) {
+      dose_evids <- data$EVID %in% c(1, 4)
+      dv_at_dose <- dv[dose_evids]
+      log_check(
+        "dv_missing_dose",
+        "DV is missing (NA or 0) for dosing records (EVID 1, 4)",
+        if (all(is.na(dv_at_dose) | dv_at_dose == 0)) "PASS" else "FAIL"
+      )
+    }
+    # Non-numeric / NA in DV for observation records
+    if (has_col("EVID") && is.numeric(dv)) {
+      obs_dv <- dv[data$EVID == 0]
+      log_check(
+        "dv_obs_nonmissing",
+        "DV is non-missing for observation records (EVID 0)",
+        if (!any(is.na(obs_dv))) "PASS" else "FAIL"
+      )
+    }
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # MDV checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("MDV")) {
+    mdv <- data$MDV
+    log_check(
+      "mdv_binary",
+      "MDV is binary (0 or 1)",
+      if (all(mdv %in% c(0, 1), na.rm = TRUE)) "PASS" else "FAIL"
+    )
+    if (has_col("DV") && is.numeric(data$DV) && has_col("EVID")) {
+      obs <- data$EVID == 0
+      # For observations: MDV should be 1 when DV is NA, 0 when DV is not NA
+      expected_mdv <- ifelse(is.na(data$DV[obs]), 1, 0)
+      log_check(
+        "mdv_dv_consistent",
+        "MDV is consistent with is.na(DV) for observation records (EVID 0)",
+        if (all(mdv[obs] == expected_mdv, na.rm = TRUE)) "PASS" else "FAIL"
+      )
+      # For dosing records, MDV should be 1
+      dosing <- data$EVID %in% c(1, 4)
+      log_check(
+        "mdv_dosing",
+        "MDV is 1 for dosing records (EVID 1, 4)",
+        if (all(mdv[dosing] == 1, na.rm = TRUE)) "PASS" else "FAIL"
+      )
+    }
+    # Non-missing observations should have MDV = 0
+    if (has_col("DV") && is.numeric(data$DV)) {
+      nonmissing_obs <- !is.na(data$DV) & data$DV != 0
+      if (has_col("EVID")) {
+        nonmissing_obs <- nonmissing_obs & data$EVID == 0
+      }
+      log_check(
+        "mdv_nonmissing_obs",
+        "MDV is 0 for non-missing observation records (DV > 0)",
+        if (all(mdv[nonmissing_obs] == 0, na.rm = TRUE)) "PASS" else "FAIL"
+      )
+    }
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # AMT checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("AMT")) {
+    amt <- data$AMT
+    log_check(
+      "amt_numeric",
+      "AMT is numeric",
+      if (is.numeric(amt)) "PASS" else "FAIL"
+    )
+    if (has_col("EVID") && is.numeric(amt)) {
+      obs_other <- data$EVID %in% c(0, 2)
+      amt_obs <- amt[obs_other]
+      log_check(
+        "amt_zero_obs",
+        "AMT is 0 or missing for observation/other-type records (EVID 0, 2)",
+        if (all(is.na(amt_obs) | amt_obs == 0)) "PASS" else "FAIL"
+      )
+      dose_recs <- data$EVID %in% c(1, 4)
+      amt_dose <- amt[dose_recs]
+      log_check(
+        "amt_positive_dose",
+        "AMT is positive for dosing records (EVID 1, 4)",
+        if (all(amt_dose > 0, na.rm = TRUE)) "PASS" else "FAIL"
+      )
+    }
+    # AMT > 0 should have EVID = 1
+    if (has_col("EVID") && is.numeric(amt)) {
+      amt_pos <- !is.na(amt) & amt > 0
+      log_check(
+        "amt_evid_consistent",
+        "Records with AMT > 0 have EVID = 1 (or 4)",
+        if (all(data$EVID[amt_pos] %in% c(1, 4))) "PASS" else "FAIL"
+      )
+    }
+    # Non-numeric/NA in AMT for dosing records
+    if (has_col("EVID")) {
+      dose_recs <- data$EVID %in% c(1, 4)
+      log_check(
+        "amt_nonmissing_dose",
+        "AMT is non-missing for dosing records (EVID 1, 4)",
+        if (!any(is.na(amt[dose_recs]))) "PASS" else "FAIL"
+      )
+    }
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # DV / MDV / BLQ consistency
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("DV") && has_col("MDV") && has_col("EVID") && is.numeric(data$DV)) {
+    obs <- data$EVID == 0
+    dv_leq_zero <- obs & !is.na(data$DV) & data$DV <= 0
+    log_check(
+      "dv_leq_zero_mdv",
+      "DV <= 0 observations have MDV = 1 (or BLQ flag)",
+      if (sum(dv_leq_zero) == 0 || all(data$MDV[dv_leq_zero] == 1)) "PASS" else "FAIL"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # RATE checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("RATE")) {
+    rate <- data$RATE
+    log_check(
+      "rate_numeric",
+      "RATE is numeric",
+      if (is.numeric(rate)) "PASS" else "FAIL"
+    )
+    if (is.numeric(rate) && has_col("EVID")) {
+      dose_recs <- data$EVID %in% c(1, 4)
+      rate_dose <- rate[dose_recs]
+      rate_dose <- rate_dose[!is.na(rate_dose)]
+      # RATE for dosing events: -2, -1, 0, or positive
+      log_check(
+        "rate_dose_values",
+        "RATE for dosing records is -2, -1, 0, or positive",
+        if (all(rate_dose %in% c(-2, -1, 0) | rate_dose > 0)) "PASS" else "FAIL"
+      )
+    }
+  } else {
+    log_check("rate_present", "RATE column present", "NA")
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # SS checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("SS")) {
+    ss <- data$SS
+    log_check(
+      "ss_numeric",
+      "SS is numeric",
+      if (is.numeric(ss)) "PASS" else "FAIL"
+    )
+    if (is.numeric(ss) && has_col("EVID")) {
+      dose_recs <- data$EVID %in% c(1, 4)
+      ss_dose <- ss[dose_recs]
+      ss_dose <- ss_dose[!is.na(ss_dose)]
+      log_check(
+        "ss_values",
+        "SS for dosing records is 0 or 1",
+        if (all(ss_dose %in% c(0, 1))) "PASS" else "FAIL"
+      )
+    }
+  } else {
+    log_check("ss_present", "SS column present", "NA")
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # ADDL / II checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("ADDL") || has_col("II")) {
+    # ADDL and II must both be present
+    log_check(
+      "addl_ii_paired",
+      "ADDL and II are both present (required together)",
+      if (has_col("ADDL") && has_col("II")) "PASS" else "FAIL"
+    )
+    if (has_col("ADDL")) {
+      addl <- data$ADDL
+      log_check(
+        "addl_nonneg_int",
+        "ADDL is a non-negative integer for dosing records",
+        if (is.numeric(addl) && has_col("EVID")) {
+          dose_recs <- data$EVID %in% c(1, 4)
+          addl_d <- addl[dose_recs]
+          addl_d <- addl_d[!is.na(addl_d)]
+          all(addl_d >= 0) && all(addl_d == round(addl_d))
+        } else {
+          FALSE
+        }
+      )
+    }
+    if (has_col("II")) {
+      ii <- data$II
+      log_check(
+        "ii_nonneg_int",
+        "II is a non-negative integer for dosing records",
+        if (is.numeric(ii) && has_col("EVID")) {
+          dose_recs <- data$EVID %in% c(1, 4)
+          ii_d <- ii[dose_recs]
+          ii_d <- ii_d[!is.na(ii_d)]
+          all(ii_d >= 0) && all(ii_d == round(ii_d))
+        } else {
+          FALSE
+        }
+      )
+    }
+  } else {
+    log_check("addl_ii_present", "ADDL/II columns present", "NA")
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Overlapping ADDL-expanded doses
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("ADDL") && has_col("II") && has_col("EVID") && has_col("ID") &&
+      has_col("TIME") && is.numeric(data$ADDL) && is.numeric(data$II)) {
+    overlap_found <- FALSE
+    for (id in unique(data$ID)) {
+      sub <- data[data$ID == id, ]
+      dose_idx <- which(sub$EVID %in% c(1, 4))
+      if (length(dose_idx) < 2) next
+      for (i in seq_len(length(dose_idx) - 1)) {
+        row_i <- dose_idx[i]
+        row_next <- dose_idx[i + 1]
+        addl_i <- sub$ADDL[row_i]
+        ii_i <- sub$II[row_i]
+        if (!is.na(addl_i) && !is.na(ii_i) && addl_i > 0 && ii_i > 0) {
+          last_expanded <- sub$TIME[row_i] + addl_i * ii_i
+          if (last_expanded >= sub$TIME[row_next]) {
+            overlap_found <- TRUE
+            break
+          }
+        }
+      }
+      if (overlap_found) break
+    }
+    log_check(
+      "addl_overlap",
+      "No overlapping ADDL-expanded doses (last expanded dose before next explicit dose)",
+      if (!overlap_found) "PASS" else "FAIL"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # ID checks
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("ID")) {
+    ids <- as.character(data$ID)
+    has_leading_zeros <- any(grepl("^0\\d+", ids))
+    log_check(
+      "id_leading_zeros",
+      "IDs do not have leading zeros (would be lost when NONMEM reads the data)",
+      if (!has_leading_zeros) "PASS" else "FAIL"
+    )
+    max_len <- max(nchar(ids))
+    log_check(
+      "id_digits",
+      paste0("ID variable has <= ", max_id_digits, " digits (longer may need $TABLE FORMAT)"),
+      if (max_len <= max_id_digits) "PASS" else "FAIL"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Unique records (ID, TIME, CMT, EVID)
+  # ──────────────────────────────────────────────────────────────────────────
+  dup_cols <- intersect(c("ID", "TIME", "CMT", "EVID"), names(data))
+  if (length(dup_cols) >= 2) {
+    dup_check <- duplicated(data[, dup_cols, drop = FALSE])
+    log_check(
+      "unique_records",
+      paste0("Unique combination of ", paste(dup_cols, collapse = ", ")),
+      if (!any(dup_check)) "PASS" else "FAIL"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Dataset ordering (ID, TIME)
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("ID") && has_col("TIME")) {
+    sorted <- data[order(data$ID, data$TIME), ]
+    is_sorted <- identical(
+      paste(data$ID, data$TIME),
+      paste(sorted$ID, sorted$TIME)
+    )
+    log_check(
+      "data_ordered",
+      "Dataset is ordered by at least ID and TIME",
+      if (is_sorted) "PASS" else "FAIL"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Large numbers (precision issues)
+  # ──────────────────────────────────────────────────────────────────────────
+  num_cols <- names(data)[vapply(data, is.numeric, logical(1))]
+  if (length(num_cols) > 0) {
+    too_many_digits <- FALSE
+    for (col in num_cols) {
+      vals <- data[[col]]
+      vals <- vals[!is.na(vals) & vals != 0]
+      if (length(vals) > 0) {
+        n_digits <- nchar(gsub("[^0-9]", "", format(vals, scientific = FALSE)))
+        if (any(n_digits > max_digits)) {
+          too_many_digits <- TRUE
+          break
+        }
+      }
+    }
+    log_check(
+      "precision",
+      paste0("Numeric values have <= ", max_digits, " significant digits"),
+      if (!too_many_digits) "PASS" else "FAIL"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # Missing covariate values
+  # ──────────────────────────────────────────────────────────────────────────
+  # Covariates are columns that are not standard NONMEM columns
+  nm_standard <- c(
+    "ID", "TIME", "DV", "MDV", "EVID", "AMT", "RATE", "SS",
+    "II", "ADDL", "CMT", "GROUP", "BLQ", "LLOQ", "TAD"
+  )
+  cov_cols <- setdiff(names(data), c(nm_standard, if (!is.null(exclusion_flag)) exclusion_flag))
+  if (length(cov_cols) > 0) {
+    has_missing_cov <- FALSE
+    for (col in cov_cols) {
+      vals <- data[[col]]
+      if (any(is.na(vals)) || any(vals %in% c(-999, -99, "."), na.rm = TRUE)) {
+        has_missing_cov <- TRUE
+        break
+      }
+    }
+    log_check(
+      "covariate_missing",
+      "No missing or sentinel values (-999, '.') in covariate columns",
+      if (!has_missing_cov) "PASS" else "FAIL"
+    )
+  }
+
+  # ──────────────────────────────────────────────────────────────────────────
+  # RATE column consistency with AMT
+  # ──────────────────────────────────────────────────────────────────────────
+  if (has_col("RATE") && has_col("AMT") && is.numeric(data$RATE) && is.numeric(data$AMT)) {
+    if (has_col("EVID")) {
+      dose_recs <- data$EVID %in% c(1, 4)
+      rate_dose <- data$RATE[dose_recs]
+      amt_dose <- data$AMT[dose_recs]
+      # If RATE is a real rate (> 0 and not -1/-2), AMT handling differs
+      real_rate <- !is.na(rate_dose) & !is.na(amt_dose) & rate_dose > 0
+      log_check(
+        "rate_amt_consistency",
+        "Dosing records with positive RATE (mg/h) also have positive AMT",
+        if (sum(real_rate) == 0 || all(amt_dose[real_rate] > 0)) "PASS" else "FAIL"
+      )
+    }
+  }
+
+  if (verbose) {
+    n_pass <- sum(results$result == "PASS")
+    n_fail <- sum(results$result == "FAIL")
+    n_na <- sum(results$result == "NA")
+    message(sprintf(
+      "\n--- Summary: %d checks | %d PASS | %d FAIL | %d NA ---",
+      nrow(results), n_pass, n_fail, n_na
+    ))
+  }
+
+  results
+}
diff --git a/man/check_nm_dataset.Rd b/man/check_nm_dataset.Rd
new file mode 100644
index 0000000..2d0bcd2
--- /dev/null
+++ b/man/check_nm_dataset.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/check_nm_dataset.R
+\name{check_nm_dataset}
+\alias{check_nm_dataset}
+\title{Run automated quality checks on a NONMEM dataset}
+\usage{
+check_nm_dataset(
+  data,
+  exclusion_flag = NULL,
+  max_digits = 10,
+  max_id_digits = 5,
+  extra_reserved = NULL,
+  verbose = TRUE
+)
+}
+\arguments{
+\item{data}{a data.frame (or tibble) containing the NONMEM dataset to check.}
+
+\item{exclusion_flag}{character name of the column used as an exclusion flag
+(for \verb{$DATA ACCEPT} / \code{IGNORE} in NONMEM). If \code{NULL} (default), the
+exclusion-flag check is skipped.}
+
+\item{max_digits}{integer; threshold for flagging numbers with too many
+significant digits (default \code{10}).}
+
+\item{max_id_digits}{integer; preferred maximum number of characters for the
+ID variable (default \code{5}).}
+
+\item{extra_reserved}{character vector of additional reserved NONMEM names
+to check for (e.g. PK parameters used in ADVAN/TRANS such as \code{"CL"},
+\code{"V"}, \code{"V1"}, \code{"K12"}).}
+
+\item{verbose}{logical; if \code{TRUE} (default), messages are printed for every
+check as it is evaluated.}
+}
+\value{
+A data.frame with columns \code{check} (short test name), \code{description}
+(human-readable explanation), and \code{result} (\code{"PASS"}, \code{"FAIL"}, or
+\code{"NA"} when a check is not applicable).
+}
+\description{
+Performs a comprehensive set of checks on a NONMEM-style dataset and returns
+a summary table with the result of each check. No errors are thrown; all
+issues are reported as rows in the output table.
+}
diff --git a/tests/testthat/test-check_nm_dataset.R b/tests/testthat/test-check_nm_dataset.R
new file mode 100644
index 0000000..1f19ef5
--- /dev/null
+++ b/tests/testthat/test-check_nm_dataset.R
@@ -0,0 +1,232 @@
+# Helper: minimal valid NONMEM dataset
+make_nm <- function(n_id = 2, n_obs = 3) {
+  do.call(rbind, lapply(seq_len(n_id), function(id) {
+    dose <- data.frame(
+      ID = id, TIME = 0, DV = NA_real_, MDV = 1, EVID = 1,
+      AMT = 100, CMT = 1, stringsAsFactors = FALSE
+    )
+    obs <- data.frame(
+      ID = id,
+      TIME = seq_len(n_obs),
+      DV = runif(n_obs, 1, 10),
+      MDV = 0, EVID = 0, AMT = 0, CMT = 1,
+      stringsAsFactors = FALSE
+    )
+    rbind(dose, obs)
+  }))
+}
+
+test_that("clean dataset passes all applicable checks", {
+  d <- make_nm()
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_true(all(res$result %in% c("PASS", "NA")))
+  expect_s3_class(res, "data.frame")
+  expect_named(res, c("check", "description", "result"))
+})
+
+test_that("missing required columns are flagged", {
+  d <- data.frame(ID = 1, TIME = 0) # no DV
+
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "required_cols"], "FAIL")
+})
+
+test_that("duplicate column names are flagged", {
+  d <- make_nm()
+  names(d)[7] <- "ID" # duplicate
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "unique_colnames"], "FAIL")
+})
+
+test_that("special characters in column names are flagged", {
+  d <- make_nm()
+  names(d)[1] <- "ID#1"
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "colname_chars"], "FAIL")
+})
+
+test_that("reserved NONMEM names are detected", {
+  d <- make_nm()
+  d$THETA <- 1
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "reserved_names"], "FAIL")
+})
+
+test_that("extra_reserved catches PK parameter names", {
+  d <- make_nm()
+  d$CL <- 5
+  # Without extra_reserved, CL is not flagged (it's not in the base list)
+  res1 <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res1$result[res1$check == "reserved_names"], "PASS")
+  # With extra_reserved
+  res2 <- check_nm_dataset(d, extra_reserved = c("CL", "V"), verbose = FALSE)
+  expect_equal(res2$result[res2$check == "reserved_names"], "FAIL")
+})
+
+test_that("negative TIME is flagged", {
+  d <- make_nm()
+  d$TIME[2] <- -1
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "time_nonneg"], "FAIL")
+})
+
+test_that("invalid EVID values are flagged", {
+  d <- make_nm()
+  d$EVID[1] <- 5
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "evid_values"], "FAIL")
+})
+
+test_that("AMT checks work correctly", {
+  d <- make_nm()
+  # AMT > 0 for observation record
+  d$AMT[2] <- 50
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "amt_zero_obs"], "FAIL")
+  expect_equal(res$result[res$check == "amt_evid_consistent"], "FAIL")
+})
+
+test_that("AMT missing for dosing records is flagged", {
+  d <- make_nm()
+  d$AMT[1] <- NA
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "amt_nonmissing_dose"], "FAIL")
+})
+
+test_that("DV present for dosing records is flagged", {
+  d <- make_nm()
+  d$DV[1] <- 999  # dose record should not have a real DV
+
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "dv_missing_dose"], "FAIL")
+})
+
+test_that("MDV consistency is checked", {
+  d <- make_nm()
+  d$MDV[2] <- 1  # observation with non-missing DV should have MDV = 0
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "mdv_nonmissing_obs"], "FAIL")
+})
+
+test_that("exclusion flag checks", {
+  d <- make_nm()
+  d$EXCL <- c(0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L)
+  res <- check_nm_dataset(d, exclusion_flag = "EXCL", verbose = FALSE)
+  expect_equal(res$result[res$check == "exclusion_flag"], "PASS")
+
+  # NA in exclusion flag
+  d$EXCL[1] <- NA
+
+  res2 <- check_nm_dataset(d, exclusion_flag = "EXCL", verbose = FALSE)
+  expect_equal(res2$result[res2$check == "exclusion_flag"], "FAIL")
+})
+
+test_that("ID leading zeros flagged", {
+  d <- make_nm(n_id = 1)
+  d$ID <- "001"
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "id_leading_zeros"], "FAIL")
+})
+
+test_that("decreasing TIME within ID is flagged", {
+  d <- make_nm(n_id = 1)
+  d$TIME <- c(0, 3, 2, 1)
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "time_order"], "FAIL")
+})
+
+test_that("character columns are flagged", {
+  d <- make_nm()
+  d$STUDY <- "A"
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "char_columns"], "FAIL")
+})
+
+test_that("string 'NA' values are detected", {
+  d <- make_nm()
+  d$DV <- as.character(d$DV)
+  d$DV[2] <- "NA"
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "na_coding"], "FAIL")
+})
+
+test_that("dataset ordering is checked", {
+  d <- make_nm(n_id = 2)
+  d <- d[sample(nrow(d)), ]  # shuffle
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "data_ordered"], "FAIL")
+})
+
+test_that("ADDL/II pairing is checked", {
+  d <- make_nm()
+  d$ADDL <- 0
+  # II missing
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "addl_ii_paired"], "FAIL")
+})
+
+test_that("ADDL overlap is detected", {
+  d <- data.frame(
+    ID = c(1, 1, 1),
+    TIME = c(0, 24, 48),
+    DV = c(NA, 5, 3),
+    MDV = c(1, 0, 0),
+    EVID = c(1, 0, 0),
+    AMT = c(100, 0, 0),
+    CMT = c(1, 1, 1),
+    ADDL = c(5, 0, 0),  # expands to TIME = 0, 12, 24, 36, 48 -- overlaps next dose
+    II = c(12, 0, 0),
+    stringsAsFactors = FALSE
+  )
+  # No second dose record here, so no overlap to detect.
+  # Add a second dose:
+  d2 <- rbind(d[1, ], data.frame(
+    ID = 1, TIME = 24, DV = NA, MDV = 1, EVID = 1,
+    AMT = 100, CMT = 1, ADDL = 0, II = 0, stringsAsFactors = FALSE
+  ), d[2:3, ])
+  res <- check_nm_dataset(d2, verbose = FALSE)
+  # First dose: TIME=0, ADDL=5, II=12 => last expanded at 60 >= next dose at 24
+  expect_equal(res$result[res$check == "addl_overlap"], "FAIL")
+})
+
+test_that("covariate missing values are detected", {
+  d <- make_nm()
+  d$WT <- c(70, 80, NA, 75, 65, 90, 85, 70)
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "covariate_missing"], "FAIL")
+})
+
+test_that("covariate sentinel values (-999) are detected", {
+  d <- make_nm()
+  d$WT <- c(70, 80, -999, 75, 65, 90, 85, 70)
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "covariate_missing"], "FAIL")
+})
+
+test_that("verbose output produces messages", {
+  d <- make_nm()
+  expect_message(check_nm_dataset(d, verbose = TRUE), "PASS")
+})
+
+test_that("SS values checked for dosing records", {
+  d <- make_nm()
+  d$SS <- 0
+  d$SS[1] <- 2  # invalid: SS should be 0 or 1
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "ss_values"], "FAIL")
+})
+
+test_that("RATE column checks", {
+  d <- make_nm()
+  d$RATE <- 0
+  d$RATE[1] <- -3  # invalid
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "rate_dose_values"], "FAIL")
+})
+
+test_that("duplicate records are flagged", {
+  d <- make_nm(n_id = 1)
+  d <- rbind(d, d[2, ])  # duplicate observation record
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "unique_records"], "FAIL")
+})

From 540bef86c21f54c8848dd2594fe96702191988b0 Mon Sep 17 00:00:00 2001
From: Ron Keizer <ron@insight-rx.com>
Date: Fri, 3 Apr 2026 12:39:54 -0700
Subject: [PATCH 2/5] feat: add dictionary support to check_nm_dataset()

Allows mapping non-standard column names to expected NONMEM names via a
dictionary argument (e.g., list(ID = "SUBJ", TIME = "TAD")). All checks
now resolve column names through the dictionary. Keys are
case-insensitive and unknown keys produce a warning.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 DESCRIPTION                            |   2 +-
 R/check_nm_dataset.R                   | 230 ++++++++++++++++---------
 man/check_nm_dataset.Rd                |   9 +
 tests/testthat/test-check_nm_dataset.R |  91 ++++++++++
 4 files changed, 253 insertions(+), 79 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 9550923..7dd7fe9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: irxforge
 Title: Forging data for pharmacometric analyses
-Version: 0.0.0.9005
+Version: 0.0.0.9006
 Authors@R: c(
     person("Ron", "Keizer", email = "ron@insight-rx.com", role = c("cre", "aut")),
     person("Michael", "McCarthy", email = "michael.mccarthy@insight-rx.com", role = "ctb"),
diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R
index 555d824..befd741 100644
--- a/R/check_nm_dataset.R
+++ b/R/check_nm_dataset.R
@@ -5,6 +5,13 @@
 #' issues are reported as rows in the output table.
 #'
 #' @param data a data.frame (or tibble) containing the NONMEM dataset to check.
+#' @param dictionary an optional named list that maps standard NONMEM column
+#'   names to the actual column names in `data`. For example,
+#'   `list(ID = "NMID", TIME = "TAD")` means the column called `"NMID"` in
+#'   `data` plays the role of `ID`, and `"TAD"` plays the role of `TIME`.
+#'   Any standard name not listed in the dictionary is assumed to match the
+#'   column name in `data` as-is. Supported keys: `ID`, `TIME`, `DV`, `MDV`,
+#'   `EVID`, `AMT`, `CMT`, `RATE`, `SS`, `ADDL`, `II`.
 #' @param exclusion_flag character name of the column used as an exclusion flag
 #'   (for `$DATA ACCEPT` / `IGNORE` in NONMEM). If `NULL` (default), the
 #'   exclusion-flag check is skipped.
@@ -25,6 +32,7 @@
 #' @export
 check_nm_dataset <- function(
   data,
+  dictionary = NULL,
   exclusion_flag = NULL,
   max_digits = 10,
   max_id_digits = 5,
@@ -32,6 +40,43 @@ check_nm_dataset <- function(
   verbose = TRUE
 ) {
 
+  # ──────────────────────────────────────────────────────────────────────────
+
+  # Resolve dictionary: map standard NONMEM names → actual column names
+
+  # ──────────────────────────────────────────────────────────────────────────
+  nm_keys <- c("ID", "TIME", "DV", "MDV", "EVID", "AMT", "CMT",
+               "RATE", "SS", "ADDL", "II")
+  col_map <- stats::setNames(nm_keys, nm_keys)          # default: name == column
+  if (!is.null(dictionary)) {
+    dict_upper <- stats::setNames(
+      as.character(dictionary),
+      toupper(names(dictionary))
+    )
+    unknown <- setdiff(names(dict_upper), nm_keys)
+    if (length(unknown) > 0) {
+      warning(
+        "Dictionary keys not recognised (ignored): ",
+        paste(unknown, collapse = ", ")
+      )
+    }
+    for (k in intersect(names(dict_upper), nm_keys)) {
+      col_map[[k]] <- dict_upper[[k]]
+    }
+    if (verbose) {
+      mapped <- col_map[col_map != names(col_map)]
+      if (length(mapped) > 0) {
+        message(
+          "Dictionary mapping: ",
+          paste(names(mapped), "->", mapped, collapse = ", ")
+        )
+      }
+    }
+  }
+
+  # Helper: resolve a standard NONMEM name to its actual column name
+  col_for <- function(nm_name) col_map[[nm_name]]
+
   results <- data.frame(
     check = character(0),
     description = character(0),
@@ -47,18 +92,22 @@ check_nm_dataset <- function(
     results[nrow(results) + 1, ] <<- list(check, description, result)
   }
 
-  has_col <- function(col) col %in% names(data)
+  # Column accessor helpers — all use dictionary-resolved names
+  has_col <- function(nm_name) col_for(nm_name) %in% names(data)
+  get_col <- function(nm_name) data[[col_for(nm_name)]]
   cols <- toupper(names(data))
 
   # ──────────────────────────────────────────────────────────────────────────
-
   # Required columns
   # ──────────────────────────────────────────────────────────────────────────
   required <- c("ID", "TIME", "DV")
-  present <- required %in% cols
+  present <- vapply(required, function(k) col_for(k) %in% names(data), logical(1))
   log_check(
     "required_cols",
-    paste0("Minimum required columns present (", paste(required, collapse = ", "), ")"),
+    paste0(
+      "Minimum required columns present (",
+      paste(vapply(required, col_for, character(1)), collapse = ", "), ")"
+    ),
     if (all(present)) "PASS" else "FAIL"
   )
 
@@ -138,7 +187,7 @@ check_nm_dataset <- function(
   # Exclusion flag
   # ──────────────────────────────────────────────────────────────────────────
   if (!is.null(exclusion_flag)) {
-    if (has_col(exclusion_flag)) {
+    if (exclusion_flag %in% names(data)) {
       ef <- data[[exclusion_flag]]
       ef_ok <- !any(is.na(ef)) && is.numeric(ef) && all(ef == round(ef))
       log_check(
@@ -165,7 +214,7 @@ check_nm_dataset <- function(
   # TIME checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("TIME")) {
-    time <- data$TIME
+    time <- get_col("TIME")
     log_check(
       "time_numeric",
       "TIME is numeric",
@@ -183,17 +232,17 @@ check_nm_dataset <- function(
   # ──────────────────────────────────────────────────────────────────────────
   # TIME non-decreasing within ID (except EVID 3, 4)
   # ──────────────────────────────────────────────────────────────────────────
-  if (has_col("TIME") && has_col("ID") && is.numeric(data$TIME)) {
+  if (has_col("TIME") && has_col("ID") && is.numeric(get_col("TIME"))) {
     time_ok <- TRUE
-    for (id in unique(data$ID)) {
-      idx <- data$ID == id
-      sub <- data[idx, ]
-      # exclude reset events (EVID 3, 4) from this check
-      if (has_col("EVID")) {
-        keep <- !(sub$EVID %in% c(3, 4))
-        t <- sub$TIME[keep]
-      } else {
-        t <- sub$TIME
+    id_vec <- get_col("ID")
+    time_vec <- get_col("TIME")
+    evid_vec <- if (has_col("EVID")) get_col("EVID") else NULL
+    for (id in unique(id_vec)) {
+      idx <- id_vec == id
+      t <- time_vec[idx]
+      if (!is.null(evid_vec)) {
+        keep <- !(evid_vec[idx] %in% c(3, 4))
+        t <- t[keep]
       }
       if (length(t) > 1 && any(diff(t) < 0, na.rm = TRUE)) {
         time_ok <- FALSE
@@ -211,7 +260,7 @@ check_nm_dataset <- function(
   # EVID checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("EVID")) {
-    evid <- data$EVID
+    evid <- get_col("EVID")
     log_check(
       "evid_values",
       "EVID values are in {0, 1, 2, 3, 4}",
@@ -223,15 +272,15 @@ check_nm_dataset <- function(
   # CMT checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("CMT")) {
-    cmt <- data$CMT
+    cmt <- get_col("CMT")
     log_check(
       "cmt_integer",
       "CMT values are integers",
       if (is.numeric(cmt) && all(cmt == round(cmt), na.rm = TRUE)) "PASS" else "FAIL"
     )
     if (has_col("EVID")) {
-      # For non-EVID==3 records, CMT should be positive (allow negative for compartment emptying)
-      non_reset <- data$EVID != 3
+      evid <- get_col("EVID")
+      non_reset <- evid != 3
       cmt_sub <- cmt[non_reset]
       cmt_sub <- cmt_sub[!is.na(cmt_sub)]
       log_check(
@@ -246,14 +295,15 @@ check_nm_dataset <- function(
   # DV checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("DV")) {
-    dv <- data$DV
+    dv <- get_col("DV")
     log_check(
       "dv_numeric",
       "DV is numeric",
       if (is.numeric(dv)) "PASS" else "FAIL"
     )
     if (has_col("EVID")) {
-      dose_evids <- data$EVID %in% c(1, 4)
+      evid <- get_col("EVID")
+      dose_evids <- evid %in% c(1, 4)
       dv_at_dose <- dv[dose_evids]
       log_check(
         "dv_missing_dose",
@@ -261,9 +311,9 @@ check_nm_dataset <- function(
         if (all(is.na(dv_at_dose) | dv_at_dose == 0)) "PASS" else "FAIL"
       )
     }
-    # Non-numeric / NA in DV for observation records
     if (has_col("EVID") && is.numeric(dv)) {
-      obs_dv <- dv[data$EVID == 0]
+      evid <- get_col("EVID")
+      obs_dv <- dv[evid == 0]
       log_check(
         "dv_obs_nonmissing",
         "DV is non-missing for observation records (EVID 0)",
@@ -276,34 +326,34 @@ check_nm_dataset <- function(
   # MDV checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("MDV")) {
-    mdv <- data$MDV
+    mdv <- get_col("MDV")
     log_check(
       "mdv_binary",
       "MDV is binary (0 or 1)",
       if (all(mdv %in% c(0, 1), na.rm = TRUE)) "PASS" else "FAIL"
     )
-    if (has_col("DV") && is.numeric(data$DV) && has_col("EVID")) {
-      obs <- data$EVID == 0
-      # For observations: MDV should be 1 when DV is NA, 0 when DV is not NA
-      expected_mdv <- ifelse(is.na(data$DV[obs]), 1, 0)
+    if (has_col("DV") && is.numeric(get_col("DV")) && has_col("EVID")) {
+      dv <- get_col("DV")
+      evid <- get_col("EVID")
+      obs <- evid == 0
+      expected_mdv <- ifelse(is.na(dv[obs]), 1, 0)
       log_check(
         "mdv_dv_consistent",
         "MDV is consistent with is.na(DV) for observation records (EVID 0)",
         if (all(mdv[obs] == expected_mdv, na.rm = TRUE)) "PASS" else "FAIL"
       )
-      # For dosing records, MDV should be 1
-      dosing <- data$EVID %in% c(1, 4)
+      dosing <- evid %in% c(1, 4)
       log_check(
         "mdv_dosing",
         "MDV is 1 for dosing records (EVID 1, 4)",
         if (all(mdv[dosing] == 1, na.rm = TRUE)) "PASS" else "FAIL"
       )
     }
-    # Non-missing observations should have MDV = 0
-    if (has_col("DV") && is.numeric(data$DV)) {
-      nonmissing_obs <- !is.na(data$DV) & data$DV != 0
+    if (has_col("DV") && is.numeric(get_col("DV"))) {
+      dv <- get_col("DV")
+      nonmissing_obs <- !is.na(dv) & dv != 0
       if (has_col("EVID")) {
-        nonmissing_obs <- nonmissing_obs & data$EVID == 0
+        nonmissing_obs <- nonmissing_obs & get_col("EVID") == 0
       }
       log_check(
         "mdv_nonmissing_obs",
@@ -317,21 +367,22 @@ check_nm_dataset <- function(
   # AMT checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("AMT")) {
-    amt <- data$AMT
+    amt <- get_col("AMT")
     log_check(
       "amt_numeric",
       "AMT is numeric",
       if (is.numeric(amt)) "PASS" else "FAIL"
     )
     if (has_col("EVID") && is.numeric(amt)) {
-      obs_other <- data$EVID %in% c(0, 2)
+      evid <- get_col("EVID")
+      obs_other <- evid %in% c(0, 2)
       amt_obs <- amt[obs_other]
       log_check(
         "amt_zero_obs",
         "AMT is 0 or missing for observation/other-type records (EVID 0, 2)",
         if (all(is.na(amt_obs) | amt_obs == 0)) "PASS" else "FAIL"
       )
-      dose_recs <- data$EVID %in% c(1, 4)
+      dose_recs <- evid %in% c(1, 4)
       amt_dose <- amt[dose_recs]
       log_check(
         "amt_positive_dose",
@@ -339,18 +390,18 @@ check_nm_dataset <- function(
         if (all(amt_dose > 0, na.rm = TRUE)) "PASS" else "FAIL"
       )
     }
-    # AMT > 0 should have EVID = 1
     if (has_col("EVID") && is.numeric(amt)) {
+      evid <- get_col("EVID")
       amt_pos <- !is.na(amt) & amt > 0
       log_check(
         "amt_evid_consistent",
         "Records with AMT > 0 have EVID = 1 (or 4)",
-        if (all(data$EVID[amt_pos] %in% c(1, 4))) "PASS" else "FAIL"
+        if (all(evid[amt_pos] %in% c(1, 4))) "PASS" else "FAIL"
       )
     }
-    # Non-numeric/NA in AMT for dosing records
     if (has_col("EVID")) {
-      dose_recs <- data$EVID %in% c(1, 4)
+      evid <- get_col("EVID")
+      dose_recs <- evid %in% c(1, 4)
       log_check(
         "amt_nonmissing_dose",
         "AMT is non-missing for dosing records (EVID 1, 4)",
@@ -362,13 +413,16 @@ check_nm_dataset <- function(
   # ──────────────────────────────────────────────────────────────────────────
   # DV / MDV / BLQ consistency
   # ──────────────────────────────────────────────────────────────────────────
-  if (has_col("DV") && has_col("MDV") && has_col("EVID") && is.numeric(data$DV)) {
-    obs <- data$EVID == 0
-    dv_leq_zero <- obs & !is.na(data$DV) & data$DV <= 0
+  if (has_col("DV") && has_col("MDV") && has_col("EVID") && is.numeric(get_col("DV"))) {
+    dv <- get_col("DV")
+    mdv <- get_col("MDV")
+    evid <- get_col("EVID")
+    obs <- evid == 0
+    dv_leq_zero <- obs & !is.na(dv) & dv <= 0
     log_check(
       "dv_leq_zero_mdv",
       "DV <= 0 observations have MDV = 1 (or BLQ flag)",
-      if (sum(dv_leq_zero) == 0 || all(data$MDV[dv_leq_zero] == 1)) "PASS" else "FAIL"
+      if (sum(dv_leq_zero) == 0 || all(mdv[dv_leq_zero] == 1)) "PASS" else "FAIL"
     )
   }
 
@@ -376,17 +430,17 @@ check_nm_dataset <- function(
   # RATE checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("RATE")) {
-    rate <- data$RATE
+    rate <- get_col("RATE")
     log_check(
       "rate_numeric",
       "RATE is numeric",
       if (is.numeric(rate)) "PASS" else "FAIL"
     )
     if (is.numeric(rate) && has_col("EVID")) {
-      dose_recs <- data$EVID %in% c(1, 4)
+      evid <- get_col("EVID")
+      dose_recs <- evid %in% c(1, 4)
       rate_dose <- rate[dose_recs]
       rate_dose <- rate_dose[!is.na(rate_dose)]
-      # RATE for dosing events: -2, -1, 0, or positive
       log_check(
         "rate_dose_values",
         "RATE for dosing records is -2, -1, 0, or positive",
@@ -401,14 +455,15 @@ check_nm_dataset <- function(
   # SS checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("SS")) {
-    ss <- data$SS
+    ss <- get_col("SS")
     log_check(
       "ss_numeric",
       "SS is numeric",
       if (is.numeric(ss)) "PASS" else "FAIL"
     )
     if (is.numeric(ss) && has_col("EVID")) {
-      dose_recs <- data$EVID %in% c(1, 4)
+      evid <- get_col("EVID")
+      dose_recs <- evid %in% c(1, 4)
       ss_dose <- ss[dose_recs]
       ss_dose <- ss_dose[!is.na(ss_dose)]
       log_check(
@@ -425,19 +480,19 @@ check_nm_dataset <- function(
   # ADDL / II checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("ADDL") || has_col("II")) {
-    # ADDL and II must both be present
     log_check(
       "addl_ii_paired",
       "ADDL and II are both present (required together)",
       if (has_col("ADDL") && has_col("II")) "PASS" else "FAIL"
     )
     if (has_col("ADDL")) {
-      addl <- data$ADDL
+      addl <- get_col("ADDL")
       log_check(
         "addl_nonneg_int",
         "ADDL is a non-negative integer for dosing records",
         if (is.numeric(addl) && has_col("EVID")) {
-          dose_recs <- data$EVID %in% c(1, 4)
+          evid <- get_col("EVID")
+          dose_recs <- evid %in% c(1, 4)
           addl_d <- addl[dose_recs]
           addl_d <- addl_d[!is.na(addl_d)]
           all(addl_d >= 0) && all(addl_d == round(addl_d))
@@ -447,12 +502,13 @@ check_nm_dataset <- function(
       )
     }
     if (has_col("II")) {
-      ii <- data$II
+      ii <- get_col("II")
       log_check(
         "ii_nonneg_int",
         "II is a non-negative integer for dosing records",
         if (is.numeric(ii) && has_col("EVID")) {
-          dose_recs <- data$EVID %in% c(1, 4)
+          evid <- get_col("EVID")
+          dose_recs <- evid %in% c(1, 4)
           ii_d <- ii[dose_recs]
           ii_d <- ii_d[!is.na(ii_d)]
           all(ii_d >= 0) && all(ii_d == round(ii_d))
@@ -469,20 +525,25 @@ check_nm_dataset <- function(
   # Overlapping ADDL-expanded doses
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("ADDL") && has_col("II") && has_col("EVID") && has_col("ID") &&
-      has_col("TIME") && is.numeric(data$ADDL) && is.numeric(data$II)) {
+      has_col("TIME") && is.numeric(get_col("ADDL")) && is.numeric(get_col("II"))) {
     overlap_found <- FALSE
-    for (id in unique(data$ID)) {
-      sub <- data[data$ID == id, ]
-      dose_idx <- which(sub$EVID %in% c(1, 4))
+    id_vec <- get_col("ID")
+    time_vec <- get_col("TIME")
+    evid_vec <- get_col("EVID")
+    addl_vec <- get_col("ADDL")
+    ii_vec <- get_col("II")
+    for (id in unique(id_vec)) {
+      idx <- which(id_vec == id)
+      dose_idx <- idx[evid_vec[idx] %in% c(1, 4)]
       if (length(dose_idx) < 2) next
       for (i in seq_len(length(dose_idx) - 1)) {
-        row_i <- dose_idx[i]
-        row_next <- dose_idx[i + 1]
-        addl_i <- sub$ADDL[row_i]
-        ii_i <- sub$II[row_i]
+        ri <- dose_idx[i]
+        rn <- dose_idx[i + 1]
+        addl_i <- addl_vec[ri]
+        ii_i <- ii_vec[ri]
         if (!is.na(addl_i) && !is.na(ii_i) && addl_i > 0 && ii_i > 0) {
-          last_expanded <- sub$TIME[row_i] + addl_i * ii_i
-          if (last_expanded >= sub$TIME[row_next]) {
+          last_expanded <- time_vec[ri] + addl_i * ii_i
+          if (last_expanded >= time_vec[rn]) {
             overlap_found <- TRUE
             break
           }
@@ -501,7 +562,7 @@ check_nm_dataset <- function(
   # ID checks
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("ID")) {
-    ids <- as.character(data$ID)
+    ids <- as.character(get_col("ID"))
     has_leading_zeros <- any(grepl("^0\\d+", ids))
     log_check(
       "id_leading_zeros",
@@ -519,7 +580,9 @@ check_nm_dataset <- function(
   # ──────────────────────────────────────────────────────────────────────────
   # Unique records (ID, TIME, CMT, EVID)
   # ──────────────────────────────────────────────────────────────────────────
-  dup_cols <- intersect(c("ID", "TIME", "CMT", "EVID"), names(data))
+  dup_nm_keys <- c("ID", "TIME", "CMT", "EVID")
+  dup_actual <- vapply(dup_nm_keys, col_for, character(1))
+  dup_cols <- dup_actual[dup_actual %in% names(data)]
   if (length(dup_cols) >= 2) {
     dup_check <- duplicated(data[, dup_cols, drop = FALSE])
     log_check(
@@ -533,10 +596,12 @@ check_nm_dataset <- function(
   # Dataset ordering (ID, TIME)
   # ──────────────────────────────────────────────────────────────────────────
   if (has_col("ID") && has_col("TIME")) {
-    sorted <- data[order(data$ID, data$TIME), ]
+    id_vec <- get_col("ID")
+    time_vec <- get_col("TIME")
+    ord <- order(id_vec, time_vec)
     is_sorted <- identical(
-      paste(data$ID, data$TIME),
-      paste(sorted$ID, sorted$TIME)
+      paste(id_vec, time_vec),
+      paste(id_vec[ord], time_vec[ord])
     )
     log_check(
       "data_ordered",
@@ -572,12 +637,20 @@ check_nm_dataset <- function(
   # ──────────────────────────────────────────────────────────────────────────
   # Missing covariate values
   # ──────────────────────────────────────────────────────────────────────────
-  # Covariates are columns that are not standard NONMEM columns
+  # Covariates are columns that are not standard NONMEM columns (mapped or not)
   nm_standard <- c(
     "ID", "TIME", "DV", "MDV", "EVID", "AMT", "RATE", "SS",
     "II", "ADDL", "CMT", "GROUP", "BLQ", "LLOQ", "TAD"
   )
-  cov_cols <- setdiff(names(data), c(nm_standard, if (!is.null(exclusion_flag)) exclusion_flag))
+  # Include both the standard names and the dictionary-mapped actual names
+  all_standard <- unique(c(
+    nm_standard,
+    vapply(intersect(nm_keys, nm_standard), col_for, character(1))
+  ))
+  cov_cols <- setdiff(
+    names(data),
+    c(all_standard, if (!is.null(exclusion_flag)) exclusion_flag)
+  )
   if (length(cov_cols) > 0) {
     has_missing_cov <- FALSE
     for (col in cov_cols) {
@@ -597,12 +670,13 @@ check_nm_dataset <- function(
   # ──────────────────────────────────────────────────────────────────────────
   # RATE column consistency with AMT
   # ──────────────────────────────────────────────────────────────────────────
-  if (has_col("RATE") && has_col("AMT") && is.numeric(data$RATE) && is.numeric(data$AMT)) {
+  if (has_col("RATE") && has_col("AMT") &&
+      is.numeric(get_col("RATE")) && is.numeric(get_col("AMT"))) {
     if (has_col("EVID")) {
-      dose_recs <- data$EVID %in% c(1, 4)
-      rate_dose <- data$RATE[dose_recs]
-      amt_dose <- data$AMT[dose_recs]
-      # If RATE is a real rate (> 0 and not -1/-2), AMT handling differs
+      evid <- get_col("EVID")
+      dose_recs <- evid %in% c(1, 4)
+      rate_dose <- get_col("RATE")[dose_recs]
+      amt_dose <- get_col("AMT")[dose_recs]
       real_rate <- !is.na(rate_dose) & !is.na(amt_dose) & rate_dose > 0
       log_check(
         "rate_amt_consistency",
diff --git a/man/check_nm_dataset.Rd b/man/check_nm_dataset.Rd
index 2d0bcd2..16a1754 100644
--- a/man/check_nm_dataset.Rd
+++ b/man/check_nm_dataset.Rd
@@ -6,6 +6,7 @@
 \usage{
 check_nm_dataset(
   data,
+  dictionary = NULL,
   exclusion_flag = NULL,
   max_digits = 10,
   max_id_digits = 5,
@@ -16,6 +17,14 @@ check_nm_dataset(
 \arguments{
 \item{data}{a data.frame (or tibble) containing the NONMEM dataset to check.}
 
+\item{dictionary}{an optional named list that maps standard NONMEM column
+names to the actual column names in \code{data}. For example,
+\code{list(ID = "NMID", TIME = "TAD")} means the column called \code{"NMID"} in
+\code{data} plays the role of \code{ID}, and \code{"TAD"} plays the role of \code{TIME}.
+Any standard name not listed in the dictionary is assumed to match the
+column name in \code{data} as-is. Supported keys: \code{ID}, \code{TIME}, \code{DV}, \code{MDV},
+\code{EVID}, \code{AMT}, \code{CMT}, \code{RATE}, \code{SS}, \code{ADDL}, \code{II}.}
+
 \item{exclusion_flag}{character name of the column used as an exclusion flag
 (for \verb{$DATA ACCEPT} / \code{IGNORE} in NONMEM). If \code{NULL} (default), the
 exclusion-flag check is skipped.}
diff --git a/tests/testthat/test-check_nm_dataset.R b/tests/testthat/test-check_nm_dataset.R
index 1f19ef5..30ce6f5 100644
--- a/tests/testthat/test-check_nm_dataset.R
+++ b/tests/testthat/test-check_nm_dataset.R
@@ -16,6 +16,13 @@ make_nm <- function(n_id = 2, n_obs = 3) {
   }))
 }
 
+# Helper: dataset with renamed columns (for dictionary tests)
+make_nm_renamed <- function(n_id = 2, n_obs = 3) {
+  d <- make_nm(n_id = n_id, n_obs = n_obs)
+  names(d) <- c("SUBJ", "TAD", "CONC", "MISS", "FLAG", "DOSE", "COMP")
+  d
+}
+
 test_that("clean dataset passes all applicable checks", {
   d <- make_nm()
   res <- check_nm_dataset(d, verbose = FALSE)
@@ -230,3 +237,87 @@ test_that("duplicate records are flagged", {
   res <- check_nm_dataset(d, verbose = FALSE)
   expect_equal(res$result[res$check == "unique_records"], "FAIL")
 })
+
+# ── Dictionary tests ──────────────────────────────────────────────────────────
+
+test_that("dictionary maps renamed columns and passes clean dataset", {
+  d <- make_nm_renamed()
+  dict <- list(
+    ID = "SUBJ", TIME = "TAD", DV = "CONC", MDV = "MISS",
+    EVID = "FLAG", AMT = "DOSE", CMT = "COMP"
+  )
+  res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE)
+  # All applicable checks should pass (or be NA), none should fail
+  expect_true(all(res$result %in% c("PASS", "NA")))
+})
+
+test_that("dictionary resolves required columns correctly", {
+  # Without dictionary, renamed dataset fails required_cols
+  d <- make_nm_renamed()
+  res_no_dict <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res_no_dict$result[res_no_dict$check == "required_cols"], "FAIL")
+
+  # With dictionary, it passes
+  dict <- list(ID = "SUBJ", TIME = "TAD", DV = "CONC")
+  res_dict <- check_nm_dataset(d, dictionary = dict, verbose = FALSE)
+  expect_equal(res_dict$result[res_dict$check == "required_cols"], "PASS")
+})
+
+test_that("dictionary catches errors in renamed columns", {
+  d <- make_nm_renamed()
+  d$TAD[2] <- -1  # negative TIME (via TAD)
+  dict <- list(
+    ID = "SUBJ", TIME = "TAD", DV = "CONC", MDV = "MISS",
+    EVID = "FLAG", AMT = "DOSE", CMT = "COMP"
+  )
+  res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE)
+  expect_equal(res$result[res$check == "time_nonneg"], "FAIL")
+})
+
+test_that("dictionary works for AMT/EVID consistency", {
+  d <- make_nm_renamed()
+  d$DOSE[3] <- 50  # AMT > 0 for an observation (EVID = 0)
+  dict <- list(
+    ID = "SUBJ", TIME = "TAD", DV = "CONC", MDV = "MISS",
+    EVID = "FLAG", AMT = "DOSE", CMT = "COMP"
+  )
+  res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE)
+  expect_equal(res$result[res$check == "amt_zero_obs"], "FAIL")
+  expect_equal(res$result[res$check == "amt_evid_consistent"], "FAIL")
+})
+
+test_that("partial dictionary works (unmapped columns keep standard names)", {
+  d <- make_nm()
+  names(d)[1] <- "SUBJ"  # only rename ID
+  dict <- list(ID = "SUBJ")
+  res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE)
+  expect_equal(res$result[res$check == "required_cols"], "PASS")
+  expect_true(all(res$result %in% c("PASS", "NA")))
+})
+
+test_that("dictionary is case-insensitive for keys", {
+  d <- make_nm_renamed()
+  dict <- list(
+    id = "SUBJ", time = "TAD", dv = "CONC", mdv = "MISS",
+    evid = "FLAG", amt = "DOSE", cmt = "COMP"
+  )
+  res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE)
+  expect_equal(res$result[res$check == "required_cols"], "PASS")
+})
+
+test_that("unknown dictionary keys produce a warning", {
+  d <- make_nm()
+  expect_warning(
+    check_nm_dataset(d, dictionary = list(FOOBAR = "X"), verbose = FALSE),
+    "not recognised"
+  )
+})
+
+test_that("dictionary verbose output shows mapping", {
+  d <- make_nm()
+  names(d)[1] <- "SUBJ"
+  expect_message(
+    check_nm_dataset(d, dictionary = list(ID = "SUBJ"), verbose = TRUE),
+    "Dictionary mapping: ID -> SUBJ"
+  )
+})

From d957d489b01f278a6b0436f796076ae815303692 Mon Sep 17 00:00:00 2001
From: roninsightrx <ron@insight-rx.com>
Date: Fri, 3 Apr 2026 13:18:47 -0700
Subject: [PATCH 3/5] Update R/check_nm_dataset.R

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 R/check_nm_dataset.R | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R
index befd741..fdcc2b7 100644
--- a/R/check_nm_dataset.R
+++ b/R/check_nm_dataset.R
@@ -655,14 +655,22 @@ check_nm_dataset <- function(
     has_missing_cov <- FALSE
     for (col in cov_cols) {
       vals <- data[[col]]
-      if (any(is.na(vals)) || any(vals %in% c(-999, -99, "."), na.rm = TRUE)) {
+      has_sentinel <- FALSE
+
+      if (is.numeric(vals)) {
+        has_sentinel <- any(vals %in% c(-999, -99), na.rm = TRUE)
+      } else if (is.character(vals) || is.factor(vals)) {
+        has_sentinel <- any(as.character(vals) %in% c(".", "NA"), na.rm = TRUE)
+      }
+
+      if (any(is.na(vals)) || has_sentinel) {
         has_missing_cov <- TRUE
         break
       }
     }
     log_check(
       "covariate_missing",
-      "No missing or sentinel values (-999, '.') in covariate columns",
+      "No missing or sentinel values (-999, -99, '.', 'NA') in covariate columns",
       if (!has_missing_cov) "PASS" else "FAIL"
     )
   }

From 3b300e51b3169dbd9cd1220dbd6093bc6aed3ae0 Mon Sep 17 00:00:00 2001
From: Ron Keizer <ron@insight-rx.com>
Date: Fri, 3 Apr 2026 13:21:10 -0700
Subject: [PATCH 4/5] fix: align MDV/DV checks on consistent missing/BLQ
 definition

Reviewer feedback: the MDV/DV checks were internally inconsistent.
mdv_dv_consistent used only is.na(DV), but dv_leq_zero_mdv expected
DV <= 0 to have MDV=1, and mdv_nonmissing_obs used DV != 0 instead
of DV > 0. A valid dataset with BLQ records could not pass all three.

Changes:
- Unified "missing/BLQ" = is.na(DV) | DV <= 0 across all MDV checks
- mdv_dv_consistent now expects MDV=1 for NA or DV<=0, MDV=0 for DV>0
- Removed redundant dv_leq_zero_mdv check (covered by updated above)
- Fixed mdv_nonmissing_obs filter to use DV > 0 (matching description)
- Added test for BLQ (DV <= 0) MDV consistency

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 R/check_nm_dataset.R                   | 30 +++++++-------------------
 tests/testthat/test-check_nm_dataset.R | 21 ++++++++++++++++--
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R
index fdcc2b7..5245f58 100644
--- a/R/check_nm_dataset.R
+++ b/R/check_nm_dataset.R
@@ -336,10 +336,12 @@ check_nm_dataset <- function(
       dv <- get_col("DV")
       evid <- get_col("EVID")
       obs <- evid == 0
-      expected_mdv <- ifelse(is.na(dv[obs]), 1, 0)
+      # Treat NA and DV <= 0 as missing/BLQ → MDV should be 1;
+      # DV > 0 → MDV should be 0
+      expected_mdv <- ifelse(is.na(dv[obs]) | dv[obs] <= 0, 1, 0)
       log_check(
         "mdv_dv_consistent",
-        "MDV is consistent with is.na(DV) for observation records (EVID 0)",
+        "MDV matches DV for observations: MDV=1 when DV is NA or <= 0 (BLQ), MDV=0 when DV > 0",
         if (all(mdv[obs] == expected_mdv, na.rm = TRUE)) "PASS" else "FAIL"
       )
       dosing <- evid %in% c(1, 4)
@@ -351,14 +353,14 @@ check_nm_dataset <- function(
     }
     if (has_col("DV") && is.numeric(get_col("DV"))) {
       dv <- get_col("DV")
-      nonmissing_obs <- !is.na(dv) & dv != 0
+      valid_obs <- !is.na(dv) & dv > 0
       if (has_col("EVID")) {
-        nonmissing_obs <- nonmissing_obs & get_col("EVID") == 0
+        valid_obs <- valid_obs & get_col("EVID") == 0
       }
       log_check(
         "mdv_nonmissing_obs",
-        "MDV is 0 for non-missing observation records (DV > 0)",
-        if (all(mdv[nonmissing_obs] == 0, na.rm = TRUE)) "PASS" else "FAIL"
+        "MDV is 0 for valid observation records (DV > 0)",
+        if (all(mdv[valid_obs] == 0, na.rm = TRUE)) "PASS" else "FAIL"
       )
     }
   }
@@ -410,22 +412,6 @@ check_nm_dataset <- function(
     }
   }
 
-  # ──────────────────────────────────────────────────────────────────────────
-  # DV / MDV / BLQ consistency
-  # ──────────────────────────────────────────────────────────────────────────
-  if (has_col("DV") && has_col("MDV") && has_col("EVID") && is.numeric(get_col("DV"))) {
-    dv <- get_col("DV")
-    mdv <- get_col("MDV")
-    evid <- get_col("EVID")
-    obs <- evid == 0
-    dv_leq_zero <- obs & !is.na(dv) & dv <= 0
-    log_check(
-      "dv_leq_zero_mdv",
-      "DV <= 0 observations have MDV = 1 (or BLQ flag)",
-      if (sum(dv_leq_zero) == 0 || all(mdv[dv_leq_zero] == 1)) "PASS" else "FAIL"
-    )
-  }
-
   # ──────────────────────────────────────────────────────────────────────────
   # RATE checks
   # ──────────────────────────────────────────────────────────────────────────
diff --git a/tests/testthat/test-check_nm_dataset.R b/tests/testthat/test-check_nm_dataset.R
index 30ce6f5..3f3bdc7 100644
--- a/tests/testthat/test-check_nm_dataset.R
+++ b/tests/testthat/test-check_nm_dataset.R
@@ -108,11 +108,28 @@ test_that("DV present for dosing records is flagged", {
   expect_equal(res$result[res$check == "dv_missing_dose"], "FAIL")
 })
 
-test_that("MDV consistency is checked", {
+test_that("MDV = 1 for valid observation (DV > 0) is flagged", {
   d <- make_nm()
-  d$MDV[2] <- 1  # observation with non-missing DV should have MDV = 0
+  d$MDV[2] <- 1  # observation with DV > 0 should have MDV = 0
   res <- check_nm_dataset(d, verbose = FALSE)
   expect_equal(res$result[res$check == "mdv_nonmissing_obs"], "FAIL")
+  expect_equal(res$result[res$check == "mdv_dv_consistent"], "FAIL")
+})
+
+test_that("MDV/DV consistency treats DV <= 0 as BLQ (MDV should be 1)", {
+  d <- make_nm()
+  # Set an observation DV to a negative value (e.g. log-transformed BLQ)
+  # and MDV = 0 — this should FAIL because DV <= 0 requires MDV = 1
+  d$DV[2] <- -2
+  d$MDV[2] <- 0
+  res <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res$result[res$check == "mdv_dv_consistent"], "FAIL")
+
+  # Now set MDV = 1 for the BLQ record — should PASS
+  d$MDV[2] <- 1
+  res2 <- check_nm_dataset(d, verbose = FALSE)
+  expect_equal(res2$result[res2$check == "mdv_dv_consistent"], "PASS")
+  expect_equal(res2$result[res2$check == "mdv_nonmissing_obs"], "PASS")
 })
 
 test_that("exclusion flag checks", {

From 50dc8752d1f0ea8954f70e44119972ebabab830e Mon Sep 17 00:00:00 2001
From: Ron Keizer <ron@insight-rx.com>
Date: Fri, 3 Apr 2026 13:22:53 -0700
Subject: [PATCH 5/5] fix: sort dose records by TIME before ADDL overlap
 detection

The overlap check iterated dose records in row order, which could
produce false positives/negatives on unsorted datasets. Now sorts
dose_idx by TIME within each ID before comparing expanded dose
windows against subsequent doses.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 R/check_nm_dataset.R                   |  3 +++
 tests/testthat/test-check_nm_dataset.R | 28 ++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R
index 5245f58..84ec070 100644
--- a/R/check_nm_dataset.R
+++ b/R/check_nm_dataset.R
@@ -520,8 +520,11 @@ check_nm_dataset <- function(
     ii_vec <- get_col("II")
     for (id in unique(id_vec)) {
       idx <- which(id_vec == id)
+      # Sort dose records by TIME to avoid false positives/negatives
+      # when the dataset is not ordered
       dose_idx <- idx[evid_vec[idx] %in% c(1, 4)]
       if (length(dose_idx) < 2) next
+      dose_idx <- dose_idx[order(time_vec[dose_idx])]
       for (i in seq_len(length(dose_idx) - 1)) {
         ri <- dose_idx[i]
         rn <- dose_idx[i + 1]
diff --git a/tests/testthat/test-check_nm_dataset.R b/tests/testthat/test-check_nm_dataset.R
index 3f3bdc7..4acbf7a 100644
--- a/tests/testthat/test-check_nm_dataset.R
+++ b/tests/testthat/test-check_nm_dataset.R
@@ -213,6 +213,34 @@ test_that("ADDL overlap is detected", {
   expect_equal(res$result[res$check == "addl_overlap"], "FAIL")
 })
 
+test_that("ADDL overlap detection works on unsorted datasets", {
+  # Build an overlapping dataset but shuffle the rows so it's not sorted
+  d <- data.frame(
+    ID   = c(1, 1, 1, 1),
+    TIME = c(0, 24, 24, 48),
+    DV   = c(NA, NA, 5, 3),
+    MDV  = c(1, 1, 0, 0),
+    EVID = c(1, 1, 0, 0),
+    AMT  = c(100, 100, 0, 0),
+    CMT  = c(1, 1, 1, 1),
+    ADDL = c(5, 0, 0, 0),
+    II   = c(12, 0, 0, 0),
+    stringsAsFactors = FALSE
+  )
+  # TIME=0, ADDL=5, II=12 => last expanded at 60 >= next dose at 24
+  # Shuffle rows so dose at TIME=24 comes before dose at TIME=0
+  d_shuffled <- d[c(3, 2, 4, 1), ]
+  res <- check_nm_dataset(d_shuffled, verbose = FALSE)
+  expect_equal(res$result[res$check == "addl_overlap"], "FAIL")
+
+  # Non-overlapping case: reduce ADDL so expansion ends before next dose
+  d2 <- d
+  d2$ADDL[1] <- 1  # last expanded at TIME = 12, next dose at 24 → OK
+  d2_shuffled <- d2[c(3, 2, 4, 1), ]
+  res2 <- check_nm_dataset(d2_shuffled, verbose = FALSE)
+  expect_equal(res2$result[res2$check == "addl_overlap"], "PASS")
+})
+
 test_that("covariate missing values are detected", {
   d <- make_nm()
   d$WT <- c(70, 80, NA, 75, 65, 90, 85, 70)