From 968970fdd3ec18fe7381d0077ac0a7087633550a Mon Sep 17 00:00:00 2001 From: Ron Keizer Date: Fri, 3 Apr 2026 12:08:08 -0700 Subject: [PATCH 1/5] feat: add check_nm_dataset() for automated NONMEM dataset validation Adds a comprehensive validation function that runs 35+ checks on NONMEM datasets and returns a summary table (check/description/result) without throwing errors. Covers column naming, EVID/AMT/DV/MDV consistency, TIME ordering, ADDL/II overlap detection, reserved names, covariate missingness, and more. Co-Authored-By: Claude Opus 4.6 (1M context) --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/check_nm_dataset.R | 626 +++++++++++++++++++++++++ man/check_nm_dataset.Rd | 45 ++ tests/testthat/test-check_nm_dataset.R | 232 +++++++++ 5 files changed, 905 insertions(+), 1 deletion(-) create mode 100644 R/check_nm_dataset.R create mode 100644 man/check_nm_dataset.Rd create mode 100644 tests/testthat/test-check_nm_dataset.R diff --git a/DESCRIPTION b/DESCRIPTION index d3add72..9550923 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: irxforge Title: Forging data for pharmacometric analyses -Version: 0.0.0.9004 +Version: 0.0.0.9005 Authors@R: c( person("Ron", "Keizer", email = "ron@insight-rx.com", role = c("cre", "aut")), person("Michael", "McCarthy", email = "michael.mccarthy@insight-rx.com", role = "ctb"), diff --git a/NAMESPACE b/NAMESPACE index 23045ba..c93d68c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(add_nominal_timepoints) +export(check_nm_dataset) export(create_demographics_table) export(download_nhanes_cache) export(get_nominal_timepoints) diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R new file mode 100644 index 0000000..555d824 --- /dev/null +++ b/R/check_nm_dataset.R @@ -0,0 +1,626 @@ +#' Run automated quality checks on a NONMEM dataset +#' +#' Performs a comprehensive set of checks on a NONMEM-style dataset and returns +#' a summary table with the result of each check. No errors are thrown; all +#' issues are reported as rows in the output table. +#' +#' @param data a data.frame (or tibble) containing the NONMEM dataset to check. +#' @param exclusion_flag character name of the column used as an exclusion flag +#' (for `$DATA ACCEPT` / `IGNORE` in NONMEM). If `NULL` (default), the +#' exclusion-flag check is skipped. +#' @param max_digits integer; threshold for flagging numbers with too many +#' significant digits (default `10`). +#' @param max_id_digits integer; preferred maximum number of characters for the +#' ID variable (default `5`). +#' @param extra_reserved character vector of additional reserved NONMEM names +#' to check for (e.g. PK parameters used in ADVAN/TRANS such as `"CL"`, +#' `"V"`, `"V1"`, `"K12"`). +#' @param verbose logical; if `TRUE` (default), messages are printed for every +#' check as it is evaluated. +#' +#' @returns A data.frame with columns `check` (short test name), `description` +#' (human-readable explanation), and `result` (`"PASS"`, `"FAIL"`, or +#' `"NA"` when a check is not applicable). +#' +#' @export +check_nm_dataset <- function( + data, + exclusion_flag = NULL, + max_digits = 10, + max_id_digits = 5, + extra_reserved = NULL, + verbose = TRUE +) { + + results <- data.frame( + check = character(0), + description = character(0), + result = character(0), + stringsAsFactors = FALSE + ) + + log_check <- function(check, description, result) { + if (verbose) { + symbol <- switch(result, PASS = "[PASS]", FAIL = "[FAIL]", "[ NA ]") + message(sprintf("%-8s %s", symbol, description)) + } + results[nrow(results) + 1, ] <<- list(check, description, result) + } + + has_col <- function(col) col %in% names(data) + cols <- toupper(names(data)) + + # ────────────────────────────────────────────────────────────────────────── + + # Required columns + # ────────────────────────────────────────────────────────────────────────── + required <- c("ID", "TIME", "DV") + present <- required %in% cols + log_check( + "required_cols", + paste0("Minimum required columns present (", paste(required, collapse = ", "), ")"), + if (all(present)) "PASS" else "FAIL" + ) + + # ────────────────────────────────────────────────────────────────────────── + # Unique column names + # ────────────────────────────────────────────────────────────────────────── + log_check( + "unique_colnames", + "Column names are unique", + if (anyDuplicated(names(data)) == 0) "PASS" else "FAIL" + ) + + # ────────────────────────────────────────────────────────────────────────── + # No special characters in column names + # ────────────────────────────────────────────────────────────────────────── + bad_names <- grep("[^A-Za-z0-9_]", names(data), value = TRUE) + log_check( + "colname_chars", + "Column names contain no special characters", + if (length(bad_names) == 0) "PASS" else "FAIL" + ) + + # ────────────────────────────────────────────────────────────────────────── + # Reserved NONMEM names + # ────────────────────────────────────────────────────────────────────────── + reserved <- c( + "L1", "L2", "DATE", "DAT1", "DAT2", "DAT3", + "PCMT", "CALL", "CONT", "NEWIND", "W", + "ETA", "THETA", "EPS", "ERR", + "IPRED", "IRES", "IWRES", "CWRES", "PHI" + ) + if (!is.null(extra_reserved)) { + reserved <- unique(c(reserved, toupper(extra_reserved))) + } + used_reserved <- intersect(cols, reserved) + log_check( + "reserved_names", + "No reserved NONMEM variable names used as column names", + if (length(used_reserved) == 0) "PASS" else "FAIL" + ) + + # ────────────────────────────────────────────────────────────────────────── + # Character columns warning + # ────────────────────────────────────────────────────────────────────────── + char_cols <- names(data)[vapply(data, is.character, logical(1))] + log_check( + "char_columns", + "No character-type columns present (may cause NONMEM compilation failure)", + if (length(char_cols) == 0) "PASS" else "FAIL" + ) + + # ────────────────────────────────────────────────────────────────────────── + # NA represented as "." not "NA" + # ────────────────────────────────────────────────────────────────────────── + has_na_string <- any(vapply(data, function(x) any(x == "NA", na.rm = TRUE), logical(1))) + log_check( + "na_coding", + "Missing values coded as '.' and not as string 'NA'", + if (!has_na_string) "PASS" else "FAIL" + ) + + # ────────────────────────────────────────────────────────────────────────── + # Commas inside character values (problematic for CSV) + # ────────────────────────────────────────────────────────────────────────── + has_commas <- any(vapply( + data[vapply(data, is.character, logical(1))], + function(x) any(grepl(",", x, fixed = TRUE), na.rm = TRUE), + logical(1) + )) + log_check( + "csv_commas", + "Character values do not contain commas (safe for CSV)", + if (!has_commas) "PASS" else "FAIL" + ) + + # ────────────────────────────────────────────────────────────────────────── + # Exclusion flag + # ────────────────────────────────────────────────────────────────────────── + if (!is.null(exclusion_flag)) { + if (has_col(exclusion_flag)) { + ef <- data[[exclusion_flag]] + ef_ok <- !any(is.na(ef)) && is.numeric(ef) && all(ef == round(ef)) + log_check( + "exclusion_flag", + paste0("Exclusion flag '", exclusion_flag, "' is non-missing and integer"), + if (ef_ok) "PASS" else "FAIL" + ) + } else { + log_check( + "exclusion_flag", + paste0("Exclusion flag '", exclusion_flag, "' exists in dataset"), + "FAIL" + ) + } + } else { + log_check( + "exclusion_flag", + "Exclusion flag check (not specified)", + "NA" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # TIME checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("TIME")) { + time <- data$TIME + log_check( + "time_numeric", + "TIME is numeric", + if (is.numeric(time)) "PASS" else "FAIL" + ) + if (is.numeric(time)) { + log_check( + "time_nonneg", + "TIME is non-negative", + if (all(time >= 0, na.rm = TRUE)) "PASS" else "FAIL" + ) + } + } + + # ────────────────────────────────────────────────────────────────────────── + # TIME non-decreasing within ID (except EVID 3, 4) + # ────────────────────────────────────────────────────────────────────────── + if (has_col("TIME") && has_col("ID") && is.numeric(data$TIME)) { + time_ok <- TRUE + for (id in unique(data$ID)) { + idx <- data$ID == id + sub <- data[idx, ] + # exclude reset events (EVID 3, 4) from this check + if (has_col("EVID")) { + keep <- !(sub$EVID %in% c(3, 4)) + t <- sub$TIME[keep] + } else { + t <- sub$TIME + } + if (length(t) > 1 && any(diff(t) < 0, na.rm = TRUE)) { + time_ok <- FALSE + break + } + } + log_check( + "time_order", + "TIME is non-decreasing within each ID (excluding EVID 3/4 reset events)", + if (time_ok) "PASS" else "FAIL" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # EVID checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("EVID")) { + evid <- data$EVID + log_check( + "evid_values", + "EVID values are in {0, 1, 2, 3, 4}", + if (all(evid %in% c(0, 1, 2, 3, 4), na.rm = TRUE)) "PASS" else "FAIL" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # CMT checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("CMT")) { + cmt <- data$CMT + log_check( + "cmt_integer", + "CMT values are integers", + if (is.numeric(cmt) && all(cmt == round(cmt), na.rm = TRUE)) "PASS" else "FAIL" + ) + if (has_col("EVID")) { + # For non-EVID==3 records, CMT should be positive (allow negative for compartment emptying) + non_reset <- data$EVID != 3 + cmt_sub <- cmt[non_reset] + cmt_sub <- cmt_sub[!is.na(cmt_sub)] + log_check( + "cmt_positive", + "CMT is non-zero for non-reset (EVID!=3) records (negative allowed for emptying)", + if (all(cmt_sub != 0)) "PASS" else "FAIL" + ) + } + } + + # ────────────────────────────────────────────────────────────────────────── + # DV checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("DV")) { + dv <- data$DV + log_check( + "dv_numeric", + "DV is numeric", + if (is.numeric(dv)) "PASS" else "FAIL" + ) + if (has_col("EVID")) { + dose_evids <- data$EVID %in% c(1, 4) + dv_at_dose <- dv[dose_evids] + log_check( + "dv_missing_dose", + "DV is missing (NA or 0) for dosing records (EVID 1, 4)", + if (all(is.na(dv_at_dose) | dv_at_dose == 0)) "PASS" else "FAIL" + ) + } + # Non-numeric / NA in DV for observation records + if (has_col("EVID") && is.numeric(dv)) { + obs_dv <- dv[data$EVID == 0] + log_check( + "dv_obs_nonmissing", + "DV is non-missing for observation records (EVID 0)", + if (!any(is.na(obs_dv))) "PASS" else "FAIL" + ) + } + } + + # ────────────────────────────────────────────────────────────────────────── + # MDV checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("MDV")) { + mdv <- data$MDV + log_check( + "mdv_binary", + "MDV is binary (0 or 1)", + if (all(mdv %in% c(0, 1), na.rm = TRUE)) "PASS" else "FAIL" + ) + if (has_col("DV") && is.numeric(data$DV) && has_col("EVID")) { + obs <- data$EVID == 0 + # For observations: MDV should be 1 when DV is NA, 0 when DV is not NA + expected_mdv <- ifelse(is.na(data$DV[obs]), 1, 0) + log_check( + "mdv_dv_consistent", + "MDV is consistent with is.na(DV) for observation records (EVID 0)", + if (all(mdv[obs] == expected_mdv, na.rm = TRUE)) "PASS" else "FAIL" + ) + # For dosing records, MDV should be 1 + dosing <- data$EVID %in% c(1, 4) + log_check( + "mdv_dosing", + "MDV is 1 for dosing records (EVID 1, 4)", + if (all(mdv[dosing] == 1, na.rm = TRUE)) "PASS" else "FAIL" + ) + } + # Non-missing observations should have MDV = 0 + if (has_col("DV") && is.numeric(data$DV)) { + nonmissing_obs <- !is.na(data$DV) & data$DV != 0 + if (has_col("EVID")) { + nonmissing_obs <- nonmissing_obs & data$EVID == 0 + } + log_check( + "mdv_nonmissing_obs", + "MDV is 0 for non-missing observation records (DV > 0)", + if (all(mdv[nonmissing_obs] == 0, na.rm = TRUE)) "PASS" else "FAIL" + ) + } + } + + # ────────────────────────────────────────────────────────────────────────── + # AMT checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("AMT")) { + amt <- data$AMT + log_check( + "amt_numeric", + "AMT is numeric", + if (is.numeric(amt)) "PASS" else "FAIL" + ) + if (has_col("EVID") && is.numeric(amt)) { + obs_other <- data$EVID %in% c(0, 2) + amt_obs <- amt[obs_other] + log_check( + "amt_zero_obs", + "AMT is 0 or missing for observation/other-type records (EVID 0, 2)", + if (all(is.na(amt_obs) | amt_obs == 0)) "PASS" else "FAIL" + ) + dose_recs <- data$EVID %in% c(1, 4) + amt_dose <- amt[dose_recs] + log_check( + "amt_positive_dose", + "AMT is positive for dosing records (EVID 1, 4)", + if (all(amt_dose > 0, na.rm = TRUE)) "PASS" else "FAIL" + ) + } + # AMT > 0 should have EVID = 1 + if (has_col("EVID") && is.numeric(amt)) { + amt_pos <- !is.na(amt) & amt > 0 + log_check( + "amt_evid_consistent", + "Records with AMT > 0 have EVID = 1 (or 4)", + if (all(data$EVID[amt_pos] %in% c(1, 4))) "PASS" else "FAIL" + ) + } + # Non-numeric/NA in AMT for dosing records + if (has_col("EVID")) { + dose_recs <- data$EVID %in% c(1, 4) + log_check( + "amt_nonmissing_dose", + "AMT is non-missing for dosing records (EVID 1, 4)", + if (!any(is.na(amt[dose_recs]))) "PASS" else "FAIL" + ) + } + } + + # ────────────────────────────────────────────────────────────────────────── + # DV / MDV / BLQ consistency + # ────────────────────────────────────────────────────────────────────────── + if (has_col("DV") && has_col("MDV") && has_col("EVID") && is.numeric(data$DV)) { + obs <- data$EVID == 0 + dv_leq_zero <- obs & !is.na(data$DV) & data$DV <= 0 + log_check( + "dv_leq_zero_mdv", + "DV <= 0 observations have MDV = 1 (or BLQ flag)", + if (sum(dv_leq_zero) == 0 || all(data$MDV[dv_leq_zero] == 1)) "PASS" else "FAIL" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # RATE checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("RATE")) { + rate <- data$RATE + log_check( + "rate_numeric", + "RATE is numeric", + if (is.numeric(rate)) "PASS" else "FAIL" + ) + if (is.numeric(rate) && has_col("EVID")) { + dose_recs <- data$EVID %in% c(1, 4) + rate_dose <- rate[dose_recs] + rate_dose <- rate_dose[!is.na(rate_dose)] + # RATE for dosing events: -2, -1, 0, or positive + log_check( + "rate_dose_values", + "RATE for dosing records is -2, -1, 0, or positive", + if (all(rate_dose %in% c(-2, -1, 0) | rate_dose > 0)) "PASS" else "FAIL" + ) + } + } else { + log_check("rate_present", "RATE column present", "NA") + } + + # ────────────────────────────────────────────────────────────────────────── + # SS checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("SS")) { + ss <- data$SS + log_check( + "ss_numeric", + "SS is numeric", + if (is.numeric(ss)) "PASS" else "FAIL" + ) + if (is.numeric(ss) && has_col("EVID")) { + dose_recs <- data$EVID %in% c(1, 4) + ss_dose <- ss[dose_recs] + ss_dose <- ss_dose[!is.na(ss_dose)] + log_check( + "ss_values", + "SS for dosing records is 0 or 1", + if (all(ss_dose %in% c(0, 1))) "PASS" else "FAIL" + ) + } + } else { + log_check("ss_present", "SS column present", "NA") + } + + # ────────────────────────────────────────────────────────────────────────── + # ADDL / II checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("ADDL") || has_col("II")) { + # ADDL and II must both be present + log_check( + "addl_ii_paired", + "ADDL and II are both present (required together)", + if (has_col("ADDL") && has_col("II")) "PASS" else "FAIL" + ) + if (has_col("ADDL")) { + addl <- data$ADDL + log_check( + "addl_nonneg_int", + "ADDL is a non-negative integer for dosing records", + if (is.numeric(addl) && has_col("EVID")) { + dose_recs <- data$EVID %in% c(1, 4) + addl_d <- addl[dose_recs] + addl_d <- addl_d[!is.na(addl_d)] + all(addl_d >= 0) && all(addl_d == round(addl_d)) + } else { + FALSE + } + ) + } + if (has_col("II")) { + ii <- data$II + log_check( + "ii_nonneg_int", + "II is a non-negative integer for dosing records", + if (is.numeric(ii) && has_col("EVID")) { + dose_recs <- data$EVID %in% c(1, 4) + ii_d <- ii[dose_recs] + ii_d <- ii_d[!is.na(ii_d)] + all(ii_d >= 0) && all(ii_d == round(ii_d)) + } else { + FALSE + } + ) + } + } else { + log_check("addl_ii_present", "ADDL/II columns present", "NA") + } + + # ────────────────────────────────────────────────────────────────────────── + # Overlapping ADDL-expanded doses + # ────────────────────────────────────────────────────────────────────────── + if (has_col("ADDL") && has_col("II") && has_col("EVID") && has_col("ID") && + has_col("TIME") && is.numeric(data$ADDL) && is.numeric(data$II)) { + overlap_found <- FALSE + for (id in unique(data$ID)) { + sub <- data[data$ID == id, ] + dose_idx <- which(sub$EVID %in% c(1, 4)) + if (length(dose_idx) < 2) next + for (i in seq_len(length(dose_idx) - 1)) { + row_i <- dose_idx[i] + row_next <- dose_idx[i + 1] + addl_i <- sub$ADDL[row_i] + ii_i <- sub$II[row_i] + if (!is.na(addl_i) && !is.na(ii_i) && addl_i > 0 && ii_i > 0) { + last_expanded <- sub$TIME[row_i] + addl_i * ii_i + if (last_expanded >= sub$TIME[row_next]) { + overlap_found <- TRUE + break + } + } + } + if (overlap_found) break + } + log_check( + "addl_overlap", + "No overlapping ADDL-expanded doses (last expanded dose before next explicit dose)", + if (!overlap_found) "PASS" else "FAIL" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # ID checks + # ────────────────────────────────────────────────────────────────────────── + if (has_col("ID")) { + ids <- as.character(data$ID) + has_leading_zeros <- any(grepl("^0\\d+", ids)) + log_check( + "id_leading_zeros", + "IDs do not have leading zeros (would be lost when NONMEM reads the data)", + if (!has_leading_zeros) "PASS" else "FAIL" + ) + max_len <- max(nchar(ids)) + log_check( + "id_digits", + paste0("ID variable has <= ", max_id_digits, " digits (longer may need $TABLE FORMAT)"), + if (max_len <= max_id_digits) "PASS" else "FAIL" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # Unique records (ID, TIME, CMT, EVID) + # ────────────────────────────────────────────────────────────────────────── + dup_cols <- intersect(c("ID", "TIME", "CMT", "EVID"), names(data)) + if (length(dup_cols) >= 2) { + dup_check <- duplicated(data[, dup_cols, drop = FALSE]) + log_check( + "unique_records", + paste0("Unique combination of ", paste(dup_cols, collapse = ", ")), + if (!any(dup_check)) "PASS" else "FAIL" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # Dataset ordering (ID, TIME) + # ────────────────────────────────────────────────────────────────────────── + if (has_col("ID") && has_col("TIME")) { + sorted <- data[order(data$ID, data$TIME), ] + is_sorted <- identical( + paste(data$ID, data$TIME), + paste(sorted$ID, sorted$TIME) + ) + log_check( + "data_ordered", + "Dataset is ordered by at least ID and TIME", + if (is_sorted) "PASS" else "FAIL" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # Large numbers (precision issues) + # ────────────────────────────────────────────────────────────────────────── + num_cols <- names(data)[vapply(data, is.numeric, logical(1))] + if (length(num_cols) > 0) { + too_many_digits <- FALSE + for (col in num_cols) { + vals <- data[[col]] + vals <- vals[!is.na(vals) & vals != 0] + if (length(vals) > 0) { + n_digits <- nchar(gsub("[^0-9]", "", format(vals, scientific = FALSE))) + if (any(n_digits > max_digits)) { + too_many_digits <- TRUE + break + } + } + } + log_check( + "precision", + paste0("Numeric values have <= ", max_digits, " significant digits"), + if (!too_many_digits) "PASS" else "FAIL" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # Missing covariate values + # ────────────────────────────────────────────────────────────────────────── + # Covariates are columns that are not standard NONMEM columns + nm_standard <- c( + "ID", "TIME", "DV", "MDV", "EVID", "AMT", "RATE", "SS", + "II", "ADDL", "CMT", "GROUP", "BLQ", "LLOQ", "TAD" + ) + cov_cols <- setdiff(names(data), c(nm_standard, if (!is.null(exclusion_flag)) exclusion_flag)) + if (length(cov_cols) > 0) { + has_missing_cov <- FALSE + for (col in cov_cols) { + vals <- data[[col]] + if (any(is.na(vals)) || any(vals %in% c(-999, -99, "."), na.rm = TRUE)) { + has_missing_cov <- TRUE + break + } + } + log_check( + "covariate_missing", + "No missing or sentinel values (-999, '.') in covariate columns", + if (!has_missing_cov) "PASS" else "FAIL" + ) + } + + # ────────────────────────────────────────────────────────────────────────── + # RATE column consistency with AMT + # ────────────────────────────────────────────────────────────────────────── + if (has_col("RATE") && has_col("AMT") && is.numeric(data$RATE) && is.numeric(data$AMT)) { + if (has_col("EVID")) { + dose_recs <- data$EVID %in% c(1, 4) + rate_dose <- data$RATE[dose_recs] + amt_dose <- data$AMT[dose_recs] + # If RATE is a real rate (> 0 and not -1/-2), AMT handling differs + real_rate <- !is.na(rate_dose) & !is.na(amt_dose) & rate_dose > 0 + log_check( + "rate_amt_consistency", + "Dosing records with positive RATE (mg/h) also have positive AMT", + if (sum(real_rate) == 0 || all(amt_dose[real_rate] > 0)) "PASS" else "FAIL" + ) + } + } + + if (verbose) { + n_pass <- sum(results$result == "PASS") + n_fail <- sum(results$result == "FAIL") + n_na <- sum(results$result == "NA") + message(sprintf( + "\n--- Summary: %d checks | %d PASS | %d FAIL | %d NA ---", + nrow(results), n_pass, n_fail, n_na + )) + } + + results +} diff --git a/man/check_nm_dataset.Rd b/man/check_nm_dataset.Rd new file mode 100644 index 0000000..2d0bcd2 --- /dev/null +++ b/man/check_nm_dataset.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_nm_dataset.R +\name{check_nm_dataset} +\alias{check_nm_dataset} +\title{Run automated quality checks on a NONMEM dataset} +\usage{ +check_nm_dataset( + data, + exclusion_flag = NULL, + max_digits = 10, + max_id_digits = 5, + extra_reserved = NULL, + verbose = TRUE +) +} +\arguments{ +\item{data}{a data.frame (or tibble) containing the NONMEM dataset to check.} + +\item{exclusion_flag}{character name of the column used as an exclusion flag +(for \verb{$DATA ACCEPT} / \code{IGNORE} in NONMEM). If \code{NULL} (default), the +exclusion-flag check is skipped.} + +\item{max_digits}{integer; threshold for flagging numbers with too many +significant digits (default \code{10}).} + +\item{max_id_digits}{integer; preferred maximum number of characters for the +ID variable (default \code{5}).} + +\item{extra_reserved}{character vector of additional reserved NONMEM names +to check for (e.g. PK parameters used in ADVAN/TRANS such as \code{"CL"}, +\code{"V"}, \code{"V1"}, \code{"K12"}).} + +\item{verbose}{logical; if \code{TRUE} (default), messages are printed for every +check as it is evaluated.} +} +\value{ +A data.frame with columns \code{check} (short test name), \code{description} +(human-readable explanation), and \code{result} (\code{"PASS"}, \code{"FAIL"}, or +\code{"NA"} when a check is not applicable). +} +\description{ +Performs a comprehensive set of checks on a NONMEM-style dataset and returns +a summary table with the result of each check. No errors are thrown; all +issues are reported as rows in the output table. +} diff --git a/tests/testthat/test-check_nm_dataset.R b/tests/testthat/test-check_nm_dataset.R new file mode 100644 index 0000000..1f19ef5 --- /dev/null +++ b/tests/testthat/test-check_nm_dataset.R @@ -0,0 +1,232 @@ +# Helper: minimal valid NONMEM dataset +make_nm <- function(n_id = 2, n_obs = 3) { + do.call(rbind, lapply(seq_len(n_id), function(id) { + dose <- data.frame( + ID = id, TIME = 0, DV = NA_real_, MDV = 1, EVID = 1, + AMT = 100, CMT = 1, stringsAsFactors = FALSE + ) + obs <- data.frame( + ID = id, + TIME = seq_len(n_obs), + DV = runif(n_obs, 1, 10), + MDV = 0, EVID = 0, AMT = 0, CMT = 1, + stringsAsFactors = FALSE + ) + rbind(dose, obs) + })) +} + +test_that("clean dataset passes all applicable checks", { + d <- make_nm() + res <- check_nm_dataset(d, verbose = FALSE) + expect_true(all(res$result %in% c("PASS", "NA"))) + expect_s3_class(res, "data.frame") + expect_named(res, c("check", "description", "result")) +}) + +test_that("missing required columns are flagged", { + d <- data.frame(ID = 1, TIME = 0) # no DV + + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "required_cols"], "FAIL") +}) + +test_that("duplicate column names are flagged", { + d <- make_nm() + names(d)[7] <- "ID" # duplicate + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "unique_colnames"], "FAIL") +}) + +test_that("special characters in column names are flagged", { + d <- make_nm() + names(d)[1] <- "ID#1" + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "colname_chars"], "FAIL") +}) + +test_that("reserved NONMEM names are detected", { + d <- make_nm() + d$THETA <- 1 + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "reserved_names"], "FAIL") +}) + +test_that("extra_reserved catches PK parameter names", { + d <- make_nm() + d$CL <- 5 + # Without extra_reserved, CL is not flagged (it's not in the base list) + res1 <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res1$result[res1$check == "reserved_names"], "PASS") + # With extra_reserved + res2 <- check_nm_dataset(d, extra_reserved = c("CL", "V"), verbose = FALSE) + expect_equal(res2$result[res2$check == "reserved_names"], "FAIL") +}) + +test_that("negative TIME is flagged", { + d <- make_nm() + d$TIME[2] <- -1 + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "time_nonneg"], "FAIL") +}) + +test_that("invalid EVID values are flagged", { + d <- make_nm() + d$EVID[1] <- 5 + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "evid_values"], "FAIL") +}) + +test_that("AMT checks work correctly", { + d <- make_nm() + # AMT > 0 for observation record + d$AMT[2] <- 50 + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "amt_zero_obs"], "FAIL") + expect_equal(res$result[res$check == "amt_evid_consistent"], "FAIL") +}) + +test_that("AMT missing for dosing records is flagged", { + d <- make_nm() + d$AMT[1] <- NA + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "amt_nonmissing_dose"], "FAIL") +}) + +test_that("DV present for dosing records is flagged", { + d <- make_nm() + d$DV[1] <- 999 # dose record should not have a real DV + + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "dv_missing_dose"], "FAIL") +}) + +test_that("MDV consistency is checked", { + d <- make_nm() + d$MDV[2] <- 1 # observation with non-missing DV should have MDV = 0 + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "mdv_nonmissing_obs"], "FAIL") +}) + +test_that("exclusion flag checks", { + d <- make_nm() + d$EXCL <- c(0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L) + res <- check_nm_dataset(d, exclusion_flag = "EXCL", verbose = FALSE) + expect_equal(res$result[res$check == "exclusion_flag"], "PASS") + + # NA in exclusion flag + d$EXCL[1] <- NA + + res2 <- check_nm_dataset(d, exclusion_flag = "EXCL", verbose = FALSE) + expect_equal(res2$result[res2$check == "exclusion_flag"], "FAIL") +}) + +test_that("ID leading zeros flagged", { + d <- make_nm(n_id = 1) + d$ID <- "001" + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "id_leading_zeros"], "FAIL") +}) + +test_that("decreasing TIME within ID is flagged", { + d <- make_nm(n_id = 1) + d$TIME <- c(0, 3, 2, 1) + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "time_order"], "FAIL") +}) + +test_that("character columns are flagged", { + d <- make_nm() + d$STUDY <- "A" + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "char_columns"], "FAIL") +}) + +test_that("string 'NA' values are detected", { + d <- make_nm() + d$DV <- as.character(d$DV) + d$DV[2] <- "NA" + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "na_coding"], "FAIL") +}) + +test_that("dataset ordering is checked", { + d <- make_nm(n_id = 2) + d <- d[sample(nrow(d)), ] # shuffle + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "data_ordered"], "FAIL") +}) + +test_that("ADDL/II pairing is checked", { + d <- make_nm() + d$ADDL <- 0 + # II missing + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "addl_ii_paired"], "FAIL") +}) + +test_that("ADDL overlap is detected", { + d <- data.frame( + ID = c(1, 1, 1), + TIME = c(0, 24, 48), + DV = c(NA, 5, 3), + MDV = c(1, 0, 0), + EVID = c(1, 0, 0), + AMT = c(100, 0, 0), + CMT = c(1, 1, 1), + ADDL = c(5, 0, 0), # expands to TIME = 0, 12, 24, 36, 48 -- overlaps next dose + II = c(12, 0, 0), + stringsAsFactors = FALSE + ) + # No second dose record here, so no overlap to detect. + # Add a second dose: + d2 <- rbind(d[1, ], data.frame( + ID = 1, TIME = 24, DV = NA, MDV = 1, EVID = 1, + AMT = 100, CMT = 1, ADDL = 0, II = 0, stringsAsFactors = FALSE + ), d[2:3, ]) + res <- check_nm_dataset(d2, verbose = FALSE) + # First dose: TIME=0, ADDL=5, II=12 => last expanded at 60 >= next dose at 24 + expect_equal(res$result[res$check == "addl_overlap"], "FAIL") +}) + +test_that("covariate missing values are detected", { + d <- make_nm() + d$WT <- c(70, 80, NA, 75, 65, 90, 85, 70) + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "covariate_missing"], "FAIL") +}) + +test_that("covariate sentinel values (-999) are detected", { + d <- make_nm() + d$WT <- c(70, 80, -999, 75, 65, 90, 85, 70) + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "covariate_missing"], "FAIL") +}) + +test_that("verbose output produces messages", { + d <- make_nm() + expect_message(check_nm_dataset(d, verbose = TRUE), "PASS") +}) + +test_that("SS values checked for dosing records", { + d <- make_nm() + d$SS <- 0 + d$SS[1] <- 2 # invalid: SS should be 0 or 1 + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "ss_values"], "FAIL") +}) + +test_that("RATE column checks", { + d <- make_nm() + d$RATE <- 0 + d$RATE[1] <- -3 # invalid + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "rate_dose_values"], "FAIL") +}) + +test_that("duplicate records are flagged", { + d <- make_nm(n_id = 1) + d <- rbind(d, d[2, ]) # duplicate observation record + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "unique_records"], "FAIL") +}) From 540bef86c21f54c8848dd2594fe96702191988b0 Mon Sep 17 00:00:00 2001 From: Ron Keizer Date: Fri, 3 Apr 2026 12:39:54 -0700 Subject: [PATCH 2/5] feat: add dictionary support to check_nm_dataset() Allows mapping non-standard column names to expected NONMEM names via a dictionary argument (e.g., list(ID = "SUBJ", TIME = "TAD")). All checks now resolve column names through the dictionary. Keys are case-insensitive and unknown keys produce a warning. Co-Authored-By: Claude Opus 4.6 (1M context) --- DESCRIPTION | 2 +- R/check_nm_dataset.R | 230 ++++++++++++++++--------- man/check_nm_dataset.Rd | 9 + tests/testthat/test-check_nm_dataset.R | 91 ++++++++++ 4 files changed, 253 insertions(+), 79 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9550923..7dd7fe9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: irxforge Title: Forging data for pharmacometric analyses -Version: 0.0.0.9005 +Version: 0.0.0.9006 Authors@R: c( person("Ron", "Keizer", email = "ron@insight-rx.com", role = c("cre", "aut")), person("Michael", "McCarthy", email = "michael.mccarthy@insight-rx.com", role = "ctb"), diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R index 555d824..befd741 100644 --- a/R/check_nm_dataset.R +++ b/R/check_nm_dataset.R @@ -5,6 +5,13 @@ #' issues are reported as rows in the output table. #' #' @param data a data.frame (or tibble) containing the NONMEM dataset to check. +#' @param dictionary an optional named list that maps standard NONMEM column +#' names to the actual column names in `data`. For example, +#' `list(ID = "NMID", TIME = "TAD")` means the column called `"NMID"` in +#' `data` plays the role of `ID`, and `"TAD"` plays the role of `TIME`. +#' Any standard name not listed in the dictionary is assumed to match the +#' column name in `data` as-is. Supported keys: `ID`, `TIME`, `DV`, `MDV`, +#' `EVID`, `AMT`, `CMT`, `RATE`, `SS`, `ADDL`, `II`. #' @param exclusion_flag character name of the column used as an exclusion flag #' (for `$DATA ACCEPT` / `IGNORE` in NONMEM). If `NULL` (default), the #' exclusion-flag check is skipped. @@ -25,6 +32,7 @@ #' @export check_nm_dataset <- function( data, + dictionary = NULL, exclusion_flag = NULL, max_digits = 10, max_id_digits = 5, @@ -32,6 +40,43 @@ check_nm_dataset <- function( verbose = TRUE ) { + # ────────────────────────────────────────────────────────────────────────── + + # Resolve dictionary: map standard NONMEM names → actual column names + + # ────────────────────────────────────────────────────────────────────────── + nm_keys <- c("ID", "TIME", "DV", "MDV", "EVID", "AMT", "CMT", + "RATE", "SS", "ADDL", "II") + col_map <- stats::setNames(nm_keys, nm_keys) # default: name == column + if (!is.null(dictionary)) { + dict_upper <- stats::setNames( + as.character(dictionary), + toupper(names(dictionary)) + ) + unknown <- setdiff(names(dict_upper), nm_keys) + if (length(unknown) > 0) { + warning( + "Dictionary keys not recognised (ignored): ", + paste(unknown, collapse = ", ") + ) + } + for (k in intersect(names(dict_upper), nm_keys)) { + col_map[[k]] <- dict_upper[[k]] + } + if (verbose) { + mapped <- col_map[col_map != names(col_map)] + if (length(mapped) > 0) { + message( + "Dictionary mapping: ", + paste(names(mapped), "->", mapped, collapse = ", ") + ) + } + } + } + + # Helper: resolve a standard NONMEM name to its actual column name + col_for <- function(nm_name) col_map[[nm_name]] + results <- data.frame( check = character(0), description = character(0), @@ -47,18 +92,22 @@ check_nm_dataset <- function( results[nrow(results) + 1, ] <<- list(check, description, result) } - has_col <- function(col) col %in% names(data) + # Column accessor helpers — all use dictionary-resolved names + has_col <- function(nm_name) col_for(nm_name) %in% names(data) + get_col <- function(nm_name) data[[col_for(nm_name)]] cols <- toupper(names(data)) # ────────────────────────────────────────────────────────────────────────── - # Required columns # ────────────────────────────────────────────────────────────────────────── required <- c("ID", "TIME", "DV") - present <- required %in% cols + present <- vapply(required, function(k) col_for(k) %in% names(data), logical(1)) log_check( "required_cols", - paste0("Minimum required columns present (", paste(required, collapse = ", "), ")"), + paste0( + "Minimum required columns present (", + paste(vapply(required, col_for, character(1)), collapse = ", "), ")" + ), if (all(present)) "PASS" else "FAIL" ) @@ -138,7 +187,7 @@ check_nm_dataset <- function( # Exclusion flag # ────────────────────────────────────────────────────────────────────────── if (!is.null(exclusion_flag)) { - if (has_col(exclusion_flag)) { + if (exclusion_flag %in% names(data)) { ef <- data[[exclusion_flag]] ef_ok <- !any(is.na(ef)) && is.numeric(ef) && all(ef == round(ef)) log_check( @@ -165,7 +214,7 @@ check_nm_dataset <- function( # TIME checks # ────────────────────────────────────────────────────────────────────────── if (has_col("TIME")) { - time <- data$TIME + time <- get_col("TIME") log_check( "time_numeric", "TIME is numeric", @@ -183,17 +232,17 @@ check_nm_dataset <- function( # ────────────────────────────────────────────────────────────────────────── # TIME non-decreasing within ID (except EVID 3, 4) # ────────────────────────────────────────────────────────────────────────── - if (has_col("TIME") && has_col("ID") && is.numeric(data$TIME)) { + if (has_col("TIME") && has_col("ID") && is.numeric(get_col("TIME"))) { time_ok <- TRUE - for (id in unique(data$ID)) { - idx <- data$ID == id - sub <- data[idx, ] - # exclude reset events (EVID 3, 4) from this check - if (has_col("EVID")) { - keep <- !(sub$EVID %in% c(3, 4)) - t <- sub$TIME[keep] - } else { - t <- sub$TIME + id_vec <- get_col("ID") + time_vec <- get_col("TIME") + evid_vec <- if (has_col("EVID")) get_col("EVID") else NULL + for (id in unique(id_vec)) { + idx <- id_vec == id + t <- time_vec[idx] + if (!is.null(evid_vec)) { + keep <- !(evid_vec[idx] %in% c(3, 4)) + t <- t[keep] } if (length(t) > 1 && any(diff(t) < 0, na.rm = TRUE)) { time_ok <- FALSE @@ -211,7 +260,7 @@ check_nm_dataset <- function( # EVID checks # ────────────────────────────────────────────────────────────────────────── if (has_col("EVID")) { - evid <- data$EVID + evid <- get_col("EVID") log_check( "evid_values", "EVID values are in {0, 1, 2, 3, 4}", @@ -223,15 +272,15 @@ check_nm_dataset <- function( # CMT checks # ────────────────────────────────────────────────────────────────────────── if (has_col("CMT")) { - cmt <- data$CMT + cmt <- get_col("CMT") log_check( "cmt_integer", "CMT values are integers", if (is.numeric(cmt) && all(cmt == round(cmt), na.rm = TRUE)) "PASS" else "FAIL" ) if (has_col("EVID")) { - # For non-EVID==3 records, CMT should be positive (allow negative for compartment emptying) - non_reset <- data$EVID != 3 + evid <- get_col("EVID") + non_reset <- evid != 3 cmt_sub <- cmt[non_reset] cmt_sub <- cmt_sub[!is.na(cmt_sub)] log_check( @@ -246,14 +295,15 @@ check_nm_dataset <- function( # DV checks # ────────────────────────────────────────────────────────────────────────── if (has_col("DV")) { - dv <- data$DV + dv <- get_col("DV") log_check( "dv_numeric", "DV is numeric", if (is.numeric(dv)) "PASS" else "FAIL" ) if (has_col("EVID")) { - dose_evids <- data$EVID %in% c(1, 4) + evid <- get_col("EVID") + dose_evids <- evid %in% c(1, 4) dv_at_dose <- dv[dose_evids] log_check( "dv_missing_dose", @@ -261,9 +311,9 @@ check_nm_dataset <- function( if (all(is.na(dv_at_dose) | dv_at_dose == 0)) "PASS" else "FAIL" ) } - # Non-numeric / NA in DV for observation records if (has_col("EVID") && is.numeric(dv)) { - obs_dv <- dv[data$EVID == 0] + evid <- get_col("EVID") + obs_dv <- dv[evid == 0] log_check( "dv_obs_nonmissing", "DV is non-missing for observation records (EVID 0)", @@ -276,34 +326,34 @@ check_nm_dataset <- function( # MDV checks # ────────────────────────────────────────────────────────────────────────── if (has_col("MDV")) { - mdv <- data$MDV + mdv <- get_col("MDV") log_check( "mdv_binary", "MDV is binary (0 or 1)", if (all(mdv %in% c(0, 1), na.rm = TRUE)) "PASS" else "FAIL" ) - if (has_col("DV") && is.numeric(data$DV) && has_col("EVID")) { - obs <- data$EVID == 0 - # For observations: MDV should be 1 when DV is NA, 0 when DV is not NA - expected_mdv <- ifelse(is.na(data$DV[obs]), 1, 0) + if (has_col("DV") && is.numeric(get_col("DV")) && has_col("EVID")) { + dv <- get_col("DV") + evid <- get_col("EVID") + obs <- evid == 0 + expected_mdv <- ifelse(is.na(dv[obs]), 1, 0) log_check( "mdv_dv_consistent", "MDV is consistent with is.na(DV) for observation records (EVID 0)", if (all(mdv[obs] == expected_mdv, na.rm = TRUE)) "PASS" else "FAIL" ) - # For dosing records, MDV should be 1 - dosing <- data$EVID %in% c(1, 4) + dosing <- evid %in% c(1, 4) log_check( "mdv_dosing", "MDV is 1 for dosing records (EVID 1, 4)", if (all(mdv[dosing] == 1, na.rm = TRUE)) "PASS" else "FAIL" ) } - # Non-missing observations should have MDV = 0 - if (has_col("DV") && is.numeric(data$DV)) { - nonmissing_obs <- !is.na(data$DV) & data$DV != 0 + if (has_col("DV") && is.numeric(get_col("DV"))) { + dv <- get_col("DV") + nonmissing_obs <- !is.na(dv) & dv != 0 if (has_col("EVID")) { - nonmissing_obs <- nonmissing_obs & data$EVID == 0 + nonmissing_obs <- nonmissing_obs & get_col("EVID") == 0 } log_check( "mdv_nonmissing_obs", @@ -317,21 +367,22 @@ check_nm_dataset <- function( # AMT checks # ────────────────────────────────────────────────────────────────────────── if (has_col("AMT")) { - amt <- data$AMT + amt <- get_col("AMT") log_check( "amt_numeric", "AMT is numeric", if (is.numeric(amt)) "PASS" else "FAIL" ) if (has_col("EVID") && is.numeric(amt)) { - obs_other <- data$EVID %in% c(0, 2) + evid <- get_col("EVID") + obs_other <- evid %in% c(0, 2) amt_obs <- amt[obs_other] log_check( "amt_zero_obs", "AMT is 0 or missing for observation/other-type records (EVID 0, 2)", if (all(is.na(amt_obs) | amt_obs == 0)) "PASS" else "FAIL" ) - dose_recs <- data$EVID %in% c(1, 4) + dose_recs <- evid %in% c(1, 4) amt_dose <- amt[dose_recs] log_check( "amt_positive_dose", @@ -339,18 +390,18 @@ check_nm_dataset <- function( if (all(amt_dose > 0, na.rm = TRUE)) "PASS" else "FAIL" ) } - # AMT > 0 should have EVID = 1 if (has_col("EVID") && is.numeric(amt)) { + evid <- get_col("EVID") amt_pos <- !is.na(amt) & amt > 0 log_check( "amt_evid_consistent", "Records with AMT > 0 have EVID = 1 (or 4)", - if (all(data$EVID[amt_pos] %in% c(1, 4))) "PASS" else "FAIL" + if (all(evid[amt_pos] %in% c(1, 4))) "PASS" else "FAIL" ) } - # Non-numeric/NA in AMT for dosing records if (has_col("EVID")) { - dose_recs <- data$EVID %in% c(1, 4) + evid <- get_col("EVID") + dose_recs <- evid %in% c(1, 4) log_check( "amt_nonmissing_dose", "AMT is non-missing for dosing records (EVID 1, 4)", @@ -362,13 +413,16 @@ check_nm_dataset <- function( # ────────────────────────────────────────────────────────────────────────── # DV / MDV / BLQ consistency # ────────────────────────────────────────────────────────────────────────── - if (has_col("DV") && has_col("MDV") && has_col("EVID") && is.numeric(data$DV)) { - obs <- data$EVID == 0 - dv_leq_zero <- obs & !is.na(data$DV) & data$DV <= 0 + if (has_col("DV") && has_col("MDV") && has_col("EVID") && is.numeric(get_col("DV"))) { + dv <- get_col("DV") + mdv <- get_col("MDV") + evid <- get_col("EVID") + obs <- evid == 0 + dv_leq_zero <- obs & !is.na(dv) & dv <= 0 log_check( "dv_leq_zero_mdv", "DV <= 0 observations have MDV = 1 (or BLQ flag)", - if (sum(dv_leq_zero) == 0 || all(data$MDV[dv_leq_zero] == 1)) "PASS" else "FAIL" + if (sum(dv_leq_zero) == 0 || all(mdv[dv_leq_zero] == 1)) "PASS" else "FAIL" ) } @@ -376,17 +430,17 @@ check_nm_dataset <- function( # RATE checks # ────────────────────────────────────────────────────────────────────────── if (has_col("RATE")) { - rate <- data$RATE + rate <- get_col("RATE") log_check( "rate_numeric", "RATE is numeric", if (is.numeric(rate)) "PASS" else "FAIL" ) if (is.numeric(rate) && has_col("EVID")) { - dose_recs <- data$EVID %in% c(1, 4) + evid <- get_col("EVID") + dose_recs <- evid %in% c(1, 4) rate_dose <- rate[dose_recs] rate_dose <- rate_dose[!is.na(rate_dose)] - # RATE for dosing events: -2, -1, 0, or positive log_check( "rate_dose_values", "RATE for dosing records is -2, -1, 0, or positive", @@ -401,14 +455,15 @@ check_nm_dataset <- function( # SS checks # ────────────────────────────────────────────────────────────────────────── if (has_col("SS")) { - ss <- data$SS + ss <- get_col("SS") log_check( "ss_numeric", "SS is numeric", if (is.numeric(ss)) "PASS" else "FAIL" ) if (is.numeric(ss) && has_col("EVID")) { - dose_recs <- data$EVID %in% c(1, 4) + evid <- get_col("EVID") + dose_recs <- evid %in% c(1, 4) ss_dose <- ss[dose_recs] ss_dose <- ss_dose[!is.na(ss_dose)] log_check( @@ -425,19 +480,19 @@ check_nm_dataset <- function( # ADDL / II checks # ────────────────────────────────────────────────────────────────────────── if (has_col("ADDL") || has_col("II")) { - # ADDL and II must both be present log_check( "addl_ii_paired", "ADDL and II are both present (required together)", if (has_col("ADDL") && has_col("II")) "PASS" else "FAIL" ) if (has_col("ADDL")) { - addl <- data$ADDL + addl <- get_col("ADDL") log_check( "addl_nonneg_int", "ADDL is a non-negative integer for dosing records", if (is.numeric(addl) && has_col("EVID")) { - dose_recs <- data$EVID %in% c(1, 4) + evid <- get_col("EVID") + dose_recs <- evid %in% c(1, 4) addl_d <- addl[dose_recs] addl_d <- addl_d[!is.na(addl_d)] all(addl_d >= 0) && all(addl_d == round(addl_d)) @@ -447,12 +502,13 @@ check_nm_dataset <- function( ) } if (has_col("II")) { - ii <- data$II + ii <- get_col("II") log_check( "ii_nonneg_int", "II is a non-negative integer for dosing records", if (is.numeric(ii) && has_col("EVID")) { - dose_recs <- data$EVID %in% c(1, 4) + evid <- get_col("EVID") + dose_recs <- evid %in% c(1, 4) ii_d <- ii[dose_recs] ii_d <- ii_d[!is.na(ii_d)] all(ii_d >= 0) && all(ii_d == round(ii_d)) @@ -469,20 +525,25 @@ check_nm_dataset <- function( # Overlapping ADDL-expanded doses # ────────────────────────────────────────────────────────────────────────── if (has_col("ADDL") && has_col("II") && has_col("EVID") && has_col("ID") && - has_col("TIME") && is.numeric(data$ADDL) && is.numeric(data$II)) { + has_col("TIME") && is.numeric(get_col("ADDL")) && is.numeric(get_col("II"))) { overlap_found <- FALSE - for (id in unique(data$ID)) { - sub <- data[data$ID == id, ] - dose_idx <- which(sub$EVID %in% c(1, 4)) + id_vec <- get_col("ID") + time_vec <- get_col("TIME") + evid_vec <- get_col("EVID") + addl_vec <- get_col("ADDL") + ii_vec <- get_col("II") + for (id in unique(id_vec)) { + idx <- which(id_vec == id) + dose_idx <- idx[evid_vec[idx] %in% c(1, 4)] if (length(dose_idx) < 2) next for (i in seq_len(length(dose_idx) - 1)) { - row_i <- dose_idx[i] - row_next <- dose_idx[i + 1] - addl_i <- sub$ADDL[row_i] - ii_i <- sub$II[row_i] + ri <- dose_idx[i] + rn <- dose_idx[i + 1] + addl_i <- addl_vec[ri] + ii_i <- ii_vec[ri] if (!is.na(addl_i) && !is.na(ii_i) && addl_i > 0 && ii_i > 0) { - last_expanded <- sub$TIME[row_i] + addl_i * ii_i - if (last_expanded >= sub$TIME[row_next]) { + last_expanded <- time_vec[ri] + addl_i * ii_i + if (last_expanded >= time_vec[rn]) { overlap_found <- TRUE break } @@ -501,7 +562,7 @@ check_nm_dataset <- function( # ID checks # ────────────────────────────────────────────────────────────────────────── if (has_col("ID")) { - ids <- as.character(data$ID) + ids <- as.character(get_col("ID")) has_leading_zeros <- any(grepl("^0\\d+", ids)) log_check( "id_leading_zeros", @@ -519,7 +580,9 @@ check_nm_dataset <- function( # ────────────────────────────────────────────────────────────────────────── # Unique records (ID, TIME, CMT, EVID) # ────────────────────────────────────────────────────────────────────────── - dup_cols <- intersect(c("ID", "TIME", "CMT", "EVID"), names(data)) + dup_nm_keys <- c("ID", "TIME", "CMT", "EVID") + dup_actual <- vapply(dup_nm_keys, col_for, character(1)) + dup_cols <- dup_actual[dup_actual %in% names(data)] if (length(dup_cols) >= 2) { dup_check <- duplicated(data[, dup_cols, drop = FALSE]) log_check( @@ -533,10 +596,12 @@ check_nm_dataset <- function( # Dataset ordering (ID, TIME) # ────────────────────────────────────────────────────────────────────────── if (has_col("ID") && has_col("TIME")) { - sorted <- data[order(data$ID, data$TIME), ] + id_vec <- get_col("ID") + time_vec <- get_col("TIME") + ord <- order(id_vec, time_vec) is_sorted <- identical( - paste(data$ID, data$TIME), - paste(sorted$ID, sorted$TIME) + paste(id_vec, time_vec), + paste(id_vec[ord], time_vec[ord]) ) log_check( "data_ordered", @@ -572,12 +637,20 @@ check_nm_dataset <- function( # ────────────────────────────────────────────────────────────────────────── # Missing covariate values # ────────────────────────────────────────────────────────────────────────── - # Covariates are columns that are not standard NONMEM columns + # Covariates are columns that are not standard NONMEM columns (mapped or not) nm_standard <- c( "ID", "TIME", "DV", "MDV", "EVID", "AMT", "RATE", "SS", "II", "ADDL", "CMT", "GROUP", "BLQ", "LLOQ", "TAD" ) - cov_cols <- setdiff(names(data), c(nm_standard, if (!is.null(exclusion_flag)) exclusion_flag)) + # Include both the standard names and the dictionary-mapped actual names + all_standard <- unique(c( + nm_standard, + vapply(intersect(nm_keys, nm_standard), col_for, character(1)) + )) + cov_cols <- setdiff( + names(data), + c(all_standard, if (!is.null(exclusion_flag)) exclusion_flag) + ) if (length(cov_cols) > 0) { has_missing_cov <- FALSE for (col in cov_cols) { @@ -597,12 +670,13 @@ check_nm_dataset <- function( # ────────────────────────────────────────────────────────────────────────── # RATE column consistency with AMT # ────────────────────────────────────────────────────────────────────────── - if (has_col("RATE") && has_col("AMT") && is.numeric(data$RATE) && is.numeric(data$AMT)) { + if (has_col("RATE") && has_col("AMT") && + is.numeric(get_col("RATE")) && is.numeric(get_col("AMT"))) { if (has_col("EVID")) { - dose_recs <- data$EVID %in% c(1, 4) - rate_dose <- data$RATE[dose_recs] - amt_dose <- data$AMT[dose_recs] - # If RATE is a real rate (> 0 and not -1/-2), AMT handling differs + evid <- get_col("EVID") + dose_recs <- evid %in% c(1, 4) + rate_dose <- get_col("RATE")[dose_recs] + amt_dose <- get_col("AMT")[dose_recs] real_rate <- !is.na(rate_dose) & !is.na(amt_dose) & rate_dose > 0 log_check( "rate_amt_consistency", diff --git a/man/check_nm_dataset.Rd b/man/check_nm_dataset.Rd index 2d0bcd2..16a1754 100644 --- a/man/check_nm_dataset.Rd +++ b/man/check_nm_dataset.Rd @@ -6,6 +6,7 @@ \usage{ check_nm_dataset( data, + dictionary = NULL, exclusion_flag = NULL, max_digits = 10, max_id_digits = 5, @@ -16,6 +17,14 @@ check_nm_dataset( \arguments{ \item{data}{a data.frame (or tibble) containing the NONMEM dataset to check.} +\item{dictionary}{an optional named list that maps standard NONMEM column +names to the actual column names in \code{data}. For example, +\code{list(ID = "NMID", TIME = "TAD")} means the column called \code{"NMID"} in +\code{data} plays the role of \code{ID}, and \code{"TAD"} plays the role of \code{TIME}. +Any standard name not listed in the dictionary is assumed to match the +column name in \code{data} as-is. Supported keys: \code{ID}, \code{TIME}, \code{DV}, \code{MDV}, +\code{EVID}, \code{AMT}, \code{CMT}, \code{RATE}, \code{SS}, \code{ADDL}, \code{II}.} + \item{exclusion_flag}{character name of the column used as an exclusion flag (for \verb{$DATA ACCEPT} / \code{IGNORE} in NONMEM). If \code{NULL} (default), the exclusion-flag check is skipped.} diff --git a/tests/testthat/test-check_nm_dataset.R b/tests/testthat/test-check_nm_dataset.R index 1f19ef5..30ce6f5 100644 --- a/tests/testthat/test-check_nm_dataset.R +++ b/tests/testthat/test-check_nm_dataset.R @@ -16,6 +16,13 @@ make_nm <- function(n_id = 2, n_obs = 3) { })) } +# Helper: dataset with renamed columns (for dictionary tests) +make_nm_renamed <- function(n_id = 2, n_obs = 3) { + d <- make_nm(n_id = n_id, n_obs = n_obs) + names(d) <- c("SUBJ", "TAD", "CONC", "MISS", "FLAG", "DOSE", "COMP") + d +} + test_that("clean dataset passes all applicable checks", { d <- make_nm() res <- check_nm_dataset(d, verbose = FALSE) @@ -230,3 +237,87 @@ test_that("duplicate records are flagged", { res <- check_nm_dataset(d, verbose = FALSE) expect_equal(res$result[res$check == "unique_records"], "FAIL") }) + +# ── Dictionary tests ────────────────────────────────────────────────────────── + +test_that("dictionary maps renamed columns and passes clean dataset", { + d <- make_nm_renamed() + dict <- list( + ID = "SUBJ", TIME = "TAD", DV = "CONC", MDV = "MISS", + EVID = "FLAG", AMT = "DOSE", CMT = "COMP" + ) + res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE) + # All applicable checks should pass (or be NA), none should fail + expect_true(all(res$result %in% c("PASS", "NA"))) +}) + +test_that("dictionary resolves required columns correctly", { + # Without dictionary, renamed dataset fails required_cols + d <- make_nm_renamed() + res_no_dict <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res_no_dict$result[res_no_dict$check == "required_cols"], "FAIL") + + # With dictionary, it passes + dict <- list(ID = "SUBJ", TIME = "TAD", DV = "CONC") + res_dict <- check_nm_dataset(d, dictionary = dict, verbose = FALSE) + expect_equal(res_dict$result[res_dict$check == "required_cols"], "PASS") +}) + +test_that("dictionary catches errors in renamed columns", { + d <- make_nm_renamed() + d$TAD[2] <- -1 # negative TIME (via TAD) + dict <- list( + ID = "SUBJ", TIME = "TAD", DV = "CONC", MDV = "MISS", + EVID = "FLAG", AMT = "DOSE", CMT = "COMP" + ) + res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE) + expect_equal(res$result[res$check == "time_nonneg"], "FAIL") +}) + +test_that("dictionary works for AMT/EVID consistency", { + d <- make_nm_renamed() + d$DOSE[3] <- 50 # AMT > 0 for an observation (EVID = 0) + dict <- list( + ID = "SUBJ", TIME = "TAD", DV = "CONC", MDV = "MISS", + EVID = "FLAG", AMT = "DOSE", CMT = "COMP" + ) + res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE) + expect_equal(res$result[res$check == "amt_zero_obs"], "FAIL") + expect_equal(res$result[res$check == "amt_evid_consistent"], "FAIL") +}) + +test_that("partial dictionary works (unmapped columns keep standard names)", { + d <- make_nm() + names(d)[1] <- "SUBJ" # only rename ID + dict <- list(ID = "SUBJ") + res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE) + expect_equal(res$result[res$check == "required_cols"], "PASS") + expect_true(all(res$result %in% c("PASS", "NA"))) +}) + +test_that("dictionary is case-insensitive for keys", { + d <- make_nm_renamed() + dict <- list( + id = "SUBJ", time = "TAD", dv = "CONC", mdv = "MISS", + evid = "FLAG", amt = "DOSE", cmt = "COMP" + ) + res <- check_nm_dataset(d, dictionary = dict, verbose = FALSE) + expect_equal(res$result[res$check == "required_cols"], "PASS") +}) + +test_that("unknown dictionary keys produce a warning", { + d <- make_nm() + expect_warning( + check_nm_dataset(d, dictionary = list(FOOBAR = "X"), verbose = FALSE), + "not recognised" + ) +}) + +test_that("dictionary verbose output shows mapping", { + d <- make_nm() + names(d)[1] <- "SUBJ" + expect_message( + check_nm_dataset(d, dictionary = list(ID = "SUBJ"), verbose = TRUE), + "Dictionary mapping: ID -> SUBJ" + ) +}) From d957d489b01f278a6b0436f796076ae815303692 Mon Sep 17 00:00:00 2001 From: roninsightrx Date: Fri, 3 Apr 2026 13:18:47 -0700 Subject: [PATCH 3/5] Update R/check_nm_dataset.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/check_nm_dataset.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R index befd741..fdcc2b7 100644 --- a/R/check_nm_dataset.R +++ b/R/check_nm_dataset.R @@ -655,14 +655,22 @@ check_nm_dataset <- function( has_missing_cov <- FALSE for (col in cov_cols) { vals <- data[[col]] - if (any(is.na(vals)) || any(vals %in% c(-999, -99, "."), na.rm = TRUE)) { + has_sentinel <- FALSE + + if (is.numeric(vals)) { + has_sentinel <- any(vals %in% c(-999, -99), na.rm = TRUE) + } else if (is.character(vals) || is.factor(vals)) { + has_sentinel <- any(as.character(vals) %in% c(".", "NA"), na.rm = TRUE) + } + + if (any(is.na(vals)) || has_sentinel) { has_missing_cov <- TRUE break } } log_check( "covariate_missing", - "No missing or sentinel values (-999, '.') in covariate columns", + "No missing or sentinel values (-999, -99, '.', 'NA') in covariate columns", if (!has_missing_cov) "PASS" else "FAIL" ) } From 3b300e51b3169dbd9cd1220dbd6093bc6aed3ae0 Mon Sep 17 00:00:00 2001 From: Ron Keizer Date: Fri, 3 Apr 2026 13:21:10 -0700 Subject: [PATCH 4/5] fix: align MDV/DV checks on consistent missing/BLQ definition Reviewer feedback: the MDV/DV checks were internally inconsistent. mdv_dv_consistent used only is.na(DV), but dv_leq_zero_mdv expected DV <= 0 to have MDV=1, and mdv_nonmissing_obs used DV != 0 instead of DV > 0. A valid dataset with BLQ records could not pass all three. Changes: - Unified "missing/BLQ" = is.na(DV) | DV <= 0 across all MDV checks - mdv_dv_consistent now expects MDV=1 for NA or DV<=0, MDV=0 for DV>0 - Removed redundant dv_leq_zero_mdv check (covered by updated above) - Fixed mdv_nonmissing_obs filter to use DV > 0 (matching description) - Added test for BLQ (DV <= 0) MDV consistency Co-Authored-By: Claude Opus 4.6 (1M context) --- R/check_nm_dataset.R | 30 +++++++------------------- tests/testthat/test-check_nm_dataset.R | 21 ++++++++++++++++-- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R index fdcc2b7..5245f58 100644 --- a/R/check_nm_dataset.R +++ b/R/check_nm_dataset.R @@ -336,10 +336,12 @@ check_nm_dataset <- function( dv <- get_col("DV") evid <- get_col("EVID") obs <- evid == 0 - expected_mdv <- ifelse(is.na(dv[obs]), 1, 0) + # Treat NA and DV <= 0 as missing/BLQ → MDV should be 1; + # DV > 0 → MDV should be 0 + expected_mdv <- ifelse(is.na(dv[obs]) | dv[obs] <= 0, 1, 0) log_check( "mdv_dv_consistent", - "MDV is consistent with is.na(DV) for observation records (EVID 0)", + "MDV matches DV for observations: MDV=1 when DV is NA or <= 0 (BLQ), MDV=0 when DV > 0", if (all(mdv[obs] == expected_mdv, na.rm = TRUE)) "PASS" else "FAIL" ) dosing <- evid %in% c(1, 4) @@ -351,14 +353,14 @@ check_nm_dataset <- function( } if (has_col("DV") && is.numeric(get_col("DV"))) { dv <- get_col("DV") - nonmissing_obs <- !is.na(dv) & dv != 0 + valid_obs <- !is.na(dv) & dv > 0 if (has_col("EVID")) { - nonmissing_obs <- nonmissing_obs & get_col("EVID") == 0 + valid_obs <- valid_obs & get_col("EVID") == 0 } log_check( "mdv_nonmissing_obs", - "MDV is 0 for non-missing observation records (DV > 0)", - if (all(mdv[nonmissing_obs] == 0, na.rm = TRUE)) "PASS" else "FAIL" + "MDV is 0 for valid observation records (DV > 0)", + if (all(mdv[valid_obs] == 0, na.rm = TRUE)) "PASS" else "FAIL" ) } } @@ -410,22 +412,6 @@ check_nm_dataset <- function( } } - # ────────────────────────────────────────────────────────────────────────── - # DV / MDV / BLQ consistency - # ────────────────────────────────────────────────────────────────────────── - if (has_col("DV") && has_col("MDV") && has_col("EVID") && is.numeric(get_col("DV"))) { - dv <- get_col("DV") - mdv <- get_col("MDV") - evid <- get_col("EVID") - obs <- evid == 0 - dv_leq_zero <- obs & !is.na(dv) & dv <= 0 - log_check( - "dv_leq_zero_mdv", - "DV <= 0 observations have MDV = 1 (or BLQ flag)", - if (sum(dv_leq_zero) == 0 || all(mdv[dv_leq_zero] == 1)) "PASS" else "FAIL" - ) - } - # ────────────────────────────────────────────────────────────────────────── # RATE checks # ────────────────────────────────────────────────────────────────────────── diff --git a/tests/testthat/test-check_nm_dataset.R b/tests/testthat/test-check_nm_dataset.R index 30ce6f5..3f3bdc7 100644 --- a/tests/testthat/test-check_nm_dataset.R +++ b/tests/testthat/test-check_nm_dataset.R @@ -108,11 +108,28 @@ test_that("DV present for dosing records is flagged", { expect_equal(res$result[res$check == "dv_missing_dose"], "FAIL") }) -test_that("MDV consistency is checked", { +test_that("MDV = 1 for valid observation (DV > 0) is flagged", { d <- make_nm() - d$MDV[2] <- 1 # observation with non-missing DV should have MDV = 0 + d$MDV[2] <- 1 # observation with DV > 0 should have MDV = 0 res <- check_nm_dataset(d, verbose = FALSE) expect_equal(res$result[res$check == "mdv_nonmissing_obs"], "FAIL") + expect_equal(res$result[res$check == "mdv_dv_consistent"], "FAIL") +}) + +test_that("MDV/DV consistency treats DV <= 0 as BLQ (MDV should be 1)", { + d <- make_nm() + # Set an observation DV to a negative value (e.g. log-transformed BLQ) + # and MDV = 0 — this should FAIL because DV <= 0 requires MDV = 1 + d$DV[2] <- -2 + d$MDV[2] <- 0 + res <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res$result[res$check == "mdv_dv_consistent"], "FAIL") + + # Now set MDV = 1 for the BLQ record — should PASS + d$MDV[2] <- 1 + res2 <- check_nm_dataset(d, verbose = FALSE) + expect_equal(res2$result[res2$check == "mdv_dv_consistent"], "PASS") + expect_equal(res2$result[res2$check == "mdv_nonmissing_obs"], "PASS") }) test_that("exclusion flag checks", { From 50dc8752d1f0ea8954f70e44119972ebabab830e Mon Sep 17 00:00:00 2001 From: Ron Keizer Date: Fri, 3 Apr 2026 13:22:53 -0700 Subject: [PATCH 5/5] fix: sort dose records by TIME before ADDL overlap detection The overlap check iterated dose records in row order, which could produce false positives/negatives on unsorted datasets. Now sorts dose_idx by TIME within each ID before comparing expanded dose windows against subsequent doses. Co-Authored-By: Claude Opus 4.6 (1M context) --- R/check_nm_dataset.R | 3 +++ tests/testthat/test-check_nm_dataset.R | 28 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/R/check_nm_dataset.R b/R/check_nm_dataset.R index 5245f58..84ec070 100644 --- a/R/check_nm_dataset.R +++ b/R/check_nm_dataset.R @@ -520,8 +520,11 @@ check_nm_dataset <- function( ii_vec <- get_col("II") for (id in unique(id_vec)) { idx <- which(id_vec == id) + # Sort dose records by TIME to avoid false positives/negatives + # when the dataset is not ordered dose_idx <- idx[evid_vec[idx] %in% c(1, 4)] if (length(dose_idx) < 2) next + dose_idx <- dose_idx[order(time_vec[dose_idx])] for (i in seq_len(length(dose_idx) - 1)) { ri <- dose_idx[i] rn <- dose_idx[i + 1] diff --git a/tests/testthat/test-check_nm_dataset.R b/tests/testthat/test-check_nm_dataset.R index 3f3bdc7..4acbf7a 100644 --- a/tests/testthat/test-check_nm_dataset.R +++ b/tests/testthat/test-check_nm_dataset.R @@ -213,6 +213,34 @@ test_that("ADDL overlap is detected", { expect_equal(res$result[res$check == "addl_overlap"], "FAIL") }) +test_that("ADDL overlap detection works on unsorted datasets", { + # Build an overlapping dataset but shuffle the rows so it's not sorted + d <- data.frame( + ID = c(1, 1, 1, 1), + TIME = c(0, 24, 24, 48), + DV = c(NA, NA, 5, 3), + MDV = c(1, 1, 0, 0), + EVID = c(1, 1, 0, 0), + AMT = c(100, 100, 0, 0), + CMT = c(1, 1, 1, 1), + ADDL = c(5, 0, 0, 0), + II = c(12, 0, 0, 0), + stringsAsFactors = FALSE + ) + # TIME=0, ADDL=5, II=12 => last expanded at 60 >= next dose at 24 + # Shuffle rows so dose at TIME=24 comes before dose at TIME=0 + d_shuffled <- d[c(3, 2, 4, 1), ] + res <- check_nm_dataset(d_shuffled, verbose = FALSE) + expect_equal(res$result[res$check == "addl_overlap"], "FAIL") + + # Non-overlapping case: reduce ADDL so expansion ends before next dose + d2 <- d + d2$ADDL[1] <- 1 # last expanded at TIME = 12, next dose at 24 → OK + d2_shuffled <- d2[c(3, 2, 4, 1), ] + res2 <- check_nm_dataset(d2_shuffled, verbose = FALSE) + expect_equal(res2$result[res2$check == "addl_overlap"], "PASS") +}) + test_that("covariate missing values are detected", { d <- make_nm() d$WT <- c(70, 80, NA, 75, 65, 90, 85, 70)