InsightRX · roninsightrx · Mar 30, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pharmr.extra
 Title: Extension of pharmr (Pharmpy) functionality
-Version: 0.0.0.9028
+Version: 0.0.0.9029
 Authors@R: c(
     person("Ron", "Keizer", email = "ron@insight-rx.com", role = c("cre", "aut")),
     person("Michael", "McCarthy", email = "michael.mccarthy@insight-rx.com", role = "ctb"),

diff --git a/R/create_sim_dataset.R b/R/create_sim_dataset.R
@@ -62,6 +62,7 @@ create_sim_dataset <- function(
       cli::cli_abort("Could not load model into Pharmpy. Please check the supplied model file.")
     }
   }
+  build_from_scratch <- FALSE
   if (!is.null(data)) {
     idx <- get_required_input_variables(model, data)
     if (inherits(data, "character")) {
@@ -80,9 +81,29 @@ create_sim_dataset <- function(
       cli::cli_abort("Number of columns for input dataset is lower than number of columns in $INPUT. Please check dataset and $INPUT. Cannot continue creating dataset.")
     }
   } else {
-    input_data <- as.data.frame(model$dataset)
+    raw_dataset <- model$dataset
+    if (!is.null(raw_dataset)) {
+      input_data <- as.data.frame(raw_dataset)
+    } else {
+      ## model$dataset is NULL (the $DATA file could not be found) — attempt
+      ## to build the simulation dataset from scratch using regimen/t_obs/covariates.
+      build_from_scratch <- TRUE
+      if (is.null(regimen)) {
+        cli::cli_abort(
+          c(
+            "No dataset is attached to this model (the {.field $DATA} file cannot be found) and no {.arg data} argument was supplied.",
+            i = "Provide {.arg regimen} (and optionally {.arg t_obs}, {.arg covariates}, {.arg n_subjects}) to build a simulation dataset from scratch."
+          )
+        )
+      }
+      if (is.null(n_subjects)) {
+        n_subjects <- if (!is.null(covariates)) nrow(covariates) else 1L
+      }
+      if (verbose) cli::cli_alert_info("No dataset attached to model \u2014 building simulation dataset from scratch")
+      input_data <- data.frame(ID = seq_len(n_subjects))
+    }
   }
-  
+
   if (!"ID" %in% names(input_data)) {
     cli::cli_abort(
       c("Column `ID` not found in the dataset.",
@@ -92,7 +113,8 @@ create_sim_dataset <- function(
 
   input_has_column <- list()
   for (key in c("CMT", "EVID", "MDV", "RATE")) {
-    input_has_column[[key]] <- key %in% names(input_data)
+    ## When building from scratch, include all standard NONMEM columns in the output.
+    input_has_column[[key]] <- build_from_scratch || key %in% names(input_data)
   }
 
   ## make sure we have regimen as a data.frame
@@ -156,20 +178,33 @@ create_sim_dataset <- function(
     }
     new_covariates <- names(covariates)
     new_covariates <- new_covariates[new_covariates != "ID" & new_covariates %in% names(sim_data)]
-    if (verbose) cli::cli_alert_info("Updating covariates: {new_covariates}")
+    all_cov_cols <- setdiff(names(covariates), "ID")
+    if (verbose) cli::cli_alert_info("Updating covariates: {all_cov_cols}")
 
     sim_data_cols <- names(sim_data)
+    ## Add every supplied covariate column to the retained set so the select()
+    ## below keeps them after the join (from scratch, sim_data only has ID).
+    sim_data_cols <- union(sim_data_cols, all_cov_cols)
     sim_data <- sim_data |>
       dplyr::select(-dplyr::all_of(new_covariates)) |>
       dplyr::left_join(covariates, by = "ID") |>
       dplyr::select(dplyr::all_of(sim_data_cols)) |>
-      tidyr::fill(dplyr::all_of(new_covariates), .direction = "downup")
+      tidyr::fill(dplyr::all_of(all_cov_cols), .direction = "downup")
   }
 
   if (!is.null(regimen_df)) {
     if (verbose) cli::cli_alert_info("Creating new regimens for subjects in simulation")
     advan <- get_advan(model)
+    ## When building from scratch, sim_data contains only placeholder rows (one per
+    ## subject, carrying covariate values but no NONMEM columns). Mark them so they
+    ## can be removed after fill() propagates their covariate values into dose rows.
+    if (build_from_scratch) {
+      sim_data$.placeholder <- TRUE
+    }
     doses <- create_dosing_records(regimen_df, sim_data, n_subjects, advan)
+    ## Setting .placeholder = FALSE on dose rows prevents fill() from propagating
+    ## TRUE upward from placeholder rows (which sort last due to NA TIME).
+    if (build_from_scratch) doses$.placeholder <- FALSE
     doses <- match_type(doses, sim_data, c("AMT", "RATE", "DV"))
     if ("EVID" %in% names(sim_data)) {
       sim_data <- sim_data |>
@@ -181,6 +216,12 @@ create_sim_dataset <- function(
       dplyr::group_by(.data$ID) |>
       tidyr::fill(tidyselect::everything(), .direction = "downup") |>
       dplyr::mutate(dplyr::across(dplyr::everything(), ~ fill_missing(.x)))
+    ## Remove placeholder rows now that covariates have been propagated to dose rows.
+    if (build_from_scratch) {
+      sim_data <- sim_data |>
+        dplyr::filter(!.data$.placeholder) |>
+        dplyr::select(-".placeholder")
+    }
     if (is.null(t_obs)) {
       t_max <- max(sim_data$TIME) + round(diff(utils::tail(sim_data$TIME, 2)))
       t_obs <- seq(0, t_max, 4)

diff --git a/R/run_sim.R b/R/run_sim.R
@@ -74,6 +74,13 @@ run_sim <- function(
   }
   input_data <- model$dataset
 
+  if (is.null(input_data) && is.null(data)) {
+    cli::cli_abort(
+      c("No dataset is attached to this model and no `data` argument was provided.",
+        i = "Attach a dataset to the model, or supply a simulation dataset via the `data` argument (see {.fn create_sim_dataset}).")
+    )
+  }
+
   tool <- match.arg(tool)
   if(tool == "auto") {
     if(inherits(model, "pharmpy.model.external.nonmem.model.Model")) {
@@ -103,6 +110,22 @@ run_sim <- function(
     }
   }
 
+  ## Validate that required columns are present in the simulation dataset
+  req_vars <- tryCatch(
+    get_required_input_variables(model),
+    error = function(e) NULL
+  )
+  if (!is.null(req_vars)) {
+    required_cols <- req_vars$data_col[req_vars$required & !is.na(req_vars$data_col)]
+    missing_cols <- setdiff(required_cols, names(sim_data))
+    if (length(missing_cols) > 0) {
+      cli::cli_abort(
+        c("The simulation dataset is missing required column(s): {missing_cols}.",
+          i = "Use {.fn create_sim_dataset} to prepare a valid simulation dataset, or add the missing columns manually.")
+      )
+    }
+  }
+
   ## get unique regimens / datasets to simulate
   unique_regimens <- unique(sim_data[[".regimen"]])
   comb <- list()

diff --git a/tests/testthat/test-create_sim_dataset.R b/tests/testthat/test-create_sim_dataset.R
@@ -337,3 +337,145 @@ test_that("create_sim_dataset: error when required covariates are missing", {
     "Not all required covariates"
   )
 })
+
+# ===========================================================================
+# Build from scratch (model$dataset is NULL — no $DATA file on disk)
+# ===========================================================================
+
+## Helper: returns a model whose $DATA entry is an absolute path that does not
+## exist, so model$dataset is expected to be NULL. ADVAN1 model with CL/V only.
+.make_no_data_model <- function() {
+  paste0(
+    "$PROBLEM no-data\n",
+    "$INPUT ID TIME DV AMT EVID MDV\n",
+    "$DATA /nonexistent/pharmr_extra_test_data.csv IGNORE=@\n",
+    "$SUBROUTINES ADVAN1 TRANS2\n",
+    "$PK\nCL=THETA(1)\nV=THETA(2)\nS1=V\n",
+    "$ERROR\nY=F+EPS(1)\n",
+    "$THETA (0,10)\n$THETA (0,50)\n",
+    "$SIGMA 0.1\n",
+    "$EST METHOD=1\n"
+  ) |> pharmr::read_model_from_string()
+}
+
+test_that("create_sim_dataset (no-data): error when model has no dataset and regimen is NULL", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+
+  ## The from-scratch path is only exercised when the model carries no dataset.
+  no_data_model <- .make_no_data_model()
+  has_dataset <- !is.null(no_data_model$dataset)
+  skip_msg <- "Pharmpy returned a non-NULL dataset for a missing $DATA file — from-scratch path not triggered"
+  skip_if(has_dataset, skip_msg)
+
+  expect_error(
+    create_sim_dataset(model = no_data_model, verbose = FALSE),
+    regexp = "No dataset is attached"
+  )
+})
+
+test_that("create_sim_dataset (no-data): regimen-only produces dose + obs rows", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+
+  no_data_model <- .make_no_data_model()
+  skip_msg <- "Pharmpy returned a non-NULL dataset for a missing $DATA file — from-scratch path not triggered"
+  skip_if(!is.null(no_data_model$dataset), skip_msg)
+  ## Regimen: 3 IV doses of 100 at interval 12; observations at 0, 6, ..., 36.
+  iv_regimen <- list(dose = 100, interval = 12, n = 3, route = "iv")
+  sim_dat <- create_sim_dataset(
+    model = no_data_model,
+    regimen = iv_regimen,
+    t_obs = seq(0, 36, 6),
+    verbose = FALSE
+  )
+
+  core_cols <- c("ID", "TIME", "AMT", "EVID", "MDV")
+  expect_s3_class(sim_dat, "data.frame")
+  expect_true(nrow(sim_dat) > 0)
+  expect_true(any(sim_dat$EVID == 1)) # dose rows present
+  expect_true(any(sim_dat$EVID == 0)) # obs rows present
+  expect_true(all(core_cols %in% names(sim_dat)))
+})
+
+test_that("create_sim_dataset (no-data): n_subjects defaults to 1 when no covariates", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+
+  no_data_model <- .make_no_data_model()
+  skip_if(!is.null(no_data_model$dataset), "Pharmpy returned a non-NULL dataset")
+
+  ## Neither n_subjects nor covariates is supplied, so a single ID is expected.
+  iv_regimen <- list(dose = 100, interval = 12, n = 3, route = "iv")
+  sim_dat <- create_sim_dataset(
+    model = no_data_model, regimen = iv_regimen,
+    t_obs = seq(0, 36, 6), verbose = FALSE
+  )
+  expect_equal(length(unique(sim_dat$ID)), 1L)
+})
+
+test_that("create_sim_dataset (no-data): n_subjects controls number of subjects", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+
+  no_data_model <- .make_no_data_model()
+  skip_if(!is.null(no_data_model$dataset), "Pharmpy returned a non-NULL dataset")
+
+  ## An explicit n_subjects should determine how many distinct IDs are generated.
+  iv_regimen <- list(dose = 100, interval = 12, n = 3, route = "iv")
+  sim_dat <- create_sim_dataset(
+    model = no_data_model, regimen = iv_regimen,
+    t_obs = seq(0, 36, 6), n_subjects = 5, verbose = FALSE
+  )
+  subject_ids <- unique(sim_dat$ID)
+  expect_equal(length(subject_ids), 5L)
+})
+
+test_that("create_sim_dataset (no-data): no .placeholder column in output", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+
+  no_data_model <- .make_no_data_model()
+  skip_if(!is.null(no_data_model$dataset), "Pharmpy returned a non-NULL dataset")
+
+  ## The internal .placeholder marker must not leak into the returned dataset.
+  iv_regimen <- list(dose = 100, interval = 12, n = 3, route = "iv")
+  sim_dat <- create_sim_dataset(
+    model = no_data_model, regimen = iv_regimen,
+    t_obs = seq(0, 36, 6), verbose = FALSE
+  )
+  expect_false(".placeholder" %in% names(sim_dat))
+})
+
+test_that("create_sim_dataset (no-data): covariates are applied and appear in output", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+
+  ## Use pheno model with a non-existent absolute $DATA path so model$dataset is NULL
+  pheno_code <- pharmr::get_model_code(pharmr::load_example_model("pheno"))
+  ## Replace the $DATA line (case-insensitive match) with an absolute nonexistent path
+  pheno_code_no_data <- gsub(
+    "(?i)\\$DATA[^\n]*", "$DATA /nonexistent/pharmr_extra_test_pheno.csv IGNORE=@",
+    pheno_code, perl = TRUE
+  )
+  mod <- pharmr::read_model_from_string(pheno_code_no_data)
+  skip_if(!is.null(mod$dataset), "Pharmpy returned a non-NULL dataset")
+
+  covs <- data.frame(WGT = c(50, 100), APGR = c(6, 8))
+  out <- create_sim_dataset(
+    model      = mod,
+    regimen    = list(dose = 25, interval = 12, n = 3, route = "iv"),
+    t_obs      = seq(0, 36, 12),
+    covariates = covs,
+    verbose    = FALSE
+  )
+
+  expect_s3_class(out, "data.frame")
+  expect_equal(length(unique(out$ID)), 2L)
+  expect_true(all(c("WGT", "APGR") %in% names(out)))
+  ## Every supplied covariate (not just WGT) must carry its per-subject value
+  expect_equal(unique(out$WGT[out$ID == 1]), 50)
+  expect_equal(unique(out$WGT[out$ID == 2]), 100)
+  expect_equal(unique(out$APGR[out$ID == 1]), 6)
+  expect_equal(unique(out$APGR[out$ID == 2]), 8)
+})
diff --git a/tests/testthat/test-run_sim.R b/tests/testthat/test-run_sim.R
@@ -650,3 +650,82 @@ test_that("run_sim (stub): covariates with ID column still works (regression)",
   expect_equal(sort(unique(captured_sim_data$ID)), 1:2)
 })
 
+# ===========================================================================
+# run_sim() dataset handling — data=NULL fallback and required-column checks
+# ===========================================================================
+
+test_that("run_sim (stub): data=NULL uses model's attached dataset", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+  withr::local_dir(tempdir())
+
+  pheno_model <- pharmr::load_example_model("pheno")
+
+  ## Stub run_nlme(): record the dataset file it is handed, return a mock result
+  nlme_input <- NULL
+  stub_run_nlme <- function(data, ...) {
+    nlme_input <<- utils::read.csv(data)
+    .mock_nlme_result()
+  }
+  local_mocked_bindings(run_nlme = stub_run_nlme, .package = "pharmr.extra")
+
+  ## No `data` argument — should fall back to model$dataset
+  sim_res <- run_sim(model = pheno_model, verbose = FALSE)
+
+  expect_s3_class(sim_res, "data.frame")
+  expect_true(nrow(sim_res) > 0)
+  ## Dataset sent to NONMEM must have rows (model has real data attached)
+  received_rows <- !is.null(nlme_input) && nrow(nlme_input) > 0
+  expect_true(received_rows)
+})
+
+test_that("run_sim: error when model has no dataset and data=NULL", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+
+  ## Model code points $DATA at a path that does not exist, so Pharmpy is
+  ## expected to leave model$dataset as NULL (the test is skipped otherwise).
+  model_code <- "$PROBLEM Test\n$INPUT ID TIME DV AMT EVID MDV\n$DATA /nonexistent/path/data.csv IGNORE=@\n$SUBROUTINES ADVAN1 TRANS2\n$PK\nCL=THETA(1)\nV=THETA(2)\nS1=V\n$ERROR\nY=F+EPS(1)\n$THETA (0,10)\n$THETA (0,50)\n$SIGMA 0.1\n$EST METHOD=1\n"
+  no_data_model <- pharmr::read_model_from_string(model_code)
+
+  skip_msg <- "Pharmpy returned a non-NULL dataset for a missing $DATA file"
+  skip_if(!is.null(no_data_model$dataset), skip_msg)
+
+  ## With neither an attached dataset nor `data`, run_sim() must abort.
+  expect_error(
+    run_sim(model = no_data_model, verbose = FALSE),
+    regexp = "No dataset is attached"
+  )
+})
+
+test_that("run_sim: error when data is missing a required column", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+  withr::local_dir(tempdir())
+
+  ## Model without covariates ($INPUT ID TIME DV AMT EVID MDV)
+  model_no_cov <- make_model_without_cov()
+
+  ## Simulation data with AMT (a required reserved column) dropped
+  sim_dat_no_amt <- dplyr::select(.sim_dat(), -"AMT")
+  expect_error(
+    run_sim(model = model_no_cov, data = sim_dat_no_amt, verbose = FALSE),
+    regexp = "missing required column"
+  )
+})
+
+test_that("run_sim: error mentions which column is missing", {
+  local_pharmr.extra_options()
+  skip_if_nonmem_not_available()
+  withr::local_dir(tempdir())
+
+  mod <- make_model_without_cov() # $INPUT ID TIME DV AMT EVID MDV
+  ## Drop AMT; the missing-column error itself (not just any error) must name it
+  dat_missing_col <- .sim_dat() |> dplyr::select(-"AMT")
+
+  expect_error(
+    run_sim(model = mod, data = dat_missing_col, verbose = FALSE),
+    "missing required column\\(s\\):.*AMT"
+  )
+})
+