From f3a574d3674dc4f606fe33645aced9059a9e2929 Mon Sep 17 00:00:00 2001 From: Ron Keizer Date: Thu, 2 Apr 2026 20:44:23 -0700 Subject: [PATCH 1/3] fix: support basic SDTM domains (dm/ex/pc) in reformat_data_sdtm_to_modeling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function previously required adsl, vs, and lb domains but many users only have basic SDTM domains. This change: - Derives ADSL from DM when ADSL is not provided (RFXSTDTC → TRTSDT/TRTSDTM, ARM/ACTARM → TRT01P/TRT01A) - Makes EX→ADSL merge conditional (matching existing PC pattern) - Makes VS and LB domains optional for covariate derivation - Builds param_lookup dynamically from actual PCTESTCD values instead of hardcoding DRUGX - Fixes invalid !!!adsl_vars splice in keep_source_vars - Sets ignore_seconds_flag=FALSE for admiral 1.4.0+ compatibility - Adds 33 tests covering all SDTM reformatting paths Co-Authored-By: Claude Opus 4.6 (1M context) --- DESCRIPTION | 2 +- R/reformat_data_sdtm_to_modeling.R | 192 ++++++++----- .../test-reformat_data_sdtm_to_modeling.R | 267 +++++++++++++++++- 3 files changed, 384 insertions(+), 77 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index dc8b36a..d3add72 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: irxforge Title: Forging data for pharmacometric analyses -Version: 0.0.0.9003 +Version: 0.0.0.9004 Authors@R: c( person("Ron", "Keizer", email = "ron@insight-rx.com", role = c("cre", "aut")), person("Michael", "McCarthy", email = "michael.mccarthy@insight-rx.com", role = "ctb"), diff --git a/R/reformat_data_sdtm_to_modeling.R b/R/reformat_data_sdtm_to_modeling.R index 3d6e73d..2a3c72a 100644 --- a/R/reformat_data_sdtm_to_modeling.R +++ b/R/reformat_data_sdtm_to_modeling.R @@ -28,31 +28,56 @@ reformat_data_sdtm_to_modeling <- function( for(key in names(data)) { # admiral package is written assuming uppercase column names names(data[[key]]) <- toupper(names(data[[key]])) } - + + # Derive ADSL from DM when ADSL is not provided + if(is.null(data$adsl) && !is.null(data$dm)) { + data$adsl <- data$dm %>% + dplyr::mutate( + TRTSDTM = admiral::convert_dtc_to_dtm( + .data$RFXSTDTC, + highest_imputation = "h" + ), + TRTSDT = as.Date(.data$TRTSDTM), + TRT01P = .data$ARM, + TRT01A = .data$ACTARM + ) + } + + # Build param_lookup dynamically from PC data + pc_testcds <- unique(toupper(data$pc$PCTESTCD)) + pc_testcds <- pc_testcds[!is.na(pc_testcds)] param_lookup <- data.frame( - PCTESTCD = c("DRUGX", "DOSE"), - PARAMCD = c("DRUGX", "DOSE"), - PARAM = c("concentration of DrugX", "DrugX Dose"), - PARAMN = c(1, 2) + PCTESTCD = c(pc_testcds, "DOSE"), + PARAMCD = c(pc_testcds, "DOSE"), + PARAM = c( + paste("concentration of", pc_testcds), + paste(pc_testcds[1], "Dose") + ), + PARAMN = seq_len(length(pc_testcds) + 1) ) - + # Get list of ADSL vars required for derivations adsl_vars <- syms_to_exprs(c("TRTSDT", "TRTSDTM", "TRT01P", "TRT01A")) ## Concentrations - pc_dates <- data$pc %>% - # Join ADSL with PC (need TRTSDT for ADY derivation) - admiral::derive_vars_merged( - dataset_add = data$adsl, - new_vars = adsl_vars, - by_vars = syms_to_exprs(c("STUDYID", "USUBJID")) - ) %>% + pc_dates <- data$pc + if(!is.null(data$adsl)) { + pc_dates <- pc_dates %>% + # Join ADSL with PC (need TRTSDT for ADY derivation) + admiral::derive_vars_merged( + dataset_add = data$adsl, + new_vars = adsl_vars, + by_vars = syms_to_exprs(c("STUDYID", "USUBJID")) + ) + } + pc_dates <- pc_dates %>% # Derive analysis date/time # Impute missing time to 00:00:00 admiral::derive_vars_dtm( new_vars_prefix = "A", dtc = !!rlang::sym("PCDTC"), - time_imputation = "00:00:00" + time_imputation = "00:00:00", + ignore_seconds_flag = FALSE ) %>% # Derive dates and times from date/times admiral::derive_vars_dtm_to_dt(syms_to_exprs(c("ADTM"))) %>% @@ -68,12 +93,16 @@ reformat_data_sdtm_to_modeling <- function( ) ## Doses - ex_dates <- data$ex %>% - admiral::derive_vars_merged( - dataset_add = data$adsl, - new_vars = adsl_vars, - by_vars = syms_to_exprs(c("STUDYID", "USUBJID")) - ) %>% + ex_dates <- data$ex + if(!is.null(data$adsl)) { + ex_dates <- ex_dates %>% + admiral::derive_vars_merged( + dataset_add = data$adsl, + new_vars = adsl_vars, + by_vars = syms_to_exprs(c("STUDYID", "USUBJID")) + ) + } + ex_dates <- ex_dates %>% # Keep records with nonzero dose dplyr::filter(.data$EXDOSE > 0) %>% # Add time and set missing end date to start date @@ -83,12 +112,14 @@ reformat_data_sdtm_to_modeling <- function( admiral::derive_vars_dtm( new_vars_prefix = "AST", dtc = !!rlang::sym("EXSTDTC"), - time_imputation = "00:00:00" + time_imputation = "00:00:00", + ignore_seconds_flag = FALSE ) %>% admiral::derive_vars_dtm( new_vars_prefix = "AEN", dtc = !!rlang::sym("EXENDTC"), - time_imputation = "00:00:00" + time_imputation = "00:00:00", + ignore_seconds_flag = FALSE ) %>% # Derive event ID and nominal relative time from first dose (NFRLT) dplyr::mutate( @@ -120,8 +151,8 @@ reformat_data_sdtm_to_modeling <- function( keep_source_vars = syms_to_exprs(c( "STUDYID", "USUBJID", "EVID", "EXDOSFRQ", "EXDOSFRM", "NFRLT", "EXDOSE", "EXDOSU", "EXTRT", "ASTDT", "ASTDTM", "AENDT", "AENDTM", - "VISIT", "VISITNUM", "VISITDY", "EXROUTE", "EXDOSFRM", - "TRT01A", "TRT01P", "DOMAIN", "EXSEQ", !!!adsl_vars + "VISIT", "VISITNUM", "VISITDY", "EXROUTE", + "TRT01A", "TRT01P", "TRTSDT", "TRTSDTM", "DOMAIN", "EXSEQ" )) ) %>% # Derive AVISIT based on nominal relative time @@ -373,56 +404,64 @@ reformat_data_sdtm_to_modeling <- function( ) #---- Derive additional baselines from VS and LB ---- - numeric_vars <- c("CREAT", "ALT", "AST", "BILI") ## TODO: fairly generic, but might not always be available or might include others - labsbl <- data$lb %>% - dplyr::filter(.data$LBBLFL == "Y" & .data$LBTESTCD %in% numeric_vars) %>% - dplyr::mutate(LBTESTCDB = paste0(.data$LBTESTCD, "BL")) %>% - dplyr::select("STUDYID", "USUBJID", "LBTESTCDB", "LBSTRESN") - - covar_vslb <- covar %>% - admiral::derive_vars_merged( - dataset_add = data$vs, - filter_add = !!rlang::sym("VSTESTCD") == "HEIGHT", - by_vars = syms_to_exprs(c("STUDYID", "USUBJID")), - new_vars = syms_to_exprs(c("HTBL" = "VSSTRESN")) - ) %>% - admiral::derive_vars_merged( - dataset_add = data$vs, - filter_add = !!rlang::sym("VSTESTCD") == "WEIGHT" & !!rlang::sym("VSBLFL") == "Y", - by_vars = syms_to_exprs(c("STUDYID", "USUBJID")), - new_vars = syms_to_exprs(c(WTBL = "VSSTRESN")) - ) %>% - admiral::derive_vars_transposed( - dataset_merge = labsbl, - by_vars = syms_to_exprs(c("STUDYID", "USUBJID")), - key_var = !!rlang::sym("LBTESTCDB"), - value_var = !!rlang::sym("LBSTRESN") - ) %>% - dplyr::mutate( - BMIBL = admiral::compute_bmi(height = .data$HTBL, weight = .data$WTBL), - BSABL = admiral::compute_bsa( - height = .data$HTBL, - weight = .data$WTBL, - method = "Mosteller" - ), - CRCLBL = admiral::compute_egfr( - creat = .data$CREATBL, - creatu = "SI", - age = .data$AGE, - weight = .data$WTBL, - sex = .data$SEX, - method = "CRCL" - ), - EGFRBL = admiral::compute_egfr( - creat = .data$CREATBL, - creatu = "SI", - age = .data$AGE, - weight = .data$WTBL, - sex = .data$SEX, - method = "CKD-EPI" + covar_vslb <- covar + if(!is.null(data$vs)) { + covar_vslb <- covar_vslb %>% + admiral::derive_vars_merged( + dataset_add = data$vs, + filter_add = !!rlang::sym("VSTESTCD") == "HEIGHT", + by_vars = syms_to_exprs(c("STUDYID", "USUBJID")), + new_vars = syms_to_exprs(c("HTBL" = "VSSTRESN")) + ) %>% + admiral::derive_vars_merged( + dataset_add = data$vs, + filter_add = !!rlang::sym("VSTESTCD") == "WEIGHT" & !!rlang::sym("VSBLFL") == "Y", + by_vars = syms_to_exprs(c("STUDYID", "USUBJID")), + new_vars = syms_to_exprs(c(WTBL = "VSSTRESN")) ) - ) %>% - dplyr::rename(TBILBL = "BILIBL") + } + if(!is.null(data$lb)) { + numeric_vars <- c("CREAT", "ALT", "AST", "BILI") + labsbl <- data$lb %>% + dplyr::filter(.data$LBBLFL == "Y" & .data$LBTESTCD %in% numeric_vars) %>% + dplyr::mutate(LBTESTCDB = paste0(.data$LBTESTCD, "BL")) %>% + dplyr::select("STUDYID", "USUBJID", "LBTESTCDB", "LBSTRESN") + covar_vslb <- covar_vslb %>% + admiral::derive_vars_transposed( + dataset_merge = labsbl, + by_vars = syms_to_exprs(c("STUDYID", "USUBJID")), + key_var = !!rlang::sym("LBTESTCDB"), + value_var = !!rlang::sym("LBSTRESN") + ) + } + if(!is.null(data$vs) && !is.null(data$lb)) { + covar_vslb <- covar_vslb %>% + dplyr::mutate( + BMIBL = admiral::compute_bmi(height = .data$HTBL, weight = .data$WTBL), + BSABL = admiral::compute_bsa( + height = .data$HTBL, + weight = .data$WTBL, + method = "Mosteller" + ), + CRCLBL = admiral::compute_egfr( + creat = .data$CREATBL, + creatu = "SI", + age = .data$AGE, + weight = .data$WTBL, + sex = .data$SEX, + method = "CRCL" + ), + EGFRBL = admiral::compute_egfr( + creat = .data$CREATBL, + creatu = "SI", + age = .data$AGE, + weight = .data$WTBL, + sex = .data$SEX, + method = "CKD-EPI" + ) + ) %>% + dplyr::rename(TBILBL = "BILIBL") + } # Combine covariates with APPPK data adppk <- adppk_aseq %>% @@ -434,6 +473,11 @@ reformat_data_sdtm_to_modeling <- function( dplyr::mutate(RECSEQ = dplyr::row_number()) %>% dplyr::mutate(ROUTE = tolower(.data$ROUTE), FORM = tolower(.data$FORM)) + # Ensure WT column exists even if VS data was not provided + if(!"WTBL" %in% names(adppk)) { + adppk$WTBL <- NA_real_ + } + poppk_data <- adppk %>% # select the variables we need from the data dplyr::select( ID = "SUBJID", @@ -454,7 +498,7 @@ reformat_data_sdtm_to_modeling <- function( "RACE", "ETHNIC", "COUNTRY" - ) %>% + ) %>% dplyr::filter(!(is.na(.data$DV) & .data$EVID == 0)) # filter out DV=0 at time==0 poppk_data diff --git a/tests/testthat/test-reformat_data_sdtm_to_modeling.R b/tests/testthat/test-reformat_data_sdtm_to_modeling.R index fd521d4..48c6e45 100644 --- a/tests/testthat/test-reformat_data_sdtm_to_modeling.R +++ b/tests/testthat/test-reformat_data_sdtm_to_modeling.R @@ -1,2 +1,265 @@ -# TODO: write tests. Should considering refactoring main function first since -# it's very long at present, which makes testing difficult. +# Helper to create minimal SDTM test data for 2 subjects. +# Uses CDISC-compatible formats: numeric USUBJID segments, matching +# drug names across PC and EX, and valid EXDOSFRQ values. +create_sdtm_test_data <- function() { + dm <- data.frame( + STUDYID = "01", + DOMAIN = "DM", + USUBJID = c("01-01-001", "01-01-002"), + SUBJID = c("001", "002"), + RFSTDTC = c("2020-01-01", "2020-01-01"), + RFENDTC = c("2020-02-01", "2020-02-01"), + RFXSTDTC = c("2020-01-01", "2020-01-01"), + RFXENDTC = c("2020-02-01", "2020-02-01"), + RFICDTC = "", + RFPENDTC = "", + DTHDTC = "", + DTHFL = "", + SITEID = c(1, 1), + AGE = c(45, 60), + AGEU = "YEARS", + SEX = c("M", "F"), + RACE = c("WHITE", "BLACK OR AFRICAN AMERICAN"), + ETHNIC = c("NOT HISPANIC OR LATINO", "HISPANIC OR LATINO"), + ARMCD = c("TRT", "TRT"), + ARM = c("DrugA 100mg", "DrugA 100mg"), + ACTARMCD = c("TRT", "TRT"), + ACTARM = c("DrugA 100mg", "DrugA 100mg"), + COUNTRY = c("USA", "USA"), + DMDTC = c("2019-12-15", "2019-12-20"), + DMDY = c(-17, -12), + stringsAsFactors = FALSE + ) + + ex <- data.frame( + STUDYID = "01", + DOMAIN = "EX", + USUBJID = c("01-01-001", "01-01-002"), + EXSEQ = c(1, 1), + EXTRT = c("DrugA", "DrugA"), + EXDOSE = c(100, 100), + EXDOSU = "mg", + EXDOSFRM = "TABLET", + EXDOSFRQ = "QD", + EXROUTE = "ORAL", + VISITNUM = c(1, 1), + VISIT = "DAY 1", + VISITDY = c(1, 1), + EXSTDTC = c("2020-01-01", "2020-01-01"), + EXENDTC = c("2020-01-01", "2020-01-01"), + EXSTDY = c(1, 1), + EXENDY = c(1, 1), + stringsAsFactors = FALSE + ) + + pc <- data.frame( + STUDYID = "01", + DOMAIN = "PC", + USUBJID = rep(c("01-01-001", "01-01-002"), each = 4), + PCSEQ = rep(1:4, 2), + PCTESTCD = "DRUGA", + PCTEST = "DrugA", + PCORRES = c("0.5", "2.1", "1.8", "0.9", "0.6", "2.5", "2.0", "1.1"), + PCORRESU = "ug/mL", + PCSTRESC = c("0.5", "2.1", "1.8", "0.9", "0.6", "2.5", "2.0", "1.1"), + PCSTRESN = c(0.5, 2.1, 1.8, 0.9, 0.6, 2.5, 2.0, 1.1), + PCSTRESU = "ug/mL", + PCNAM = "Test Lab", + PCSPEC = "PLASMA", + PCLLOQ = 0.01, + VISIT = "DAY 1", + VISITNUM = 1, + PCDTC = c( + "2020-01-01T00:30:00", "2020-01-01T02:00:00", + "2020-01-01T04:00:00", "2020-01-01T08:00:00", + "2020-01-01T00:30:00", "2020-01-01T02:00:00", + "2020-01-01T04:00:00", "2020-01-01T08:00:00" + ), + PCDY = 1, + PCTPT = c("30 min", "2h", "4h", "8h", "30 min", "2h", "4h", "8h"), + PCTPTNUM = c(0.5, 2, 4, 8, 0.5, 2, 4, 8), + stringsAsFactors = FALSE + ) + + list(dm = dm, ex = ex, pc = pc) +} + +# Helper to build ADSL from DM (mimics what a user might provide) +create_adsl_from_dm <- function(dm) { + adsl <- dm + adsl$TRTSDT <- as.Date(adsl$RFXSTDTC) + adsl$TRTSDTM <- as.POSIXct(adsl$RFXSTDTC, tz = "UTC") + adsl$TRT01P <- adsl$ARM + adsl$TRT01A <- adsl$ACTARM + adsl +} + +expected_columns <- c( + "ID", "TIME", "DV", "MDV", "EVID", "SS", "II", "AMT", + "SEXN", "AGE", "WT", "ROUTE", "FORM", "COHORT", "SITEID", + "RACE", "ETHNIC", "COUNTRY" +) + +test_that("basic SDTM reformatting with dm, ex, pc produces NONMEM-style dataset", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + expect_s3_class(result, "data.frame") + expect_true(all(expected_columns %in% names(result))) + expect_gt(nrow(result), 0) +}) + +test_that("dose records have EVID=1 and observations have EVID=0", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + dose_rows <- result[result$EVID == 1, ] + obs_rows <- result[result$EVID == 0, ] + + expect_gt(nrow(dose_rows), 0) + expect_gt(nrow(obs_rows), 0) + # Dose records have AMT set + expect_true(all(!is.na(dose_rows$AMT))) + expect_true(all(dose_rows$AMT > 0)) + # Observation records have AMT = NA + expect_true(all(is.na(obs_rows$AMT))) +}) + +test_that("MDV is 1 for dose records and 0 for observations with non-missing DV", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + dose_rows <- result[result$EVID == 1, ] + obs_rows <- result[result$EVID == 0 & !is.na(result$DV), ] + + expect_true(all(dose_rows$MDV == 1)) + expect_true(all(obs_rows$MDV == 0)) +}) + +test_that("ADSL derived from DM produces same result as explicit ADSL", { + data <- create_sdtm_test_data() + + # Case 1: using DM (ADSL derived internally) + result_dm <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + # Case 2: using explicit ADSL + data_adsl <- list( + pc = data$pc, + ex = data$ex, + adsl = create_adsl_from_dm(data$dm) + ) + result_adsl <- suppressWarnings(reformat_data_sdtm_to_modeling(data_adsl)) + + expect_equal(dim(result_dm), dim(result_adsl)) + expect_equal(names(result_dm), names(result_adsl)) + expect_equal(result_dm$ID, result_adsl$ID) + expect_equal(result_dm$DV, result_adsl$DV) + expect_equal(result_dm$AMT, result_adsl$AMT) + expect_equal(result_dm$EVID, result_adsl$EVID) +}) + +test_that("column names are uppercased internally (lowercase input works)", { + data <- create_sdtm_test_data() + # lowercase all column names + for (key in names(data)) { + names(data[[key]]) <- tolower(names(data[[key]])) + } + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + expect_s3_class(result, "data.frame") + expect_true(all(expected_columns %in% names(result))) +}) + +test_that("covariates are correctly derived from DM", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + # SEXN: M=1, F=2 + subj1 <- result[result$ID == "001", ] + subj2 <- result[result$ID == "002", ] + expect_equal(unique(subj1$SEXN), 1) + expect_equal(unique(subj2$SEXN), 2) + + # AGE carried through + expect_equal(unique(subj1$AGE), 45) + expect_equal(unique(subj2$AGE), 60) + + # RACE carried through + expect_equal(unique(subj1$RACE), "WHITE") +}) + +test_that("ROUTE and FORM are lowercase", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + dose_rows <- result[result$EVID == 1, ] + expect_true(all(dose_rows$ROUTE == "oral")) + expect_true(all(dose_rows$FORM == "tablet")) +}) + +test_that("WT is NA when VS domain is not provided", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + expect_true(all(is.na(result$WT))) +}) + +test_that("param_lookup is built dynamically from PC data", { + data <- create_sdtm_test_data() + # The test data uses PCTESTCD = "DRUGA", not "DRUGX" + # This should work without error (previously hardcoded to DRUGX) + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + expect_gt(nrow(result), 0) +}) + +test_that("observations with missing DV are filtered out", { + data <- create_sdtm_test_data() + # Set one observation DV to NA + data$pc$PCSTRESN[1] <- NA + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + # No rows with EVID=0 and DV=NA should remain + obs_na <- result[result$EVID == 0 & is.na(result$DV), ] + expect_equal(nrow(obs_na), 0) +}) + +test_that("datetimes with seconds are handled correctly", { + data <- create_sdtm_test_data() + # PC data already has seconds (e.g. "2020-01-01T00:30:00") + # This should not error with admiral >= 1.4.0 + expect_no_error(suppressWarnings(reformat_data_sdtm_to_modeling(data))) +}) + +test_that("subjects with zero dose are excluded", { + data <- create_sdtm_test_data() + # Set one subject's dose to 0 + data$ex$EXDOSE[2] <- 0 + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + # Only subject 001 should remain (002 had zero dose) + expect_equal(sort(unique(result$ID)), "001") +}) + +test_that("two subjects produce correct number of unique IDs", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + expect_equal(length(unique(result$ID)), 2) +}) + +test_that("TIME values correspond to PCTPTNUM for observations", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + obs <- result[result$EVID == 0, ] + # All observation times should be from the PCTPTNUM values: 0.5, 2, 4, 8 + expect_true(all(obs$TIME %in% c(0.5, 2, 4, 8))) +}) + +test_that("TIME is 0 for dose records", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + doses <- result[result$EVID == 1, ] + expect_true(all(doses$TIME == 0)) +}) From e090248b7443f90beb2393d4c6f63b3c7bec6796 Mon Sep 17 00:00:00 2001 From: Ron Keizer Date: Thu, 2 Apr 2026 21:17:13 -0700 Subject: [PATCH 2/3] test: add tests for na parameter in reformat_data_sdtm_to_modeling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests cover default na="." replacement, na=NA pass-through, na=NULL (skip conversion), and custom na=-99. Updates existing tests to account for new default NA→"." conversion. Co-Authored-By: Claude Opus 4.6 (1M context) --- R/reformat_data_sdtm_to_modeling.R | 11 +++- .../test-reformat_data_sdtm_to_modeling.R | 51 +++++++++++++++++-- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/R/reformat_data_sdtm_to_modeling.R b/R/reformat_data_sdtm_to_modeling.R index 2a3c72a..fdb198e 100644 --- a/R/reformat_data_sdtm_to_modeling.R +++ b/R/reformat_data_sdtm_to_modeling.R @@ -3,6 +3,8 @@ #' @param data list containing data.frames with SDTM domains #' @param dictionary a data dictionary that maps expected variable names to #' variables in the data. +#' @param na what to set NA values to. E.g. ".", (default) or NA (keep NA), +#' or NULL (do nothing). #' #' @returns data.frame with population PK input data in NONMEM-style #' format. It will also add the non-standard columns ROUTE ("oral", "iv") and @@ -12,7 +14,8 @@ #' @export reformat_data_sdtm_to_modeling <- function( data, - dictionary + dictionary, + na = "." ) { ## Parse into modeling dataset (NONMEM-style format) @@ -501,6 +504,12 @@ reformat_data_sdtm_to_modeling <- function( ) %>% dplyr::filter(!(is.na(.data$DV) & .data$EVID == 0)) # filter out DV=0 at time==0 + ## Convert NA's to dots (or something else) + if(!is.null(na)) { + poppk_data <- poppk_data |> + dplyr::mutate(dplyr::across(dplyr::everything(), ~ifelse(is.na(.) | . == "NA", na, .))) + } + poppk_data } diff --git a/tests/testthat/test-reformat_data_sdtm_to_modeling.R b/tests/testthat/test-reformat_data_sdtm_to_modeling.R index 48c6e45..806f72f 100644 --- a/tests/testthat/test-reformat_data_sdtm_to_modeling.R +++ b/tests/testthat/test-reformat_data_sdtm_to_modeling.R @@ -121,8 +121,8 @@ test_that("dose records have EVID=1 and observations have EVID=0", { # Dose records have AMT set expect_true(all(!is.na(dose_rows$AMT))) expect_true(all(dose_rows$AMT > 0)) - # Observation records have AMT = NA - expect_true(all(is.na(obs_rows$AMT))) + # Observation records have AMT = "." (default NA replacement) + expect_true(all(obs_rows$AMT == ".")) }) test_that("MDV is 1 for dose records and 0 for observations with non-missing DV", { @@ -197,11 +197,11 @@ test_that("ROUTE and FORM are lowercase", { expect_true(all(dose_rows$FORM == "tablet")) }) -test_that("WT is NA when VS domain is not provided", { +test_that("WT is '.' when VS domain is not provided (default na replacement)", { data <- create_sdtm_test_data() result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) - expect_true(all(is.na(result$WT))) + expect_true(all(result$WT == ".")) }) test_that("param_lookup is built dynamically from PC data", { @@ -263,3 +263,46 @@ test_that("TIME is 0 for dose records", { doses <- result[result$EVID == 1, ] expect_true(all(doses$TIME == 0)) }) + +test_that("NA values are converted to '.' by default", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data)) + + # WT is NA when VS is not provided, so it should become "." + expect_true(all(result$WT == ".")) + # AMT is NA for observations, should become "." + obs <- result[result$EVID == 0, ] + expect_true(all(obs$AMT == ".")) + # No actual NA values should remain anywhere + expect_false(any(is.na(result))) +}) + +test_that("na = NA keeps NA values as-is", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data, na = NA)) + + # WT should still be NA + expect_true(all(is.na(result$WT))) + # AMT should be NA for observations + obs <- result[result$EVID == 0, ] + expect_true(all(is.na(obs$AMT))) +}) + +test_that("na = NULL keeps NA values as-is", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data, na = NULL)) + + # WT should still be NA + expect_true(all(is.na(result$WT))) +}) + +test_that("na = custom value replaces NAs with that value", { + data <- create_sdtm_test_data() + result <- suppressWarnings(reformat_data_sdtm_to_modeling(data, na = -99)) + + # WT should be -99 + expect_true(all(result$WT == -99)) + # AMT for observations should be -99 + obs <- result[result$EVID == 0, ] + expect_true(all(obs$AMT == -99)) +}) From 5c3f844b1858fec70b3719abdc80d13c91750d6b Mon Sep 17 00:00:00 2001 From: roninsightrx Date: Thu, 2 Apr 2026 21:19:27 -0700 Subject: [PATCH 3/3] Update R/reformat_data_sdtm_to_modeling.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/reformat_data_sdtm_to_modeling.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/reformat_data_sdtm_to_modeling.R b/R/reformat_data_sdtm_to_modeling.R index fdb198e..ad40db6 100644 --- a/R/reformat_data_sdtm_to_modeling.R +++ b/R/reformat_data_sdtm_to_modeling.R @@ -502,7 +502,7 @@ reformat_data_sdtm_to_modeling <- function( "ETHNIC", "COUNTRY" ) %>% - dplyr::filter(!(is.na(.data$DV) & .data$EVID == 0)) # filter out DV=0 at time==0 + dplyr::filter(!(is.na(.data$DV) & .data$EVID == 0)) # filter out observation rows with missing DV ## Convert NA's to dots (or something else) if(!is.null(na)) {