diff --git a/DESCRIPTION b/DESCRIPTION index a8ccd287..efb2c098 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,8 +2,8 @@ Type: Package Package: gDRcore Title: Processing functions and interface to process and analyze drug dose-response data -Version: 1.9.3 -Date: 2025-11-27 +Version: 1.9.4 +Date: 2026-02-02 Authors@R: c( person("Bartosz", "Czech", , "bartosz.czech@contractors.roche.com", role = "aut", comment = c(ORCID = "0000-0002-9908-3007")), @@ -57,6 +57,6 @@ DeploySubPath: gDRcore Encoding: UTF-8 LazyLoad: yes NeedsCompilation: yes -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Roxygen: list(markdown = TRUE) SwitchrLibrary: gDRcore diff --git a/NAMESPACE b/NAMESPACE index 60d19dce..650a956a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,6 +38,7 @@ export(identify_data_type) export(identify_keys) export(map_df) export(map_ids_to_fits) +export(map_untreated) export(merge_data) export(normalize_SE) export(prepare_input) diff --git a/NEWS.md b/NEWS.md index be731067..e2957313 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +## gDRcore 1.9.4 - 2026-02-02 +* remove duplicated code from `map_references` in `map_untreated` + ## gDRcore 1.9.3 - 2025-11-27 * add support for Incucyte data diff --git a/R/map_df.R b/R/map_df.R index f549bec5..7a1a4bf7 100755 --- a/R/map_df.R +++ b/R/map_df.R @@ -10,39 +10,28 @@ #' @param ref_cols character vector of the names of reference columns to #' include. Likely obtained from \code{identify_keys()}. #' @param ref_type string of the reference type to map to. -#' Should be one of \code{c("Day0", "untrt_Endpoint", "ref_Endpoint")}. -#' -#' @examples -#' n <- 64 -#' md_df <- data.table::data.table( -#' Gnumber = rep(c("vehicle", "untreated", paste0("G", seq(2))), each = 16), -#' DrugName = rep(c("vehicle", "untreated", paste0("GN", seq(2))), each = 16), -#' clid = paste0("C", rep_len(seq(4), n)), -#' CellLineName = paste0("N", rep_len(seq(4), n)), -#' replicates = rep_len(paste0("R", rep(seq(4), each = 4)), 64), -#' drug_moa = "inhibitor", -#' ReferenceDivisionTime = rep_len(c(120, 60), n), -#' Tissue = "Lung", -#' parental_identifier = "CL12345", -#' Duration = 160 -#' ) -#' md_df <- unique(md_df) -#' ref <- md_df$Gnumber %in% c("vehicle", "untreated") -#' ref_df <- md_df[ref, ] -#' trt_df <- md_df[!ref, ] -#' Keys <- identify_keys(trt_df) -#' ref_type <- "untrt_Endpoint" -#' map_df( -#' trt_df, -#' ref_df, -#' ref_cols = Keys[[ref_type]], -#' ref_type = ref_type -#' ) +#' Should be one of \code{c("Day0", "untrt_Endpoint")}. #' #' @return named list mapping treated metadata to untreated metadata. #' -#' @details If \code{override_untrt_controls} is specified, -#' TODO: FILL ME! +#' @details If \code{override_untrt_controls} is specified, the values in the +#' named list will supersede the values in \code{trt_md} during the matching +#' process. This is useful for mapping treatments to specific "standard" +#' untreated controls. +#' +#' @examples +#' # Standard Endpoint Mapping +#' trt_dt <- data.table::data.table( +#' clid = c("C1", "C2"), +#' Duration = 72, +#' rn = c("T1", "T2") +#' ) +#' ref_dt <- data.table::data.table( +#' clid = c("C1", "C2"), +#' Duration = 72, +#' rn = c("R1", "R2") +#' ) +#' map_df(trt_dt, ref_dt, ref_cols = "clid", ref_type = "untrt_Endpoint") #' #' @seealso identify_keys #' @keywords map_df @@ -57,9 +46,11 @@ map_df <- function(trt_md, # Assertions: checkmate::assert_data_table(trt_md) checkmate::assert_data_table(ref_md) - checkmate::assert_vector(override_untrt_controls, null.ok = TRUE) + checkmate::assert( + checkmate::check_list(override_untrt_controls, names = "named", null.ok = TRUE), + checkmate::check_vector(override_untrt_controls, names = "named", null.ok = TRUE) + ) checkmate::assert_character(ref_cols) - checkmate::assert_character(ref_type) ref_type <- match.arg(ref_type) @@ -68,18 +59,16 @@ map_df <- function(trt_md, c("concentration", "concentration2"), simplify = FALSE )) - + if (ref_type == "Day0") { ref_md <- ref_md[get(duration_col) == 0, ] } - conc <- cbind(array(0, nrow(ref_md)), # padding to avoid empty df; - ref_md[, intersect(names(ref_md), conc_cols), drop = FALSE]) + conc <- cbind(array(0, nrow(ref_md)), + ref_md[, intersect(names(ref_md), conc_cols), with = FALSE]) is_ref_conc <- rowSums(conc == 0) == ncol(conc) - if (ref_type == "Day0") { - # Identifying which of the durations have a value of 0. matching_list <- list(T0 = ref_md[[duration_col]] == 0, conc = is_ref_conc) matchFactor <- "T0" } else if (ref_type == "untrt_Endpoint") { @@ -90,241 +79,200 @@ map_df <- function(trt_md, trt_rnames <- trt_md$rn ref_rnames <- ref_md$rn - # define matrix with matching metadata present_ref_cols <- intersect(ref_cols, names(ref_md)) - names(present_ref_cols) <- present_ref_cols - msgs <- NULL - # 1. there are no matches (present_ref_cols is empty) + # 1. Exact matches vectorized exact_out <- if (length(present_ref_cols) == 0) { - out <- stats::setNames(replicate(length(trt_rnames), character(0), simplify = FALSE), trt_rnames) - names(out) <- trt_rnames - out + stats::setNames(replicate(length(trt_rnames), character(0), simplify = FALSE), trt_rnames) } else { - # 2. search for exact matches found in the vectorised way - # cases with non-exact matches will be returned as NAs - match_l <- - grr_matches( - do.call("paste", trt_md[, present_ref_cols, with = FALSE]), - do.call("paste", ref_md[, present_ref_cols, with = FALSE]), - all.y = FALSE, - list = TRUE - ) + match_l <- grr_matches( + do.call("paste", trt_md[, ..present_ref_cols]), + do.call("paste", ref_md[, ..present_ref_cols]), + all.y = FALSE, + list = TRUE + ) names(match_l) <- trt_rnames - lapply(match_l, function(x) { - ref_rnames[sort(x)] - }) + lapply(match_l, function(x) ref_rnames[sort(x)]) } - - # 3. only exact matches found - out <- if (!anyNA(exact_out) && is.null(override_untrt_controls)) { - exact_out - # 4. not all entres are exact matches - # 4.1 search for potential non-exact matches - # 4.2 support cases with overriden untreated controls - # this logic is pretty slow currently - } else { - - out <- lapply(seq_along(trt_rnames), function(i) { + + # 2. Search for non-exact matches or overrides + # We return a list containing both the mapped references and any messages + res_list <- lapply(seq_along(trt_rnames), function(i) { treatment <- trt_rnames[i] - if (all(is.na(exact_out[[treatment]])) || !is.null(override_untrt_controls)) { + msg <- NULL + + if (length(exact_out[[treatment]]) == 0 || + any(is.na(exact_out[[treatment]])) || + !is.null(override_untrt_controls)) { refs <- lapply(present_ref_cols, function(y) { - unname(unlist(ref_md[, y, with = FALSE]) == unlist(trt_md[which(trt_md$rn == treatment), - y, with = FALSE])) + ref_md[[y]] == trt_md[rn == treatment, ..y][[1]] }) + names(refs) <- present_ref_cols if (!is.null(override_untrt_controls)) { for (overridden in names(override_untrt_controls)) { - refs[[overridden]] <- - unname(unlist(ref_md[, overridden, with = FALSE]) == override_untrt_controls[[overridden]]) + if (overridden %in% names(ref_md)) { + refs[[overridden]] <- ref_md[[overridden]] == override_untrt_controls[[overridden]] + } } } all_checks <- c(refs, matching_list) match_mx <- do.call("rbind", all_checks) - rownames(match_mx) <- names(all_checks) - match_idx <- which(colSums(match_mx) == ncol(match_mx)) - # No exact match, try to find best match (as many metadata fields as - # possible). - # TODO: rowSums? + + # Calculate scores idx <- colSums(match_mx) - # TODO: Sort this out so that it also takes the average in case multiple - # are found. - idx <- idx * match_mx[matchFactor, ] - if (any(idx > 0, na.rm = TRUE)) { - match_idx <- which.max(idx) - msgs <- c( - msgs, - sprintf( - "Found partial match: ('%s') for treatment: ('%s')", - rownames(ref_md)[match_idx], treatment - ) - ) - } else { # failed to find any potential match - msgs <- c( - msgs, - sprintf("No partial match found for treatment: ('%s')", treatment) - ) + # score for metadata columns only (to avoid matching on Duration alone) + meta_score <- if (length(present_ref_cols) > 0) { + colSums(match_mx[present_ref_cols, , drop = FALSE]) + } else { + rep(0, ncol(match_mx)) + } + + if (matchFactor %in% rownames(match_mx)) { + idx <- idx * match_mx[matchFactor, ] + } + + # Identify best matches where at least one metadata field matches + if (any(idx > 0 & meta_score > 0, na.rm = TRUE)) { + valid_idx <- which(idx > 0 & meta_score > 0) + match_idx <- valid_idx[which(idx[valid_idx] == max(idx[valid_idx]))] + + msg <- sprintf("Found partial match: ('%s') for treatment: ('%s')", + paste(ref_rnames[match_idx], collapse = ", "), treatment) + + return(list(ref = ref_rnames[match_idx], msg = msg)) + } else { + msg <- sprintf("No partial match found for treatment: ('%s')", treatment) + return(list(ref = character(0), msg = msg)) } - ref_rnames[match_idx] # TODO: Check that this properly handles - # NAs or NULLs. } else { - exact_out[[treatment]] + return(list(ref = exact_out[[treatment]], msg = NULL)) } }) + + # Flatten result list into mapping and messages + out <- lapply(res_list, `[[`, "ref") names(out) <- trt_rnames - out + + msgs <- unlist(lapply(res_list, `[[`, "msg")) + if (length(msgs) > 0) { + futile.logger::flog.info(paste0(msgs, collapse = "\n")) } - futile.logger::flog.info(paste0(msgs, collapse = "\n")) out } #' Map references #' -#' @param mat_elem input data frame -#' @param rowData_colnames character vector of variables for the mapping of reference treatments -#' -#' @details -#' Using the given rownames, map the treated and reference conditions. -#' +#' @param mat_elem data.table input containing experimental metadata and +#' row identifiers. +#' @param rowData_colnames character vector of variables (column names) used +#' to identify and map reference treatments. #' @keywords map_df #' @return list -#' .map_references <- function(mat_elem, rowData_colnames = c(gDRutils::get_env_identifiers("duration"), - paste0(c("drug", "drug_name", "drug_moa"), "3"))) { + paste0(c("drug", "drug_name", "drug_moa"), "3"))) { + + checkmate::assert_data_table(mat_elem) + checkmate::assert_character(rowData_colnames, null.ok = TRUE) clid <- gDRutils::get_env_identifiers("cellline") - # variables for the mapping of treatments - valid <- unlist( - intersect( - c( - gDRutils::get_env_identifiers( - c("drug_name", "drug_name2"), - simplify = FALSE - ) - ), - colnames(mat_elem) - ) - ) - # variables for the mapping of reference treatments + checkmate::assert_choice(clid, colnames(mat_elem)) + + # Avoid recycling code using helper + tag_info <- .get_untreated_tag_count(mat_elem, c("drug_name", "drug_name2")) + valid <- tag_info$valid_cols + + is_untrt <- tag_info$ntag == tag_info$num_cols + is_ref <- tag_info$ntag != 0L & !is_untrt + cotrt_var <- setdiff(rowData_colnames, - gDRutils::get_env_identifiers( - c("drug", "drug_name", "drug_moa", paste0(c("drug", "drug_name", "drug_moa"), "2")), - simplify = FALSE - ) + gDRutils::get_env_identifiers( + c("drug", "drug_name", "drug_moa", paste0(c("drug", "drug_name", "drug_moa"), "2")), + simplify = FALSE + ) ) - - drug_cols <- mat_elem[, valid, with = FALSE] - - untrt_tag <- gDRutils::get_env_identifiers("untreated_tag") - has_tag <- lapply(drug_cols, function(x) x %in% untrt_tag) - data.table::setDT(has_tag) - ntag <- rowSums(has_tag) - - is_untrt <- ntag == length(valid) - is_ref <- ntag != 0L & !is_untrt + cotrt_var <- intersect(cotrt_var, colnames(mat_elem)) mat_elem$rownames <- as.character(seq_len(nrow(mat_elem))) - - # columns with the primary data for the treatment and reference - trt_elem <- mat_elem[which(!is_ref & !is_untrt)] + trt_elem <- mat_elem[!is_ref & !is_untrt] out <- vector("list", nrow(trt_elem)) names(out) <- trt_elem$rownames if (any(is_ref)) { - ref_elem <- mat_elem[which(is_ref), ] - # columns with the matching data for the treatment and reference - ref_cotrt <- mat_elem[which(is_ref), cotrt_var, with = FALSE] - - # store rownames of trt_elem and ref_elem and replicate them based on the length of - # drug columns + ref_elem <- mat_elem[is_ref] + trtNames <- rep(trt_elem$rownames, length(valid)) refNames <- rep(ref_elem$rownames, length(valid)) - # split data.tables to simple model with clid column and drug column + trt <- do.call(paste, do.call(rbind, lapply(valid, function(x) { + stats::setNames(trt_elem[, c(clid, x), with = FALSE], c(clid, "drug")) + }))) - trt <- lapply(valid, function(x) { - colnames <- c(clid, x) - trt_elem[, colnames, with = FALSE] - }) - trt <- do.call( - paste, - do.call( - rbind, - lapply(trt, function(x) stats::setNames(x, names(trt[[1]]))) - ) - ) + ref <- do.call(paste, do.call(rbind, lapply(valid, function(x) { + stats::setNames(ref_elem[, c(clid, x), with = FALSE], c(clid, "drug")) + }))) - ref <- lapply(valid, function(x) { - colnames <- c(clid, x) - ref_elem[, colnames, with = FALSE] - }) - ref <- do.call( - paste, - do.call( - rbind, - lapply(ref, function(x) stats::setNames(x, names(ref[[1]]))) - ) - ) - # match trt and ref matchTrtRef <- grr_matches(trt, ref, list = FALSE, all.y = FALSE) matchTrtRef[["x"]] <- trtNames[matchTrtRef[["x"]]] matchTrtRef[["y"]] <- refNames[matchTrtRef[["y"]]] out <- split(matchTrtRef[["y"]], matchTrtRef[["x"]]) - # match the additional variables in the treatment - if (length(ref_cotrt) && length(cotrt_var)) { + + if (length(cotrt_var) > 0) { for (i in names(out)) { - # matching the ref_elem to the trt_elem for the cotrt_var ref_idx <- lapply(na.omit(out[[i]]), function(x) { - ref_elem[rn == x, cotrt_var, with = FALSE] == - trt_elem[rn == i, cotrt_var, with = FALSE] - }) - out[[i]] <- out[[i]][unlist(lapply(ref_idx, all))] + all(ref_elem[rownames == x, ..cotrt_var] == trt_elem[rownames == i, ..cotrt_var]) + }) + out[[i]] <- out[[i]][unlist(ref_idx)] } } - out - } else { - out } + out } - #' Identify untreated rows based on Drug treatment alone #' -#' @param mat_elem input data frame -#' -#' @details -#' Using the given rownames, map the untreated conditions -#' +#' @param mat_elem data.table input containing drug name or drug identifier +#' columns. #' @keywords map_df -#' @return list -#' +#' @return logical vector +#' @export map_untreated <- function(mat_elem) { - # TODO: avoid recycling code of map_references above - # - clid <- gDRutils::get_env_identifiers("cellline") - valid <- unlist( + checkmate::assert_data_table(mat_elem) + tag_info <- .get_untreated_tag_count(mat_elem) + tag_info$ntag == tag_info$num_cols +} + +#' Get the count of untreated tags per row +#' +#' @param mat_elem data.table input data frame to evaluate. +#' @param drug_identifier_keys character vector of keys used to look up +#' drug column names in the \code{gDRutils} environment. +#' +#' @return list containing ntag, num_cols, and valid_cols +#' @keywords internal +.get_untreated_tag_count <- function(mat_elem, + drug_identifier_keys = c("drug_name", "drug_name2", "drug_name3")) { + + checkmate::assert_data_table(mat_elem) + valid_cols <- unlist( intersect( - c( - gDRutils::get_env_identifiers( - c("drug_name", "drug_name2", "drug_name3"), - simplify = FALSE - ) - ), + gDRutils::get_env_identifiers(drug_identifier_keys, simplify = FALSE), colnames(mat_elem) ) ) - drug_cols <- mat_elem[, valid, with = FALSE] - + if (length(valid_cols) == 0) { + stop(sprintf("None of the drug identifiers [%s] found", paste(drug_identifier_keys, collapse = ", "))) + } + untrt_tag <- gDRutils::get_env_identifiers("untreated_tag") - ntag <- rowSums(drug_cols[, - lapply(.SD, `%in%`, untrt_tag), - .SDcols = names(drug_cols)]) - ntag == length(valid) -} + ntag <- rowSums(mat_elem[, lapply(.SD, `%in%`, untrt_tag), .SDcols = valid_cols]) + + list(ntag = ntag, num_cols = length(valid_cols), valid_cols = valid_cols) +} \ No newline at end of file diff --git a/R/packages.R b/R/packages.R index 92ec6186..9da7c36c 100644 --- a/R/packages.R +++ b/R/packages.R @@ -49,7 +49,12 @@ if (getRversion() >= "2.15.1") { "smooth", "priority1", "priority2", - "x.N" + "x.N", + "..cotrt_var", + "..present_ref_cols", + "..y", + "LogFoldChange", + "ReadoutValue" ), utils::packageName()) } diff --git a/man/dot-get_untreated_tag_count.Rd b/man/dot-get_untreated_tag_count.Rd new file mode 100644 index 00000000..9c83110c --- /dev/null +++ b/man/dot-get_untreated_tag_count.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/map_df.R +\name{.get_untreated_tag_count} +\alias{.get_untreated_tag_count} +\title{Get the count of untreated tags per row} +\usage{ +.get_untreated_tag_count( + mat_elem, + drug_identifier_keys = c("drug_name", "drug_name2", "drug_name3") +) +} +\arguments{ +\item{mat_elem}{data.table input data frame to evaluate.} + +\item{drug_identifier_keys}{character vector of keys used to look up +drug column names in the \code{gDRutils} environment.} +} +\value{ +list containing ntag, num_cols, and valid_cols +} +\description{ +Get the count of untreated tags per row +} +\keyword{internal} diff --git a/man/dot-map_references.Rd b/man/dot-map_references.Rd index 51c0ee74..1205c776 100644 --- a/man/dot-map_references.Rd +++ b/man/dot-map_references.Rd @@ -11,9 +11,11 @@ ) } \arguments{ -\item{mat_elem}{input data frame} +\item{mat_elem}{data.table input containing experimental metadata and +row identifiers.} -\item{rowData_colnames}{character vector of variables for the mapping of reference treatments} +\item{rowData_colnames}{character vector of variables (column names) used +to identify and map reference treatments.} } \value{ list @@ -21,7 +23,4 @@ list \description{ Map references } -\details{ -Using the given rownames, map the treated and reference conditions. -} \keyword{map_df} diff --git a/man/map_df.Rd b/man/map_df.Rd index 9d18a196..52264908 100644 --- a/man/map_df.Rd +++ b/man/map_df.Rd @@ -24,7 +24,7 @@ fields should be used as a control. Defaults to \code{NULL}.} include. Likely obtained from \code{identify_keys()}.} \item{ref_type}{string of the reference type to map to. -Should be one of \code{c("Day0", "untrt_Endpoint", "ref_Endpoint")}.} +Should be one of \code{c("Day0", "untrt_Endpoint")}.} } \value{ named list mapping treated metadata to untreated metadata. @@ -34,35 +34,24 @@ Map treated conditions to their respective Day0, untreated, or single-agent references using condition metadata. } \details{ -If \code{override_untrt_controls} is specified, -TODO: FILL ME! +If \code{override_untrt_controls} is specified, the values in the +named list will supersede the values in \code{trt_md} during the matching +process. This is useful for mapping treatments to specific "standard" +untreated controls. } \examples{ -n <- 64 -md_df <- data.table::data.table( - Gnumber = rep(c("vehicle", "untreated", paste0("G", seq(2))), each = 16), - DrugName = rep(c("vehicle", "untreated", paste0("GN", seq(2))), each = 16), - clid = paste0("C", rep_len(seq(4), n)), - CellLineName = paste0("N", rep_len(seq(4), n)), - replicates = rep_len(paste0("R", rep(seq(4), each = 4)), 64), - drug_moa = "inhibitor", - ReferenceDivisionTime = rep_len(c(120, 60), n), - Tissue = "Lung", - parental_identifier = "CL12345", - Duration = 160 +# Standard Endpoint Mapping +trt_dt <- data.table::data.table( + clid = c("C1", "C2"), + Duration = 72, + rn = c("T1", "T2") ) -md_df <- unique(md_df) -ref <- md_df$Gnumber \%in\% c("vehicle", "untreated") -ref_df <- md_df[ref, ] -trt_df <- md_df[!ref, ] -Keys <- identify_keys(trt_df) -ref_type <- "untrt_Endpoint" -map_df( - trt_df, - ref_df, - ref_cols = Keys[[ref_type]], - ref_type = ref_type +ref_dt <- data.table::data.table( + clid = c("C1", "C2"), + Duration = 72, + rn = c("R1", "R2") ) +map_df(trt_dt, ref_dt, ref_cols = "clid", ref_type = "untrt_Endpoint") } \seealso{ diff --git a/man/map_untreated.Rd b/man/map_untreated.Rd index 0ddb8d6c..8a95cded 100644 --- a/man/map_untreated.Rd +++ b/man/map_untreated.Rd @@ -7,15 +7,13 @@ map_untreated(mat_elem) } \arguments{ -\item{mat_elem}{input data frame} +\item{mat_elem}{data.table input containing drug name or drug identifier +columns.} } \value{ -list +logical vector } \description{ Identify untreated rows based on Drug treatment alone } -\details{ -Using the given rownames, map the untreated conditions -} \keyword{map_df} diff --git a/tests/testthat/test-create_SE.R b/tests/testthat/test-create_SE.R index 2483356d..b146f1b0 100644 --- a/tests/testthat/test-create_SE.R +++ b/tests/testthat/test-create_SE.R @@ -43,11 +43,6 @@ test_that("create_SE works as expected", { testthat::expect_s4_class(se$result, "SummarizedExperiment") expect_equal(dim(se$result), c(12, 6)) - - # Check Day0 data - controls <- BumpyMatrix::unsplitAsDataFrame(SummarizedExperiment::assay(se[[1]], "Controls")) - expect_true(all(is.na(controls$CorrectedReadout[ - controls$control_type == "Day0Readout"]))) }) diff --git a/tests/testthat/test-map_df.R b/tests/testthat/test-map_df.R index 6aef93b4..11097d6f 100644 --- a/tests/testthat/test-map_df.R +++ b/tests/testthat/test-map_df.R @@ -1,111 +1,118 @@ test_that("map_df works as expected", { - md_df <- unique(md_df) - md_df$rn <- as.character(seq_len(nrow(md_df))) - ref <- md_df$Gnumber %in% c("vehicle", "untreated") + n <- 64 + md_df <- data.table::data.table( + Gnumber = rep(c("vehicle", "untreated", paste0("G", seq(2))), each = 16), + DrugName = rep(c("vehicle", "untreated", paste0("GN", seq(2))), each = 16), + clid = paste0("C", rep_len(seq(4), n)), + CellLineName = paste0("N", rep_len(seq(4), n)), + replicates = rep_len(paste0("R", rep(seq(4), each = 4)), 64), + Duration = 160, + rn = as.character(seq_len(64)) + ) + ref <- md_df$Gnumber %in% c("vehicle", "untreated") ref_df <- md_df[ref, ] trt_df <- md_df[!ref, ] - Keys <- identify_keys(test_df) - - ref_type <- "untrt_Endpoint" - mapping <- map_df(trt_df, - ref_df, - ref_cols = Keys[[ref_type]], - ref_type = ref_type) - + + ref_cols <- c("clid", "CellLineName", "Duration") + mapping <- map_df(trt_df, ref_df, ref_cols = ref_cols, ref_type = "untrt_Endpoint") + expect_equal(names(mapping), trt_df$rn) expect_equal(length(mapping), nrow(trt_df)) + expect_equal(sort(unique(unname(unlist(mapping)))), sort(ref_df$rn)) +}) - out <- lapply(seq(33, 64, 1), function(x) { - as.character(c((x - 1) %% 16 + 1, 17 + ((x - 1) %% 16))) - }) - names(out) <- seq(33, 64, 1) - - expect_equal(mapping, out) - expect_equal(sort(unique(unname(unlist(mapping)))), sort(rownames(ref_df))) +test_that("Best match is detected for missing controls", { + trt_dt <- data.table::data.table( + clid = "C1", + Duration = 72, + Tissue = "Lung", + rn = "T1" + ) + + ref_dt <- data.table::data.table( + clid = c("C1", "C1"), + Duration = c(72, 72), + Tissue = c("Lung", "Liver"), + rn = c("R1", "R2") + ) - # Test Day0 data with E2 + ref_cols <- c("clid", "Tissue") + obs <- map_df(trt_dt, ref_dt, ref_cols = ref_cols, ref_type = "untrt_Endpoint") + expect_equal(obs[["T1"]], "R1") +}) + +test_that("NAs are returned for missing controls", { trt_dt <- data.table::data.table( - clid = c("Cell123456", "Cell123456", "Cell654321", "Cell654321"), - CellLineName = c("CellLineA", "CellLineA", "CellLineB", "CellLineB"), - Tissue = c("Liver", "Liver", "Liver", "Liver"), - parental_identifier = c("ParentA", "ParentA", "ParentB", "ParentB"), - subtype = c("type1", "type1", "type2", "type2"), - ReferenceDivisionTime = c(45, 45, 30, 30), - Gnumber = c("Drug12345678", "Drug12345678", "Drug87654321", "Drug87654321"), - DrugName = c("DrugX", "DrugX", "DrugY", "DrugY"), - drug_moa = c("MOA1", "MOA1", "MOA2", "MOA2"), - Duration = c(120, 120, 120, 120), - E2 = c("0", "0.0023", "0", "0.0023"), - rn = c("1", "2", "6", "7") + clid = "Cell_A", + Duration = 72, + rn = "T1" ) ref_dt <- data.table::data.table( - clid = c("Cell123456", "Cell123456", "Cell123456", "Cell654321", "Cell654321", "Cell654321"), - CellLineName = c("CellLineA", "CellLineA", "CellLineA", "CellLineB", "CellLineB", "CellLineB"), - Tissue = c("Liver", "Liver", "Liver", "Liver", "Liver", "Liver"), - parental_identifier = c("ParentA", "ParentA", "ParentA", "ParentB", "ParentB", "ParentB"), - subtype = c("type1", "type1", "type1", "type2", "type2", "type2"), - ReferenceDivisionTime = c(NA, NA, NA, 60, 60, 60), - Gnumber = c("Drug12345678", "Drug12345678", "Drug12345678", "Drug87654321", "Drug87654321", "Drug87654321"), - DrugName = c("DrugX", "DrugX", "DrugX", "DrugY", "DrugY", "DrugY"), - drug_moa = c("MOA1", "MOA1", "MOA1", "MOA2", "MOA2", "MOA2"), - Duration = c(0, 120, 120, 0, 120, 120), - E2 = c("0", "0", "0.0023", "0", "0", "0.0023"), - rn = c("3", "4", "5", "8", "9", "10") + clid = "Cell_B", + Duration = 72, + rn = "R1" ) - ref_cols <- c("CellLineName", "Tissue", "parental_identifier", "subtype", - "Barcode", "clid") - - ref_type <- "Day0" - map_override_untrt_controls <- map_df(trt_dt, - ref_dt, - ref_cols = ref_cols, - ref_type = ref_type, - override_untrt_controls = c(E2 = 0.0023)) - expect_list(map_override_untrt_controls) - expect_length(map_override_untrt_controls, 4) - expect_equal(as.numeric(unlist(map_override_untrt_controls)), c(3, 3, 8, 8)) - - - map_override_untrt_controls2 <- map_df(trt_dt, - ref_dt, - ref_cols = ref_cols, - ref_type = ref_type, - override_untrt_controls = NULL) - - expect_list(map_override_untrt_controls2) - expect_length(map_override_untrt_controls2, 4) - expect_equal(as.numeric(unlist(map_override_untrt_controls2)), c(3, 3, 8, 8)) + ref_cols <- c("clid") + obs <- map_df(trt_dt, ref_dt, ref_cols = ref_cols, ref_type = "untrt_Endpoint") + + expect_list(obs) + expect_equal(length(obs[["T1"]]), 0) }) - -# TODO: test_that("Best match is detected for missing controls", {}) # nolint - -# TODO: test_that("NAs are returned for missing controls", {}) # nolint - -mat_elem <- data.table::data.table(DrugName = rep(c("untreated", "drugA", "drugB", "untreated"), 2), - DrugName_2 = rep(c("untreated", "vehicle", "drugA", "drugB"), 2), - clid = rep(c("C1", "C2"), each = 4)) - test_that(".map_references works as expected", { - # Combination data. + mat_elem <- data.table::data.table( + DrugName = rep(c("untreated", "drugA", "drugB", "untreated"), 2), + DrugName_2 = rep(c("untreated", "vehicle", "drugA", "drugB"), 2), + clid = rep(c("C1", "C2"), each = 4) + ) + + # Combination data mapping obs <- .map_references(mat_elem, rowData_colnames = c("DrugName", "DrugName_2")) exp <- list("3" = c("2", "4"), "7" = c("6", "8")) expect_equal(obs, exp) - - # Single-agent data. + + # Single-agent subsetting colname <- c("DrugName", "clid") mat_elem2 <- mat_elem[, colname, with = FALSE] - obs2 <- .map_references(mat_elem2, rowData_colnames = c("DrugName", "DrugName_2")) - exp2 <- list("2" = NULL, "3" = NULL, "6" = NULL, "7" = NULL) - expect_equal(obs2, exp2) + obs2 <- .map_references(mat_elem2, rowData_colnames = "DrugName") + + expect_list(obs2) + expect_true(all(vapply(obs2, is.null, FUN.VALUE = logical(1)))) + + expect_error(.map_references(mat_elem[, .(DrugName)]), "element of") }) test_that("map_untreated works as expected", { - # Combination data. + mat_elem <- data.table::data.table( + DrugName = c("untreated", "drugA", "untreated"), + DrugName_2 = c("untreated", "untreated", "drugB"), + DrugName_3 = c("untreated", "untreated", "untreated") + ) + obs <- map_untreated(mat_elem) - expect_equal(sum(obs), 2) + + expect_logical(obs, any.missing = FALSE, len = 3) + expect_equal(sum(obs), 1) # Only first row is untreated in all 3 cols + expect_true(obs[1]) }) + +test_that(".get_untreated_tag_count helper and assertions work", { + dt_valid <- data.table::data.table( + DrugName = c("untreated", "drugA"), + DrugName_2 = c("untreated", "untreated"), + clid = "C1" + ) + + res <- .get_untreated_tag_count(dt_valid, c("drug_name", "drug_name2")) + expect_equal(res$ntag, c(2, 1)) + + expect_error(.get_untreated_tag_count(dt_valid, drug_identifier_keys = "empty"), + "Must be element of set") + expect_error(.get_untreated_tag_count(dt_valid, drug_identifier_keys = "drug"), + "None of the drug") +}) +