diff --git a/NEWS.md b/NEWS.md index 3066582..8018fa8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# discord 1.3 (Development) + +## New Features + +* Added support for non-numeric (categorical) predictors in `discord_data()` and `make_mean_diffs()`. When using categorical predictors (e.g., location = "south"/"north"), the individual values are preserved in `variable_1` and `variable_2` columns, while `variable_diff` and `variable_mean` are set to NA since differences and means are not meaningful for categorical data. + # discord 1.3 * Adding new full data tutorial in response to reviewers * Adding links to external repositories for reproducible examples from publications diff --git a/R/func_discord_data.R b/R/func_discord_data.R index 89bcb7f..7c30a4b 100644 --- a/R/func_discord_data.R +++ b/R/func_discord_data.R @@ -5,6 +5,9 @@ #' interest. #' @param predictors A character vector containing the column names for #' predicting the outcome. Can be NULL if no predictors are desired. +#' Predictors can be either numeric (for which differences and means will be +#' calculated) or non-numeric/categorical (e.g., location = "south"/"north", +#' for which values will be preserved but differences and means set to NA). #' @param id Default's to NULL. If supplied, must specify the column name #' corresponding to unique kinship pair identifiers. #' @param sex A character string for the sex column name. @@ -25,6 +28,7 @@ #' #' @examples #' +#' # Example with numeric predictor #' discord_data( #' data = data_sample, #' outcome = "height", @@ -35,6 +39,31 @@ #' demographics = "none" #' ) #' +#' \dontrun{ +#' # Example with categorical predictor +#' # Create sample data with location predictor +#' sample_data <- data.frame( +#' id = 1:5, +#' age_s1 = c(25, 30, 35, 40, 45), +#' age_s2 = c(23, 28, 33, 38, 43), +#' location_s1 = c("south", "north", "south", "north", "south"), +#' location_s2 = c("south", "south", "north", "north", "south") +#' ) +#' +#' discord_data( +#' data = sample_data, +#' outcome = "age", +#' predictors = "location", +#' id = "id", +#' pair_identifiers = c("_s1", "_s2"), +#' sex = NULL, +#' race = NULL, +#' demographics = "none" +#' ) +#' # For categorical predictors, location_1 and location_2 contain the values, +#' # while location_diff and location_mean are set to NA +#' } +#' discord_data <- function(data, outcome, predictors, diff --git a/R/helpers_regression.R b/R/helpers_regression.R index 451d6de..dcf4090 100644 --- a/R/helpers_regression.R +++ b/R/helpers_regression.R @@ -100,6 +100,8 @@ check_sibling_order_fast <- function(data, outcome, pair_identifiers) { #' @description This function calculates differences and means of a given variable for each kinship pair. The order of subtraction and the variables' names in the output dataframe depend on the order column set by check_sibling_order(). #' If the demographics parameter is set to "race", "sex", or "both", it also prepares demographic information accordingly, #' swapping the order of demographics as per the order column. +#' +#' For numeric variables, this function computes the difference and mean between pairs. For non-numeric variables (e.g., categorical predictors like location = "south"/"north"), the individual values are preserved in _1 and _2 columns, while _diff and _mean are set to NA. #' @inheritParams discord_data #' @inheritParams check_sibling_order #' @@ -124,13 +126,21 @@ make_mean_diffs_ram_optimized <- function(data, id, sex, race, demographics, data <- data[row, ] + # Check if variable is numeric + is_numeric_var <- is.numeric(data[[S1]]) && is.numeric(data[[S2]]) # write the core of the of the make_mean_diffs # This always runs -- ignoring sex or race variables if (data[, "order"] == "s1") { - # no need to be yelled at by r for subtracting strings) - diff <- suppressMessages(suppressWarnings(data[[S1]] - data[[S2]])) - mean <- suppressMessages(suppressWarnings(base::mean(c(data[[S1]], data[[S2]])))) + if (is_numeric_var) { + # For numeric variables, compute diff and mean + diff <- data[[S1]] - data[[S2]] + mean <- base::mean(c(data[[S1]], data[[S2]])) + } else { + # For non-numeric variables, set diff and mean to NA + diff <- NA + mean <- NA + } output <- data.frame( id = data[[id]], @@ -140,9 +150,15 @@ make_mean_diffs_ram_optimized <- function(data, id, sex, race, demographics, variable_mean = mean ) } else if (data[, "order"] == "s2") { - # no need to be yelled at by r for subtracting strings) - diff <- suppressMessages(suppressWarnings(data[[S2]] - data[[S1]])) - mean <- suppressMessages(suppressWarnings(base::mean(c(data[[S1]], data[[S2]])))) + if (is_numeric_var) { + # For numeric variables, compute diff and mean + diff <- data[[S2]] - data[[S1]] + mean <- base::mean(c(data[[S1]], data[[S2]])) + } else { + # For non-numeric variables, set diff and mean to NA + diff <- NA + mean <- NA + } output <- data.frame( id = data[[id]], @@ -321,8 +337,19 @@ make_mean_diffs_fast <- function(data, id, sex, race, demographics, data[[paste0(var, pair_identifiers[1])]] ) - diff <- var1 - var2 - mean_ <- (var1 + var2) / 2 + # Check if variable is numeric + is_numeric_var <- is.numeric(data[[paste0(var, pair_identifiers[1])]]) && + is.numeric(data[[paste0(var, pair_identifiers[2])]]) + + if (is_numeric_var) { + # For numeric variables, compute diff and mean + diff <- var1 - var2 + mean_ <- (var1 + var2) / 2 + } else { + # For non-numeric variables, set diff and mean to NA + diff <- rep(NA, length(var1)) + mean_ <- rep(NA, length(var1)) + } tmp <- data.frame( id = data[[id]], diff --git a/tests/testthat/test-categorical_predictors.R b/tests/testthat/test-categorical_predictors.R new file mode 100644 index 0000000..67f9def --- /dev/null +++ b/tests/testthat/test-categorical_predictors.R @@ -0,0 +1,188 @@ +# Tests for non-numeric (categorical) predictors + +test_that("discord_data works with non-numeric predictors", { + # Create sample data with categorical predictor + set.seed(2023) + + # Create a simple dataset with a categorical predictor "location" + test_data <- data.frame( + id = 1:10, + age_s1 = c(25, 30, 35, 40, 45, 50, 55, 60, 65, 70), + age_s2 = c(23, 28, 33, 38, 43, 48, 53, 58, 63, 68), + location_s1 = c("south", "north", "south", "north", "south", + "north", "south", "north", "south", "north"), + location_s2 = c("south", "south", "north", "north", "south", + "south", "north", "north", "south", "south") + ) + + # Test discord_data with categorical predictor + result <- discord_data( + data = test_data, + outcome = "age", + predictors = "location", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_s1", "_s2"), + demographics = "none" + ) + + # Check that the result has the expected columns + expect_true("location_1" %in% names(result)) + expect_true("location_2" %in% names(result)) + expect_true("location_diff" %in% names(result)) + expect_true("location_mean" %in% names(result)) + + # Check that location_1 and location_2 are preserved correctly + expect_true(all(result$location_1 %in% c("south", "north"))) + expect_true(all(result$location_2 %in% c("south", "north"))) + + # Check that location_diff and location_mean are NA for non-numeric variables + expect_true(all(is.na(result$location_diff))) + expect_true(all(is.na(result$location_mean))) + + # Check that age diff and mean are still computed for numeric variables + expect_true(all(!is.na(result$age_diff))) + expect_true(all(!is.na(result$age_mean))) +}) + +test_that("discord_data works with mixed numeric and non-numeric predictors", { + set.seed(2023) + + # Create a dataset with both numeric and categorical predictors + test_data <- data.frame( + id = 1:10, + outcome_s1 = c(100, 110, 120, 130, 140, 150, 160, 170, 180, 190), + outcome_s2 = c(95, 105, 115, 125, 135, 145, 155, 165, 175, 185), + income_s1 = c(50000, 60000, 70000, 80000, 90000, + 100000, 110000, 120000, 130000, 140000), + income_s2 = c(48000, 58000, 68000, 78000, 88000, + 98000, 108000, 118000, 128000, 138000), + region_s1 = c("west", "east", "west", "east", "west", + "east", "west", "east", "west", "east"), + region_s2 = c("west", "west", "east", "east", "west", + "west", "east", "east", "west", "west") + ) + + # Test discord_data with both predictor types + result <- discord_data( + data = test_data, + outcome = "outcome", + predictors = c("income", "region"), + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_s1", "_s2"), + demographics = "none" + ) + + # Check numeric predictor (income) has valid diff and mean + expect_true(all(!is.na(result$income_diff))) + expect_true(all(!is.na(result$income_mean))) + expect_true(is.numeric(result$income_diff)) + expect_true(is.numeric(result$income_mean)) + + # Check categorical predictor (region) has NA for diff and mean + expect_true(all(is.na(result$region_diff))) + expect_true(all(is.na(result$region_mean))) + + # Check categorical values are preserved + expect_true(all(result$region_1 %in% c("west", "east"))) + expect_true(all(result$region_2 %in% c("west", "east"))) +}) + +test_that("discord_data with non-numeric predictors works with fast=FALSE", { + set.seed(2023) + + test_data <- data.frame( + id = 1:5, + score_s1 = c(85, 90, 95, 100, 105), + score_s2 = c(80, 85, 90, 95, 100), + city_s1 = c("A", "B", "A", "B", "A"), + city_s2 = c("B", "A", "B", "A", "B") + ) + + result <- discord_data( + data = test_data, + outcome = "score", + predictors = "city", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_s1", "_s2"), + demographics = "none", + fast = FALSE + ) + + # Verify results + expect_true("city_1" %in% names(result)) + expect_true("city_2" %in% names(result)) + expect_true(all(is.na(result$city_diff))) + expect_true(all(is.na(result$city_mean))) + expect_true(all(result$city_1 %in% c("A", "B"))) + expect_true(all(result$city_2 %in% c("A", "B"))) +}) + +test_that("discord_data with non-numeric predictors works with fast=TRUE", { + set.seed(2023) + + test_data <- data.frame( + id = 1:5, + score_s1 = c(85, 90, 95, 100, 105), + score_s2 = c(80, 85, 90, 95, 100), + city_s1 = c("A", "B", "A", "B", "A"), + city_s2 = c("B", "A", "B", "A", "B") + ) + + result <- discord_data( + data = test_data, + outcome = "score", + predictors = "city", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_s1", "_s2"), + demographics = "none", + fast = TRUE + ) + + # Verify results + expect_true("city_1" %in% names(result)) + expect_true("city_2" %in% names(result)) + expect_true(all(is.na(result$city_diff))) + expect_true(all(is.na(result$city_mean))) + expect_true(all(result$city_1 %in% c("A", "B"))) + expect_true(all(result$city_2 %in% c("A", "B"))) +}) + +test_that("discord_data with factor predictors", { + set.seed(2023) + + # Test with factor type + test_data <- data.frame( + id = 1:5, + value_s1 = c(10, 20, 30, 40, 50), + value_s2 = c(9, 18, 27, 36, 45), + category_s1 = factor(c("low", "high", "low", "high", "low")), + category_s2 = factor(c("high", "low", "high", "low", "high")) + ) + + result <- discord_data( + data = test_data, + outcome = "value", + predictors = "category", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_s1", "_s2"), + demographics = "none" + ) + + # Verify that factor values are preserved + expect_true("category_1" %in% names(result)) + expect_true("category_2" %in% names(result)) + + # Verify that diff and mean are NA for factor variables + expect_true(all(is.na(result$category_diff))) + expect_true(all(is.na(result$category_mean))) +})