R-Computing-Lab · Copilot · Oct 17, 2025 · Oct 17, 2025 · Oct 17, 2025 · Oct 17, 2025
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# discord 1.3 (Development)
+
+## New Features
+
+* Added support for non-numeric (categorical) predictors in `discord_data()` and `make_mean_diffs()`. When using categorical predictors (e.g., location = "south"/"north"), the individual values are preserved in `variable_1` and `variable_2` columns, while `variable_diff` and `variable_mean` are set to NA since differences and means are not meaningful for categorical data.
+
 # discord 1.3
 * Adding new full data tutorial in response to reviewers
 * Adding links to external repositories for reproducible examples from publications

diff --git a/R/func_discord_data.R b/R/func_discord_data.R
@@ -5,6 +5,9 @@
 #'   interest.
 #' @param predictors A character vector containing the column names for
 #'   predicting the outcome. Can be NULL if no predictors are desired.
+#'   Predictors can be either numeric (for which differences and means will be
+#'   calculated) or non-numeric/categorical (e.g., location = "south"/"north",
+#'   for which values will be preserved but differences and means set to NA).
 #' @param id Default's to NULL. If supplied, must specify the column name
 #'   corresponding to unique kinship pair identifiers.
 #' @param sex A character string for the sex column name.
@@ -25,6 +28,7 @@
 #'
 #' @examples
 #'
+#' # Example with numeric predictor
 #' discord_data(
 #'   data = data_sample,
 #'   outcome = "height",
@@ -35,6 +39,31 @@
 #'   demographics = "none"
 #' )
 #'
+#' \dontrun{
+#' # Example with categorical predictor
+#' # Create sample data with location predictor
+#' sample_data <- data.frame(
+#'   id = 1:5,
+#'   age_s1 = c(25, 30, 35, 40, 45),
+#'   age_s2 = c(23, 28, 33, 38, 43),
+#'   location_s1 = c("south", "north", "south", "north", "south"),
+#'   location_s2 = c("south", "south", "north", "north", "south")
+#' )
+#'
+#' discord_data(
+#'   data = sample_data,
+#'   outcome = "age",
+#'   predictors = "location",
+#'   id = "id",
+#'   pair_identifiers = c("_s1", "_s2"),
+#'   sex = NULL,
+#'   race = NULL,
+#'   demographics = "none"
+#' )
+#' # For categorical predictors, location_1 and location_2 contain the values,
+#' # while location_diff and location_mean are set to NA
+#' }
+#'
 discord_data <- function(data,
                          outcome,
                          predictors,

diff --git a/R/helpers_regression.R b/R/helpers_regression.R
@@ -100,6 +100,8 @@ check_sibling_order_fast <- function(data, outcome, pair_identifiers) {
 #' @description This function calculates differences and means of a given variable for each kinship pair. The order of subtraction and the variables' names in the output dataframe depend on the order column set by check_sibling_order().
 #' If the demographics parameter is set to "race", "sex", or "both", it also prepares demographic information accordingly,
 #' swapping the order of demographics as per the order column.
+#' 
+#' For numeric variables, this function computes the difference and mean between pairs. For non-numeric variables (e.g., categorical predictors like location = "south"/"north"), the individual values are preserved in _1 and _2 columns, while _diff and _mean are set to NA.
 #' @inheritParams discord_data
 #' @inheritParams check_sibling_order
 #'
@@ -124,13 +126,21 @@ make_mean_diffs_ram_optimized <- function(data, id, sex, race, demographics,
 
   data <- data[row, ]
 
+  # Check if variable is numeric
+  is_numeric_var <- is.numeric(data[[S1]]) && is.numeric(data[[S2]])
 
   # write the core of the of the make_mean_diffs
   # This always runs -- ignoring sex or race variables
   if (data[, "order"] == "s1") {
-    # no need to be yelled at by r for subtracting strings)
-    diff <- suppressMessages(suppressWarnings(data[[S1]] - data[[S2]]))
-    mean <- suppressMessages(suppressWarnings(base::mean(c(data[[S1]], data[[S2]]))))
+    if (is_numeric_var) {
+      # For numeric variables, compute diff and mean
+      diff <- data[[S1]] - data[[S2]]
+      mean <- base::mean(c(data[[S1]], data[[S2]]))
+    } else {
+      # For non-numeric variables, set diff and mean to NA
+      diff <- NA
+      mean <- NA
+    }
 
     output <- data.frame(
       id = data[[id]],
@@ -140,9 +150,15 @@ make_mean_diffs_ram_optimized <- function(data, id, sex, race, demographics,
       variable_mean = mean
     )
   } else if (data[, "order"] == "s2") {
-    # no need to be yelled at by r for subtracting strings)
-    diff <- suppressMessages(suppressWarnings(data[[S2]] - data[[S1]]))
-    mean <- suppressMessages(suppressWarnings(base::mean(c(data[[S1]], data[[S2]]))))
+    if (is_numeric_var) {
+      # For numeric variables, compute diff and mean
+      diff <- data[[S2]] - data[[S1]]
+      mean <- base::mean(c(data[[S1]], data[[S2]]))
+    } else {
+      # For non-numeric variables, set diff and mean to NA
+      diff <- NA
+      mean <- NA
+    }
 
     output <- data.frame(
       id = data[[id]],
@@ -321,8 +337,19 @@ make_mean_diffs_fast <- function(data, id, sex, race, demographics,
       data[[paste0(var, pair_identifiers[1])]]
     )
 
-    diff <- var1 - var2
-    mean_ <- (var1 + var2) / 2
+    # Check if variable is numeric
+    is_numeric_var <- is.numeric(data[[paste0(var, pair_identifiers[1])]]) && 
+                      is.numeric(data[[paste0(var, pair_identifiers[2])]])
+
+    if (is_numeric_var) {
+      # For numeric variables, compute diff and mean
+      diff <- var1 - var2
+      mean_ <- (var1 + var2) / 2
+    } else {
+      # For non-numeric variables, set diff and mean to NA
+      diff <- rep(NA, length(var1))
+      mean_ <- rep(NA, length(var1))
+    }
 
     tmp <- data.frame(
       id = data[[id]],

diff --git a/tests/testthat/test-categorical_predictors.R b/tests/testthat/test-categorical_predictors.R
@@ -0,0 +1,188 @@
+# Tests for non-numeric (categorical) predictors
+
+test_that("discord_data works with non-numeric predictors", {
+  # Create sample data with categorical predictor
+  set.seed(2023)
+
+  # Create a simple dataset with a categorical predictor "location"
+  test_data <- data.frame(
+    id = 1:10,
+    age_s1 = c(25, 30, 35, 40, 45, 50, 55, 60, 65, 70),
+    age_s2 = c(23, 28, 33, 38, 43, 48, 53, 58, 63, 68),
+    location_s1 = c("south", "north", "south", "north", "south", 
+                    "north", "south", "north", "south", "north"),
+    location_s2 = c("south", "south", "north", "north", "south", 
+                    "south", "north", "north", "south", "south")
+  )
+
+  # Test discord_data with categorical predictor
+  result <- discord_data(
+    data = test_data,
+    outcome = "age",
+    predictors = "location",
+    id = "id",
+    sex = NULL,
+    race = NULL,
+    pair_identifiers = c("_s1", "_s2"),
+    demographics = "none"
+  )
+
+  # Check that the result has the expected columns
+  expect_true("location_1" %in% names(result))
+  expect_true("location_2" %in% names(result))
+  expect_true("location_diff" %in% names(result))
+  expect_true("location_mean" %in% names(result))
+
+  # Check that location_1 and location_2 are preserved correctly
+  expect_true(all(result$location_1 %in% c("south", "north")))
+  expect_true(all(result$location_2 %in% c("south", "north")))
+
+  # Check that location_diff and location_mean are NA for non-numeric variables
+  expect_true(all(is.na(result$location_diff)))
+  expect_true(all(is.na(result$location_mean)))
+
+  # Check that age diff and mean are still computed for numeric variables
+  expect_true(all(!is.na(result$age_diff)))
+  expect_true(all(!is.na(result$age_mean)))
+})
+
+test_that("discord_data works with mixed numeric and non-numeric predictors", {
+  set.seed(2023)
+
+  # Create a dataset with both numeric and categorical predictors
+  test_data <- data.frame(
+    id = 1:10,
+    outcome_s1 = c(100, 110, 120, 130, 140, 150, 160, 170, 180, 190),
+    outcome_s2 = c(95, 105, 115, 125, 135, 145, 155, 165, 175, 185),
+    income_s1 = c(50000, 60000, 70000, 80000, 90000, 
+                  100000, 110000, 120000, 130000, 140000),
+    income_s2 = c(48000, 58000, 68000, 78000, 88000, 
+                  98000, 108000, 118000, 128000, 138000),
+    region_s1 = c("west", "east", "west", "east", "west", 
+                  "east", "west", "east", "west", "east"),
+    region_s2 = c("west", "west", "east", "east", "west", 
+                  "west", "east", "east", "west", "west")
+  )
+
+  # Test discord_data with both predictor types
+  result <- discord_data(
+    data = test_data,
+    outcome = "outcome",
+    predictors = c("income", "region"),
+    id = "id",
+    sex = NULL,
+    race = NULL,
+    pair_identifiers = c("_s1", "_s2"),
+    demographics = "none"
+  )
+
+  # Check numeric predictor (income) has valid diff and mean
+  expect_true(all(!is.na(result$income_diff)))
+  expect_true(all(!is.na(result$income_mean)))
+  expect_true(is.numeric(result$income_diff))
+  expect_true(is.numeric(result$income_mean))
+
+  # Check categorical predictor (region) has NA for diff and mean
+  expect_true(all(is.na(result$region_diff)))
+  expect_true(all(is.na(result$region_mean)))
+
+  # Check categorical values are preserved
+  expect_true(all(result$region_1 %in% c("west", "east")))
+  expect_true(all(result$region_2 %in% c("west", "east")))
+})
+
+test_that("discord_data with non-numeric predictors works with fast=FALSE", {
+  set.seed(2023)
+
+  test_data <- data.frame(
+    id = 1:5,
+    score_s1 = c(85, 90, 95, 100, 105),
+    score_s2 = c(80, 85, 90, 95, 100),
+    city_s1 = c("A", "B", "A", "B", "A"),
+    city_s2 = c("B", "A", "B", "A", "B")
+  )
+
+  result <- discord_data(
+    data = test_data,
+    outcome = "score",
+    predictors = "city",
+    id = "id",
+    sex = NULL,
+    race = NULL,
+    pair_identifiers = c("_s1", "_s2"),
+    demographics = "none",
+    fast = FALSE
+  )
+
+  # Verify results
+  expect_true("city_1" %in% names(result))
+  expect_true("city_2" %in% names(result))
+  expect_true(all(is.na(result$city_diff)))
+  expect_true(all(is.na(result$city_mean)))
+  expect_true(all(result$city_1 %in% c("A", "B")))
+  expect_true(all(result$city_2 %in% c("A", "B")))
+})
+
+test_that("discord_data with non-numeric predictors works with fast=TRUE", {
+  set.seed(2023)
+
+  test_data <- data.frame(
+    id = 1:5,
+    score_s1 = c(85, 90, 95, 100, 105),
+    score_s2 = c(80, 85, 90, 95, 100),
+    city_s1 = c("A", "B", "A", "B", "A"),
+    city_s2 = c("B", "A", "B", "A", "B")
+  )
+
+  result <- discord_data(
+    data = test_data,
+    outcome = "score",
+    predictors = "city",
+    id = "id",
+    sex = NULL,
+    race = NULL,
+    pair_identifiers = c("_s1", "_s2"),
+    demographics = "none",
+    fast = TRUE
+  )
+
+  # Verify results
+  expect_true("city_1" %in% names(result))
+  expect_true("city_2" %in% names(result))
+  expect_true(all(is.na(result$city_diff)))
+  expect_true(all(is.na(result$city_mean)))
+  expect_true(all(result$city_1 %in% c("A", "B")))
+  expect_true(all(result$city_2 %in% c("A", "B")))
+})
+
+test_that("discord_data with factor predictors", {
+  set.seed(2023)
+
+  # Test with factor type
+  test_data <- data.frame(
+    id = 1:5,
+    value_s1 = c(10, 20, 30, 40, 50),
+    value_s2 = c(9, 18, 27, 36, 45),
+    category_s1 = factor(c("low", "high", "low", "high", "low")),
+    category_s2 = factor(c("high", "low", "high", "low", "high"))
+  )
+
+  result <- discord_data(
+    data = test_data,
+    outcome = "value",
+    predictors = "category",
+    id = "id",
+    sex = NULL,
+    race = NULL,
+    pair_identifiers = c("_s1", "_s2"),
+    demographics = "none"
+  )
+
+  # Verify that factor values are preserved
+  expect_true("category_1" %in% names(result))
+  expect_true("category_2" %in% names(result))
+
+  # Verify that diff and mean are NA for factor variables
+  expect_true(all(is.na(result$category_diff)))
+  expect_true(all(is.na(result$category_mean)))
+})