Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# discord 1.3 (Development)

## New Features

* Added support for non-numeric (categorical) predictors in `discord_data()` and `make_mean_diffs()`. When using categorical predictors (e.g., location = "south"/"north"), the individual values are preserved in `variable_1` and `variable_2` columns, while `variable_diff` and `variable_mean` are set to NA since differences and means are not meaningful for categorical data.

# discord 1.3
* Adding new full data tutorial in response to reviewers
* Adding links to external repositories for reproducible examples from publications
Expand Down
29 changes: 29 additions & 0 deletions R/func_discord_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
#' interest.
#' @param predictors A character vector containing the column names for
#' predicting the outcome. Can be NULL if no predictors are desired.
#' Predictors can be either numeric (for which differences and means will be
#' calculated) or non-numeric/categorical (e.g., location = "south"/"north",
#' for which values will be preserved but differences and means set to NA).
#' @param id Default's to NULL. If supplied, must specify the column name
#' corresponding to unique kinship pair identifiers.
#' @param sex A character string for the sex column name.
Expand All @@ -25,6 +28,7 @@
#'
#' @examples
#'
#' # Example with numeric predictor
#' discord_data(
#' data = data_sample,
#' outcome = "height",
Expand All @@ -35,6 +39,31 @@
#' demographics = "none"
#' )
#'
#' \dontrun{
#' # Example with categorical predictor
#' # Create sample data with location predictor
#' sample_data <- data.frame(
#' id = 1:5,
#' age_s1 = c(25, 30, 35, 40, 45),
#' age_s2 = c(23, 28, 33, 38, 43),
#' location_s1 = c("south", "north", "south", "north", "south"),
#' location_s2 = c("south", "south", "north", "north", "south")
#' )
#'
#' discord_data(
#' data = sample_data,
#' outcome = "age",
#' predictors = "location",
#' id = "id",
#' pair_identifiers = c("_s1", "_s2"),
#' sex = NULL,
#' race = NULL,
#' demographics = "none"
#' )
#' # For categorical predictors, location_1 and location_2 contain the values,
#' # while location_diff and location_mean are set to NA
#' }
#'
discord_data <- function(data,
outcome,
predictors,
Expand Down
43 changes: 35 additions & 8 deletions R/helpers_regression.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ check_sibling_order_fast <- function(data, outcome, pair_identifiers) {
#' @description This function calculates differences and means of a given variable for each kinship pair. The order of subtraction and the variables' names in the output dataframe depend on the order column set by check_sibling_order().
#' If the demographics parameter is set to "race", "sex", or "both", it also prepares demographic information accordingly,
#' swapping the order of demographics as per the order column.
#'
#' For numeric variables, this function computes the difference and mean between pairs. For non-numeric variables (e.g., categorical predictors like location = "south"/"north"), the individual values are preserved in _1 and _2 columns, while _diff and _mean are set to NA.
#' @inheritParams discord_data
#' @inheritParams check_sibling_order
#'
Expand All @@ -124,13 +126,21 @@ make_mean_diffs_ram_optimized <- function(data, id, sex, race, demographics,

data <- data[row, ]

# Check if variable is numeric
is_numeric_var <- is.numeric(data[[S1]]) && is.numeric(data[[S2]])

# write the core of the of the make_mean_diffs
# This always runs -- ignoring sex or race variables
if (data[, "order"] == "s1") {
# no need to be yelled at by r for subtracting strings)
diff <- suppressMessages(suppressWarnings(data[[S1]] - data[[S2]]))
mean <- suppressMessages(suppressWarnings(base::mean(c(data[[S1]], data[[S2]]))))
if (is_numeric_var) {
# For numeric variables, compute diff and mean
diff <- data[[S1]] - data[[S2]]
mean <- base::mean(c(data[[S1]], data[[S2]]))
} else {
# For non-numeric variables, set diff and mean to NA
diff <- NA
mean <- NA
}

output <- data.frame(
id = data[[id]],
Expand All @@ -140,9 +150,15 @@ make_mean_diffs_ram_optimized <- function(data, id, sex, race, demographics,
variable_mean = mean
)
} else if (data[, "order"] == "s2") {
# no need to be yelled at by r for subtracting strings)
diff <- suppressMessages(suppressWarnings(data[[S2]] - data[[S1]]))
mean <- suppressMessages(suppressWarnings(base::mean(c(data[[S1]], data[[S2]]))))
if (is_numeric_var) {
# For numeric variables, compute diff and mean
diff <- data[[S2]] - data[[S1]]
mean <- base::mean(c(data[[S1]], data[[S2]]))
} else {
# For non-numeric variables, set diff and mean to NA
diff <- NA
mean <- NA
}

output <- data.frame(
id = data[[id]],
Expand Down Expand Up @@ -321,8 +337,19 @@ make_mean_diffs_fast <- function(data, id, sex, race, demographics,
data[[paste0(var, pair_identifiers[1])]]
)

diff <- var1 - var2
mean_ <- (var1 + var2) / 2
# Check if variable is numeric
is_numeric_var <- is.numeric(data[[paste0(var, pair_identifiers[1])]]) &&
is.numeric(data[[paste0(var, pair_identifiers[2])]])

if (is_numeric_var) {
# For numeric variables, compute diff and mean
diff <- var1 - var2
mean_ <- (var1 + var2) / 2
} else {
# For non-numeric variables, set diff and mean to NA
diff <- rep(NA, length(var1))
mean_ <- rep(NA, length(var1))
}

tmp <- data.frame(
id = data[[id]],
Expand Down
188 changes: 188 additions & 0 deletions tests/testthat/test-categorical_predictors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# Tests for non-numeric (categorical) predictors

test_that("discord_data works with non-numeric predictors", {
# Create sample data with categorical predictor
set.seed(2023)

# Create a simple dataset with a categorical predictor "location"
test_data <- data.frame(
id = 1:10,
age_s1 = c(25, 30, 35, 40, 45, 50, 55, 60, 65, 70),
age_s2 = c(23, 28, 33, 38, 43, 48, 53, 58, 63, 68),
location_s1 = c("south", "north", "south", "north", "south",
"north", "south", "north", "south", "north"),
location_s2 = c("south", "south", "north", "north", "south",
"south", "north", "north", "south", "south")
)

# Test discord_data with categorical predictor
result <- discord_data(
data = test_data,
outcome = "age",
predictors = "location",
id = "id",
sex = NULL,
race = NULL,
pair_identifiers = c("_s1", "_s2"),
demographics = "none"
)

# Check that the result has the expected columns
expect_true("location_1" %in% names(result))
expect_true("location_2" %in% names(result))
expect_true("location_diff" %in% names(result))
expect_true("location_mean" %in% names(result))

# Check that location_1 and location_2 are preserved correctly
expect_true(all(result$location_1 %in% c("south", "north")))
expect_true(all(result$location_2 %in% c("south", "north")))

# Check that location_diff and location_mean are NA for non-numeric variables
expect_true(all(is.na(result$location_diff)))
expect_true(all(is.na(result$location_mean)))

# Check that age diff and mean are still computed for numeric variables
expect_true(all(!is.na(result$age_diff)))
expect_true(all(!is.na(result$age_mean)))
})

test_that("discord_data works with mixed numeric and non-numeric predictors", {
set.seed(2023)

# Create a dataset with both numeric and categorical predictors
test_data <- data.frame(
id = 1:10,
outcome_s1 = c(100, 110, 120, 130, 140, 150, 160, 170, 180, 190),
outcome_s2 = c(95, 105, 115, 125, 135, 145, 155, 165, 175, 185),
income_s1 = c(50000, 60000, 70000, 80000, 90000,
100000, 110000, 120000, 130000, 140000),
income_s2 = c(48000, 58000, 68000, 78000, 88000,
98000, 108000, 118000, 128000, 138000),
region_s1 = c("west", "east", "west", "east", "west",
"east", "west", "east", "west", "east"),
region_s2 = c("west", "west", "east", "east", "west",
"west", "east", "east", "west", "west")
)

# Test discord_data with both predictor types
result <- discord_data(
data = test_data,
outcome = "outcome",
predictors = c("income", "region"),
id = "id",
sex = NULL,
race = NULL,
pair_identifiers = c("_s1", "_s2"),
demographics = "none"
)

# Check numeric predictor (income) has valid diff and mean
expect_true(all(!is.na(result$income_diff)))
expect_true(all(!is.na(result$income_mean)))
expect_true(is.numeric(result$income_diff))
expect_true(is.numeric(result$income_mean))

# Check categorical predictor (region) has NA for diff and mean
expect_true(all(is.na(result$region_diff)))
expect_true(all(is.na(result$region_mean)))

# Check categorical values are preserved
expect_true(all(result$region_1 %in% c("west", "east")))
expect_true(all(result$region_2 %in% c("west", "east")))
})

test_that("discord_data with non-numeric predictors works with fast=FALSE", {
set.seed(2023)

test_data <- data.frame(
id = 1:5,
score_s1 = c(85, 90, 95, 100, 105),
score_s2 = c(80, 85, 90, 95, 100),
city_s1 = c("A", "B", "A", "B", "A"),
city_s2 = c("B", "A", "B", "A", "B")
)

result <- discord_data(
data = test_data,
outcome = "score",
predictors = "city",
id = "id",
sex = NULL,
race = NULL,
pair_identifiers = c("_s1", "_s2"),
demographics = "none",
fast = FALSE
)

# Verify results
expect_true("city_1" %in% names(result))
expect_true("city_2" %in% names(result))
expect_true(all(is.na(result$city_diff)))
expect_true(all(is.na(result$city_mean)))
expect_true(all(result$city_1 %in% c("A", "B")))
expect_true(all(result$city_2 %in% c("A", "B")))
})

test_that("discord_data with non-numeric predictors works with fast=TRUE", {
set.seed(2023)

test_data <- data.frame(
id = 1:5,
score_s1 = c(85, 90, 95, 100, 105),
score_s2 = c(80, 85, 90, 95, 100),
city_s1 = c("A", "B", "A", "B", "A"),
city_s2 = c("B", "A", "B", "A", "B")
)

result <- discord_data(
data = test_data,
outcome = "score",
predictors = "city",
id = "id",
sex = NULL,
race = NULL,
pair_identifiers = c("_s1", "_s2"),
demographics = "none",
fast = TRUE
)

# Verify results
expect_true("city_1" %in% names(result))
expect_true("city_2" %in% names(result))
expect_true(all(is.na(result$city_diff)))
expect_true(all(is.na(result$city_mean)))
expect_true(all(result$city_1 %in% c("A", "B")))
expect_true(all(result$city_2 %in% c("A", "B")))
})

test_that("discord_data with factor predictors", {
set.seed(2023)

# Test with factor type
test_data <- data.frame(
id = 1:5,
value_s1 = c(10, 20, 30, 40, 50),
value_s2 = c(9, 18, 27, 36, 45),
category_s1 = factor(c("low", "high", "low", "high", "low")),
category_s2 = factor(c("high", "low", "high", "low", "high"))
)

result <- discord_data(
data = test_data,
outcome = "value",
predictors = "category",
id = "id",
sex = NULL,
race = NULL,
pair_identifiers = c("_s1", "_s2"),
demographics = "none"
)

# Verify that factor values are preserved
expect_true("category_1" %in% names(result))
expect_true("category_2" %in% names(result))

# Verify that diff and mean are NA for factor variables
expect_true(all(is.na(result$category_diff)))
expect_true(all(is.na(result$category_mean)))
})