From dc50fd449f57eea8322442815254479ee0ce3af6 Mon Sep 17 00:00:00 2001 From: Soumya Ray Date: Tue, 29 Nov 2022 15:50:47 +0800 Subject: [PATCH] Compute functions for linear and logistic regression --- R/logit_compute.R | 129 ++++++++++++++++++++++++++++++++++ R/regression_compute.R | 24 +++++++ tests/testthat/test-compute.R | 10 +++ 3 files changed, 163 insertions(+) create mode 100644 R/logit_compute.R create mode 100644 R/regression_compute.R create mode 100644 tests/testthat/test-compute.R diff --git a/R/logit_compute.R b/R/logit_compute.R new file mode 100644 index 0000000..671c4b8 --- /dev/null +++ b/R/logit_compute.R @@ -0,0 +1,129 @@ +# Classification Matrix of a logit model +logit_classification <- function(log_model, predictions = NULL, threshold = 0.5) { + if (is.null(predictions)) { + predictions <- predict(log_model, newdata = log_model$data, type = "response") + } + + outcomes_df <- data.frame( + actuals = log_model$y, + predicted = ifelse(predictions >= threshold, 1, 0)) + + class_table <- xtabs(~ predicted + actuals, data = outcomes_df) + correct <- sum(diag(class_table)) + total <- sum(class_table) + accuracy <- (100 * correct / total) |> round(2) |> paste("%", sep = "") + perf <- performance(threshold, predictions, outcomes_df$actuals) + + list( + class_table = class_table, + correct = correct, + total = total, + accuracy = accuracy, + performance = perf + ) +} + +# pseudo-R^2 for any model given actual binary Y and model probabilities of Y +pseudo_rsq <- function(log_model) { + Y <- log_model$y + Yprob <- log_model$fitted.values + n <- length(Y) + Ytab <- table(Y) + + LL <- log_likelihood(Y, Yprob) # -105.0325 + bLL <- log_likelihood(Y, baseline_PofY(Y)) # -133.6794 + + LR <- -2*bLL - -2*LL + + cs_rsq <- 1 - exp((-2*LL - -2*bLL)/n) # 0.19914 + n_rsq <- cs_rsq / (1 - exp(-(-2*bLL/n))) # 0.30863 + + list( + LL = LL, + bLL = bLL, + cs_rsq = cs_rsq, + n_rsq = n_rsq + ) +} + +log_likelihood <- function(Y, PofY) { + sum( Y*log(PofY) + (1 - Y)*(log(1 - PofY)) ) +} + +baseline_PofY <- function(Y) { + Ytab <- table(Y) + Ytab[[2]] / sum(Ytab) # num of cases of 1 over total num of cases +} + + +## ROC Functions + +insample_logit_performance <- function(log_model, classfxn_data = NULL) { + if (is.null(classfxn_data)) classfxn_data <- log_model$data + dv <- as.character(log_model$terms[[2]]) + preds <- predict(log_model, newdata = classfxn_data, + type = "response") |> unname() + roc_curve(actuals = classfxn_data[, dv], predictions = preds) +} + +oosample_logit_performance <- function(log_model) { + outcomes <- k_fold(log_model) + roc_curve(outcomes$actuals, outcomes$predictions) +} + +roc_curve <- function(actuals, predictions) { + threshholds <- predictions |> sort() |> unique() + threshholds <- c(0, threshholds, 1) + sapply(threshholds, performance, + preds = predictions, actuals = actuals) |> t() +} + +logit_performance_point <- function(threshold, log_model, predictions = NULL, classfxn_data = NULL) { + dv <- as.character(log_model$terms[[2]]) + if (is.null(classfxn_data)) classfxn_data <- log_model$data + + if (is.null(predictions)) predictions <- predict(log_model, newdata = classfxn_data, type="response") + performance(threshold, preds = predictions, actuals = classfxn_data[, dv]) |> t() +} + +performance <- function(threshold, preds, actuals) { + thresh_pred_true <- ifelse(preds >= threshold, TRUE, FALSE) + actual_trues <- actuals == 1 + actual_falses <- actuals == 0 + tp <- (thresh_pred_true) & (actual_trues) + fp <- (thresh_pred_true) & (actual_falses) + c(fpr = sum(fp) / sum(actual_falses), + tpr = sum(tp) / sum(actual_trues), + threshold = threshold) +} + +## K=fold Cross Validation + +k_fold <- function(log_model, k = NULL) { + if (is.null(k)) k <- nrow(log_model$data) + cases <- nrow(log_model$data) |> seq_len() + shuffled_indices <- cases |> sample() + dataset <- log_model$data[shuffled_indices, ] + actuals <- log_model$y[shuffled_indices] + + folds <- cut(cases, breaks = k, labels = FALSE) + + kfold_predictions <- + sapply(1:k, \(kth) { fold_predict(kth, folds, log_model, dataset) }) |> + unlist() + + results <- data.frame(actuals, predictions = kfold_predictions) + + # unshuffle indices and return results + results[order(as.numeric(row.names(results))), ] +} + + # fold_predict(1, folds, log_model, dataset) +fold_predict <- function(kth, folds, log_model, dataset) { + test_indices <- which(folds == kth) + test_set <- dataset[test_indices, ] + train_set <- dataset[-test_indices, ] + trained_model <- update(log_model, data = train_set) + + predict(trained_model, newdata = test_set, type = "response") +} diff --git a/R/regression_compute.R b/R/regression_compute.R new file mode 100644 index 0000000..bc1c0af --- /dev/null +++ b/R/regression_compute.R @@ -0,0 +1,24 @@ +#' Computes log likelihood for linear regression +#' based on https://stats.stackexchange.com/questions/73196/recalculate-log-likelihood-from-a-simple-r-lm-model +#' set.seed(23982938) +#' y <- rnorm(1000) +#' x1 <- rnorm(1000) +#' x2 <- rnorm(1000) +#' reg <- lm(y ~ x1 + x2) +#' +#' loglik_lm(reg) +#' # compare against: logLik(reg) +loglik_lm <- function(lm_model) { + n <- nrow(lm_model$model) # sample size + k <- dim(model.matrix(lm_model))[2] # number of coefficients + df <- n - k # degrees of freedom + + y <- lm_model$model[, 1] # dv + y_hat <- lm_model$fitted.values # fitted dv + + sigma <- sqrt(sum((y - y_hat)^2) / df) # OLS residual standard error + sigma_ml <- sigma * sqrt(df / n) # ML residual standard error + + y_probs <- dnorm(y, mean = y_hat, sd = sigma_ml) + sum(log(y_probs)) +} diff --git a/tests/testthat/test-compute.R b/tests/testthat/test-compute.R new file mode 100644 index 0000000..1c6ff9b --- /dev/null +++ b/tests/testthat/test-compute.R @@ -0,0 +1,10 @@ +# Linear regression plotting +test_that("Log likelihood of linear regression correct computed", { + set.seed(23982938) + y <- rnorm(1000) + x1 <- rnorm(1000) + x2 <- rnorm(1000) + reg <- lm(y ~ x1 + x2) + + expect_equal(compstatslib:::loglik_lm(reg), logLik(reg)[1]) +})