diff --git a/.Rbuildignore b/.Rbuildignore index a0f6a52..d92b3e9 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,12 @@ ^vignettes/articles/*_files$ ^vignettes/articles$ ^codecov\.yml$ +^data-raw$ +^README\.Rmd$ +^\.lintr\.R$ +^_quarto\.yml$ +^.*\.qmd$ +^.*\.png$ +^Vary.*\.r$ +^sim_.*\.r$ +^simulation.*\.qmd$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 0000000..f01eb71 --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,48 @@ +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: R-CMD-check + +permissions: + contents: read + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + + env: + GITHUB_PAT: ${{ github.token }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true + build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 0000000..f238786 --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,46 @@ +on: + push: + branches: [main, master] + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +permissions: + contents: write + pages: write + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ github.token }} + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. + needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@v4 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.gitignore b/.gitignore index cbe14e1..5b09519 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,10 @@ simulation1_edited_files simulation1_edited.html *.rmarkdown **/.quarto/ -data/ +data/* +!data/.gitignore +!data/mock_posterior_draws.rda +!data/mock_case_data.rda /.quarto/ *.pdf README_files diff --git a/DESCRIPTION b/DESCRIPTION index 6b43288..52f29cd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: shigella Title: What the Package Does (One Line, Title Case) -Version: 0.0.0.9004 +Version: 0.0.0.9005 Authors@R: c( person("Kwan Ho", "Lee", , "ksjlee@ucdavis.edu", role = c("aut", "cre")), person("Douglas Ezra", "Morrison", , "demorrison@ucdavis.edu", role = c("aut"), @@ -9,14 +9,15 @@ Description: What the package does (one paragraph). License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Imports: + cli, dplyr, - future, - future.apply, ggplot2, - gridExtra, - serodynamics (>= 0.0.0.9011) + rlang, + serocalculator, + tibble, + tidyr URL: https://ucd-serg.github.io/shigella/ Remotes: UCD-SERG/serodynamics @@ -26,8 +27,11 @@ Suggests: coda, forcats, furrr, + future, + future.apply, ggeasy, ggmcmc, + gridExtra, gtsummary, haven, here, @@ -42,15 +46,18 @@ Suggests: purrr, quarto, readxl, + rmarkdown, runjags, scales, + serodynamics (>= 0.0.0.9011), spelling, table1, - tibble, - tidyr, + testthat (>= 3.0.0), tidyverse +VignetteBuilder: knitr Depends: R (>= 3.5) LazyData: true +Config/testthat/edition: 3 Config/Needs/website: quarto Language: en-US diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..aec8417 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2024 +COPYRIGHT HOLDER: Kwan Ho Lee, Douglas Ezra Morrison diff --git a/NAMESPACE b/NAMESPACE index 56e7b54..0d108b7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,12 @@ # Generated by roxygen2: do not edit by hand -export(postprocess_jags_output) +export(compute_residual_metrics) +export(fig2_overall_newperson) +export(fmt_mci) +export(make_model_comparison_table) +export(model_comparison_table) +export(prep_newperson_params) +export(process_shigella_data) importFrom(dplyr,any_of) importFrom(dplyr,filter) importFrom(dplyr,mutate) @@ -13,3 +19,5 @@ importFrom(ggplot2,geom_point) importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) importFrom(ggplot2,theme_minimal) +importFrom(rlang,.data) +importFrom(rlang,ensym) diff --git a/NEWS.md b/NEWS.md index 637844b..943bd49 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,7 @@ * added `.lintr` configuration file * Moved helper functions into `R/` subdirectory (#1) -* simplified spell-check Github Action (#3) +* simplified spell-check GitHub Action (#3) # shigella 0.0.0 diff --git a/R/compute_residual_metrics.R b/R/compute_residual_metrics.R new file mode 100644 index 0000000..89f124a --- /dev/null +++ b/R/compute_residual_metrics.R @@ -0,0 +1,183 @@ +#' Residual-based posterior predictive metrics for longitudinal antibody curves +#' +#' Computes residuals between observed antibody measurements and posterior +#' median predictions evaluated at the observed time points. Returns pointwise +#' residuals or aggregated error metrics (MAE, RMSE, SSE) at multiple +#' summary levels. +#' +#' @param model A data frame of posterior draws in long format with columns: +#' \code{Subject}, \code{Iso_type}, \code{Chain}, \code{Iteration}, +#' \code{Parameter}, \code{value}. +#' @param dataset A \code{serodynamics} case dataset produced by +#' \code{serodynamics::as_case_data()} (must contain \code{id} and +#' \code{antigen_iso} columns, and time/value attributes). +#' @param ids Character vector of subject IDs to include (matched against +#' \code{dataset$id} and \code{model$Subject}). +#' @param antigen_iso Character scalar specifying the antigen/isotype to analyze +#' (matched against \code{dataset$antigen_iso} and \code{model$Iso_type}). +#' @param scale Scale on which to compute residuals. One of \code{"original"} +#' or \code{"log"}. If \code{"log"}, residuals are computed on the natural +#' log scale and observations/predictions \eqn{\le 0} are removed. +#' @param summary_level Level at which to summarize metrics. One of: +#' \describe{ +#' \item{\code{"pointwise"}}{ +#' Return pointwise residuals at each observed time point. +#' } +#' \item{\code{"id_antigen"}}{ +#' Summarize by \code{id} and \code{antigen_iso}. +#' } +#' \item{\code{"antigen"}}{Summarize by \code{antigen_iso} only.} +#' \item{\code{"overall"}}{Single summary across all included observations.} +#' } +#' +#' @return A tibble. For \code{summary_level = "pointwise"}, returns +#' per-observation residuals. Otherwise returns MAE, RMSE, SSE, and +#' \code{n_obs} at the requested summary level. +#' +#' @details +#' The posterior predictive summary uses the posterior median at each observed +#' time point, with a 95\% credible interval computed from draw-level +#' predictions. Predictions are generated by +#' \code{\link{predict_posterior_at_times}}. +#' +#' @examples +#' \dontrun{ +#' # Per-ID error metrics (original scale) +#' m_id <- compute_residual_metrics( +#' model = overall_sf2a, +#' dataset = dL_clean_sf2a, +#' ids = unique(dL_clean_sf2a$id), +#' antigen_iso = "IgG", +#' scale = "original", +#' summary_level = "id_antigen" +#' ) +#' +#' # Pointwise residuals on log scale +#' r_pw <- compute_residual_metrics( +#' model = overall_sf2a, +#' dataset = dL_clean_sf2a, +#' ids = "SOSAR-22008", +#' antigen_iso = "IgA", +#' scale = "log", +#' summary_level = "pointwise" +#' ) +#' } +#' +#' @export +compute_residual_metrics <- function(model, + dataset, + ids, + antigen_iso, + scale = c("original", "log"), + summary_level = c( + "id_antigen", + "pointwise", + "antigen", + "overall" + )) { + + scale <- match.arg(scale) + summary_level <- match.arg(summary_level) + + time_var <- get_timeindays_var(dataset) + value_var <- serocalculator::get_values_var(dataset) + + observed_data <- dataset |> + dplyr::rename( + t = !!rlang::sym(time_var), + obs = !!rlang::sym(value_var) + ) |> + dplyr::select(dplyr::all_of(c("id", "t", "obs", "antigen_iso"))) |> + dplyr::mutate(id = as.character(.data$id)) |> + dplyr::filter(.data$id %in% ids, .data$antigen_iso == antigen_iso) + + if (nrow(observed_data) == 0) { + cli::cli_abort( + "No observed data found for the specified IDs and antigen_iso." + ) + } + + obs_times <- sort(unique(observed_data$t)) + + predictions_all <- predict_posterior_at_times( + model = model, + ids = ids, + antigen_iso = antigen_iso, + times = obs_times + ) + + pred_summary <- predictions_all |> + dplyr::summarise( + .by = c("id", "t"), + pred_med = stats::median(.data$res, na.rm = TRUE), + pred_lower = stats::quantile(.data$res, probs = 0.025, na.rm = TRUE), + pred_upper = stats::quantile(.data$res, probs = 0.975, na.rm = TRUE) + ) |> + dplyr::mutate(id = as.character(.data$id)) + + residual_data <- observed_data |> + dplyr::inner_join(pred_summary, by = c("id", "t")) + + if (scale == "log") { + residual_data <- residual_data |> + dplyr::filter(.data$obs > 0, .data$pred_med > 0) |> + dplyr::mutate( + residual = log(.data$obs) - log(.data$pred_med), + abs_residual = abs(.data$residual), + sq_residual = .data$residual^2 + ) + } else { + residual_data <- residual_data |> + dplyr::mutate( + residual = .data$obs - .data$pred_med, + abs_residual = abs(.data$residual), + sq_residual = .data$residual^2 + ) + } + + if (summary_level == "pointwise") { + return(residual_data |> + dplyr::select( + "id", + "antigen_iso", + "t", + "obs", + "pred_med", + "pred_lower", + "pred_upper", + "residual", + "abs_residual", + "sq_residual" + )) + } + + if (summary_level == "id_antigen") { + return(residual_data |> + dplyr::summarise( + .by = c("id", "antigen_iso"), + MAE = mean(.data$abs_residual, na.rm = TRUE), + RMSE = sqrt(mean(.data$sq_residual, na.rm = TRUE)), + SSE = sum(.data$sq_residual, na.rm = TRUE), + n_obs = dplyr::n() + )) + } + + if (summary_level == "antigen") { + return(residual_data |> + dplyr::summarise( + .by = "antigen_iso", + MAE = mean(.data$abs_residual, na.rm = TRUE), + RMSE = sqrt(mean(.data$sq_residual, na.rm = TRUE)), + SSE = sum(.data$sq_residual, na.rm = TRUE), + n_obs = dplyr::n() + )) + } + + residual_data |> + dplyr::summarise( + MAE = mean(.data$abs_residual, na.rm = TRUE), + RMSE = sqrt(mean(.data$sq_residual, na.rm = TRUE)), + SSE = sum(.data$sq_residual, na.rm = TRUE), + n_obs = dplyr::n() + ) +} diff --git a/R/create_incidence_table.R b/R/create_incidence_table.R deleted file mode 100644 index 6a965f4..0000000 --- a/R/create_incidence_table.R +++ /dev/null @@ -1,17 +0,0 @@ -# create table of incidence.rate of each region -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) - country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} \ No newline at end of file diff --git a/R/create_noise_df.R b/R/create_noise_df.R deleted file mode 100644 index f4640d6..0000000 --- a/R/create_noise_df.R +++ /dev/null @@ -1,12 +0,0 @@ -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} \ No newline at end of file diff --git a/R/create_xs_data.R b/R/create_xs_data.R deleted file mode 100644 index 9a0f0a3..0000000 --- a/R/create_xs_data.R +++ /dev/null @@ -1,29 +0,0 @@ -# create function for generating specific region data -create_xs_data <- function(df, filter_countries, filter_antigen_iso, value_col) { - df %>% - # First, create/rename columns - mutate( - id = sid, - Country = site_name, - study = study_name, - age = age, - antigen_iso = factor(isotype_name), - value = {{ value_col } - } # value_col is specified by the user, e.g. n_ipab_MFI - ) %>% - # Then filter by the desired catchment(s) and antigen iso(s) - filter( - Country %in% filter_countries, - antigen_iso %in% filter_antigen_iso - ) %>% - # Create age categories - mutate( - ageCat = factor(case_when( - age < 5 ~ "<5", - age >= 5 & age <= 15 ~ "5-15", - age > 15 ~ "16+" - )) - ) %>% - # Optionally select only the needed columns - select(id, Country, study, age, antigen_iso, value, ageCat) -} \ No newline at end of file diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..dd553d6 --- /dev/null +++ b/R/data.R @@ -0,0 +1,65 @@ +# Mock data documentation + +#' Mock posterior draws for testing +#' +#' A mock dataset of posterior parameter draws in long format, mimicking the +#' structure expected by functions like \code{\link{compute_residual_metrics}} +#' and \code{\link{predict_posterior_at_times}}. +#' +#' @format A data frame with columns: +#' \describe{ +#' \item{Subject}{Character. Subject ID (e.g., "newperson", "SOSAR-22008")} +#' \item{Iso_type}{Character. Isotype (e.g., "IgG", "IgA")} +#' \item{Chain}{Integer. MCMC chain number} +#' \item{Iteration}{Integer. MCMC iteration number} +#' \item{Parameter}{Character. Parameter name (y0, y1, t1, alpha, shape)} +#' \item{value}{Numeric. Parameter value} +#' } +#' +#' @details +#' This is synthetic data generated for testing and examples. Real Shigella +#' posterior draws will be added to the package separately. +#' +#' Parameters represent: +#' \itemize{ +#' \item \code{y0}: Baseline antibody level +#' \item \code{y1}: Peak antibody level +#' \item \code{t1}: Time to peak (days) +#' \item \code{alpha}: Decay rate parameter +#' \item \code{shape}: Decay shape parameter (rho) +#' } +#' +#' @examples +#' head(mock_posterior_draws) +#' table(mock_posterior_draws$Parameter) +"mock_posterior_draws" + +#' Mock case data for testing +#' +#' A mock longitudinal antibody dataset compatible with +#' \code{serodynamics::as_case_data()}, for testing functions like +#' \code{\link{compute_residual_metrics}}. +#' +#' @format A data frame with class \code{c("case_data", "data.frame")} +#' and columns: +#' \describe{ +#' \item{id}{Character. Subject ID} +#' \item{antigen_iso}{Character. Isotype (e.g., "IgG", "IgA")} +#' \item{timepoint}{Numeric. Time in days since infection} +#' \item{value}{Numeric. Antibody measurement} +#' } +#' +#' @details +#' This is synthetic data generated for testing and examples. Real Shigella +#' case data will be added separately. +#' +#' The dataset has attributes: +#' \itemize{ +#' \item \code{attr(mock_case_data, "timeindays") = "timepoint"} +#' \item \code{attr(mock_case_data, "value_var") = "value"} +#' } +#' +#' @examples +#' head(mock_case_data) +#' table(mock_case_data$id) +"mock_case_data" diff --git a/R/fig2_overall_newperson.R b/R/fig2_overall_newperson.R new file mode 100644 index 0000000..2c2e35b --- /dev/null +++ b/R/fig2_overall_newperson.R @@ -0,0 +1,120 @@ +utils::globalVariables(c("antigen", "iso")) + +#' Summarize population-level ("newperson") antibody trajectories +#' from overall models +#' +#' Computes median and credible interval bands of the antibody trajectory for a +#' hypothetical new individual drawn from the population distribution +#' ("newperson"). +#' +#' @param overall_models Named list of posterior draws in long format +#' (one per antigen), +#' with columns at least: Subject, Iso_type, Chain, Iteration, +#' Parameter, value. +#' @param osps Character vector of antigen names +#' (must match names in `overall_models`). +#' @param ids Character vector of Subject IDs to include (default: "newperson"). +#' @param isotypes Character vector of isotypes +#' (default: c("IgG", "IgA")). +#' @param t_grid Numeric vector of time points (days) to evaluate. +#' @param cred Credible level (default 0.95). +#' @param log_y Logical; if TRUE, applies log10 scale to y when returning plot. +#' @param xlim Optional numeric length-2 vector for x-axis limits when +#' returning plot. +#' @param ylab Y-axis label for plot. +#' @param line_color Line color for plot. +#' @param ribbon_alpha Alpha for credible ribbon in plot. +#' @param facet_scales Passed to ggplot facet scales. +#' @param return_data If TRUE, returns a list with `plot` and `data`. +#' +#' @return By default, a ggplot. If `return_data = TRUE`, returns +#' list(plot = p, data = df). +#' +#' @details +#' Requires that each model draw can be pivoted to wide parameters including: +#' y0, y1, t1, alpha, shape. +#' +#' @export +fig2_overall_newperson <- function( + overall_models, + osps = c("IpaB", "Sf2a", "Sf3a", "Sf6", "Sonnei"), + ids = "newperson", + isotypes = c("IgG", "IgA"), + t_grid = seq(0, 210, by = 5), + cred = 0.95, + log_y = TRUE, + xlim = c(0, 210), + ylab = "Normalized MFI", + line_color = "#1f77b4", + ribbon_alpha = 0.20, + facet_scales = "fixed", + return_data = FALSE +) { + q_lo <- (1 - cred) / 2 + q_hi <- 1 - q_lo + + get_sum_overall <- function(model_df, osp, iso) { + model_df |> + dplyr::filter(.data$Subject %in% ids, .data$Iso_type == iso) |> + dplyr::select(.data$Chain, .data$Iteration, .data$Iso_type, + .data$Parameter, .data$value, .data$Subject) |> + tidyr::pivot_wider( + names_from = .data$Parameter, + values_from = .data$value + ) |> + dplyr::mutate( + antigen = osp, + iso = factor(.data$Iso_type, levels = c("IgG", "IgA")) + ) |> + tidyr::crossing(t = t_grid) |> + dplyr::mutate( + res = ab( + t = .data$t, + y0 = .data$y0, + y1 = .data$y1, + t1 = .data$t1, + alpha = .data$alpha, + shape = .data$shape + ) + ) |> + dplyr::filter(is.finite(.data$res), .data$res > 0) |> + dplyr::summarise( + res.med = stats::quantile(.data$res, 0.50, na.rm = TRUE), + res.low = stats::quantile(.data$res, q_lo, na.rm = TRUE), + res.high = stats::quantile(.data$res, q_hi, na.rm = TRUE), + .by = c("antigen", "iso", "t") + ) + } + + sum_all <- lapply(osps, function(osp) { + dplyr::bind_rows(lapply(isotypes, function(iso) { + get_sum_overall(overall_models[[osp]], osp, iso) + })) + }) |> + dplyr::bind_rows() |> + dplyr::mutate(antigen = factor(.data$antigen, levels = osps)) + + p <- ggplot2::ggplot(sum_all, ggplot2::aes(x = .data$t)) + + ggplot2::geom_ribbon( + ggplot2::aes(ymin = .data$res.low, ymax = .data$res.high), + alpha = ribbon_alpha + ) + + ggplot2::geom_line( + ggplot2::aes(y = .data$res.med), + linewidth = 1.1, + color = line_color, + lineend = "round" + ) + + ggplot2::labs(x = "Days since fever onset", y = ylab) + + ggplot2::facet_grid(rows = ggplot2::vars(antigen), + cols = ggplot2::vars(iso), + scales = facet_scales) + + ggplot2::theme_minimal() + + ggplot2::theme(legend.position = "none") + + if (log_y) p <- p + ggplot2::scale_y_log10() + if (!is.null(xlim)) p <- p + ggplot2::coord_cartesian(xlim = xlim) + + if (return_data) return(list(plot = p, data = sum_all)) + p +} diff --git a/R/fmt_mci.R b/R/fmt_mci.R new file mode 100644 index 0000000..3cc4162 --- /dev/null +++ b/R/fmt_mci.R @@ -0,0 +1,21 @@ +#' Format median and credible interval as a single string +#' +#' @param med Median value. +#' @param lo Lower bound. +#' @param hi Upper bound. +#' @param digits Number of digits. +#' @param sci Logical; if TRUE uses scientific notation. +#' +#' @return A string like "1.23 (0.50--2.00)". +#' +#' @export +fmt_mci <- function(med, lo, hi, digits = 2, sci = FALSE) { + f <- function(x) { + if (sci) { + formatC(x, format = "e", digits = digits) + } else { + formatC(x, format = "f", digits = digits) + } + } + sprintf("%s (%s--%s)", f(med), f(lo), f(hi)) +} diff --git a/R/generate_final_table.R b/R/generate_final_table.R deleted file mode 100644 index d84a3f0..0000000 --- a/R/generate_final_table.R +++ /dev/null @@ -1,28 +0,0 @@ -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:200) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} diff --git a/R/model_comparison_table.R b/R/model_comparison_table.R new file mode 100644 index 0000000..4f1418a --- /dev/null +++ b/R/model_comparison_table.R @@ -0,0 +1,171 @@ +#' Compare summary residual metrics between two model outputs +#' +#' Builds a compact comparison table from pre-computed summary metrics for an +#' overall model and a pointwise model. +#' +#' @param metrics_overall Data frame with one row containing at least +#' \code{MAE}, \code{RMSE}, \code{SSE}, and \code{n_obs} +#' for the overall model. +#' @param metrics_pointwise Data frame with one row containing at least +#' \code{MAE}, \code{RMSE}, \code{SSE}, and \code{n_obs} +#' for the pointwise model. +#' @param model_overall_label Label used for the overall model row. +#' @param model_pointwise_label Label used for the pointwise model row. +#' +#' @return A tibble with one row per model and comparison columns: +#' \code{delta_MAE}, \code{delta_RMSE}, \code{pct_improve_MAE}, +#' \code{pct_improve_RMSE}. +#' +#' @export +model_comparison_table <- function(metrics_overall, + metrics_pointwise, + model_overall_label = "Overall Model", + model_pointwise_label = "Pointwise Model") { + out <- dplyr::bind_rows( + dplyr::mutate( + metrics_overall, + Model = model_overall_label + ), + dplyr::mutate( + metrics_pointwise, + Model = model_pointwise_label + ) + ) |> + dplyr::select( + "Model", + "MAE", + "RMSE", + "SSE", + "n_obs" + ) + + base_mae <- metrics_overall$MAE[[1]] + base_rmse <- metrics_overall$RMSE[[1]] + + out |> + dplyr::mutate( + delta_MAE = .data$MAE - base_mae, + delta_RMSE = .data$RMSE - base_rmse, + pct_improve_MAE = dplyr::case_when( + is.na(base_mae) ~ NA_real_, + abs(base_mae) <= .Machine$double.eps ~ NA_real_, + TRUE ~ 100 * .data$delta_MAE / base_mae + ), + pct_improve_RMSE = dplyr::case_when( + is.na(base_rmse) ~ NA_real_, + abs(base_rmse) <= .Machine$double.eps ~ NA_real_, + TRUE ~ 100 * .data$delta_RMSE / base_rmse + ) + ) +} + +#' Compare serotype-specific vs overall models using residual metrics +#' +#' Computes per-ID residual metrics for two models on the intersection of IDs +#' present in both datasets, then reports absolute and percent differences. +#' +#' @param model_serospecific Posterior draws (long format) +#' for the serotype-specific model. +#' @param data_serospecific Case dataset used for the serotype-specific model. +#' @param model_overall Posterior draws (long format) for the overall model. +#' @param data_overall Case dataset used for the overall model. +#' @param antigen_iso Character scalar antigen/isotype label. +#' @param scale "original" or "log". +#' @param tie_tol Numeric tolerance to declare a tie. +#' +#' @return A tibble with per-ID MAE/RMSE for each model and +#' deltas/winner labels. +#' +#' @export +make_model_comparison_table <- function(model_serospecific, data_serospecific, + model_overall, data_overall, + antigen_iso, + scale = c("original", "log"), + tie_tol = 1e-8) { + scale <- match.arg(scale) + + ids_common <- intersect(unique(data_serospecific$id), unique(data_overall$id)) + # TODO: add early error for empty ids_common + + m_sero <- compute_residual_metrics( + model = model_serospecific, + dataset = data_serospecific, + ids = ids_common, + antigen_iso = antigen_iso, + scale = scale, + summary_level = "id_antigen" + ) |> + dplyr::select( + .data$id, + .data$antigen_iso, + .data$MAE, + .data$RMSE, + .data$n_obs + ) |> + dplyr::rename( + MAE_serospecific = .data$MAE, + RMSE_serospecific = .data$RMSE, + n_obs_serospecific = .data$n_obs + ) + + m_over <- compute_residual_metrics( + model = model_overall, + dataset = data_overall, + ids = ids_common, + antigen_iso = antigen_iso, + scale = scale, + summary_level = "id_antigen" + ) |> + dplyr::select( + .data$id, + .data$antigen_iso, + .data$MAE, + .data$RMSE, + .data$n_obs + ) |> + dplyr::rename( + MAE_overall = .data$MAE, + RMSE_overall = .data$RMSE, + n_obs_overall = .data$n_obs + ) + + dplyr::full_join(m_sero, m_over, by = c("id", "antigen_iso")) |> + dplyr::mutate( + delta_MAE = .data$MAE_overall - .data$MAE_serospecific, + delta_RMSE = .data$RMSE_overall - .data$RMSE_serospecific, + pct_improve_MAE = dplyr::case_when( + is.na(.data$MAE_overall) ~ NA_real_, + abs(.data$MAE_overall) <= .Machine$double.eps ~ NA_real_, + TRUE ~ 100 * .data$delta_MAE / .data$MAE_overall + ), + pct_improve_RMSE = dplyr::case_when( + is.na(.data$RMSE_overall) ~ NA_real_, + abs(.data$RMSE_overall) <= .Machine$double.eps ~ NA_real_, + TRUE ~ 100 * .data$delta_RMSE / .data$RMSE_overall + ), + best_MAE = dplyr::case_when( + is.na(.data$MAE_overall) | is.na(.data$MAE_serospecific) ~ + NA_character_, + abs(.data$delta_MAE) <= tie_tol ~ "tie", + .data$delta_MAE > 0 ~ "serospecific", + TRUE ~ "overall" + ), + best_RMSE = dplyr::case_when( + is.na(.data$RMSE_overall) | is.na(.data$RMSE_serospecific) ~ + NA_character_, + abs(.data$delta_RMSE) <= tie_tol ~ "tie", + .data$delta_RMSE > 0 ~ "serospecific", + TRUE ~ "overall" + ), + best_overall = dplyr::case_when( + .data$best_MAE == "serospecific" & + .data$best_RMSE == "serospecific" ~ "serospecific", + .data$best_MAE == "overall" & + .data$best_RMSE == "overall" ~ "overall", + .data$best_MAE == "tie" & + .data$best_RMSE == "tie" ~ "tie", + TRUE ~ "mixed" + ) + ) |> + dplyr::arrange(.data$id) +} diff --git a/R/plot_summary_metrics.R b/R/plot_summary_metrics.R deleted file mode 100644 index ce58750..0000000 --- a/R/plot_summary_metrics.R +++ /dev/null @@ -1,14 +0,0 @@ -plot_summary_metrics <- function(summary_metrics) { - summary_metrics |> - ggplot() + - aes(x = sample_size, y = empirical_se, color = Age_Group) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -} \ No newline at end of file diff --git a/R/postprocess_jags_output.R b/R/postprocess_jags_output.R deleted file mode 100644 index 32bfd22..0000000 --- a/R/postprocess_jags_output.R +++ /dev/null @@ -1,47 +0,0 @@ -#' Postprocess JAGS output -#' -#' @param jags_output output from [runjags::run.jags()] -#' -#' @returns a [tibble::tbl_df] -#' @export -#' -postprocess_jags_output <- function(jags_output) { - mcmc_list <- coda::as.mcmc.list(jags_output) - - mcmc_df <- ggmcmc::ggs(mcmc_list) - - wide_predpar_df <- mcmc_df |> - mutate( - parameter = sub("^(\\w+)\\[.*", "\\1", .data$Parameter), - index_id = as.numeric(sub("^\\w+\\[(\\d+),.*", "\\1", .data$Parameter)), - antigen_iso = as.numeric(sub("^\\w+\\[\\d+,(\\d+).*", "\\1", .data$Parameter)) - ) |> - mutate( - index_id = factor(index_id, labels = c(unique(dL_clean$index_id), "newperson")), - antigen_iso = factor(antigen_iso, labels = unique(dL_clean$antigen_iso)) - ) |> - filter(index_id == "newperson") |> - select(-Parameter) |> - pivot_wider(names_from = "parameter", values_from = "value") |> - rowwise() |> - droplevels() |> - ungroup() |> - rename(r = shape) - - # Assuming wide_predpar_df is your data frame - curve_params <- wide_predpar_df - - # Set class and attributes for serocalculator - class(curve_params) <- c("curve_params", class(curve_params)) - antigen_isos <- unique(curve_params$antigen_iso) - attr(curve_params, "antigen_isos") <- antigen_isos - - to_return <- curve_params |> - mutate( - iter = Iteration, - chain = Chain, - ) |> - select(antigen_iso, iter, chain, y0, y1, t1, alpha, r) - - return(to_return) -} diff --git a/R/predict_posterior_at_times.R b/R/predict_posterior_at_times.R new file mode 100644 index 0000000..b7a505c --- /dev/null +++ b/R/predict_posterior_at_times.R @@ -0,0 +1,92 @@ +#' Posterior predictions at specified times for given subjects +#' and antigen/isotype +#' +#' Generates draw-level posterior predictions of the antibody trajectory at +#' user-specified time points, for one or more subjects and a selected +#' antigen/isotype. This is a low-level helper used by +#' residual-based posterior predictive diagnostics. +#' +#' @param model A data frame of posterior draws in long format with columns: +#' \code{Subject}, \code{Iso_type}, \code{Chain}, \code{Iteration}, +#' \code{Parameter}, \code{value}. +#' @param ids Character vector of subject IDs to include +#' (matched against \code{Subject}). +#' @param antigen_iso Character scalar specifying the antigen/isotype to include +#' (matched against \code{Iso_type}). +#' @param times Numeric vector of time points (days) at which to evaluate +#' predictions. +#' +#' @return A tibble with one row per +#' (posterior draw \eqn{\times} time \eqn{\times} subject), including the +#' evaluated prediction \code{res}. Output includes at least: +#' \describe{ +#' \item{id}{Subject ID (character).} +#' \item{t}{Time (days) at which prediction was evaluated.} +#' \item{Chain}{MCMC chain index (if present in \code{model}).} +#' \item{Iteration}{MCMC iteration index (if present in \code{model}).} +#' \item{sample_id}{Row index for the draw (added if missing).} +#' \item{y0, y1, t1, alpha, shape}{Model parameters (wide).} +#' \item{res}{Predicted antibody level at time \code{t}.} +#' } +#' +#' @details +#' This function pivots posterior draws to wide format (parameters as columns), +#' expands them over \code{times}, and evaluates the antibody curve via +#' an internal implementation of the antibody kinetics model using parameters +#' \code{y0}, \code{y1}, \code{t1}, \code{alpha}, and \code{shape}. +#' +#' @seealso \code{\link{compute_residual_metrics}} +#' +#' @examples +#' \dontrun{ +#' preds <- predict_posterior_at_times( +#' model = overall_sf2a, +#' ids = "newperson", +#' antigen_iso = "IgG", +#' times = c(0, 30, 90, 180) +#' ) +#' } +#' +#' @keywords internal +predict_posterior_at_times <- function(model, ids, antigen_iso, times) { + + sr_model_sub <- model |> + dplyr::filter(.data$Subject %in% ids, .data$Iso_type == antigen_iso) + + param_wide <- sr_model_sub |> + dplyr::select(.data$Chain, .data$Iteration, .data$Iso_type, + .data$Parameter, .data$value, .data$Subject) |> + tidyr::pivot_wider( + names_from = .data$Parameter, + values_from = .data$value + ) |> + dplyr::arrange(.data$Chain, .data$Iteration) |> + dplyr::mutate( + antigen_iso = factor(.data$Iso_type), + id = as.character(.data$Subject) + ) |> + dplyr::select(-c("Iso_type", "Subject")) + + if (!"sample_id" %in% names(param_wide)) { + param_wide <- param_wide |> + dplyr::mutate(sample_id = dplyr::row_number()) + } + + dt <- tibble::tibble(t = times) |> + dplyr::mutate(idx = dplyr::row_number()) |> + tidyr::pivot_wider( + names_from = "idx", + values_from = "t", + names_prefix = "time" + ) |> + dplyr::slice(rep(seq_len(dplyr::n()), each = nrow(param_wide))) + + predictions <- cbind(param_wide, dt) |> + tidyr::pivot_longer(cols = dplyr::starts_with("time"), values_to = "t") |> + dplyr::select(-"name") |> + dplyr::mutate( + res = ab(.data$t, .data$y0, .data$y1, .data$t1, .data$alpha, .data$shape) + ) + + predictions +} diff --git a/R/prep_newperson_params.R b/R/prep_newperson_params.R new file mode 100644 index 0000000..b55d0a1 --- /dev/null +++ b/R/prep_newperson_params.R @@ -0,0 +1,26 @@ +#' Extract "newperson" parameter draws and summarize for Table 2 +#' +#' @param draws_long Posterior draws in long format with columns: +#' Subject, Iso_type, Chain, Iteration, Parameter, value. +#' @param antigen_label Character scalar antigen name to attach. +#' +#' @return A tibble of newperson draws in wide format with columns: +#' Iteration, Chain, antigen, Iso_type, y0, y1, t1, alpha, rho. +#' +#' @export +prep_newperson_params <- function(draws_long, antigen_label) { + draws_long |> + dplyr::filter(.data$Subject == "newperson") |> + dplyr::mutate(antigen = antigen_label) |> + dplyr::select(.data$Iteration, .data$Chain, .data$antigen, + .data$Iso_type, .data$Parameter, .data$value) |> + tidyr::pivot_wider( + names_from = .data$Parameter, + values_from = .data$value + ) |> + # The model output uses "shape" for the decay-shape parameter, + # which is renamed to "rho" for consistency + dplyr::rename(rho = .data$shape) |> + dplyr::select(.data$Iteration, .data$Chain, .data$antigen, .data$Iso_type, + .data$y0, .data$y1, .data$t1, .data$alpha, .data$rho) +} diff --git a/R/prepare_df_for_serocalculator.R b/R/prepare_df_for_serocalculator.R deleted file mode 100644 index 00f4c29..0000000 --- a/R/prepare_df_for_serocalculator.R +++ /dev/null @@ -1,22 +0,0 @@ -# Create function to clearly define cross-sectional data -prepare_df_for_serocalculator <- function(df, age_col = "age", value_col = "value") { - # Ensure correct column names - df <- df %>% - rename(age = all_of(age_col)) - - # Assign attributes for serocalculator - attr(df, "age_var") <- "age" - attr(df, "value_var") <- value_col - - # Check if serocalculator recognizes attributes - get_value_var <- serocalculator:::get_value_var - detected_value_var <- get_value_var(df) - - if (is.null(detected_value_var)) { - warning("serocalculator did not detect the 'value' column. Check column naming.") - } else { - message("serocalculator recognized 'value' column: ", detected_value_var) - } - - return(df) -} \ No newline at end of file diff --git a/R/process_shigella_data.R b/R/process_shigella_data.R index 8bb1d01..d1f56db 100644 --- a/R/process_shigella_data.R +++ b/R/process_shigella_data.R @@ -1,27 +1,57 @@ +#' Reshape Shigella longitudinal data for serodynamics workflows +#' +#' Filters a dataset to a given study and antigen column, standardizes column +#' names, and returns a visit-ordered long dataset suitable for conversion to +#' `serodynamics::as_case_data()`. +#' +#' @param data A data frame containing longitudinal measurements. +#' @param study_filter Character scalar. Value of `study_name` to keep +#' (e.g. "SOSAR"). +#' @param antigen Unquoted column name for the antigen measurement +#' (e.g. n_ipab_MFI). +#' +#' @return A tibble with standardized columns: +#' \describe{ +#' \item{index_id}{Participant ID (copied from `sid`).} +#' \item{antigen_iso}{Isotype label (copied from `isotype_name`).} +#' \item{visit}{Visit label (copied from `timepoint`).} +#' \item{timeindays}{Time since infection (copied from `Actual day`).} +#' \item{result}{Antibody measurement (from `antigen`).} +#' } +#' +#' @examples +#' \dontrun{ +#' dat_long <- process_shigella_data(df, "SOSAR", n_ipab_MFI) +#' dL <- serodynamics::as_case_data(dat_long, +#' id_var = "index_id", biomarker_var = "antigen_iso", +#' time_in_days = "timeindays", value_var = "result" +#' ) +#' } +#' +#' @importFrom rlang ensym +#' @export process_shigella_data <- function(data, study_filter, antigen) { - # Filter the data for the specific study - filtered_data <- data %>% - filter(study_name == study_filter) - - # Capture the column name of the antigen - antigen_col <- ensym(antigen) - - # Manipulate and restructure the data - processed_data <- filtered_data %>% - select(isotype_name, sid, timepoint, `Actual day`, !!antigen_col) %>% - mutate( - index_id = sid, - antigen_iso = isotype_name, - visit = timepoint, - timeindays = `Actual day`, - result = !!antigen_col - ) %>% - group_by(index_id, antigen_iso) %>% - arrange(visit) %>% - mutate(visit_num = rank(visit, ties.method = "first")) %>% - ungroup() %>% - # Remove rows with NA in timeindays - filter(!is.na(timeindays)) - - return(processed_data) -} \ No newline at end of file + antigen_col <- rlang::ensym(antigen) + + data |> + dplyr::filter(.data$study_name == study_filter) |> + dplyr::select( + .data$isotype_name, + .data$sid, + .data$timepoint, + .data$`Actual day`, + !!antigen_col + ) |> + dplyr::mutate( + index_id = .data$sid, + antigen_iso = .data$isotype_name, + visit = .data$timepoint, + timeindays = .data$`Actual day`, + result = !!antigen_col + ) |> + dplyr::group_by(.data$index_id, .data$antigen_iso) |> + dplyr::arrange(.data$visit, .by_group = TRUE) |> + dplyr::mutate(visit_num = rank(.data$visit, ties.method = "first")) |> + dplyr::ungroup() |> + dplyr::filter(!is.na(.data$timeindays)) +} diff --git a/R/shigella-package.R b/R/shigella-package.R index d809e4e..a0576ad 100644 --- a/R/shigella-package.R +++ b/R/shigella-package.R @@ -14,5 +14,6 @@ #' @importFrom ggplot2 ggplot #' @importFrom ggplot2 labs #' @importFrom ggplot2 theme_minimal +#' @importFrom rlang .data ## usethis namespace: end NULL diff --git a/R/simulate_seroincidence.R b/R/simulate_seroincidence.R deleted file mode 100644 index 1c7908b..0000000 --- a/R/simulate_seroincidence.R +++ /dev/null @@ -1,98 +0,0 @@ -# Define the simulation function -simulate_seroincidence <- function( - dmcmc, # Curve parameters - nrep, - n_sim, - observed, - range = NULL, - batch_size = 40, parallel = TRUE, - antibodies = c("IgG"), # Antigen-isotypes - dlims = rbind("IgG" = c(min = 0, max = 0.5)), # Biologic noise distribution - # Noise parameters - cond = tibble( - antigen_iso = c("IgG"), - nu = c(0.5), - eps = c(0.25), - y.low = c(25), - y.high = c(200000) - ) -) { - - lambda <- observed # Simulated incidence rate per person-year - - - - # Calculate number of batches - n_batches <- ceiling(n_sim / batch_size) - - # Parallel processing: Use future_map for efficiency - if (parallel) { - plan(multisession, workers = max(1, future::availableCores() - 1)) - - results <- future_map(1:n_batches, function(batch) { - # Run simulations within each batch - replicate(batch_size, expr = { - csdata <- sim_pop_data( - curve_params = dmcmc, - lambda = lambda, - n_samples = nrep, - age_range = range, - antigen_isos = antibodies, - n_mcmc_samples = 0, - renew_params = FALSE, # Keep the same params for speed - add_noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = FALSE, # Disable visualization for speed - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - list(csdata = csdata, est1 = est) - }, simplify = FALSE) # Keep list structure - }, .options = furrr_options(seed = TRUE)) # Set seed for reproducibility - - # Flatten nested list - results <- unlist(results, recursive = FALSE) - } else { - # Sequential execution (for debugging or single-core use) - results <- lapply(1:n_sim, function(i) { - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = FALSE, - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = FALSE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - list(csdata = csdata, est1 = est) - }) - } - - return(results) -} \ No newline at end of file diff --git a/R/utils_internal.R b/R/utils_internal.R new file mode 100644 index 0000000..ed966a0 --- /dev/null +++ b/R/utils_internal.R @@ -0,0 +1,63 @@ +#' Internal utility functions +#' +#' @keywords internal +#' @name utils_internal +NULL + +# NOTE: ab() and get_timeindays_var() are internal re-implementations of +# functions from the serodynamics package (ucdavis/serodynamics). +# They are duplicated here to avoid using ::: (non-exported access), +# which is not allowed in R packages passing R CMD check. +# If serodynamics updates these functions, this file must be updated to match. + +#' Get time variable name from a case_data object +#' +#' @param dataset A serodynamics case_data object +#' @return Character scalar with the time variable name +#' @keywords internal +#' @noRd +get_timeindays_var <- function(dataset) { + var <- attr(dataset, "timeindays") + if (is.null(var)) var <- "timeindays" + return(var) +} + +#' Compute log-linear rise rate for antibody kinetics +#' +#' Helper that computes the exponential rise rate beta = log(y1/y0) / t1, +#' used internally by `ab()`. +#' +#' @param y0 Baseline antibody level +#' @param y1 Peak antibody level +#' @param t1 Time to peak (days) +#' @return Numeric scalar: the log-linear rise rate +#' @keywords internal +#' @noRd +bt <- function(y0, y1, t1) { + to_return <- log(y1 / y0) / t1 + return(to_return) +} + +#' Antibody kinetics trajectory function +#' +#' Evaluates the antibody trajectory model at specified time points. +#' Mirrors `serodynamics:::ab()` -- see ucdavis/serodynamics/R/ab.R. +#' +#' @param t Numeric vector of time points (days) +#' @param y0 Baseline antibody level +#' @param y1 Peak antibody level +#' @param t1 Time to peak (days) +#' @param alpha Decay rate parameter +#' @param shape Decay shape parameter (rho) +#' @return Numeric vector of predicted antibody levels +#' @keywords internal +#' @noRd +ab <- function(t, y0, y1, t1, alpha, shape) { + beta <- bt(y0, y1, t1) + yt <- ifelse( + t <= t1, + y0 * exp(beta * t), + (y1^(1 - shape) - (1 - shape) * alpha * (t - t1))^(1 / (1 - shape)) + ) + return(yt) +} diff --git a/README.Rmd b/README.Rmd index d17fbb6..73daa41 100644 --- a/README.Rmd +++ b/README.Rmd @@ -12,11 +12,66 @@ knitr::opts_chunk$set( ``` # shigella -This repo is for the Shigella analyses + +Analysis tools for longitudinal antibody kinetics data from Shigella infection studies. [![Codecov test coverage](https://codecov.io/gh/UCD-SERG/shigella/graph/badge.svg)](https://app.codecov.io/gh/UCD-SERG/shigella) [![R-CMD-check](https://github.com/UCD-SERG/shigella/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/UCD-SERG/shigella/actions/workflows/R-CMD-check.yaml) -The goal of shigella is to ... +## Overview + +The `shigella` package provides tools for analyzing longitudinal antibody kinetics following Shigella infection. Key features include: + +- Data processing and reshaping for `serodynamics` workflows +- Residual-based model evaluation and comparison +- Visualization of population-level antibody trajectories +- Mock data for testing and development + +## Installation + +You can install the development version from GitHub: + +```r +# install.packages("remotes") +remotes::install_github("UCD-SERG/shigella") +``` + +## Usage + +```r +library(shigella) + +# Process raw data +dat_long <- process_shigella_data( + data = raw_data, + study_filter = "SOSAR", + antigen = n_ipab_MFI +) + +# Compute residual metrics +metrics <- compute_residual_metrics( + model = posterior_draws, + dataset = case_data, + ids = unique(case_data$id), + antigen_iso = "IgG", + scale = "original", + summary_level = "id_antigen" +) + +# Compare models +comparison <- model_comparison_table( + metrics_overall = metrics_model1, + metrics_pointwise = metrics_model2 +) +``` + +## Documentation + +- [Package website](https://ucd-serg.github.io/shigella/) +- [Getting started vignette](https://ucd-serg.github.io/shigella/articles/getting-started.html) + +## Note on Data + +This package currently uses mock datasets for testing and examples. Real Shigella antibody data will be added in a future release. diff --git a/data-raw/mock_data.R b/data-raw/mock_data.R new file mode 100644 index 0000000..29c7fdf --- /dev/null +++ b/data-raw/mock_data.R @@ -0,0 +1,88 @@ +## Code to prepare mock datasets for testing and examples +## This script generates mock data that mimics the structure expected by shigella functions + +set.seed(2024) + +# Mock posterior draws dataset ------------------------------------------------- +# Structure: Subject, Iso_type, Chain, Iteration, Parameter, value +# Parameters: y0, y1, t1, alpha, shape (rho) + +n_chains <- 3 +n_iter <- 100 +subjects <- c("newperson", "SOSAR-22008", "SOSAR-22015") +isotypes <- c("IgG", "IgA") +parameters <- c("y0", "y1", "t1", "alpha", "shape") + +mock_posterior_draws <- expand.grid( + Subject = subjects, + Iso_type = isotypes, + Chain = 1:n_chains, + Iteration = 1:n_iter, + Parameter = parameters, + stringsAsFactors = FALSE +) + +# Generate plausible parameter values +mock_posterior_draws$value <- NA +for (i in seq_len(nrow(mock_posterior_draws))) { + param <- mock_posterior_draws$Parameter[i] + mock_posterior_draws$value[i] <- switch(param, + y0 = runif(1, 100, 500), # baseline antibody level + y1 = runif(1, 1000, 5000), # peak antibody level + t1 = runif(1, 5, 20), # time to peak (days) + alpha = runif(1, 0.001, 0.05), # decay rate + shape = runif(1, 0.5, 2.0) # decay shape (rho) + ) +} + +# Mock case dataset ------------------------------------------------------------- +# Structure compatible with serodynamics::as_case_data() +# Required: id, antigen_iso, time (in days), value columns + +n_timepoints <- 6 +timepoints <- c(0, 7, 14, 30, 60, 90) +ids_case <- c("SOSAR-22008", "SOSAR-22015", "SOSAR-22020", "SOSAR-22025") + +mock_case_data_raw <- expand.grid( + id = ids_case, + antigen_iso = c("IgG", "IgA"), + timepoint = timepoints, + stringsAsFactors = FALSE +) + +# Generate mock antibody values using a simple kinetics model +mock_case_data_raw$value <- apply(mock_case_data_raw, 1, function(row) { + t <- as.numeric(row["timepoint"]) + # Simple rise-decay pattern with noise + y0 <- runif(1, 100, 300) + y1 <- runif(1, 1000, 3000) + t1 <- 10 + alpha <- 0.02 + shape <- 1.2 + + if (t <= t1) { + pred <- y0 + (y1 - y0) * (t / t1) + } else { + pred <- y0 + (y1 - y0) * exp(-alpha * ((t - t1)^shape)) + } + + # Add measurement noise (20% CV) + pred * rnorm(1, 1, 0.2) +}) + +# Convert to serodynamics case_data format +mock_case_data <- mock_case_data_raw +attr(mock_case_data, "timeindays") <- "timepoint" +attr(mock_case_data, "value_var") <- "value" +class(mock_case_data) <- c("case_data", "data.frame") + +# Save datasets ----------------------------------------------------------------- +if (!dir.exists("data")) { + dir.create("data", recursive = TRUE) +} +save(mock_posterior_draws, file = file.path("data", "mock_posterior_draws.rda")) +save(mock_case_data, file = file.path("data", "mock_case_data.rda")) + +message("Mock datasets created successfully!") +message("- mock_posterior_draws: ", nrow(mock_posterior_draws), " rows") +message("- mock_case_data: ", nrow(mock_case_data), " rows") diff --git a/data/.gitignore b/data/.gitignore index 53c221b..069e8b7 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -1,2 +1,4 @@ *.rda *.rds +!mock_posterior_draws.rda +!mock_case_data.rda diff --git a/data/mock_case_data.rda b/data/mock_case_data.rda new file mode 100644 index 0000000..3c404e5 Binary files /dev/null and b/data/mock_case_data.rda differ diff --git a/data/mock_posterior_draws.rda b/data/mock_posterior_draws.rda new file mode 100644 index 0000000..af6922e Binary files /dev/null and b/data/mock_posterior_draws.rda differ diff --git a/inst/WORDLIST b/inst/WORDLIST index d314679..d92948f 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,7 +1,23 @@ CMD Codecov -Github +GitHub +IgA +IgG +Iso +Isotype +MFI ORCID Postprocess +SOSAR Seroresponse +df +ggplot +ipab +isotype +isotypes +newperson +pre repo +serodynamics +serotype +tibble diff --git a/man/compute_residual_metrics.Rd b/man/compute_residual_metrics.Rd new file mode 100644 index 0000000..c966227 --- /dev/null +++ b/man/compute_residual_metrics.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute_residual_metrics.R +\name{compute_residual_metrics} +\alias{compute_residual_metrics} +\title{Residual-based posterior predictive metrics for longitudinal antibody curves} +\usage{ +compute_residual_metrics( + model, + dataset, + ids, + antigen_iso, + scale = c("original", "log"), + summary_level = c("id_antigen", "pointwise", "antigen", "overall") +) +} +\arguments{ +\item{model}{A data frame of posterior draws in long format with columns: +\code{Subject}, \code{Iso_type}, \code{Chain}, \code{Iteration}, +\code{Parameter}, \code{value}.} + +\item{dataset}{A \code{serodynamics} case dataset produced by +\code{serodynamics::as_case_data()} (must contain \code{id} and +\code{antigen_iso} columns, and time/value attributes).} + +\item{ids}{Character vector of subject IDs to include (matched against +\code{dataset$id} and \code{model$Subject}).} + +\item{antigen_iso}{Character scalar specifying the antigen/isotype to analyze +(matched against \code{dataset$antigen_iso} and \code{model$Iso_type}).} + +\item{scale}{Scale on which to compute residuals. One of \code{"original"} +or \code{"log"}. If \code{"log"}, residuals are computed on the natural +log scale and observations/predictions \eqn{\le 0} are removed.} + +\item{summary_level}{Level at which to summarize metrics. One of: +\describe{ +\item{\code{"pointwise"}}{ +Return pointwise residuals at each observed time point. +} +\item{\code{"id_antigen"}}{ +Summarize by \code{id} and \code{antigen_iso}. +} +\item{\code{"antigen"}}{Summarize by \code{antigen_iso} only.} +\item{\code{"overall"}}{Single summary across all included observations.} +}} +} +\value{ +A tibble. For \code{summary_level = "pointwise"}, returns +per-observation residuals. Otherwise returns MAE, RMSE, SSE, and +\code{n_obs} at the requested summary level. +} +\description{ +Computes residuals between observed antibody measurements and posterior +median predictions evaluated at the observed time points. Returns pointwise +residuals or aggregated error metrics (MAE, RMSE, SSE) at multiple +summary levels. +} +\details{ +The posterior predictive summary uses the posterior median at each observed +time point, with a 95\\% credible interval computed from draw-level +predictions. Predictions are generated by +\code{\link{predict_posterior_at_times}}. +} +\examples{ +\dontrun{ +# Per-ID error metrics (original scale) +m_id <- compute_residual_metrics( + model = overall_sf2a, + dataset = dL_clean_sf2a, + ids = unique(dL_clean_sf2a$id), + antigen_iso = "IgG", + scale = "original", + summary_level = "id_antigen" +) + +# Pointwise residuals on log scale +r_pw <- compute_residual_metrics( + model = overall_sf2a, + dataset = dL_clean_sf2a, + ids = "SOSAR-22008", + antigen_iso = "IgA", + scale = "log", + summary_level = "pointwise" +) +} + +} diff --git a/man/fig2_overall_newperson.Rd b/man/fig2_overall_newperson.Rd new file mode 100644 index 0000000..c8a5701 --- /dev/null +++ b/man/fig2_overall_newperson.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fig2_overall_newperson.R +\name{fig2_overall_newperson} +\alias{fig2_overall_newperson} +\title{Summarize population-level ("newperson") antibody trajectories +from overall models} +\usage{ +fig2_overall_newperson( + overall_models, + osps = c("IpaB", "Sf2a", "Sf3a", "Sf6", "Sonnei"), + ids = "newperson", + isotypes = c("IgG", "IgA"), + t_grid = seq(0, 210, by = 5), + cred = 0.95, + log_y = TRUE, + xlim = c(0, 210), + ylab = "Normalized MFI", + line_color = "#1f77b4", + ribbon_alpha = 0.2, + facet_scales = "fixed", + return_data = FALSE +) +} +\arguments{ +\item{overall_models}{Named list of posterior draws in long format +(one per antigen), +with columns at least: Subject, Iso_type, Chain, Iteration, +Parameter, value.} + +\item{osps}{Character vector of antigen names +(must match names in \code{overall_models}).} + +\item{ids}{Character vector of Subject IDs to include (default: "newperson").} + +\item{isotypes}{Character vector of isotypes +(default: c("IgG", "IgA")).} + +\item{t_grid}{Numeric vector of time points (days) to evaluate.} + +\item{cred}{Credible level (default 0.95).} + +\item{log_y}{Logical; if TRUE, applies log10 scale to y when returning plot.} + +\item{xlim}{Optional numeric length-2 vector for x-axis limits when +returning plot.} + +\item{ylab}{Y-axis label for plot.} + +\item{line_color}{Line color for plot.} + +\item{ribbon_alpha}{Alpha for credible ribbon in plot.} + +\item{facet_scales}{Passed to ggplot facet scales.} + +\item{return_data}{If TRUE, returns a list with \code{plot} and \code{data}.} +} +\value{ +By default, a ggplot. If \code{return_data = TRUE}, returns +list(plot = p, data = df). +} +\description{ +Computes median and credible interval bands of the antibody trajectory for a +hypothetical new individual drawn from the population distribution +("newperson"). +} +\details{ +Requires that each model draw can be pivoted to wide parameters including: +y0, y1, t1, alpha, shape. +} diff --git a/man/fmt_mci.Rd b/man/fmt_mci.Rd new file mode 100644 index 0000000..ad0dd2e --- /dev/null +++ b/man/fmt_mci.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fmt_mci.R +\name{fmt_mci} +\alias{fmt_mci} +\title{Format median and credible interval as a single string} +\usage{ +fmt_mci(med, lo, hi, digits = 2, sci = FALSE) +} +\arguments{ +\item{med}{Median value.} + +\item{lo}{Lower bound.} + +\item{hi}{Upper bound.} + +\item{digits}{Number of digits.} + +\item{sci}{Logical; if TRUE uses scientific notation.} +} +\value{ +A string like "1.23 (0.50--2.00)". +} +\description{ +Format median and credible interval as a single string +} diff --git a/man/make_model_comparison_table.Rd b/man/make_model_comparison_table.Rd new file mode 100644 index 0000000..783e9dd --- /dev/null +++ b/man/make_model_comparison_table.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/model_comparison_table.R +\name{make_model_comparison_table} +\alias{make_model_comparison_table} +\title{Compare serotype-specific vs overall models using residual metrics} +\usage{ +make_model_comparison_table( + model_serospecific, + data_serospecific, + model_overall, + data_overall, + antigen_iso, + scale = c("original", "log"), + tie_tol = 1e-08 +) +} +\arguments{ +\item{model_serospecific}{Posterior draws (long format) +for the serotype-specific model.} + +\item{data_serospecific}{Case dataset used for the serotype-specific model.} + +\item{model_overall}{Posterior draws (long format) for the overall model.} + +\item{data_overall}{Case dataset used for the overall model.} + +\item{antigen_iso}{Character scalar antigen/isotype label.} + +\item{scale}{"original" or "log".} + +\item{tie_tol}{Numeric tolerance to declare a tie.} +} +\value{ +A tibble with per-ID MAE/RMSE for each model and +deltas/winner labels. +} +\description{ +Computes per-ID residual metrics for two models on the intersection of IDs +present in both datasets, then reports absolute and percent differences. +} diff --git a/man/mock_case_data.Rd b/man/mock_case_data.Rd new file mode 100644 index 0000000..22c3509 --- /dev/null +++ b/man/mock_case_data.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{mock_case_data} +\alias{mock_case_data} +\title{Mock case data for testing} +\format{ +A data frame with class \code{c("case_data", "data.frame")} +and columns: +\describe{ +\item{id}{Character. Subject ID} +\item{antigen_iso}{Character. Isotype (e.g., "IgG", "IgA")} +\item{timepoint}{Numeric. Time in days since infection} +\item{value}{Numeric. Antibody measurement} +} +} +\usage{ +mock_case_data +} +\description{ +A mock longitudinal antibody dataset compatible with +\code{serodynamics::as_case_data()}, for testing functions like +\code{\link{compute_residual_metrics}}. +} +\details{ +This is synthetic data generated for testing and examples. Real Shigella +case data will be added separately. + +The dataset has attributes: +\itemize{ +\item \code{attr(mock_case_data, "timeindays") = "timepoint"} +\item \code{attr(mock_case_data, "value_var") = "value"} +} +} +\examples{ +head(mock_case_data) +table(mock_case_data$id) +} +\keyword{datasets} diff --git a/man/mock_posterior_draws.Rd b/man/mock_posterior_draws.Rd new file mode 100644 index 0000000..136fe75 --- /dev/null +++ b/man/mock_posterior_draws.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{mock_posterior_draws} +\alias{mock_posterior_draws} +\title{Mock posterior draws for testing} +\format{ +A data frame with columns: +\describe{ +\item{Subject}{Character. Subject ID (e.g., "newperson", "SOSAR-22008")} +\item{Iso_type}{Character. Isotype (e.g., "IgG", "IgA")} +\item{Chain}{Integer. MCMC chain number} +\item{Iteration}{Integer. MCMC iteration number} +\item{Parameter}{Character. Parameter name (y0, y1, t1, alpha, shape)} +\item{value}{Numeric. Parameter value} +} +} +\usage{ +mock_posterior_draws +} +\description{ +A mock dataset of posterior parameter draws in long format, mimicking the +structure expected by functions like \code{\link{compute_residual_metrics}} +and \code{\link{predict_posterior_at_times}}. +} +\details{ +This is synthetic data generated for testing and examples. Real Shigella +posterior draws will be added to the package separately. + +Parameters represent: +\itemize{ +\item \code{y0}: Baseline antibody level +\item \code{y1}: Peak antibody level +\item \code{t1}: Time to peak (days) +\item \code{alpha}: Decay rate parameter +\item \code{shape}: Decay shape parameter (rho) +} +} +\examples{ +head(mock_posterior_draws) +table(mock_posterior_draws$Parameter) +} +\keyword{datasets} diff --git a/man/model_comparison_table.Rd b/man/model_comparison_table.Rd new file mode 100644 index 0000000..7fda11a --- /dev/null +++ b/man/model_comparison_table.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/model_comparison_table.R +\name{model_comparison_table} +\alias{model_comparison_table} +\title{Compare summary residual metrics between two model outputs} +\usage{ +model_comparison_table( + metrics_overall, + metrics_pointwise, + model_overall_label = "Overall Model", + model_pointwise_label = "Pointwise Model" +) +} +\arguments{ +\item{metrics_overall}{Data frame with one row containing at least +\code{MAE}, \code{RMSE}, \code{SSE}, and \code{n_obs} +for the overall model.} + +\item{metrics_pointwise}{Data frame with one row containing at least +\code{MAE}, \code{RMSE}, \code{SSE}, and \code{n_obs} +for the pointwise model.} + +\item{model_overall_label}{Label used for the overall model row.} + +\item{model_pointwise_label}{Label used for the pointwise model row.} +} +\value{ +A tibble with one row per model and comparison columns: +\code{delta_MAE}, \code{delta_RMSE}, \code{pct_improve_MAE}, +\code{pct_improve_RMSE}. +} +\description{ +Builds a compact comparison table from pre-computed summary metrics for an +overall model and a pointwise model. +} diff --git a/man/postprocess_jags_output.Rd b/man/postprocess_jags_output.Rd deleted file mode 100644 index 001ab79..0000000 --- a/man/postprocess_jags_output.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/postprocess_jags_output.R -\name{postprocess_jags_output} -\alias{postprocess_jags_output} -\title{Postprocess JAGS output} -\usage{ -postprocess_jags_output(jags_output) -} -\arguments{ -\item{jags_output}{output from \code{\link[runjags:run.jags]{runjags::run.jags()}}} -} -\value{ -a \link[tibble:tbl_df-class]{tibble::tbl_df} -} -\description{ -Postprocess JAGS output -} diff --git a/man/predict_posterior_at_times.Rd b/man/predict_posterior_at_times.Rd new file mode 100644 index 0000000..c18258d --- /dev/null +++ b/man/predict_posterior_at_times.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_posterior_at_times.R +\name{predict_posterior_at_times} +\alias{predict_posterior_at_times} +\title{Posterior predictions at specified times for given subjects +and antigen/isotype} +\usage{ +predict_posterior_at_times(model, ids, antigen_iso, times) +} +\arguments{ +\item{model}{A data frame of posterior draws in long format with columns: +\code{Subject}, \code{Iso_type}, \code{Chain}, \code{Iteration}, +\code{Parameter}, \code{value}.} + +\item{ids}{Character vector of subject IDs to include +(matched against \code{Subject}).} + +\item{antigen_iso}{Character scalar specifying the antigen/isotype to include +(matched against \code{Iso_type}).} + +\item{times}{Numeric vector of time points (days) at which to evaluate +predictions.} +} +\value{ +A tibble with one row per +(posterior draw \eqn{\times} time \eqn{\times} subject), including the +evaluated prediction \code{res}. Output includes at least: +\describe{ +\item{id}{Subject ID (character).} +\item{t}{Time (days) at which prediction was evaluated.} +\item{Chain}{MCMC chain index (if present in \code{model}).} +\item{Iteration}{MCMC iteration index (if present in \code{model}).} +\item{sample_id}{Row index for the draw (added if missing).} +\item{y0, y1, t1, alpha, shape}{Model parameters (wide).} +\item{res}{Predicted antibody level at time \code{t}.} +} +} +\description{ +Generates draw-level posterior predictions of the antibody trajectory at +user-specified time points, for one or more subjects and a selected +antigen/isotype. This is a low-level helper used by +residual-based posterior predictive diagnostics. +} +\details{ +This function pivots posterior draws to wide format (parameters as columns), +expands them over \code{times}, and evaluates the antibody curve via +an internal implementation of the antibody kinetics model using parameters +\code{y0}, \code{y1}, \code{t1}, \code{alpha}, and \code{shape}. +} +\examples{ +\dontrun{ +preds <- predict_posterior_at_times( + model = overall_sf2a, + ids = "newperson", + antigen_iso = "IgG", + times = c(0, 30, 90, 180) +) +} + +} +\seealso{ +\code{\link{compute_residual_metrics}} +} +\keyword{internal} diff --git a/man/prep_newperson_params.Rd b/man/prep_newperson_params.Rd new file mode 100644 index 0000000..83811be --- /dev/null +++ b/man/prep_newperson_params.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prep_newperson_params.R +\name{prep_newperson_params} +\alias{prep_newperson_params} +\title{Extract "newperson" parameter draws and summarize for Table 2} +\usage{ +prep_newperson_params(draws_long, antigen_label) +} +\arguments{ +\item{draws_long}{Posterior draws in long format with columns: +Subject, Iso_type, Chain, Iteration, Parameter, value.} + +\item{antigen_label}{Character scalar antigen name to attach.} +} +\value{ +A tibble of newperson draws in wide format with columns: +Iteration, Chain, antigen, Iso_type, y0, y1, t1, alpha, rho. +} +\description{ +Extract "newperson" parameter draws and summarize for Table 2 +} diff --git a/man/process_shigella_data.Rd b/man/process_shigella_data.Rd new file mode 100644 index 0000000..491c184 --- /dev/null +++ b/man/process_shigella_data.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process_shigella_data.R +\name{process_shigella_data} +\alias{process_shigella_data} +\title{Reshape Shigella longitudinal data for serodynamics workflows} +\usage{ +process_shigella_data(data, study_filter, antigen) +} +\arguments{ +\item{data}{A data frame containing longitudinal measurements.} + +\item{study_filter}{Character scalar. Value of \code{study_name} to keep +(e.g. "SOSAR").} + +\item{antigen}{Unquoted column name for the antigen measurement +(e.g. n_ipab_MFI).} +} +\value{ +A tibble with standardized columns: +\describe{ +\item{index_id}{Participant ID (copied from \code{sid}).} +\item{antigen_iso}{Isotype label (copied from \code{isotype_name}).} +\item{visit}{Visit label (copied from \code{timepoint}).} +\item{timeindays}{Time since infection (copied from \verb{Actual day}).} +\item{result}{Antibody measurement (from \code{antigen}).} +} +} +\description{ +Filters a dataset to a given study and antigen column, standardizes column +names, and returns a visit-ordered long dataset suitable for conversion to +\code{serodynamics::as_case_data()}. +} +\examples{ +\dontrun{ +dat_long <- process_shigella_data(df, "SOSAR", n_ipab_MFI) +dL <- serodynamics::as_case_data(dat_long, + id_var = "index_id", biomarker_var = "antigen_iso", + time_in_days = "timeindays", value_var = "result" +) +} + +} diff --git a/man/utils_internal.Rd b/man/utils_internal.Rd new file mode 100644 index 0000000..0baed5f --- /dev/null +++ b/man/utils_internal.Rd @@ -0,0 +1,9 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils_internal.R +\name{utils_internal} +\alias{utils_internal} +\title{Internal utility functions} +\description{ +Internal utility functions +} +\keyword{internal} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..8f6c457 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + +library(testthat) +library(shigella) + +test_check("shigella") diff --git a/tests/testthat/test-compute_residual_metrics.R b/tests/testthat/test-compute_residual_metrics.R new file mode 100644 index 0000000..1d07832 --- /dev/null +++ b/tests/testthat/test-compute_residual_metrics.R @@ -0,0 +1,102 @@ +test_that("compute_residual_metrics returns expected structure", { + # Note: This test uses mock data structure + # Once mock_posterior_draws and mock_case_data are generated, + # these tests should work + + # Create minimal mock data inline for testing + mock_model <- data.frame( + Subject = rep("test_id", 15), + Iso_type = rep("IgG", 15), + Chain = rep(1, 15), + Iteration = rep(1:3, 5), + Parameter = rep(c("y0", "y1", "t1", "alpha", "shape"), each = 3), + value = c( + 200, 210, 190, # y0 + 2000, 2100, 1900, # y1 + 10, 12, 9, # t1 + 0.02, 0.025, 0.015, # alpha + 1.0, 1.1, 0.9 # shape + ) + ) + + mock_dataset <- data.frame( + id = rep("test_id", 4), + antigen_iso = rep("IgG", 4), + timepoint = c(0, 7, 30, 90), + value = c(250, 1500, 1200, 800) + ) + attr(mock_dataset, "timeindays") <- "timepoint" + attr(mock_dataset, "value_var") <- "value" + class(mock_dataset) <- c("case_data", "data.frame") + + # Test pointwise summary + result_pw <- compute_residual_metrics( + model = mock_model, + dataset = mock_dataset, + ids = "test_id", + antigen_iso = "IgG", + scale = "original", + summary_level = "pointwise" + ) + + expect_s3_class(result_pw, "data.frame") + expect_true(nrow(result_pw) > 0) + expect_true(all(c("id", "antigen_iso", "t", "obs", "pred_med", + "residual", "abs_residual", "sq_residual") %in% names(result_pw))) + expect_true(all(is.finite(result_pw$residual))) + + # Test id_antigen summary + result_id <- compute_residual_metrics( + model = mock_model, + dataset = mock_dataset, + ids = "test_id", + antigen_iso = "IgG", + scale = "original", + summary_level = "id_antigen" + ) + + expect_s3_class(result_id, "data.frame") + expect_equal(nrow(result_id), 1) + expect_true(all(c("id", "antigen_iso", "MAE", "RMSE", "SSE", "n_obs") %in% names(result_id))) + expect_true(result_id$MAE >= 0) + expect_true(result_id$RMSE >= 0) + expect_true(result_id$SSE >= 0) + expect_true(result_id$n_obs > 0) +}) + +test_that("compute_residual_metrics handles log scale", { + # Minimal mock data + mock_model <- data.frame( + Subject = rep("test_id", 15), + Iso_type = rep("IgG", 15), + Chain = rep(1, 15), + Iteration = rep(1:3, 5), + Parameter = rep(c("y0", "y1", "t1", "alpha", "shape"), each = 3), + value = c(200, 210, 190, 2000, 2100, 1900, 10, 12, 9, + 0.02, 0.025, 0.015, 1.0, 1.1, 0.9) + ) + + mock_dataset <- data.frame( + id = rep("test_id", 3), + antigen_iso = rep("IgG", 3), + timepoint = c(7, 30, 90), + value = c(1500, 1200, 800) + ) + attr(mock_dataset, "timeindays") <- "timepoint" + attr(mock_dataset, "value_var") <- "value" + class(mock_dataset) <- c("case_data", "data.frame") + + result_log <- compute_residual_metrics( + model = mock_model, + dataset = mock_dataset, + ids = "test_id", + antigen_iso = "IgG", + scale = "log", + summary_level = "overall" + ) + + expect_s3_class(result_log, "data.frame") + expect_equal(nrow(result_log), 1) + expect_true(all(is.finite(result_log$MAE))) + expect_true(all(is.finite(result_log$RMSE))) +}) diff --git a/tests/testthat/test-model_comparison_table.R b/tests/testthat/test-model_comparison_table.R new file mode 100644 index 0000000..4f8f81f --- /dev/null +++ b/tests/testthat/test-model_comparison_table.R @@ -0,0 +1,68 @@ +test_that("model_comparison_table computes differences correctly", { + # Create mock metrics for two models + metrics_overall <- data.frame( + MAE = 150, + RMSE = 200, + SSE = 40000, + n_obs = 100 + ) + + metrics_pointwise <- data.frame( + MAE = 120, + RMSE = 180, + SSE = 32400, + n_obs = 100 + ) + + result <- model_comparison_table( + metrics_overall = metrics_overall, + metrics_pointwise = metrics_pointwise, + model_overall_label = "Overall Model", + model_pointwise_label = "Pointwise Model" + ) + + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 2) + expect_true(all(c("Model", "MAE", "RMSE", "SSE", "n_obs") %in% names(result))) + expect_true("delta_MAE" %in% names(result)) + expect_true("delta_RMSE" %in% names(result)) + + # Check that differences are computed correctly + delta_mae <- result$delta_MAE[result$Model == "Pointwise Model"] + expect_equal(delta_mae, 120 - 150) + + delta_rmse <- result$delta_RMSE[result$Model == "Pointwise Model"] + expect_equal(delta_rmse, 180 - 200) +}) + +test_that("model_comparison_table computes percent improvement", { + metrics_overall <- data.frame( + MAE = 100, + RMSE = 200, + SSE = 40000, + n_obs = 100 + ) + + metrics_pointwise <- data.frame( + MAE = 80, + RMSE = 160, + SSE = 25600, + n_obs = 100 + ) + + result <- model_comparison_table( + metrics_overall = metrics_overall, + metrics_pointwise = metrics_pointwise + ) + + expect_true("pct_improve_MAE" %in% names(result)) + expect_true("pct_improve_RMSE" %in% names(result)) + + # Check percent improvements are computed correctly + # Pointwise model: (80 - 100) / 100 * 100 = -20% (20% improvement) + pct_mae <- result$pct_improve_MAE[result$Model == "Pointwise Model"] + expect_equal(pct_mae, -20) + + pct_rmse <- result$pct_improve_RMSE[result$Model == "Pointwise Model"] + expect_equal(pct_rmse, -20) +}) diff --git a/tests/testthat/test-predict_posterior_at_times.R b/tests/testthat/test-predict_posterior_at_times.R new file mode 100644 index 0000000..1cf05a7 --- /dev/null +++ b/tests/testthat/test-predict_posterior_at_times.R @@ -0,0 +1,68 @@ +test_that("predict_posterior_at_times returns expected structure", { + # Create minimal mock posterior data + mock_model <- data.frame( + Subject = rep("test_id", 15), + Iso_type = rep("IgG", 15), + Chain = rep(1, 15), + Iteration = rep(1:3, 5), + Parameter = rep(c("y0", "y1", "t1", "alpha", "shape"), each = 3), + value = c( + 200, 210, 190, # y0 + 2000, 2100, 1900, # y1 + 10, 12, 9, # t1 + 0.02, 0.025, 0.015, # alpha + 1.0, 1.1, 0.9 # shape + ) + ) + + times <- c(0, 30, 90) + + result <- shigella:::predict_posterior_at_times( + model = mock_model, + ids = "test_id", + antigen_iso = "IgG", + times = times + ) + + expect_s3_class(result, "data.frame") + expect_true(nrow(result) > 0) + expect_true(all(c("id", "t", "res") %in% names(result))) + expect_true(all(result$t %in% times)) + expect_true(all(is.finite(result$res))) + expect_true(all(result$res > 0)) +}) + +test_that("predict_posterior_at_times handles multiple subjects", { + # Create mock data for multiple subjects + mock_model <- rbind( + data.frame( + Subject = rep("id1", 15), + Iso_type = rep("IgG", 15), + Chain = rep(1, 15), + Iteration = rep(1:3, 5), + Parameter = rep(c("y0", "y1", "t1", "alpha", "shape"), each = 3), + value = c(200, 210, 190, 2000, 2100, 1900, 10, 12, 9, + 0.02, 0.025, 0.015, 1.0, 1.1, 0.9) + ), + data.frame( + Subject = rep("id2", 15), + Iso_type = rep("IgG", 15), + Chain = rep(1, 15), + Iteration = rep(1:3, 5), + Parameter = rep(c("y0", "y1", "t1", "alpha", "shape"), each = 3), + value = c(250, 260, 240, 2500, 2600, 2400, 12, 14, 11, + 0.03, 0.035, 0.025, 1.2, 1.3, 1.1) + ) + ) + + result <- shigella:::predict_posterior_at_times( + model = mock_model, + ids = c("id1", "id2"), + antigen_iso = "IgG", + times = c(0, 30) + ) + + expect_s3_class(result, "data.frame") + expect_true(all(c("id1", "id2") %in% result$id)) + expect_true(all(is.finite(result$res))) +}) diff --git a/tests/testthat/test-process_shigella_data.R b/tests/testthat/test-process_shigella_data.R new file mode 100644 index 0000000..cf7323e --- /dev/null +++ b/tests/testthat/test-process_shigella_data.R @@ -0,0 +1,46 @@ +test_that("process_shigella_data filters and reshapes correctly", { + # Create mock input data + mock_raw_data <- data.frame( + study_name = rep(c("SOSAR", "OTHER"), each = 6), + isotype_name = rep(c("IgG", "IgA"), 6), + sid = rep(c("ID001", "ID002"), each = 6), + timepoint = rep(c("V1", "V2", "V3"), 4), + `Actual day` = rep(c(0, 7, 30), 4), + n_ipab_MFI = runif(12, 100, 5000), + check.names = FALSE + ) + + result <- process_shigella_data( + data = mock_raw_data, + study_filter = "SOSAR", + antigen = n_ipab_MFI + ) + + expect_s3_class(result, "data.frame") + expect_true(nrow(result) > 0) + expect_true(all(c("index_id", "antigen_iso", "visit", "timeindays", "result") %in% names(result))) + expect_true(all(result$index_id %in% c("ID001", "ID002"))) + expect_true(all(result$antigen_iso %in% c("IgG", "IgA"))) + expect_true(all(!is.na(result$timeindays))) +}) + +test_that("process_shigella_data removes NA timepoints", { + mock_raw_data <- data.frame( + study_name = rep("SOSAR", 6), + isotype_name = rep("IgG", 6), + sid = rep("ID001", 6), + timepoint = c("V1", "V2", "V3", "V4", "V5", "V6"), + `Actual day` = c(0, 7, NA, 30, NA, 90), + n_ipab_MFI = runif(6, 100, 5000), + check.names = FALSE + ) + + result <- process_shigella_data( + data = mock_raw_data, + study_filter = "SOSAR", + antigen = n_ipab_MFI + ) + + expect_true(all(!is.na(result$timeindays))) + expect_equal(nrow(result), 4) # Should only have 4 rows (excluding 2 NAs) +}) diff --git a/tests/testthat/test-utils_internal.R b/tests/testthat/test-utils_internal.R new file mode 100644 index 0000000..2aac02e --- /dev/null +++ b/tests/testthat/test-utils_internal.R @@ -0,0 +1,64 @@ +test_that("ab() returns y0 at t = 0", { + result <- shigella:::ab( + t = 0, y0 = 200, y1 = 2000, t1 = 10, alpha = 0.02, shape = 1.2 + ) + expect_equal(result, 200) +}) + +test_that("ab() returns y1 at t = t1", { + result <- shigella:::ab( + t = 10, y0 = 200, y1 = 2000, t1 = 10, alpha = 0.02, shape = 1.2 + ) + expect_equal(result, 2000) +}) + +test_that("ab() shows expected rise for t < t1", { + result <- shigella:::ab( + t = 5, y0 = 200, y1 = 2000, t1 = 10, alpha = 0.02, shape = 1.2 + ) + expected <- 200 * exp((log(2000 / 200) / 10) * 5) + expect_equal(result, expected) +}) + +test_that("ab() decays toward zero for large t (shape > 1)", { + result <- shigella:::ab( + t = 10000, y0 = 200, y1 = 2000, t1 = 10, alpha = 0.02, shape = 1.2 + ) + expect_true(is.finite(result)) + expect_true(result >= 0) +}) + +test_that("ab() handles vector input", { + times <- c(0, 5, 10, 30, 90) + result <- shigella:::ab( + t = times, y0 = 200, y1 = 2000, t1 = 10, alpha = 0.02, shape = 1.2 + ) + expect_length(result, 5) + expect_true(all(is.finite(result))) + expect_equal(result[1], 200) + expect_equal(result[3], 2000) +}) + +test_that("ab() handles t1 = 0 same as upstream", { + # When t1 = 0, bt() returns Inf, which is expected + # upstream behavior. We don't add special handling. + result <- shigella:::ab( + t = c(0, 10, 30), + y0 = 200, y1 = 2000, t1 = 0, + alpha = 0.02, shape = 1.2 + ) + expect_length(result, 3) + # Results may be NaN/Inf -- that's OK, matches upstream +}) + +test_that("get_timeindays_var() finds time variable from attribute", { + mock_data <- data.frame(timepoint = c(0, 7, 30), value = c(100, 200, 300)) + attr(mock_data, "timeindays") <- "timepoint" + expect_equal(shigella:::get_timeindays_var(mock_data), "timepoint") +}) + +test_that( + "get_timeindays_var() defaults to timeindays when attribute missing", { + mock_data <- data.frame(timeindays = c(0, 7, 30), value = c(100, 200, 300)) + expect_equal(shigella:::get_timeindays_var(mock_data), "timeindays") +}) diff --git a/vignettes/getting-started.Rmd b/vignettes/getting-started.Rmd new file mode 100644 index 0000000..d25dd23 --- /dev/null +++ b/vignettes/getting-started.Rmd @@ -0,0 +1,134 @@ +--- +title: "Getting Started with shigella" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Getting Started with shigella} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(shigella) +``` + +## Overview + +The `shigella` package provides tools for analyzing longitudinal antibody kinetics data from Shigella infection studies. This vignette introduces the main functionality using mock data. + +**Note:** This package currently uses mock data for testing and examples. Real Shigella datasets will be added in a future release. + +## Key Functions + +### Data Processing + +The `process_shigella_data()` function reshapes raw longitudinal data into a format compatible with the `serodynamics` package: + +```{r, eval=FALSE} +# Example with real data (not run) +dat_long <- process_shigella_data( + data = raw_data, + study_filter = "SOSAR", + antigen = n_ipab_MFI +) + +# Convert to case_data format +dL <- serodynamics::as_case_data( + dat_long, + id_var = "index_id", + biomarker_var = "antigen_iso", + time_in_days = "timeindays", + value_var = "result" +) +``` + +### Model Evaluation + +The package provides functions to evaluate and compare longitudinal antibody models: + +#### Computing Residual Metrics + +```{r, eval=FALSE} +# Compute residuals at individual level +metrics_id <- compute_residual_metrics( + model = posterior_draws, + dataset = case_data, + ids = unique(case_data$id), + antigen_iso = "IgG", + scale = "original", + summary_level = "id_antigen" +) + +# Overall summary metrics +metrics_overall <- compute_residual_metrics( + model = posterior_draws, + dataset = case_data, + ids = unique(case_data$id), + antigen_iso = "IgG", + scale = "original", + summary_level = "overall" +) +``` + +#### Model Comparison + +Compare model fit metrics between different modeling approaches: + +```{r, eval=FALSE} +comparison <- model_comparison_table( + metrics_overall = metrics_model1, + metrics_pointwise = metrics_model2, + model_overall_label = "Population Model", + model_pointwise_label = "Individual Model" +) +``` + +### Visualization + +The `fig2_overall_newperson()` function creates trajectory plots for population-level ("newperson") predictions: + +```{r, eval=FALSE} +fig <- fig2_overall_newperson( + overall_models = list( + IpaB = model_ipab, + Sf2a = model_sf2a + ), + osps = c("IpaB", "Sf2a"), + isotypes = c("IgG", "IgA"), + t_grid = seq(0, 210, by = 5), + log_y = TRUE +) +``` + +## Mock Data + +The package includes two mock datasets for testing: + +- `mock_posterior_draws`: Posterior parameter draws in long format +- `mock_case_data`: Longitudinal antibody measurements + +These datasets have the same structure as real data but contain synthetic values. + +```{r, eval=FALSE} +# Load mock data +data("mock_posterior_draws") +data("mock_case_data") + +# View structure +head(mock_posterior_draws) +head(mock_case_data) +``` + +## Next Steps + +For more detailed examples and analysis workflows, see: + +- Package documentation: `?shigella` +- Function help: `?compute_residual_metrics`, `?process_shigella_data` +- GitHub repository: https://github.com/UCD-SERG/shigella