diff --git a/.Rbuildignore b/.Rbuildignore index a0f6a52..0694f55 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,6 @@ ^vignettes/articles/*_files$ ^vignettes/articles$ ^codecov\.yml$ +^shigella\.Rcheck$ +^shigella.*\.tar\.gz$ +^shigella.*\.tgz$ diff --git a/.gitignore b/.gitignore index cbe14e1..80788f5 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ data/ /.quarto/ *.pdf README_files +shigella.Rcheck/ +shigella*.tar.gz +shigella*.tgz diff --git a/DESCRIPTION b/DESCRIPTION index 43d2893..6b43288 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: shigella Title: What the Package Does (One Line, Title Case) -Version: 0.0.0.9003 +Version: 0.0.0.9004 Authors@R: c( person("Kwan Ho", "Lee", , "ksjlee@ucdavis.edu", role = c("aut", "cre")), person("Douglas Ezra", "Morrison", , "demorrison@ucdavis.edu", role = c("aut"), diff --git a/NAMESPACE b/NAMESPACE index 56e7b54..155493c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,9 @@ # Generated by roxygen2: do not edit by hand +export(calculate_metrics) +export(generate_final_table) export(postprocess_jags_output) +export(simulate_seroincidence2) importFrom(dplyr,any_of) importFrom(dplyr,filter) importFrom(dplyr,mutate) diff --git a/NEWS.md b/NEWS.md index c3eade1..1fe3b1e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ ## Bug fixes +None yet + ## Internal changes * Moved helper functions into `R/` subdirectory (#1) diff --git a/R/calculate_metrics.R b/R/calculate_metrics.R new file mode 100644 index 0000000..2d9cf37 --- /dev/null +++ b/R/calculate_metrics.R @@ -0,0 +1,33 @@ +#' Calculate Empirical Standard Error for Incidence Rates +#' +#' This function computes the empirical standard error (SE) of incidence rates +#' from the provided dataset and adds sample size and age group information. +#' +#' @param data A data frame containing a column named \code{incidence.rate}. +#' @param sample_size Optional. Integer representing the sample size. 
If NULL, retrieved from \code{attr(data, "sample_size")}. +#' @param age_group Optional. Character string specifying the age group. If NULL, retrieved from \code{attr(data, "age_group")}. +#' +#' @return A data frame with \code{sample_size}, \code{empirical_se}, and \code{Age_Group}. +#' @examples +#' \dontrun{ +#' data <- data.frame(incidence.rate = c(0.1, 0.2, 0.15, 0.18)) +#' attr(data, "sample_size") <- 100 +#' attr(data, "age_group") <- "Age 0-2" +#' calculate_metrics(data) +#' } +#' @export +calculate_metrics <- function(data, sample_size = NULL, age_group = NULL) { + if (is.null(sample_size)) { + sample_size <- attr(data, "sample_size") + } + if (is.null(age_group)) { + age_group <- attr(data, "age_group") + } + + data.frame( + sample_size = sample_size, + empirical_se = sd(data$incidence.rate, na.rm = TRUE), + Age_Group = age_group + ) +} + diff --git a/R/generate_final_table.R b/R/generate_final_table.R index d84a3f0..6fab724 100644 --- a/R/generate_final_table.R +++ b/R/generate_final_table.R @@ -1,14 +1,30 @@ -# Define a function to generate final tables +#' Generate Final Table from Simulation Results +#' +#' This function loops through a list of simulation results, extracts the required columns, +#' and combines them into a single data frame. It also adds the sample size for clarity. +#' +#' @param results_list A list of simulation results. Each element should contain an element named \code{est1} +#' which can be summarized. +#' @param sample_size An integer specifying the sample size used in the simulation. +#' +#' @return A data frame combining the selected columns (\code{incidence.rate}, \code{SE}, \code{CI.lwr}, \code{CI.upr}) +#' from each simulation result along with the sample size and an index for tracking. 
+#' @examples +#' \dontrun{ +#' # Suppose you have a list of simulation results +#' final_table <- generate_final_table(results_list, sample_size = 100) +#' } +#' @export generate_final_table <- function(results_list, sample_size) { # Initialize an empty list to store the results summary_results <- list() - # Loop through each of the 100 results and extract the required columns - for (i in 1:200) { + # Loop through each result and extract the required columns + for (i in 1:length(results_list)) { # Extract the summary for each result result_summary <- summary(results_list[[i]]$est1) - # Select the required columns + # Select the required columns (ensure that result_summary is a data frame) extracted_columns <- result_summary %>% select(incidence.rate, SE, CI.lwr, CI.upr) @@ -16,13 +32,13 @@ generate_final_table <- function(results_list, sample_size) { extracted_columns <- extracted_columns %>% mutate(index = i) - # Append to the list + # Append the extracted data to the list summary_results[[i]] <- extracted_columns } - # Combine all results into a single data frame + # Combine all results into a single data frame and add the sample size column for clarity final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity + mutate(sample_size = sample_size) return(final_table) } diff --git a/R/postprocess_jags_output.R b/R/postprocess_jags_output.R index 32bfd22..660cc78 100644 --- a/R/postprocess_jags_output.R +++ b/R/postprocess_jags_output.R @@ -45,3 +45,7 @@ postprocess_jags_output <- function(jags_output) { return(to_return) } + + + + diff --git a/R/simulate_seroincidence2.R b/R/simulate_seroincidence2.R new file mode 100644 index 0000000..953fa4b --- /dev/null +++ b/R/simulate_seroincidence2.R @@ -0,0 +1,75 @@ +#' Description of the function here. +#' @param nrep Number of repetitions. +#' @param n_sim Number of simulations. +#' @param observed Observed incidence rate. +#' @param range Range for simulation. 
+#' @return A list of simulated seroincidence results. +#' @export +# Define the simulation function +# Define the simulation function +simulate_seroincidence2 <- function(nrep, n_sim, observed, range = NULL) { + # Set parallel plan inside function to avoid issues with distributed nodes + plan(multicore) # Use multiple cores for parallel processing (works best on HPC) + + # Parameters + dmcmc <- curve_params_shigella_ipab # Curve parameters + antibodies <- c("IgG") # Antigen-isotypes + lambda <- observed # Simulated incidence rate per person-year + + # Biologic noise distribution + dlims <- rbind("IgG" = c(min = 0, max = 0.5)) + + # Noise parameters + cond <- tibble( + antigen_iso = c("IgG"), + nu = c(0.5), # Biologic noise (nu) + eps = c(0.25), # Measurement noise (eps) + y.low = c(25), # Low cutoff (llod) + y.high = c(200000) # High cutoff (y.high) + ) + + # Perform simulations in parallel + results <- future_map(1:n_sim, function(i) { + tryCatch({ + # Generate cross-sectional data + csdata <- sim_pop_data( + curve_params = dmcmc, + lambda = lambda, + n.smpl = nrep, + age_range = range, + antigen_isos = antibodies, + n.mc = 0, + renew_params = TRUE, # Use different parameters for each simulation + add.noise = TRUE, + noise_limits = dlims, + format = "long" + ) + + # Estimate seroincidence + est <- est.incidence( + pop_data = csdata, + curve_params = dmcmc, + noise_params = cond, + lambda_start = 0.1, + build_graph = TRUE, + verbose = FALSE, + print_graph = FALSE, + antigen_isos = antibodies + ) + + # Return results for this simulation + list(csdata = csdata, est1 = est) + }, error = function(e) { + return(list(error = e$message)) # Capture and store errors instead of stopping execution + }) + }, .options = furrr_options(seed = TRUE)) + + # Ensure sequential processing after function execution + plan(sequential) + results <- results |> + structure( + sample_size = nrep, + age_range = paste(range, collapse = " - ") + ) + return(results) +} diff --git 
a/inst/WORDLIST b/inst/WORDLIST index d314679..4121428 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -5,3 +5,10 @@ ORCID Postprocess Seroresponse repo +HPC +IgG +IpaB +isotypes +qmd +seroincidence +ses \ No newline at end of file diff --git a/man/calculate_metrics.Rd b/man/calculate_metrics.Rd new file mode 100644 index 0000000..47c1a9e --- /dev/null +++ b/man/calculate_metrics.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/calculate_metrics.R +\name{calculate_metrics} +\alias{calculate_metrics} +\title{Calculate Empirical Standard Error for Incidence Rates} +\usage{ +calculate_metrics(data, sample_size = NULL, age_group = NULL) +} +\arguments{ +\item{data}{A data frame containing a column named \code{incidence.rate}.} + +\item{sample_size}{Optional. Integer representing the sample size. If NULL, retrieved from \code{attr(data, "sample_size")}.} + +\item{age_group}{Optional. Character string specifying the age group. If NULL, retrieved from \code{attr(data, "age_group")}.} +} +\value{ +A data frame with \code{sample_size}, \code{empirical_se}, and \code{Age_Group}. +} +\description{ +This function computes the empirical standard error (SE) of incidence rates +from the provided dataset and adds sample size and age group information. 
+} +\examples{ +\dontrun{ +data <- data.frame(incidence.rate = c(0.1, 0.2, 0.15, 0.18)) +attr(data, "sample_size") <- 100 +attr(data, "age_group") <- "Age 0-2" +calculate_metrics(data) +} +} diff --git a/man/generate_final_table.Rd b/man/generate_final_table.Rd new file mode 100644 index 0000000..9f3d67b --- /dev/null +++ b/man/generate_final_table.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generate_final_table.R +\name{generate_final_table} +\alias{generate_final_table} +\title{Generate Final Table from Simulation Results} +\usage{ +generate_final_table(results_list, sample_size) +} +\arguments{ +\item{results_list}{A list of simulation results. Each element should contain an element named \code{est1} +which can be summarized.} + +\item{sample_size}{An integer specifying the sample size used in the simulation.} +} +\value{ +A data frame combining the selected columns (\code{incidence.rate}, \code{SE}, \code{CI.lwr}, \code{CI.upr}) +from each simulation result along with the sample size and an index for tracking. +} +\description{ +This function loops through a list of simulation results, extracts the required columns, +and combines them into a single data frame. It also adds the sample size for clarity. 
+} +\examples{ +\dontrun{ +# Suppose you have a list of simulation results +final_table <- generate_final_table(results_list, sample_size = 100) +} +} diff --git a/man/simulate_seroincidence2.Rd b/man/simulate_seroincidence2.Rd new file mode 100644 index 0000000..be1bf24 --- /dev/null +++ b/man/simulate_seroincidence2.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simulate_seroincidence2.R +\name{simulate_seroincidence2} +\alias{simulate_seroincidence2} +\title{Description of the function here.} +\usage{ +simulate_seroincidence2(nrep, n_sim, observed, range = NULL) +} +\arguments{ +\item{nrep}{Number of repetitions.} + +\item{n_sim}{Number of simulations.} + +\item{observed}{Observed incidence rate.} + +\item{range}{Range for simulation.} +} +\value{ +A list of simulated seroincidence results. +} +\description{ +Description of the function here. +} diff --git a/project.Rproj b/project.Rproj index 21a5a96..008497b 100644 --- a/project.Rproj +++ b/project.Rproj @@ -1,5 +1,4 @@ Version: 1.0 -ProjectId: 3be2b60e-1279-4ad7-941b-69ed270815d6 RestoreWorkspace: Default SaveWorkspace: Default diff --git a/sim_hpc_ipab.qmd b/sim_hpc_ipab.qmd deleted file mode 100644 index 90a24aa..0000000 --- a/sim_hpc_ipab.qmd +++ /dev/null @@ -1,408 +0,0 @@ ---- -title: "Simulation with the Smallest Lambda for IpaB IgG" -format: - pdf: - number-sections: true - number-depth: 2 - number-offset: [0, 0] - pdf-engine: pdflatex -header-includes: - - \usepackage{graphicx} # Required for ggplot figures - - \usepackage{float} # Helps place figures properly - - \usepackage{wrapfig} # Now safely included since you installed it -editor: visual -editor_options: - chunk_output_type: console ---- - -```{r setup, include=FALSE,echo=FALSE} -library(knitr) -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo=FALSE, message=FALSE} -library(tibble) -library(dplyr) -library(serocalculator) -library(haven) -library(knitr) -library(plotly) 
-library(kableExtra) -library(tidyr) -library(arsenal) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(readxl) -library(purrr) -library(here) -library(table1) -library(furrr) -library(future) -``` - - -# Vary simulation lambdas across the range in the real cross-sectional data - -## 1. Estimate the shigella incidence rate using the real cross-sectional Shigella data, -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Load data -load("~/ipab.RData") -``` - -## Get parameters from longitudinal data - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") -``` - -## 2. 
Run simulations using: -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) - country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. 
-# Get incidence rate estimates from each region (4 regions) and do median -med_ipab_IgG<-median(incidence_summary$Incidence_Rate) - -#1.188 - -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_ipab_IgG<-2*max(incidence_summary$Incidence_Rate) - -# 2.788 - -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_ipab_IgG<-0.5*min(incidence_summary$Incidence_Rate) - -#0.185 -# Create a data frame for the incidence rate summary -incidence_rate_summary <- data.frame( - Metric = c("Median Incidence Rate", - "2x Maximum Incidence Rate", - "1/2 Minimum Incidence Rate"), - Value = c(median(incidence_summary$Incidence_Rate), # Median of incidence rates - 2 * max(incidence_summary$Incidence_Rate), # 2x Maximum incidence rate - 0.5 * min(incidence_summary$Incidence_Rate)) # 1/2 Minimum incidence rate -) - -# Print the table -print(incidence_rate_summary) - -``` -These three values are preliminary observations of lambda from ipab_IgG. - -We choose a lambda that is half of the minimum lambda from the four regions. 
- -\newpage - -# Simulation (300times) with the smallest lambda -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} - -# Define the simulation function -simulate_seroincidence <- function(nrep, n_sim, observed, range = NULL) { - # Set parallel plan inside function to avoid issues with distributed nodes - plan(multicore) # Use multiple cores for parallel processing (works best on HPC) - - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind("IgG" = c(min = 0, max = 0.5)) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), # Biologic noise (nu) - eps = c(0.25), # Measurement noise (eps) - y.low = c(25), # Low cutoff (llod) - y.high = c(200000) # High cutoff (y.high) - ) - - # Perform simulations in parallel - results <- future_map(1:n_sim, function(i) { - tryCatch({ - # Generate cross-sectional data - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = TRUE, # Use different parameters for each simulation - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = TRUE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - # Return results for this simulation - list(csdata = csdata, est1 = est) - }, error = function(e) { - return(list(error = e$message)) # Capture and store errors instead of stopping execution - }) - }, .options = furrr_options(seed = TRUE)) - - # Ensure sequential processing after function execution - plan(sequential) - - return(results) -} - -# ------------------------ # -# 🔹 Run Simulations in Parallel -# ------------------------ # - -# Set up 
parallel processing (Ensure future::plan() is set before execution) -plan(multicore) # Works best on HPC - -# Define parameter sets -sim_params <- list( - list(nrep = 100, range = c(0, 2)), - list(nrep = 100, range = c(2, 5)), - list(nrep = 200, range = c(0, 2)), - list(nrep = 200, range = c(2, 5)), - list(nrep = 300, range = c(0, 2)), - list(nrep = 300, range = c(2, 5)), - list(nrep = 400, range = c(0, 2)), - list(nrep = 400, range = c(2, 5)) -) - -# Run simulations in a loop to reduce redundant code -results_list <- lapply(sim_params, function(params) { - cat("Running simulation for nrep =", params$nrep, "range =", params$range, "\n") - simulate_seroincidence(nrep = params$nrep, n_sim = 300, observed = min_half_ipab_IgG, range = params$range) -}) - -# Assign results to variables -results_100_1 <- results_list[[1]] -results_100_2 <- results_list[[2]] -results_200_1 <- results_list[[3]] -results_200_2 <- results_list[[4]] -results_300_1 <- results_list[[5]] -results_300_2 <- results_list[[6]] -results_400_1 <- results_list[[7]] -results_400_2 <- results_list[[8]] - -# Stop parallel processing -plan(sequential) # Return to sequential processing after execution - -``` - - -## Store each sample size in table -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:300) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all 
results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) -``` - - - -## Graphs of where the x axis is the sample size and the y axis is the empirical standard error - -Set the preliminary observed lambda as half of the minimum incidence rate from the four regions. - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 200) -metrics_200_2 <- calculate_metrics(final_table_200_2, 200) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 300) -metrics_300_2 <- calculate_metrics(final_table_300_2, 300) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 400) -metrics_400_2 <- calculate_metrics(final_table_400_2, 400) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, 
metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -# Plot with color to differentiate age groups -ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, color = Age_Group)) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -``` - -\newpage - -## Table - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -summary_metrics_combined %>% - kable() -``` \ No newline at end of file diff --git a/sim_hpc_ipab_med.qmd b/sim_hpc_ipab_med.qmd deleted file mode 100644 index d0bf79a..0000000 --- a/sim_hpc_ipab_med.qmd +++ /dev/null @@ -1,408 +0,0 @@ ---- -title: "Simulation with the Median Lambda for IpaB IgG" -format: - pdf: - number-sections: true - number-depth: 2 - number-offset: [0, 0] - pdf-engine: pdflatex -header-includes: - - \usepackage{graphicx} # Required for ggplot figures - - \usepackage{float} # Helps place figures properly - - \usepackage{wrapfig} # Now safely included since you installed it -editor: visual -editor_options: - chunk_output_type: console ---- - -```{r setup, include=FALSE,echo=FALSE} -library(knitr) -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo=FALSE, message=FALSE} -library(tibble) -library(dplyr) -library(serocalculator) -library(haven) -library(knitr) -library(plotly) -library(kableExtra) -library(tidyr) -library(arsenal) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(readxl) -library(purrr) -library(here) -library(table1) -library(furrr) 
-library(future) -``` - - -# Vary simulation lambdas across the range in the real cross-sectional data - -## 1. Estimate the shigella incidence rate using the real cross-sectional Shigella data, -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Load data -load("~/ipab.RData") -``` - -## Get parameters from longitudinal data - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") -``` - -## 2. Run simulations using: -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) 
- country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. -# Get incidence rate estimates from each region (4 regions) and do median -med_ipab_IgG<-median(incidence_summary$Incidence_Rate) - -#1.188 - -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_ipab_IgG<-2*max(incidence_summary$Incidence_Rate) - -# 2.788 - -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_ipab_IgG<-0.5*min(incidence_summary$Incidence_Rate) - -#0.185 -# Create a data frame for the incidence rate summary -incidence_rate_summary <- data.frame( - Metric = c("Median Incidence Rate", - "2x Maximum Incidence Rate", - "1/2 Minimum Incidence Rate"), - Value = c(median(incidence_summary$Incidence_Rate), # Median of incidence rates - 2 * max(incidence_summary$Incidence_Rate), # 2x Maximum incidence rate - 0.5 * min(incidence_summary$Incidence_Rate)) # 1/2 Minimum incidence rate -) - -# Print the table -print(incidence_rate_summary) - -``` -These three values are preliminary observations of lambda from ipab_IgG. - -We choose a lambda that is the median of the four regions. 
- -\newpage - -# Simulation (300times) with the smallest lambda -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} - -# Define the simulation function -simulate_seroincidence <- function(nrep, n_sim, observed, range = NULL) { - # Set parallel plan inside function to avoid issues with distributed nodes - plan(multicore) # Use multiple cores for parallel processing (works best on HPC) - - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind("IgG" = c(min = 0, max = 0.5)) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), # Biologic noise (nu) - eps = c(0.25), # Measurement noise (eps) - y.low = c(25), # Low cutoff (llod) - y.high = c(200000) # High cutoff (y.high) - ) - - # Perform simulations in parallel - results <- future_map(1:n_sim, function(i) { - tryCatch({ - # Generate cross-sectional data - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = TRUE, # Use different parameters for each simulation - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = TRUE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - # Return results for this simulation - list(csdata = csdata, est1 = est) - }, error = function(e) { - return(list(error = e$message)) # Capture and store errors instead of stopping execution - }) - }, .options = furrr_options(seed = TRUE)) - - # Ensure sequential processing after function execution - plan(sequential) - - return(results) -} - -# ------------------------ # -# 🔹 Run Simulations in Parallel -# ------------------------ # - -# Set up 
parallel processing (Ensure future::plan() is set before execution) -plan(multicore) # Works best on HPC - -# Define parameter sets -sim_params <- list( - list(nrep = 100, range = c(0, 2)), - list(nrep = 100, range = c(2, 5)), - list(nrep = 200, range = c(0, 2)), - list(nrep = 200, range = c(2, 5)), - list(nrep = 300, range = c(0, 2)), - list(nrep = 300, range = c(2, 5)), - list(nrep = 400, range = c(0, 2)), - list(nrep = 400, range = c(2, 5)) -) - -# Run simulations in a loop to reduce redundant code -results_list <- lapply(sim_params, function(params) { - cat("Running simulation for nrep =", params$nrep, "range =", params$range, "\n") - simulate_seroincidence(nrep = params$nrep, n_sim = 300, observed = med_ipab_IgG, range = params$range) -}) - -# Assign results to variables -results_100_1 <- results_list[[1]] -results_100_2 <- results_list[[2]] -results_200_1 <- results_list[[3]] -results_200_2 <- results_list[[4]] -results_300_1 <- results_list[[5]] -results_300_2 <- results_list[[6]] -results_400_1 <- results_list[[7]] -results_400_2 <- results_list[[8]] - -# Stop parallel processing -plan(sequential) # Return to sequential processing after execution - -``` - - -## Store each sample size in table -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:300) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all 
results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) -``` - - - -## Graphs of where the x axis is the sample size and the y axis is the empirical standard error - -Set the preliminary observed lambda as the median incidence rate across the four regions. - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 200) -metrics_200_2 <- calculate_metrics(final_table_200_2, 200) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 300) -metrics_300_2 <- calculate_metrics(final_table_300_2, 300) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 400) -metrics_400_2 <- calculate_metrics(final_table_400_2, 400) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, 
metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -# Plot with color to differentiate age groups -ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, color = Age_Group)) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -``` - -\newpage - -## Table - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -summary_metrics_combined %>% - kable() -``` \ No newline at end of file diff --git a/sim_hpc_sf2a.qmd b/sim_hpc_sf2a.qmd deleted file mode 100644 index d586d9f..0000000 --- a/sim_hpc_sf2a.qmd +++ /dev/null @@ -1,401 +0,0 @@ ---- -title: "Simulation with the Smallest Lambda for Sf2a IgG" -format: - pdf: - number-sections: true - number-depth: 2 - number-offset: [0, 0] - pdf-engine: pdflatex # <-- Use pdflatex instead of xelatex or lualatex -editor: visual -editor_options: - chunk_output_type: console ---- - -```{r setup, include=FALSE,echo=FALSE} -library(knitr) -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo=FALSE, message=FALSE} -library(tibble) -library(dplyr) -library(serocalculator) -library(haven) -library(knitr) -library(plotly) -library(kableExtra) -library(tidyr) -library(arsenal) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(readxl) -library(purrr) -library(here) -library(table1) -library(furrr) -library(future) -``` - - -# Vary simulation lambdas across the range in the real cross-sectional data - -## 1. 
Estimate the shigella incidence rate using the real cross-sectional Shigella data, -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Load data -load("~/sf2a.RData") -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") -``` - -## 2. Run simulations using: -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) 
- country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. -# Get incidence rate estimates from each region (4 regions) and do median -med_sf2a_IgG<-median(incidence_summary$Incidence_Rate) - -#0.756 - -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_sf2a_IgG<-2*max(incidence_summary$Incidence_Rate) - -# 2.058 - -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_sf2a_IgG<-0.5*min(incidence_summary$Incidence_Rate) - -#0.288 - -# Create a data frame for the incidence rate summary -incidence_rate_summary <- data.frame( - Metric = c("Median Incidence Rate", - "2x Maximum Incidence Rate", - "1/2 Minimum Incidence Rate"), - Value = c(median(incidence_summary$Incidence_Rate), # Median of incidence rates - 2 * max(incidence_summary$Incidence_Rate), # 2x Maximum incidence rate - 0.5 * min(incidence_summary$Incidence_Rate)) # 1/2 Minimum incidence rate -) - -# Print the table -print(incidence_rate_summary) -``` -These three values are preliminary observations of lambda from sf2a_IgG. - -We choose a lambda that is half of the minimum lambda from the four regions. 
- -\newpage - -# Simulation (300times) with the smallest lambda -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} - -# Define the simulation function -simulate_seroincidence <- function(nrep, n_sim, observed, range = NULL) { - # Set parallel plan inside function to avoid issues with distributed nodes - plan(multicore) # Use multiple cores for parallel processing (works best on HPC) - - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind("IgG" = c(min = 0, max = 0.5)) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), # Biologic noise (nu) - eps = c(0.25), # Measurement noise (eps) - y.low = c(25), # Low cutoff (llod) - y.high = c(200000) # High cutoff (y.high) - ) - - # Perform simulations in parallel - results <- future_map(1:n_sim, function(i) { - tryCatch({ - # Generate cross-sectional data - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = TRUE, # Use different parameters for each simulation - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = TRUE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - # Return results for this simulation - list(csdata = csdata, est1 = est) - }, error = function(e) { - return(list(error = e$message)) # Capture and store errors instead of stopping execution - }) - }, .options = furrr_options(seed = TRUE)) - - # Ensure sequential processing after function execution - plan(sequential) - - return(results) -} - -# ------------------------ # -# 🔹 Run Simulations in Parallel -# ------------------------ # - -# Set up 
parallel processing (Ensure future::plan() is set before execution) -plan(multicore) # Works best on HPC - -# Define parameter sets -sim_params <- list( - list(nrep = 100, range = c(0, 2)), - list(nrep = 100, range = c(2, 5)), - list(nrep = 200, range = c(0, 2)), - list(nrep = 200, range = c(2, 5)), - list(nrep = 300, range = c(0, 2)), - list(nrep = 300, range = c(2, 5)), - list(nrep = 400, range = c(0, 2)), - list(nrep = 400, range = c(2, 5)) -) - -# Run simulations in a loop to reduce redundant code -results_list <- lapply(sim_params, function(params) { - cat("Running simulation for nrep =", params$nrep, "range =", params$range, "\n") - simulate_seroincidence(nrep = params$nrep, n_sim = 300, observed = min_half_sf2a_IgG, range = params$range) -}) - -# Assign results to variables -results_100_1 <- results_list[[1]] -results_100_2 <- results_list[[2]] -results_200_1 <- results_list[[3]] -results_200_2 <- results_list[[4]] -results_300_1 <- results_list[[5]] -results_300_2 <- results_list[[6]] -results_400_1 <- results_list[[7]] -results_400_2 <- results_list[[8]] - -# Stop parallel processing -plan(sequential) # Return to sequential processing after execution -``` - - -## Store each sample size in table -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:300) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all 
results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) -``` - - - -## Graphs of where the x axis is the sample size and the y axis is the empirical standard error - -Set the preliminary observed lambda as half of the minimum incidence rate from the four regions. - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 200) -metrics_200_2 <- calculate_metrics(final_table_200_2, 200) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 300) -metrics_300_2 <- calculate_metrics(final_table_300_2, 300) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 400) -metrics_400_2 <- calculate_metrics(final_table_400_2, 400) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, 
metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -# Plot with color to differentiate age groups -ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, color = Age_Group)) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -``` - -\newpage - -## Table - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -summary_metrics_combined %>% - kable() -``` \ No newline at end of file diff --git a/sim_hpc_sf2a_med.qmd b/sim_hpc_sf2a_med.qmd deleted file mode 100644 index f808dfb..0000000 --- a/sim_hpc_sf2a_med.qmd +++ /dev/null @@ -1,401 +0,0 @@ ---- -title: "Simulation with the Median Lambda for Sf2a IgG" -format: - pdf: - number-sections: true - number-depth: 2 - number-offset: [0, 0] - pdf-engine: pdflatex # <-- Use pdflatex instead of xelatex or lualatex -editor: visual -editor_options: - chunk_output_type: console ---- - -```{r setup, include=FALSE,echo=FALSE} -library(knitr) -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo=FALSE, message=FALSE} -library(tibble) -library(dplyr) -library(serocalculator) -library(haven) -library(knitr) -library(plotly) -library(kableExtra) -library(tidyr) -library(arsenal) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(readxl) -library(purrr) -library(here) -library(table1) -library(furrr) -library(future) -``` - - -# Vary simulation lambdas across the range in the real cross-sectional data - -## 1. 
Estimate the shigella incidence rate using the real cross-sectional Shigella data, -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Load data -load("~/sf2a.RData") -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") -``` - -## 2. Run simulations using: -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) 
- country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. -# Get incidence rate estimates from each region (4 regions) and do median -med_sf2a_IgG<-median(incidence_summary$Incidence_Rate) - -#0.756 - -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_sf2a_IgG<-2*max(incidence_summary$Incidence_Rate) - -# 2.058 - -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_sf2a_IgG<-0.5*min(incidence_summary$Incidence_Rate) - -#0.288 - -# Create a data frame for the incidence rate summary -incidence_rate_summary <- data.frame( - Metric = c("Median Incidence Rate", - "2x Maximum Incidence Rate", - "1/2 Minimum Incidence Rate"), - Value = c(median(incidence_summary$Incidence_Rate), # Median of incidence rates - 2 * max(incidence_summary$Incidence_Rate), # 2x Maximum incidence rate - 0.5 * min(incidence_summary$Incidence_Rate)) # 1/2 Minimum incidence rate -) - -# Print the table -print(incidence_rate_summary) -``` -These three values are preliminary observations of lambda from sf2a_IgG. - -We choose a lambda that is the median of the four regions. 
- -\newpage - -# Simulation (300times) with the smallest lambda -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} - -# Define the simulation function -simulate_seroincidence <- function(nrep, n_sim, observed, range = NULL) { - # Set parallel plan inside function to avoid issues with distributed nodes - plan(multicore) # Use multiple cores for parallel processing (works best on HPC) - - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind("IgG" = c(min = 0, max = 0.5)) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), # Biologic noise (nu) - eps = c(0.25), # Measurement noise (eps) - y.low = c(25), # Low cutoff (llod) - y.high = c(200000) # High cutoff (y.high) - ) - - # Perform simulations in parallel - results <- future_map(1:n_sim, function(i) { - tryCatch({ - # Generate cross-sectional data - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = TRUE, # Use different parameters for each simulation - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = TRUE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - # Return results for this simulation - list(csdata = csdata, est1 = est) - }, error = function(e) { - return(list(error = e$message)) # Capture and store errors instead of stopping execution - }) - }, .options = furrr_options(seed = TRUE)) - - # Ensure sequential processing after function execution - plan(sequential) - - return(results) -} - -# ------------------------ # -# 🔹 Run Simulations in Parallel -# ------------------------ # - -# Set up 
parallel processing (Ensure future::plan() is set before execution) -plan(multicore) # Works best on HPC - -# Define parameter sets -sim_params <- list( - list(nrep = 100, range = c(0, 2)), - list(nrep = 100, range = c(2, 5)), - list(nrep = 200, range = c(0, 2)), - list(nrep = 200, range = c(2, 5)), - list(nrep = 300, range = c(0, 2)), - list(nrep = 300, range = c(2, 5)), - list(nrep = 400, range = c(0, 2)), - list(nrep = 400, range = c(2, 5)) -) - -# Run simulations in a loop to reduce redundant code -results_list <- lapply(sim_params, function(params) { - cat("Running simulation for nrep =", params$nrep, "range =", params$range, "\n") - simulate_seroincidence(nrep = params$nrep, n_sim = 300, observed = med_sf2a_IgG, range = params$range) -}) - -# Assign results to variables -results_100_1 <- results_list[[1]] -results_100_2 <- results_list[[2]] -results_200_1 <- results_list[[3]] -results_200_2 <- results_list[[4]] -results_300_1 <- results_list[[5]] -results_300_2 <- results_list[[6]] -results_400_1 <- results_list[[7]] -results_400_2 <- results_list[[8]] - -# Stop parallel processing -plan(sequential) # Return to sequential processing after execution -``` - - -## Store each sample size in table -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:300) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all 
results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) -``` - - - -## Graphs of where the x axis is the sample size and the y axis is the empirical standard error - -Set the preliminary observed lambda as the median incidence rate across the four regions. - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 200) -metrics_200_2 <- calculate_metrics(final_table_200_2, 200) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 300) -metrics_300_2 <- calculate_metrics(final_table_300_2, 300) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 400) -metrics_400_2 <- calculate_metrics(final_table_400_2, 400) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, 
metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -# Plot with color to differentiate age groups -ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, color = Age_Group)) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -``` - -\newpage - -## Table - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -summary_metrics_combined %>% - kable() -``` \ No newline at end of file diff --git a/sim_hpc_sf3a.qmd b/sim_hpc_sf3a.qmd deleted file mode 100644 index cf6f3f3..0000000 --- a/sim_hpc_sf3a.qmd +++ /dev/null @@ -1,401 +0,0 @@ ---- -title: "Simulation with the Smallest Lambda for Sf3a IgG" -format: - pdf: - number-sections: true - number-depth: 2 - number-offset: [0, 0] - pdf-engine: pdflatex # <-- Use pdflatex instead of xelatex or lualatex -editor: visual -editor_options: - chunk_output_type: console ---- - -```{r setup, include=FALSE,echo=FALSE} -library(knitr) -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo=FALSE, message=FALSE} -library(tibble) -library(dplyr) -library(serocalculator) -library(haven) -library(knitr) -library(plotly) -library(kableExtra) -library(tidyr) -library(arsenal) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(readxl) -library(purrr) -library(here) -library(table1) -library(furrr) -library(future) -``` - - -# Vary simulation lambdas across the range in the real cross-sectional data - -## 1. 
Estimate the shigella incidence rate using the real cross-sectional Shigella data, -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Load data -load("~/sf3a.RData") -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") -``` - -## 2. Run simulations using: -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) 
- country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. -# Get incidence rate estimates from each region (4 regions) and do median -med_sf3a_IgG<-median(incidence_summary$Incidence_Rate) - -#0.756 - -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_sf3a_IgG<-2*max(incidence_summary$Incidence_Rate) - -# 2.058 - -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_sf3a_IgG<-0.5*min(incidence_summary$Incidence_Rate) - -#0.288 - -# Create a data frame for the incidence rate summary -incidence_rate_summary <- data.frame( - Metric = c("Median Incidence Rate", - "2x Maximum Incidence Rate", - "1/2 Minimum Incidence Rate"), - Value = c(median(incidence_summary$Incidence_Rate), # Median of incidence rates - 2 * max(incidence_summary$Incidence_Rate), # 2x Maximum incidence rate - 0.5 * min(incidence_summary$Incidence_Rate)) # 1/2 Minimum incidence rate -) - -# Print the table -print(incidence_rate_summary) -``` -These three values are preliminary observations of lambda from sf3a_IgG. - -We choose a lambda that is half of the minimum lambda from the four regions. 
- -\newpage - -# Simulation (300times) with the smallest lambda -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} - -# Define the simulation function -simulate_seroincidence <- function(nrep, n_sim, observed, range = NULL) { - # Set parallel plan inside function to avoid issues with distributed nodes - plan(multicore) # Use multiple cores for parallel processing (works best on HPC) - - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind("IgG" = c(min = 0, max = 0.5)) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), # Biologic noise (nu) - eps = c(0.25), # Measurement noise (eps) - y.low = c(25), # Low cutoff (llod) - y.high = c(200000) # High cutoff (y.high) - ) - - # Perform simulations in parallel - results <- future_map(1:n_sim, function(i) { - tryCatch({ - # Generate cross-sectional data - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = TRUE, # Use different parameters for each simulation - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = TRUE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - # Return results for this simulation - list(csdata = csdata, est1 = est) - }, error = function(e) { - return(list(error = e$message)) # Capture and store errors instead of stopping execution - }) - }, .options = furrr_options(seed = TRUE)) - - # Ensure sequential processing after function execution - plan(sequential) - - return(results) -} - -# ------------------------ # -# 🔹 Run Simulations in Parallel -# ------------------------ # - -# Set up 
parallel processing (Ensure future::plan() is set before execution) -plan(multicore) # Works best on HPC - -# Define parameter sets -sim_params <- list( - list(nrep = 100, range = c(0, 2)), - list(nrep = 100, range = c(2, 5)), - list(nrep = 200, range = c(0, 2)), - list(nrep = 200, range = c(2, 5)), - list(nrep = 300, range = c(0, 2)), - list(nrep = 300, range = c(2, 5)), - list(nrep = 400, range = c(0, 2)), - list(nrep = 400, range = c(2, 5)) -) - -# Run simulations in a loop to reduce redundant code -results_list <- lapply(sim_params, function(params) { - cat("Running simulation for nrep =", params$nrep, "range =", params$range, "\n") - simulate_seroincidence(nrep = params$nrep, n_sim = 300, observed = min_half_sf3a_IgG, range = params$range) -}) - -# Assign results to variables -results_100_1 <- results_list[[1]] -results_100_2 <- results_list[[2]] -results_200_1 <- results_list[[3]] -results_200_2 <- results_list[[4]] -results_300_1 <- results_list[[5]] -results_300_2 <- results_list[[6]] -results_400_1 <- results_list[[7]] -results_400_2 <- results_list[[8]] - -# Stop parallel processing -plan(sequential) # Return to sequential processing after execution -``` - - -## Store each sample size in table -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:300) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all 
results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) -``` - - - -## Graphs of where the x axis is the sample size and the y axis is the empirical standard error - -Set the preliminary observed lambda as half of the minimum incidence rate from the four regions. - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 200) -metrics_200_2 <- calculate_metrics(final_table_200_2, 200) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 300) -metrics_300_2 <- calculate_metrics(final_table_300_2, 300) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 400) -metrics_400_2 <- calculate_metrics(final_table_400_2, 400) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, 
metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -# Plot with color to differentiate age groups -ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, color = Age_Group)) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -``` - -\newpage - -## Table - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -summary_metrics_combined %>% - kable() -``` \ No newline at end of file diff --git a/sim_hpc_sf3a_med.qmd b/sim_hpc_sf3a_med.qmd deleted file mode 100644 index f7c0c71..0000000 --- a/sim_hpc_sf3a_med.qmd +++ /dev/null @@ -1,401 +0,0 @@ ---- -title: "Simulation with the Median Lambda for Sf3a IgG" -format: - pdf: - number-sections: true - number-depth: 2 - number-offset: [0, 0] - pdf-engine: pdflatex # <-- Use pdflatex instead of xelatex or lualatex -editor: visual -editor_options: - chunk_output_type: console ---- - -```{r setup, include=FALSE,echo=FALSE} -library(knitr) -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo=FALSE, message=FALSE} -library(tibble) -library(dplyr) -library(serocalculator) -library(haven) -library(knitr) -library(plotly) -library(kableExtra) -library(tidyr) -library(arsenal) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(readxl) -library(purrr) -library(here) -library(table1) -library(furrr) -library(future) -``` - - -# Vary simulation lambdas across the range in the real cross-sectional data - -## 1. 
Estimate the shigella incidence rate using the real cross-sectional Shigella data, -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Load data -load("~/sf3a.RData") -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") -``` - -## 2. Run simulations using: -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) 
- country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. -# Get incidence rate estimates from each region (4 regions) and do median -med_sf3a_IgG<-median(incidence_summary$Incidence_Rate) - -#0.756 - -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_sf3a_IgG<-2*max(incidence_summary$Incidence_Rate) - -# 2.058 - -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_sf3a_IgG<-0.5*min(incidence_summary$Incidence_Rate) - -#0.288 - -# Create a data frame for the incidence rate summary -incidence_rate_summary <- data.frame( - Metric = c("Median Incidence Rate", - "2x Maximum Incidence Rate", - "1/2 Minimum Incidence Rate"), - Value = c(median(incidence_summary$Incidence_Rate), # Median of incidence rates - 2 * max(incidence_summary$Incidence_Rate), # 2x Maximum incidence rate - 0.5 * min(incidence_summary$Incidence_Rate)) # 1/2 Minimum incidence rate -) - -# Print the table -print(incidence_rate_summary) -``` -These three values are preliminary observations of lambda from sf3a_IgG. - -We choose a lambda that is the median of the four regions. 
- -\newpage - -# Simulation (300times) with the smallest lambda -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} - -# Define the simulation function -simulate_seroincidence <- function(nrep, n_sim, observed, range = NULL) { - # Set parallel plan inside function to avoid issues with distributed nodes - plan(multicore) # Use multiple cores for parallel processing (works best on HPC) - - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind("IgG" = c(min = 0, max = 0.5)) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), # Biologic noise (nu) - eps = c(0.25), # Measurement noise (eps) - y.low = c(25), # Low cutoff (llod) - y.high = c(200000) # High cutoff (y.high) - ) - - # Perform simulations in parallel - results <- future_map(1:n_sim, function(i) { - tryCatch({ - # Generate cross-sectional data - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = TRUE, # Use different parameters for each simulation - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = TRUE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - # Return results for this simulation - list(csdata = csdata, est1 = est) - }, error = function(e) { - return(list(error = e$message)) # Capture and store errors instead of stopping execution - }) - }, .options = furrr_options(seed = TRUE)) - - # Ensure sequential processing after function execution - plan(sequential) - - return(results) -} - -# ------------------------ # -# 🔹 Run Simulations in Parallel -# ------------------------ # - -# Set up 
parallel processing (Ensure future::plan() is set before execution) -plan(multicore) # Works best on HPC - -# Define parameter sets -sim_params <- list( - list(nrep = 100, range = c(0, 2)), - list(nrep = 100, range = c(2, 5)), - list(nrep = 200, range = c(0, 2)), - list(nrep = 200, range = c(2, 5)), - list(nrep = 300, range = c(0, 2)), - list(nrep = 300, range = c(2, 5)), - list(nrep = 400, range = c(0, 2)), - list(nrep = 400, range = c(2, 5)) -) - -# Run simulations in a loop to reduce redundant code -results_list <- lapply(sim_params, function(params) { - cat("Running simulation for nrep =", params$nrep, "range =", params$range, "\n") - simulate_seroincidence(nrep = params$nrep, n_sim = 300, observed = med_sf3a_IgG, range = params$range) -}) - -# Assign results to variables -results_100_1 <- results_list[[1]] -results_100_2 <- results_list[[2]] -results_200_1 <- results_list[[3]] -results_200_2 <- results_list[[4]] -results_300_1 <- results_list[[5]] -results_300_2 <- results_list[[6]] -results_400_1 <- results_list[[7]] -results_400_2 <- results_list[[8]] - -# Stop parallel processing -plan(sequential) # Return to sequential processing after execution -``` - - -## Store each sample size in table -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:300) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all 
results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) -``` - - - -## Graphs of where the x axis is the sample size and the y axis is the empirical standard error - -Set the preliminary observed lambda as the median incidence rate across the four regions. - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 200) -metrics_200_2 <- calculate_metrics(final_table_200_2, 200) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 300) -metrics_300_2 <- calculate_metrics(final_table_300_2, 300) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 400) -metrics_400_2 <- calculate_metrics(final_table_400_2, 400) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, 
metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -# Plot with color to differentiate age groups -ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, color = Age_Group)) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -``` - -\newpage - -## Table - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -summary_metrics_combined %>% - kable() -``` \ No newline at end of file diff --git a/sim_lambdas_ipab.r b/sim_lambdas_ipab.r deleted file mode 100644 index 7969800..0000000 --- a/sim_lambdas_ipab.r +++ /dev/null @@ -1,335 +0,0 @@ -library(tibble) -library(dplyr) -library(serocalculator) -library(haven) -library(knitr) -library(plotly) -library(kableExtra) -library(tidyr) -library(arsenal) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(readxl) -library(purrr) -library(here) -library(table1) -library(furrr) -library(future) - -### Vary simulation lambdas across the range in the real cross-sectional data - -## 1. Estimate the shigella incidence rate using the real cross-sectional Shigella data, -## separately for each geographic region in the data. 
- -load("~/ipab.RData") - -################################################################################ -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") - - -################################################################################ -## 2. Run simulations using: -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) 
- country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. -# Get incidence rate estimates from each region (4 regions) and do median -med_ipab_IgG<-median(incidence_summary$Incidence_Rate) -#1.188 - -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_ipab_IgG<-2*max(incidence_summary$Incidence_Rate) -# 2.788 - -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_ipab_IgG<-0.5*min(incidence_summary$Incidence_Rate) -#0.185 - -## These three values are preliminary observations of lambda from ipab_IgG. 
- -################################################################################ -################################################################################ -################################################################################ - -## Simulation (200times) -# Define the simulation function - -simulate_seroincidence <- function(nrep, n_sim, observed, range = NULL) { - # Set parallel plan inside function to avoid issues with distributed nodes - plan(multicore) # Use multiple cores for parallel processing (works best on HPC) - - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind("IgG" = c(min = 0, max = 0.5)) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), # Biologic noise (nu) - eps = c(0.25), # Measurement noise (eps) - y.low = c(25), # Low cutoff (llod) - y.high = c(200000) # High cutoff (y.high) - ) - - # Perform simulations in parallel - results <- future_map(1:n_sim, function(i) { - tryCatch({ - # Generate cross-sectional data - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = TRUE, # Use different parameters for each simulation - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = TRUE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - # Return results for this simulation - list(csdata = csdata, est1 = est) - }, error = function(e) { - return(list(error = e$message)) # Capture and store errors instead of stopping execution - }) - }, .options = furrr_options(seed = TRUE)) - - # Ensure sequential processing after function execution - 
plan(sequential) - - return(results) -} - -# ------------------------ # -# 🔹 Run Simulations in Parallel -# ------------------------ # - -# Set up parallel processing (Ensure future::plan() is set before execution) -plan(multicore) # Works best on HPC - -# Define parameter sets -sim_params <- list( - list(nrep = 100, range = c(0, 2)), - list(nrep = 100, range = c(2, 5)), - list(nrep = 200, range = c(0, 2)), - list(nrep = 200, range = c(2, 5)), - list(nrep = 300, range = c(0, 2)), - list(nrep = 300, range = c(2, 5)), - list(nrep = 400, range = c(0, 2)), - list(nrep = 400, range = c(2, 5)) -) - -# Run simulations in a loop to reduce redundant code -results_list <- lapply(sim_params, function(params) { - cat("Running simulation for nrep =", params$nrep, "range =", params$range, "\n") - simulate_seroincidence(nrep = params$nrep, n_sim = 200, observed = min_half_ipab_IgG, range = params$range) -}) - -# Assign results to variables -results_100_1 <- results_list[[1]] -results_100_2 <- results_list[[2]] -results_200_1 <- results_list[[3]] -results_200_2 <- results_list[[4]] -results_300_1 <- results_list[[5]] -results_300_2 <- results_list[[6]] -results_400_1 <- results_list[[7]] -results_400_2 <- results_list[[8]] - -# Stop parallel processing -plan(sequential) # Return to sequential processing after execution - - -## Store each sample size in table -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:200) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append 
to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) - - -## Graphs of where the x axis is the sample size and the y axis is -## the empirical standard error - -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 100) -metrics_200_2 <- calculate_metrics(final_table_200_2, 100) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 100) -metrics_300_2 <- calculate_metrics(final_table_300_2, 100) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 100) -metrics_400_2 <- calculate_metrics(final_table_400_2, 100) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the 
datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -save.image("ipab_results.RData") \ No newline at end of file diff --git a/simulation1.qmd b/simulation1.qmd deleted file mode 100644 index 8ce424e..0000000 --- a/simulation1.qmd +++ /dev/null @@ -1,621 +0,0 @@ ---- -title: "Simulation with the Smallest Lambda for IpaB IgG" -format: - pdf: - number-sections: true - number-depth: 2 - number-offset: [0, 0] -editor: visual -output: - pdf_document: - orientation: landscape -editor_options: - chunk_output_type: console ---- - -```{r setup, include=FALSE,echo=FALSE} -library(knitr) -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo=FALSE, message=FALSE} -library(gridExtra) -library(mgcv) # For advanced GAM smoothing -library(haven) -library(knitr) -library(plotly) -library(kableExtra) -library(tidyr) -library(arsenal) -library(dplyr) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(kableExtra) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(gtsummary) -library(readxl) -library(purrr) -library(serocalculator) -library(runjags) -library(coda) -library(ggmcmc) -library(here) -library(bayesplot) -library(table1) -library(tibble) -library(furrr) -library(dplyr) -``` - - -# Vary simulation lambdas across the range in the real cross-sectional data - -## 1. Estimate the shigella incidence rate using the real cross-sectional Shigella data, -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## separately for each geographic region in the data. 
- -# load shigella data -df <- read_excel("3.8.2024 Compiled Shigella datav2.xlsx", - sheet = "Compiled") - - -# create function for generating specific region data -create_xs_data <- function(df, filter_countries, filter_antigen_iso, value_col) { - df %>% - # First, create/rename columns - mutate( - id = sid, - Country = site_name, - study = study_name, - age = age, - antigen_iso = factor(isotype_name), - value = {{ value_col }} # value_col is specified by the user, e.g. n_ipab_MFI - ) %>% - # Then filter by the desired catchment(s) and antigen iso(s) - filter( - Country %in% filter_countries, - antigen_iso %in% filter_antigen_iso - ) %>% - # Create age categories - mutate( - ageCat = factor(case_when( - age < 5 ~ "<5", - age >= 5 & age <=15 ~ "5-15", - age > 15 ~ "16+" - )) - ) %>% - # Optionally select only the needed columns - select(id, Country, study, age, antigen_iso, value, ageCat) -} - -# Cross-sectional data: MA USA, ipab_IgG -df_xs_USA_ipab_IgG <- create_xs_data( - df, - filter_countries = c("MA USA"), - filter_antigen_iso = c("IgG"), - value_col = n_ipab_MFI -) - -# explicitly rename the age column -df_xs_USA_ipab_IgG <- df_xs_USA_ipab_IgG %>% - rename(age = age) # Ensure it is correctly named - -# If get_age_var() looks for an attribute, you can manually assign it: -attr(df_xs_USA_ipab_IgG , "age_var") <- "age" - -# Similarly, check if the antibody value column is recognized: -get_value_var <- serocalculator:::get_value_var -get_value_var(df_xs_USA_ipab_IgG ) - -# If it returns NULL, assign it: -attr(df_xs_USA_ipab_IgG , "value_var") <- "value" - -# Cross-sectional data: MA USA, ipab_IgA - -# Cross-sectional data: Ghana, ipab_IgG -df_xs_Ghana_ipab_IgG <- create_xs_data( - df, - filter_countries = c("Ghana"), - filter_antigen_iso = c("IgG"), - value_col = n_ipab_MFI -) - -# Cross-sectional data: Ghana, ipab_IgA - -# Cross-sectional data: Niger, ipab_IgG -df_xs_Niger_ipab_IgG <- create_xs_data( - df, - filter_countries = c("Niger"), - filter_antigen_iso 
= c("IgG"), - value_col = n_ipab_MFI -) - -# Cross-sectional data: Niger, ipab_IgA - -# Cross-sectional data: Sierra Leone, ipab_IgG -df_xs_Sierra_ipab_IgG <- create_xs_data( - df, - filter_countries = c("Sierra Leone"), - filter_antigen_iso = c("IgG"), - value_col = n_ipab_MFI -) - -# Cross-sectional data: Sierra Leone, ipab_IgA - -# Create function to clearly define cross-sectional data -prepare_df_for_serocalculator <- function(df, age_col = "age", value_col = "value") { - # Ensure correct column names - df <- df %>% - rename(age = all_of(age_col)) - - # Assign attributes for serocalculator - attr(df, "age_var") <- "age" - attr(df, "value_var") <- value_col - - # Check if serocalculator recognizes attributes - get_value_var <- serocalculator:::get_value_var - detected_value_var <- get_value_var(df) - - if (is.null(detected_value_var)) { - warning("serocalculator did not detect the 'value' column. Check column naming.") - } else { - message("serocalculator recognized 'value' column: ", detected_value_var) - } - - return(df) -} - -# Application -df_xs_USA_ipab_IgG <- prepare_df_for_serocalculator(df_xs_USA_ipab_IgG) -df_xs_Ghana_ipab_IgG<- prepare_df_for_serocalculator(df_xs_Ghana_ipab_IgG) -df_xs_Niger_ipab_IgG<- prepare_df_for_serocalculator(df_xs_Niger_ipab_IgG) -df_xs_Sierra_ipab_IgG<- prepare_df_for_serocalculator(df_xs_Sierra_ipab_IgG) -``` - - - -## Get parameters from longitudinal data -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to filter and manipulate Shigella data -process_shigella_data <- function(data, study_filter, antigen) { - # Filter the data for the specific study - filtered_data <- data %>% - filter(study_name == study_filter) - - # Capture the column name of the antigen - antigen_col <- ensym(antigen) - - # Manipulate and restructure the data - processed_data <- filtered_data %>% - select(isotype_name, sid, timepoint, `Actual day`, !!antigen_col) %>% - mutate( - index_id = sid, - antigen_iso = 
isotype_name, - visit = timepoint, - timeindays = `Actual day`, - result = !!antigen_col - ) %>% - group_by(index_id, antigen_iso) %>% - arrange(visit) %>% - mutate(visit_num = rank(visit, ties.method = "first")) %>% - ungroup() %>% - # Remove rows with NA in timeindays - filter(!is.na(timeindays)) - - return(processed_data) -} - - -dL_clean <- process_shigella_data(data = df, study_filter = "SOSAR", antigen = n_ipab_MFI) - - -# Construct the path to "prep_data.r" using here -prep_data_path <- here::here("R", "prep_data.r") -prep_priors_path <- here::here("R", "prep_priors.R") - -# Source the file to load the prep_data function -source(prep_data_path) -source(prep_priors_path) - -#prepare data for modeline -# Create 5 different longdata -longdata <- prep_data(dL_clean) -priors <- prep_priors(max_antigens = longdata$n_antigen_isos) - - -nchains <- 4; # nr of MC chains to run simultaneously -nadapt <- 1000; # nr of iterations for adaptation -nburnin <- 100; # nr of iterations to use for burn-in -nmc <- 100; # nr of samples in posterior chains -niter <- 200; # nr of iterations for posterior sample -nthin <- round(niter/nmc); # thinning needed to produce nmc from niter - -tomonitor <- c("y0", "y1", "t1", "alpha", "shape"); - -#This handles the seed to reproduce the results -initsfunction <- function(chain){ - stopifnot(chain %in% (1:4)); # max 4 chains allowed... 
- .RNG.seed <- (1:4)[chain]; - .RNG.name <- c("base::Wichmann-Hill","base::Marsaglia-Multicarry", - "base::Super-Duper","base::Mersenne-Twister")[chain]; - return(list(".RNG.seed"=.RNG.seed,".RNG.name"=.RNG.name)); -} - -file.mod <- here::here("inst", "extdata", "model.jags.r") - -set.seed(11325) -jags.post <- run.jags(model = file.mod, - data = c(longdata, priors), - inits = initsfunction, - method = "parallel", - adapt = nadapt, - burnin = nburnin, - thin = nthin, - sample = nmc, - n.chains = nchains, - monitor = tomonitor, - summarise = FALSE) - -mcmc_list <- as.mcmc.list(jags.post) - -mcmc_df <- ggs(mcmc_list) - -wide_predpar_df <- mcmc_df %>% - mutate( - parameter = sub("^(\\w+)\\[.*", "\\1", Parameter), - index_id = as.numeric(sub("^\\w+\\[(\\d+),.*", "\\1", Parameter)), - antigen_iso = as.numeric(sub("^\\w+\\[\\d+,(\\d+).*", "\\1", Parameter)) - ) %>% - mutate( - index_id = factor(index_id, labels = c(unique(dL_clean$index_id), "newperson")), - antigen_iso = factor(antigen_iso, labels = unique(dL_clean$antigen_iso))) %>% - filter(index_id == "newperson") %>% - select(-Parameter) %>% - pivot_wider(names_from = "parameter", values_from="value") %>% - rowwise() %>% - droplevels() %>% - ungroup() %>% - rename(r = shape) - -# Assuming wide_predpar_df is your data frame -curve_params <- wide_predpar_df - -# Set class and attributes for serocalculator -class(curve_params) <- c("curve_params", class(curve_params)) -antigen_isos <- unique(curve_params$antigen_iso) -attr(curve_params, "antigen_isos") <- antigen_isos - -curve_params<-curve_params%>% - mutate( - iter=Iteration)%>% - select(antigen_iso,iter,y0,y1,t1,alpha,r) - -curve_params_shigella<-curve_params -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - 
- return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") -``` - -## 2. Run simulations using: -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_ipab_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) - country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. 
-# Get incidence rate estimates from each region (4 regions) and do median -med_ipab_IgG<-median(incidence_summary$Incidence_Rate) -#1.188 -med_ipab_IgG -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_ipab_IgG<-2*max(incidence_summary$Incidence_Rate) -# 2.788 -max2_ipab_IgG -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_ipab_IgG<-0.5*min(incidence_summary$Incidence_Rate) -#0.185 -min_half_ipab_IgG -``` -These three values are preliminary observations of lambda from ipab_IgG. - -We choose a lambda that is half of the minimum lambda from the four regions. - - -# Simulation (200times) with the smallest lambda -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} - -# Define the simulation function -simulate_seroincidence <- function(nrep, n_sim, observed,range=NULL) { - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind( - - "IgG" = c(min = 0, max = 0.5) - ) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), # Biologic noise (nu) - eps = c(0.25), # Measurement noise (eps) - y.low = c(25), # Low cutoff (llod) - y.high = c(200000) # High cutoff (y.high) - ) - - # Perform simulations in parallel - results <- future_map(1:n_sim, function(i) { - # Generate cross-sectional data - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = TRUE, # Use different parameters for each simulation - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - 
noise_params = cond, - lambda_start = 0.1, - build_graph = TRUE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - # Return results for this simulation - list( - csdata = csdata, - est1 = est - ) - }, .options = furrr_options(seed = TRUE)) - - return(results) -} - -## Generate -# Set up parallel processing with `future` -plan(multisession) # Use multiple sessions for parallelism (local machine) - -# Run the simulations in parallel -set.seed(206251) -results_100_1 <- simulate_seroincidence(nrep = 100, n_sim = 200, - observed=min_half_ipab_IgG,range=c(0,2)) -results_100_2 <- simulate_seroincidence(nrep = 100, n_sim = 200, - observed=min_half_ipab_IgG,range=c(2,5)) -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -set.seed(206252) -results_200_1 <- simulate_seroincidence(nrep = 200, n_sim = 200, - observed=min_half_ipab_IgG,range=c(0,2)) -results_200_2 <- simulate_seroincidence(nrep = 200, n_sim = 200, - observed=min_half_ipab_IgG,range=c(2,5)) - -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -set.seed(206253) -results_300_1 <- simulate_seroincidence(nrep = 300, n_sim = 200, - observed=min_half_ipab_IgG,range=c(0,2)) -results_300_2 <- simulate_seroincidence(nrep = 300, n_sim = 200, - observed=min_half_ipab_IgG,range=c(2,5)) -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -set.seed(206254) -results_400_1 <- simulate_seroincidence(nrep = 400, n_sim = 200, - observed=min_half_ipab_IgG,range=c(0,2)) -results_400_2 <- simulate_seroincidence(nrep = 400, n_sim = 200, - observed=min_half_ipab_IgG,range=c(2,5)) -# Stop parallel processing -plan(sequential) # Return to sequential processing -``` - - -## Store each sample size in table -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- list() - 
- # Loop through each of the 100 results and extract the required columns - for (i in 1:200) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) -``` - - - -## Graphs of where the x axis is the sample size and the y axis is the empirical standard error - -Set the preliminary observed lambda as half of the minimum incidence rate from the four regions. 
- -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 200) -metrics_200_2 <- calculate_metrics(final_table_200_2, 200) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 300) -metrics_300_2 <- calculate_metrics(final_table_300_2, 300) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 400) -metrics_400_2 <- calculate_metrics(final_table_400_2, 400) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -# Plot with color to differentiate age groups -ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, color = Age_Group)) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. 
Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -``` - - diff --git a/simulation2.qmd b/simulation2.qmd deleted file mode 100644 index 73c83c0..0000000 --- a/simulation2.qmd +++ /dev/null @@ -1,651 +0,0 @@ ---- -title: "Simulation with the Smallest Lambda for sf3a IgG" -format: - pdf: - number-sections: true - number-depth: 2 - number-offset: [0, 0] -editor: visual -output: - pdf_document: - orientation: landscape -editor_options: - chunk_output_type: console ---- - -```{r setup, include=FALSE,echo=FALSE} -library(knitr) -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo=FALSE, message=FALSE} -library(future.apply) -library(future) -library(gridExtra) -library(mgcv) # For advanced GAM smoothing -library(haven) -library(knitr) -library(plotly) -library(kableExtra) -library(tidyr) -library(arsenal) -library(dplyr) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(kableExtra) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(gtsummary) -library(readxl) -library(purrr) -library(serocalculator) -library(runjags) -library(coda) -library(ggmcmc) -library(here) -library(bayesplot) -library(table1) -library(tibble) -library(furrr) -library(dplyr) -``` - - -# Vary simulation lambdas across the range in the real cross-sectional data - -## 1. Estimate the shigella incidence rate using the real cross-sectional Shigella data, -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## separately for each geographic region in the data. 
- -# load shigella data -df <- read_excel("3.8.2024 Compiled Shigella datav2.xlsx", - sheet = "Compiled") - - -# create function for generating specific region data -create_xs_data <- function(df, filter_countries, filter_antigen_iso, value_col) { - df %>% - # First, create/rename columns - mutate( - id = sid, - Country = site_name, - study = study_name, - age = age, - antigen_iso = factor(isotype_name), - value = {{ value_col }} # value_col is specified by the user, e.g. n_ipab_MFI - ) %>% - # Then filter by the desired catchment(s) and antigen iso(s) - filter( - Country %in% filter_countries, - antigen_iso %in% filter_antigen_iso - ) %>% - # Create age categories - mutate( - ageCat = factor(case_when( - age < 5 ~ "<5", - age >= 5 & age <=15 ~ "5-15", - age > 15 ~ "16+" - )) - ) %>% - # Optionally select only the needed columns - select(id, Country, study, age, antigen_iso, value, ageCat) -} - -# Cross-sectional data: MA USA, sf3a_IgG -df_xs_USA_sf3a_IgG <- create_xs_data( - df, - filter_countries = c("MA USA"), - filter_antigen_iso = c("IgG"), - value_col = n_sf3aospbsa_MFI -) - -# Cross-sectional data: Ghana, sf3a_IgG -df_xs_Ghana_sf3a_IgG <- create_xs_data( - df, - filter_countries = c("Ghana"), - filter_antigen_iso = c("IgG"), - value_col = n_sf3aospbsa_MFI -) - - -# Cross-sectional data: Niger, sf3a_IgG -df_xs_Niger_sf3a_IgG <- create_xs_data( - df, - filter_countries = c("Niger"), - filter_antigen_iso = c("IgG"), - value_col =n_sf3aospbsa_MFI -) - - -# Cross-sectional data: Sierra Leone, sf3a_IgG -df_xs_Sierra_sf3a_IgG <- create_xs_data( - df, - filter_countries = c("Sierra Leone"), - filter_antigen_iso = c("IgG"), - value_col = n_sf3aospbsa_MFI -) - -# Create function to clearly define cross-sectional data -prepare_df_for_serocalculator <- function(df, age_col = "age", value_col = "value") { - # Ensure correct column names - df <- df %>% - rename(age = all_of(age_col)) - - # Assign attributes for serocalculator - attr(df, "age_var") <- "age" - attr(df, 
"value_var") <- value_col - - # Check if serocalculator recognizes attributes - get_value_var <- serocalculator:::get_value_var - detected_value_var <- get_value_var(df) - - if (is.null(detected_value_var)) { - warning("serocalculator did not detect the 'value' column. Check column naming.") - } else { - message("serocalculator recognized 'value' column: ", detected_value_var) - } - - return(df) -} - -# Application -df_xs_USA_sf3a_IgG <- prepare_df_for_serocalculator(df_xs_USA_sf3a_IgG) -df_xs_Ghana_sf3a_IgG<- prepare_df_for_serocalculator(df_xs_Ghana_sf3a_IgG) -df_xs_Niger_sf3a_IgG<- prepare_df_for_serocalculator(df_xs_Niger_sf3a_IgG) -df_xs_Sierra_sf3a_IgG<- prepare_df_for_serocalculator(df_xs_Sierra_sf3a_IgG) -``` - - - -## Get parameters from longitudinal data -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to filter and manipulate Shigella data -process_shigella_data <- function(data, study_filter, antigen) { - # Filter the data for the specific study - filtered_data <- data %>% - filter(study_name == study_filter) - - # Capture the column name of the antigen - antigen_col <- ensym(antigen) - - # Manipulate and restructure the data - processed_data <- filtered_data %>% - select(isotype_name, sid, timepoint, `Actual day`, !!antigen_col) %>% - mutate( - index_id = sid, - antigen_iso = isotype_name, - visit = timepoint, - timeindays = `Actual day`, - result = !!antigen_col - ) %>% - group_by(index_id, antigen_iso) %>% - arrange(visit) %>% - mutate(visit_num = rank(visit, ties.method = "first")) %>% - ungroup() %>% - # Remove rows with NA in timeindays - filter(!is.na(timeindays)) - - return(processed_data) -} - - -dL_clean <- process_shigella_data(data = df, study_filter = "SOSAR", antigen = n_sf3aospbsa_MFI) - - -# Construct the path to "prep_data.r" using here -prep_data_path <- here::here("R", "prep_data.r") -prep_priors_path <- here::here("R", "prep_priors.R") - -# Source the file to load the prep_data function 
-source(prep_data_path) -source(prep_priors_path) - -#prepare data for modeline -# Create 5 different longdata -longdata <- prep_data(dL_clean) -priors <- prep_priors(max_antigens = longdata$n_antigen_isos) - - -nchains <- 4; # nr of MC chains to run simultaneously -nadapt <- 1000; # nr of iterations for adaptation -nburnin <- 100; # nr of iterations to use for burn-in -nmc <- 100; # nr of samples in posterior chains -niter <- 200; # nr of iterations for posterior sample -nthin <- round(niter/nmc); # thinning needed to produce nmc from niter - -tomonitor <- c("y0", "y1", "t1", "alpha", "shape"); - -#This handles the seed to reproduce the results -initsfunction <- function(chain){ - stopifnot(chain %in% (1:4)); # max 4 chains allowed... - .RNG.seed <- (1:4)[chain]; - .RNG.name <- c("base::Wichmann-Hill","base::Marsaglia-Multicarry", - "base::Super-Duper","base::Mersenne-Twister")[chain]; - return(list(".RNG.seed"=.RNG.seed,".RNG.name"=.RNG.name)); -} - -file.mod <- here::here("inst", "extdata", "model.jags.r") - -set.seed(11325) -jags.post <- run.jags(model = file.mod, - data = c(longdata, priors), - inits = initsfunction, - method = "parallel", - adapt = nadapt, - burnin = nburnin, - thin = nthin, - sample = nmc, - n.chains = nchains, - monitor = tomonitor, - summarise = FALSE) - -mcmc_list <- as.mcmc.list(jags.post) - -mcmc_df <- ggs(mcmc_list) - -wide_predpar_df <- mcmc_df %>% - mutate( - parameter = sub("^(\\w+)\\[.*", "\\1", Parameter), - index_id = as.numeric(sub("^\\w+\\[(\\d+),.*", "\\1", Parameter)), - antigen_iso = as.numeric(sub("^\\w+\\[\\d+,(\\d+).*", "\\1", Parameter)) - ) %>% - mutate( - index_id = factor(index_id, labels = c(unique(dL_clean$index_id), "newperson")), - antigen_iso = factor(antigen_iso, labels = unique(dL_clean$antigen_iso))) %>% - filter(index_id == "newperson") %>% - select(-Parameter) %>% - pivot_wider(names_from = "parameter", values_from="value") %>% - rowwise() %>% - droplevels() %>% - ungroup() %>% - rename(r = shape) - -# 
Assuming wide_predpar_df is your data frame -curve_params <- wide_predpar_df - -# Set class and attributes for serocalculator -class(curve_params) <- c("curve_params", class(curve_params)) -antigen_isos <- unique(curve_params$antigen_iso) -attr(curve_params, "antigen_isos") <- antigen_isos - -curve_params<-curve_params%>% - mutate( - iter=Iteration)%>% - select(antigen_iso,iter,y0,y1,t1,alpha,r) - -curve_params_shigella<-curve_params -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") -``` - -## 2. Run simulations using: -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_sf3a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) 
- country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. -# Get incidence rate estimates from each region (4 regions) and do median -med_sf3a_IgG<-median(incidence_summary$Incidence_Rate) -med_sf3a_IgG -#0.756 - -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_sf3a_IgG<-2*max(incidence_summary$Incidence_Rate) -max2_sf3a_IgG -# 2.058 - -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_sf3a_IgG<-0.5*min(incidence_summary$Incidence_Rate) -min_half_sf3a_IgG -#0.288 -``` -These three values are preliminary observations of lambda from sf3a_IgG. - -We choose a lambda that is half of the minimum lambda from the four regions. 
- - -# Simulation (200times) with the smallest lambda -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} - -# Define the simulation function -simulate_seroincidence <- function(nrep, n_sim, observed, range = NULL, batch_size = 40, parallel = TRUE) { - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind( - "IgG" = c(min = 0, max = 0.5) - ) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), - eps = c(0.25), - y.low = c(25), - y.high = c(200000) - ) - - # Calculate number of batches - n_batches <- ceiling(n_sim / batch_size) - - # Parallel processing: Use future_map for efficiency - if (parallel) { - plan(multisession, workers = max(1, future::availableCores() - 1)) - - results <- future_map(1:n_batches, function(batch) { - # Run simulations within each batch - replicate(batch_size, expr = { - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = FALSE, # Keep the same params for speed - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = FALSE, # Disable visualization for speed - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - list(csdata = csdata, est1 = est) - }, simplify = FALSE) # Keep list structure - }, .options = furrr_options(seed = TRUE)) # Set seed for reproducibility - - # Flatten nested list - results <- unlist(results, recursive = FALSE) - - } else { - # Sequential execution (for debugging or single-core use) - results <- lapply(1:n_sim, function(i) { - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = 
range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = FALSE, - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = FALSE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - list(csdata = csdata, est1 = est) - }) - } - - return(results) -} - -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Run the optimized simulations with n_sim = 200 -set.seed(206251) - -results_100_1 <- simulate_seroincidence(nrep = 100, n_sim = 200, - observed = min_half_sf3a_IgG, range = c(0,2), - batch_size = 40) - -results_100_2 <- simulate_seroincidence(nrep = 100, n_sim = 200, - observed = min_half_sf3a_IgG, range = c(2,5), - batch_size = 40) - -results_200_1 <- simulate_seroincidence(nrep = 200, n_sim = 200, - observed = min_half_sf3a_IgG, range = c(0,2), - batch_size = 40) - -results_200_2 <- simulate_seroincidence(nrep = 200, n_sim = 200, - observed = min_half_sf3a_IgG, range = c(2,5), - batch_size = 40) - -results_300_1 <- simulate_seroincidence(nrep = 300, n_sim = 200, - observed = min_half_sf3a_IgG, range = c(0,2), - batch_size = 40) - -results_300_2 <- simulate_seroincidence(nrep = 300, n_sim = 200, - observed = min_half_sf3a_IgG, range = c(2,5), - batch_size = 40) - -results_400_1 <- simulate_seroincidence(nrep = 400, n_sim = 200, - observed = min_half_sf3a_IgG, range = c(0,2), - batch_size = 40) - -results_400_2 <- simulate_seroincidence(nrep = 400, n_sim = 200, - observed = min_half_sf3a_IgG, range = c(2,5), - batch_size = 40) - -# Stop parallel processing to free memory -plan(sequential) -``` - - -## Store each sample size in table -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- 
list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:200) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) -``` - - - -## Graphs of where the x axis is the sample size and the y axis is the empirical standard error - -Set the preliminary observed lambda as half of the minimum incidence rate from the four regions. 
- -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 200) -metrics_200_2 <- calculate_metrics(final_table_200_2, 200) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 300) -metrics_300_2 <- calculate_metrics(final_table_300_2, 300) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 400) -metrics_400_2 <- calculate_metrics(final_table_400_2, 400) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -# Plot with color to differentiate age groups -ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, color = Age_Group)) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. 
Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -``` - - -## Table - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -summary_metrics_combined %>% - kable() -``` \ No newline at end of file diff --git a/simulation3.qmd b/simulation3.qmd deleted file mode 100644 index da260b2..0000000 --- a/simulation3.qmd +++ /dev/null @@ -1,651 +0,0 @@ ---- -title: "Simulation with the Smallest Lambda for sf2a IgG" -format: - pdf: - number-sections: true - number-depth: 2 - number-offset: [0, 0] -editor: visual -output: - pdf_document: - orientation: landscape -editor_options: - chunk_output_type: console ---- - -```{r setup, include=FALSE,echo=FALSE} -library(knitr) -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo=FALSE, message=FALSE} -library(future.apply) -library(future) -library(gridExtra) -library(mgcv) # For advanced GAM smoothing -library(haven) -library(knitr) -library(plotly) -library(kableExtra) -library(tidyr) -library(arsenal) -library(dplyr) -library(forcats) -library(huxtable) -library(magrittr) -library(parameters) -library(kableExtra) -library(ggplot2) -library(ggeasy) -library(scales) -library(plotly) -library(patchwork) -library(tidyverse) -library(gtsummary) -library(readxl) -library(purrr) -library(serocalculator) -library(runjags) -library(coda) -library(ggmcmc) -library(here) -library(bayesplot) -library(table1) -library(tibble) -library(furrr) -library(dplyr) -``` - - -# Vary simulation lambdas across the range in the real cross-sectional data - -## 1. Estimate the shigella incidence rate using the real cross-sectional Shigella data, -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## separately for each geographic region in the data. 
- -# load shigella data -df <- read_excel("3.8.2024 Compiled Shigella datav2.xlsx", - sheet = "Compiled") - - -# create function for generating specific region data -create_xs_data <- function(df, filter_countries, filter_antigen_iso, value_col) { - df %>% - # First, create/rename columns - mutate( - id = sid, - Country = site_name, - study = study_name, - age = age, - antigen_iso = factor(isotype_name), - value = {{ value_col }} # value_col is specified by the user, e.g. n_ipab_MFI - ) %>% - # Then filter by the desired catchment(s) and antigen iso(s) - filter( - Country %in% filter_countries, - antigen_iso %in% filter_antigen_iso - ) %>% - # Create age categories - mutate( - ageCat = factor(case_when( - age < 5 ~ "<5", - age >= 5 & age <=15 ~ "5-15", - age > 15 ~ "16+" - )) - ) %>% - # Optionally select only the needed columns - select(id, Country, study, age, antigen_iso, value, ageCat) -} - -# Cross-sectional data: MA USA, sf2a_IgG -df_xs_USA_sf2a_IgG <- create_xs_data( - df, - filter_countries = c("MA USA"), - filter_antigen_iso = c("IgG"), - value_col = n_sf2aospbsa_MFI -) - -# Cross-sectional data: Ghana, sf2a_IgG -df_xs_Ghana_sf2a_IgG <- create_xs_data( - df, - filter_countries = c("Ghana"), - filter_antigen_iso = c("IgG"), - value_col = n_sf2aospbsa_MFI -) - - -# Cross-sectional data: Niger, sf2a_IgG -df_xs_Niger_sf2a_IgG <- create_xs_data( - df, - filter_countries = c("Niger"), - filter_antigen_iso = c("IgG"), - value_col =n_sf2aospbsa_MFI -) - - -# Cross-sectional data: Sierra Leone, sf2a_IgG -df_xs_Sierra_sf2a_IgG <- create_xs_data( - df, - filter_countries = c("Sierra Leone"), - filter_antigen_iso = c("IgG"), - value_col = n_sf2aospbsa_MFI -) - -# Create function to clearly define cross-sectional data -prepare_df_for_serocalculator <- function(df, age_col = "age", value_col = "value") { - # Ensure correct column names - df <- df %>% - rename(age = all_of(age_col)) - - # Assign attributes for serocalculator - attr(df, "age_var") <- "age" - attr(df, 
"value_var") <- value_col - - # Check if serocalculator recognizes attributes - get_value_var <- serocalculator:::get_value_var - detected_value_var <- get_value_var(df) - - if (is.null(detected_value_var)) { - warning("serocalculator did not detect the 'value' column. Check column naming.") - } else { - message("serocalculator recognized 'value' column: ", detected_value_var) - } - - return(df) -} - -# Application -df_xs_USA_sf2a_IgG <- prepare_df_for_serocalculator(df_xs_USA_sf2a_IgG) -df_xs_Ghana_sf2a_IgG<- prepare_df_for_serocalculator(df_xs_Ghana_sf2a_IgG) -df_xs_Niger_sf2a_IgG<- prepare_df_for_serocalculator(df_xs_Niger_sf2a_IgG) -df_xs_Sierra_sf2a_IgG<- prepare_df_for_serocalculator(df_xs_Sierra_sf2a_IgG) -``` - - - -## Get parameters from longitudinal data -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to filter and manipulate Shigella data -process_shigella_data <- function(data, study_filter, antigen) { - # Filter the data for the specific study - filtered_data <- data %>% - filter(study_name == study_filter) - - # Capture the column name of the antigen - antigen_col <- ensym(antigen) - - # Manipulate and restructure the data - processed_data <- filtered_data %>% - select(isotype_name, sid, timepoint, `Actual day`, !!antigen_col) %>% - mutate( - index_id = sid, - antigen_iso = isotype_name, - visit = timepoint, - timeindays = `Actual day`, - result = !!antigen_col - ) %>% - group_by(index_id, antigen_iso) %>% - arrange(visit) %>% - mutate(visit_num = rank(visit, ties.method = "first")) %>% - ungroup() %>% - # Remove rows with NA in timeindays - filter(!is.na(timeindays)) - - return(processed_data) -} - - -dL_clean <- process_shigella_data(data = df, study_filter = "SOSAR", antigen = n_sf2aospbsa_MFI) - - -# Construct the path to "prep_data.r" using here -prep_data_path <- here::here("R", "prep_data.r") -prep_priors_path <- here::here("R", "prep_priors.R") - -# Source the file to load the prep_data function 
-source(prep_data_path) -source(prep_priors_path) - -#prepare data for modeline -# Create 5 different longdata -longdata <- prep_data(dL_clean) -priors <- prep_priors(max_antigens = longdata$n_antigen_isos) - - -nchains <- 4; # nr of MC chains to run simultaneously -nadapt <- 1000; # nr of iterations for adaptation -nburnin <- 100; # nr of iterations to use for burn-in -nmc <- 100; # nr of samples in posterior chains -niter <- 200; # nr of iterations for posterior sample -nthin <- round(niter/nmc); # thinning needed to produce nmc from niter - -tomonitor <- c("y0", "y1", "t1", "alpha", "shape"); - -#This handles the seed to reproduce the results -initsfunction <- function(chain){ - stopifnot(chain %in% (1:4)); # max 4 chains allowed... - .RNG.seed <- (1:4)[chain]; - .RNG.name <- c("base::Wichmann-Hill","base::Marsaglia-Multicarry", - "base::Super-Duper","base::Mersenne-Twister")[chain]; - return(list(".RNG.seed"=.RNG.seed,".RNG.name"=.RNG.name)); -} - -file.mod <- here::here("inst", "extdata", "model.jags.r") - -set.seed(11325) -jags.post <- run.jags(model = file.mod, - data = c(longdata, priors), - inits = initsfunction, - method = "parallel", - adapt = nadapt, - burnin = nburnin, - thin = nthin, - sample = nmc, - n.chains = nchains, - monitor = tomonitor, - summarise = FALSE) - -mcmc_list <- as.mcmc.list(jags.post) - -mcmc_df <- ggs(mcmc_list) - -wide_predpar_df <- mcmc_df %>% - mutate( - parameter = sub("^(\\w+)\\[.*", "\\1", Parameter), - index_id = as.numeric(sub("^\\w+\\[(\\d+),.*", "\\1", Parameter)), - antigen_iso = as.numeric(sub("^\\w+\\[\\d+,(\\d+).*", "\\1", Parameter)) - ) %>% - mutate( - index_id = factor(index_id, labels = c(unique(dL_clean$index_id), "newperson")), - antigen_iso = factor(antigen_iso, labels = unique(dL_clean$antigen_iso))) %>% - filter(index_id == "newperson") %>% - select(-Parameter) %>% - pivot_wider(names_from = "parameter", values_from="value") %>% - rowwise() %>% - droplevels() %>% - ungroup() %>% - rename(r = shape) - -# 
Assuming wide_predpar_df is your data frame -curve_params <- wide_predpar_df - -# Set class and attributes for serocalculator -class(curve_params) <- c("curve_params", class(curve_params)) -antigen_isos <- unique(curve_params$antigen_iso) -attr(curve_params, "antigen_isos") <- antigen_isos - -curve_params<-curve_params%>% - mutate( - iter=Iteration)%>% - select(antigen_iso,iter,y0,y1,t1,alpha,r) - -curve_params_shigella<-curve_params -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -## Set noise -create_noise_df <- function(country) { - noise_df <- tibble( - antigen_iso = c("IgG"), - Country = factor(c(country)), # Use input argument for Country - y.low = c(25), - eps = c(0.25), - nu = c(0.5), - y.high = c(200000) - ) - - return(noise_df) -} - -noise_df_USA <- create_noise_df("MA USA") -noise_df_Niger <- create_noise_df("Niger") -noise_df_Sierra <- create_noise_df("Sierra Leone") -noise_df_Ghana <- create_noise_df("Ghana") -``` - -## 2. Run simulations using: -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# MA USA -est_USA <- est.incidence( - pop_data = df_xs_USA_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_USA, - antigen_isos = c("IgG"), -) - -# Ghana -est_Ghana <- est.incidence( - pop_data = df_xs_Ghana_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Ghana, - antigen_isos = c("IgG"), -) - -# Niger -est_Niger <- est.incidence( - pop_data = df_xs_Niger_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Niger, - antigen_isos = c("IgG"), -) - -# Sierra Leone -est_Sierra <- est.incidence( - pop_data = df_xs_Sierra_sf2a_IgG, - curve_params = curve_params_shigella, - noise_params = noise_df_Sierra, - antigen_isos = c("IgG"), -) - -# create table of incidence.rate of each region - -create_incidence_table <- function(...) { - # Capture input objects and their names - est_list <- list(...) 
- country_names <- names(est_list) - - # Extract incidence.rate from the summary() of each estimate object - incidence_rates <- sapply(est_list, function(x) summary(x)$incidence.rate) - - # Create a tidy tibble - incidence_table <- tibble( - Country = country_names, - Incidence_Rate = incidence_rates - ) - - return(incidence_table) -} -# Example usage with four different country estimates: -incidence_summary <- create_incidence_table( - USA = est_USA, - Ghana = est_Ghana, - Niger = est_Niger, - Sierra_Leone = est_Sierra -) - -# Display the summary table -print(incidence_summary) - -# 1) The median of the incidence rate estimates from step 1. -# Get incidence rate estimates from each region (4 regions) and do median -med_sf2a_IgG<-median(incidence_summary$Incidence_Rate) -med_sf2a_IgG -#0.710 - -# 2) 2x the maximum incidence rate estimate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the maximum one and 2x -max2_sf2a_IgG<-2*max(incidence_summary$Incidence_Rate) -max2_sf2a_IgG -# 1.878 - -# 3) 1/2 the minimum incidence rate from step 1 -# Get incidence rate estimates from each region (4 regions) and get the minimum one and 1/2 -min_half_sf2a_IgG<-0.5*min(incidence_summary$Incidence_Rate) -min_half_sf2a_IgG -#0.252 -``` -These three values are preliminary observations of lambda from sf2a_IgG. - -We choose a lambda that is half of the minimum lambda from the four regions. 
- - -# Simulation (200times) with the smallest lambda -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} - -# Define the simulation function -simulate_seroincidence <- function(nrep, n_sim, observed, range = NULL, batch_size = 40, parallel = TRUE) { - # Parameters - dmcmc <- curve_params_shigella # Curve parameters - antibodies <- c("IgG") # Antigen-isotypes - lambda <- observed # Simulated incidence rate per person-year - - # Biologic noise distribution - dlims <- rbind( - "IgG" = c(min = 0, max = 0.5) - ) - - # Noise parameters - cond <- tibble( - antigen_iso = c("IgG"), - nu = c(0.5), - eps = c(0.25), - y.low = c(25), - y.high = c(200000) - ) - - # Calculate number of batches - n_batches <- ceiling(n_sim / batch_size) - - # Parallel processing: Use future_map for efficiency - if (parallel) { - plan(multisession, workers = max(1, future::availableCores() - 1)) - - results <- future_map(1:n_batches, function(batch) { - # Run simulations within each batch - replicate(batch_size, expr = { - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = FALSE, # Keep the same params for speed - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - # Estimate seroincidence - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = FALSE, # Disable visualization for speed - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - list(csdata = csdata, est1 = est) - }, simplify = FALSE) # Keep list structure - }, .options = furrr_options(seed = TRUE)) # Set seed for reproducibility - - # Flatten nested list - results <- unlist(results, recursive = FALSE) - - } else { - # Sequential execution (for debugging or single-core use) - results <- lapply(1:n_sim, function(i) { - csdata <- sim.cs( - curve_params = dmcmc, - lambda = lambda, - n.smpl = nrep, - age_range = 
range, - antigen_isos = antibodies, - n.mc = 0, - renew_params = FALSE, - add.noise = TRUE, - noise_limits = dlims, - format = "long" - ) - - est <- est.incidence( - pop_data = csdata, - curve_params = dmcmc, - noise_params = cond, - lambda_start = 0.1, - build_graph = FALSE, - verbose = FALSE, - print_graph = FALSE, - antigen_isos = antibodies - ) - - list(csdata = csdata, est1 = est) - }) - } - - return(results) -} - -``` - -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Run the optimized simulations with n_sim = 200 -set.seed(206251) - -results_100_1 <- simulate_seroincidence(nrep = 100, n_sim = 200, - observed = min_half_sf2a_IgG, range = c(0,2), - batch_size = 40) - -results_100_2 <- simulate_seroincidence(nrep = 100, n_sim = 200, - observed = min_half_sf2a_IgG, range = c(2,5), - batch_size = 40) - -results_200_1 <- simulate_seroincidence(nrep = 200, n_sim = 200, - observed = min_half_sf2a_IgG, range = c(0,2), - batch_size = 40) - -results_200_2 <- simulate_seroincidence(nrep = 200, n_sim = 200, - observed = min_half_sf2a_IgG, range = c(2,5), - batch_size = 40) - -results_300_1 <- simulate_seroincidence(nrep = 300, n_sim = 200, - observed = min_half_sf2a_IgG, range = c(0,2), - batch_size = 40) - -results_300_2 <- simulate_seroincidence(nrep = 300, n_sim = 200, - observed = min_half_sf2a_IgG, range = c(2,5), - batch_size = 40) - -results_400_1 <- simulate_seroincidence(nrep = 400, n_sim = 200, - observed = min_half_sf2a_IgG, range = c(0,2), - batch_size = 40) - -results_400_2 <- simulate_seroincidence(nrep = 400, n_sim = 200, - observed = min_half_sf2a_IgG, range = c(2,5), - batch_size = 40) - -# Stop parallel processing to free memory -plan(sequential) -``` - - -## Store each sample size in table -```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} -# Define a function to generate final tables -generate_final_table <- function(results_list, sample_size) { - # Initialize an empty list to store the results - summary_results <- 
list() - - # Loop through each of the 100 results and extract the required columns - for (i in 1:200) { - # Extract the summary for each result - result_summary <- summary(results_list[[i]]$est1) - - # Select the required columns - extracted_columns <- result_summary %>% - select(incidence.rate, SE, CI.lwr, CI.upr) - - # Add a column for the index (optional, for tracking) - extracted_columns <- extracted_columns %>% - mutate(index = i) - - # Append to the list - summary_results[[i]] <- extracted_columns - } - - # Combine all results into a single data frame - final_table <- bind_rows(summary_results) %>% - mutate(sample_size = sample_size) # Add sample size column for clarity - - return(final_table) -} - -# Example usage for different sample sizes -final_table_100_1 <- generate_final_table(results_100_1, 100) -final_table_100_2 <- generate_final_table(results_100_2, 100) - -final_table_200_1 <- generate_final_table(results_200_1, 200) -final_table_200_2 <- generate_final_table(results_200_2, 200) - -final_table_300_1 <- generate_final_table(results_300_1, 300) -final_table_300_2 <- generate_final_table(results_300_2, 300) - -final_table_400_1 <- generate_final_table(results_400_1, 400) -final_table_400_2 <- generate_final_table(results_400_2, 400) -``` - - - -## Graphs of where the x axis is the sample size and the y axis is the empirical standard error - -Set the preliminary observed lambda as half of the minimum incidence rate from the four regions. 
- -```{r,echo=FALSE, message=FALSE, warning=FALSE} -# Function to calculate metrics for each table -calculate_metrics <- function(data, sample_size) { - empirical_se <- sd(data$incidence.rate) - - - # Return a data frame with results - data.frame( - sample_size = sample_size, - empirical_se = empirical_se - - ) -} - -# Apply the function to each table -metrics_100_1 <- calculate_metrics(final_table_100_1, 100) -metrics_100_2 <- calculate_metrics(final_table_100_2, 100) - -metrics_200_1 <- calculate_metrics(final_table_200_1, 200) -metrics_200_2 <- calculate_metrics(final_table_200_2, 200) - -metrics_300_1 <- calculate_metrics(final_table_300_1, 300) -metrics_300_2 <- calculate_metrics(final_table_300_2, 300) - -metrics_400_1 <- calculate_metrics(final_table_400_1, 400) -metrics_400_2 <- calculate_metrics(final_table_400_2, 400) - - -# Combine the results into a single summary table -summary_metrics_1 <- bind_rows(metrics_100_1, metrics_200_1, metrics_300_1, - metrics_400_1) - - -summary_metrics_2 <- bind_rows(metrics_100_2, metrics_200_2, metrics_300_2, - metrics_400_2) - -# Add a column to distinguish the datasets -summary_metrics_1 <- summary_metrics_1 %>% - mutate(Age_Group = "Age 0-2") - -summary_metrics_2 <- summary_metrics_2 %>% - mutate(Age_Group = "Age 2-5") - -# Combine both datasets into one -summary_metrics_combined <- bind_rows(summary_metrics_1, summary_metrics_2) - -# Plot with color to differentiate age groups -ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, color = Age_Group)) + - geom_line() + - geom_point() + - labs( - title = "Empirical Standard Error vs. 
Sample Size", - x = "Sample Size", - y = "Empirical Standard Error", - color = "Age Group" - ) + - theme_minimal() -``` - - -## Table - -```{r,echo=FALSE, message=FALSE, warning=FALSE} -summary_metrics_combined %>% - kable() -``` \ No newline at end of file diff --git a/vignettes/articles/_sec_outtakes-data-load.qmd b/vignettes/articles/_sec_outtakes-data-load.qmd index 9e8e839..682df16 100644 --- a/vignettes/articles/_sec_outtakes-data-load.qmd +++ b/vignettes/articles/_sec_outtakes-data-load.qmd @@ -1,73 +1,153 @@ +--- +title: Summary of Incidence Rates Across Cross-Sectional Data Regions +--- ```{r} -# Cross-sectional data: MA USA, ipab_IgG -df_xs_USA_ipab_IgG <- create_xs_data( - df, - filter_countries = c("MA USA"), - filter_antigen_iso = c("IgG"), - value_col = n_ipab_MFI -) +#| label: setup +#| message: false +#| include: false +#| echo: false -# Cross-sectional data: MA USA, ipab_IgA +library(knitr) +library(future.apply) +library(future) +library(gridExtra) +library(mgcv) # For advanced GAM smoothing +library(haven) +library(knitr) +library(plotly) +library(kableExtra) +library(tidyr) +library(arsenal) +library(dplyr) +library(forcats) +library(huxtable) +library(magrittr) +library(parameters) +library(kableExtra) +library(ggplot2) +library(ggeasy) +library(scales) +library(patchwork) +library(tidyverse) +library(gtsummary) +library(readxl) +library(purrr) +library(serocalculator) +library(serodynamics) +library(runjags) +library(coda) +library(ggmcmc) +library(here) +library(bayesplot) +library(table1) +library(tibble) +library(furrr) +library(dplyr) +library(shigella) +devtools::load_all() +``` -# Cross-sectional data: Ghana, ipab_IgG -df_xs_Ghana_ipab_IgG <- create_xs_data( - df, - filter_countries = c("Ghana"), - filter_antigen_iso = c("IgG"), - value_col = n_ipab_MFI -) +# Load the dataset, extract cross-sectional data from a specific region -# Cross-sectional data: Ghana, ipab_IgA +## Load the dataset +```{r,echo=FALSE, message=FALSE, 
warning=FALSE,results='hide'} +#| label: "load shigella data" +df <- + fs::path_package( + package = "shigella", + "extdata/3.8.2024 Compiled Shigella datav2.xlsx" + ) |> + read_excel(sheet = "Compiled") +``` -# Cross-sectional data: Niger, ipab_IgG -df_xs_Niger_ipab_IgG <- create_xs_data( - df, - filter_countries = c("Niger"), - filter_antigen_iso = c("IgG"), - value_col = n_ipab_MFI -) +```{r} +df_xs_strat <- + df |> + rename(antigen_iso = isotype_name, + Country = site_name) |> + filter(Country != "Dhaka") |> + as_pop_data( + antigen_isos = "IgG", + value = "n_ipab_MFI", ## Choose an antigen + age = "age", + id = "sid" + ) +``` + +## Create table of incidence.rate of each region + +```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} +#| label: "Load the simulated longitudinal dataset" +data("curve_params_shigella_ipab", package = "shigella") +``` + + +```{r} +# Generate noise for each region +noise_df_USA <- create_noise_df("MA USA") +noise_df_Niger <- create_noise_df("Niger") +noise_df_Sierra <- create_noise_df("Sierra Leone") +noise_df_Ghana <- create_noise_df("Ghana") + +noise_df_strat <- + bind_rows(noise_df_USA, + noise_df_Niger, + noise_df_Sierra, + noise_df_Ghana) |> + serocalculator::as_noise_params() +``` -# Cross-sectional data: Niger, ipab_IgA -# Cross-sectional data: Sierra Leone, ipab_IgG -df_xs_Sierra_ipab_IgG <- create_xs_data( - df, - filter_countries = c("Sierra Leone"), - filter_antigen_iso = c("IgG"), - value_col = n_ipab_MFI +```{r,echo=FALSE, message=FALSE, warning=FALSE} + +#| label: tbl-ests-strat +#| tbl-cap: "Stratified estimates of shigella: IpaB" + +ests_strat <- est.incidence.by( + pop_data = df_xs_strat, + curve_param = curve_params_shigella_ipab, + noise_params = noise_df_strat, + strata = "Country", + noise_strata_varnames = "Country", + curve_strata_varnames = NULL, + antigen_isos = "IgG", + num_cores = 2, + verbose = FALSE ) -# Cross-sectional data: Sierra Leone, ipab_IgA +incidence_summary <- summary(ests_strat) -# 
Application -df_xs_USA_ipab_IgG <- - df_xs_USA_ipab_IgG |> - as_pop_data( - value = "value", - id = "id", - age = "age" - ) -df_xs_Ghana_ipab_IgG <- - df_xs_Ghana_ipab_IgG |> - as_pop_data( - value = "value", - id = "id", - age = "age" - ) -df_xs_Niger_ipab_IgG <- - df_xs_Niger_ipab_IgG |> - as_pop_data( - value = "value", - id = "id", - age = "age" - ) -df_xs_Sierra_ipab_IgG <- - df_xs_Sierra_ipab_IgG |> - as_pop_data( - value = "value", - id = "id", - age = "age" +# Select only the Country and incidence.rate columns to generate the table +incidence_summary %>% + select(Country, incidence.rate) %>% + kable(caption = "Seroincidence Rates by Country") +``` + +```{r} +#| label: tbl-ests-stat +#| tbl-cap: "Descriptive statistics of shigella: IpaB" + +# Calculate median, 2x max, and 1/2 min in a single step +incidence_stats <- incidence_summary %>% + summarise( + Median = median(incidence.rate), + Max2 = 2 * max(incidence.rate), + MinHalf = 0.5 * min(incidence.rate) ) +# Convert to long format for the summary table +incidence_rate_summary <- incidence_stats %>% + pivot_longer(cols = everything(), names_to = "Metric", values_to = "Value") %>% + mutate(Metric = recode(Metric, + "Median" = "Median Incidence Rate", + "Max2" = "2x Maximum Incidence Rate", + "MinHalf" = "1/2 Minimum Incidence Rate")) + +kable(incidence_rate_summary, caption = "Incidence Rate Summary") ``` + +```{r} +incidence_stats_ipab<-incidence_stats +usethis::use_data(incidence_stats_ipab, overwrite = TRUE) # Saving this .rda file for use in the last .qmd file +``` \ No newline at end of file diff --git a/vignettes/articles/shigella-kinetics.qmd b/vignettes/articles/shigella-kinetics.qmd index 0df4df6..0a700dc 100644 --- a/vignettes/articles/shigella-kinetics.qmd +++ b/vignettes/articles/shigella-kinetics.qmd @@ -44,12 +44,13 @@ library(table1) library(tibble) library(furrr) library(dplyr) -# library(shigella) +library(shigella) devtools::load_all() ``` +# Load the dataset, extract cross-sectional data 
from a specific region, and generate simulated longitudinal data for specific antigen-isotypes. - +## Load the dataset ```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} #| label: "load shigella data" df <- @@ -60,7 +61,11 @@ df <- read_excel(sheet = "Compiled") ``` + + +I think the commented-out section above is not necessary in this .qmd file. I would like to generate simulated data from the longitudinal data for this .qmd file only. The section above will be handled in the _sec_outtakes-data-load.qmd file. +## Generate simulated longitudinal data (choose your antigen) ```{r,echo=FALSE, message=FALSE, warning=FALSE,results='hide'} # Define a function to filter and manipulate Shigella data dL_clean <- process_shigella_data( data = df, study_filter = "SOSAR", - antigen = n_ipab_MFI + antigen = n_ipab_MFI ### Modify this input to use different antigens. ) |> as_case_data( id_var = "index_id", @@ -150,10 +198,19 @@ jags.post <- run.jags( summarise = FALSE ) -curve_params_shigella <- jags.post |> postprocess_jags_output() -usethis::use_data(curve_params_shigella, overwrite = TRUE) + +#curve_params_shigella <- jags.post |> postprocess_jags_output() #somehow it doesn't work + +# Use Sam's package rather than my current R function +curve_params_shigella_ipab <- jags.post |> serodynamics::postprocess_jags_output( + ids = attr(jags_prepped_data, "ids"), + antigen_isos = attr(jags_prepped_data, "antigens") +) + +usethis::use_data(curve_params_shigella_ipab, overwrite = TRUE) ``` +## Exploring the simulated longitudinal data ```{r} curve_params_shigella |> @@ -165,3 +222,4 @@ curve_params_shigella |> autoplot() ``` + diff --git a/vignettes/articles/simulation-shigella.qmd b/vignettes/articles/simulation-shigella.qmd new file mode 100644 index 0000000..6ac0f37 --- /dev/null +++ b/vignettes/articles/simulation-shigella.qmd @@ -0,0 +1,157 @@ +--- +title: "Estimating Optimal Sample Size Through 300 Simulations: IpaB" +--- + +```{r} +#| label:
setup +#| message: false +#| include: false +#| echo: false + +library(knitr) +library(future.apply) +library(future) +library(gridExtra) +library(mgcv) # For advanced GAM smoothing +library(haven) +library(knitr) +library(plotly) +library(kableExtra) +library(tidyr) +library(arsenal) +library(dplyr) +library(forcats) +library(huxtable) +library(magrittr) +library(parameters) +library(kableExtra) +library(ggplot2) +library(ggeasy) +library(scales) +library(patchwork) +library(tidyverse) +library(gtsummary) +library(readxl) +library(purrr) +library(serocalculator) +library(serodynamics) +library(runjags) +library(coda) +library(ggmcmc) +library(here) +library(bayesplot) +library(table1) +library(tibble) +library(furrr) +library(dplyr) +#library(shigella) +devtools::load_all() +``` + +```{r} +data("curve_params_shigella_ipab", package = "shigella") +data("incidence_stats_ipab", package = "shigella") +``` + + +# Simulation (300 iterations) with the smallest lambda (Requires HPC) +```{r} +plan(multicore) # Works best on HPC + +# Define parameter sets +sim_params <- list( + list(nrep = 100, range = c(0, 2)), + list(nrep = 100, range = c(2, 5)), + list(nrep = 200, range = c(0, 2)), + list(nrep = 200, range = c(2, 5)), + list(nrep = 300, range = c(0, 2)), + list(nrep = 300, range = c(2, 5)), + list(nrep = 400, range = c(0, 2)), + list(nrep = 400, range = c(2, 5)) +) + +# You probably need to use the HPC to run the code below to speed it up +results_list <- lapply(sim_params, function(params) { + cat("Running simulation for nrep =", params$nrep, "range =", params$range, "\n") + simulate_seroincidence2(nrep = params$nrep, n_sim = 300, observed = incidence_stats_ipab$MinHalf, range = params$range) ### Choose the observed value based on the lambda you want to simulate +}) + +# Stop parallel processing +plan(sequential) # Return to sequential processing after execution +``` +The simulation above should be run on the HPC rather than on a local computer. 
+ +## Generate the final dataset for IpaB with the smallest lambda +```{r} +# Assign results to variables +results_100_1 <- results_list[[1]] +results_100_2 <- results_list[[2]] +results_200_1 <- results_list[[3]] +results_200_2 <- results_list[[4]] +results_300_1 <- results_list[[5]] +results_300_2 <- results_list[[6]] +results_400_1 <- results_list[[7]] +results_400_2 <- results_list[[8]] + +# For different sample sizes +final_table_100_1 <- generate_final_table(results_100_1, 100) +final_table_100_2 <- generate_final_table(results_100_2, 100) +final_table_200_1 <- generate_final_table(results_200_1, 200) +final_table_200_2 <- generate_final_table(results_200_2, 200) +final_table_300_1 <- generate_final_table(results_300_1, 300) +final_table_300_2 <- generate_final_table(results_300_2, 300) +final_table_400_1 <- generate_final_table(results_400_1, 400) +final_table_400_2 <- generate_final_table(results_400_2, 400) +``` + +```{r} +# Prepare Data and Parameters +# List of all final tables +final_tables <- list(final_table_100_1, final_table_100_2, + final_table_200_1, final_table_200_2, + final_table_300_1, final_table_300_2, + final_table_400_1, final_table_400_2) + +# Corresponding sample sizes +sample_sizes <- rep(c(100, 200, 300, 400), each = 2) +# Corresponding age groups +age_groups <- rep(c("Age 0-2", "Age 2-5"), times = 4) + +# Calculate Metrics in a Single Step +summary_metrics_combined <- pmap_dfr( + list(final_tables, sample_sizes, age_groups), + calculate_metrics +) + +# Display the summary_metrics_combined as a table +kable(summary_metrics_combined, + caption = "Summary of Empirical Standard Errors by Sample Size and Age Group", + digits = 3, # Rounds numerical values to 3 decimal places + align = "c") # Center align all columns +``` + + +## Plot with color to differentiate age groups +```{r} +#| fig-cap: "IpaB-IgG with the Smallest Lambda" +#| label: fig-se-IpaB-IgG-min-lambda +ggplot(summary_metrics_combined, aes(x = sample_size, y = empirical_se, 
color = Age_Group)) + + geom_line() + + geom_point() + + labs( + x = "Sample Size", + y = "Empirical Standard Error", + color = "Age Group" + ) + + theme_minimal() +``` + +## Table: Smallest Lambda for IpaB-IgG +```{r} +summary_metrics_combined %>% + kable() +``` + +# Simulation (300 iterations) with the median lambda (Requires HPC) + +This part follows the exact same procedure as above, except for changing one input variable from the smallest lambda to the median lambda. \ No newline at end of file