diff --git a/.DS_Store b/.DS_Store index 0fded88..9a4ff71 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.Rhistory b/.Rhistory index 97d322f..91d6d6d 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,512 +1,512 @@ -"PSM")), -Validation_Dataset = str_remove(Validation_Dataset, "_val"), -Validation_Dataset = forcats::fct_recode(Validation_Dataset, -"PSM" = "raw", -"MI with Y" = "MIwithY", -"MI no Y" = "MInoY", -"Mean/Mode" = "mean_imputed"), -Validation_Dataset = forcats::fct_relevel(Validation_Dataset, -c("CCA", -"Mean/Mode", -"RI", -"MI with Y", -"MI no Y", -"PSM"))) %>% -rename("Target" = name) %>% -ggplot(aes(x = bias_mean, -y = Validation_Dataset, -color = Metric)) + -geom_point(aes(shape = Metric)) + -geom_errorbar(aes(xmin = bias_lower, -xmax = bias_upper), width=.1) + -scale_color_brewer(palette = "Set1") + -xlab("Bias") + -ylab("Validation Data Imputation Method") + -geom_vline(data = data.frame("Metric" = c("AUC", -"Brier Score", -"Calibration Intercept", -"Calibration Slope"), -"bias_mean" = c(0,0,0,0)), -aes(xintercept = bias_mean), -linetype = "dashed") + -ggh4x::facet_grid2(CPM ~ Target, scales = "fixed") + -ggtitle("Targetted Performance ") + -theme_minimal(base_size = 12) + -theme(legend.position = "bottom", -panel.background = element_rect(fill = "gray90"), -panel.spacing.x = unit(0.5, "lines"), -axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), -plot.title = element_text(size = 14, hjust = 0.5), -panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) -##plot results from the empirical study -Bootstrap_InternalValidation %>% -left_join(Bootstrap_InternalValidation %>% -filter(Validation_Dataset == "mean_imputed_val") %>% -select(-Validation_Dataset) %>% -rename("MeanMode_est" = "est", -"MeanMode_var" = "var"), -by = c("Bootstrap_Index", "CPM", "Metric")) %>% -left_join(Bootstrap_InternalValidation %>% -filter(Validation_Dataset == "RI_val") %>% -select(-Validation_Dataset) %>% -rename("RI_est" = "est", -"RI_var" = "var"), -by = c("Bootstrap_Index", "CPM", "Metric")) %>% -left_join(Bootstrap_InternalValidation %>% -filter(Validation_Dataset == "MInoY_val") %>% -select(-Validation_Dataset) %>% -rename("MInoY_est" = "est", -"MInoY_var" = "var"), -by = c("Bootstrap_Index", "CPM", "Metric")) %>% -left_join(Bootstrap_InternalValidation %>% -filter(Validation_Dataset == "raw_val") %>% -select(-Validation_Dataset)%>% -rename("PSM_est" = "est", -"PSM_var" = "var"), -by = c("Bootstrap_Index", "CPM", "Metric")) %>% -select(Bootstrap_Index, CPM, Validation_Dataset, Metric, contains("est")) %>% -pivot_longer(cols = ends_with("_est")) %>% -mutate(bias = est - value) %>% -group_by(CPM, Validation_Dataset, Metric, name) %>% -summarise(bias_mean = mean(bias, na.rm = TRUE), -bias_lower = quantile(bias, 0.025, na.rm = TRUE), -bias_upper = quantile(bias, 0.975, na.rm = TRUE), -.groups = "drop") %>% -mutate(name = str_remove(name, "_est"), -name = forcats::fct_relevel(name, -c("MeanMode", "RI", "MInoY", "PSM")), -name = forcats::fct_recode(name, -"Targetting\n MI-no Y Performance" = "MInoY", -"Targetting\n Mean/Mode Performance" = "MeanMode", -"Targetting\n PSM Performance" = "PSM", -"Targetting\n RI Performance" = "RI"), -CPM = str_remove(CPM, "PR_"), -CPM = forcats::fct_recode(CPM, -"CCA\n developed CPM" = "CCA", -"RI\n developed CPM" = "RI", -"PSM\n developed CPM" = "patternsubmodel", -"MI with Y\n developed CPM" = "MIwithY", -"MI no Y\n developed CPM" = "MInoY", -"Mean/Mode\n developed CPM" = "meanimputed"), -CPM = forcats::fct_relevel(CPM, -c("CCA\n developed CPM", -"Mean/Mode\n developed CPM", -"RI\n developed CPM", -"MI with Y\n developed CPM", -"MI no Y\n developed CPM", -"PSM\n developed CPM")), -Validation_Dataset = str_remove(Validation_Dataset, "_val"), -Validation_Dataset = forcats::fct_recode(Validation_Dataset, -"PSM" = "raw", -"MI with Y" = "MIwithY", -"MI no Y" = "MInoY", -"Mean/Mode" = "mean_imputed"), -Validation_Dataset = forcats::fct_relevel(Validation_Dataset, -c("CCA", -"Mean/Mode", -"RI", -"MI with Y", -"MI no Y", -"PSM"))) %>% -rename("Target" = name) %>% -ggplot(aes(x = bias_mean, -y = Validation_Dataset, -color = Metric)) + -geom_point(aes(shape = Metric)) + -geom_errorbar(aes(xmin = bias_lower, -xmax = bias_upper), width=.1) + -scale_color_brewer(palette = "Set1") + -xlab("Bias") + -ylab("Validation Data Imputation Method") + -geom_vline(data = data.frame("Metric" = c("AUC", -"Brier Score", -"Calibration Intercept", -"Calibration Slope"), -"bias_mean" = c(0,0,0,0)), -aes(xintercept = bias_mean), -linetype = "dashed") + -ggh4x::facet_grid2(CPM ~ Target, scales = "fixed") + -ggtitle("Targetted Performance ") + -theme_minimal(base_size = 12) + -theme(legend.position = "bottom", -panel.background = element_rect(fill = "gray90"), -panel.spacing.x = unit(0.5, "lines"), -axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), -plot.title = element_text(size = 14, hjust = 0.5), -panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) -##plot results from the empirical study -Bootstrap_InternalValidation %>% -left_join(Bootstrap_InternalValidation %>% -filter(Validation_Dataset == "mean_imputed_val") %>% -select(-Validation_Dataset) %>% -rename("MeanMode_est" = "est", -"MeanMode_var" = "var"), -by = c("Bootstrap_Index", "CPM", "Metric")) %>% -left_join(Bootstrap_InternalValidation %>% -filter(Validation_Dataset == "RI_val") %>% -select(-Validation_Dataset) %>% -rename("RI_est" = "est", -"RI_var" = "var"), -by = c("Bootstrap_Index", "CPM", "Metric")) %>% -left_join(Bootstrap_InternalValidation %>% -filter(Validation_Dataset == "MInoY_val") %>% -select(-Validation_Dataset) %>% -rename("MInoY_est" = "est", -"MInoY_var" = "var"), -by = c("Bootstrap_Index", "CPM", "Metric")) %>% -left_join(Bootstrap_InternalValidation %>% -filter(Validation_Dataset == "raw_val") %>% -select(-Validation_Dataset)%>% -rename("PSM_est" = "est", -"PSM_var" = "var"), -by = c("Bootstrap_Index", "CPM", "Metric")) %>% -select(Bootstrap_Index, CPM, Validation_Dataset, Metric, contains("est")) %>% -pivot_longer(cols = ends_with("_est")) %>% -mutate(bias = est - value) %>% -group_by(CPM, Validation_Dataset, Metric, name) %>% -summarise(bias_mean = mean(bias, na.rm = TRUE), -bias_lower = quantile(bias, 0.025, na.rm = TRUE), -bias_upper = quantile(bias, 0.975, na.rm = TRUE), -.groups = "drop") %>% -mutate(name = str_remove(name, "_est"), -name = forcats::fct_relevel(name, -c("MeanMode", "RI", "MInoY", "PSM")), -name = forcats::fct_recode(name, -"Targetting\n MI-no Y Performance" = "MInoY", -"Targetting\n Mean/Mode Performance" = "MeanMode", -"Targetting\n PSM Performance" = "PSM", -"Targetting\n RI Performance" = "RI"), -CPM = str_remove(CPM, "PR_"), -CPM = forcats::fct_recode(CPM, -"CCA\n developed CPM" = "CCA", -"RI\n developed CPM" = "RI", -"PSM\n developed CPM" = "patternsubmodel", -"MI with Y\n developed CPM" = "MIwithY", -"MI no Y\n developed CPM" = "MInoY", -"Mean/Mode\n developed CPM" = "meanimputed"), -CPM = forcats::fct_relevel(CPM, -c("CCA\n developed CPM", -"Mean/Mode\n developed CPM", -"RI\n developed CPM", -"MI with Y\n developed CPM", -"MI no Y\n developed CPM", -"PSM\n developed CPM")), -Validation_Dataset = str_remove(Validation_Dataset, "_val"), -Validation_Dataset = forcats::fct_recode(Validation_Dataset, -"PSM" = "raw", -"MI with Y" = "MIwithY", -"MI no Y" = "MInoY", -"Mean/Mode" = "mean_imputed"), -Validation_Dataset = forcats::fct_relevel(Validation_Dataset, -c("CCA", -"Mean/Mode", -"RI", -"MI with Y", -"MI no Y", -"PSM"))) %>% -rename("Target" = name) %>% -ggplot(aes(x = bias_mean, -y = Validation_Dataset, -color = Metric)) + -geom_point(aes(shape = Metric)) + -geom_errorbar(aes(xmin = bias_lower, -xmax = bias_upper), width=.1) + -scale_color_brewer(palette = "Set1") + -xlab("Bias") + -ylab("Validation Data Imputation Method") + -geom_vline(data = data.frame("Metric" = c("AUC", -"Brier Score", -"Calibration Intercept", -"Calibration Slope"), -"bias_mean" = c(0,0,0,0)), -aes(xintercept = bias_mean), -linetype = "dashed") + -ggh4x::facet_grid2(CPM ~ Target, scales = "fixed") + -ggtitle("Targetted Performance ") + -theme_minimal(base_size = 12) + -theme(legend.position = "bottom", -panel.background = element_rect(fill = "gray90"), -panel.spacing.x = unit(0.5, "lines"), -axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), -plot.title = element_text(size = 14, hjust = 0.5), -panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) -ggsave(file = here::here("Manuscript", "Fig6.tiff")) -ggsave(file = here::here("Manuscript", "Fig6.tiff"), height = 10, width = 10, dpi = 300) +source("Code/00_Simulation_Functions.R") +#run the simulation study: +simulation_results <- simulation_nrun_fnc(n_iter = 100, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[s], +X_categorical = sims_parameters$X_categorical[s], +R_prev = sims_parameters$R_prev[s], +beta_x1_dev = sims_parameters$beta_x1_dev[s], +beta_x2_dev = sims_parameters$beta_x2_dev[s], +beta_Y_dev = sims_parameters$beta_Y_dev[s], +beta_x1_val = sims_parameters$beta_x1_val[s], +beta_x2_val = sims_parameters$beta_x2_val[s], +beta_Y_val = sims_parameters$beta_Y_val[s], +rho_X = sims_parameters$rho_X[s], +gamma_x1 = sims_parameters$gamma_x1[s], +gamma_x2 = sims_parameters$gamma_x2[s]) +#extract each imputed version of the validation set +val_imputed_datasets <- imputed_datasets[ +which(stringr::str_detect(names(imputed_datasets), "_val")) +] +preds_per_data_set <- purrr::map(.x = val_imputed_datasets, +.f = make_predictions_fnc, +model_coefs = model_coefs, +X_categorical = X_categorical) +preds_per_data_set$RI_val_transportedimputation <- preds_per_data_set$RI_val_transportedimputation %>% +dplyr::select(-.imp) +preds_per_data_set$RI_val_newimputation <- preds_per_data_set$RI_val_newimputation %>% +dplyr::select(-.imp) +preds_per_data_set +source("Code/00_Simulation_Functions.R") +source("Code/00_Simulation_Functions.R") +#run the simulation study: +simulation_results <- simulation_nrun_fnc(n_iter = 100, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[s], +X_categorical = sims_parameters$X_categorical[s], +R_prev = sims_parameters$R_prev[s], +beta_x1_dev = sims_parameters$beta_x1_dev[s], +beta_x2_dev = sims_parameters$beta_x2_dev[s], +beta_Y_dev = sims_parameters$beta_Y_dev[s], +beta_x1_val = sims_parameters$beta_x1_val[s], +beta_x2_val = sims_parameters$beta_x2_val[s], +beta_Y_val = sims_parameters$beta_Y_val[s], +rho_X = sims_parameters$rho_X[s], +gamma_x1 = sims_parameters$gamma_x1[s], +gamma_x2 = sims_parameters$gamma_x2[s]) +#Apply the sub-model to the raw (non-imputed) validation data: +Target_measures <- Target_measures %>% +dplyr::bind_rows(purrr::map_dfr(.x = preds_per_data_set[which(names(preds_per_data_set)=="observed_val")], +.f = validation_fnc, +PR_name = "PR_patternsubmodel", +.id = "Validation_Dataset") %>% +dplyr::mutate("CPM" = "PR_patternsubmodel", +.before = "Validation_Dataset")) +View(Target_measures) +View(Target_measures) +source("Code/00_Simulation_Functions.R") +#run the simulation study: +simulation_results <- simulation_nrun_fnc(n_iter = 100, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[s], +X_categorical = sims_parameters$X_categorical[s], +R_prev = sims_parameters$R_prev[s], +beta_x1_dev = sims_parameters$beta_x1_dev[s], +beta_x2_dev = sims_parameters$beta_x2_dev[s], +beta_Y_dev = sims_parameters$beta_Y_dev[s], +beta_x1_val = sims_parameters$beta_x1_val[s], +beta_x2_val = sims_parameters$beta_x2_val[s], +beta_Y_val = sims_parameters$beta_Y_val[s], +rho_X = sims_parameters$rho_X[s], +gamma_x1 = sims_parameters$gamma_x1[s], +gamma_x2 = sims_parameters$gamma_x2[s]) +#run the simulation study: +simulation_results <- simulation_nrun_fnc(n_iter = 2, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[s], +X_categorical = sims_parameters$X_categorical[s], +R_prev = sims_parameters$R_prev[s], +beta_x1_dev = sims_parameters$beta_x1_dev[s], +beta_x2_dev = sims_parameters$beta_x2_dev[s], +beta_Y_dev = sims_parameters$beta_Y_dev[s], +beta_x1_val = sims_parameters$beta_x1_val[s], +beta_x2_val = sims_parameters$beta_x2_val[s], +beta_Y_val = sims_parameters$beta_Y_val[s], +rho_X = sims_parameters$rho_X[s], +gamma_x1 = sims_parameters$gamma_x1[s], +gamma_x2 = sims_parameters$gamma_x2[s]) +View(simulation_results) +#Define the size of development, validation and implementation, respectively: +N_dev <- 100000 +N_val <- 100000 +#run the simulation study: +simulation_results <- simulation_nrun_fnc(n_iter = 2, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[s], +X_categorical = sims_parameters$X_categorical[s], +R_prev = sims_parameters$R_prev[s], +beta_x1_dev = sims_parameters$beta_x1_dev[s], +beta_x2_dev = sims_parameters$beta_x2_dev[s], +beta_Y_dev = sims_parameters$beta_Y_dev[s], +beta_x1_val = sims_parameters$beta_x1_val[s], +beta_x2_val = sims_parameters$beta_x2_val[s], +beta_Y_val = sims_parameters$beta_Y_val[s], +rho_X = sims_parameters$rho_X[s], +gamma_x1 = sims_parameters$gamma_x1[s], +gamma_x2 = sims_parameters$gamma_x2[s]) +View(simulation_results) +sims_parameters <- tidyr::crossing( +Y_prev = c(0.2), +X_categorical = c(TRUE, FALSE), +R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively +beta_x1_dev = c(0, 0.5), +beta_x2_dev = c(0, 0.5), +beta_Y_dev = c(0, 0.5), +beta_x1_val = c(0, 0.5), +beta_x2_val = c(0, 0.5), +beta_Y_val = c(0, 0.5), +rho_X = c(0, 0.75), +gamma_x1 = c(0, 0.5), +gamma_x2 = 0.5 +) %>% +mutate(beta_cases_dev = case_when( +beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev", +TRUE ~ as.character("not_applicable") +), +beta_cases_val = case_when( +beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val", +TRUE ~ as.character("not_applicable") +)) %>% +filter(beta_cases_dev != "not_applicable" & +beta_cases_val != "not_applicable") %>% +select(-beta_cases_dev, +-beta_cases_val) library(tidyverse) -library(ggh4x) -simulation_results_summarised <- read_rds(here::here("outputs", -"simulation_results_summarised.RDS")) -simulation_results_summarised <- simulation_results_summarised %>% -dplyr::mutate("Scenario" = paste(paste(Missing_Mech_Dev, "Dev", sep=" "), -paste(Missing_Mech_Val, "Val", sep=" "), -sep = " -\n "), -Scenario = forcats::fct_relevel(Scenario, -"MCAR Dev -\n MCAR Val", -"MAR Dev -\n MAR Val", -"MNAR_X Dev -\n MNAR_X Val", -"MNAR_Y Dev -\n MNAR_Y Val", -"MNAR_XY Dev -\n MNAR_XY Val", -"MCAR Dev -\n MAR Val", -"MCAR Dev -\n MNAR_X Val", -"MCAR Dev -\n MNAR_Y Val", -"MCAR Dev -\n MNAR_XY Val", -"MAR Dev -\n MNAR_X Val", -"MAR Dev -\n MNAR_Y Val", -"MAR Dev -\n MNAR_XY Val", -"MNAR_X Dev -\n MNAR_Y Val", -"MNAR_X Dev -\n MNAR_XY Val"), -Validation_Dataset = forcats::fct_recode(Validation_Dataset, -"CCA" = "CCA_val", -"MI no Y (refit)" = "MI_val_noY_newimputation", -"MI no Y (transported)" = "MI_val_noY_transportedimputation", -"MI with Y (refit)" = "MI_val_withY_newimputation", -"MI with Y (transported)" = "MI_val_withY_transportedimputation", -"RI (refit)" = "RI_val_newimputation", -"RI (transported)" = "RI_val_transportedimputation", -"All Data Required" = "fullyobserved_val", -"Mean Imputation" = "mean_val", -"Risk Absent Imputation" ="zero_val", -"Pattern Sub-Model" = "observed_val")) -####---------------------------------------------------------------------------- -### Select which estimand we want to produce plots for: -target_performance <- "All Data Required" -### Function to help summarise the results for a given iteration number: -plotting_fnc <- function(df, -scenario_number, -target_performance_imputation_method = c("All Data Required", -"CCA", -"Mean Imputation", -"RI (refit)", -"RI (transported)", -"MI no Y (refit)", -"MI no Y (transported)", -"MI with Y (refit)", -"MI with Y (transported)", -"Pattern Sub-Model")) { -target_performance_imputation_method <- as.character(match.arg(target_performance_imputation_method)) -df <- df %>% -dplyr::mutate("combinations" = case_when( -CPM == "PR_fullyobserved" | -CPM == "PR_CCA" | -CPM == "PR_mean" ~ Validation_Dataset %in% c("All Data Required", -"CCA", -"Mean Imputation", -"MI with Y (refit)", -"MI no Y (refit)", -"RI (refit)"), -CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required", -"CCA", -"Mean Imputation", -"MI with Y (refit)", -"MI no Y (refit)", -"RI (transported)", -"RI (refit)"), -CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required", -"CCA", -"Mean Imputation", -"MI with Y (transported)", -"MI with Y (refit)", -"MI no Y (refit)", -"RI (refit)"), -CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required", -"CCA", -"Mean Imputation", -"MI with Y (refit)", -"MI no Y (transported)", -"MI no Y (refit)", -"RI (refit)"), -CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required", -"CCA", -"Mean Imputation", -"MI with Y (refit)", -"MI no Y (refit)", -"RI (refit)", -"Pattern Sub-Model"), -.default = NA)) %>% -dplyr::filter(combinations == 1) -scenario_df <- df %>% -dplyr::filter(Simulation_Scenario %in% scenario_number) %>% -dplyr::select(Simulation_Scenario, Scenario, -CPM, Validation_Dataset, -contains("_Mean"), -contains("_quantileLower"), -contains("_quantileUpper")) %>% -pivot_longer(cols = c(contains("_Mean"), -contains("_quantileLower"), -contains("_quantileUpper"))) %>% -separate_wider_delim(name, -delim = "_", -names = c("Metric", "Summary_Type")) %>% -pivot_wider(id_cols = c("Simulation_Scenario", "Scenario", -"CPM", "Validation_Dataset", "Metric"), -names_from = "Summary_Type", -values_from = "value") -results_using_for_implementation <- df %>% -dplyr::filter(Simulation_Scenario %in% scenario_number) %>% -dplyr::filter(Validation_Dataset == target_performance_imputation_method) %>% -dplyr::select(Simulation_Scenario, Scenario, -CPM, Validation_Dataset, -contains("_Mean"), -contains("_quantileLower"), -contains("_quantileUpper")) %>% -pivot_longer(cols = c(contains("_Mean"), -contains("_quantileLower"), -contains("_quantileUpper"))) %>% -separate_wider_delim(name, -delim = "_", -names = c("Metric", "Summary_Type")) %>% -pivot_wider(id_cols = c("Simulation_Scenario", "Scenario", -"CPM", "Validation_Dataset", "Metric"), -names_from = "Summary_Type", -values_from = "value") %>% -dplyr::rename("Imp_Data" = Validation_Dataset, -"Imp_Mean" = Mean, -"Imp_quantileLower" = quantileLower, -"Imp_quantileUpper" = quantileUpper) -plot_df <- scenario_df %>% -dplyr::left_join(results_using_for_implementation, -by = c("Simulation_Scenario", -"Scenario", -"CPM", -"Metric")) %>% -dplyr::mutate("Bias" = Mean - Imp_Mean, -"Bias_Lower" = quantileLower - Imp_quantileLower, -"Bias_Upper" = quantileUpper - Imp_quantileUpper) %>% -dplyr::mutate(Metric = forcats::fct_recode(Metric, -"Brier Score" = "Brier", -"Calibration Intercept" = "CalInt", -"Calibration Slope" = "CalSlope")) %>% -mutate(CPM = stringr::str_remove(CPM, "PR_"), -CPM = forcats::fct_recode(CPM, -"CCA \n developed CPM" = "CCA", -"RI \n developed CPM" = "RI", -"MI no Y \n developed CPM" = "MInoY", -"MI with Y \n developed CPM" = "MIwithY", -"Fully Observed \n developed CPM" = "fullyobserved", -"Mean Imputation \n developed CPM" = "mean", -"PSM \n developed CPM" = "patternsubmodel"), -CPM = forcats::fct_relevel(CPM, -"Fully Observed \n developed CPM", -"CCA \n developed CPM", -"Mean Imputation \n developed CPM", -"RI \n developed CPM", -"MI with Y \n developed CPM", -"MI no Y \n developed CPM", -"PSM \n developed CPM")) -plot_df %>% -tidyr::drop_na() %>% -dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset, -"Fully Observed Data" = "All Data Required"), -Validation_Dataset = forcats::fct_relevel(Validation_Dataset, -"Fully Observed Data", -"CCA", -"Mean Imputation", -"RI (refit)", -"RI (transported)", -"MI with Y (refit)", -"MI with Y (transported)", -"MI no Y (refit)", -"MI no Y (transported)", -"Pattern Sub-Model")) %>% -ggplot(aes(x = Bias, -y = Validation_Dataset, -color = Metric)) + -geom_point(aes(shape = Metric)) + -geom_errorbar(aes(xmin = Bias_Lower, -xmax = Bias_Upper), width=.1) + -scale_color_brewer(palette = "Set1") + -xlab("Bias") + -ylab("Validation Data Imputation Method") + -geom_vline(data = data.frame("Metric" = c("AUC", -"Brier Score", -"Calibration Intercept", -"Calibration Slope"), -"Bias" = c(0,0,0,0)), -aes(xintercept = Bias), -linetype = "dashed") + -ggh4x::facet_grid2(Scenario ~ CPM, scales = "fixed") + -ggtitle(paste("Targetting ", target_performance_imputation_method, " Performance", sep = "")) + -theme_minimal(base_size = 12) + -theme(legend.position = "bottom", -panel.background = element_rect(fill = "gray90"), -panel.spacing.x = unit(0.5, "lines"), -axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), -plot.title = element_text(size = 14, hjust = 0.5), -panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) -} -#rho = 0.75 -#MCAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(24, 56, 32, 64), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(184, 161, 193), -target_performance_imputation_method = target_performance) -#MNAR-X+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(416, 448, 256), -target_performance_imputation_method = target_performance) -## Inconsistent missingness mechanism plots: -#rho = 0 -#MCAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(20, 52, 28, 60), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(180, 156, 188), -target_performance_imputation_method = target_performance) -#MNAR-X+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(416, 448, 256), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(184, 161, 193), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(180, 156, 188), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(184, 161, 193), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(185, 161, 193), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(185, 161, 192), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(184, 161, 192), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(180, 156, 188), -target_performance_imputation_method = target_performance) -## Inconsistent missingness mechanism plots: -#rho = 0 -#MCAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(20, 52, 28, 60), -target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(180, 156, 188), -target_performance_imputation_method = target_performance) -## Inconsistent missingness mechanism plots: -#rho = 0 -#MCAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(20, 52, 28, 60), -target_performance_imputation_method = target_performance) -target_performance <- "Mean Imputation" -#MAR+something -plotting_fnc(df = simulation_results_summarised, -scenario_number = c(180, 156, 188), -target_performance_imputation_method = target_performance) +# Define a dataset that includes all combinations of simulation parameters: +sims_parameters <- tidyr::crossing( +Y_prev = c(0.2), +X_categorical = c(TRUE, FALSE), +R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively +beta_x1_dev = c(0, 0.5), +beta_x2_dev = c(0, 0.5), +beta_Y_dev = c(0, 0.5), +beta_x1_val = c(0, 0.5), +beta_x2_val = c(0, 0.5), +beta_Y_val = c(0, 0.5), +rho_X = c(0, 0.75), +gamma_x1 = c(0, 0.5), +gamma_x2 = 0.5 +) %>% +mutate(beta_cases_dev = case_when( +beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev", +TRUE ~ as.character("not_applicable") +), +beta_cases_val = case_when( +beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val", +TRUE ~ as.character("not_applicable") +)) %>% +filter(beta_cases_dev != "not_applicable" & +beta_cases_val != "not_applicable") %>% +select(-beta_cases_dev, +-beta_cases_val) +?case_when +# Define a dataset that includes all combinations of simulation parameters: +sims_parameters <- tidyr::crossing( +Y_prev = c(0.2), +X_categorical = c(TRUE, FALSE), +R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively +beta_x1_dev = c(0, 0.5), +beta_x2_dev = c(0, 0.5), +beta_Y_dev = c(0, 0.5), +beta_x1_val = c(0, 0.5), +beta_x2_val = c(0, 0.5), +beta_Y_val = c(0, 0.5), +rho_X = c(0, 0.75), +gamma_x1 = c(0, 0.5), +gamma_x2 = 0.5 +) %>% +dplyr::mutate(beta_cases_dev = dplyr::case_when( +beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev", +TRUE ~ as.character("not_applicable") +), +beta_cases_val = dplyr::case_when( +beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val", +TRUE ~ as.character("not_applicable") +)) %>% +dplyr::filter(beta_cases_dev != "not_applicable" & +beta_cases_val != "not_applicable") %>% +dplyr::select(-beta_cases_dev, +-beta_cases_val) +dim(sims_parameters) +source("Code/00_Simulation_Functions.R") +#run the simulation study: +simulation_results <- simulation_nrun_fnc(n_iter = 2, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[s], +X_categorical = sims_parameters$X_categorical[s], +R_prev = sims_parameters$R_prev[s], +beta_x1_dev = sims_parameters$beta_x1_dev[s], +beta_x2_dev = sims_parameters$beta_x2_dev[s], +beta_Y_dev = sims_parameters$beta_Y_dev[s], +beta_x1_val = sims_parameters$beta_x1_val[s], +beta_x2_val = sims_parameters$beta_x2_val[s], +beta_Y_val = sims_parameters$beta_Y_val[s], +rho_X = sims_parameters$rho_X[s], +gamma_x1 = sims_parameters$gamma_x1[s], +gamma_x2 = sims_parameters$gamma_x2[s]) +source("Code/00_Simulation_Functions.R") +#run the simulation study: +simulation_results <- simulation_nrun_fnc(n_iter = 2, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[s], +X_categorical = sims_parameters$X_categorical[s], +R_prev = sims_parameters$R_prev[s], +beta_x1_dev = sims_parameters$beta_x1_dev[s], +beta_x2_dev = sims_parameters$beta_x2_dev[s], +beta_Y_dev = sims_parameters$beta_Y_dev[s], +beta_x1_val = sims_parameters$beta_x1_val[s], +beta_x2_val = sims_parameters$beta_x2_val[s], +beta_Y_val = sims_parameters$beta_Y_val[s], +rho_X = sims_parameters$rho_X[s], +gamma_x1 = sims_parameters$gamma_x1[s], +gamma_x2 = sims_parameters$gamma_x2[s]) +View(simulation_results) +View(sims_parameters) +View(simulation_results) +#run the simulation study: +simulation_results <- simulation_nrun_fnc(n_iter = 20, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[s], +X_categorical = sims_parameters$X_categorical[s], +R_prev = sims_parameters$R_prev[s], +beta_x1_dev = sims_parameters$beta_x1_dev[s], +beta_x2_dev = sims_parameters$beta_x2_dev[s], +beta_Y_dev = sims_parameters$beta_Y_dev[s], +beta_x1_val = sims_parameters$beta_x1_val[s], +beta_x2_val = sims_parameters$beta_x2_val[s], +beta_Y_val = sims_parameters$beta_Y_val[s], +rho_X = sims_parameters$rho_X[s], +gamma_x1 = sims_parameters$gamma_x1[s], +gamma_x2 = sims_parameters$gamma_x2[s]) +View(simulation_results) +View(simulation_results %>% group_by(CPM, Validation_Dataset) %>% summarise(across(CalInt_est:Brier_var, ~mean(.)))) +#Load the simulation functions +source("./00_Simulation_Functions.R") +#Load the simulation functions +source("Code/00_Simulation_Functions.R") +library(tidyverse) +# Define a dataset that includes all combinations of simulation parameters: +sims_parameters <- tidyr::crossing( +Y_prev = c(0.2), +X_categorical = c(TRUE, FALSE), +R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively +beta_x1_dev = c(0, 0.5), +beta_x2_dev = c(0, 0.5), +beta_Y_dev = c(0, 0.5), +beta_x1_val = c(0, 0.5), +beta_x2_val = c(0, 0.5), +beta_Y_val = c(0, 0.5), +rho_X = c(0, 0.75), +gamma_x1 = c(0, 0.5), +gamma_x2 = 0.5 +) %>% +dplyr::mutate(beta_cases_dev = dplyr::case_when( +beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev", +TRUE ~ as.character("not_applicable") +), +beta_cases_val = dplyr::case_when( +beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val", +TRUE ~ as.character("not_applicable") +)) %>% +dplyr::filter(beta_cases_dev != "not_applicable" & +beta_cases_val != "not_applicable") %>% +dplyr::select(-beta_cases_dev, +-beta_cases_val) +# number of repeats per scenario +n_rep <- 100 +#Define the size of development, validation and implementation, respectively: +N_dev <- 100000 +N_val <- 100000 +head(sims_parameters) +taskid <- 1 +simulation_nrun_fnc(n_iter = 1, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[taskid], +X_categorical = sims_parameters$X_categorical[taskid], +R_prev = sims_parameters$R_prev[taskid], +beta_x1_dev = sims_parameters$beta_x1_dev[taskid], +beta_x2_dev = sims_parameters$beta_x2_dev[taskid], +beta_Y_dev = sims_parameters$beta_Y_dev[taskid], +beta_x1_val = sims_parameters$beta_x1_val[taskid], +beta_x2_val = sims_parameters$beta_x2_val[taskid], +beta_Y_val = sims_parameters$beta_Y_val[taskid], +rho_X = sims_parameters$rho_X[taskid], +gamma_x1 = sims_parameters$gamma_x1[taskid], +gamma_x2 = sims_parameters$gamma_x2[taskid]) +300/24 +library(furrr) +?plan +(availableCores() - 3) +12/5 +plan(multicore, workers = (availableCores() - 3)) +plan(multisession, workers = (availableCores() - 3)) +dim(sims_parameters %>% +dplyr::filter(X_categorical == FALSE)) +simulation_results_XContinuous <- sims_parameters %>% +dplyr::filter(X_categorical == FALSE) %>% +dplyr::mutate("results" = future_pmap(.l = list(n_iter = 100, +N_dev = 100000, +N_val = 100000, +Y_prev = Y_prev, +X_categorical = X_categorical, +R_prev = R_prev, +beta_x1_dev = beta_x1_dev, +beta_x2_dev = beta_x2_dev, +beta_Y_dev = beta_Y_dev, +beta_x1_val = beta_x1_val, +beta_x2_val = beta_x2_val, +beta_Y_val = beta_Y_val, +rho_X = rho_X, +gamma_x1 = gamma_x1, +gamma_x2 = gamma_x2), +.f = simulation_nrun_fnc, +.progress = TRUE, +.options = furrr_options(seed = as.integer(465475)))) +system.time(simulation_nrun_fnc(n_iter = 1, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[taskid], +X_categorical = sims_parameters$X_categorical[taskid], +R_prev = sims_parameters$R_prev[taskid], +beta_x1_dev = sims_parameters$beta_x1_dev[taskid], +beta_x2_dev = sims_parameters$beta_x2_dev[taskid], +beta_Y_dev = sims_parameters$beta_Y_dev[taskid], +beta_x1_val = sims_parameters$beta_x1_val[taskid], +beta_x2_val = sims_parameters$beta_x2_val[taskid], +beta_Y_val = sims_parameters$beta_Y_val[taskid], +rho_X = sims_parameters$rho_X[taskid], +gamma_x1 = sims_parameters$gamma_x1[taskid], +gamma_x2 = sims_parameters$gamma_x2[taskid])) +source("Code/00_Simulation_Functions.R") +library(tidyverse) +# Define a dataset that includes all combinations of simulation parameters: +sims_parameters <- tidyr::crossing( +Y_prev = c(0.2), +X_categorical = c(TRUE, FALSE), +R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively +beta_x1_dev = c(0, 0.5), +beta_x2_dev = c(0, 0.5), +beta_Y_dev = c(0, 0.5), +beta_x1_val = c(0, 0.5), +beta_x2_val = c(0, 0.5), +beta_Y_val = c(0, 0.5), +rho_X = c(0, 0.75), +gamma_x1 = c(0, 0.5), +gamma_x2 = 0.5 +) %>% +dplyr::mutate(beta_cases_dev = dplyr::case_when( +beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev", +TRUE ~ as.character("not_applicable") +), +beta_cases_val = dplyr::case_when( +beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val", +TRUE ~ as.character("not_applicable") +)) %>% +dplyr::filter(beta_cases_dev != "not_applicable" & +beta_cases_val != "not_applicable") %>% +dplyr::select(-beta_cases_dev, +-beta_cases_val) +taskid <- 100 +# number of repeats per scenario +n_rep <- 100 +#Define the size of development, validation and implementation, respectively: +N_dev <- 100000 +N_val <- 100000 +system.time(simulation_nrun_fnc(n_iter = 1, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[taskid], +X_categorical = sims_parameters$X_categorical[taskid], +R_prev = sims_parameters$R_prev[taskid], +beta_x1_dev = sims_parameters$beta_x1_dev[taskid], +beta_x2_dev = sims_parameters$beta_x2_dev[taskid], +beta_Y_dev = sims_parameters$beta_Y_dev[taskid], +beta_x1_val = sims_parameters$beta_x1_val[taskid], +beta_x2_val = sims_parameters$beta_x2_val[taskid], +beta_Y_val = sims_parameters$beta_Y_val[taskid], +rho_X = sims_parameters$rho_X[taskid], +gamma_x1 = sims_parameters$gamma_x1[taskid], +gamma_x2 = sims_parameters$gamma_x2[taskid])) +source("Code/00_Simulation_Functions.R") +library(tidyverse) +# Define a dataset that includes all combinations of simulation parameters: +sims_parameters <- tidyr::crossing( +Y_prev = c(0.2), +X_categorical = c(TRUE, FALSE), +R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively +beta_x1_dev = c(0, 0.5), +beta_x2_dev = c(0, 0.5), +beta_Y_dev = c(0, 0.5), +beta_x1_val = c(0, 0.5), +beta_x2_val = c(0, 0.5), +beta_Y_val = c(0, 0.5), +rho_X = c(0, 0.75), +gamma_x1 = c(0, 0.5), +gamma_x2 = 0.5 +) %>% +dplyr::mutate(beta_cases_dev = dplyr::case_when( +beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev", +beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev", +beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev", +TRUE ~ as.character("not_applicable") +), +beta_cases_val = dplyr::case_when( +beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val", +beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val", +beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val", +TRUE ~ as.character("not_applicable") +)) %>% +dplyr::filter(beta_cases_dev != "not_applicable" & +beta_cases_val != "not_applicable") %>% +dplyr::select(-beta_cases_dev, +-beta_cases_val) +taskid <- 100 +#Define the size of development, validation and implementation, respectively: +N_dev <- 100000 +N_val <- 100000 +system.time(simulation_nrun_fnc(n_iter = 1, +N_dev = N_dev, +N_val = N_val, +Y_prev = sims_parameters$Y_prev[taskid], +X_categorical = sims_parameters$X_categorical[taskid], +R_prev = sims_parameters$R_prev[taskid], +beta_x1_dev = sims_parameters$beta_x1_dev[taskid], +beta_x2_dev = sims_parameters$beta_x2_dev[taskid], +beta_Y_dev = sims_parameters$beta_Y_dev[taskid], +beta_x1_val = sims_parameters$beta_x1_val[taskid], +beta_x2_val = sims_parameters$beta_x2_val[taskid], +beta_Y_val = sims_parameters$beta_Y_val[taskid], +rho_X = sims_parameters$rho_X[taskid], +gamma_x1 = sims_parameters$gamma_x1[taskid], +gamma_x2 = sims_parameters$gamma_x2[taskid])) +80*100 +80/60 +*100 +(80/60)*100 +600/15 +40*((80/60)*100) +(40*((80/60)*100))/24 +((80/60)*100)/60 +(40*(((80/60)*100)/60))/24 +library(tidyverse) +simulation_results_all <- read_rds(here::here("Code", +"simulation_results_all.RDS")) +simulation_results_all <- read_rds(here::here("Code", +"simulation_results_all.RDS")) +View(simulation_results_all) +unique(simulation_results_all$Simulation_Scenario) +sort(unique(simulation_results_all$Simulation_Scenario)) +length(sort(unique(simulation_results_all$Simulation_Scenario))) diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index d012415..fbd2509 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,2 +1 @@ -C:/Users/mbchkgm5/OneDrive - The University of Manchester/1. Projects/SARC Children Non Fatal Strangulation/Code/02_NFSAnalysis.R="8196FE78" -C:/Users/mbchkgm5/OneDrive - The University of Manchester/2. Areas/Missing Data Handling in Prediction models/1. Projects/Compatibility of missing data handling across CPM Production/.gitignore="2EB11F40" +/Users/glenmartin/Library/CloudStorage/OneDrive-TheUniversityofManchester/2. Areas/Missing Data Handling in Prediction models/1. Projects/Compatibility of missing data handling across CPM Production/.gitignore="1040BE61" diff --git a/.gitignore b/.gitignore index ac62074..da136b9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ /Manuscript /Templates /Outputs/ncorr_analysis_results.RDS -/Outputs/simulation_results.7z \ No newline at end of file +/Outputs/simulation_results_old.7z +/Outputs/simulation_results_all.RDS \ No newline at end of file diff --git a/Code/.DS_Store b/Code/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/Code/.DS_Store differ diff --git a/Code/00_Simulation_Functions.R b/Code/00_Simulation_Functions.R index f2db746..2447afb 100644 --- a/Code/00_Simulation_Functions.R +++ b/Code/00_Simulation_Functions.R @@ -3,16 +3,7 @@ # Author of code: Antonia D. Tsvetanova & Glen P. Martin # This is code for a simulation study presented in a manuscript entitled: -# Compatibility of missing data handling methods across validation and -# deployment of a Clinical Prediction Model -# Authors: -# Antonia Tsvetanova -# Matthew Sperrin -# Niels Peek -# David Jenkins -# Iain Buchan -# Stephanie Hyland -# Glen P. Martin +# Compatibility of missing data handling methods across stages of producing CPMs # ############################################################################## @@ -27,14 +18,13 @@ simulation_nrun_fnc <- function(n_iter, R_prev, beta_x1_dev, beta_x2_dev, - beta_U_dev, + beta_Y_dev, beta_x1_val, beta_x2_val, - beta_U_val, + beta_Y_val, rho_X, gamma_x1, - gamma_x2, - gamma_U) { + gamma_x2) { #Input: n_iter = the number of iterations to repeat the simulation over # N_dev = size of the data to simulate as the development set # N_val = size of the data to simulate as the validation set @@ -45,22 +35,20 @@ simulation_nrun_fnc <- function(n_iter, # development dataset # beta_x2_dev = association between X2 and prob of missing data in the # development dataset - # beta_U_dev = association between U (unmeasured variable) and prob of - # missing data in the development dataset + # beta_Y_dev = association between Y and prob of missing data in the + # development dataset # beta_x1_val = association between X1 and prob of missing data in the # validation dataset # beta_x2_val = association between X2 and prob of missing data in the # validation dataset - # beta_U_val = association between U (unmeasured variable) and prob of - # missing data in the validation dataset + # beta_Y_val = association between Y and prob of missing data in the + # validation dataset # rho_X = the correlation between X1 and X2 - if X1 is categorical, the # inputted correlation is between the latent variable Z1 and # X2 where Z1 is then used to generate X1 at a set prevalence # (see paper for details) # gamma_x1 = association between X1 and prob of outcome # gamma_x2 = association between X2 and prob of outcome - # gamma_U = association between U (unmeasured variable) and prob of - # outcome library(MASS) library(tidyverse) @@ -80,14 +68,13 @@ simulation_nrun_fnc <- function(n_iter, R_prev =R_prev, beta_x1_dev = beta_x1_dev, beta_x2_dev = beta_x2_dev, - beta_U_dev = beta_U_dev, + beta_Y_dev = beta_Y_dev, beta_x1_val = beta_x1_val, beta_x2_val = beta_x2_val, - beta_U_val = beta_U_val, + beta_Y_val = beta_Y_val, rho_X = rho_X, gamma_x1 = gamma_x1, - gamma_x2 = gamma_x2, - gamma_U = gamma_U) + gamma_x2 = gamma_x2) results <- results %>% dplyr::bind_rows(tibble::as_tibble(simulation_results) %>% @@ -111,31 +98,29 @@ simulation_singlerun_fnc <- function(N_dev, R_prev, beta_x1_dev, beta_x2_dev, - beta_U_dev, + beta_Y_dev, beta_x1_val, beta_x2_val, - beta_U_val, + beta_Y_val, rho_X, gamma_x1, - gamma_x2, - gamma_U) { + gamma_x2) { # Generate the data according to the DGM-------------------------------------- gen_data <- DGM_fnc(N_dev = N_dev, N_val = N_val, Y_prev = Y_prev, X_categorical = X_categorical, - R_prev =R_prev, + R_prev = R_prev, beta_x1_dev = beta_x1_dev, beta_x2_dev = beta_x2_dev, - beta_U_dev = beta_U_dev, + beta_Y_dev = beta_Y_dev, beta_x1_val = beta_x1_val, beta_x2_val = beta_x2_val, - beta_U_val = beta_U_val, + beta_Y_val = beta_Y_val, rho_X = rho_X, gamma_x1 = gamma_x1, - gamma_x2 = gamma_x2, - gamma_U = gamma_U) + gamma_x2 = gamma_x2) # Impute missing data in the development and validation data under # different methods ---------------------------------------------------------- @@ -170,7 +155,8 @@ simulation_singlerun_fnc <- function(N_dev, #calculate predictive performance of each model in each imputed dataset: if(X_categorical){ - CPM_names <- c("PR_fullyobserved", + CPM_names <- c("PR_DGM", + "PR_fullyobserved", "PR_CCA", "PR_RI", "PR_MInoY", @@ -178,7 +164,8 @@ simulation_singlerun_fnc <- function(N_dev, "PR_patternsubmodel", "PR_zero") }else { - CPM_names <- c("PR_fullyobserved", + CPM_names <- c("PR_DGM", + "PR_fullyobserved", "PR_CCA", "PR_RI", "PR_MInoY", @@ -222,53 +209,44 @@ DGM_fnc <- function(N_dev, R_prev, beta_x1_dev, beta_x2_dev, - beta_U_dev, + beta_Y_dev, beta_x1_val, beta_x2_val, - beta_U_val, + beta_Y_val, rho_X, gamma_x1, - gamma_x2, - gamma_U) { + gamma_x2) { N <- N_dev + N_val #total sample size to generate #generate (potentially correlated) latent predictor variables from a MVN: Z <- MASS::mvrnorm(n = N, mu = c(0,0), Sigma = toeplitz(c(1, rho_X))) - #generate an "unmeasured variable" U: - U <- rnorm(N, mean = 0, sd = 1) - if(X_categorical == FALSE) { #if continuous use the latent variables directly as X IPD <- tibble::tibble("ID" = 1:N, "x_1" = Z[,1], - "x_2" = Z[,2], - "U" = U) + "x_2" = Z[,2]) } else{ #if binary convert the latent variable to achieve binary variable prevalence IPD <- tibble::tibble("ID" = 1:N, "x_1" = ifelse(Z[,1]>=qnorm(0.3, lower.tail = FALSE), 1, 0), - "x_2" = Z[,2], - "U" = U) + "x_2" = Z[,2]) } #calculate gamma_0 to give inputted prevalence of the outcome: gamma_0 <- as.numeric(coef(glm(rbinom(N, 1, prob = Y_prev) ~ offset((gamma_x1*x_1) + - (gamma_x2*x_2) + - (gamma_U*U)), + (gamma_x2*x_2)), family = binomial(link = "logit"), data = IPD))[1]) #simulate outcomes based on the data generating model: + IPD$Pi <- expit_fnc(gamma_0 + (gamma_x1*IPD$x_1) + (gamma_x2*IPD$x_2)) IPD$Y <- rbinom(N, size = 1, - prob = expit_fnc(gamma_0 + - (gamma_x1*IPD$x_1) + - (gamma_x2*IPD$x_2) + - (gamma_U*IPD$U))) + prob = IPD$Pi) #separate IPD into development and validation data dev_IPD <- IPD %>% @@ -285,13 +263,13 @@ DGM_fnc <- function(N_dev, beta_0_dev <- as.numeric(coef(glm(rbinom(N_dev, 1, prob = R_prev) ~ offset((beta_x1_dev*x_1) + (beta_x2_dev*x_2) + - (beta_U_dev*U)), + (beta_Y_dev*Y)), family = binomial(link = "logit"), data = dev_IPD))[1]) beta_0_val <- as.numeric(coef(glm(rbinom(N_val, 1, prob = R_prev) ~ offset((beta_x1_val*x_1) + (beta_x2_val*x_2) + - (beta_U_val*U)), + (beta_Y_val*Y)), family = binomial(link = "logit"), data = val_IPD))[1]) @@ -302,7 +280,7 @@ DGM_fnc <- function(N_dev, prob = expit_fnc(beta_0_dev + (beta_x1_dev*dev_IPD$x_1) + (beta_x2_dev*dev_IPD$x_2) + - (beta_U_dev*dev_IPD$U))) + (beta_Y_dev*dev_IPD$Y))) val_IPD$R_1 <- rbinom(N_val, @@ -310,7 +288,7 @@ DGM_fnc <- function(N_dev, prob = expit_fnc(beta_0_val + (beta_x1_val*val_IPD$x_1) + (beta_x2_val*val_IPD$x_2) + - (beta_U_val*val_IPD$U))) + (beta_Y_val*val_IPD$Y))) observed_dev_IPD <- dev_IPD %>% dplyr::mutate(x_1 = ifelse(R_1 == 1, x_1, NA)) @@ -474,8 +452,8 @@ mice_fnc <- function(df, m = 5, include_Y) { #we never want MI to impute the missing values based on ID, U or R1: predmat[,"ID"] <- 0 predmat["ID",] <- 0 - predmat[,"U"] <- 0 - predmat["U",] <- 0 + predmat[,"Pi"] <- 0 + predmat["Pi",] <- 0 predmat[,"R_1"] <- 0 predmat["R_1",] <- 0 @@ -502,8 +480,8 @@ regimputation_fnc <- function(df, X_categorical) { #we never want MI to impute the missing values based on ID, U or R1: predmat[,"ID"] <- 0 predmat["ID",] <- 0 - predmat[,"U"] <- 0 - predmat["U",] <- 0 + predmat[,"Pi"] <- 0 + predmat["Pi",] <- 0 predmat[,"R_1"] <- 0 predmat["R_1",] <- 0 @@ -662,7 +640,8 @@ make_predictions_fnc <- function(imputed_dataset, if(is.mids(imputed_dataset) == FALSE){ output_predictions <- tibble::tibble("ID" = imputed_dataset$ID, - "Y" = imputed_dataset$Y) + "Y" = imputed_dataset$Y, + "PR_DGM" = imputed_dataset$Pi) if(any(is.na(imputed_dataset$x_1))) { #apply pattern submodel only - its the observed val data, so has @@ -726,7 +705,8 @@ make_predictions_fnc <- function(imputed_dataset, output_predictions <- tibble::tibble(".imp" = MI_long$.imp, "ID" = MI_long$ID, - "Y" = MI_long$Y) + "Y" = MI_long$Y, + "PR_DGM" = MI_long$Pi) #define the design matrix: DM <- model.matrix(~x_1 + x_2, MI_long) diff --git a/Code/01_Simulation_Run_CSF.R b/Code/01_Simulation_Run_CSF.R index 7dbc388..db5f3f6 100644 --- a/Code/01_Simulation_Run_CSF.R +++ b/Code/01_Simulation_Run_CSF.R @@ -3,16 +3,7 @@ # Author of code: Antonia D. Tsvetanova & Glen P. Martin # This is code for a simulation study presented in a manuscript entitled: -# Compatibility of missing data handling methods across validation and -# deployment of a Clinical Prediction Model -# Authors: -# Antonia Tsvetanova -# Matthew Sperrin -# Niels Peek -# David Jenkins -# Iain Buchan -# Stephanie Hyland -# Glen P. Martin +# Compatibility of missing data handling methods across stages of producing CPMs # ############################################################################## @@ -33,65 +24,82 @@ sims_parameters <- tidyr::crossing( R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively beta_x1_dev = c(0, 0.5), beta_x2_dev = c(0, 0.5), - beta_U_dev = c(0, 0.5), + beta_Y_dev = c(0, 0.5), beta_x1_val = c(0, 0.5), beta_x2_val = c(0, 0.5), - beta_U_val = c(0, 0.5), + beta_Y_val = c(0, 0.5), rho_X = c(0, 0.75), gamma_x1 = c(0, 0.5), - gamma_x2 = 0.5, - gamma_U = c(0, 0.5) -) + gamma_x2 = 0.5 +) %>% + dplyr::mutate(beta_cases_dev = dplyr::case_when( + beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev", + beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev", + beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev", + beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev", + beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev", + TRUE ~ as.character("not_applicable") + ), + beta_cases_val = dplyr::case_when( + beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val", + beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val", + beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val", + beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val", + beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val", + TRUE ~ as.character("not_applicable") + )) %>% + dplyr::filter(beta_cases_dev != "not_applicable" & + beta_cases_val != "not_applicable") %>% + dplyr::select(-beta_cases_dev, + -beta_cases_val) -args <- commandArgs(trailingOnly = T) #pull in all arguments from the qsub file -s <- as.numeric(args[1]) #Note that we need to specify what class the argument is +taskid <- commandArgs(trailingOnly = T) +taskid <- as.numeric(taskid) # number of repeats per scenario n_rep <- 100 #Define the size of development, validation and implementation, respectively: -N_dev <- 50000 -N_val <- 50000 +N_dev <- 100000 +N_val <- 100000 -set.seed(465475 * s) +set.seed(465475 * taskid) #run the simulation study: simulation_results <- simulation_nrun_fnc(n_iter = n_rep, N_dev = N_dev, N_val = N_val, - Y_prev = sims_parameters$Y_prev[s], - X_categorical = sims_parameters$X_categorical[s], - R_prev = sims_parameters$R_prev[s], - beta_x1_dev = sims_parameters$beta_x1_dev[s], - beta_x2_dev = sims_parameters$beta_x2_dev[s], - beta_U_dev = sims_parameters$beta_U_dev[s], - beta_x1_val = sims_parameters$beta_x1_val[s], - beta_x2_val = sims_parameters$beta_x2_val[s], - beta_U_val = sims_parameters$beta_U_val[s], - rho_X = sims_parameters$rho_X[s], - gamma_x1 = sims_parameters$gamma_x1[s], - gamma_x2 = sims_parameters$gamma_x2[s], - gamma_U = sims_parameters$gamma_U[s]) + Y_prev = sims_parameters$Y_prev[taskid], + X_categorical = sims_parameters$X_categorical[taskid], + R_prev = sims_parameters$R_prev[taskid], + beta_x1_dev = sims_parameters$beta_x1_dev[taskid], + beta_x2_dev = sims_parameters$beta_x2_dev[taskid], + beta_Y_dev = sims_parameters$beta_Y_dev[taskid], + beta_x1_val = sims_parameters$beta_x1_val[taskid], + beta_x2_val = sims_parameters$beta_x2_val[taskid], + beta_Y_val = sims_parameters$beta_Y_val[taskid], + rho_X = sims_parameters$rho_X[taskid], + gamma_x1 = sims_parameters$gamma_x1[taskid], + gamma_x2 = sims_parameters$gamma_x2[taskid]) #attach the simulation parameters for this scenario to the simulation results: simulation_results <- simulation_results %>% - dplyr::mutate("Simulation_Scenario" = s, - "Y_prev" = sims_parameters$Y_prev[s], - "X_categorical" = sims_parameters$X_categorical[s], - "R_prev" = sims_parameters$R_prev[s], - "beta_x1_dev" = sims_parameters$beta_x1_dev[s], - "beta_x2_dev" = sims_parameters$beta_x2_dev[s], - "beta_U_dev" = sims_parameters$beta_U_dev[s], - "beta_x1_val" = sims_parameters$beta_x1_val[s], - "beta_x2_val" = sims_parameters$beta_x2_val[s], - "beta_U_val" = sims_parameters$beta_U_val[s], - "rho_X" = sims_parameters$rho_X[s], - "gamma_x1" = sims_parameters$gamma_x1[s], - "gamma_x2" = sims_parameters$gamma_x2[s], - "gamma_U" = sims_parameters$gamma_U[s], + dplyr::mutate("Simulation_Scenario" = taskid, + "Y_prev" = sims_parameters$Y_prev[taskid], + "X_categorical" = sims_parameters$X_categorical[taskid], + "R_prev" = sims_parameters$R_prev[taskid], + "beta_x1_dev" = sims_parameters$beta_x1_dev[taskid], + "beta_x2_dev" = sims_parameters$beta_x2_dev[taskid], + "beta_Y_dev" = sims_parameters$beta_Y_dev[taskid], + "beta_x1_val" = sims_parameters$beta_x1_val[taskid], + "beta_x2_val" = sims_parameters$beta_x2_val[taskid], + "beta_Y_val" = sims_parameters$beta_Y_val[taskid], + "rho_X" = sims_parameters$rho_X[taskid], + "gamma_x1" = sims_parameters$gamma_x1[taskid], + "gamma_x2" = sims_parameters$gamma_x2[taskid], .before = "Iteration") readr::write_rds(simulation_results, - file = paste("./simulation_results_", s, + file = paste("./simulation_results_", taskid, ".RDS", sep = "")) warnings() \ No newline at end of file diff --git a/Code/03_Simulation_Results.R b/Code/03_Simulation_Results.R index c36b187..b0d3b76 100644 --- a/Code/03_Simulation_Results.R +++ b/Code/03_Simulation_Results.R @@ -6,52 +6,35 @@ simulation_results_all <- read_rds(here::here("Outputs", "simulation_results_all.RDS")) -##### Add the missingness mechanism to each simulation scenario (can differ -##### across development and validation sets): +##### Add the missingness DAG type to each simulation scenario (can differ +##### across development and validation sets - see manuscript): simulation_results_all <- simulation_results_all %>% - dplyr::mutate( - Missing_Mech_Dev = case_when( - beta_x1_dev == 0 & beta_x2_dev == 0 & beta_U_dev == 0 ~ "MCAR", - - beta_x1_dev == 0 & beta_x2_dev != 0 & beta_U_dev == 0 ~ "MAR", - - beta_x1_dev != 0 & beta_x2_dev != 0 & beta_U_dev == 0 ~ "MNAR_X", - beta_x1_dev != 0 & beta_x2_dev == 0 & beta_U_dev == 0 ~ "MNAR_X", - - beta_x1_dev == 0 & beta_x2_dev != 0 & beta_U_dev != 0 ~ "MNAR_Y", - beta_x1_dev == 0 & beta_x2_dev == 0 & beta_U_dev != 0 ~ "MNAR_Y", - - beta_x1_dev != 0 & beta_x2_dev != 0 & beta_U_dev != 0 ~ "MNAR_XY", - beta_x1_dev != 0 & beta_x2_dev == 0 & beta_U_dev != 0 ~ "MNAR_XY", - TRUE ~ as.character("error") - ) - , - Missing_Mech_Val = case_when( - beta_x1_val == 0 & beta_x2_val == 0 & beta_U_val == 0 ~ "MCAR", - - beta_x1_val == 0 & beta_x2_val != 0 & beta_U_val == 0 ~ "MAR", - - beta_x1_val != 0 & beta_x2_val != 0 & beta_U_val == 0 ~ "MNAR_X", - beta_x1_val != 0 & beta_x2_val == 0 & beta_U_val == 0 ~ "MNAR_X", - - beta_x1_val == 0 & beta_x2_val != 0 & beta_U_val != 0 ~ "MNAR_Y", - beta_x1_val == 0 & beta_x2_val == 0 & beta_U_val != 0 ~ "MNAR_Y", - - beta_x1_val != 0 & beta_x2_val != 0 & beta_U_val != 0 ~ "MNAR_XY", - beta_x1_val != 0 & beta_x2_val == 0 & beta_U_val != 0 ~ "MNAR_XY", - TRUE ~ as.character("error") - ) - ) + dplyr::mutate(DAG_dev = dplyr::case_when( + beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG (b)", + beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG (c)", + beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG (d)", + beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG (e)", + beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG (f)", + TRUE ~ as.character("not_applicable") + ), + DAG_val = dplyr::case_when( + beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG (b)", + beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG (c)", + beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG (d)", + beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG (e)", + beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG (f)", + TRUE ~ as.character("not_applicable") + )) ##### Summarise the results across all iterations for each simulation scenario simulation_results_summarised <- simulation_results_all %>% - dplyr::group_by(Simulation_Scenario, Missing_Mech_Dev, Missing_Mech_Val, + dplyr::group_by(Simulation_Scenario, DAG_dev, DAG_val, Y_prev, X_categorical, R_prev, - beta_x1_dev, beta_x2_dev, beta_U_dev, - beta_x1_val, beta_x2_val, beta_U_val, + beta_x1_dev, beta_x2_dev, beta_Y_dev, + beta_x1_val, beta_x2_val, beta_Y_val, rho_X, - gamma_x1, gamma_x2, gamma_U, + gamma_x1, gamma_x2, CPM, Validation_Dataset) %>% dplyr::summarise(CalInt_Mean = mean(CalInt_est), @@ -93,31 +76,22 @@ simulation_results_summarised <- simulation_results_all %>% -Brier_acrossSE) write_rds(simulation_results_summarised, - file = here::here("outputs", "simulation_results_summarised.RDS")) -# simulation_results_summarised <- read_rds(here::here("outputs", + file = here::here("Outputs", "simulation_results_summarised.RDS")) +# simulation_results_summarised <- read_rds(here::here("Outputs", # "simulation_results_summarised.RDS")) simulation_results_summarised <- simulation_results_summarised %>% - dplyr::mutate("Scenario" = paste(paste(Missing_Mech_Dev, "Dev", sep=" "), - paste(Missing_Mech_Val, "Val", sep=" "), + dplyr::mutate("Scenario" = paste(paste(DAG_dev, "Dev", sep=" "), + paste(DAG_val, "Val", sep=" "), sep = " -\n "), Scenario = forcats::fct_relevel(Scenario, - "MCAR Dev -\n MCAR Val", - "MAR Dev -\n MAR Val", - "MNAR_X Dev -\n MNAR_X Val", - "MNAR_Y Dev -\n MNAR_Y Val", - "MNAR_XY Dev -\n MNAR_XY Val", - "MCAR Dev -\n MAR Val", - "MCAR Dev -\n MNAR_X Val", - "MCAR Dev -\n MNAR_Y Val", - "MCAR Dev -\n MNAR_XY Val", - "MAR Dev -\n MNAR_X Val", - "MAR Dev -\n MNAR_Y Val", - "MAR Dev -\n MNAR_XY Val", - "MNAR_X Dev -\n MNAR_Y Val", - "MNAR_X Dev -\n MNAR_XY Val"), + "DAG (b) Dev -\n DAG (b) Val", + "DAG (c) Dev -\n DAG (c) Val", + "DAG (d) Dev -\n DAG (d) Val", + "DAG (e) Dev -\n DAG (e) Val", + "DAG (f) Dev -\n DAG (f) Val"), Validation_Dataset = forcats::fct_recode(Validation_Dataset, "CCA" = "CCA_val", @@ -130,608 +104,595 @@ simulation_results_summarised <- simulation_results_summarised %>% "All Data Required" = "fullyobserved_val", "Mean Imputation" = "mean_val", "Risk Absent Imputation" ="zero_val", - "Pattern Sub-Model" = "observed_val")) - - + "Pattern Sub-Model" = "observed_val")) ####---------------------------------------------------------------------------- -### Select which estimand we want to produce plots for: -target_performance <- "All Data Required" -# target_performance <- "Mean Imputation" -# target_performance <- "RI (transported)" -# target_performance <- "RI (refit)" -# target_performance <- "MI no Y (transported)" -# target_performance <- "MI no Y (refit)" -# target_performance <- "MI with Y (transported)" -# target_performance <- "MI with Y (refit)" -# target_performance <- "Pattern Sub-Model" - -### The below plots look at scenarios where X_1 is continuous, contains 50% -### missing data, and where gamma_1=gamma_2=gamma_3=0.5. Results were quantitively -### similar in other scenarios -# Define scenario numbers of interest. We focus on 50% missingness and -# continuous X1 (results similar over other scenarios) -#Some code to extract scenario numbers as follows: -# simulation_results_summarised %>% -# filter(rho_X == 0.75, X_categorical == FALSE, R_prev == 0.5, -# gamma_x1 == 0, gamma_U == 0, -# Missing_Mech_Dev == "MNAR_Y" & Missing_Mech_Val == "MNAR_XY") %>% -# select(Simulation_Scenario) %>% -# pull() %>% -# unique() +## Define two functions for plotting model degradation and model bias for +## given simulation scenarios ####---------------------------------------------------------------------------- - -### Function to plot the predictive performance of each CPM: -performance_plotting_fnc <- function(df, - scenario_number, - target_performance_imputation_method = c("All Data Required", - "CCA", - "Mean Imputation", - "RI (refit)", - "RI (transported)", - "MI no Y (refit)", - "MI no Y (transported)", - "MI with Y (refit)", - "MI with Y (transported)", - "Pattern Sub-Model")) { +model_degrdation_fnc <- function(df, + DAG_Scenario, + X_categorical_value, + R_prev_value, + rho_X_value, + gamma_x1_value, + target_estimands) { - target_performance_imputation_method <- as.character(match.arg(target_performance_imputation_method)) + if(all((target_estimands %in% c("E-all", + "E-mean", + "E-RI", + "E-MI", + "E-PSM")))==FALSE) { + stop("target_estimands vector needs to take correct estimand characters") + } - df <- df %>% - dplyr::mutate("combinations" = case_when( - CPM == "PR_fullyobserved" | - CPM == "PR_CCA" | - CPM == "PR_mean" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Mean Imputation", - "MI with Y (refit)", - "MI no Y (refit)", - "RI (refit)"), - CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Mean Imputation", - "MI with Y (refit)", - "MI no Y (refit)", - "RI (transported)", - "RI (refit)"), - CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Mean Imputation", - "MI with Y (transported)", - "MI with Y (refit)", - "MI no Y (refit)", - "RI (refit)"), - CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Mean Imputation", - "MI with Y (refit)", - "MI no Y (transported)", - "MI no Y (refit)", - "RI (refit)"), - - CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Mean Imputation", - "MI with Y (refit)", - "MI no Y (refit)", - "RI (refit)", - "Pattern Sub-Model"), - .default = NA)) %>% - dplyr::filter(combinations == 1) + if(X_categorical_value == FALSE) { + df <- df %>% + dplyr::filter(Scenario == DAG_Scenario & + X_categorical == X_categorical_value & + R_prev == R_prev_value & + rho_X == rho_X_value & + gamma_x1 == gamma_x1_value, + + Validation_Dataset %in% c( "All Data Required", + "Mean Imputation", + "RI (refit)", + "MI no Y (refit)", + "Pattern Sub-Model")) %>% + dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset, + "E_all" = "All Data Required", + "E_mean" = "Mean Imputation", + "E_RI" = "RI (refit)", + "E_MI" = "MI no Y (refit)", + "E_PSM" = "Pattern Sub-Model")) + } else { + df <- df %>% + dplyr::filter(Scenario == DAG_Scenario & + X_categorical == X_categorical_value & + R_prev == R_prev_value & + rho_X == rho_X_value & + gamma_x1 == gamma_x1_value, + + Validation_Dataset %in% c( "All Data Required", + "Risk Absent Imputation", + "RI (refit)", + "MI no Y (refit)", + "Pattern Sub-Model")) %>% + dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset, + "E_all" = "All Data Required", + "E_mean" = "Risk Absent Imputation", + "E_RI" = "RI (refit)", + "E_MI" = "MI no Y (refit)", + "E_PSM" = "Pattern Sub-Model")) + } + + #Extract 'true' predictive performance as the performance of the DGM within + #the validation set corresponding to the target estimand + true_predictive_performance <- df %>% + dplyr::filter(CPM == "PR_DGM") %>% + dplyr::select(Simulation_Scenario, + Scenario, + Y_prev, + X_categorical, + R_prev, + rho_X, + gamma_x1, + gamma_x2, + Validation_Dataset, + dplyr::ends_with("_Mean")) %>% + tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"), + values_to = "True_Performance", + names_to = "Performance_Metric") - scenario_df <- df %>% - dplyr::filter(Simulation_Scenario %in% scenario_number) %>% - dplyr::filter(Validation_Dataset == target_performance_imputation_method) %>% - dplyr::select(Simulation_Scenario, Scenario, - CPM, Validation_Dataset, - contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper")) %>% - pivot_longer(cols = c(contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper"))) %>% - separate_wider_delim(name, - delim = "_", - names = c("Metric", "Summary_Type")) %>% - pivot_wider(id_cols = c("Simulation_Scenario", "Scenario", - "CPM", "Validation_Dataset", "Metric"), - names_from = "Summary_Type", - values_from = "value") + #Extract the predictive performance of each CPM within the + #validation set corresponding to the target estimand + CPM_predictive_performance <- df %>% + dplyr::filter(CPM != "PR_DGM") %>% + dplyr::select(Simulation_Scenario, + Scenario, + Y_prev, + X_categorical, + R_prev, + rho_X, + gamma_x1, + gamma_x2, + CPM, + Validation_Dataset, + dplyr::ends_with("_Mean")) %>% + tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"), + values_to = "Model_Performance", + names_to = "Performance_Metric") + + #Calculate model degradation: + ModelDegradation <- CPM_predictive_performance %>% + dplyr::left_join(true_predictive_performance, + by = c("Simulation_Scenario", + "Scenario", + "Y_prev", + "X_categorical", + "R_prev", + "rho_X", + "gamma_x1", + "gamma_x2", + "Validation_Dataset", + "Performance_Metric")) + ModelDegradation$True_Performance[which( + ModelDegradation$Validation_Dataset == "E_PSM" + )] <- true_predictive_performance$True_Performance[which( + true_predictive_performance$Validation_Dataset == "E_all" + )] - plot_df <- scenario_df %>% - dplyr::mutate(Metric = forcats::fct_recode(Metric, - "Brier Score" = "Brier", - "Calibration Intercept" = "CalInt", - "Calibration Slope" = "CalSlope")) %>% - mutate(CPM = stringr::str_remove(CPM, "PR_"), - - CPM = forcats::fct_recode(CPM, - "CCA" = "CCA", - "RI" = "RI", - "MI no Y" = "MInoY", - "MI with Y" = "MIwithY", - "Fully Observed" = "fullyobserved", - "Mean Imputation" = "mean", - "PSM" = "patternsubmodel"), - - CPM = forcats::fct_relevel(CPM, - "Fully Observed", - "CCA", - "Mean Imputation", - "RI", - "MI with Y", - "MI no Y", - "PSM")) + if(X_categorical_value == FALSE) { + + ModelDegradation <- ModelDegradation %>% + dplyr::mutate("Model_Degradation" = Model_Performance - True_Performance) %>% + dplyr::mutate(CPM = str_remove(CPM, "PR_"), + CPM = forcats::fct_recode(CPM, + "CCA" = "CCA", + "RI" = "RI", + "MI no Y" = "MInoY", + "MI with Y" = "MIwithY", + "Fully Observed" = "fullyobserved", + "Mean Imputation" = "mean", + "PSM" = "patternsubmodel"), + CPM = forcats::fct_relevel(CPM, + "Fully Observed", + "CCA", + "Mean Imputation", + "RI", + "MI with Y", + "MI no Y", + "PSM"), + + Performance_Metric = stringr::str_remove(Performance_Metric, + "_Mean"), + Performance_Metric = forcats::fct_recode(Performance_Metric, + "Brier Score" = "Brier", + "Calibration Intercept" = "CalInt", + "Calibration Slope" = "CalSlope"), + + Validation_Dataset = forcats::fct_relevel(Validation_Dataset, + "E_all", + "E_mean", + "E_RI", + "E_MI", + "E_PSM"), + Validation_Dataset = forcats::fct_recode(Validation_Dataset, + "E-all" = "E_all", + "E-mean" = "E_mean", + "E-RI" = "E_RI", + "E-MI" = "E_MI", + "E-PSM" = "E_PSM")) + } else{ + + ModelDegradation <- ModelDegradation %>% + dplyr::mutate("Model_Degradation" = Model_Performance - True_Performance) %>% + dplyr::mutate(CPM = str_remove(CPM, "PR_"), + CPM = forcats::fct_recode(CPM, + "CCA" = "CCA", + "RI" = "RI", + "MI no Y" = "MInoY", + "MI with Y" = "MIwithY", + "Fully Observed" = "fullyobserved", + "Mean Imputation" = "zero", + "PSM" = "patternsubmodel"), + CPM = forcats::fct_relevel(CPM, + "Fully Observed", + "CCA", + "Mean Imputation", + "RI", + "MI with Y", + "MI no Y", + "PSM"), + + Performance_Metric = stringr::str_remove(Performance_Metric, + "_Mean"), + Performance_Metric = forcats::fct_recode(Performance_Metric, + "Brier Score" = "Brier", + "Calibration Intercept" = "CalInt", + "Calibration Slope" = "CalSlope"), + + Validation_Dataset = forcats::fct_relevel(Validation_Dataset, + "E_all", + "E_mean", + "E_RI", + "E_MI", + "E_PSM"), + Validation_Dataset = forcats::fct_recode(Validation_Dataset, + "E-all" = "E_all", + "E-mean" = "E_mean", + "E-RI" = "E_RI", + "E-MI" = "E_MI", + "E-PSM" = "E_PSM")) + } - plot_df %>% - dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset, - "Fully Observed Data" = "All Data Required"), - Validation_Dataset = forcats::fct_relevel(Validation_Dataset, - "Fully Observed Data", - "CCA", - "Mean Imputation", - "RI (refit)", - "RI (transported)", - "MI with Y (refit)", - "MI with Y (transported)", - "MI no Y (refit)", - "MI no Y (transported)", - "Pattern Sub-Model")) %>% - ggplot(aes(x = Mean, - y = CPM)) + + ModelDegradation %>% + dplyr::filter(Validation_Dataset %in% target_estimands) %>% + ggplot(aes(x = Model_Degradation, y = CPM)) + geom_point() + - geom_errorbar(aes(xmin = quantileLower, - xmax = quantileUpper), width=.1) + - xlab("Predictive Performance") + + geom_vline(xintercept = 0, linetype = "dotted") + + ggh4x::facet_grid2(Performance_Metric ~ Validation_Dataset, scales = "free_y") + + xlab("Model Degradation \n (deployment performance - true model performance)") + ylab("Developed CPM") + - geom_vline(data = data.frame("Metric" = c("Calibration Intercept", - "Calibration Slope"), - "Mean" = c(0,1)), - aes(xintercept = Mean), - linetype = "dashed") + - ggh4x::facet_grid2(Scenario ~ Metric, scales = "free") + - ggtitle(paste("Targetting ", target_performance_imputation_method, " Performance", sep = "")) + theme_minimal(base_size = 12) + theme(legend.position = "bottom", - panel.background = element_rect(fill = "gray90"), + panel.background = element_rect(fill = "gray90"), panel.spacing.x = unit(0.5, "lines"), axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), plot.title = element_text(size = 14, hjust = 0.5), - panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) + panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) } -## Consistent missingness mechanism plots: -#rho = 0 -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(4, 148), - target_performance_imputation_method = target_performance) - -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(436, 220, 508), - target_performance_imputation_method = target_performance) - -#rho = 0.75 -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(8, 152), - target_performance_imputation_method = target_performance) - -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(440, 224, 512), - target_performance_imputation_method = target_performance) -## Inconsistent missingness mechanism plots: -#rho = 0 -#MCAR+something -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(20, 52, 28, 60), - target_performance_imputation_method = target_performance) -#MAR+something -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(180, 156, 188), - target_performance_imputation_method = target_performance) -#MNAR+something -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(412, 444, 252), - target_performance_imputation_method = target_performance) - -#rho = 0.75 -#MCAR+something -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(24, 56, 32, 64), - target_performance_imputation_method = target_performance) -#MAR+something -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(184, 161, 193), - target_performance_imputation_method = target_performance) -#MNAR-X+something -performance_plotting_fnc(df = simulation_results_summarised, - scenario_number = c(416, 448, 256), - target_performance_imputation_method = target_performance) - - - -### Function to plot the bias in validation results: -plotting_fnc <- function(df, - scenario_number, - target_performance_imputation_method = c("All Data Required", - "CCA", - "Mean Imputation", - "RI (refit)", - "RI (transported)", - "MI no Y (refit)", - "MI no Y (transported)", - "MI with Y (refit)", - "MI with Y (transported)", - "Pattern Sub-Model")) { +model_bias_fnc <- function(df, + DAG_Scenario, + X_categorical_value, + R_prev_value, + rho_X_value, + gamma_x1_value, + target_estimands) { + + if(all((target_estimands %in% c("E-all", + "E-mean", + "E-RI", + "E-MI", + "E-PSM")))==FALSE) { + stop("target_estimands vector needs to take correct estimand characters") + } - target_performance_imputation_method <- as.character(match.arg(target_performance_imputation_method)) + #filter to the required simulation scenario: + df <- df %>% + dplyr::filter(Scenario == DAG_Scenario & + X_categorical == X_categorical_value & + R_prev == R_prev_value & + rho_X == rho_X_value & + gamma_x1 == gamma_x1_value) + #remove the DGM model performance as dont need true performance for this part df <- df %>% - dplyr::mutate("combinations" = case_when( - CPM == "PR_fullyobserved" | - CPM == "PR_CCA" | - CPM == "PR_mean" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Mean Imputation", - "MI with Y (refit)", - "MI no Y (refit)", - "RI (refit)"), - CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Mean Imputation", - "MI with Y (refit)", - "MI no Y (refit)", - "RI (transported)", - "RI (refit)"), - CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required", + dplyr::filter(CPM != "PR_DGM") + + #define deployment performance, for each CPM across all estimands + if(X_categorical_value == FALSE) { + Deployment_Performance <- df %>% + dplyr::filter(Validation_Dataset %in% c( "All Data Required", + "Mean Imputation", + "RI (refit)", + "MI no Y (refit)", + "Pattern Sub-Model")) %>% + dplyr::mutate(Target_Estimand = forcats::fct_recode(Validation_Dataset, + "E_all" = "All Data Required", + "E_mean" = "Mean Imputation", + "E_RI" = "RI (refit)", + "E_MI" = "MI no Y (refit)", + "E_PSM" = "Pattern Sub-Model")) %>% + dplyr::select(Simulation_Scenario, + Scenario, + Y_prev, + X_categorical, + R_prev, + rho_X, + gamma_x1, + gamma_x2, + CPM, + Target_Estimand, + dplyr::ends_with("_Mean")) %>% + tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"), + values_to = "Deployment_Performance", + names_to = "Performance_Metric") %>% + tidyr::pivot_wider(id_cols = c(Simulation_Scenario, + Scenario, + Y_prev, + X_categorical, + R_prev, + rho_X, + gamma_x1, + gamma_x2, + CPM, + Performance_Metric), + names_from = "Target_Estimand", + values_from = "Deployment_Performance") + } else { + Deployment_Performance <- df %>% + dplyr::filter(Validation_Dataset %in% c( "All Data Required", + "Risk Absent Imputation", + "RI (refit)", + "MI no Y (refit)", + "Pattern Sub-Model")) %>% + dplyr::mutate(Target_Estimand = forcats::fct_recode(Validation_Dataset, + "E_all" = "All Data Required", + "E_mean" = "Risk Absent Imputation", + "E_RI" = "RI (refit)", + "E_MI" = "MI no Y (refit)", + "E_PSM" = "Pattern Sub-Model")) %>% + dplyr::select(Simulation_Scenario, + Scenario, + Y_prev, + X_categorical, + R_prev, + rho_X, + gamma_x1, + gamma_x2, + CPM, + Target_Estimand, + dplyr::ends_with("_Mean")) %>% + tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"), + values_to = "Deployment_Performance", + names_to = "Performance_Metric") %>% + tidyr::pivot_wider(id_cols = c(Simulation_Scenario, + Scenario, + Y_prev, + X_categorical, + R_prev, + rho_X, + gamma_x1, + gamma_x2, + CPM, + Performance_Metric), + names_from = "Target_Estimand", + values_from = "Deployment_Performance") + } + + if(X_categorical_value == FALSE) { + + CPM_performance <- df %>% + #obtain valid development-validation pairs (table 1 of paper): + dplyr::mutate("combinations" = case_when( + CPM == "PR_fullyobserved" | + CPM == "PR_CCA" | + CPM == "PR_mean" ~ Validation_Dataset %in% c("All Data Required", + "CCA", + "Mean Imputation", + "MI with Y (refit)", + "MI no Y (refit)", + "RI (refit)"), + CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required", + "CCA", + "Mean Imputation", + "MI with Y (refit)", + "MI no Y (refit)", + "RI (transported)", + "RI (refit)"), + CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required", + "CCA", + "Mean Imputation", + "MI with Y (transported)", + "MI with Y (refit)", + "MI no Y (refit)", + "RI (refit)"), + CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required", "CCA", "Mean Imputation", - "MI with Y (transported)", "MI with Y (refit)", + "MI no Y (transported)", "MI no Y (refit)", "RI (refit)"), - CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Mean Imputation", - "MI with Y (refit)", - "MI no Y (transported)", - "MI no Y (refit)", - "RI (refit)"), - - CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required", + + CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required", + "CCA", + "Mean Imputation", + "MI with Y (refit)", + "MI no Y (refit)", + "RI (refit)", + "Pattern Sub-Model"), + .default = NA)) %>% + dplyr::filter(combinations == 1) %>% + dplyr::select(Simulation_Scenario, + Scenario, + Y_prev, + X_categorical, + R_prev, + rho_X, + gamma_x1, + gamma_x2, + CPM, + Validation_Dataset, + dplyr::ends_with("_Mean")) %>% + tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"), + values_to = "Model_Performance", + names_to = "Performance_Metric") + + ModelBias <- CPM_performance %>% + dplyr::left_join(Deployment_Performance, + by = c("Simulation_Scenario", + "Scenario", + "Y_prev", + "X_categorical", + "R_prev", + "rho_X", + "gamma_x1", + "gamma_x2", + "CPM", + "Performance_Metric")) %>% + dplyr::mutate("Eall_Model_Bias" = Model_Performance - E_all, + "Emean_Model_Bias" = Model_Performance - E_mean, + "ERI_Model_Bias" = Model_Performance - E_RI, + "EMI_Model_Bias" = Model_Performance - E_MI, + "EPSM_Model_Bias" = Model_Performance - E_PSM) %>% + dplyr::mutate(CPM = str_remove(CPM, "PR_"), + CPM = forcats::fct_recode(CPM, + "CCA \n developed CPM" = "CCA", + "RI \n developed CPM" = "RI", + "MI no Y \n developed CPM" = "MInoY", + "MI with Y \n developed CPM" = "MIwithY", + "Fully Observed \n developed CPM" = "fullyobserved", + "Mean Imputation \n developed CPM" = "mean", + "PSM \n developed CPM" = "patternsubmodel"), + CPM = forcats::fct_relevel(CPM, + "Fully Observed \n developed CPM", + "CCA \n developed CPM", + "Mean Imputation \n developed CPM", + "RI \n developed CPM", + "MI with Y \n developed CPM", + "MI no Y \n developed CPM", + "PSM \n developed CPM"), + + Performance_Metric = stringr::str_remove(Performance_Metric, + "_Mean"), + Performance_Metric = forcats::fct_recode(Performance_Metric, + "Brier Score" = "Brier", + "Calibration Intercept" = "CalInt", + "Calibration Slope" = "CalSlope"), + + Validation_Dataset = forcats::fct_recode(Validation_Dataset, + "Fully Observed Data" = "All Data Required"), + Validation_Dataset = forcats::fct_relevel(Validation_Dataset, + "Fully Observed Data", "CCA", "Mean Imputation", + "RI (refit)", + "RI (transported)", "MI with Y (refit)", + "MI with Y (transported)", "MI no Y (refit)", - "RI (refit)", - "Pattern Sub-Model"), - .default = NA)) %>% - dplyr::filter(combinations == 1) - - scenario_df <- df %>% - dplyr::filter(Simulation_Scenario %in% scenario_number) %>% - dplyr::select(Simulation_Scenario, Scenario, - CPM, Validation_Dataset, - contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper")) %>% - pivot_longer(cols = c(contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper"))) %>% - separate_wider_delim(name, - delim = "_", - names = c("Metric", "Summary_Type")) %>% - pivot_wider(id_cols = c("Simulation_Scenario", "Scenario", - "CPM", "Validation_Dataset", "Metric"), - names_from = "Summary_Type", - values_from = "value") - - results_using_for_implementation <- df %>% - dplyr::filter(Simulation_Scenario %in% scenario_number) %>% - dplyr::filter(Validation_Dataset == target_performance_imputation_method) %>% - dplyr::select(Simulation_Scenario, Scenario, - CPM, Validation_Dataset, - contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper")) %>% - pivot_longer(cols = c(contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper"))) %>% - separate_wider_delim(name, - delim = "_", - names = c("Metric", "Summary_Type")) %>% - pivot_wider(id_cols = c("Simulation_Scenario", "Scenario", - "CPM", "Validation_Dataset", "Metric"), - names_from = "Summary_Type", - values_from = "value") %>% - dplyr::rename("Imp_Data" = Validation_Dataset, - "Imp_Mean" = Mean, - "Imp_quantileLower" = quantileLower, - "Imp_quantileUpper" = quantileUpper) - - plot_df <- scenario_df %>% - dplyr::left_join(results_using_for_implementation, - by = c("Simulation_Scenario", - "Scenario", - "CPM", - "Metric")) %>% - dplyr::mutate("Bias" = Mean - Imp_Mean, - "Bias_Lower" = quantileLower - Imp_quantileLower, - "Bias_Upper" = quantileUpper - Imp_quantileUpper) %>% - dplyr::mutate(Metric = forcats::fct_recode(Metric, - "Brier Score" = "Brier", - "Calibration Intercept" = "CalInt", - "Calibration Slope" = "CalSlope")) %>% - mutate(CPM = stringr::str_remove(CPM, "PR_"), - - CPM = forcats::fct_recode(CPM, - "CCA \n developed CPM" = "CCA", - "RI \n developed CPM" = "RI", - "MI no Y \n developed CPM" = "MInoY", - "MI with Y \n developed CPM" = "MIwithY", - "Fully Observed \n developed CPM" = "fullyobserved", - "Mean Imputation \n developed CPM" = "mean", - "PSM \n developed CPM" = "patternsubmodel"), - - CPM = forcats::fct_relevel(CPM, - "Fully Observed \n developed CPM", - "CCA \n developed CPM", - "Mean Imputation \n developed CPM", - "RI \n developed CPM", - "MI with Y \n developed CPM", - "MI no Y \n developed CPM", - "PSM \n developed CPM")) - - plot_df %>% - tidyr::drop_na() %>% - dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset, - "Fully Observed Data" = "All Data Required"), - Validation_Dataset = forcats::fct_relevel(Validation_Dataset, - "Fully Observed Data", - "CCA", - "Mean Imputation", - "RI (refit)", - "RI (transported)", - "MI with Y (refit)", - "MI with Y (transported)", - "MI no Y (refit)", - "MI no Y (transported)", - "Pattern Sub-Model")) %>% - ggplot(aes(x = Bias, - y = Validation_Dataset, - color = Metric)) + - geom_point(aes(shape = Metric)) + - geom_errorbar(aes(xmin = Bias_Lower, - xmax = Bias_Upper), width=.1) + - scale_color_brewer(palette = "Set1") + - xlab("Bias") + - ylab("Validation Data Imputation Method") + - geom_vline(data = data.frame("Metric" = c("AUC", - "Brier Score", - "Calibration Intercept", - "Calibration Slope"), - "Bias" = c(0,0,0,0)), - aes(xintercept = Bias), - linetype = "dashed") + - ggh4x::facet_grid2(Scenario ~ CPM, scales = "fixed") + - ggtitle(paste("Targetting ", target_performance_imputation_method, " Performance", sep = "")) + - theme_minimal(base_size = 12) + - theme(legend.position = "bottom", - panel.background = element_rect(fill = "gray90"), - panel.spacing.x = unit(0.5, "lines"), - axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), - plot.title = element_text(size = 14, hjust = 0.5), - panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) -} - -## Consistent missingness mechanism plots: -#rho = 0 -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(4, 148), - target_performance_imputation_method = target_performance) - -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(436, 220, 508), - target_performance_imputation_method = target_performance) - -#rho = 0.75 -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(8, 152), - target_performance_imputation_method = target_performance) - -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(440, 224, 512), - target_performance_imputation_method = target_performance) - -## Inconsistent missingness mechanism plots: -#rho = 0 -#MCAR+something -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(20, 52, 28, 60), - target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(180, 156, 188), - target_performance_imputation_method = target_performance) -#MNAR+something -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(412, 444, 252), - target_performance_imputation_method = target_performance) - -#rho = 0.75 -#MCAR+something -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(24, 56, 32, 64), - target_performance_imputation_method = target_performance) -#MAR+something -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(184, 161, 193), - target_performance_imputation_method = target_performance) -#MNAR-X+something -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(416, 448, 256), - target_performance_imputation_method = target_performance) - - - - -###---------------------------------------------------------------------------- -## Now look at a few X_categorical cases to check similar results - -plotting_fnc <- function(df, - scenario_number, - target_performance_imputation_method = c("All Data Required", - "CCA", - "Risk Absent Imputation", - "RI (refit)", - "RI (transported)", - "MI no Y (refit)", - "MI no Y (transported)", - "MI with Y (refit)", - "MI with Y (transported)", - "Pattern Sub-Model")) { - - target_performance_imputation_method <- as.character(match.arg(target_performance_imputation_method)) - - df <- df %>% - dplyr::mutate("combinations" = case_when( - CPM == "PR_fullyobserved" | - CPM == "PR_CCA" | - CPM == "PR_zero" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Risk Absent Imputation", - "MI with Y (refit)", - "MI no Y (refit)", - "RI (refit)"), - CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Risk Absent Imputation", - "MI with Y (refit)", - "MI no Y (refit)", - "RI (transported)", - "RI (refit)"), - CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required", + "MI no Y (transported)", + "Pattern Sub-Model")) + } else { + + CPM_performance <- df %>% + #obtain valid development-validation pairs (table 1 of paper): + dplyr::mutate("combinations" = case_when( + CPM == "PR_fullyobserved" | + CPM == "PR_CCA" | + CPM == "PR_zero" ~ Validation_Dataset %in% c("All Data Required", + "CCA", + "Risk Absent Imputation", + "MI with Y (refit)", + "MI no Y (refit)", + "RI (refit)"), + CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required", + "CCA", + "Risk Absent Imputation", + "MI with Y (refit)", + "MI no Y (refit)", + "RI (transported)", + "RI (refit)"), + CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required", + "CCA", + "Risk Absent Imputation", + "MI with Y (transported)", + "MI with Y (refit)", + "MI no Y (refit)", + "RI (refit)"), + CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required", "CCA", "Risk Absent Imputation", - "MI with Y (transported)", "MI with Y (refit)", + "MI no Y (transported)", "MI no Y (refit)", "RI (refit)"), - CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required", - "CCA", - "Risk Absent Imputation", - "MI with Y (refit)", - "MI no Y (transported)", - "MI no Y (refit)", - "RI (refit)"), - - CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required", + + CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required", + "CCA", + "Risk Absent Imputation", + "MI with Y (refit)", + "MI no Y (refit)", + "RI (refit)", + "Pattern Sub-Model"), + .default = NA)) %>% + dplyr::filter(combinations == 1) %>% + dplyr::select(Simulation_Scenario, + Scenario, + Y_prev, + X_categorical, + R_prev, + rho_X, + gamma_x1, + gamma_x2, + CPM, + Validation_Dataset, + dplyr::ends_with("_Mean")) %>% + tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"), + values_to = "Model_Performance", + names_to = "Performance_Metric") + + ModelBias <- CPM_performance %>% + dplyr::left_join(Deployment_Performance, + by = c("Simulation_Scenario", + "Scenario", + "Y_prev", + "X_categorical", + "R_prev", + "rho_X", + "gamma_x1", + "gamma_x2", + "CPM", + "Performance_Metric")) %>% + dplyr::mutate("Eall_Model_Bias" = Model_Performance - E_all, + "Emean_Model_Bias" = Model_Performance - E_mean, + "ERI_Model_Bias" = Model_Performance - E_RI, + "EMI_Model_Bias" = Model_Performance - E_MI, + "EPSM_Model_Bias" = Model_Performance - E_PSM) %>% + dplyr::mutate(CPM = str_remove(CPM, "PR_"), + CPM = forcats::fct_recode(CPM, + "CCA \n developed CPM" = "CCA", + "RI \n developed CPM" = "RI", + "MI no Y \n developed CPM" = "MInoY", + "MI with Y \n developed CPM" = "MIwithY", + "Fully Observed \n developed CPM" = "fullyobserved", + "Mean Imputation \n developed CPM" = "zero", + "PSM \n developed CPM" = "patternsubmodel"), + CPM = forcats::fct_relevel(CPM, + "Fully Observed \n developed CPM", + "CCA \n developed CPM", + "Mean Imputation \n developed CPM", + "RI \n developed CPM", + "MI with Y \n developed CPM", + "MI no Y \n developed CPM", + "PSM \n developed CPM"), + + Performance_Metric = stringr::str_remove(Performance_Metric, + "_Mean"), + Performance_Metric = forcats::fct_recode(Performance_Metric, + "Brier Score" = "Brier", + "Calibration Intercept" = "CalInt", + "Calibration Slope" = "CalSlope"), + + Validation_Dataset = forcats::fct_recode(Validation_Dataset, + "Fully Observed Data" = "All Data Required"), + Validation_Dataset = forcats::fct_relevel(Validation_Dataset, + "Fully Observed Data", "CCA", - "Risk Absent Imputation", + "Mean Imputation", + "RI (refit)", + "RI (transported)", "MI with Y (refit)", + "MI with Y (transported)", "MI no Y (refit)", - "RI (refit)", - "Pattern Sub-Model"), - .default = NA)) %>% - dplyr::filter(combinations == 1) - - scenario_df <- df %>% - dplyr::filter(Simulation_Scenario %in% scenario_number) %>% - dplyr::select(Simulation_Scenario, Scenario, - CPM, Validation_Dataset, - contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper")) %>% - pivot_longer(cols = c(contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper"))) %>% - separate_wider_delim(name, - delim = "_", - names = c("Metric", "Summary_Type")) %>% - pivot_wider(id_cols = c("Simulation_Scenario", "Scenario", - "CPM", "Validation_Dataset", "Metric"), - names_from = "Summary_Type", - values_from = "value") + "MI no Y (transported)", + "Pattern Sub-Model")) + } - results_using_for_implementation <- df %>% - dplyr::filter(Simulation_Scenario %in% scenario_number) %>% - dplyr::filter(Validation_Dataset == target_performance_imputation_method) %>% - dplyr::select(Simulation_Scenario, Scenario, - CPM, Validation_Dataset, - contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper")) %>% - pivot_longer(cols = c(contains("_Mean"), - contains("_quantileLower"), - contains("_quantileUpper"))) %>% - separate_wider_delim(name, - delim = "_", - names = c("Metric", "Summary_Type")) %>% - pivot_wider(id_cols = c("Simulation_Scenario", "Scenario", - "CPM", "Validation_Dataset", "Metric"), - names_from = "Summary_Type", - values_from = "value") %>% - dplyr::rename("Imp_Data" = Validation_Dataset, - "Imp_Mean" = Mean, - "Imp_quantileLower" = quantileLower, - "Imp_quantileUpper" = quantileUpper) - - plot_df <- scenario_df %>% - dplyr::left_join(results_using_for_implementation, - by = c("Simulation_Scenario", - "Scenario", - "CPM", - "Metric")) %>% - dplyr::mutate("Bias" = Mean - Imp_Mean, - "Bias_Lower" = quantileLower - Imp_quantileLower, - "Bias_Upper" = quantileUpper - Imp_quantileUpper) %>% - dplyr::mutate(Metric = forcats::fct_recode(Metric, - "Brier Score" = "Brier", - "Calibration Intercept" = "CalInt", - "Calibration Slope" = "CalSlope")) %>% - mutate(CPM = stringr::str_remove(CPM, "PR_"), - - CPM = forcats::fct_recode(CPM, - "CCA \n developed CPM" = "CCA", - "RI \n developed CPM" = "RI", - "MI no Y \n developed CPM" = "MInoY", - "MI with Y \n developed CPM" = "MIwithY", - "Fully Observed \n developed CPM" = "fullyobserved", - "Risk Absent Imputation \n developed CPM" = "zero", - "Pattern Sub-Model \n developed CPM" = "patternsubmodel"), - - CPM = forcats::fct_relevel(CPM, - "Fully Observed \n developed CPM", - "CCA \n developed CPM", - "Risk Absent Imputation \n developed CPM", - "RI \n developed CPM", - "MI with Y \n developed CPM", - "MI no Y \n developed CPM", - "Pattern Sub-Model \n developed CPM")) - - plot_df %>% - tidyr::drop_na() %>% - dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset, - "Fully Observed Data" = "All Data Required"), - Validation_Dataset = forcats::fct_relevel(Validation_Dataset, - "Fully Observed Data", - "CCA", - "Risk Absent Imputation", - "RI (refit)", - "RI (transported)", - "MI with Y (refit)", - "MI with Y (transported)", - "MI no Y (refit)", - "MI no Y (transported)", - "Pattern Sub-Model")) %>% + ModelBias %>% + dplyr::select(CPM, + Validation_Dataset, + Performance_Metric, + dplyr::ends_with("_Bias")) %>% + tidyr::pivot_longer(cols = dplyr::ends_with("_Bias"), + names_to = "Estimand", + values_to = "Bias") %>% + dplyr::mutate(Estimand = stringr:::str_remove(Estimand, + "_Model_Bias"), + Estimand = forcats::fct_recode(Estimand, + "E-all" = "Eall", + "E-mean" = "Emean", + "E-RI" = "ERI", + "E-MI" = "EMI", + "E-PSM" = "EPSM"), + Estimand = forcats::fct_relevel(Estimand, + "E-all", + "E-mean" , + "E-RI" , + "E-MI" , + "E-PSM" )) %>% + dplyr::filter(!is.na(Bias)) %>% + dplyr::filter(Estimand %in% target_estimands) %>% ggplot(aes(x = Bias, y = Validation_Dataset, - color = Metric)) + - geom_point(aes(shape = Metric)) + - geom_errorbar(aes(xmin = Bias_Lower, - xmax = Bias_Upper), width=.1) + + color = Performance_Metric)) + + geom_point(aes(shape = Performance_Metric)) + scale_color_brewer(palette = "Set1") + - xlab("Bias") + + xlab("Validation Bias \n (validation performance - deployment performance)") + ylab("Validation Data Imputation Method") + geom_vline(data = data.frame("Metric" = c("AUC", "Brier Score", @@ -740,19 +701,119 @@ plotting_fnc <- function(df, "Bias" = c(0,0,0,0)), aes(xintercept = Bias), linetype = "dashed") + - ggh4x::facet_grid2(Scenario ~ CPM, scales = "fixed") + - ggtitle(paste("Targetting ", target_performance_imputation_method, " Performance", sep = "")) + - theme_minimal() + + ggh4x::facet_grid2(Estimand ~ CPM, scales = "fixed") + + theme_minimal(base_size = 12) + theme(legend.position = "bottom", panel.background = element_rect(fill = "gray90"), panel.spacing.x = unit(0.5, "lines"), - axis.title.y = element_text(size=14), axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), plot.title = element_text(size = 14, hjust = 0.5), panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) } -plotting_fnc(df = simulation_results_summarised, - scenario_number = c(3023), - target_performance_imputation_method = "All Data Required") + + +####---------------------------------------------------------------------------- +### The below plots look at scenarios where X_1 is continuous, contains 50% +### missing data, rho_X=0.75, and where gamma_1=gamma_2=0.5. Results were +### quantitatively similar in other scenarios +####---------------------------------------------------------------------------- + +### First look at cases with consistent missingness DAGs across development and validation +model_degrdation_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (b) Dev -\n DAG (b) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) +model_bias_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (b) Dev -\n DAG (b) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) + + +model_degrdation_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (c) Dev -\n DAG (c) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) +model_bias_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (c) Dev -\n DAG (c) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) + + +model_degrdation_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (d) Dev -\n DAG (d) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) +model_bias_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (d) Dev -\n DAG (d) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) + + +model_degrdation_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (e) Dev -\n DAG (e) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) +model_bias_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (e) Dev -\n DAG (e) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) + + +model_degrdation_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (f) Dev -\n DAG (f) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) +model_bias_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (f) Dev -\n DAG (f) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) + + +### Second look at cases with different missingness DAGs across development and validation + +model_degrdation_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (c) Dev -\n DAG (f) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) +model_bias_fnc(df = simulation_results_summarised, + DAG_Scenario = "DAG (c) Dev -\n DAG (f) Val", + X_categorical_value = FALSE, + R_prev_value = 0.5, + rho_X_value = 0.75, + gamma_x1_value = 0.5, + target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM")) diff --git a/Code/04_ncorr_analysis.R b/Code/04_ncorr_analysis.R index ecc66e2..4ea53cd 100644 --- a/Code/04_ncorr_analysis.R +++ b/Code/04_ncorr_analysis.R @@ -843,10 +843,10 @@ Bootstrap_InternalValidation %>% name = forcats::fct_relevel(name, c("MeanMode", "RI", "MInoY", "PSM")), name = forcats::fct_recode(name, - "Targetting\n MI-no Y Performance" = "MInoY", - "Targetting\n Mean/Mode Performance" = "MeanMode", - "Targetting\n PSM Performance" = "PSM", - "Targetting\n RI Performance" = "RI"), + "E-MI" = "MInoY", + "E-mean" = "MeanMode", + "E-PSM" = "PSM", + "E-RI" = "RI"), CPM = str_remove(CPM, "PR_"), CPM = forcats::fct_recode(CPM, @@ -884,7 +884,7 @@ Bootstrap_InternalValidation %>% geom_errorbar(aes(xmin = bias_lower, xmax = bias_upper), width=.1) + scale_color_brewer(palette = "Set1") + - xlab("Bias") + + xlab("Validation Performance - Deployment Performance") + ylab("Validation Data Imputation Method") + geom_vline(data = data.frame("Metric" = c("AUC", "Brier Score", @@ -893,8 +893,7 @@ Bootstrap_InternalValidation %>% "bias_mean" = c(0,0,0,0)), aes(xintercept = bias_mean), linetype = "dashed") + - ggh4x::facet_grid2(CPM ~ Target, scales = "fixed") + - ggtitle("Targetted Performance ") + + ggh4x::facet_grid2(Target ~ CPM, scales = "fixed") + theme_minimal(base_size = 12) + theme(legend.position = "bottom", panel.background = element_rect(fill = "gray90"), @@ -904,6 +903,5 @@ Bootstrap_InternalValidation %>% panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) ggsave(file = here::here("Manuscript", "Fig6.tiff"), - height = 10, width = 10, dpi = 300) diff --git a/Code/SimulationCSFRun.txt b/Code/SimulationCSFRun.txt new file mode 100644 index 0000000..17ac602 --- /dev/null +++ b/Code/SimulationCSFRun.txt @@ -0,0 +1,10 @@ +#!/bin/bash --login +#SBATCH -p himem +#SBATCH --mem=1000G +#SBATCH -t 7-0 + +#SBATCH -a 1-600 + +module load apps/gcc/R/4.5.0 + +Rscript 01_Simulation_Run_CSF.R ${SLURM_ARRAY_TASK_ID} \ No newline at end of file diff --git a/Code/SimulationRun.qsub b/Code/SimulationRun.qsub deleted file mode 100644 index 8331cf8..0000000 --- a/Code/SimulationRun.qsub +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash --login -#$ -cwd - -module load apps/gcc/R/4.4.0 -module load tools/gcc/cmake/3.25.1 - -#$ -t 1-3072 - -Rscript 01_Simulation_Run_CSF.R $SGE_TASK_ID > simulation_warnings.$SGE_TASK_ID diff --git a/Outputs/simulation_results_summarised.RDS b/Outputs/simulation_results_summarised.RDS index d543def..0bb979d 100644 Binary files a/Outputs/simulation_results_summarised.RDS and b/Outputs/simulation_results_summarised.RDS differ