diff --git a/.DS_Store b/.DS_Store
index 0fded88..9a4ff71 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/.Rhistory b/.Rhistory
index 97d322f..91d6d6d 100644
--- a/.Rhistory
+++ b/.Rhistory
@@ -1,512 +1,512 @@
-"PSM")),
-Validation_Dataset = str_remove(Validation_Dataset, "_val"),
-Validation_Dataset = forcats::fct_recode(Validation_Dataset,
-"PSM" = "raw",
-"MI with Y" = "MIwithY",
-"MI no Y" = "MInoY",
-"Mean/Mode" = "mean_imputed"),
-Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
-c("CCA",
-"Mean/Mode",
-"RI",
-"MI with Y",
-"MI no Y",
-"PSM"))) %>%
-rename("Target" = name) %>%
-ggplot(aes(x = bias_mean,
-y = Validation_Dataset,
-color = Metric)) +
-geom_point(aes(shape = Metric)) +
-geom_errorbar(aes(xmin = bias_lower,
-xmax = bias_upper), width=.1) +
-scale_color_brewer(palette = "Set1") +
-xlab("Bias") +
-ylab("Validation Data Imputation Method") +
-geom_vline(data = data.frame("Metric" = c("AUC",
-"Brier Score",
-"Calibration Intercept",
-"Calibration Slope"),
-"bias_mean" = c(0,0,0,0)),
-aes(xintercept = bias_mean),
-linetype = "dashed") +
-ggh4x::facet_grid2(CPM ~ Target, scales = "fixed") +
-ggtitle("Targetted Performance ") +
-theme_minimal(base_size = 12) +
-theme(legend.position = "bottom",
-panel.background = element_rect(fill = "gray90"),
-panel.spacing.x = unit(0.5, "lines"),
-axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
-plot.title = element_text(size = 14, hjust = 0.5),
-panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))
-##plot results from the empirical study
-Bootstrap_InternalValidation %>%
-left_join(Bootstrap_InternalValidation %>%
-filter(Validation_Dataset == "mean_imputed_val") %>%
-select(-Validation_Dataset) %>%
-rename("MeanMode_est" = "est",
-"MeanMode_var" = "var"),
-by = c("Bootstrap_Index", "CPM", "Metric")) %>%
-left_join(Bootstrap_InternalValidation %>%
-filter(Validation_Dataset == "RI_val") %>%
-select(-Validation_Dataset) %>%
-rename("RI_est" = "est",
-"RI_var" = "var"),
-by = c("Bootstrap_Index", "CPM", "Metric")) %>%
-left_join(Bootstrap_InternalValidation %>%
-filter(Validation_Dataset == "MInoY_val") %>%
-select(-Validation_Dataset) %>%
-rename("MInoY_est" = "est",
-"MInoY_var" = "var"),
-by = c("Bootstrap_Index", "CPM", "Metric")) %>%
-left_join(Bootstrap_InternalValidation %>%
-filter(Validation_Dataset == "raw_val")  %>%
-select(-Validation_Dataset)%>%
-rename("PSM_est" = "est",
-"PSM_var" = "var"),
-by = c("Bootstrap_Index", "CPM", "Metric")) %>%
-select(Bootstrap_Index, CPM, Validation_Dataset, Metric, contains("est")) %>%
-pivot_longer(cols = ends_with("_est")) %>%
-mutate(bias = est - value) %>%
-group_by(CPM, Validation_Dataset, Metric, name) %>%
-summarise(bias_mean = mean(bias, na.rm = TRUE),
-bias_lower = quantile(bias, 0.025, na.rm = TRUE),
-bias_upper = quantile(bias, 0.975, na.rm = TRUE),
-.groups = "drop") %>%
-mutate(name = str_remove(name, "_est"),
-name = forcats::fct_relevel(name,
-c("MeanMode", "RI", "MInoY", "PSM")),
-name = forcats::fct_recode(name,
-"Targetting\n MI-no Y Performance" = "MInoY",
-"Targetting\n Mean/Mode Performance" = "MeanMode",
-"Targetting\n PSM Performance" = "PSM",
-"Targetting\n RI Performance" = "RI"),
-CPM = str_remove(CPM, "PR_"),
-CPM = forcats::fct_recode(CPM,
-"CCA\n developed CPM" = "CCA",
-"RI\n developed CPM" = "RI",
-"PSM\n developed CPM" = "patternsubmodel",
-"MI with Y\n developed CPM" = "MIwithY",
-"MI no Y\n developed CPM" = "MInoY",
-"Mean/Mode\n developed CPM" = "meanimputed"),
-CPM = forcats::fct_relevel(CPM,
-c("CCA\n developed CPM",
-"Mean/Mode\n developed CPM",
-"RI\n developed CPM",
-"MI with Y\n developed CPM",
-"MI no Y\n developed CPM",
-"PSM\n developed CPM")),
-Validation_Dataset = str_remove(Validation_Dataset, "_val"),
-Validation_Dataset = forcats::fct_recode(Validation_Dataset,
-"PSM" = "raw",
-"MI with Y" = "MIwithY",
-"MI no Y" = "MInoY",
-"Mean/Mode" = "mean_imputed"),
-Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
-c("CCA",
-"Mean/Mode",
-"RI",
-"MI with Y",
-"MI no Y",
-"PSM"))) %>%
-rename("Target" = name) %>%
-ggplot(aes(x = bias_mean,
-y = Validation_Dataset,
-color = Metric)) +
-geom_point(aes(shape = Metric)) +
-geom_errorbar(aes(xmin = bias_lower,
-xmax = bias_upper), width=.1) +
-scale_color_brewer(palette = "Set1") +
-xlab("Bias") +
-ylab("Validation Data Imputation Method") +
-geom_vline(data = data.frame("Metric" = c("AUC",
-"Brier Score",
-"Calibration Intercept",
-"Calibration Slope"),
-"bias_mean" = c(0,0,0,0)),
-aes(xintercept = bias_mean),
-linetype = "dashed") +
-ggh4x::facet_grid2(CPM ~ Target, scales = "fixed") +
-ggtitle("Targetted Performance ") +
-theme_minimal(base_size = 12) +
-theme(legend.position = "bottom",
-panel.background = element_rect(fill = "gray90"),
-panel.spacing.x = unit(0.5, "lines"),
-axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
-plot.title = element_text(size = 14, hjust = 0.5),
-panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))
-##plot results from the empirical study
-Bootstrap_InternalValidation %>%
-left_join(Bootstrap_InternalValidation %>%
-filter(Validation_Dataset == "mean_imputed_val") %>%
-select(-Validation_Dataset) %>%
-rename("MeanMode_est" = "est",
-"MeanMode_var" = "var"),
-by = c("Bootstrap_Index", "CPM", "Metric")) %>%
-left_join(Bootstrap_InternalValidation %>%
-filter(Validation_Dataset == "RI_val") %>%
-select(-Validation_Dataset) %>%
-rename("RI_est" = "est",
-"RI_var" = "var"),
-by = c("Bootstrap_Index", "CPM", "Metric")) %>%
-left_join(Bootstrap_InternalValidation %>%
-filter(Validation_Dataset == "MInoY_val") %>%
-select(-Validation_Dataset) %>%
-rename("MInoY_est" = "est",
-"MInoY_var" = "var"),
-by = c("Bootstrap_Index", "CPM", "Metric")) %>%
-left_join(Bootstrap_InternalValidation %>%
-filter(Validation_Dataset == "raw_val")  %>%
-select(-Validation_Dataset)%>%
-rename("PSM_est" = "est",
-"PSM_var" = "var"),
-by = c("Bootstrap_Index", "CPM", "Metric")) %>%
-select(Bootstrap_Index, CPM, Validation_Dataset, Metric, contains("est")) %>%
-pivot_longer(cols = ends_with("_est")) %>%
-mutate(bias = est - value) %>%
-group_by(CPM, Validation_Dataset, Metric, name) %>%
-summarise(bias_mean = mean(bias, na.rm = TRUE),
-bias_lower = quantile(bias, 0.025, na.rm = TRUE),
-bias_upper = quantile(bias, 0.975, na.rm = TRUE),
-.groups = "drop") %>%
-mutate(name = str_remove(name, "_est"),
-name = forcats::fct_relevel(name,
-c("MeanMode", "RI", "MInoY", "PSM")),
-name = forcats::fct_recode(name,
-"Targetting\n MI-no Y Performance" = "MInoY",
-"Targetting\n Mean/Mode Performance" = "MeanMode",
-"Targetting\n PSM Performance" = "PSM",
-"Targetting\n RI Performance" = "RI"),
-CPM = str_remove(CPM, "PR_"),
-CPM = forcats::fct_recode(CPM,
-"CCA\n developed CPM" = "CCA",
-"RI\n developed CPM" = "RI",
-"PSM\n developed CPM" = "patternsubmodel",
-"MI with Y\n developed CPM" = "MIwithY",
-"MI no Y\n developed CPM" = "MInoY",
-"Mean/Mode\n developed CPM" = "meanimputed"),
-CPM = forcats::fct_relevel(CPM,
-c("CCA\n developed CPM",
-"Mean/Mode\n developed CPM",
-"RI\n developed CPM",
-"MI with Y\n developed CPM",
-"MI no Y\n developed CPM",
-"PSM\n developed CPM")),
-Validation_Dataset = str_remove(Validation_Dataset, "_val"),
-Validation_Dataset = forcats::fct_recode(Validation_Dataset,
-"PSM" = "raw",
-"MI with Y" = "MIwithY",
-"MI no Y" = "MInoY",
-"Mean/Mode" = "mean_imputed"),
-Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
-c("CCA",
-"Mean/Mode",
-"RI",
-"MI with Y",
-"MI no Y",
-"PSM"))) %>%
-rename("Target" = name) %>%
-ggplot(aes(x = bias_mean,
-y = Validation_Dataset,
-color = Metric)) +
-geom_point(aes(shape = Metric)) +
-geom_errorbar(aes(xmin = bias_lower,
-xmax = bias_upper), width=.1) +
-scale_color_brewer(palette = "Set1") +
-xlab("Bias") +
-ylab("Validation Data Imputation Method") +
-geom_vline(data = data.frame("Metric" = c("AUC",
-"Brier Score",
-"Calibration Intercept",
-"Calibration Slope"),
-"bias_mean" = c(0,0,0,0)),
-aes(xintercept = bias_mean),
-linetype = "dashed") +
-ggh4x::facet_grid2(CPM ~ Target, scales = "fixed") +
-ggtitle("Targetted Performance ") +
-theme_minimal(base_size = 12) +
-theme(legend.position = "bottom",
-panel.background = element_rect(fill = "gray90"),
-panel.spacing.x = unit(0.5, "lines"),
-axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
-plot.title = element_text(size = 14, hjust = 0.5),
-panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))
-ggsave(file = here::here("Manuscript", "Fig6.tiff"))
-ggsave(file = here::here("Manuscript", "Fig6.tiff"), height = 10, width = 10, dpi = 300)
+source("Code/00_Simulation_Functions.R")
+#run the simulation study:
+simulation_results <- simulation_nrun_fnc(n_iter = 100,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[s],
+X_categorical = sims_parameters$X_categorical[s],
+R_prev = sims_parameters$R_prev[s],
+beta_x1_dev = sims_parameters$beta_x1_dev[s],
+beta_x2_dev = sims_parameters$beta_x2_dev[s],
+beta_Y_dev = sims_parameters$beta_Y_dev[s],
+beta_x1_val = sims_parameters$beta_x1_val[s],
+beta_x2_val = sims_parameters$beta_x2_val[s],
+beta_Y_val = sims_parameters$beta_Y_val[s],
+rho_X = sims_parameters$rho_X[s],
+gamma_x1 = sims_parameters$gamma_x1[s],
+gamma_x2 = sims_parameters$gamma_x2[s])
+#extract each imputed version of the validation set
+val_imputed_datasets <- imputed_datasets[
+which(stringr::str_detect(names(imputed_datasets), "_val"))
+]
+preds_per_data_set <- purrr::map(.x = val_imputed_datasets,
+.f = make_predictions_fnc,
+model_coefs = model_coefs,
+X_categorical = X_categorical)
+preds_per_data_set$RI_val_transportedimputation <- preds_per_data_set$RI_val_transportedimputation %>%
+dplyr::select(-.imp)
+preds_per_data_set$RI_val_newimputation <- preds_per_data_set$RI_val_newimputation %>%
+dplyr::select(-.imp)
+preds_per_data_set
+source("Code/00_Simulation_Functions.R")
+source("Code/00_Simulation_Functions.R")
+#run the simulation study:
+simulation_results <- simulation_nrun_fnc(n_iter = 100,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[s],
+X_categorical = sims_parameters$X_categorical[s],
+R_prev = sims_parameters$R_prev[s],
+beta_x1_dev = sims_parameters$beta_x1_dev[s],
+beta_x2_dev = sims_parameters$beta_x2_dev[s],
+beta_Y_dev = sims_parameters$beta_Y_dev[s],
+beta_x1_val = sims_parameters$beta_x1_val[s],
+beta_x2_val = sims_parameters$beta_x2_val[s],
+beta_Y_val = sims_parameters$beta_Y_val[s],
+rho_X = sims_parameters$rho_X[s],
+gamma_x1 = sims_parameters$gamma_x1[s],
+gamma_x2 = sims_parameters$gamma_x2[s])
+#Apply the sub-model to the raw (non-imputed) validation data:
+Target_measures <- Target_measures %>%
+dplyr::bind_rows(purrr::map_dfr(.x = preds_per_data_set[which(names(preds_per_data_set)=="observed_val")],
+.f = validation_fnc,
+PR_name = "PR_patternsubmodel",
+.id = "Validation_Dataset") %>%
+dplyr::mutate("CPM" = "PR_patternsubmodel",
+.before = "Validation_Dataset"))
+View(Target_measures)
+View(Target_measures)
+source("Code/00_Simulation_Functions.R")
+#run the simulation study:
+simulation_results <- simulation_nrun_fnc(n_iter = 100,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[s],
+X_categorical = sims_parameters$X_categorical[s],
+R_prev = sims_parameters$R_prev[s],
+beta_x1_dev = sims_parameters$beta_x1_dev[s],
+beta_x2_dev = sims_parameters$beta_x2_dev[s],
+beta_Y_dev = sims_parameters$beta_Y_dev[s],
+beta_x1_val = sims_parameters$beta_x1_val[s],
+beta_x2_val = sims_parameters$beta_x2_val[s],
+beta_Y_val = sims_parameters$beta_Y_val[s],
+rho_X = sims_parameters$rho_X[s],
+gamma_x1 = sims_parameters$gamma_x1[s],
+gamma_x2 = sims_parameters$gamma_x2[s])
+#run the simulation study:
+simulation_results <- simulation_nrun_fnc(n_iter = 2,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[s],
+X_categorical = sims_parameters$X_categorical[s],
+R_prev = sims_parameters$R_prev[s],
+beta_x1_dev = sims_parameters$beta_x1_dev[s],
+beta_x2_dev = sims_parameters$beta_x2_dev[s],
+beta_Y_dev = sims_parameters$beta_Y_dev[s],
+beta_x1_val = sims_parameters$beta_x1_val[s],
+beta_x2_val = sims_parameters$beta_x2_val[s],
+beta_Y_val = sims_parameters$beta_Y_val[s],
+rho_X = sims_parameters$rho_X[s],
+gamma_x1 = sims_parameters$gamma_x1[s],
+gamma_x2 = sims_parameters$gamma_x2[s])
+View(simulation_results)
+#Define the size of development, validation and implementation, respectively:
+N_dev <- 100000
+N_val <- 100000
+#run the simulation study:
+simulation_results <- simulation_nrun_fnc(n_iter = 2,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[s],
+X_categorical = sims_parameters$X_categorical[s],
+R_prev = sims_parameters$R_prev[s],
+beta_x1_dev = sims_parameters$beta_x1_dev[s],
+beta_x2_dev = sims_parameters$beta_x2_dev[s],
+beta_Y_dev = sims_parameters$beta_Y_dev[s],
+beta_x1_val = sims_parameters$beta_x1_val[s],
+beta_x2_val = sims_parameters$beta_x2_val[s],
+beta_Y_val = sims_parameters$beta_Y_val[s],
+rho_X = sims_parameters$rho_X[s],
+gamma_x1 = sims_parameters$gamma_x1[s],
+gamma_x2 = sims_parameters$gamma_x2[s])
+View(simulation_results)
+sims_parameters <- tidyr::crossing(
+Y_prev = c(0.2),
+X_categorical = c(TRUE, FALSE),
+R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively
+beta_x1_dev = c(0, 0.5),
+beta_x2_dev = c(0, 0.5),
+beta_Y_dev = c(0, 0.5),
+beta_x1_val = c(0, 0.5),
+beta_x2_val = c(0, 0.5),
+beta_Y_val = c(0, 0.5),
+rho_X = c(0, 0.75),
+gamma_x1 = c(0, 0.5),
+gamma_x2 = 0.5
+) %>%
+mutate(beta_cases_dev = case_when(
+beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev",
+TRUE ~ as.character("not_applicable")
+),
+beta_cases_val = case_when(
+beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val",
+TRUE ~ as.character("not_applicable")
+)) %>%
+filter(beta_cases_dev != "not_applicable" &
+beta_cases_val != "not_applicable") %>%
+select(-beta_cases_dev,
+-beta_cases_val)
 library(tidyverse)
-library(ggh4x)
-simulation_results_summarised <- read_rds(here::here("outputs",
-"simulation_results_summarised.RDS"))
-simulation_results_summarised <- simulation_results_summarised %>%
-dplyr::mutate("Scenario" = paste(paste(Missing_Mech_Dev, "Dev", sep=" "),
-paste(Missing_Mech_Val, "Val", sep=" "),
-sep = " -\n "),
-Scenario = forcats::fct_relevel(Scenario,
-"MCAR Dev -\n MCAR Val",
-"MAR Dev -\n MAR Val",
-"MNAR_X Dev -\n MNAR_X Val",
-"MNAR_Y Dev -\n MNAR_Y Val",
-"MNAR_XY Dev -\n MNAR_XY Val",
-"MCAR Dev -\n MAR Val",
-"MCAR Dev -\n MNAR_X Val",
-"MCAR Dev -\n MNAR_Y Val",
-"MCAR Dev -\n MNAR_XY Val",
-"MAR Dev -\n MNAR_X Val",
-"MAR Dev -\n MNAR_Y Val",
-"MAR Dev -\n MNAR_XY Val",
-"MNAR_X Dev -\n MNAR_Y Val",
-"MNAR_X Dev -\n MNAR_XY Val"),
-Validation_Dataset = forcats::fct_recode(Validation_Dataset,
-"CCA" = "CCA_val",
-"MI no Y (refit)" = "MI_val_noY_newimputation",
-"MI no Y (transported)" = "MI_val_noY_transportedimputation",
-"MI with Y (refit)" = "MI_val_withY_newimputation",
-"MI with Y (transported)" = "MI_val_withY_transportedimputation",
-"RI (refit)" = "RI_val_newimputation",
-"RI (transported)" = "RI_val_transportedimputation",
-"All Data Required" = "fullyobserved_val",
-"Mean Imputation" = "mean_val",
-"Risk Absent Imputation" ="zero_val",
-"Pattern Sub-Model" = "observed_val"))
-####----------------------------------------------------------------------------
-### Select which estimand we want to produce plots for:
-target_performance <- "All Data Required"
-### Function to help summarise the results for a given iteration number:
-plotting_fnc <- function(df,
-scenario_number,
-target_performance_imputation_method = c("All Data Required",
-"CCA",
-"Mean Imputation",
-"RI (refit)",
-"RI (transported)",
-"MI no Y (refit)",
-"MI no Y (transported)",
-"MI with Y (refit)",
-"MI with Y (transported)",
-"Pattern Sub-Model")) {
-target_performance_imputation_method <- as.character(match.arg(target_performance_imputation_method))
-df <- df %>%
-dplyr::mutate("combinations" = case_when(
-CPM == "PR_fullyobserved" |
-CPM == "PR_CCA" |
-CPM == "PR_mean" ~ Validation_Dataset %in% c("All Data Required",
-"CCA",
-"Mean Imputation",
-"MI with Y (refit)",
-"MI no Y (refit)",
-"RI (refit)"),
-CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required",
-"CCA",
-"Mean Imputation",
-"MI with Y (refit)",
-"MI no Y (refit)",
-"RI (transported)",
-"RI (refit)"),
-CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required",
-"CCA",
-"Mean Imputation",
-"MI with Y (transported)",
-"MI with Y (refit)",
-"MI no Y (refit)",
-"RI (refit)"),
-CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required",
-"CCA",
-"Mean Imputation",
-"MI with Y (refit)",
-"MI no Y (transported)",
-"MI no Y (refit)",
-"RI (refit)"),
-CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required",
-"CCA",
-"Mean Imputation",
-"MI with Y (refit)",
-"MI no Y (refit)",
-"RI (refit)",
-"Pattern Sub-Model"),
-.default = NA)) %>%
-dplyr::filter(combinations == 1)
-scenario_df <- df %>%
-dplyr::filter(Simulation_Scenario %in% scenario_number) %>%
-dplyr::select(Simulation_Scenario, Scenario,
-CPM, Validation_Dataset,
-contains("_Mean"),
-contains("_quantileLower"),
-contains("_quantileUpper")) %>%
-pivot_longer(cols = c(contains("_Mean"),
-contains("_quantileLower"),
-contains("_quantileUpper"))) %>%
-separate_wider_delim(name,
-delim = "_",
-names = c("Metric", "Summary_Type")) %>%
-pivot_wider(id_cols = c("Simulation_Scenario", "Scenario",
-"CPM", "Validation_Dataset", "Metric"),
-names_from = "Summary_Type",
-values_from = "value")
-results_using_for_implementation <- df %>%
-dplyr::filter(Simulation_Scenario %in% scenario_number) %>%
-dplyr::filter(Validation_Dataset == target_performance_imputation_method) %>%
-dplyr::select(Simulation_Scenario, Scenario,
-CPM, Validation_Dataset,
-contains("_Mean"),
-contains("_quantileLower"),
-contains("_quantileUpper")) %>%
-pivot_longer(cols = c(contains("_Mean"),
-contains("_quantileLower"),
-contains("_quantileUpper"))) %>%
-separate_wider_delim(name,
-delim = "_",
-names = c("Metric", "Summary_Type")) %>%
-pivot_wider(id_cols = c("Simulation_Scenario", "Scenario",
-"CPM", "Validation_Dataset", "Metric"),
-names_from = "Summary_Type",
-values_from = "value") %>%
-dplyr::rename("Imp_Data" = Validation_Dataset,
-"Imp_Mean" = Mean,
-"Imp_quantileLower" = quantileLower,
-"Imp_quantileUpper" = quantileUpper)
-plot_df <- scenario_df %>%
-dplyr::left_join(results_using_for_implementation,
-by = c("Simulation_Scenario",
-"Scenario",
-"CPM",
-"Metric")) %>%
-dplyr::mutate("Bias" = Mean - Imp_Mean,
-"Bias_Lower" = quantileLower - Imp_quantileLower,
-"Bias_Upper" = quantileUpper - Imp_quantileUpper) %>%
-dplyr::mutate(Metric = forcats::fct_recode(Metric,
-"Brier Score" = "Brier",
-"Calibration Intercept" = "CalInt",
-"Calibration Slope" = "CalSlope")) %>%
-mutate(CPM = stringr::str_remove(CPM, "PR_"),
-CPM = forcats::fct_recode(CPM,
-"CCA \n developed CPM" = "CCA",
-"RI \n developed CPM" = "RI",
-"MI no Y \n developed CPM" = "MInoY",
-"MI with Y \n developed CPM" = "MIwithY",
-"Fully Observed \n developed CPM" = "fullyobserved",
-"Mean Imputation \n developed CPM" = "mean",
-"PSM \n developed CPM" = "patternsubmodel"),
-CPM = forcats::fct_relevel(CPM,
-"Fully Observed \n developed CPM",
-"CCA \n developed CPM",
-"Mean Imputation \n developed CPM",
-"RI \n developed CPM",
-"MI with Y \n developed CPM",
-"MI no Y \n developed CPM",
-"PSM \n developed CPM"))
-plot_df %>%
-tidyr::drop_na() %>%
-dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset,
-"Fully Observed Data" = "All Data Required"),
-Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
-"Fully Observed Data",
-"CCA",
-"Mean Imputation",
-"RI (refit)",
-"RI (transported)",
-"MI with Y (refit)",
-"MI with Y (transported)",
-"MI no Y (refit)",
-"MI no Y (transported)",
-"Pattern Sub-Model")) %>%
-ggplot(aes(x = Bias,
-y = Validation_Dataset,
-color = Metric)) +
-geom_point(aes(shape = Metric)) +
-geom_errorbar(aes(xmin = Bias_Lower,
-xmax = Bias_Upper), width=.1) +
-scale_color_brewer(palette = "Set1") +
-xlab("Bias") +
-ylab("Validation Data Imputation Method") +
-geom_vline(data = data.frame("Metric" = c("AUC",
-"Brier Score",
-"Calibration Intercept",
-"Calibration Slope"),
-"Bias" = c(0,0,0,0)),
-aes(xintercept = Bias),
-linetype = "dashed") +
-ggh4x::facet_grid2(Scenario ~ CPM, scales = "fixed") +
-ggtitle(paste("Targetting ", target_performance_imputation_method, " Performance", sep = "")) +
-theme_minimal(base_size = 12) +
-theme(legend.position = "bottom",
-panel.background = element_rect(fill = "gray90"),
-panel.spacing.x = unit(0.5, "lines"),
-axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
-plot.title = element_text(size = 14, hjust = 0.5),
-panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))
-}
-#rho = 0.75
-#MCAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(24, 56, 32, 64),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(184, 161, 193),
-target_performance_imputation_method = target_performance)
-#MNAR-X+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(416, 448, 256),
-target_performance_imputation_method = target_performance)
-## Inconsistent missingness mechanism plots:
-#rho = 0
-#MCAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(20, 52, 28, 60),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(180, 156, 188),
-target_performance_imputation_method = target_performance)
-#MNAR-X+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(416, 448, 256),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(184, 161, 193),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(180, 156, 188),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(184, 161, 193),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(185, 161, 193),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(185, 161, 192),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(184, 161, 192),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(180, 156, 188),
-target_performance_imputation_method = target_performance)
-## Inconsistent missingness mechanism plots:
-#rho = 0
-#MCAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(20, 52, 28, 60),
-target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(180, 156, 188),
-target_performance_imputation_method = target_performance)
-## Inconsistent missingness mechanism plots:
-#rho = 0
-#MCAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(20, 52, 28, 60),
-target_performance_imputation_method = target_performance)
-target_performance <- "Mean Imputation"
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-scenario_number = c(180, 156, 188),
-target_performance_imputation_method = target_performance)
+# Define a dataset that includes all combinations of simulation parameters:
+sims_parameters <- tidyr::crossing(
+Y_prev = c(0.2),
+X_categorical = c(TRUE, FALSE),
+R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively
+beta_x1_dev = c(0, 0.5),
+beta_x2_dev = c(0, 0.5),
+beta_Y_dev = c(0, 0.5),
+beta_x1_val = c(0, 0.5),
+beta_x2_val = c(0, 0.5),
+beta_Y_val = c(0, 0.5),
+rho_X = c(0, 0.75),
+gamma_x1 = c(0, 0.5),
+gamma_x2 = 0.5
+) %>%
+mutate(beta_cases_dev = case_when(
+beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev",
+TRUE ~ as.character("not_applicable")
+),
+beta_cases_val = case_when(
+beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val",
+TRUE ~ as.character("not_applicable")
+)) %>%
+filter(beta_cases_dev != "not_applicable" &
+beta_cases_val != "not_applicable") %>%
+select(-beta_cases_dev,
+-beta_cases_val)
+?case_when
+# Define a dataset that includes all combinations of simulation parameters:
+sims_parameters <- tidyr::crossing(
+Y_prev = c(0.2),
+X_categorical = c(TRUE, FALSE),
+R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively
+beta_x1_dev = c(0, 0.5),
+beta_x2_dev = c(0, 0.5),
+beta_Y_dev = c(0, 0.5),
+beta_x1_val = c(0, 0.5),
+beta_x2_val = c(0, 0.5),
+beta_Y_val = c(0, 0.5),
+rho_X = c(0, 0.75),
+gamma_x1 = c(0, 0.5),
+gamma_x2 = 0.5
+) %>%
+dplyr::mutate(beta_cases_dev = dplyr::case_when(
+beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev",
+TRUE ~ as.character("not_applicable")
+),
+beta_cases_val = dplyr::case_when(
+beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val",
+TRUE ~ as.character("not_applicable")
+)) %>%
+dplyr::filter(beta_cases_dev != "not_applicable" &
+beta_cases_val != "not_applicable") %>%
+dplyr::select(-beta_cases_dev,
+-beta_cases_val)
+dim(sims_parameters)
+source("Code/00_Simulation_Functions.R")
+#run the simulation study:
+simulation_results <- simulation_nrun_fnc(n_iter = 2,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[s],
+X_categorical = sims_parameters$X_categorical[s],
+R_prev = sims_parameters$R_prev[s],
+beta_x1_dev = sims_parameters$beta_x1_dev[s],
+beta_x2_dev = sims_parameters$beta_x2_dev[s],
+beta_Y_dev = sims_parameters$beta_Y_dev[s],
+beta_x1_val = sims_parameters$beta_x1_val[s],
+beta_x2_val = sims_parameters$beta_x2_val[s],
+beta_Y_val = sims_parameters$beta_Y_val[s],
+rho_X = sims_parameters$rho_X[s],
+gamma_x1 = sims_parameters$gamma_x1[s],
+gamma_x2 = sims_parameters$gamma_x2[s])
+source("Code/00_Simulation_Functions.R")
+#run the simulation study:
+simulation_results <- simulation_nrun_fnc(n_iter = 2,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[s],
+X_categorical = sims_parameters$X_categorical[s],
+R_prev = sims_parameters$R_prev[s],
+beta_x1_dev = sims_parameters$beta_x1_dev[s],
+beta_x2_dev = sims_parameters$beta_x2_dev[s],
+beta_Y_dev = sims_parameters$beta_Y_dev[s],
+beta_x1_val = sims_parameters$beta_x1_val[s],
+beta_x2_val = sims_parameters$beta_x2_val[s],
+beta_Y_val = sims_parameters$beta_Y_val[s],
+rho_X = sims_parameters$rho_X[s],
+gamma_x1 = sims_parameters$gamma_x1[s],
+gamma_x2 = sims_parameters$gamma_x2[s])
+View(simulation_results)
+View(sims_parameters)
+View(simulation_results)
+#run the simulation study:
+simulation_results <- simulation_nrun_fnc(n_iter = 20,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[s],
+X_categorical = sims_parameters$X_categorical[s],
+R_prev = sims_parameters$R_prev[s],
+beta_x1_dev = sims_parameters$beta_x1_dev[s],
+beta_x2_dev = sims_parameters$beta_x2_dev[s],
+beta_Y_dev = sims_parameters$beta_Y_dev[s],
+beta_x1_val = sims_parameters$beta_x1_val[s],
+beta_x2_val = sims_parameters$beta_x2_val[s],
+beta_Y_val = sims_parameters$beta_Y_val[s],
+rho_X = sims_parameters$rho_X[s],
+gamma_x1 = sims_parameters$gamma_x1[s],
+gamma_x2 = sims_parameters$gamma_x2[s])
+View(simulation_results)
+View(simulation_results %>% group_by(CPM, Validation_Dataset) %>% summarise(across(CalInt_est:Brier_var, ~mean(.))))
+#Load the simulation functions
+source("./00_Simulation_Functions.R")
+#Load the simulation functions
+source("Code/00_Simulation_Functions.R")
+library(tidyverse)
+# Define a dataset that includes all combinations of simulation parameters:
+sims_parameters <- tidyr::crossing(
+Y_prev = c(0.2),
+X_categorical = c(TRUE, FALSE),
+R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively
+beta_x1_dev = c(0, 0.5),
+beta_x2_dev = c(0, 0.5),
+beta_Y_dev = c(0, 0.5),
+beta_x1_val = c(0, 0.5),
+beta_x2_val = c(0, 0.5),
+beta_Y_val = c(0, 0.5),
+rho_X = c(0, 0.75),
+gamma_x1 = c(0, 0.5),
+gamma_x2 = 0.5
+) %>%
+dplyr::mutate(beta_cases_dev = dplyr::case_when(
+beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev",
+TRUE ~ as.character("not_applicable")
+),
+beta_cases_val = dplyr::case_when(
+beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val",
+TRUE ~ as.character("not_applicable")
+)) %>%
+dplyr::filter(beta_cases_dev != "not_applicable" &
+beta_cases_val != "not_applicable") %>%
+dplyr::select(-beta_cases_dev,
+-beta_cases_val)
+# number of repeats per scenario
+n_rep <- 100
+#Define the size of development, validation and implementation, respectively:
+N_dev <- 100000
+N_val <- 100000
+head(sims_parameters)
+taskid <- 1
+simulation_nrun_fnc(n_iter = 1,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[taskid],
+X_categorical = sims_parameters$X_categorical[taskid],
+R_prev = sims_parameters$R_prev[taskid],
+beta_x1_dev = sims_parameters$beta_x1_dev[taskid],
+beta_x2_dev = sims_parameters$beta_x2_dev[taskid],
+beta_Y_dev = sims_parameters$beta_Y_dev[taskid],
+beta_x1_val = sims_parameters$beta_x1_val[taskid],
+beta_x2_val = sims_parameters$beta_x2_val[taskid],
+beta_Y_val = sims_parameters$beta_Y_val[taskid],
+rho_X = sims_parameters$rho_X[taskid],
+gamma_x1 = sims_parameters$gamma_x1[taskid],
+gamma_x2 = sims_parameters$gamma_x2[taskid])
+300/24
+library(furrr)
+?plan
+(availableCores() - 3)
+12/5
+plan(multicore, workers = (availableCores() - 3))
+plan(multisession, workers = (availableCores() - 3))
+dim(sims_parameters %>%
+dplyr::filter(X_categorical == FALSE))
+simulation_results_XContinuous <- sims_parameters %>%
+dplyr::filter(X_categorical == FALSE) %>%
+dplyr::mutate("results" = future_pmap(.l = list(n_iter = 100,
+N_dev = 100000,
+N_val = 100000,
+Y_prev = Y_prev,
+X_categorical = X_categorical,
+R_prev = R_prev,
+beta_x1_dev = beta_x1_dev,
+beta_x2_dev = beta_x2_dev,
+beta_Y_dev = beta_Y_dev,
+beta_x1_val = beta_x1_val,
+beta_x2_val = beta_x2_val,
+beta_Y_val = beta_Y_val,
+rho_X = rho_X,
+gamma_x1 = gamma_x1,
+gamma_x2 = gamma_x2),
+.f = simulation_nrun_fnc,
+.progress = TRUE,
+.options = furrr_options(seed = as.integer(465475))))
+system.time(simulation_nrun_fnc(n_iter = 1,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[taskid],
+X_categorical = sims_parameters$X_categorical[taskid],
+R_prev = sims_parameters$R_prev[taskid],
+beta_x1_dev = sims_parameters$beta_x1_dev[taskid],
+beta_x2_dev = sims_parameters$beta_x2_dev[taskid],
+beta_Y_dev = sims_parameters$beta_Y_dev[taskid],
+beta_x1_val = sims_parameters$beta_x1_val[taskid],
+beta_x2_val = sims_parameters$beta_x2_val[taskid],
+beta_Y_val = sims_parameters$beta_Y_val[taskid],
+rho_X = sims_parameters$rho_X[taskid],
+gamma_x1 = sims_parameters$gamma_x1[taskid],
+gamma_x2 = sims_parameters$gamma_x2[taskid]))
+source("Code/00_Simulation_Functions.R")
+library(tidyverse)
+# Define a dataset that includes all combinations of simulation parameters:
+sims_parameters <- tidyr::crossing(
+Y_prev = c(0.2),
+X_categorical = c(TRUE, FALSE),
+R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively
+beta_x1_dev = c(0, 0.5),
+beta_x2_dev = c(0, 0.5),
+beta_Y_dev = c(0, 0.5),
+beta_x1_val = c(0, 0.5),
+beta_x2_val = c(0, 0.5),
+beta_Y_val = c(0, 0.5),
+rho_X = c(0, 0.75),
+gamma_x1 = c(0, 0.5),
+gamma_x2 = 0.5
+) %>%
+dplyr::mutate(beta_cases_dev = dplyr::case_when(
+beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev",
+TRUE ~ as.character("not_applicable")
+),
+beta_cases_val = dplyr::case_when(
+beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val",
+TRUE ~ as.character("not_applicable")
+)) %>%
+dplyr::filter(beta_cases_dev != "not_applicable" &
+beta_cases_val != "not_applicable") %>%
+dplyr::select(-beta_cases_dev,
+-beta_cases_val)
+taskid <- 100
+# number of repeats per scenario
+n_rep <- 100
+#Define the size of development, validation and implementation, respectively:
+N_dev <- 100000
+N_val <- 100000
+system.time(simulation_nrun_fnc(n_iter = 1,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[taskid],
+X_categorical = sims_parameters$X_categorical[taskid],
+R_prev = sims_parameters$R_prev[taskid],
+beta_x1_dev = sims_parameters$beta_x1_dev[taskid],
+beta_x2_dev = sims_parameters$beta_x2_dev[taskid],
+beta_Y_dev = sims_parameters$beta_Y_dev[taskid],
+beta_x1_val = sims_parameters$beta_x1_val[taskid],
+beta_x2_val = sims_parameters$beta_x2_val[taskid],
+beta_Y_val = sims_parameters$beta_Y_val[taskid],
+rho_X = sims_parameters$rho_X[taskid],
+gamma_x1 = sims_parameters$gamma_x1[taskid],
+gamma_x2 = sims_parameters$gamma_x2[taskid]))
+source("Code/00_Simulation_Functions.R")
+library(tidyverse)
+# Define a dataset that includes all combinations of simulation parameters:
+sims_parameters <- tidyr::crossing(
+Y_prev = c(0.2),
+X_categorical = c(TRUE, FALSE),
+R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively
+beta_x1_dev = c(0, 0.5),
+beta_x2_dev = c(0, 0.5),
+beta_Y_dev = c(0, 0.5),
+beta_x1_val = c(0, 0.5),
+beta_x2_val = c(0, 0.5),
+beta_Y_val = c(0, 0.5),
+rho_X = c(0, 0.75),
+gamma_x1 = c(0, 0.5),
+gamma_x2 = 0.5
+) %>%
+dplyr::mutate(beta_cases_dev = dplyr::case_when(
+beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev",
+beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev",
+beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev",
+TRUE ~ as.character("not_applicable")
+),
+beta_cases_val = dplyr::case_when(
+beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val",
+beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val",
+beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val",
+TRUE ~ as.character("not_applicable")
+)) %>%
+dplyr::filter(beta_cases_dev != "not_applicable" &
+beta_cases_val != "not_applicable") %>%
+dplyr::select(-beta_cases_dev,
+-beta_cases_val)
+taskid <- 100
+#Define the size of development, validation and implementation, respectively:
+N_dev <- 100000
+N_val <- 100000
+system.time(simulation_nrun_fnc(n_iter = 1,
+N_dev = N_dev,
+N_val = N_val,
+Y_prev = sims_parameters$Y_prev[taskid],
+X_categorical = sims_parameters$X_categorical[taskid],
+R_prev = sims_parameters$R_prev[taskid],
+beta_x1_dev = sims_parameters$beta_x1_dev[taskid],
+beta_x2_dev = sims_parameters$beta_x2_dev[taskid],
+beta_Y_dev = sims_parameters$beta_Y_dev[taskid],
+beta_x1_val = sims_parameters$beta_x1_val[taskid],
+beta_x2_val = sims_parameters$beta_x2_val[taskid],
+beta_Y_val = sims_parameters$beta_Y_val[taskid],
+rho_X = sims_parameters$rho_X[taskid],
+gamma_x1 = sims_parameters$gamma_x1[taskid],
+gamma_x2 = sims_parameters$gamma_x2[taskid]))
+80*100
+80/60
+*100
+(80/60)*100
+600/15
+40*((80/60)*100)
+(40*((80/60)*100))/24
+((80/60)*100)/60
+(40*(((80/60)*100)/60))/24
+library(tidyverse)
+simulation_results_all <- read_rds(here::here("Code",
+"simulation_results_all.RDS"))
+simulation_results_all <- read_rds(here::here("Code",
+"simulation_results_all.RDS"))
+View(simulation_results_all)
+unique(simulation_results_all$Simulation_Scenario)
+sort(unique(simulation_results_all$Simulation_Scenario))
+length(sort(unique(simulation_results_all$Simulation_Scenario)))
diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths
index d012415..fbd2509 100644
--- a/.Rproj.user/shared/notebooks/paths
+++ b/.Rproj.user/shared/notebooks/paths
@@ -1,2 +1 @@
-C:/Users/mbchkgm5/OneDrive - The University of Manchester/1. Projects/SARC Children Non Fatal Strangulation/Code/02_NFSAnalysis.R="8196FE78"
-C:/Users/mbchkgm5/OneDrive - The University of Manchester/2. Areas/Missing Data Handling in Prediction models/1. Projects/Compatibility of missing data handling across CPM Production/.gitignore="2EB11F40"
+/Users/glenmartin/Library/CloudStorage/OneDrive-TheUniversityofManchester/2. Areas/Missing Data Handling in Prediction models/1. Projects/Compatibility of missing data handling across CPM Production/.gitignore="1040BE61"
diff --git a/.gitignore b/.gitignore
index ac62074..da136b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@
 /Manuscript
 /Templates
 /Outputs/ncorr_analysis_results.RDS
-/Outputs/simulation_results.7z
\ No newline at end of file
+/Outputs/simulation_results_old.7z
+/Outputs/simulation_results_all.RDS
\ No newline at end of file
diff --git a/Code/.DS_Store b/Code/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/Code/.DS_Store differ
diff --git a/Code/00_Simulation_Functions.R b/Code/00_Simulation_Functions.R
index f2db746..2447afb 100644
--- a/Code/00_Simulation_Functions.R
+++ b/Code/00_Simulation_Functions.R
@@ -3,16 +3,7 @@
 # Author of code: Antonia D. Tsvetanova & Glen P. Martin
 
 # This is code for a simulation study presented in a manuscript entitled: 
-# Compatibility of missing data handling methods across validation and 
-# deployment of a Clinical Prediction Model
-# Authors:
-#   Antonia Tsvetanova
-#   Matthew Sperrin
-#   Niels Peek
-#   David Jenkins
-#   Iain Buchan
-#   Stephanie Hyland
-#   Glen P. Martin
+# Compatibility of missing data handling methods across stages of producing CPMs
 
 # ##############################################################################
 
@@ -27,14 +18,13 @@ simulation_nrun_fnc <- function(n_iter,
                                 R_prev,
                                 beta_x1_dev, 
                                 beta_x2_dev, 
-                                beta_U_dev, 
+                                beta_Y_dev, 
                                 beta_x1_val, 
                                 beta_x2_val, 
-                                beta_U_val, 
+                                beta_Y_val, 
                                 rho_X,
                                 gamma_x1, 
-                                gamma_x2, 
-                                gamma_U) {
+                                gamma_x2) {
   #Input: n_iter = the number of iterations to repeat the simulation over
   #       N_dev = size of the data to simulate as the development set
   #       N_val = size of the data to simulate as the validation set
@@ -45,22 +35,20 @@ simulation_nrun_fnc <- function(n_iter,
   #                     development dataset
   #       beta_x2_dev = association between X2 and prob of missing data in the
   #                     development dataset
-  #       beta_U_dev = association between U (unmeasured variable) and prob of 
-  #                     missing data in the development dataset
+  #       beta_Y_dev = association between Y and prob of missing data in the
+  #                     development dataset
   #       beta_x1_val = association between X1 and prob of missing data in the
   #                     validation dataset
   #       beta_x2_val = association between X2 and prob of missing data in the
   #                     validation dataset
-  #       beta_U_val = association between U (unmeasured variable) and prob of 
-  #                     missing data in the validation dataset
+  #       beta_Y_val = association between Y and prob of missing data in the
+  #                     validation dataset
   #       rho_X = the correlation between X1 and X2 - if X1 is categorical, the 
   #               inputted correlation is between the latent variable Z1 and 
   #               X2 where Z1 is then used to generate X1 at a set prevalence 
   #               (see paper for details)
   #       gamma_x1 = association between X1 and prob of outcome
   #       gamma_x2 = association between X2 and prob of outcome
-  #       gamma_U = association between U (unmeasured variable) and prob of 
-  #                 outcome
   
   library(MASS)
   library(tidyverse)
@@ -80,14 +68,13 @@ simulation_nrun_fnc <- function(n_iter,
                                                    R_prev  =R_prev,
                                                    beta_x1_dev = beta_x1_dev, 
                                                    beta_x2_dev = beta_x2_dev, 
-                                                   beta_U_dev = beta_U_dev, 
+                                                   beta_Y_dev = beta_Y_dev, 
                                                    beta_x1_val = beta_x1_val, 
                                                    beta_x2_val = beta_x2_val, 
-                                                   beta_U_val = beta_U_val, 
+                                                   beta_Y_val = beta_Y_val, 
                                                    rho_X = rho_X,
                                                    gamma_x1 = gamma_x1, 
-                                                   gamma_x2 = gamma_x2, 
-                                                   gamma_U = gamma_U)
+                                                   gamma_x2 = gamma_x2)
     
     results <- results %>%
       dplyr::bind_rows(tibble::as_tibble(simulation_results) %>% 
@@ -111,31 +98,29 @@ simulation_singlerun_fnc <- function(N_dev,
                                      R_prev,
                                      beta_x1_dev, 
                                      beta_x2_dev, 
-                                     beta_U_dev, 
+                                     beta_Y_dev, 
                                      beta_x1_val, 
                                      beta_x2_val, 
-                                     beta_U_val, 
+                                     beta_Y_val, 
                                      rho_X,
                                      gamma_x1, 
-                                     gamma_x2, 
-                                     gamma_U) {
+                                     gamma_x2) {
   
   # Generate the data according to the DGM--------------------------------------
   gen_data <- DGM_fnc(N_dev = N_dev,
                       N_val = N_val, 
                       Y_prev = Y_prev,
                       X_categorical = X_categorical, 
-                      R_prev  =R_prev,
+                      R_prev = R_prev,
                       beta_x1_dev = beta_x1_dev, 
                       beta_x2_dev = beta_x2_dev, 
-                      beta_U_dev = beta_U_dev, 
+                      beta_Y_dev = beta_Y_dev, 
                       beta_x1_val = beta_x1_val, 
                       beta_x2_val = beta_x2_val, 
-                      beta_U_val = beta_U_val, 
+                      beta_Y_val = beta_Y_val, 
                       rho_X = rho_X,
                       gamma_x1 = gamma_x1, 
-                      gamma_x2 = gamma_x2, 
-                      gamma_U = gamma_U) 
+                      gamma_x2 = gamma_x2) 
   
   # Impute missing data in the development and validation data under
   # different methods ----------------------------------------------------------
@@ -170,7 +155,8 @@ simulation_singlerun_fnc <- function(N_dev,
   
   #calculate predictive performance of each model in each imputed dataset:
   if(X_categorical){
-    CPM_names <- c("PR_fullyobserved",
+    CPM_names <- c("PR_DGM",
+                   "PR_fullyobserved",
                    "PR_CCA",
                    "PR_RI",
                    "PR_MInoY",
@@ -178,7 +164,8 @@ simulation_singlerun_fnc <- function(N_dev,
                    "PR_patternsubmodel",
                    "PR_zero")
   }else {
-    CPM_names <- c("PR_fullyobserved",
+    CPM_names <- c("PR_DGM",
+                   "PR_fullyobserved",
                    "PR_CCA",
                    "PR_RI",
                    "PR_MInoY",
@@ -222,53 +209,44 @@ DGM_fnc <- function(N_dev,
                     R_prev,
                     beta_x1_dev, 
                     beta_x2_dev, 
-                    beta_U_dev, 
+                    beta_Y_dev, 
                     beta_x1_val, 
                     beta_x2_val, 
-                    beta_U_val, 
+                    beta_Y_val, 
                     rho_X,
                     gamma_x1, 
-                    gamma_x2, 
-                    gamma_U) {
+                    gamma_x2) {
   
   N <- N_dev + N_val #total sample size to generate
   
   #generate (potentially correlated) latent predictor variables from a MVN:
   Z <- MASS::mvrnorm(n = N, mu = c(0,0), Sigma = toeplitz(c(1, rho_X)))
   
-  #generate an "unmeasured variable" U:
-  U <- rnorm(N, mean = 0, sd = 1)
-  
   if(X_categorical == FALSE) {
     #if continuous use the latent variables directly as X
     IPD <- tibble::tibble("ID" = 1:N,
                           "x_1" = Z[,1], 
-                          "x_2" = Z[,2], 
-                          "U" = U)
+                          "x_2" = Z[,2])
   } else{
     #if binary convert the latent variable to achieve binary variable prevalence
     IPD <- tibble::tibble("ID" = 1:N,
                           "x_1" = ifelse(Z[,1]>=qnorm(0.3, 
                                                       lower.tail = FALSE), 
                                          1, 0), 
-                          "x_2" = Z[,2], 
-                          "U" = U)
+                          "x_2" = Z[,2])
   }
   
   #calculate gamma_0 to give inputted prevalence of the outcome:
   gamma_0 <- as.numeric(coef(glm(rbinom(N, 1, prob = Y_prev) ~ 
                                    offset((gamma_x1*x_1) + 
-                                            (gamma_x2*x_2) + 
-                                            (gamma_U*U)), 
+                                            (gamma_x2*x_2)), 
                                  family = binomial(link = "logit"),
                                  data = IPD))[1])
   #simulate outcomes based on the data generating model:
+  IPD$Pi <- expit_fnc(gamma_0 + (gamma_x1*IPD$x_1) + (gamma_x2*IPD$x_2))
   IPD$Y <- rbinom(N, 
                   size = 1, 
-                  prob = expit_fnc(gamma_0 + 
-                                     (gamma_x1*IPD$x_1) + 
-                                     (gamma_x2*IPD$x_2) + 
-                                     (gamma_U*IPD$U)))
+                  prob = IPD$Pi)
   
   #separate IPD into development and validation data
   dev_IPD <- IPD %>% 
@@ -285,13 +263,13 @@ DGM_fnc <- function(N_dev,
   beta_0_dev <- as.numeric(coef(glm(rbinom(N_dev, 1, prob = R_prev) ~ 
                                       offset((beta_x1_dev*x_1) +
                                                (beta_x2_dev*x_2) +
-                                               (beta_U_dev*U)), 
+                                               (beta_Y_dev*Y)), 
                                 family = binomial(link = "logit"),
                                 data = dev_IPD))[1])
   beta_0_val <- as.numeric(coef(glm(rbinom(N_val, 1, prob = R_prev) ~ 
                                       offset((beta_x1_val*x_1) +
                                                (beta_x2_val*x_2) +
-                                               (beta_U_val*U)), 
+                                               (beta_Y_val*Y)), 
                                     family = binomial(link = "logit"),
                                     data = val_IPD))[1])
   
@@ -302,7 +280,7 @@ DGM_fnc <- function(N_dev,
                         prob = expit_fnc(beta_0_dev + 
                                            (beta_x1_dev*dev_IPD$x_1) + 
                                            (beta_x2_dev*dev_IPD$x_2) + 
-                                           (beta_U_dev*dev_IPD$U))) 
+                                           (beta_Y_dev*dev_IPD$Y))) 
   
   
   val_IPD$R_1 <- rbinom(N_val, 
@@ -310,7 +288,7 @@ DGM_fnc <- function(N_dev,
                         prob = expit_fnc(beta_0_val + 
                                            (beta_x1_val*val_IPD$x_1) + 
                                            (beta_x2_val*val_IPD$x_2) + 
-                                           (beta_U_val*val_IPD$U)))  
+                                           (beta_Y_val*val_IPD$Y)))  
   
   observed_dev_IPD <- dev_IPD %>%
     dplyr::mutate(x_1 = ifelse(R_1 == 1, x_1, NA))
@@ -474,8 +452,8 @@ mice_fnc <- function(df, m = 5, include_Y) {
   #we never want MI to impute the missing values based on ID, U or R1:
   predmat[,"ID"] <- 0 
   predmat["ID",] <- 0
-  predmat[,"U"] <- 0
-  predmat["U",] <- 0
+  predmat[,"Pi"] <- 0
+  predmat["Pi",] <- 0
   predmat[,"R_1"] <- 0
   predmat["R_1",] <- 0
   
@@ -502,8 +480,8 @@ regimputation_fnc <- function(df, X_categorical) {
   #we never want MI to impute the missing values based on ID, U or R1:
   predmat[,"ID"] <- 0 
   predmat["ID",] <- 0
-  predmat[,"U"] <- 0
-  predmat["U",] <- 0
+  predmat[,"Pi"] <- 0
+  predmat["Pi",] <- 0
   predmat[,"R_1"] <- 0
   predmat["R_1",] <- 0
   
@@ -662,7 +640,8 @@ make_predictions_fnc <- function(imputed_dataset,
   if(is.mids(imputed_dataset) == FALSE){ 
     
     output_predictions <- tibble::tibble("ID" = imputed_dataset$ID,
-                                         "Y" = imputed_dataset$Y)
+                                         "Y" = imputed_dataset$Y,
+                                         "PR_DGM" = imputed_dataset$Pi)
     
     if(any(is.na(imputed_dataset$x_1))) {
       #apply pattern submodel only - its the observed val data, so has 
@@ -726,7 +705,8 @@ make_predictions_fnc <- function(imputed_dataset,
     
     output_predictions <- tibble::tibble(".imp" = MI_long$.imp,
                                          "ID" = MI_long$ID,
-                                         "Y" = MI_long$Y)
+                                         "Y" = MI_long$Y,
+                                         "PR_DGM" = MI_long$Pi)
     
     #define the design matrix:
     DM <- model.matrix(~x_1 + x_2, MI_long)
diff --git a/Code/01_Simulation_Run_CSF.R b/Code/01_Simulation_Run_CSF.R
index 7dbc388..db5f3f6 100644
--- a/Code/01_Simulation_Run_CSF.R
+++ b/Code/01_Simulation_Run_CSF.R
@@ -3,16 +3,7 @@
 # Author of code: Antonia D. Tsvetanova & Glen P. Martin
 
 # This is code for a simulation study presented in a manuscript entitled: 
-# Compatibility of missing data handling methods across validation and 
-# deployment of a Clinical Prediction Model
-# Authors:
-#   Antonia Tsvetanova
-#   Matthew Sperrin
-#   Niels Peek
-#   David Jenkins
-#   Iain Buchan
-#   Stephanie Hyland
-#   Glen P. Martin
+# Compatibility of missing data handling methods across stages of producing CPMs
 
 # ##############################################################################
 
@@ -33,65 +24,82 @@ sims_parameters <- tidyr::crossing(
   R_prev = c(0.9, 0.8, 0.5), #10%, 20% and 50% missingness, respectively
   beta_x1_dev = c(0, 0.5), 
   beta_x2_dev = c(0, 0.5), 
-  beta_U_dev = c(0, 0.5),  
+  beta_Y_dev = c(0, 0.5),  
   beta_x1_val = c(0, 0.5), 
   beta_x2_val = c(0, 0.5), 
-  beta_U_val = c(0, 0.5),  
+  beta_Y_val = c(0, 0.5),  
   rho_X = c(0, 0.75),
   gamma_x1 = c(0, 0.5), 
-  gamma_x2 = 0.5, 
-  gamma_U = c(0, 0.5)
-)
+  gamma_x2 = 0.5
+) %>% 
+  dplyr::mutate(beta_cases_dev = dplyr::case_when(
+    beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG_b_dev",
+    beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_c_dev", 
+    beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG_d_dev",
+    beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_e_dev",
+    beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG_f_dev",
+    TRUE ~ as.character("not_applicable")
+  ),
+  beta_cases_val = dplyr::case_when(
+    beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG_b_val",
+    beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_c_val", 
+    beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG_d_val",
+    beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_e_val",
+    beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG_f_val",
+    TRUE ~ as.character("not_applicable")
+  )) %>% 
+  dplyr::filter(beta_cases_dev != "not_applicable" & 
+                  beta_cases_val != "not_applicable") %>%
+  dplyr::select(-beta_cases_dev,
+                -beta_cases_val)
 
-args <- commandArgs(trailingOnly = T) #pull in all arguments from the qsub file
-s <- as.numeric(args[1]) #Note that we need to specify what class the argument is
+taskid <- commandArgs(trailingOnly = T) 
+taskid <- as.numeric(taskid)
 
 # number of repeats per scenario
 n_rep <- 100
 #Define the size of development, validation and implementation, respectively:
-N_dev <- 50000
-N_val <- 50000 
+N_dev <- 100000
+N_val <- 100000 
 
-set.seed(465475 * s)
+set.seed(465475 * taskid)
 
 #run the simulation study:
 simulation_results <- simulation_nrun_fnc(n_iter = n_rep, 
                                           N_dev = N_dev,
                                           N_val = N_val, 
-                                          Y_prev = sims_parameters$Y_prev[s],
-                                          X_categorical = sims_parameters$X_categorical[s], 
-                                          R_prev = sims_parameters$R_prev[s],
-                                          beta_x1_dev = sims_parameters$beta_x1_dev[s], 
-                                          beta_x2_dev = sims_parameters$beta_x2_dev[s], 
-                                          beta_U_dev = sims_parameters$beta_U_dev[s], 
-                                          beta_x1_val = sims_parameters$beta_x1_val[s], 
-                                          beta_x2_val = sims_parameters$beta_x2_val[s],  
-                                          beta_U_val = sims_parameters$beta_U_val[s], 
-                                          rho_X = sims_parameters$rho_X[s],
-                                          gamma_x1 = sims_parameters$gamma_x1[s], 
-                                          gamma_x2 = sims_parameters$gamma_x2[s], 
-                                          gamma_U = sims_parameters$gamma_U[s])
+                                          Y_prev = sims_parameters$Y_prev[taskid],
+                                          X_categorical = sims_parameters$X_categorical[taskid], 
+                                          R_prev = sims_parameters$R_prev[taskid],
+                                          beta_x1_dev = sims_parameters$beta_x1_dev[taskid], 
+                                          beta_x2_dev = sims_parameters$beta_x2_dev[taskid], 
+                                          beta_Y_dev = sims_parameters$beta_Y_dev[taskid], 
+                                          beta_x1_val = sims_parameters$beta_x1_val[taskid], 
+                                          beta_x2_val = sims_parameters$beta_x2_val[taskid],  
+                                          beta_Y_val = sims_parameters$beta_Y_val[taskid], 
+                                          rho_X = sims_parameters$rho_X[taskid],
+                                          gamma_x1 = sims_parameters$gamma_x1[taskid], 
+                                          gamma_x2 = sims_parameters$gamma_x2[taskid])
 
 #attach the simulation parameters for this scenario to the simulation results:
 simulation_results <- simulation_results %>%
-  dplyr::mutate("Simulation_Scenario" = s,
-                "Y_prev" = sims_parameters$Y_prev[s],
-                "X_categorical" = sims_parameters$X_categorical[s], 
-                "R_prev" = sims_parameters$R_prev[s],
-                "beta_x1_dev" = sims_parameters$beta_x1_dev[s], 
-                "beta_x2_dev" = sims_parameters$beta_x2_dev[s], 
-                "beta_U_dev" = sims_parameters$beta_U_dev[s], 
-                "beta_x1_val" = sims_parameters$beta_x1_val[s], 
-                "beta_x2_val" = sims_parameters$beta_x2_val[s],  
-                "beta_U_val" = sims_parameters$beta_U_val[s], 
-                "rho_X" = sims_parameters$rho_X[s],
-                "gamma_x1" = sims_parameters$gamma_x1[s], 
-                "gamma_x2" = sims_parameters$gamma_x2[s], 
-                "gamma_U" = sims_parameters$gamma_U[s],
+  dplyr::mutate("Simulation_Scenario" = taskid,
+                "Y_prev" = sims_parameters$Y_prev[taskid],
+                "X_categorical" = sims_parameters$X_categorical[taskid], 
+                "R_prev" = sims_parameters$R_prev[taskid],
+                "beta_x1_dev" = sims_parameters$beta_x1_dev[taskid], 
+                "beta_x2_dev" = sims_parameters$beta_x2_dev[taskid], 
+                "beta_Y_dev" = sims_parameters$beta_Y_dev[taskid], 
+                "beta_x1_val" = sims_parameters$beta_x1_val[taskid], 
+                "beta_x2_val" = sims_parameters$beta_x2_val[taskid],  
+                "beta_Y_val" = sims_parameters$beta_Y_val[taskid], 
+                "rho_X" = sims_parameters$rho_X[taskid],
+                "gamma_x1" = sims_parameters$gamma_x1[taskid], 
+                "gamma_x2" = sims_parameters$gamma_x2[taskid],
                 .before = "Iteration")
 
 readr::write_rds(simulation_results, 
-                 file = paste("./simulation_results_", s, 
+                 file = paste("./simulation_results_", taskid, 
                               ".RDS", sep = ""))
 
 warnings()
\ No newline at end of file
diff --git a/Code/03_Simulation_Results.R b/Code/03_Simulation_Results.R
index c36b187..b0d3b76 100644
--- a/Code/03_Simulation_Results.R
+++ b/Code/03_Simulation_Results.R
@@ -6,52 +6,35 @@ simulation_results_all <- read_rds(here::here("Outputs",
                                               "simulation_results_all.RDS"))
 
 
-##### Add the missingness mechanism to each simulation scenario (can differ
-##### across development and validation sets):
+##### Add the missingness DAG type to each simulation scenario (can differ
+##### across development and validation sets - see manuscript):
 simulation_results_all <- simulation_results_all %>% 
-  dplyr::mutate(
-    Missing_Mech_Dev = case_when(
-      beta_x1_dev == 0 & beta_x2_dev == 0 & beta_U_dev == 0 ~ "MCAR",
-      
-      beta_x1_dev == 0 & beta_x2_dev != 0 & beta_U_dev == 0 ~ "MAR", 
-      
-      beta_x1_dev != 0 & beta_x2_dev != 0 & beta_U_dev == 0 ~ "MNAR_X",
-      beta_x1_dev != 0 & beta_x2_dev == 0 & beta_U_dev == 0 ~ "MNAR_X",
-      
-      beta_x1_dev == 0 & beta_x2_dev != 0 & beta_U_dev != 0 ~ "MNAR_Y",
-      beta_x1_dev == 0 & beta_x2_dev == 0 & beta_U_dev != 0 ~ "MNAR_Y",
-      
-      beta_x1_dev != 0 & beta_x2_dev != 0 & beta_U_dev != 0 ~ "MNAR_XY",
-      beta_x1_dev != 0 & beta_x2_dev == 0 & beta_U_dev != 0 ~ "MNAR_XY",
-      TRUE ~ as.character("error")
-    )
-    ,
-    Missing_Mech_Val = case_when(
-      beta_x1_val == 0 & beta_x2_val == 0 & beta_U_val == 0 ~ "MCAR",
-      
-      beta_x1_val == 0 & beta_x2_val != 0 & beta_U_val == 0 ~ "MAR", 
-      
-      beta_x1_val != 0 & beta_x2_val != 0 & beta_U_val == 0 ~ "MNAR_X",
-      beta_x1_val != 0 & beta_x2_val == 0 & beta_U_val == 0 ~ "MNAR_X",
-      
-      beta_x1_val == 0 & beta_x2_val != 0 & beta_U_val != 0 ~ "MNAR_Y",
-      beta_x1_val == 0 & beta_x2_val == 0 & beta_U_val != 0 ~ "MNAR_Y",
-      
-      beta_x1_val != 0 & beta_x2_val != 0 & beta_U_val != 0 ~ "MNAR_XY",
-      beta_x1_val != 0 & beta_x2_val == 0 & beta_U_val != 0 ~ "MNAR_XY",
-      TRUE ~ as.character("error")
-    )
-  )
+  dplyr::mutate(DAG_dev = dplyr::case_when(
+    beta_x1_dev == 0 & beta_x2_dev == 0 & beta_Y_dev == 0 ~ "DAG (b)",
+    beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG (c)", 
+    beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev == 0 ~ "DAG (d)",
+    beta_x1_dev == 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG (e)",
+    beta_x1_dev != 0 & beta_x2_dev != 0 & beta_Y_dev != 0 ~ "DAG (f)",
+    TRUE ~ as.character("not_applicable")
+  ),
+  DAG_val = dplyr::case_when(
+    beta_x1_val == 0 & beta_x2_val == 0 & beta_Y_val == 0 ~ "DAG (b)",
+    beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG (c)", 
+    beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val == 0 ~ "DAG (d)",
+    beta_x1_val == 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG (e)",
+    beta_x1_val != 0 & beta_x2_val != 0 & beta_Y_val != 0 ~ "DAG (f)",
+    TRUE ~ as.character("not_applicable")
+  ))
 
 
 ##### Summarise the results across all iterations for each simulation scenario
 simulation_results_summarised <- simulation_results_all %>%
-  dplyr::group_by(Simulation_Scenario, Missing_Mech_Dev, Missing_Mech_Val, 
+  dplyr::group_by(Simulation_Scenario, DAG_dev, DAG_val, 
                   Y_prev, X_categorical, R_prev,
-                  beta_x1_dev, beta_x2_dev, beta_U_dev, 
-                  beta_x1_val, beta_x2_val, beta_U_val,
+                  beta_x1_dev, beta_x2_dev, beta_Y_dev, 
+                  beta_x1_val, beta_x2_val, beta_Y_val,
                   rho_X,
-                  gamma_x1, gamma_x2, gamma_U,
+                  gamma_x1, gamma_x2,
                   CPM,
                   Validation_Dataset) %>%
   dplyr::summarise(CalInt_Mean = mean(CalInt_est),
@@ -93,31 +76,22 @@ simulation_results_summarised <- simulation_results_all %>%
                 -Brier_acrossSE)
 
 write_rds(simulation_results_summarised,
-          file = here::here("outputs", "simulation_results_summarised.RDS"))
-# simulation_results_summarised <- read_rds(here::here("outputs", 
+          file = here::here("Outputs", "simulation_results_summarised.RDS"))
+# simulation_results_summarised <- read_rds(here::here("Outputs", 
 #                                                      "simulation_results_summarised.RDS"))
 
 
 simulation_results_summarised <- simulation_results_summarised %>%
-  dplyr::mutate("Scenario" = paste(paste(Missing_Mech_Dev, "Dev", sep=" "), 
-                                   paste(Missing_Mech_Val, "Val", sep=" "), 
+  dplyr::mutate("Scenario" = paste(paste(DAG_dev, "Dev", sep=" "), 
+                                   paste(DAG_val, "Val", sep=" "), 
                                    sep = " -\n "),
                 
                 Scenario = forcats::fct_relevel(Scenario,
-                                                "MCAR Dev -\n MCAR Val",
-                                                "MAR Dev -\n MAR Val",
-                                                "MNAR_X Dev -\n MNAR_X Val",
-                                                "MNAR_Y Dev -\n MNAR_Y Val",
-                                                "MNAR_XY Dev -\n MNAR_XY Val",
-                                                "MCAR Dev -\n MAR Val",
-                                                "MCAR Dev -\n MNAR_X Val",
-                                                "MCAR Dev -\n MNAR_Y Val",
-                                                "MCAR Dev -\n MNAR_XY Val",
-                                                "MAR Dev -\n MNAR_X Val",
-                                                "MAR Dev -\n MNAR_Y Val",
-                                                "MAR Dev -\n MNAR_XY Val",
-                                                "MNAR_X Dev -\n MNAR_Y Val",
-                                                "MNAR_X Dev -\n MNAR_XY Val"),
+                                                "DAG (b) Dev -\n DAG (b) Val",
+                                                "DAG (c) Dev -\n DAG (c) Val",
+                                                "DAG (d) Dev -\n DAG (d) Val",
+                                                "DAG (e) Dev -\n DAG (e) Val",
+                                                "DAG (f) Dev -\n DAG (f) Val"),
                 
                 Validation_Dataset = forcats::fct_recode(Validation_Dataset,
                                                          "CCA" = "CCA_val",
@@ -130,608 +104,595 @@ simulation_results_summarised <- simulation_results_summarised %>%
                                                          "All Data Required" = "fullyobserved_val",
                                                          "Mean Imputation" = "mean_val",
                                                          "Risk Absent Imputation" ="zero_val",
-                                                         "Pattern Sub-Model" = "observed_val")) 
-
-
+                                                         "Pattern Sub-Model" = "observed_val"))
 
 ####----------------------------------------------------------------------------
-### Select which estimand we want to produce plots for:
-target_performance <- "All Data Required"
-# target_performance <- "Mean Imputation"
-# target_performance <- "RI (transported)"
-# target_performance <- "RI (refit)"
-# target_performance <- "MI no Y (transported)"
-# target_performance <- "MI no Y (refit)"
-# target_performance <- "MI with Y (transported)"
-# target_performance <- "MI with Y (refit)"
-# target_performance <- "Pattern Sub-Model"
-
-### The below plots look at scenarios where X_1 is continuous, contains 50%
-### missing data, and where gamma_1=gamma_2=gamma_3=0.5. Results were quantitively
-### similar in other scenarios 
-# Define scenario numbers of interest. We focus on 50% missingness and 
-# continuous X1 (results similar over other scenarios)
-#Some code to extract scenario numbers as follows:
-# simulation_results_summarised %>% 
-#   filter(rho_X == 0.75, X_categorical == FALSE, R_prev == 0.5, 
-#          gamma_x1 == 0, gamma_U == 0, 
-#          Missing_Mech_Dev == "MNAR_Y" & Missing_Mech_Val == "MNAR_XY") %>% 
-#   select(Simulation_Scenario) %>% 
-#   pull() %>% 
-#   unique()
+## Define two functions for plotting model degradation and model bias for 
+## given simulation scenarios
 ####----------------------------------------------------------------------------
-
-### Function to plot the predictive performance of each CPM:
-performance_plotting_fnc <- function(df,
-                                     scenario_number,
-                                     target_performance_imputation_method = c("All Data Required",
-                                                                              "CCA",
-                                                                              "Mean Imputation",
-                                                                              "RI (refit)",
-                                                                              "RI (transported)",
-                                                                              "MI no Y (refit)",
-                                                                              "MI no Y (transported)",
-                                                                              "MI with Y (refit)",
-                                                                              "MI with Y (transported)",
-                                                                              "Pattern Sub-Model")) {
+model_degrdation_fnc <- function(df,
+                                 DAG_Scenario,
+                                 X_categorical_value, 
+                                 R_prev_value, 
+                                 rho_X_value,
+                                 gamma_x1_value,
+                                 target_estimands) {
   
-  target_performance_imputation_method <- as.character(match.arg(target_performance_imputation_method))
+  if(all((target_estimands %in% c("E-all",
+                                  "E-mean",
+                                  "E-RI",
+                                  "E-MI",
+                                  "E-PSM")))==FALSE) {
+    stop("target_estimands vector needs to take correct estimand characters")
+  }
   
-  df <- df %>%
-    dplyr::mutate("combinations" = case_when(
-      CPM == "PR_fullyobserved" |
-        CPM == "PR_CCA" |
-        CPM == "PR_mean" ~ Validation_Dataset %in% c("All Data Required",
-                                                     "CCA",
-                                                     "Mean Imputation",
-                                                     "MI with Y (refit)",
-                                                     "MI no Y (refit)",
-                                                     "RI (refit)"),
-      CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required",
-                                                 "CCA",
-                                                 "Mean Imputation",
-                                                 "MI with Y (refit)",
-                                                 "MI no Y (refit)",
-                                                 "RI (transported)",
-                                                 "RI (refit)"),
-      CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required",
-                                                      "CCA",
-                                                      "Mean Imputation",
-                                                      "MI with Y (transported)",
-                                                      "MI with Y (refit)",
-                                                      "MI no Y (refit)",
-                                                      "RI (refit)"),
-      CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required",
-                                                    "CCA",
-                                                    "Mean Imputation",
-                                                    "MI with Y (refit)",
-                                                    "MI no Y (transported)",
-                                                    "MI no Y (refit)",
-                                                    "RI (refit)"),
-      
-      CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required",
-                                                              "CCA",
-                                                              "Mean Imputation",
-                                                              "MI with Y (refit)",
-                                                              "MI no Y (refit)",
-                                                              "RI (refit)",
-                                                              "Pattern Sub-Model"),
-      .default = NA)) %>%
-    dplyr::filter(combinations == 1)
+  if(X_categorical_value == FALSE) {
+    df <- df %>%
+      dplyr::filter(Scenario == DAG_Scenario &
+                      X_categorical == X_categorical_value & 
+                      R_prev == R_prev_value & 
+                      rho_X == rho_X_value &
+                      gamma_x1 == gamma_x1_value,
+                    
+                    Validation_Dataset %in% c( "All Data Required",
+                                               "Mean Imputation",
+                                               "RI (refit)",
+                                               "MI no Y (refit)",
+                                               "Pattern Sub-Model")) %>%
+      dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset,
+                                                             "E_all" = "All Data Required",
+                                                             "E_mean" = "Mean Imputation",
+                                                             "E_RI" = "RI (refit)",
+                                                             "E_MI" = "MI no Y (refit)",
+                                                             "E_PSM" = "Pattern Sub-Model"))
+  } else {
+    df <- df %>%
+      dplyr::filter(Scenario == DAG_Scenario &
+                      X_categorical == X_categorical_value & 
+                      R_prev == R_prev_value & 
+                      rho_X == rho_X_value &
+                      gamma_x1 == gamma_x1_value,
+                    
+                    Validation_Dataset %in% c( "All Data Required",
+                                               "Risk Absent Imputation",
+                                               "RI (refit)",
+                                               "MI no Y (refit)",
+                                               "Pattern Sub-Model")) %>%
+      dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset,
+                                                             "E_all" = "All Data Required",
+                                                             "E_mean" = "Risk Absent Imputation",
+                                                             "E_RI" = "RI (refit)",
+                                                             "E_MI" = "MI no Y (refit)",
+                                                             "E_PSM" = "Pattern Sub-Model"))
+  }
+  
+  #Extract 'true' predictive performance as the performance of the DGM within
+  #the validation set corresponding to the target estimand
+  true_predictive_performance <- df %>%
+    dplyr::filter(CPM == "PR_DGM") %>%
+    dplyr::select(Simulation_Scenario,
+                  Scenario,
+                  Y_prev,
+                  X_categorical,
+                  R_prev,
+                  rho_X,
+                  gamma_x1,
+                  gamma_x2,
+                  Validation_Dataset,
+                  dplyr::ends_with("_Mean")) %>%
+    tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"),
+                        values_to = "True_Performance",
+                        names_to = "Performance_Metric")
   
-  scenario_df <- df %>%
-    dplyr::filter(Simulation_Scenario %in% scenario_number) %>%
-    dplyr::filter(Validation_Dataset == target_performance_imputation_method) %>%
-    dplyr::select(Simulation_Scenario, Scenario,
-                  CPM, Validation_Dataset,
-                  contains("_Mean"),
-                  contains("_quantileLower"),
-                  contains("_quantileUpper")) %>%
-    pivot_longer(cols = c(contains("_Mean"),
-                          contains("_quantileLower"),
-                          contains("_quantileUpper"))) %>%
-    separate_wider_delim(name,
-                         delim = "_",
-                         names = c("Metric", "Summary_Type")) %>%
-    pivot_wider(id_cols = c("Simulation_Scenario", "Scenario",
-                            "CPM", "Validation_Dataset", "Metric"),
-                names_from = "Summary_Type",
-                values_from = "value") 
+  #Extract the predictive performance of each CPM within the 
+  #validation set corresponding to the target estimand 
+  CPM_predictive_performance <- df %>%
+    dplyr::filter(CPM != "PR_DGM") %>%
+    dplyr::select(Simulation_Scenario,
+                  Scenario,
+                  Y_prev,
+                  X_categorical,
+                  R_prev,
+                  rho_X,
+                  gamma_x1,
+                  gamma_x2,
+                  CPM,
+                  Validation_Dataset,
+                  dplyr::ends_with("_Mean")) %>%
+    tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"),
+                        values_to = "Model_Performance",
+                        names_to = "Performance_Metric")
+  
+  #Calculate model degradation:
+  ModelDegradation <- CPM_predictive_performance %>%
+    dplyr::left_join(true_predictive_performance,
+                     by = c("Simulation_Scenario",
+                            "Scenario",
+                            "Y_prev",
+                            "X_categorical",
+                            "R_prev",
+                            "rho_X",
+                            "gamma_x1",
+                            "gamma_x2",
+                            "Validation_Dataset",
+                            "Performance_Metric"))
+  ModelDegradation$True_Performance[which(
+    ModelDegradation$Validation_Dataset == "E_PSM"
+    )] <- true_predictive_performance$True_Performance[which(
+      true_predictive_performance$Validation_Dataset == "E_all"
+    )]
   
-  plot_df <- scenario_df %>%
-    dplyr::mutate(Metric = forcats::fct_recode(Metric,
-                                               "Brier Score" = "Brier",
-                                               "Calibration Intercept" = "CalInt",
-                                               "Calibration Slope" = "CalSlope")) %>%
-    mutate(CPM = stringr::str_remove(CPM, "PR_"),
-           
-           CPM = forcats::fct_recode(CPM,
-                                     "CCA" = "CCA",
-                                     "RI" = "RI",
-                                     "MI no Y" = "MInoY",
-                                     "MI with Y" = "MIwithY",
-                                     "Fully Observed" = "fullyobserved",
-                                     "Mean Imputation" = "mean",
-                                     "PSM" = "patternsubmodel"),
-           
-           CPM = forcats::fct_relevel(CPM,
-                                      "Fully Observed",
-                                      "CCA",
-                                      "Mean Imputation",
-                                      "RI",
-                                      "MI with Y",
-                                      "MI no Y",
-                                      "PSM"))
+  if(X_categorical_value == FALSE) {
+    
+    ModelDegradation <- ModelDegradation %>%
+      dplyr::mutate("Model_Degradation" = Model_Performance - True_Performance) %>%
+      dplyr::mutate(CPM = str_remove(CPM, "PR_"),
+                    CPM = forcats::fct_recode(CPM,
+                                              "CCA" = "CCA",
+                                              "RI" = "RI",
+                                              "MI no Y" = "MInoY",
+                                              "MI with Y" = "MIwithY",
+                                              "Fully Observed" = "fullyobserved",
+                                              "Mean Imputation" = "mean",
+                                              "PSM" = "patternsubmodel"),
+                    CPM = forcats::fct_relevel(CPM,
+                                               "Fully Observed",
+                                               "CCA",
+                                               "Mean Imputation",
+                                               "RI",
+                                               "MI with Y",
+                                               "MI no Y",
+                                               "PSM"),
+                    
+                    Performance_Metric = stringr::str_remove(Performance_Metric,
+                                                             "_Mean"),
+                    Performance_Metric = forcats::fct_recode(Performance_Metric,
+                                                             "Brier Score" = "Brier",
+                                                             "Calibration Intercept" = "CalInt",
+                                                             "Calibration Slope" = "CalSlope"),
+                    
+                    Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
+                                                              "E_all",
+                                                              "E_mean",
+                                                              "E_RI",
+                                                              "E_MI",
+                                                              "E_PSM"),
+                    Validation_Dataset = forcats::fct_recode(Validation_Dataset,
+                                                             "E-all" = "E_all",
+                                                             "E-mean" = "E_mean",
+                                                             "E-RI" = "E_RI",
+                                                             "E-MI" = "E_MI",
+                                                             "E-PSM" = "E_PSM"))
+  } else{
+    
+    ModelDegradation <- ModelDegradation %>%
+      dplyr::mutate("Model_Degradation" = Model_Performance - True_Performance) %>%
+      dplyr::mutate(CPM = str_remove(CPM, "PR_"),
+                    CPM = forcats::fct_recode(CPM,
+                                              "CCA" = "CCA",
+                                              "RI" = "RI",
+                                              "MI no Y" = "MInoY",
+                                              "MI with Y" = "MIwithY",
+                                              "Fully Observed" = "fullyobserved",
+                                              "Mean Imputation" = "zero",
+                                              "PSM" = "patternsubmodel"),
+                    CPM = forcats::fct_relevel(CPM,
+                                               "Fully Observed",
+                                               "CCA",
+                                               "Mean Imputation",
+                                               "RI",
+                                               "MI with Y",
+                                               "MI no Y",
+                                               "PSM"),
+                    
+                    Performance_Metric = stringr::str_remove(Performance_Metric,
+                                                             "_Mean"),
+                    Performance_Metric = forcats::fct_recode(Performance_Metric,
+                                                             "Brier Score" = "Brier",
+                                                             "Calibration Intercept" = "CalInt",
+                                                             "Calibration Slope" = "CalSlope"),
+                    
+                    Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
+                                                              "E_all",
+                                                              "E_mean",
+                                                              "E_RI",
+                                                              "E_MI",
+                                                              "E_PSM"),
+                    Validation_Dataset = forcats::fct_recode(Validation_Dataset,
+                                                             "E-all" = "E_all",
+                                                             "E-mean" = "E_mean",
+                                                             "E-RI" = "E_RI",
+                                                             "E-MI" = "E_MI",
+                                                             "E-PSM" = "E_PSM"))
+  }
   
-  plot_df %>%
-    dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset,
-                                                           "Fully Observed Data" = "All Data Required"),
-                  Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
-                                                            "Fully Observed Data",
-                                                            "CCA",
-                                                            "Mean Imputation",
-                                                            "RI (refit)",
-                                                            "RI (transported)",
-                                                            "MI with Y (refit)",
-                                                            "MI with Y (transported)",
-                                                            "MI no Y (refit)",
-                                                            "MI no Y (transported)",
-                                                            "Pattern Sub-Model")) %>%
-    ggplot(aes(x = Mean,
-               y = CPM)) +
+  ModelDegradation %>%
+    dplyr::filter(Validation_Dataset %in% target_estimands) %>%
+    ggplot(aes(x = Model_Degradation, y = CPM)) +
     geom_point() +
-    geom_errorbar(aes(xmin = quantileLower, 
-                      xmax = quantileUpper), width=.1) +
-    xlab("Predictive Performance") +
+    geom_vline(xintercept = 0, linetype = "dotted") +
+    ggh4x::facet_grid2(Performance_Metric ~ Validation_Dataset, scales = "free_y") +
+    xlab("Model Degradation \n (deployment performance - true model performance)") +
     ylab("Developed CPM") +
-    geom_vline(data = data.frame("Metric" = c("Calibration Intercept",
-                                              "Calibration Slope"),
-                                 "Mean" = c(0,1)),
-               aes(xintercept = Mean),
-               linetype = "dashed") +
-    ggh4x::facet_grid2(Scenario ~ Metric, scales = "free") +
-    ggtitle(paste("Targetting ", target_performance_imputation_method, " Performance", sep = "")) +
     theme_minimal(base_size = 12) +
     theme(legend.position = "bottom",
-          panel.background = element_rect(fill = "gray90"),  
+          panel.background = element_rect(fill = "gray90"),
           panel.spacing.x = unit(0.5, "lines"),
           axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
           plot.title = element_text(size = 14, hjust = 0.5),
-          panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) 
+          panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))
 }
 
-## Consistent missingness mechanism plots:
-#rho = 0 
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(4, 148),
-                         target_performance_imputation_method = target_performance)
-
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(436, 220, 508),
-                         target_performance_imputation_method = target_performance)
-
-#rho = 0.75 
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(8, 152),
-                         target_performance_imputation_method = target_performance)
-
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(440, 224, 512),
-                         target_performance_imputation_method = target_performance)
 
-## Inconsistent missingness mechanism plots:
-#rho = 0
-#MCAR+something
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(20, 52, 28, 60),
-                         target_performance_imputation_method = target_performance)
-#MAR+something
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(180, 156, 188),
-                         target_performance_imputation_method = target_performance)
-#MNAR+something
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(412, 444, 252),
-                         target_performance_imputation_method = target_performance)
-
-#rho = 0.75
-#MCAR+something
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(24, 56, 32, 64),
-                         target_performance_imputation_method = target_performance)
-#MAR+something
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(184, 161, 193),
-                         target_performance_imputation_method = target_performance)
-#MNAR-X+something
-performance_plotting_fnc(df = simulation_results_summarised,
-                         scenario_number = c(416, 448, 256),
-                         target_performance_imputation_method = target_performance)
-
-
-
-### Function to plot the bias in validation results:
-plotting_fnc <- function(df,
-                         scenario_number,
-                         target_performance_imputation_method = c("All Data Required",
-                                                                  "CCA",
-                                                                  "Mean Imputation",
-                                                                  "RI (refit)",
-                                                                  "RI (transported)",
-                                                                  "MI no Y (refit)",
-                                                                  "MI no Y (transported)",
-                                                                  "MI with Y (refit)",
-                                                                  "MI with Y (transported)",
-                                                                  "Pattern Sub-Model")) {
+model_bias_fnc <- function(df,
+                           DAG_Scenario,
+                           X_categorical_value, 
+                           R_prev_value, 
+                           rho_X_value,
+                           gamma_x1_value,
+                           target_estimands) {
+  
+  if(all((target_estimands %in% c("E-all",
+                                 "E-mean",
+                                 "E-RI",
+                                 "E-MI",
+                                 "E-PSM")))==FALSE) {
+    stop("target_estimands vector needs to take correct estimand characters")
+  }
   
-  target_performance_imputation_method <- as.character(match.arg(target_performance_imputation_method))
+  #filter to the required simulation scenario:
+  df <- df %>%
+    dplyr::filter(Scenario == DAG_Scenario &
+                    X_categorical == X_categorical_value & 
+                    R_prev == R_prev_value & 
+                    rho_X == rho_X_value &
+                    gamma_x1 == gamma_x1_value)
   
+  #remove the DGM model performance as dont need true performance for this part
   df <- df %>%
-    dplyr::mutate("combinations" = case_when(
-      CPM == "PR_fullyobserved" |
-        CPM == "PR_CCA" |
-        CPM == "PR_mean" ~ Validation_Dataset %in% c("All Data Required",
-                                                     "CCA",
-                                                     "Mean Imputation",
-                                                     "MI with Y (refit)",
-                                                     "MI no Y (refit)",
-                                                     "RI (refit)"),
-      CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required",
-                                                 "CCA",
-                                                 "Mean Imputation",
-                                                 "MI with Y (refit)",
-                                                 "MI no Y (refit)",
-                                                 "RI (transported)",
-                                                 "RI (refit)"),
-      CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required",
+    dplyr::filter(CPM != "PR_DGM")
+  
+  #define deployment performance, for each CPM across all estimands
+  if(X_categorical_value == FALSE) {
+    Deployment_Performance <- df %>%
+      dplyr::filter(Validation_Dataset %in% c( "All Data Required",
+                                               "Mean Imputation",
+                                               "RI (refit)",
+                                               "MI no Y (refit)",
+                                               "Pattern Sub-Model")) %>%
+      dplyr::mutate(Target_Estimand = forcats::fct_recode(Validation_Dataset,
+                                                          "E_all" = "All Data Required",
+                                                          "E_mean" = "Mean Imputation",
+                                                          "E_RI" = "RI (refit)",
+                                                          "E_MI" = "MI no Y (refit)",
+                                                          "E_PSM" = "Pattern Sub-Model")) %>%
+      dplyr::select(Simulation_Scenario,
+                    Scenario,
+                    Y_prev,
+                    X_categorical,
+                    R_prev,
+                    rho_X,
+                    gamma_x1,
+                    gamma_x2,
+                    CPM,
+                    Target_Estimand,
+                    dplyr::ends_with("_Mean")) %>%
+      tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"),
+                          values_to = "Deployment_Performance",
+                          names_to = "Performance_Metric") %>%
+      tidyr::pivot_wider(id_cols = c(Simulation_Scenario,
+                                     Scenario,
+                                     Y_prev,
+                                     X_categorical,
+                                     R_prev,
+                                     rho_X,
+                                     gamma_x1,
+                                     gamma_x2,
+                                     CPM,
+                                     Performance_Metric),
+                         names_from = "Target_Estimand",
+                         values_from = "Deployment_Performance")
+  } else {
+    Deployment_Performance <- df %>%
+      dplyr::filter(Validation_Dataset %in% c( "All Data Required",
+                                               "Risk Absent Imputation",
+                                               "RI (refit)",
+                                               "MI no Y (refit)",
+                                               "Pattern Sub-Model")) %>%
+      dplyr::mutate(Target_Estimand = forcats::fct_recode(Validation_Dataset,
+                                                          "E_all" = "All Data Required",
+                                                          "E_mean" = "Risk Absent Imputation",
+                                                          "E_RI" = "RI (refit)",
+                                                          "E_MI" = "MI no Y (refit)",
+                                                          "E_PSM" = "Pattern Sub-Model")) %>%
+      dplyr::select(Simulation_Scenario,
+                    Scenario,
+                    Y_prev,
+                    X_categorical,
+                    R_prev,
+                    rho_X,
+                    gamma_x1,
+                    gamma_x2,
+                    CPM,
+                    Target_Estimand,
+                    dplyr::ends_with("_Mean")) %>%
+      tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"),
+                          values_to = "Deployment_Performance",
+                          names_to = "Performance_Metric") %>%
+      tidyr::pivot_wider(id_cols = c(Simulation_Scenario,
+                                     Scenario,
+                                     Y_prev,
+                                     X_categorical,
+                                     R_prev,
+                                     rho_X,
+                                     gamma_x1,
+                                     gamma_x2,
+                                     CPM,
+                                     Performance_Metric),
+                         names_from = "Target_Estimand",
+                         values_from = "Deployment_Performance")
+  }
+  
+  if(X_categorical_value == FALSE) {
+    
+    CPM_performance <- df %>%
+      #obtain valid development-validation pairs (table 1 of paper):
+      dplyr::mutate("combinations" = case_when(
+        CPM == "PR_fullyobserved" |
+          CPM == "PR_CCA" |
+          CPM == "PR_mean" ~ Validation_Dataset %in% c("All Data Required",
+                                                       "CCA",
+                                                       "Mean Imputation",
+                                                       "MI with Y (refit)",
+                                                       "MI no Y (refit)",
+                                                       "RI (refit)"),
+        CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required",
+                                                   "CCA",
+                                                   "Mean Imputation",
+                                                   "MI with Y (refit)",
+                                                   "MI no Y (refit)",
+                                                   "RI (transported)",
+                                                   "RI (refit)"),
+        CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required",
+                                                        "CCA",
+                                                        "Mean Imputation",
+                                                        "MI with Y (transported)",
+                                                        "MI with Y (refit)",
+                                                        "MI no Y (refit)",
+                                                        "RI (refit)"),
+        CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required",
                                                       "CCA",
                                                       "Mean Imputation",
-                                                      "MI with Y (transported)",
                                                       "MI with Y (refit)",
+                                                      "MI no Y (transported)",
                                                       "MI no Y (refit)",
                                                       "RI (refit)"),
-      CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required",
-                                                    "CCA",
-                                                    "Mean Imputation",
-                                                    "MI with Y (refit)",
-                                                    "MI no Y (transported)",
-                                                    "MI no Y (refit)",
-                                                    "RI (refit)"),
-      
-      CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required",
+        
+        CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required",
+                                                                "CCA",
+                                                                "Mean Imputation",
+                                                                "MI with Y (refit)",
+                                                                "MI no Y (refit)",
+                                                                "RI (refit)",
+                                                                "Pattern Sub-Model"),
+        .default = NA)) %>%
+      dplyr::filter(combinations == 1) %>%
+      dplyr::select(Simulation_Scenario,
+                    Scenario,
+                    Y_prev,
+                    X_categorical,
+                    R_prev,
+                    rho_X,
+                    gamma_x1,
+                    gamma_x2,
+                    CPM,
+                    Validation_Dataset,
+                    dplyr::ends_with("_Mean")) %>%
+      tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"),
+                          values_to = "Model_Performance",
+                          names_to = "Performance_Metric")
+    
+    ModelBias <- CPM_performance %>%
+      dplyr::left_join(Deployment_Performance,
+                       by = c("Simulation_Scenario",
+                              "Scenario",
+                              "Y_prev",
+                              "X_categorical",
+                              "R_prev",
+                              "rho_X",
+                              "gamma_x1",
+                              "gamma_x2",
+                              "CPM",
+                              "Performance_Metric")) %>%
+      dplyr::mutate("Eall_Model_Bias" = Model_Performance - E_all,
+                    "Emean_Model_Bias" = Model_Performance - E_mean,
+                    "ERI_Model_Bias" = Model_Performance - E_RI,
+                    "EMI_Model_Bias" = Model_Performance - E_MI,
+                    "EPSM_Model_Bias" = Model_Performance - E_PSM) %>%
+      dplyr::mutate(CPM = str_remove(CPM, "PR_"),
+                    CPM = forcats::fct_recode(CPM,
+                                              "CCA \n developed CPM" = "CCA",
+                                              "RI \n developed CPM" = "RI",
+                                              "MI no Y \n developed CPM" = "MInoY",
+                                              "MI with Y \n developed CPM" = "MIwithY",
+                                              "Fully Observed \n developed CPM" = "fullyobserved",
+                                              "Mean Imputation \n developed CPM" = "mean",
+                                              "PSM \n developed CPM" = "patternsubmodel"),
+                    CPM = forcats::fct_relevel(CPM,
+                                               "Fully Observed \n developed CPM",
+                                               "CCA \n developed CPM",
+                                               "Mean Imputation \n developed CPM",
+                                               "RI \n developed CPM",
+                                               "MI with Y \n developed CPM",
+                                               "MI no Y \n developed CPM",
+                                               "PSM \n developed CPM"),
+                    
+                    Performance_Metric = stringr::str_remove(Performance_Metric,
+                                                             "_Mean"),
+                    Performance_Metric = forcats::fct_recode(Performance_Metric,
+                                                             "Brier Score" = "Brier",
+                                                             "Calibration Intercept" = "CalInt",
+                                                             "Calibration Slope" = "CalSlope"),
+                    
+                    Validation_Dataset = forcats::fct_recode(Validation_Dataset,
+                                                             "Fully Observed Data" = "All Data Required"),
+                    Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
+                                                              "Fully Observed Data",
                                                               "CCA",
                                                               "Mean Imputation",
+                                                              "RI (refit)",
+                                                              "RI (transported)",
                                                               "MI with Y (refit)",
+                                                              "MI with Y (transported)",
                                                               "MI no Y (refit)",
-                                                              "RI (refit)",
-                                                              "Pattern Sub-Model"),
-      .default = NA)) %>%
-    dplyr::filter(combinations == 1)
-  
-  scenario_df <- df %>%
-    dplyr::filter(Simulation_Scenario %in% scenario_number) %>%
-    dplyr::select(Simulation_Scenario, Scenario,
-                  CPM, Validation_Dataset,
-                  contains("_Mean"),
-                  contains("_quantileLower"),
-                  contains("_quantileUpper")) %>%
-    pivot_longer(cols = c(contains("_Mean"),
-                          contains("_quantileLower"),
-                          contains("_quantileUpper"))) %>%
-    separate_wider_delim(name,
-                         delim = "_",
-                         names = c("Metric", "Summary_Type")) %>%
-    pivot_wider(id_cols = c("Simulation_Scenario", "Scenario",
-                            "CPM", "Validation_Dataset", "Metric"),
-                names_from = "Summary_Type",
-                values_from = "value")
-  
-  results_using_for_implementation <- df %>%
-    dplyr::filter(Simulation_Scenario %in% scenario_number) %>%
-    dplyr::filter(Validation_Dataset == target_performance_imputation_method) %>%
-    dplyr::select(Simulation_Scenario, Scenario,
-                  CPM, Validation_Dataset,
-                  contains("_Mean"),
-                  contains("_quantileLower"),
-                  contains("_quantileUpper")) %>%
-    pivot_longer(cols = c(contains("_Mean"),
-                          contains("_quantileLower"),
-                          contains("_quantileUpper"))) %>%
-    separate_wider_delim(name,
-                         delim = "_",
-                         names = c("Metric", "Summary_Type")) %>%
-    pivot_wider(id_cols = c("Simulation_Scenario", "Scenario",
-                            "CPM", "Validation_Dataset", "Metric"),
-                names_from = "Summary_Type",
-                values_from = "value") %>%
-    dplyr::rename("Imp_Data" = Validation_Dataset,
-                  "Imp_Mean" = Mean,
-                  "Imp_quantileLower" = quantileLower,
-                  "Imp_quantileUpper" = quantileUpper)
-  
-  plot_df <- scenario_df %>%
-    dplyr::left_join(results_using_for_implementation,
-                     by = c("Simulation_Scenario",
-                            "Scenario",
-                            "CPM",
-                            "Metric")) %>%
-    dplyr::mutate("Bias" = Mean - Imp_Mean,
-                  "Bias_Lower" = quantileLower - Imp_quantileLower,
-                  "Bias_Upper" = quantileUpper - Imp_quantileUpper) %>%
-    dplyr::mutate(Metric = forcats::fct_recode(Metric,
-                                               "Brier Score" = "Brier",
-                                               "Calibration Intercept" = "CalInt",
-                                               "Calibration Slope" = "CalSlope")) %>%
-    mutate(CPM = stringr::str_remove(CPM, "PR_"),
-           
-           CPM = forcats::fct_recode(CPM,
-                                     "CCA \n developed CPM" = "CCA",
-                                     "RI \n developed CPM" = "RI",
-                                     "MI no Y \n developed CPM" = "MInoY",
-                                     "MI with Y \n developed CPM" = "MIwithY",
-                                     "Fully Observed \n developed CPM" = "fullyobserved",
-                                     "Mean Imputation \n developed CPM" = "mean",
-                                     "PSM \n developed CPM" = "patternsubmodel"),
-           
-           CPM = forcats::fct_relevel(CPM,
-                                      "Fully Observed \n developed CPM",
-                                      "CCA \n developed CPM",
-                                      "Mean Imputation \n developed CPM",
-                                      "RI \n developed CPM",
-                                      "MI with Y \n developed CPM",
-                                      "MI no Y \n developed CPM",
-                                      "PSM \n developed CPM"))
-  
-  plot_df %>%
-    tidyr::drop_na() %>%
-    dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset,
-                                                           "Fully Observed Data" = "All Data Required"),
-                  Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
-                                                            "Fully Observed Data",
-                                                            "CCA",
-                                                            "Mean Imputation",
-                                                            "RI (refit)",
-                                                            "RI (transported)",
-                                                            "MI with Y (refit)",
-                                                            "MI with Y (transported)",
-                                                            "MI no Y (refit)",
-                                                            "MI no Y (transported)",
-                                                            "Pattern Sub-Model")) %>%
-    ggplot(aes(x = Bias,
-               y = Validation_Dataset,
-               color = Metric)) +
-    geom_point(aes(shape = Metric)) +
-    geom_errorbar(aes(xmin = Bias_Lower, 
-                      xmax = Bias_Upper), width=.1) +
-    scale_color_brewer(palette = "Set1") +
-    xlab("Bias") +
-    ylab("Validation Data Imputation Method") +
-    geom_vline(data = data.frame("Metric" = c("AUC",
-                                              "Brier Score",
-                                              "Calibration Intercept",
-                                              "Calibration Slope"),
-                                 "Bias" = c(0,0,0,0)),
-               aes(xintercept = Bias),
-               linetype = "dashed") +
-    ggh4x::facet_grid2(Scenario ~ CPM, scales = "fixed") +
-    ggtitle(paste("Targetting ", target_performance_imputation_method, " Performance", sep = "")) +
-    theme_minimal(base_size = 12) +
-    theme(legend.position = "bottom",
-          panel.background = element_rect(fill = "gray90"),  
-          panel.spacing.x = unit(0.5, "lines"),
-          axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
-          plot.title = element_text(size = 14, hjust = 0.5),
-          panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) 
-}
-
-## Consistent missingness mechanism plots:
-#rho = 0 
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(4, 148),
-             target_performance_imputation_method = target_performance)
-
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(436, 220, 508),
-             target_performance_imputation_method = target_performance)
-
-#rho = 0.75 
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(8, 152),
-             target_performance_imputation_method = target_performance)
-
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(440, 224, 512),
-             target_performance_imputation_method = target_performance)
-
-## Inconsistent missingness mechanism plots:
-#rho = 0
-#MCAR+something
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(20, 52, 28, 60),
-             target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(180, 156, 188),
-             target_performance_imputation_method = target_performance)
-#MNAR+something
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(412, 444, 252),
-             target_performance_imputation_method = target_performance)
-
-#rho = 0.75
-#MCAR+something
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(24, 56, 32, 64),
-             target_performance_imputation_method = target_performance)
-#MAR+something
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(184, 161, 193),
-             target_performance_imputation_method = target_performance)
-#MNAR-X+something
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(416, 448, 256),
-             target_performance_imputation_method = target_performance)
-
-
-
-
-###----------------------------------------------------------------------------
-## Now look at a few X_categorical cases to check similar results
-
-plotting_fnc <- function(df,
-                         scenario_number,
-                         target_performance_imputation_method = c("All Data Required",
-                                                                  "CCA",
-                                                                  "Risk Absent Imputation",
-                                                                  "RI (refit)",
-                                                                  "RI (transported)",
-                                                                  "MI no Y (refit)",
-                                                                  "MI no Y (transported)",
-                                                                  "MI with Y (refit)",
-                                                                  "MI with Y (transported)",
-                                                                  "Pattern Sub-Model")) {
-  
-  target_performance_imputation_method <- as.character(match.arg(target_performance_imputation_method))
-  
-  df <- df %>%
-    dplyr::mutate("combinations" = case_when(
-      CPM == "PR_fullyobserved" |
-        CPM == "PR_CCA" |
-        CPM == "PR_zero" ~ Validation_Dataset %in% c("All Data Required",
-                                                     "CCA",
-                                                     "Risk Absent Imputation",
-                                                     "MI with Y (refit)",
-                                                     "MI no Y (refit)",
-                                                     "RI (refit)"),
-      CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required",
-                                                 "CCA",
-                                                 "Risk Absent Imputation",
-                                                 "MI with Y (refit)",
-                                                 "MI no Y (refit)",
-                                                 "RI (transported)",
-                                                 "RI (refit)"),
-      CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required",
+                                                              "MI no Y (transported)",
+                                                              "Pattern Sub-Model")) 
+  } else {
+    
+    CPM_performance <- df %>%
+      #obtain valid development-validation pairs (table 1 of paper):
+      dplyr::mutate("combinations" = case_when(
+        CPM == "PR_fullyobserved" |
+          CPM == "PR_CCA" |
+          CPM == "PR_zero" ~ Validation_Dataset %in% c("All Data Required",
+                                                       "CCA",
+                                                       "Risk Absent Imputation",
+                                                       "MI with Y (refit)",
+                                                       "MI no Y (refit)",
+                                                       "RI (refit)"),
+        CPM == "PR_RI" ~ Validation_Dataset %in% c("All Data Required",
+                                                   "CCA",
+                                                   "Risk Absent Imputation",
+                                                   "MI with Y (refit)",
+                                                   "MI no Y (refit)",
+                                                   "RI (transported)",
+                                                   "RI (refit)"),
+        CPM == "PR_MIwithY" ~ Validation_Dataset %in% c("All Data Required",
+                                                        "CCA",
+                                                        "Risk Absent Imputation",
+                                                        "MI with Y (transported)",
+                                                        "MI with Y (refit)",
+                                                        "MI no Y (refit)",
+                                                        "RI (refit)"),
+        CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required",
                                                       "CCA",
                                                       "Risk Absent Imputation",
-                                                      "MI with Y (transported)",
                                                       "MI with Y (refit)",
+                                                      "MI no Y (transported)",
                                                       "MI no Y (refit)",
                                                       "RI (refit)"),
-      CPM == "PR_MInoY" ~ Validation_Dataset %in% c("All Data Required",
-                                                    "CCA",
-                                                    "Risk Absent Imputation",
-                                                    "MI with Y (refit)",
-                                                    "MI no Y (transported)",
-                                                    "MI no Y (refit)",
-                                                    "RI (refit)"),
-      
-      CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required",
+        
+        CPM == "PR_patternsubmodel" ~ Validation_Dataset %in% c("All Data Required",
+                                                                "CCA",
+                                                                "Risk Absent Imputation",
+                                                                "MI with Y (refit)",
+                                                                "MI no Y (refit)",
+                                                                "RI (refit)",
+                                                                "Pattern Sub-Model"),
+        .default = NA)) %>%
+      dplyr::filter(combinations == 1) %>%
+      dplyr::select(Simulation_Scenario,
+                    Scenario,
+                    Y_prev,
+                    X_categorical,
+                    R_prev,
+                    rho_X,
+                    gamma_x1,
+                    gamma_x2,
+                    CPM,
+                    Validation_Dataset,
+                    dplyr::ends_with("_Mean")) %>%
+      tidyr::pivot_longer(cols = dplyr::ends_with("_Mean"),
+                          values_to = "Model_Performance",
+                          names_to = "Performance_Metric")
+    
+    ModelBias <- CPM_performance %>%
+      dplyr::left_join(Deployment_Performance,
+                       by = c("Simulation_Scenario",
+                              "Scenario",
+                              "Y_prev",
+                              "X_categorical",
+                              "R_prev",
+                              "rho_X",
+                              "gamma_x1",
+                              "gamma_x2",
+                              "CPM",
+                              "Performance_Metric")) %>%
+      dplyr::mutate("Eall_Model_Bias" = Model_Performance - E_all,
+                    "Emean_Model_Bias" = Model_Performance - E_mean,
+                    "ERI_Model_Bias" = Model_Performance - E_RI,
+                    "EMI_Model_Bias" = Model_Performance - E_MI,
+                    "EPSM_Model_Bias" = Model_Performance - E_PSM) %>%
+      dplyr::mutate(CPM = str_remove(CPM, "PR_"),
+                    CPM = forcats::fct_recode(CPM,
+                                              "CCA \n developed CPM" = "CCA",
+                                              "RI \n developed CPM" = "RI",
+                                              "MI no Y \n developed CPM" = "MInoY",
+                                              "MI with Y \n developed CPM" = "MIwithY",
+                                              "Fully Observed \n developed CPM" = "fullyobserved",
+                                              "Mean Imputation \n developed CPM" = "zero",
+                                              "PSM \n developed CPM" = "patternsubmodel"),
+                    CPM = forcats::fct_relevel(CPM,
+                                               "Fully Observed \n developed CPM",
+                                               "CCA \n developed CPM",
+                                               "Mean Imputation \n developed CPM",
+                                               "RI \n developed CPM",
+                                               "MI with Y \n developed CPM",
+                                               "MI no Y \n developed CPM",
+                                               "PSM \n developed CPM"),
+                    
+                    Performance_Metric = stringr::str_remove(Performance_Metric,
+                                                             "_Mean"),
+                    Performance_Metric = forcats::fct_recode(Performance_Metric,
+                                                             "Brier Score" = "Brier",
+                                                             "Calibration Intercept" = "CalInt",
+                                                             "Calibration Slope" = "CalSlope"),
+                    
+                    Validation_Dataset = forcats::fct_recode(Validation_Dataset,
+                                                             "Fully Observed Data" = "All Data Required"),
+                    Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
+                                                              "Fully Observed Data",
                                                               "CCA",
-                                                              "Risk Absent Imputation",
+                                                              "Mean Imputation",
+                                                              "RI (refit)",
+                                                              "RI (transported)",
                                                               "MI with Y (refit)",
+                                                              "MI with Y (transported)",
                                                               "MI no Y (refit)",
-                                                              "RI (refit)",
-                                                              "Pattern Sub-Model"),
-      .default = NA)) %>%
-    dplyr::filter(combinations == 1)
-  
-  scenario_df <- df %>%
-    dplyr::filter(Simulation_Scenario %in% scenario_number) %>%
-    dplyr::select(Simulation_Scenario, Scenario,
-                  CPM, Validation_Dataset,
-                  contains("_Mean"),
-                  contains("_quantileLower"),
-                  contains("_quantileUpper")) %>%
-    pivot_longer(cols = c(contains("_Mean"),
-                          contains("_quantileLower"),
-                          contains("_quantileUpper"))) %>%
-    separate_wider_delim(name,
-                         delim = "_",
-                         names = c("Metric", "Summary_Type")) %>%
-    pivot_wider(id_cols = c("Simulation_Scenario", "Scenario",
-                            "CPM", "Validation_Dataset", "Metric"),
-                names_from = "Summary_Type",
-                values_from = "value")
+                                                              "MI no Y (transported)",
+                                                              "Pattern Sub-Model")) 
+  }
   
-  results_using_for_implementation <- df %>%
-    dplyr::filter(Simulation_Scenario %in% scenario_number) %>%
-    dplyr::filter(Validation_Dataset == target_performance_imputation_method) %>%
-    dplyr::select(Simulation_Scenario, Scenario,
-                  CPM, Validation_Dataset,
-                  contains("_Mean"),
-                  contains("_quantileLower"),
-                  contains("_quantileUpper")) %>%
-    pivot_longer(cols = c(contains("_Mean"),
-                          contains("_quantileLower"),
-                          contains("_quantileUpper"))) %>%
-    separate_wider_delim(name,
-                         delim = "_",
-                         names = c("Metric", "Summary_Type")) %>%
-    pivot_wider(id_cols = c("Simulation_Scenario", "Scenario",
-                            "CPM", "Validation_Dataset", "Metric"),
-                names_from = "Summary_Type",
-                values_from = "value") %>%
-    dplyr::rename("Imp_Data" = Validation_Dataset,
-                  "Imp_Mean" = Mean,
-                  "Imp_quantileLower" = quantileLower,
-                  "Imp_quantileUpper" = quantileUpper)
-  
-  plot_df <- scenario_df %>%
-    dplyr::left_join(results_using_for_implementation,
-                     by = c("Simulation_Scenario",
-                            "Scenario",
-                            "CPM",
-                            "Metric")) %>%
-    dplyr::mutate("Bias" = Mean - Imp_Mean,
-                  "Bias_Lower" = quantileLower - Imp_quantileLower,
-                  "Bias_Upper" = quantileUpper - Imp_quantileUpper) %>%
-    dplyr::mutate(Metric = forcats::fct_recode(Metric,
-                                               "Brier Score" = "Brier",
-                                               "Calibration Intercept" = "CalInt",
-                                               "Calibration Slope" = "CalSlope")) %>%
-    mutate(CPM = stringr::str_remove(CPM, "PR_"),
-           
-           CPM = forcats::fct_recode(CPM,
-                                     "CCA \n developed CPM" = "CCA",
-                                     "RI \n developed CPM" = "RI",
-                                     "MI no Y \n developed CPM" = "MInoY",
-                                     "MI with Y \n developed CPM" = "MIwithY",
-                                     "Fully Observed \n developed CPM" = "fullyobserved",
-                                     "Risk Absent Imputation \n developed CPM" = "zero",
-                                     "Pattern Sub-Model \n developed CPM" = "patternsubmodel"),
-           
-           CPM = forcats::fct_relevel(CPM,
-                                      "Fully Observed \n developed CPM",
-                                      "CCA \n developed CPM",
-                                      "Risk Absent Imputation \n developed CPM",
-                                      "RI \n developed CPM",
-                                      "MI with Y \n developed CPM",
-                                      "MI no Y \n developed CPM",
-                                      "Pattern Sub-Model \n developed CPM"))
-  
-  plot_df %>%
-    tidyr::drop_na() %>%
-    dplyr::mutate(Validation_Dataset = forcats::fct_recode(Validation_Dataset,
-                                                           "Fully Observed Data" = "All Data Required"),
-                  Validation_Dataset = forcats::fct_relevel(Validation_Dataset,
-                                                            "Fully Observed Data",
-                                                            "CCA",
-                                                            "Risk Absent Imputation",
-                                                            "RI (refit)",
-                                                            "RI (transported)",
-                                                            "MI with Y (refit)",
-                                                            "MI with Y (transported)",
-                                                            "MI no Y (refit)",
-                                                            "MI no Y (transported)",
-                                                            "Pattern Sub-Model")) %>%
+  ModelBias %>%
+    dplyr::select(CPM, 
+                  Validation_Dataset, 
+                  Performance_Metric, 
+                  dplyr::ends_with("_Bias")) %>%
+    tidyr::pivot_longer(cols = dplyr::ends_with("_Bias"),
+                        names_to = "Estimand",
+                        values_to = "Bias") %>%
+    dplyr::mutate(Estimand = stringr:::str_remove(Estimand,
+                                                  "_Model_Bias"),
+                  Estimand = forcats::fct_recode(Estimand,
+                                                 "E-all" = "Eall",
+                                                 "E-mean" = "Emean",
+                                                 "E-RI" = "ERI",
+                                                 "E-MI" = "EMI",
+                                                 "E-PSM" = "EPSM"),
+                  Estimand = forcats::fct_relevel(Estimand,
+                                                 "E-all",
+                                                 "E-mean" ,
+                                                 "E-RI" ,
+                                                 "E-MI" ,
+                                                 "E-PSM" )) %>%
+    dplyr::filter(!is.na(Bias)) %>%
+    dplyr::filter(Estimand %in% target_estimands) %>%
     ggplot(aes(x = Bias,
                y = Validation_Dataset,
-               color = Metric)) +
-    geom_point(aes(shape = Metric)) +
-    geom_errorbar(aes(xmin = Bias_Lower, 
-                      xmax = Bias_Upper), width=.1) +
+               color = Performance_Metric)) +
+    geom_point(aes(shape = Performance_Metric)) +
     scale_color_brewer(palette = "Set1") +
-    xlab("Bias") +
+    xlab("Validation Bias \n (validation performance - deployment performance)") +
     ylab("Validation Data Imputation Method") +
     geom_vline(data = data.frame("Metric" = c("AUC",
                                               "Brier Score",
@@ -740,19 +701,119 @@ plotting_fnc <- function(df,
                                  "Bias" = c(0,0,0,0)),
                aes(xintercept = Bias),
                linetype = "dashed") +
-    ggh4x::facet_grid2(Scenario ~ CPM, scales = "fixed") +
-    ggtitle(paste("Targetting ", target_performance_imputation_method, " Performance", sep = "")) +
-    theme_minimal() +
+    ggh4x::facet_grid2(Estimand ~ CPM, scales = "fixed") +
+    theme_minimal(base_size = 12) +
     theme(legend.position = "bottom",
           panel.background = element_rect(fill = "gray90"),  
           panel.spacing.x = unit(0.5, "lines"),
-          axis.title.y = element_text(size=14),
           axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
           plot.title = element_text(size = 14, hjust = 0.5),
           panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) 
 }
 
 
-plotting_fnc(df = simulation_results_summarised,
-             scenario_number = c(3023),
-             target_performance_imputation_method = "All Data Required")
+
+
+####----------------------------------------------------------------------------
+### The below plots look at scenarios where X_1 is continuous, contains 50%
+### missing data, rho_X=0.75, and where gamma_1=gamma_2=0.5. Results were 
+### quantitatively similar in other scenarios 
+####----------------------------------------------------------------------------
+
+### First look at cases with consistent missingness DAGs across development and validation
+model_degrdation_fnc(df = simulation_results_summarised,
+                     DAG_Scenario = "DAG (b) Dev -\n DAG (b) Val",
+                     X_categorical_value = FALSE, 
+                     R_prev_value = 0.5, 
+                     rho_X_value = 0.75,
+                     gamma_x1_value = 0.5,
+                     target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+model_bias_fnc(df = simulation_results_summarised,
+               DAG_Scenario = "DAG (b) Dev -\n DAG (b) Val",
+               X_categorical_value = FALSE, 
+               R_prev_value = 0.5, 
+               rho_X_value = 0.75,
+               gamma_x1_value = 0.5,
+               target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+
+
+model_degrdation_fnc(df = simulation_results_summarised,
+                     DAG_Scenario = "DAG (c) Dev -\n DAG (c) Val",
+                     X_categorical_value = FALSE, 
+                     R_prev_value = 0.5, 
+                     rho_X_value = 0.75,
+                     gamma_x1_value = 0.5,
+                     target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+model_bias_fnc(df = simulation_results_summarised,
+               DAG_Scenario = "DAG (c) Dev -\n DAG (c) Val",
+               X_categorical_value = FALSE, 
+               R_prev_value = 0.5, 
+               rho_X_value = 0.75,
+               gamma_x1_value = 0.5,
+               target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+
+
+model_degrdation_fnc(df = simulation_results_summarised,
+                     DAG_Scenario = "DAG (d) Dev -\n DAG (d) Val",
+                     X_categorical_value = FALSE, 
+                     R_prev_value = 0.5, 
+                     rho_X_value = 0.75,
+                     gamma_x1_value = 0.5,
+                     target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+model_bias_fnc(df = simulation_results_summarised,
+               DAG_Scenario = "DAG (d) Dev -\n DAG (d) Val",
+               X_categorical_value = FALSE, 
+               R_prev_value = 0.5, 
+               rho_X_value = 0.75,
+               gamma_x1_value = 0.5,
+               target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+
+
+model_degrdation_fnc(df = simulation_results_summarised,
+                     DAG_Scenario = "DAG (e) Dev -\n DAG (e) Val",
+                     X_categorical_value = FALSE, 
+                     R_prev_value = 0.5, 
+                     rho_X_value = 0.75,
+                     gamma_x1_value = 0.5,
+                     target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+model_bias_fnc(df = simulation_results_summarised,
+               DAG_Scenario = "DAG (e) Dev -\n DAG (e) Val",
+               X_categorical_value = FALSE, 
+               R_prev_value = 0.5, 
+               rho_X_value = 0.75,
+               gamma_x1_value = 0.5,
+               target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+
+
+model_degrdation_fnc(df = simulation_results_summarised,
+                     DAG_Scenario = "DAG (f) Dev -\n DAG (f) Val",
+                     X_categorical_value = FALSE, 
+                     R_prev_value = 0.5, 
+                     rho_X_value = 0.75,
+                     gamma_x1_value = 0.5,
+                     target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+model_bias_fnc(df = simulation_results_summarised,
+               DAG_Scenario = "DAG (f) Dev -\n DAG (f) Val",
+               X_categorical_value = FALSE, 
+               R_prev_value = 0.5, 
+               rho_X_value = 0.75,
+               gamma_x1_value = 0.5,
+               target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+
+
+### Second look at cases with different missingness DAGs across development and validation
+
+model_degrdation_fnc(df = simulation_results_summarised,
+                     DAG_Scenario = "DAG (c) Dev -\n DAG (f) Val",
+                     X_categorical_value = FALSE, 
+                     R_prev_value = 0.5, 
+                     rho_X_value = 0.75,
+                     gamma_x1_value = 0.5,
+                     target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
+model_bias_fnc(df = simulation_results_summarised,
+               DAG_Scenario = "DAG (c) Dev -\n DAG (f) Val",
+               X_categorical_value = FALSE, 
+               R_prev_value = 0.5, 
+               rho_X_value = 0.75,
+               gamma_x1_value = 0.5,
+               target_estimands = c("E-all", "E-mean", "E-RI", "E-MI", "E-PSM"))
diff --git a/Code/04_ncorr_analysis.R b/Code/04_ncorr_analysis.R
index ecc66e2..4ea53cd 100644
--- a/Code/04_ncorr_analysis.R
+++ b/Code/04_ncorr_analysis.R
@@ -843,10 +843,10 @@ Bootstrap_InternalValidation %>%
          name = forcats::fct_relevel(name,
                                      c("MeanMode", "RI", "MInoY", "PSM")),
          name = forcats::fct_recode(name,
-                                    "Targetting\n MI-no Y Performance" = "MInoY",
-                                    "Targetting\n Mean/Mode Performance" = "MeanMode",
-                                    "Targetting\n PSM Performance" = "PSM",
-                                    "Targetting\n RI Performance" = "RI"),
+                                    "E-MI" = "MInoY",
+                                    "E-mean" = "MeanMode",
+                                    "E-PSM" = "PSM",
+                                    "E-RI" = "RI"),
          
          CPM = str_remove(CPM, "PR_"),
          CPM = forcats::fct_recode(CPM,
@@ -884,7 +884,7 @@ Bootstrap_InternalValidation %>%
   geom_errorbar(aes(xmin = bias_lower, 
                     xmax = bias_upper), width=.1) +
   scale_color_brewer(palette = "Set1") +
-  xlab("Bias") +
+  xlab("Validation Performance - Deployment Performance") +
   ylab("Validation Data Imputation Method") +
   geom_vline(data = data.frame("Metric" = c("AUC",
                                             "Brier Score",
@@ -893,8 +893,7 @@ Bootstrap_InternalValidation %>%
                                "bias_mean" = c(0,0,0,0)),
              aes(xintercept = bias_mean),
              linetype = "dashed") +
-  ggh4x::facet_grid2(CPM ~ Target, scales = "fixed") +
-  ggtitle("Targetted Performance ") +
+  ggh4x::facet_grid2(Target ~ CPM, scales = "fixed") +
   theme_minimal(base_size = 12) +
   theme(legend.position = "bottom",
         panel.background = element_rect(fill = "gray90"),  
@@ -904,6 +903,5 @@ Bootstrap_InternalValidation %>%
         panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)) 
 
 ggsave(file = here::here("Manuscript", "Fig6.tiff"), 
-       height = 10, width = 10, 
        dpi = 300)
 
diff --git a/Code/SimulationCSFRun.txt b/Code/SimulationCSFRun.txt
new file mode 100644
index 0000000..17ac602
--- /dev/null
+++ b/Code/SimulationCSFRun.txt
@@ -0,0 +1,10 @@
+#!/bin/bash --login
+#SBATCH -p himem
+#SBATCH --mem=1000G
+#SBATCH -t 7-0
+
+#SBATCH -a 1-600
+
+module load apps/gcc/R/4.5.0
+
+Rscript 01_Simulation_Run_CSF.R ${SLURM_ARRAY_TASK_ID}
\ No newline at end of file
diff --git a/Code/SimulationRun.qsub b/Code/SimulationRun.qsub
deleted file mode 100644
index 8331cf8..0000000
--- a/Code/SimulationRun.qsub
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash --login
-#$ -cwd
-
-module load apps/gcc/R/4.4.0
-module load tools/gcc/cmake/3.25.1
-
-#$ -t 1-3072
-
-Rscript 01_Simulation_Run_CSF.R $SGE_TASK_ID > simulation_warnings.$SGE_TASK_ID
diff --git a/Outputs/simulation_results_summarised.RDS b/Outputs/simulation_results_summarised.RDS
index d543def..0bb979d 100644
Binary files a/Outputs/simulation_results_summarised.RDS and b/Outputs/simulation_results_summarised.RDS differ