diff --git a/.Rbuildignore b/.Rbuildignore index 9c86736..224a30a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,5 @@ ^pkg.Rproj$ figure$ cache$ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore index f53558f..561017c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ inst/doc # R stuff *.Rout *.Rhistory -*.RData +*.RData *.Rapp.history # Mac stuff @@ -27,4 +27,10 @@ inst/doc test.R -*-vignette.pdf \ No newline at end of file +*-vignette.pdf +.Rproj.user + +# scratch folder +/scratch +tobacco_replication/tobacco_replication_cache +tobacco_replication/tobacco_replication_files/figure-html diff --git a/DESCRIPTION b/DESCRIPTION index 096311b..f8d5289 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,7 +23,7 @@ Remotes: License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 Suggests: testthat, CausalImpact, diff --git a/NAMESPACE b/NAMESPACE index c32e4f6..2cb0839 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,13 @@ # Generated by roxygen2: do not edit by hand +S3method(dim,augsynth) +S3method(dim,multisynth) +S3method(n_time,augsynth) +S3method(n_time,multisynth) +S3method(n_treated,augsynth) +S3method(n_treated,multisynth) +S3method(n_unit,augsynth) +S3method(n_unit,multisynth) S3method(plot,augsynth) S3method(plot,augsynth_multiout) S3method(plot,multisynth) @@ -18,13 +26,26 @@ S3method(print,summary.multisynth) S3method(summary,augsynth) S3method(summary,augsynth_multiout) S3method(summary,multisynth) +export(RMSPE) +export(add_inference) export(augsynth) export(augsynth_multiout) +export(covariate_balance_table) +export(donor_table) +export(is.augsynth) +export(is.summary.augsynth) export(multisynth) +export(n_time) +export(n_treated) +export(n_unit) +export(permutation_plot) +export(placebo_distribution) export(rdirichlet_b) export(rmultinom_b) export(rwild_b) export(single_augsynth) +export(treated_table) +export(update_augsynth) import(dplyr) import(tidyr) importFrom(ggplot2,aes) diff --git a/R/augsynth.R b/R/augsynth.R index ab68883..9e872f2 100644 --- a/R/augsynth.R +++ b/R/augsynth.R @@ -1,10 +1,10 @@ -################################################################################ -## Main functions for single-period treatment augmented synthetic controls Method -################################################################################ + +#### Main functions for single-period treatment augmented synthetic controls Method #### + #' Fit Augmented SCM -#' +#' #' @param form outcome ~ treatment | auxillary covariates #' @param unit Name of unit column #' @param time Name of time column @@ -14,14 +14,13 @@ #' ridge=Ridge regression (allows for standard errors), #' none=No outcome model, #' en=Elastic Net, RF=Random Forest, GSYN=gSynth, -#' mcp=MCPanel, +#' mcp=MCPanel, #' cits=Comparitive Interuppted Time Series #' causalimpact=Bayesian structural time series with CausalImpact -#' @param scm Whether the SCM weighting function is used -#' @param fixedeff Whether to include a unit fixed effect, default F +#' @param scm Whether the SCM weighting function is used. If FALSE, then package will fit the outcome model, but not calculate new donor weights to match pre-treatment covariates. Instead, each donor unit will be equally weighted. If TRUE, weights on donor pool will be calculated. +#' @param fixedeff Whether to include a unit fixed effect, default F #' @param cov_agg Covariate aggregation functions, if NULL then use mean with NAs omitted -#' @param ... optional arguments for outcome model -#' +#' @param plot Whether or not to return a plot of the augsynth model #' @return augsynth object that contains: #' \itemize{ #' \item{"weights"}{Ridge ASCM weights} @@ -30,13 +29,17 @@ #' \item{"mhat"}{Outcome model estimate} #' \item{"data"}{Panel data as matrices} #' } +#' @param ... optional arguments for outcome model #' @export single_augsynth <- function(form, unit, time, t_int, data, - progfunc = "ridge", - scm=T, - fixedeff = FALSE, - cov_agg=NULL, ...) { + progfunc = "ridge", + scm=T, + fixedeff = FALSE, + cov_agg=NULL, + ...) { + call_name <- match.call() + #inf_type = match.arg(inf_type) form <- Formula::Formula(form) unit <- enquo(unit) @@ -48,30 +51,39 @@ single_augsynth <- function(form, unit, time, t_int, data, wide <- format_data(outcome, trt, unit, time, t_int, data) synth_data <- do.call(format_synth, wide) - + treated_units <- data %>% filter(!!trt == 1) %>% distinct(!!unit) %>% pull(!!unit) - control_units <- data %>% filter(!(!!unit %in% treated_units)) %>% - distinct(!!unit) %>% arrange(!!unit) %>% pull(!!unit) - ## add covariates + control_units <- data %>% filter(!(!!unit %in% treated_units)) %>% + distinct(!!unit) %>% arrange(!!unit) %>% pull(!!unit) + ## add covariates if(length(form)[2] == 2) { Z <- extract_covariates(form, unit, time, t_int, data, cov_agg) } else { Z <- NULL } - + # fit augmented SCM - augsynth <- fit_augsynth_internal(wide, synth_data, Z, progfunc, + augsynth <- fit_augsynth_internal(wide, synth_data, Z, progfunc, scm, fixedeff, ...) - + # add some extra data augsynth$data$time <- data %>% distinct(!!time) %>% - arrange(!!time) %>% pull(!!time) + arrange(!!time) %>% pull(!!time) augsynth$call <- call_name - augsynth$t_int <- t_int - + augsynth$t_int <- t_int + augsynth$weights <- matrix(augsynth$weights) rownames(augsynth$weights) <- control_units + # TODO update similar attribute for multi <- if want to use for plotting later + augsynth$trt_unit <- data %>% filter(!!as.name(trt) == 1) %>% + pull(quo_name(unit)) %>% unique() + augsynth$time_var <- quo_name(time) + augsynth$unit_var <- quo_name(unit) + augsynth$raw_data <- data + augsynth$form <- form + augsynth$cov_agg <- cov_agg + return(augsynth) } @@ -85,14 +97,19 @@ single_augsynth <- function(form, unit, time, t_int, data, #' @param fixedeff Whether to de-mean synth #' @param V V matrix for Synth, default NULL #' @param ... Extra args for outcome model -#' +#' #' @noRd -#' +#' fit_augsynth_internal <- function(wide, synth_data, Z, progfunc, scm, fixedeff, V = NULL, ...) { n <- nrow(wide$X) t0 <- ncol(wide$X) + + if ( progfunc == "ridge" && abs( n - t0 ) <= 1 ) { + warning( glue::glue( "Tuning ridge regression when number of units ({n}) is almost equal to the number of time periods ({t0}) is often unstable." ), call. = FALSE ) + } + ttot <- t0 + ncol(wide$y) if(fixedeff) { demeaned <- demean_data(wide, synth_data) @@ -108,33 +125,39 @@ fit_augsynth_internal <- function(wide, synth_data, Z, progfunc, progfunc = "none" } progfunc = tolower(progfunc) + ## fit augsynth if(progfunc == "ridge") { # Ridge ASCM augsynth <- do.call(fit_ridgeaug_formatted, list(wide_data = fit_wide, synth_data = fit_synth_data, - Z = Z, V = V, scm = scm, ...)) + Z = Z, V = V, + ridge = TRUE, scm = scm, ...)) } else if(progfunc == "none") { ## Just SCM + if ( !scm ) { + stop( "Cannot run with progfunc='none' AND no SCM weights" ) + } augsynth <- do.call(fit_ridgeaug_formatted, - c(list(wide_data = fit_wide, - synth_data = fit_synth_data, - Z = Z, ridge = F, scm = T, V = V, ...))) + c(list(wide_data = fit_wide, + synth_data = fit_synth_data, + Z = Z, V = V, + ridge = FALSE, scm = scm, ...))) } else { ## Other outcome models - progfuncs = c("ridge", "none", "en", "rf", "gsyn", "mcp", + progfuncs = c("ridge", "None", "en", "rf", "gsyn", "mcp", "cits", "causalimpact", "seq2seq") if (progfunc %in% progfuncs) { - augsynth <- fit_augsyn(fit_wide, fit_synth_data, + augsynth <- fit_augsyn(fit_wide, fit_synth_data, progfunc, scm, ...) } else { stop("progfunc must be one of 'EN', 'RF', 'GSYN', 'MCP', 'CITS', 'CausalImpact', 'seq2seq', 'None'") } - + } - augsynth$mhat <- mhat + cbind(matrix(0, nrow = n, ncol = t0), + augsynth$mhat <- mhat + cbind(matrix(0, nrow = n, ncol = t0), augsynth$mhat) augsynth$data <- wide augsynth$data$Z <- Z @@ -168,13 +191,13 @@ predict.augsynth <- function(object, att = F, ...) { # att <- F # } augsynth <- object - + X <- augsynth$data$X y <- augsynth$data$y comb <- cbind(X, y) trt <- augsynth$data$trt mhat <- augsynth$mhat - + m1 <- colMeans(mhat[trt==1,,drop=F]) resid <- (comb[trt==0,,drop=F] - mhat[trt==0,drop=F]) @@ -191,147 +214,81 @@ predict.augsynth <- function(object, att = F, ...) { } -#' Print function for augsynth -#' @param x augsynth object -#' @param ... Optional arguments -#' @export -print.augsynth <- function(x, ...) { - augsynth <- x - - ## straight from lm - cat("\nCall:\n", paste(deparse(augsynth$call), sep="\n", collapse="\n"), "\n\n", sep="") - ## print att estimates - tint <- ncol(augsynth$data$X) - ttotal <- tint + ncol(augsynth$data$y) - att_post <- predict(augsynth, att = T)[(tint + 1):ttotal] - cat(paste("Average ATT Estimate: ", - format(round(mean(att_post),3), nsmall = 3), "\n\n", sep="")) -} - - -#' Plot function for augsynth -#' @importFrom graphics plot -#' -#' @param x Augsynth object to be plotted -#' @param inf Boolean, whether to get confidence intervals around the point estimates -#' @param cv If True, plot cross validation MSE against hyper-parameter, otherwise plot effects -#' @param ... Optional arguments -#' @export -plot.augsynth <- function(x, inf = T, cv = F, ...) { - # if ("se" %in% names(list(...))) { - # se <- list(...)$se - # } else { - # se <- T - # } - - augsynth <- x - - if (cv == T) { - errors = data.frame(lambdas = augsynth$lambdas, - errors = augsynth$lambda_errors, - errors_se = augsynth$lambda_errors_se) - p <- ggplot2::ggplot(errors, ggplot2::aes(x = lambdas, y = errors)) + - ggplot2::geom_point(size = 2) + - ggplot2::geom_errorbar( - ggplot2::aes(ymin = errors, - ymax = errors + errors_se), - width=0.2, size = 0.5) - p <- p + ggplot2::labs(title = bquote("Cross Validation MSE over " ~ lambda), - x = expression(lambda), y = "Cross Validation MSE", - parse = TRUE) - p <- p + ggplot2::scale_x_log10() - - # find minimum and min + 1se lambda to plot - min_lambda <- choose_lambda(augsynth$lambdas, - augsynth$lambda_errors, - augsynth$lambda_errors_se, - F) - min_1se_lambda <- choose_lambda(augsynth$lambdas, - augsynth$lambda_errors, - augsynth$lambda_errors_se, - T) - min_lambda_index <- which(augsynth$lambdas == min_lambda) - min_1se_lambda_index <- which(augsynth$lambdas == min_1se_lambda) - - p <- p + ggplot2::geom_point( - ggplot2::aes(x = min_lambda, - y = augsynth$lambda_errors[min_lambda_index]), - color = "gold") - p + ggplot2::geom_point( - ggplot2::aes(x = min_1se_lambda, - y = augsynth$lambda_errors[min_1se_lambda_index]), - color = "gold") + - ggplot2::theme_bw() - } else { - plot(summary(augsynth, ...), inf = inf) - } -} - - -#' Summary function for augsynth +#' Function to add inference to augsynth object #' @param object augsynth object -#' @param inf Boolean, whether to get confidence intervals around the point estimates #' @param inf_type Type of inference algorithm. Options are #' \itemize{ #' \item{"conformal"}{Conformal inference (default)} #' \item{"jackknife+"}{Jackknife+ algorithm over time periods} #' \item{"jackknife"}{Jackknife over units} +#' \item{"permutation"}{Placebo permutation, raw ATT} +#' \item{"permutation_rstat"}{Placebo permutation, RMSPE adjusted ATT} #' } -#' @param linear_effect Boolean, whether to invert the conformal inference hypothesis test to get confidence intervals for a linear-in-time treatment effect: intercept + slope * time +#' @param linear_effect Boolean, whether to invert the conformal inference hypothesis test to get confidence intervals for a linear-in-time treatment effect: intercept + slope * time #' @param ... Optional arguments for inference, for more details for each `inf_type` see #' \itemize{ #' \item{"conformal"}{`conformal_inf`} #' \item{"jackknife+"}{`time_jackknife_plus`} #' \item{"jackknife"}{`jackknife_se_single`} +#' \item{"permutation"}{`permutation_inf`} #' } #' @export -summary.augsynth <- function(object, inf = T, inf_type = "conformal", - linear_effect = F, - ...) { + +add_inference <- function(object, inf_type = "conformal", linear_effect = F, ...) { augsynth <- object - summ <- list() t0 <- ncol(augsynth$data$X) t_final <- t0 + ncol(augsynth$data$y) - if(inf) { + if (tolower(inf_type) != "none") { + if(inf_type == "jackknife") { att_se <- jackknife_se_single(augsynth) } else if(inf_type == "jackknife+") { att_se <- time_jackknife_plus(augsynth, ...) } else if(inf_type == "conformal") { - att_se <- conformal_inf(augsynth, ...) - # get CIs for linear treatment effects - if(linear_effect) { - att_linear <- conformal_inf_linear(augsynth, ...) - } + att_se <- conformal_inf(augsynth, ...) + + if(linear_effect) { + att_linear <- conformal_inf_linear(augsynth, ...) + } + + } else if (inf_type %in% c('permutation', 'permutation_rstat')) { + if (is.null(augsynth$results$permutations)) { + augsynth <- add_placebo_distribution(augsynth) + } + att_se <- permutation_inf(augsynth, inf_type) } else { stop(paste(inf_type, "is not a valid choice of 'inf_type'")) } att <- data.frame(Time = augsynth$data$time, Estimate = att_se$att[1:t_final]) + rownames(att) <- att$Time + if(inf_type == "jackknife") { - att$Std.Error <- att_se$se[1:t_final] - att_avg_se <- att_se$se[t_final + 1] + att$Std.Error <- att_se$se[1:t_final] + att_avg_se <- att_se$se[t_final + 1] + } else { + att_avg_se <- NA + } + if( length( att_se$att ) > t_final ) { + att_avg <- att_se$att[t_final + 1] } else { - att_avg_se <- NA + att_avg <- mean(att$Estimate[(t0 + 1):t_final]) } - att_avg <- att_se$att[t_final + 1] - if(inf_type %in% c("jackknife+", "nonpar_bs", "t_dist", "conformal")) { + if(inf_type %in% c("jackknife+", "nonpar_bs", "t_dist", "conformal", "permutation", "permutation_rstat")) { att$lower_bound <- att_se$lb[1:t_final] att$upper_bound <- att_se$ub[1:t_final] } - if(inf_type == "conformal") { - att$p_val <- att_se$p_val[1:t_final] + if(inf_type %in% c("conformal", "permutation", "permutation_rstat")) { + att$p_val <- att_se$p_val[1:t_final] } - } else { - t0 <- ncol(augsynth$data$X) - t_final <- t0 + ncol(augsynth$data$y) - att_est <- predict(augsynth, att = T) + # No inference, make table of estimates and NAs for SEs, etc. + att_est <- predict(augsynth, att = TRUE) att <- data.frame(Time = augsynth$data$time, Estimate = att_est) att$Std.Error <- NA @@ -339,62 +296,103 @@ summary.augsynth <- function(object, inf = T, inf_type = "conformal", att_avg_se <- NA } - summ$att <- att - - if(inf) { - if(inf_type %in% c("jackknife+")) { - summ$average_att <- data.frame(Value = "Average Post-Treatment Effect", - Estimate = att_avg, Std.Error = att_avg_se) - summ$average_att$lower_bound <- att_se$lb[t_final + 1] - summ$average_att$upper_bound <- att_se$ub[t_final + 1] - summ$alpha <- att_se$alpha - } - if(inf_type == "conformal") { - # summ$average_att$p_val <- att_se$p_val[t_final + 1] - # summ$average_att$lower_bound <- att_se$lb[t_final + 1] - # summ$average_att$upper_bound <- att_se$ub[t_final + 1] - # summ$alpha <- att_se$alpha - if(linear_effect) { - summ$average_att <- data.frame( - Value = c("Average Post-Treatment Effect", - "Treatment Effect Intercept", - "Treatment Effect Slope"), - Estimate = c(att_avg, att_linear$est_int, - att_linear$est_slope), - Std.Error = c(att_avg_se, NA, NA), - p_val = c(att_se$p_val[t_final + 1], NA, NA), - lower_bound = c(att_se$lb[t_final + 1], - att_linear$ci_int[1], - att_linear$ci_slope[1]), - upper_bound = c(att_se$ub[t_final + 1], - att_linear$ci_int[2], - att_linear$ci_slope[2]) - ) - } else { - summ$average_att <- data.frame( - Value = c("Average Post-Treatment Effect"), - Estimate = att_avg, - Std.Error = att_avg_se, - p_val = att_se$p_val[t_final + 1], - lower_bound = att_se$lb[t_final + 1], - upper_bound = att_se$ub[t_final + 1] - ) - + augsynth$results$att <- att + augsynth$results$average_att <- data.frame(Value = "Average Post-Treatment Effect", + Estimate = att_avg, Std.Error = att_avg_se) + + + if(inf_type %in% c("jackknife+", "conformal", "permutation", "permutation_rstat")) { + augsynth$results$average_att$lower_bound <- att_se$lb[t_final + 1] + augsynth$results$average_att$upper_bound <- att_se$ub[t_final + 1] + augsynth$results$alpha <- att_se$alpha + + if (inf_type == 'conformal') { + if(linear_effect) { + augsynth$results$average_att <- data.frame( + Value = c("Average Post-Treatment Effect", + "Treatment Effect Intercept", + "Treatment Effect Slope"), + Estimate = c(att_avg, att_linear$est_int, + att_linear$est_slope), + Std.Error = c(att_avg_se, NA, NA), + p_val = c(att_se$p_val[t_final + 1], NA, NA), + lower_bound = c(att_se$lb[t_final + 1], + att_linear$ci_int[1], + att_linear$ci_slope[1]), + upper_bound = c(att_se$ub[t_final + 1], + att_linear$ci_int[2], + att_linear$ci_slope[2]) + ) + } else { + augsynth$results$average_att <- data.frame( + Value = c("Average Post-Treatment Effect"), + Estimate = att_avg, + Std.Error = att_avg_se, + p_val = att_se$p_val[t_final + 1], + lower_bound = att_se$lb[t_final + 1], + upper_bound = att_se$ub[t_final + 1] + ) + + } } - summ$alpha <- att_se$alpha - } - } else { - summ$average_att <- data.frame(Value = "Average Post-Treatment Effect", - Estimate = att_avg, Std.Error = att_avg_se) } - summ$t_int <- augsynth$t_int - summ$call <- augsynth$call - summ$l2_imbalance <- augsynth$l2_imbalance - summ$scaled_l2_imbalance <- augsynth$scaled_l2_imbalance - if(!is.null(augsynth$covariate_l2_imbalance)) { - summ$covariate_l2_imbalance <- augsynth$covariate_l2_imbalance - summ$scaled_covariate_l2_imbalance <- augsynth$scaled_covariate_l2_imbalance + if(inf_type %in% c("conformal", "permutation", "permutation_rstat")) { + augsynth$results$average_att$p_val <- att_se$p_val[t_final + 1] } + + augsynth$results$inf_type <- inf_type + + return(augsynth) + +} + + +#### Summary methods #### + + +#' Summary function for augsynth +#' +#' Summary summarizes an augsynth result by (usually) adding an +#' inferential result, if that has not been calculated already, and +#' calculating a few other summary statistics such as estimated bias. +#' This method does this via `add_inference()`, if inference is +#' needed. +#' +#' @param object augsynth object +#' +#' @param inf_type Type of inference algorithm. If left NULL, inherits +#' inf_type from `object` or otherwise defaults to "conformal." +#' Options are +#' \itemize{ +#' \item{"conformal"}{Conformal inference (default)} +#' \item{"jackknife+"}{Jackknife+ algorithm over time periods} +#' \item{"jackknife"}{Jackknife over units} +#' \item{"permutation"}{Placebo permutation, raw ATT} +#' \item{"permutation_rstat"}{Placebo permutation, RMSPE adjusted ATT} +#' } +#' @param ... Optional arguments for inference, for more details for +#' each `inf_type` see +#' \itemize{ +#' \item{"conformal"}{`conformal_inf`} +#' \item{"jackknife+"}{`time_jackknife_plus`} +#' \item{"jackknife"}{`jackknife_se_single`} +#' \item{"permutation", "permutation_rstat"}{`permutation_inf`} +#' } +#' @export +summary.augsynth <- function(object, inf_type = 'conformal', ...) { + augsynth <- object + + t0 <- ncol(augsynth$data$X) + t_final <- t0 + ncol(augsynth$data$y) + + augsynth <- add_inference(augsynth, inf_type = inf_type) + + # Copy over all of OG object except for data + nms = names(augsynth) + nms = nms[!nms %in% c("data", "raw_data", "results")] + summ <- augsynth$results + summ <- c( summ, augsynth[nms] ) + ## get estimated bias if(tolower(augsynth$progfunc) == "ridge") { @@ -410,32 +408,63 @@ summary.augsynth <- function(object, inf = T, inf_type = "conformal", if(tolower(augsynth$progfunc) == "none" | (!augsynth$scm)) { summ$bias_est <- NA } else { - summ$bias_est <- m1 - t(mhat[trt==0,,drop=F]) %*% w + summ$bias_est <- m1 - t(mhat[trt==0,,drop=F]) %*% w } - - - summ$inf_type <- if(inf) inf_type else "None" + + summ$n_unit = n_unit(augsynth) + summ$n_time = n_time(augsynth) + summ$n_tx = n_treated(augsynth)[1] + summ$time_tx = t0 + summ$donor_table = donor_table( augsynth ) + + summ$treated_table = treated_table( augsynth ) + class(summ) <- "summary.augsynth" return(summ) } + + + +#' Methods for accessing details of summary.augsynth object +#' +#' @param x summary.augsynth result object +#' +#' @rdname summary.augsynth_class +#' +#' @return is.summary.augsynth: TRUE if object is a augsynth object. +#' +#' @export +is.summary.augsynth <- function(x) { + inherits(x, "summary.augsynth") +} + + + #' Print function for summary function for augsynth +#' #' @param x summary object #' @param ... Optional arguments #' @export print.summary.augsynth <- function(x, ...) { summ <- x - + ## straight from lm - cat("\nCall:\n", paste(deparse(summ$call), sep="\n", collapse="\n"), "\n\n", sep="") + cat("\nCall:\n", paste(deparse(summ$call), sep="\n", collapse="\n"), "\n", sep="") t_final <- nrow(summ$att) + cat( sprintf( " Fit to %d units and %d+%d = %d time points; %g treated at %s %g.\n", + summ$n_unit, summ$time_tx, t_final - summ$time_tx, t_final, + summ$n_tx, + summ$time_var, + summ$att$Time[[summ$time_tx+1]]) ) + cat( "\n" ) ## distinction between pre and post treatment att_est <- summ$att$Estimate t_total <- length(att_est) t_int <- summ$att %>% filter(Time <= summ$t_int) %>% nrow() - + att_pre <- att_est[1:(t_int-1)] att_post <- att_est[t_int:t_total] @@ -447,20 +476,22 @@ print.summary.augsynth <- function(x, ...) { att_post <- summ$average_att$Estimate[1] se_est <- summ$att$Std.Error if(summ$inf_type == "jackknife") { - se_avg <- summ$average_att$Std.Error + se_avg <- summ$average_att$Std.Error - out_msg <- paste("Average ATT Estimate (Jackknife Std. Error): ", - format(round(att_post,3), nsmall=3), - " (", - format(round(se_avg,3)), ")\n") - inf_type <- "Jackknife over units" + out_msg <- paste("Average ATT Estimate (Jackknife Std. Error): ", + format(round(att_post,3), nsmall=3), + " (", + format(round(se_avg,3)), ")\n") + inf_type <- "Jackknife over units" } else if(summ$inf_type == "conformal") { - p_val <- summ$average_att$p_val[1] - out_msg <- paste("Average ATT Estimate (p Value for Joint Null): ", - format(att_post, digits = 3), - " (", - format(p_val, digits = 2), ")\n") - inf_type <- "Conformal inference" + + p_val <- summ$average_att$p_val + out_msg <- paste("Average ATT Estimate (p Value for Joint Null): ", + format(att_post, digits = 3), + " (", + format(p_val, digits = 2), ")\n") + inf_type <- "Conformal inference" + if("Treatment Effect Slope" %in% summ$average_att$Value) { lowers <- summ$average_att$lower_bound[2:3] uppers <- summ$average_att$upper_bound[2:3] @@ -471,112 +502,131 @@ print.summary.augsynth <- function(x, ...) { format(uppers[2], digits = 3), "]\n") out_msg <- paste0(out_msg, out_msg_line2) } + } else if(summ$inf_type == "jackknife+") { - out_msg <- paste("Average ATT Estimate: ", - format(round(att_post,3), nsmall=3), "\n") - inf_type <- "Jackknife+ over time periods" + out_msg <- paste("Average ATT Estimate: ", + format(round(att_post,3), nsmall=3), "\n") + inf_type <- "Jackknife+ over time periods" + } else if (summ$inf_type %in% c('permutation', "permutation_rstat")) { + out_msg <- paste("Average ATT Estimate: ", + format(round(att_post,3), nsmall=3), "\n") + inf_type <- ifelse(summ$inf_type == 'permutation', + "Permutation inference", + "Permutation inference (RMSPE-adjusted)") + out_msg <-paste0( out_msg, "\n", + ( sprintf( "Donor RMSPE range from %.2f to %.2f\n", + min( summ$donor_table$RMSPE ), max( summ$donor_table$RMSPE ) ) ) ) + } else { - out_msg <- paste("Average ATT Estimate: ", - format(round(att_post,3), nsmall=3), "\n") - inf_type <- "None" + out_msg <- paste("Average ATT Estimate: ", + format(round(att_post,3), nsmall=3), "\n") + inf_type <- "None" } - out_msg <- paste(out_msg, - "L2 Imbalance: ", - format(round(summ$l2_imbalance,3), nsmall=3), "\n", - "Percent improvement from uniform weights: ", - format(round(1 - summ$scaled_l2_imbalance,3)*100), "%\n\n", - sep="") - if(!is.null(summ$covariate_l2_imbalance)) { - out_msg <- paste(out_msg, - "Covariate L2 Imbalance: ", - format(round(summ$covariate_l2_imbalance,3), - nsmall=3), - "\n", + "L2 Imbalance: ", + format(round(summ$l2_imbalance,3), nsmall=3), "\n", "Percent improvement from uniform weights: ", - format(round(1 - summ$scaled_covariate_l2_imbalance,3)*100), - "%\n\n", + format(round(1 - summ$scaled_l2_imbalance,3)*100), "%\n\n", sep="") - } - out_msg <- paste(out_msg, - "Avg Estimated Bias: ", - format(round(mean(summ$bias_est), 3),nsmall=3), "\n\n", - "Inference type: ", - inf_type, - "\n\n", - sep="") - cat(out_msg) + if(!is.null(summ$covariate_l2_imbalance)) { + + out_msg <- paste(out_msg, + "Covariate L2 Imbalance: ", + format(round(summ$covariate_l2_imbalance,3), + nsmall=3), + "\n", + "Percent improvement from uniform weights: ", + format(round(1 - summ$scaled_covariate_l2_imbalance,3)*100), + "%\n\n", + sep="") + + } + out_msg <- paste(out_msg, + "Avg Estimated Bias: ", + format(round(mean(summ$bias_est), 3),nsmall=3), "\n\n", + "Inference type: ", + inf_type, + "\n\n", + sep="") + + rng = range( summ$donor_table$weight[ summ$donor_table$weight > 1/(1000*summ$n_unit) ] ) + cat( sprintf( "%d donor units used with weights of %.3f to %.3f\n", + sum( abs(summ$donor_table$weight) > 1/(1000*summ$n_unit) ), rng[[1]], rng[[2]] ) ) + cat(out_msg) + if(summ$inf_type == "jackknife") { - out_att <- summ$att[t_int:t_final,] %>% - select(Time, Estimate, Std.Error) + out_att <- summ$att[t_int:t_final,] %>% + select(Time, Estimate, Std.Error) } else if(summ$inf_type == "conformal") { - out_att <- summ$att[t_int:t_final,] %>% - select(Time, Estimate, lower_bound, upper_bound, p_val) - names(out_att) <- c("Time", "Estimate", - paste0((1 - summ$alpha) * 100, "% CI Lower Bound"), - paste0((1 - summ$alpha) * 100, "% CI Upper Bound"), - paste0("p Value")) + out_att <- summ$att[t_int:t_final,] %>% + select(Time, Estimate, lower_bound, upper_bound, p_val) + names(out_att) <- c("Time", "Estimate", + paste0((1 - summ$alpha) * 100, "% CI Lower Bound"), + paste0((1 - summ$alpha) * 100, "% CI Upper Bound"), + paste0("p Value")) } else if(summ$inf_type == "jackknife+") { - out_att <- summ$att[t_int:t_final,] %>% - select(Time, Estimate, lower_bound, upper_bound) - names(out_att) <- c("Time", "Estimate", - paste0((1 - summ$alpha) * 100, "% CI Lower Bound"), - paste0((1 - summ$alpha) * 100, "% CI Upper Bound")) + out_att <- summ$att[t_int:t_final,] %>% + select(Time, Estimate, lower_bound, upper_bound) + names(out_att) <- c("Time", "Estimate", + paste0((1 - summ$alpha) * 100, "% CI Lower Bound"), + paste0((1 - summ$alpha) * 100, "% CI Upper Bound")) + } else if (summ$inf_type %in% c('permutation', "permutation_rstat")) { + out_att <- summ$att[t_int:t_final, ] %>% + select(Time, Estimate, lower_bound, upper_bound, p_val) + names(out_att) <- c("Time", "Estimate", + paste0((1 - summ$alpha) * 100, "% CI Lower Bound"), + paste0((1 - summ$alpha) * 100, "% CI Upper Bound"), + paste0('p Value')) } else { - out_att <- summ$att[t_int:t_final,] %>% - select(Time, Estimate) + out_att <- summ$att[t_int:t_final,] %>% + select(Time, Estimate) } out_att %>% - mutate_at(vars(-Time), ~ round(., 3)) %>% - print(row.names = F) + mutate_at(vars(-Time), ~ round(., 3)) %>% + print(row.names = F) - + invisible( summ ) } -#' Plot function for summary function for augsynth -#' @param x Summary object -#' @param inf Boolean, whether to plot confidence intervals -#' @param ... Optional arguments + + +#' Plot results from summarized augsynth +#' +#' Make a variety of plots, depending on plot_type. Default is to +#' plot impacts with associated uncertainty (if present). Other +#' options are estimates with no uncertainty ("estimate only"), the +#' level of outcome ("outcomes"), level of outcomes with the raw trend +#' as well ("outcomes raw average"), and the classic spagetti plot +#' ("placebo"). +#' +#' @param x summary.augsynth object +#' @inheritParams plot_augsynth_results +#' #' @export -plot.summary.augsynth <- function(x, inf = T, ...) { +plot.summary.augsynth <- function(x, + plot_type = 'estimate', + ...) { + summ <- x - # if ("inf" %in% names(list(...))) { - # inf <- list(...)$inf - # } else { - # inf <- T - # } - - p <- summ$att %>% - ggplot2::ggplot(ggplot2::aes(x=Time, y=Estimate)) - if(inf) { - if(all(is.na(summ$att$lower_bound))) { - p <- p + ggplot2::geom_ribbon(ggplot2::aes(ymin=Estimate-2*Std.Error, - ymax=Estimate+2*Std.Error), - alpha=0.2) - } else { - p <- p + ggplot2::geom_ribbon(ggplot2::aes(ymin=lower_bound, - ymax=upper_bound), - alpha=0.2) - } - } - p + ggplot2::geom_line() + - ggplot2::geom_vline(xintercept=summ$t_int, lty=2) + - ggplot2::geom_hline(yintercept=0, lty=2) + - ggplot2::theme_bw() + plot_augsynth_results( summ, plot_type = plot_type, ... ) } + + +#### Package documentation #### + + #' augsynth -#' +#' #' @description A package implementing the Augmented Synthetic Controls Method -#' @docType package #' @name augsynth-package #' @importFrom magrittr "%>%" #' @importFrom purrr reduce @@ -584,9 +634,10 @@ plot.summary.augsynth <- function(x, inf = T, ...) { #' @import tidyr #' @importFrom stats terms #' @importFrom stats formula -#' @importFrom stats update -#' @importFrom stats delete.response -#' @importFrom stats model.matrix -#' @importFrom stats model.frame +#' @importFrom stats update +#' @importFrom stats delete.response +#' @importFrom stats model.matrix +#' @importFrom stats model.frame #' @importFrom stats na.omit -NULL +"_PACKAGE" + diff --git a/R/augsynth_class.R b/R/augsynth_class.R new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/R/augsynth_class.R @@ -0,0 +1 @@ + diff --git a/R/augsynth_pre.R b/R/augsynth_pre.R index a9dfe14..8593d2a 100644 --- a/R/augsynth_pre.R +++ b/R/augsynth_pre.R @@ -1,21 +1,29 @@ + ################################################################################ ## Main function for the augmented synthetic controls Method ################################################################################ #' Fit Augmented SCM +#' +#' The general `augsynth()` method dispatches to `single_augsynth`, +#' `augsynth_multiout`, or `multisynth` based on the number of +#' outcomes and treatment times. See documentation for these methods +#' for further detail. +#' #' @param form outcome ~ treatment | auxillary covariates #' @param unit Name of unit column #' @param time Name of time column #' @param data Panel data as dataframe -#' @param t_int Time of intervention (used for single-period treatment only) +#' @param t_int Time of intervention (used for single-period treatment +#' only) #' @param ... Optional arguments #' \itemize{ #' \item Single period augsynth with/without multiple outcomes #' \itemize{ #' \item{"progfunc"}{What function to use to impute control outcomes: Ridge=Ridge regression (allows for standard errors), None=No outcome model, EN=Elastic Net, RF=Random Forest, GSYN=gSynth, MCP=MCPanel, CITS=CITS, CausalImpact=Bayesian structural time series with CausalImpact, seq2seq=Sequence to sequence learning with feedforward nets} #' \item{"scm"}{Whether the SCM weighting function is used} -#' \item{"fixedeff"}{Whether to include a unit fixed effect, default F } +#' \item{"fixedeff"}{Whether to include a unit fixed effect, default is FALSE } #' \item{"cov_agg"}{Covariate aggregation functions, if NULL then use mean with NAs omitted} #' } #' \item Multi period (staggered) augsynth @@ -29,14 +37,16 @@ #' \item{"n_factors"}{Number of factors for interactive fixed effects, default does CV} #' } #' } -#' -#' @return augsynth object that contains: +#' +#' @return augsynth or multisynth object (depending on dispatch) that contains (among other things): #' \itemize{ #' \item{"weights"}{weights} #' \item{"data"}{Panel data as matrices} #' } +#' +#' @seealso `single_augsynth`, `augsynth_multiout`, `multisynth` #' @export -#' +#' augsynth <- function(form, unit, time, data, t_int=NULL, ...) { call_name <- match.call() @@ -44,7 +54,7 @@ augsynth <- function(form, unit, time, data, t_int=NULL, ...) { form <- Formula::Formula(form) unit_quosure <- enquo(unit) time_quosure <- enquo(time) - + ## format data outcome <- terms(formula(form, rhs=1))[[2]] @@ -69,7 +79,7 @@ augsynth <- function(form, unit, time, data, t_int=NULL, ...) { if("progfunc" %in% names(list(...))) { warning("`progfunc` is not an argument for multisynth, so it is ignored") } - return(multisynth(form, !!enquo(unit), !!enquo(time), data, ...)) + return(multisynth(form, !!enquo(unit), !!enquo(time), data, ...)) } else { if (is.null(t_int)) { t_int <- trt_time %>% filter(is.finite(trt_time)) %>% diff --git a/R/calc_covariate_balance.R b/R/calc_covariate_balance.R new file mode 100644 index 0000000..ecc38d3 --- /dev/null +++ b/R/calc_covariate_balance.R @@ -0,0 +1,51 @@ + + +#' Make covariate balance table. +#' +#' Make a table comparing means of covariates in the treated group, +#' the raw control group, and the new weighted control group (the +#' synthetic control) +#' +#' @param ascm An augsynth result object from single_augsynth +#' @param pre_period List of names of the pre-period timepoints to +#' calculate balance for. NULL means none. +#' +#' @export +covariate_balance_table = function( ascm, pre_period = NULL ) { + + stopifnot( is.augsynth( ascm ) ) + + trt = ascm$data$trt + weight = rep( 0, length( trt ) ) + weight[ trt == 0 ] = ascm$synw + stopifnot( abs( sum( weight ) - 1 ) < 0.000001 ) + + Z = ascm$data$Z + if ( !is.null( pre_period ) ) { + xx <- ascm$data$X[ , colnames(ascm$data$X) %in% pre_period, drop = FALSE] + if ( ncol( xx ) > 0 ) { + Z = cbind( Z, xx ) + } + } + + Co_means = t( Z ) %*% weight + + # Means of the outcome at lagged time points + #Co_means_2 = ascm$data$synth_data$Z0 %*% ascm$synw + + # Unform weighting + n_donor = length(ascm$synw) + unit_weight = rep( 1 / n_donor, nrow(Z) ) + unit_weight[ trt == 1 ] = 0 + raw_means = t( Z ) %*% unit_weight + + Tx_means = Z[ trt == 1, ] + + means = tibble( variable = names( Tx_means ), + Tx = as.numeric( Tx_means ), + Co = as.numeric( Co_means ), + Raw = as.numeric( raw_means ) ) + + + means +} diff --git a/R/donor_control.R b/R/donor_control.R new file mode 100644 index 0000000..485ef88 --- /dev/null +++ b/R/donor_control.R @@ -0,0 +1,119 @@ + +# This file contains methods to monitor and modify set of donor units +# in augmented synthetic control method + + +#' Return a summary data frame donor units used in the model with +#' their synthetic weights. +#' +#' If permutation inference has been conducted, table will also +#' include RMSPEs. This can be forced with include_RMSPE flag. +#' +#' If the augsynth object does not have permutation-based inference +#' results, the function will call that form of inference, in order to +#' calculate the RMSPEs for each donor unit in turn. +#' +#' @param augsynth Augsynth object to be plotted +#' @param include_RMSPE Include RMSPEs in the table even if +#' permutation inference has not yet been conducted. +#' @param zap_weights all weights smaller than this value will be set +#' to zero. Set to NULL to keep all weights. +#' @export +donor_table <- function(augsynth, include_RMSPE = TRUE, zap_weights = 0.0000001 ) { + + if ( is.summary.augsynth( augsynth ) ) { + if ( !is.null( augsynth$donor_table ) ) { + return( augsynth$donor_table ) + } else { + stop( "Call donor_table on original result from augsynth() or a permutation-inference summary" ) + } + } + + stopifnot( is.augsynth(augsynth) ) + + trt_index <- which(augsynth$data$trt == 1) + unit_var <- augsynth$unit_var + + tbl = tibble( + unit = rownames( augsynth$weights ), + weight = as.numeric(augsynth$weights) ) + names(tbl)[[1]] = unit_var + + # If RMPSEs already exist, or flag says to calculate them, then calculate them + if ( include_RMSPE || (!is.null(augsynth$results) && augsynth$results$inf_type %in% c("permutation", "permutation_rstat")) ) { + + if (is.null(augsynth$results) || (!(augsynth$results$inf_type %in% c("permutation", "permutaton_rstat")) ) ) { + augsynth <- add_inference(augsynth, inf_type = 'permutation') + } + RMSPEs <- augsynth$results$permutations$placebo_dist %>% + select(!!unit_var, RMSPE) %>% + distinct() + tbl <- left_join(tbl, RMSPEs, by = unit_var) + } + + if ( !is.null(zap_weights) ) { + tbl <- tbl %>% mutate(weight = ifelse(abs(weight) < zap_weights, 0, weight)) + } + + return(tbl) +} + + +#' Return a new augsynth object with specified donor units removed +#' +#' @param augsynth Augsynth object to be plotted +#' +#' @param drop Drop donor units, based on pre-treatment RMSPE or unit +#' name(s). Default of 20 means drop units with an RMSPE 20x higher +#' than the treated unit. The `drop` parameter can also be a character vector of unit +#' IDs to drop. +#' +#' @export +update_augsynth <- function(augsynth, drop = 20){ + + if (is.null(augsynth$results)){ + inf_type = 'none' + } else { + inf_type <- augsynth$results$inf_type + } + + # run placebo tests if necessary + if (!inf_type %in% c('permutation', 'permutation_rstat')) { + augsynth <- add_inference(augsynth, inf_type = 'permutation') + } + + unit_var <- augsynth$unit_var + # pre-treatment RMSPE among donors + donor_RMSPE <- augsynth$results$permutations$placebo_dist %>% + filter(!!as.name(augsynth$time_var) < augsynth$t_int) %>% + group_by(!!as.name(augsynth$unit_var)) %>% + summarise(RMSPE = sqrt(mean(ATT ^ 2)), .groups = "drop") + # pre-treatment RMSPE for treated unit + trt_RMSPE <- add_inference(augsynth, inf_type = 'permutation')$results$permutations$placebo_dist %>% + filter(!!as.name(augsynth$time_var) < augsynth$t_int) %>% + filter(!!as.name(unit_var) == augsynth$trt_unit) %>% + pull(RMSPE) %>% unique() + + if (is.numeric(drop)) { + keep_units <- donor_RMSPE %>% filter(RMSPE / trt_RMSPE <= drop) %>% pull(!!unit_var) + } else if (is.character(drop)) { + keep_units <- donor_RMSPE %>% filter((!!as.name(unit_var) %in% drop) == FALSE) %>% pull(!!unit_var) %>% unique() + } + keep_units <- c(keep_units, augsynth$trt_unit) + + form <- as.formula(paste(as.character(augsynth$form)[2], as.character(augsynth$form)[1], as.character(augsynth$form)[3])) + new_data <- as_tibble(augsynth$raw_data, .name_repair = 'unique') %>% + filter(!!as.name(unit_var) %in% keep_units) + + new_augsynth <- augsynth(form = form, + unit = !!as.name(augsynth$unit_var), + time = !!as.name(augsynth$time_var), + data = new_data, + progfunc = augsynth$progfunc, + scm = augsynth$scm, + fixedeff = augsynth$fixedeff, + cov_agg = augsynth$cov_agg + ) + + return(new_augsynth) +} diff --git a/R/fit_synth.R b/R/fit_synth.R index 69cddcd..17c006c 100644 --- a/R/fit_synth.R +++ b/R/fit_synth.R @@ -3,12 +3,14 @@ ####################################################### #' Make a V matrix from a vector (or null) +#' +#' @noRd make_V_matrix <- function(t0, V) { if(is.null(V)) { V <- diag(rep(1, t0)) } else if(is.vector(V)) { if(length(V) != t0) { - stop(paste("`V` must be a vector with", t0, "elements or a", t0, + stop(paste("`V` must be a vector with", t0, "elements or a", t0, "x", t0, "matrix")) } V <- diag(V) @@ -18,7 +20,7 @@ make_V_matrix <- function(t0, V) { V <- diag(c(V)) } else if(nrow(V) == t0) { } else { - stop(paste("`V` must be a vector with", t0, "elements or a", t0, + stop(paste("`V` must be a vector with", t0, "elements or a", t0, "x", t0, "matrix")) } @@ -61,7 +63,7 @@ fit_synth_formatted <- function(synth_data, V = NULL) { #' @param V Scaling matrix #' @noRd synth_qp <- function(X1, X0, V) { - + Pmat <- X0 %*% V %*% t(X0) qvec <- - t(X1) %*% V %*% t(X0) @@ -74,7 +76,7 @@ synth_qp <- function(X1, X0, V) { eps_rel = 1e-8, eps_abs = 1e-8) sol <- osqp::solve_osqp(P = Pmat, q = qvec, - A = A, l = l, u = u, + A = A, l = l, u = u, pars = settings) return(sol$x) diff --git a/R/inference.R b/R/inference.R index 3434970..06318df 100644 --- a/R/inference.R +++ b/R/inference.R @@ -26,7 +26,7 @@ time_jackknife_plus <- function(ascm, alpha = 0.05, conservative = F) { tpost <- ncol(wide_data$y) t_final <- dim(synth_data$Y0plot)[1] - jack_ests <- lapply(1:t0, + jack_ests <- lapply(1:t0, function(tdrop) { # drop unit i new_data <- drop_time_t(wide_data, Z, tdrop) @@ -56,11 +56,11 @@ time_jackknife_plus <- function(ascm, alpha = 0.05, conservative = F) { out <- list() att <- predict(ascm, att = T) - out$att <- c(att, + out$att <- c(att, mean(att[(t0 + 1):t_final])) # held out ATT - out$heldout_att <- c(held_out_errs, - att[(t0 + 1):t_final], + out$heldout_att <- c(held_out_errs, + att[(t0 + 1):t_final], mean(att[(t0 + 1):t_final])) # out$se <- rep(NA, 10 + tpost) @@ -95,7 +95,7 @@ drop_time_t <- function(wide_data, Z, t_drop) { new_wide_data <- list() new_wide_data$trt <- wide_data$trt new_wide_data$X <- wide_data$X[, -t_drop, drop = F] - new_wide_data$y <- cbind(wide_data$X[, t_drop, drop = F], + new_wide_data$y <- cbind(wide_data$X[, t_drop, drop = F], wide_data$y) X0 <- new_wide_data$X[new_wide_data$trt == 0,, drop = F] @@ -113,7 +113,7 @@ drop_time_t <- function(wide_data, Z, t_drop) { return(list(wide_data = new_wide_data, synth_data = new_synth_data, - Z = Z)) + Z = Z)) } #' Conformal inference procedure to compute p-values and point-wise confidence intervals @@ -134,6 +134,7 @@ drop_time_t <- function(wide_data, Z, t_drop) { #' \item{"p_val"}{p-value for test of no post-treatment effect} #' \item{"alpha"}{Level of confidence interval} #' } + conformal_inf <- function(ascm, alpha = 0.05, stat_func = NULL, type = "iid", q = 1, ns = 1000, grid_size = 50) { @@ -177,7 +178,7 @@ conformal_inf <- function(ascm, alpha = 0.05, new_wide_data <- wide_data new_wide_data$X <- cbind(wide_data$X, wide_data$y) new_wide_data$y <- matrix(1, nrow = n, ncol = 1) - null_p <- compute_permute_pval(new_wide_data, ascm, 0, ncol(wide_data$y), + null_p <- compute_permute_pval(new_wide_data, ascm, 0, ncol(wide_data$y), type, q, ns, stat_func) out <- list() att <- predict(ascm, att = T) @@ -264,7 +265,7 @@ conformal_inf_linear <- function(ascm, alpha = 0.05, #' @param q The norm for the test static `((sum(x ^ q))) ^ (1/q)` #' @param ns Number of resamples for "iid" permutations #' @param stat_func Function to compute test statistic -#' +#' #' @return List that contains: #' \itemize{ #' \item{"resids"}{Residuals after enforcing the null} @@ -306,7 +307,7 @@ compute_permute_test_stats <- function(wide_data, ascm, h0, stat_func <- function(x) (sum(abs(x) ^ q) / sqrt(length(x))) ^ (1 / q) } if(type == "iid") { - test_stats <- sapply(1:ns, + test_stats <- sapply(1:ns, function(x) { reorder <- sample(resids) stat_func(reorder[(t0 + 1):tpost]) @@ -319,7 +320,7 @@ compute_permute_test_stats <- function(wide_data, ascm, h0, stat_func(reorder[(t0 + 1):tpost]) }) } - + return(list(resids = resids, test_stats = test_stats, stat_func = stat_func)) @@ -335,7 +336,7 @@ compute_permute_test_stats <- function(wide_data, ascm, h0, #' @param q The norm for the test static `((sum(x ^ q))) ^ (1/q)` #' @param ns Number of resamples for "iid" permutations #' @param stat_func Function to compute test statistic -#' +#' #' @return Computed p-value #' @noRd compute_permute_pval <- function(wide_data, ascm, h0, @@ -357,7 +358,7 @@ compute_permute_pval <- function(wide_data, ascm, h0, #' @param q The norm for the test static `((sum(x ^ q))) ^ (1/q)` #' @param ns Number of resamples for "iid" permutations #' @param stat_func Function to compute test statistic -#' +#' #' @return (lower bound of interval, upper bound of interval, p-value for null of 0 effect) #' @noRd compute_permute_ci <- function(wide_data, ascm, grid, @@ -365,9 +366,9 @@ compute_permute_ci <- function(wide_data, ascm, grid, q, ns, stat_func) { # make sure 0 is in the grid grid <- c(grid, 0) - ps <-sapply(grid, + ps <-sapply(grid, function(x) { - compute_permute_pval(wide_data, ascm, x, + compute_permute_pval(wide_data, ascm, x, post_length, type, q, ns, stat_func) }) c(min(grid[ps >= alpha]), max(grid[ps >= alpha]), ps[grid == 0]) @@ -438,7 +439,7 @@ time_jackknife_plus_multiout <- function(ascm_multi, alpha = 0.05, conservative t_final <- t0 + tpost Z <- wide_data$Z - jack_ests <- lapply(1:t0, + jack_ests <- lapply(1:t0, function(tdrop) { # drop unit i new_data_list <- drop_time_t_multiout(data_list, Z, tdrop) @@ -474,15 +475,15 @@ time_jackknife_plus_multiout <- function(ascm_multi, alpha = 0.05, conservative out <- list() att <- predict(ascm_multi, att = T) - out$att <- rbind(att, + out$att <- rbind(att, colMeans(att[(t0 + 1):t_final, , drop = F])) # held out ATT - out$heldout_att <- rbind(t(held_out_errs), - att[(t0 + 1):t_final, , drop = F], + out$heldout_att <- rbind(t(held_out_errs), + att[(t0 + 1):t_final, , drop = F], colMeans(att[(t0 + 1):t_final, , drop = F])) if(conservative) { - qerr <- apply(abs(held_out_errs), 1, + qerr <- apply(abs(held_out_errs), 1, stats::quantile, 1 - alpha, type = 1) out$lb <- rbind(matrix(NA, nrow = t0, ncol = k), t(t(apply(jack_dist_cons, 1:2, min)) - qerr)) @@ -493,8 +494,8 @@ time_jackknife_plus_multiout <- function(ascm_multi, alpha = 0.05, conservative out$lb <- rbind(matrix(NA, nrow = t0, ncol = k), apply(jack_dist_low, 1:2, stats::quantile, alpha, type = 1)) - out$ub <- rbind(matrix(NA, nrow = t0, ncol = k), - apply(jack_dist_high, 1:2, + out$ub <- rbind(matrix(NA, nrow = t0, ncol = k), + apply(jack_dist_high, 1:2, stats::quantile, 1 - alpha, type = 1)) } # shift back to ATT scale @@ -523,7 +524,7 @@ drop_time_t_multiout <- function(data_list, Z, t_drop) { function(x) x[, -t_drop, drop = F]) new_data_list$y <- lapply(1:length(data_list$y), function(k) { - cbind(data_list$X[[k]][, t_drop, drop = F], + cbind(data_list$X[[k]][, t_drop, drop = F], data_list$y[[k]]) }) return(new_data_list) @@ -548,7 +549,7 @@ drop_time_t_multiout <- function(data_list, Z, t_drop) { #' \item{"p_val"}{p-value for test of no post-treatment effect} #' \item{"alpha"}{Level of confidence interval} #' } -conformal_inf_multiout <- function(ascm_multi, alpha = 0.05, +conformal_inf_multiout <- function(ascm_multi, alpha = 0.05, stat_func = NULL, type = "iid", q = 1, ns = 1000, grid_size = 1, lin_h0 = NULL) { @@ -568,7 +569,7 @@ conformal_inf_multiout <- function(ascm_multi, alpha = 0.05, post_att <- att[(t0 +1):t_final,, drop = F] post_sd <- apply(post_att, 2, function(x) sqrt(mean(x ^ 2, na.rm = T))) # iterate over post-treatment periods to get pointwise CIs - + vapply(1:tpost, function(j) { # fit using t0 + j as a pre-treatment period and get residuals @@ -580,8 +581,8 @@ conformal_inf_multiout <- function(ascm_multi, alpha = 0.05, colnames(data_list$y[[i]])[j]) Xi }) - - + + if(tpost > 1) { new_data_list$y <- lapply(1:k, function(i) { @@ -600,7 +601,7 @@ conformal_inf_multiout <- function(ascm_multi, alpha = 0.05, # make a grid around the estimated ATT if(is.null(lin_h0)) { - grid <- lapply(1:k, + grid <- lapply(1:k, function(i) { seq(att[t0 + j, i] - 2 * post_sd[i], att[t0 + j, i] + 2 * post_sd[i], length.out = grid_size) @@ -618,7 +619,7 @@ conformal_inf_multiout <- function(ascm_multi, alpha = 0.05, compute_permute_pval_multiout(new_data_list, ascm_multi, numeric(k), 1, type, q, ns, stat_func)) } - + }, matrix(0, ncol = k, nrow=3)) -> cis # # test a null post-treatment effect @@ -637,10 +638,10 @@ conformal_inf_multiout <- function(ascm_multi, alpha = 0.05, data_list$y[[i]][, 1, drop = FALSE] }) null_p <- compute_permute_pval_multiout(new_data_list, ascm_multi, - numeric(k), + numeric(k), tpost, type, q, ns, stat_func) if(is.null(lin_h0)) { - grid <- lapply(1:k, + grid <- lapply(1:k, function(i) { seq(min(att[(t0 + 1):tpost, i]) - 4 * post_sd[i], max(att[(t0 + 1):tpost, i]) + 4 * post_sd[i], @@ -689,7 +690,7 @@ conformal_inf_multiout <- function(ascm_multi, alpha = 0.05, #' @param q The norm for the test static `((sum(x ^ q))) ^ (1/q)` #' @param ns Number of resamples for "iid" permutations #' @param stat_func Function to compute test statistic -#' +#' #' @return List that contains: #' \itemize{ #' \item{"resids"}{Residuals after enforcing the null} @@ -730,7 +731,7 @@ compute_permute_test_stats_multiout <- function(data_list, ascm_multi, h0, } } if(type == "iid") { - test_stats <- sapply(1:ns, + test_stats <- sapply(1:ns, function(x) { idxs <- sample(1:nrow(resids)) reorder <- resids[idxs, , drop = F] @@ -747,7 +748,7 @@ compute_permute_test_stats_multiout <- function(data_list, ascm_multi, h0, apply(reorder[(t0 + 1):tpost, , drop = F], 2, stat_func) }) } - + return(list(resids = resids, test_stats = matrix(test_stats, nrow = k), stat_func = stat_func)) @@ -763,7 +764,7 @@ compute_permute_test_stats_multiout <- function(data_list, ascm_multi, h0, #' @param q The norm for the test static `((sum(x ^ q))) ^ (1/q)` #' @param ns Number of resamples for "iid" permutations #' @param stat_func Function to compute test statistic -#' +#' #' @return Computed p-value #' @noRd compute_permute_pval_multiout <- function(data_list, ascm_multi, h0, @@ -796,7 +797,7 @@ compute_permute_pval_multiout <- function(data_list, ascm_multi, h0, #' @param q The norm for the test static `((sum(x ^ q))) ^ (1/q)` #' @param ns Number of resamples for "iid" permutations #' @param stat_func Function to compute test statistic -#' +#' #' @return (lower bound of interval, upper bound of interval, p-value for null of 0 effect) #' @noRd compute_permute_ci_multiout <- function(data_list, ascm_multi, grid, @@ -818,12 +819,12 @@ compute_permute_ci_multiout <- function(data_list, ascm_multi, grid, } ps <- apply(grid, 1, function(x) { - compute_permute_pval_multiout(data_list, ascm_multi, x, + compute_permute_pval_multiout(data_list, ascm_multi, x, post_length, type, q, ns, stat_func) }) - sapply(1:k, - function(i) c(min(grid[ps >= alpha, i]), - max(grid[ps >= alpha, i]), + sapply(1:k, + function(i) c(min(grid[ps >= alpha, i]), + max(grid[ps >= alpha, i]), ps[apply(grid == 0, 1, all)])) } @@ -881,7 +882,7 @@ drop_unit_i_multiout <- function(wide_list, Z, i) { #' Estimate standard errors for single ASCM with the jackknife #' Do this for ridge-augmented synth #' @param ascm Fitted augsynth object -#' +#' #' @return List that contains: #' \itemize{ #' \item{"att"}{Vector of ATT estimates} @@ -1143,7 +1144,7 @@ weighted_bootstrap_multi <- function(multisynth, function(x) mean(x, na.rm=T)) upper_bound <- att - apply(bs_est, c(1,2), function(x) quantile(x, alpha / 2, na.rm = T)) - + lower_bound <- att - apply(bs_est, c(1,2), function(x) quantile(x, 1 - alpha / 2, na.rm = T)) @@ -1155,24 +1156,38 @@ weighted_bootstrap_multi <- function(multisynth, } -#' Bayesian bootstrap + +#' Bootstrap Functions +#' +#' There are several helper functions used for bootstrap inference. +#' Each method returns a list of selection weights of length n, +#' which weights that sum to n. +#' #' @param n Number of units +#' @name bootstrap_methods +#' @rdname bootstrap_methods +NULL + + + +#' @describeIn bootstrap_methods Bayesian bootstrap #' @export rdirichlet_b <- function(n) { Z <- as.numeric(rgamma(n, 1, 1)) return(Z / sum(Z) * n) } -#' Non-parametric bootstrap -#' @param n Number of units +#' @describeIn bootstrap_methods Non-parametric bootstrap #' @export -rmultinom_b <- function(n) as.numeric(rmultinom(1, n, rep(1 / n, n))) +rmultinom_b <- function(n) { + as.numeric(rmultinom(1, n, rep(1 / n, n))) +} -#' Wild bootstrap (Mammen 1993) -#' @param n Number of units + +#' @describeIn bootstrap_methods Wild bootstrap (Mammen 1993) #' @export rwild_b <- function(n) { sample(c(-(sqrt(5) - 1) / 2, (sqrt(5) + 1) / 2 ), n, replace = TRUE, prob = c((sqrt(5) + 1)/ (2 * sqrt(5)), (sqrt(5) - 1) / (2 * sqrt(5)))) -} \ No newline at end of file +} diff --git a/R/make_synth_data.R b/R/make_synth_data.R new file mode 100644 index 0000000..4be0973 --- /dev/null +++ b/R/make_synth_data.R @@ -0,0 +1,133 @@ + +# Utility function to make fake data for testing and whatnot + + + +#' A synthetic data simulator +#' +#' This generates data with time shocks that interact with the unit +#' latent factors. +#' +#' It also gives a unit fixed effect and a time fixed +#' effect. +#' +#' @param n_time Number of time periods +#' @param n_post Number of the time periods that are after tx onset +#' @param n_U Number of latent factors +#' @param N Number of units +#' @param N_tx number of treated units, out of total N +#' @param sd_time How much the time varying shocks matter. +#' @param sd_unit_fe How much the units have different shifts +#' @param sd_time_fe How much the time intercept shifts vary +#' @param tx_shift Add this shift to the distribution of the +#' treatment latent factors to create differences in tx and co +#' groups. +#' +#' @noRd +make_synth_data = function(n_U, N, n_time, N_tx = 1, n_post = 3, + long_form = FALSE, + tx_impact = 1:n_post, + sd_time = 0.3, + sd_unit_fe = 1, + sd_time_fe = 0.2, + sd_e = 1, + tx_shift = 0.5 ) { + + stopifnot( N_tx < N ) + stopifnot( n_post < n_time ) + + ## (1) Make latent factors for units + + # Make correlation structure for latent factors + Sigma = matrix(0.15, nrow = n_U, ncol = n_U) + diag(Sigma) = 1 + #solve(Sigma) + U = MASS::mvrnorm(N, mu = rep(1, n_U), Sigma = Sigma) + #U = abs( U ) + U = cbind( U, 1 ) + U[,1] = sd_unit_fe * U[,1] + #U + #summary(U) + U[1:N_tx,1:n_U] = U[1:N_tx,1:n_U] + tx_shift + + + # (2) Make the time varying component, with the first row being an + # intercept + shocks = matrix(rnorm(n_U * n_time, sd=sd_time), nrow = n_U) + #shocks = abs( shocks ) + shocks = rbind( 1, shocks ) + #shocks[1, ] = 2 * sort(shocks[1, ]) + #shocks[2, ] = sort(shocks[2, ]) + #shocks[2, ] = 0.5 * rev(sort(shocks[2, ])) + #browser() + + # Make a time drift by having the fixed effect time shock increase over time. + shocks[n_U+1,] = sort( shocks[n_U+1,] ) + shocks[n_U+1,] = sd_time_fe * shocks[n_U+1,] / sd_time + # Alt way to create time drift + # shocks = shocks + rep( 1:n_time, each=n_U ) / n_time + + #shocks = shocks + (1:nrow(shocks))/nrow(shocks) + #qplot(1:n_time, shocks[1, ]) + #dim(U) + #dim(shocks) + + # Outcome is latent factors times time shocks, with extra noise + # added. + Y = U %*% shocks + Y = Y + rnorm( length(Y), sd = sd_e ) + #dim(Y) + + dat = as.data.frame(Y) + colnames(dat) = 1:n_time + dat$ID = 1:N + dat$Tx = 0 + dat$Tx[1:N_tx] = 1 + + + # Add in some covariates predictive of latent U (skipping + # intercept) + X = U[,1:n_U, drop=FALSE] + for (i in 1:n_U) { + X[, i] = X[, i] + rnorm(nrow(X)) + } + X = round( X, digits = 1 ) + + #head(dat) + colnames(X) = paste0("X", 1:ncol(X)) + X = as.data.frame(X) + dat = bind_cols(dat, X) + + # Add in a treatment impact! + tx_index = (n_time-n_post+1):n_time + imp = matrix( tx_impact, + ncol=n_post, + nrow=N_tx, + byrow = TRUE ) + dat[ dat$Tx == 1, tx_index ] = dat[ dat$Tx == 1, tx_index ] + imp + + + # Final packaging of the data + if ( long_form ) { + + ldat = pivot_longer( + dat, + cols = all_of(1:n_time), + names_to = "time", + values_to = "Y" + ) + + # Make Y look nice. + ldat$Y = round( ldat$Y, digits = 1 ) #round( ldat$Y / 5, digits = 1 ) * 5 + + ldat$time = as.numeric(ldat$time) + ldat = mutate( ldat, + ever_Tx = Tx, + Tx = ifelse( ever_Tx & time > n_time - n_post, 1, 0 ) ) + ldat + } else { + dat + } +} + + diff --git a/R/multi_outcomes.R b/R/multi_outcomes.R index 625212c..6f3c018 100644 --- a/R/multi_outcomes.R +++ b/R/multi_outcomes.R @@ -8,7 +8,7 @@ #' Ridge=Ridge regression (allows for standard errors), #' None=No outcome model, #' @param scm Whether the SCM weighting function is used -#' @param fixedeff Whether to include a unit fixed effect, default F +#' @param fixedeff Whether to include a unit fixed effect, default F #' @param cov_agg Covariate aggregation functions, if NULL then use mean with NAs omitted #' @param combine_method How to combine outcomes: `concat` concatenates outcomes and `avg` averages them, default: 'avg' #' @param ... optional arguments for outcome model @@ -31,10 +31,15 @@ augsynth_multiout <- function(form, unit, time, t_int, data, ...) { call_name <- match.call() + # if a user sets inf_type at time of model fit, return a warning + if("inf_type" %in% names(call_name)) { + warning("`inf_type` is not an argument for augsynth_multiout, so it is ignored") + } + form <- Formula::Formula(form) unit <- enquo(unit) time <- enquo(time) - + ## format data outcome <- terms(formula(form, rhs=1))[[2]] trt <- terms(formula(form, rhs=1))[[3]] @@ -43,9 +48,9 @@ augsynth_multiout <- function(form, unit, time, t_int, data, outcomes <- sapply(outcomes_str, quo) # get outcomes as a list wide_list <- format_data_multi(outcomes, trt, unit, time, t_int, data) - - + + ## add covariates if(length(form)[2] == 2) { @@ -69,11 +74,11 @@ augsynth_multiout <- function(form, unit, time, t_int, data, # add some extra data augsynth$data$time <- data %>% distinct(!!time) %>% pull(!!time) augsynth$call <- call_name - augsynth$t_int <- t_int + augsynth$t_int <- t_int augsynth$combine_method <- combine_method treated_units <- data %>% filter(!!trt == 1) %>% distinct(!!unit) %>% pull(!!unit) - control_units <- data %>% filter(!(!!unit %in% treated_units)) %>% + control_units <- data %>% filter(!(!!unit %in% treated_units)) %>% distinct(!!unit) %>% pull(!!unit) augsynth$weights <- matrix(augsynth$weights) rownames(augsynth$weights) <- control_units @@ -92,7 +97,7 @@ augsynth_multiout <- function(form, unit, time, t_int, data, #' @param ... Extra args for outcome model #' @noRd fit_augsynth_multiout_internal <- function(wide_list, combine_method, Z, - progfunc, scm, fixedeff, + progfunc, scm, fixedeff, outcomes_str, ...) { @@ -111,7 +116,7 @@ fit_augsynth_multiout_internal <- function(wide_list, combine_method, Z, synth_data$Y1plot <- colMeans(cbind(X, y)[trt == 1,, drop = F]) - augsynth <- fit_augsynth_internal(wide_bal, synth_data, Z, progfunc, + augsynth <- fit_augsynth_internal(wide_bal, synth_data, Z, progfunc, scm, fixedeff, V = V, ...) # potentially add back in fixed effects @@ -174,6 +179,7 @@ combine_outcomes <- function(wide_list, combine_method, fixedeff, # combine outcomes if(combine_method == "concat") { + # center X and scale by overall variance for outcome # X <- lapply(wide_list$X, function(x) t(t(x) - colMeans(x)) / sd(x)) wide_bal <- list(X = do.call(cbind, lapply(wide_list$X, function(x) t(na.omit(t(x))))), @@ -193,8 +199,8 @@ combine_outcomes <- function(wide_list, combine_method, fixedeff, # trt = wide_list$trt) # # first get the standard deviations of the outcomes to put on the same scale - # sds <- do.call(c, - # lapply(wide_list$X, + # sds <- do.call(c, + # lapply(wide_list$X, # function(x) rep((sqrt(ncol(x)) * sd(x, na.rm=T)), ncol(x)))) # # do an SVD on centered and scaled outcomes @@ -271,33 +277,33 @@ predict.augsynth_multiout <- function(object, ...) { # separate out by outcome n_outs <- length(object$data_list$X) - max_t <- max(sapply(1:n_outs, + max_t <- max(sapply(1:n_outs, function(k) ncol(object$data_list$X[[k]]) + ncol(object$data_list$y[[k]]))) - pred_reshape <- matrix(NA, ncol = n_outs, + pred_reshape <- matrix(NA, ncol = n_outs, nrow = max_t) - colnames <- lapply(1:n_outs, - function(k) colnames(cbind(object$data_list$X[[k]], + colnames <- lapply(1:n_outs, + function(k) colnames(cbind(object$data_list$X[[k]], object$data_list$y[[k]]))) rownames(pred_reshape) <- colnames[[which.max(sapply(colnames, length))]] colnames(pred_reshape) <- object$outcomes # get outcome names for predictions - pre_outs <- do.call(c, - sapply(1:n_outs, + pre_outs <- do.call(c, + sapply(1:n_outs, function(j) { rep(object$outcomes[j], ncol(object$data_list$X[[j]])) }, simplify = FALSE)) - + post_outs <- do.call(c, - sapply(1:n_outs, + sapply(1:n_outs, function(j) { rep(object$outcomes[j], ncol(object$data_list$y[[j]])) }, simplify = FALSE)) # print(pred) # print(cbind(names(pred), c(pre_outs, post_outs))) - + pred_reshape[cbind(names(pred), c(pre_outs, post_outs))] <- pred return(pred_reshape) } @@ -328,8 +334,10 @@ print.augsynth_multiout <- function(x, ...) { #' @param grid_size Grid to compute prediction intervals over, default is 1 and only p-values are computed #' @param ... Optional arguments, including \itemize{\item{"se"}{Whether to plot standard error}} #' @export + summary.augsynth_multiout <- function(object, inf = T, inf_type = "conformal", grid_size = 1, ...) { - + + summ <- list() @@ -428,11 +436,7 @@ summary.augsynth_multiout <- function(object, inf = T, inf_type = "conformal", g if(grid_size == 1) { average_att <- average_att %>% mutate(lower_bound = NA, upper_bound = NA) } - # } - # } else { - # average_att <- gather(att_avg, Outcome, Estimate) - # } - + } else { att_est <- predict(object, att = T) @@ -487,9 +491,9 @@ summary.augsynth_multiout <- function(object, inf = T, inf_type = "conformal", g summ$bias_est <- NA } - - + + class(summ) <- "summary.augsynth_multiout" return(summ) } @@ -502,10 +506,10 @@ summary.augsynth_multiout <- function(object, inf = T, inf_type = "conformal", g print.summary.augsynth_multiout <- function(x, ...) { ## straight from lm cat("\nCall:\n", paste(deparse(x$call), sep="\n", collapse="\n"), "\n\n", sep="") - + att_est <- x$att$Estimate ## get pre-treatment fit by outcome - imbal <- x$att %>% + imbal <- x$att %>% filter(Time < x$t_int) %>% group_by(Outcome) %>% summarise(Pre.RMSE = sqrt(mean(Estimate ^ 2, na.rm = TRUE))) @@ -528,7 +532,7 @@ print.summary.augsynth_multiout <- function(x, ...) { #' @param inf Boolean, whether to plot uncertainty intervals, default TRUE #' @param plt_avg Boolean, whether to plot the average of the outcomes, default FALSE #' @param ... Optional arguments for summary function -#' +#' #' @export plot.augsynth_multiout <- function(x, inf = T, plt_avg = F, ...) { plot(summary(x, ...), inf = inf, plt_avg = plt_avg) @@ -538,7 +542,7 @@ plot.augsynth_multiout <- function(x, inf = T, plt_avg = F, ...) { #' @param x summary.augsynth_multiout object #' @param inf Boolean, whether to plot uncertainty intervals, default TRUE #' @param plt_avg Boolean, whether to plot the average of the outcomes, default FALSE -#' +#' #' @export plot.summary.augsynth_multiout <- function(x, inf = T, plt_avg = F, ...) { if(plt_avg) { @@ -546,7 +550,7 @@ plot.summary.augsynth_multiout <- function(x, inf = T, plt_avg = F, ...) { ggplot2::ggplot(ggplot2::aes(x=Time, y=Estimate)) } else { p <- x$att %>% - filter(Outcome != "Average") %>% + filter(Outcome != "Average") %>% ggplot2::ggplot(ggplot2::aes(x=Time, y=Estimate)) } if(inf) { @@ -567,4 +571,4 @@ plot.summary.augsynth_multiout <- function(x, inf = T, plt_avg = F, ...) { ggplot2::facet_wrap(~ Outcome, scales = "free_y") + ggplot2::theme_bw() -} \ No newline at end of file +} diff --git a/R/multisynth_class.R b/R/multisynth_class.R index 8c9a311..09e26b5 100644 --- a/R/multisynth_class.R +++ b/R/multisynth_class.R @@ -1,3 +1,5 @@ + + ################################################################################ ## Fitting, plotting, summarizing staggered synth ################################################################################ @@ -29,7 +31,7 @@ #' @param eps_rel Relative error tolerance for osqp #' @param verbose Whether to print logs for osqp #' @param ... Extra arguments -#' +#' #' @return multisynth object that contains: #' \itemize{ #' \item{"weights"}{weights matrix where each column is a set of weights for a treated unit} @@ -67,6 +69,11 @@ multisynth <- function(form, unit, time, data, unit <- enquo(unit) time <- enquo(time) + # if a user sets inf_type at time of model fit, return a warning + if("inf_type" %in% names(call_name)) { + warning("`inf_type` is not an argument for multisynth, so it is ignored") + } + ## format data outcome <- terms(formula(form, rhs=1))[[2]] trt <- terms(formula(form, rhs=1))[[3]] @@ -142,10 +149,10 @@ multisynth <- function(form, unit, time, data, scm = scm, time_cohort = time_cohort, time_w = F, lambda_t = 0, fit_resids = TRUE, eps_abs = eps_abs, - eps_rel = eps_rel, verbose = verbose, long_df = long_df, + eps_rel = eps_rel, verbose = verbose, long_df = long_df, how_match = how_match, ...) - - + + units <- data %>% arrange(!!unit) %>% distinct(!!unit) %>% pull(!!unit) rownames(msynth$weights) <- units @@ -166,7 +173,7 @@ multisynth <- function(form, unit, time, data, V = V, time_cohort = time_cohort, donors = msynth$donors, - eps_rel = eps_rel, + eps_rel = eps_rel, eps_abs = eps_abs, verbose = verbose) ## scaled global balance @@ -179,6 +186,10 @@ multisynth <- function(form, unit, time, data, } msynth$call <- call_name + msynth$trt_unit <- msynth$data$units[ msynth$data$trt < Inf ] + msynth$time_var <- quo_name(time) + msynth$unit_var <- quo_name(unit) + msynth$form <- form return(msynth) @@ -211,11 +222,11 @@ multisynth_formatted <- function(wide, relative=T, n_leads, n_lags, nu, lambda, V, force, n_factors, - scm, time_cohort, + scm, time_cohort, time_w, lambda_t, fit_resids, eps_abs, eps_rel, - verbose, long_df, + verbose, long_df, how_match, ...) { ## average together treatment groups ## grps <- unique(wide$trt) %>% sort() @@ -246,23 +257,23 @@ multisynth_formatted <- function(wide, relative=T, n_leads, n_lags, n_factors <- ncol(params$factor) ## get residuals from outcome model residuals <- cbind(wide$X, wide$y) - y0hat - + } else if (n_factors != 0) { ## if number of factors is provided don't do CV out <- fit_gsynth_multi(long_df, cbind(wide$X, wide$y), wide$trt, r=n_factors, CV=0, force=force) y0hat <- out$y0hat - params <- out$params - + params <- out$params + ## get residuals from outcome model residuals <- cbind(wide$X, wide$y) - y0hat } else if(force == 0 & n_factors == 0) { - # if no fixed effects or factors, just take out + # if no fixed effects or factors, just take out # control averages at each time point # time fixed effects from pure controls pure_ctrl <- cbind(wide$X, wide$y)[!is.finite(wide$trt), , drop = F] y0hat <- matrix(colMeans(pure_ctrl, na.rm = TRUE), - nrow = nrow(wide$X), ncol = ncol(pure_ctrl), + nrow = nrow(wide$X), ncol = ncol(pure_ctrl), byrow = T) residuals <- cbind(wide$X, wide$y) - y0hat params <- NULL @@ -297,13 +308,13 @@ multisynth_formatted <- function(wide, relative=T, n_leads, n_lags, bal_mat <- wide$X - ctrl_avg bal_mat <- wide$X } - + if(scm) { # get eligible set of donor units based on covariates donors <- get_donors(wide$X, wide$y, wide$trt, - wide$Z[, colnames(wide$Z) %in% + wide$Z[, colnames(wide$Z) %in% wide$match_covariates, drop = F], time_cohort, n_lags, n_leads, how = how_match, exact_covariates = wide$exact_covariates, ...) @@ -386,9 +397,9 @@ multisynth_formatted <- function(wide, relative=T, n_leads, n_lags, msynth$time_w <- time_w msynth$lambda_t <- lambda_t msynth$fit_resids <- fit_resids - msynth$extra_pars <- c(list(eps_abs = eps_abs, - eps_rel = eps_rel, - verbose = verbose), + msynth$extra_pars <- c(list(eps_abs = eps_abs, + eps_rel = eps_rel, + verbose = verbose), list(...)) msynth$long_df <- long_df @@ -417,7 +428,7 @@ predict.multisynth <- function(object, att = F, att_weight = NULL, bs_weight = N multisynth <- object relative <- T - + time_cohort <- multisynth$time_cohort if(is.null(relative)) { relative <- multisynth$relative @@ -436,19 +447,19 @@ predict.multisynth <- function(object, att = F, att_weight = NULL, bs_weight = N } if(time_cohort) { - which_t <- lapply(grps, + which_t <- lapply(grps, function(tj) (1:n)[multisynth$data$trt == tj]) mask <- unique(multisynth$data$mask) } else { which_t <- (1:n)[is.finite(multisynth$data$trt)] mask <- multisynth$data$mask } - + n1 <- sapply(1:J, function(j) length(which_t[[j]])) fullmask <- cbind(mask, matrix(0, nrow = J, ncol = (ttot - d))) - + ## estimate the post-treatment values to get att estimates mu1hat <- vapply(1:J, @@ -502,11 +513,11 @@ predict.multisynth <- function(object, att = F, att_weight = NULL, bs_weight = N c(vec, rep(NA, total_len - length(vec)), mean(mu0hat[(grps[j]+1):(min(grps[j] + n_leads, ttot)), j])) - + }, numeric(total_len +1 )) - + tauhat <- vapply(1:J, function(j) { vec <- c(rep(NA, d-grps[j]), @@ -536,11 +547,11 @@ predict.multisynth <- function(object, att = F, att_weight = NULL, bs_weight = N c(vec, rep(NA, total_len - length(vec)), mean(att_weight[(grps[j]+1):(min(grps[j] + n_leads, ttot)), j])) - + }, numeric(total_len +1 )) - + ## get the overall average estimate avg <- apply(mu0hat, 1, function(z) sum(n1 * z, na.rm=T) / sum(n1 * !is.na(z))) avg <- sapply(1:nrow(mu0hat), @@ -557,7 +568,7 @@ predict.multisynth <- function(object, att = F, att_weight = NULL, bs_weight = N sum(n1 * (!is.na(tauhat[k,])) * att_weight_new[k, ], na.rm = T) }) tauhat <- cbind(avg, tauhat) - + } else { ## remove all estimates for t > T_j + n_leads @@ -571,7 +582,7 @@ predict.multisynth <- function(object, att = F, att_weight = NULL, bs_weight = N rep(NA, max(0, ttot-(grps[j] + n_leads)))), numeric(ttot)) -> tauhat - + ## only average currently treated units avg1 <- rowSums(t(fullmask) * mu0hat * n1) / rowSums(t(fullmask) * n1) @@ -590,7 +601,7 @@ predict.multisynth <- function(object, att = F, att_weight = NULL, bs_weight = N replace_na(avg2,0) * apply(1 - fullmask, 2, max) cbind(avg, tauhat) -> tauhat } - + if(att) { return(tauhat) @@ -606,9 +617,9 @@ predict.multisynth <- function(object, att = F, att_weight = NULL, bs_weight = N #' @export print.multisynth <- function(x, att_weight = NULL, ...) { multisynth <- x - + ## straight from lm - cat("\nCall:\n", paste(deparse(multisynth$call), + cat("\nCall:\n", paste(deparse(multisynth$call), sep="\n", collapse="\n"), "\n\n", sep="") # print att estimates @@ -636,7 +647,7 @@ print.multisynth <- function(x, att_weight = NULL, ...) { #' @param ... Optional arguments #' @export plot.multisynth <- function(x, inf_type = "bootstrap", inf = T, - levels = NULL, label = T, + levels = NULL, label = T, weights = FALSE, ...) { if(weights) { @@ -651,13 +662,13 @@ plot.multisynth <- function(x, inf_type = "bootstrap", inf = T, # plotting the weights weights %>% tidyr::pivot_longer(-unit, names_to = "trt_unit", values_to = "weight") %>% - ggplot2::ggplot(aes(x = trt_unit, y = unit, fill = round(weight, 3))) + + ggplot2::ggplot(aes(x = trt_unit, y = unit, fill = round(weight, 3))) + ggplot2::geom_tile(color = "white", size=.5) + ggplot2::scale_fill_gradient(low = "white", high = "black", limits=c(-0.01,1.01)) + ggplot2::guides(fill = "none") + ggplot2::xlab("Treated Unit") + ggplot2::ylab("Donor Unit") + - ggplot2::theme_bw() + + ggplot2::theme_bw() + ggplot2::theme(axis.ticks.x = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank()) } @@ -679,7 +690,7 @@ plot.multisynth <- function(x, inf_type = "bootstrap", inf = T, #' \item{jackknife}{Jackknife} #' } #' @param ... Optional arguments -#' +#' #' @return summary.multisynth object that contains: #' \itemize{ #' \item{"att"}{Dataframe with ATT estimates, standard errors for each treated unit} @@ -693,7 +704,7 @@ plot.multisynth <- function(x, inf_type = "bootstrap", inf = T, summary.multisynth <- function(object, inf_type = "bootstrap", att_weight = NULL, ...) { multisynth <- object - + relative <- T n_leads <- multisynth$n_leads @@ -710,17 +721,17 @@ summary.multisynth <- function(object, inf_type = "bootstrap", att_weight = NULL grps <- trt[is.finite(trt)] which_t <- (1:n)[is.finite(trt)] } - + # grps <- unique(multisynth$data$trt) %>% sort() J <- length(grps) - + # which_t <- (1:n)[is.finite(multisynth$data$trt)] times <- multisynth$data$time - + summ <- list() ## post treatment estimate for each group and overall # att <- predict(multisynth, relative, att=T) - + if(inf_type == "jackknife") { attse <- jackknife_se_multi(multisynth, relative, att_weight = att_weight, ...) } else if(inf_type == "bootstrap") { @@ -736,16 +747,16 @@ summary.multisynth <- function(object, inf_type = "bootstrap", att_weight = NULL upper_bound = matrix(NA, nrow(att), ncol(att)), lower_bound = matrix(NA, nrow(att), ncol(att))) } - + if(relative) { att <- data.frame(cbind(c(-(d-1):min(n_leads, ttot-min(grps)), NA), attse$att)) if(time_cohort) { - col_names <- c("Time", "Average", + col_names <- c("Time", "Average", as.character(times[grps + 1])) } else { - col_names <- c("Time", "Average", + col_names <- c("Time", "Average", as.character(multisynth$data$units[which_t])) } names(att) <- col_names @@ -754,7 +765,7 @@ summary.multisynth <- function(object, inf_type = "bootstrap", att_weight = NULL mutate(Time=Time-1) -> att se <- data.frame(cbind(c(-(d-1):min(n_leads, ttot-min(grps)), NA), - attse$se)) + attse$se)) names(se) <- col_names se %>% gather(Level, Std.Error, -Time) %>% rename("Time"=Time) %>% @@ -775,11 +786,11 @@ summary.multisynth <- function(object, inf_type = "bootstrap", att_weight = NULL } else { att <- data.frame(cbind(times, attse$att)) - names(att) <- c("Time", "Average", times[grps[1:J]]) + names(att) <- c("Time", "Average", times[grps[1:J]]) att %>% gather(Level, Estimate, -Time) -> att se <- data.frame(cbind(times, attse$se)) - names(se) <- c("Time", "Average", times[grps[1:J]]) + names(se) <- c("Time", "Average", times[grps[1:J]]) se %>% gather(Level, Std.Error, -Time) -> se } @@ -812,12 +823,12 @@ summary.multisynth <- function(object, inf_type = "bootstrap", att_weight = NULL print.summary.multisynth <- function(x, level = "Average", ...) { summ <- x - + ## straight from lm cat("\nCall:\n", paste(deparse(summ$call), sep="\n", collapse="\n"), "\n\n", sep="") first_lvl <- summ$att %>% filter(Level != "Average") %>% pull(Level) %>% min() - + ## get ATT estimates for treatment level, post treatment if(summ$relative) { summ$att %>% @@ -840,18 +851,18 @@ print.summary.multisynth <- function(x, level = "Average", ...) { pull(Std.Error) %>% round(3) %>% format(nsmall=3), ")\n\n", sep="")) - + cat(paste("Global L2 Imbalance: ", format(round(summ$global_l2,3), nsmall=3), "\n", "Scaled Global L2 Imbalance: ", format(round(summ$scaled_global_l2,3), nsmall=3), "\n", - "Percent improvement from uniform global weights: ", + "Percent improvement from uniform global weights: ", format(round(1-summ$scaled_global_l2,3)*100), "\n\n", "Individual L2 Imbalance: ", format(round(summ$ind_l2,3), nsmall=3), "\n", - "Scaled Individual L2 Imbalance: ", + "Scaled Individual L2 Imbalance: ", format(round(summ$scaled_ind_l2,3), nsmall=3), "\n", - "Percent improvement from uniform individual weights: ", + "Percent improvement from uniform individual weights: ", format(round(1-summ$scaled_ind_l2,3)*100), "\t", "\n\n", sep="")) @@ -863,7 +874,7 @@ print.summary.multisynth <- function(x, level = "Average", ...) { #' Plot function for summary function for multisynth #' @importFrom ggplot2 aes -#' +#' #' @param x summary object #' @param inf Whether to plot confidence intervals #' @param levels Which units/groups to plot, default is every group @@ -887,19 +898,19 @@ plot.summary.multisynth <- function(x, inf = T, levels = NULL, label = T, # plotting the weights weights %>% tidyr::pivot_longer(-unit, names_to = "trt_unit", values_to = "weight") %>% - ggplot2::ggplot(aes(x = trt_unit, y = unit, fill = round(weight, 3))) + + ggplot2::ggplot(aes(x = trt_unit, y = unit, fill = round(weight, 3))) + ggplot2::geom_tile(color = "white", size=.5) + ggplot2::scale_fill_gradient(low = "white", high = "black", limits=c(-0.01,1.01)) + ggplot2::guides(fill = "none") + ggplot2::xlab("Treated Unit") + ggplot2::ylab("Donor Unit") + - ggplot2::theme_bw() + + ggplot2::theme_bw() + ggplot2::theme(axis.ticks.x = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank()) } summ <- x - + ## get the last time period for each level summ$att %>% filter(!is.na(Estimate), @@ -921,11 +932,11 @@ plot.summary.multisynth <- function(x, inf = T, levels = NULL, label = T, alpha = is_avg)) + ggplot2::geom_line(size = 1) + ggplot2::geom_point(size = 1) -> p - + if(label) { p <- p + ggrepel::geom_label_repel(ggplot2::aes(label = label), nudge_x = 1, na.rm = T) - } + } p <- p + ggplot2::geom_hline(yintercept = 0, lty = 2) if(summ$relative) { @@ -954,10 +965,10 @@ plot.summary.multisynth <- function(x, inf = T, levels = NULL, label = T, ggplot2::aes(ymin=lower_bound, ymax=upper_bound), alpha = alph, color=clr, - data = summ$att %>% + data = summ$att %>% filter(Level == "Average", Time >= 0)) - + } else { p <- p + error_plt( ggplot2::aes(ymin=lower_bound, @@ -969,8 +980,58 @@ plot.summary.multisynth <- function(x, inf = T, levels = NULL, label = T, p <- p + ggplot2::scale_alpha_manual(values=c(1, 0.5)) + ggplot2::scale_color_manual(values=c("#333333", "#818181")) + - ggplot2::guides(alpha=F, color=F) + + ggplot2::guides(alpha=F, color=F) + ggplot2::theme_bw() return(p) } + + + + +#' +#' @return dim: Dimension of data as pair of (# units, # time points). +#' +#' @rdname multisynth_class +#' @export +#' +dim.multisynth <- function(x, ... ) { + n_unit = length( unique( x$long_df[[ x$unit_var ]] ) ) + n_time = x$n_leads + x$n_lags + return( c( n_unit, n_time ) ) +} + + + +#' @title Number of units in multisynth +#' +#' @return Single number (of unique units). +#' +#' @rdname multisynth_class +#' +#' @export +#' +n_unit.multisynth <- function(x, ... ) { + dim.multisynth(x)[[1]] +} + +#' @title Number of time points in multisynth +#' +#' @rdname multisynth_class +#' +#' @return Single number (of unique time points). +#' @export +n_time.multisynth <- function(x, ... ) { + dim.multisynth(x)[[2]] +} + + +#' +#' @rdname multisynth_class +#' +#' @return Number of treated units (always 1 for multisynth) +#' @export +n_treated.multisynth <- function(x, ... ) { + return( length( x$trt_unit ) ) +} + diff --git a/R/permutation.R b/R/permutation.R new file mode 100644 index 0000000..de09cff --- /dev/null +++ b/R/permutation.R @@ -0,0 +1,703 @@ + + +#' Run augsynth letting each unit be the treated unit. This +#' implements the abadie placebo test (e.g., from tobacco paper) but +#' with the augsynth package. +#' +#' @param tx.id Name/ID of treatment group (not stored in ascm) +#' +#' @return Dataframe With each row corresponding to a unit (including +#' the original treated unit) and each column corresponding to a +#' time point. Each entry is the estimated impact for that unit at +#' that time, after fitting augsynth on that unit as the tx unit and +#' the other units as possible controls. +#' +#' @noRd +get_placebo_gaps = function( ascm, att = TRUE ) { + + tx_id <- ascm$trt_unit + wide_data <- ascm$data + synth_data <- ascm$data$synth_data + Z <- wide_data$Z + control_ids = which( wide_data$trt == 0 ) + all_ids = 1:length(wide_data$trt) + t_final = length( wide_data$time ) + + ests <- vapply(all_ids, function(i) { + new_data <- swap_treat_unit(wide_data, Z, i) + new_ascm <- do.call(augsynth:::fit_augsynth_internal, + c(list(wide = new_data$wide, + synth_data = new_data$synth_data, + Z = new_data$Z, + progfunc = ascm$progfunc, scm = ascm$scm, + fixedeff = ascm$fixedeff), + ascm$extra_args)) + est <- predict(new_ascm, att = att ) + est + }, numeric(t_final) ) + + # Verify we recover the original treated unit by seeing if the + # estimates match our original fit model's estimates + dim( ests ) + pds = as.numeric( predict( ascm, att = att ) ) + pds + if( !all( round( ests[ , which( wide_data$trt == 1 ) ] - pds, digits=4 ) == 0 ) ) { + stop( "Two versions of estimated impacts do not correspond. Serious error. Please contact package maintainers." ) + } + + + + ests = as.data.frame( t( ests ) ) + dim( ests ) + ests$ID = tx_id + ests$ID[ control_ids ] = rownames( ascm$weights ) + + ests %>% dplyr::select( ID, everything() ) +} + +#### Generate donor unit from fit synth model #### + +#' Take inner data and change which unit is marked as 'treated' +#' +#' @noRd +swap_treat_unit = function (wide_data, Z, i) { + wide_data$trt <- rep( 0, length( wide_data$trt ) ) + wide_data$trt[i] = 1 + + X0 <- wide_data$X[wide_data$trt == 0, , drop = F] + x1 <- matrix( wide_data$X[wide_data$trt == 1, , drop = F], ncol = 1 ) + y0 <- wide_data$y[wide_data$trt == 0, , drop = F] + y1 <- matrix( wide_data$y[wide_data$trt == 1, , drop = F], ncol = 1 ) + new_synth_data <- list() + new_synth_data$Z0 <- t(X0) + new_synth_data$X0 <- t(X0) + new_synth_data$Z1 <- x1 + new_synth_data$X1 <- x1 + + wide_data = wide_data[ c( "trt", "X", "y" ) ] + + return(list(wide_data = wide_data, + synth_data = new_synth_data, + Z = Z)) +} + + + +#' Calculate MDES +#' +#' Use our method for Calculating SEs, p-values, and MDEs by looking +#' at the distribution of impacts across the donor units. +#' +#' @param lest Entity by year estimate estimates for all treatment and control entities +#' (original tx and all the comparisons as pseudo-treated). Columns +#' of entity ID (tx_col), treatment flag (trt), time period (time_col), +#' and estimated tx-synthetic control difference (ATT). +#' @param treat_year The time period in which the treatment was introduced. +#' @param tx_col The name to the column containing the treatment variable. +#' @param time_col The name of the column containing the time variable. +#' @return List of three dataframes, one of MDES, one of RMSPEs, and +#' one of info on the r-statistics (the ATTs divided by the RMSPEs). +#' +#' @noRd +calculate_MDES_table = function( lest, treat_year, tx_col, time_col ) { + + + stopifnot("Entity by year estimates (`lest`) are missing one of the following necessary features: `tx_col`, 'trt', `time_col`, or 'ATT'" = all( c( tx_col, "trt", time_col, "ATT" ) %in% + names( lest ) ) ) + + RMSPEs = calculate_RMSPE( lest, treat_year, tx_col, time_col ) + + # merge calculated pre-intervention RMSPE for each county to data set + # of gaps for each year for each county + + # Divide that year's gap by pre-intervention RMSPE. + rstatcalc <- full_join( lest, RMSPEs, by = tx_col ) %>% + mutate( rstat=ATT/RMSPE ) + + # Calculate the permutation p-values (permuting the r statistics) + pvalues = rstatcalc %>% + group_by( !!as.name(time_col) ) %>% + summarise( p_rstat = mean( abs( rstat[trt == 1] ) <= abs( rstat ) ), + SE_rstat = sd( rstat[ trt != 1 ] ) * RMSPE[ trt == 1 ], + p_gap = mean( abs( ATT[ trt == 1 ] ) <= abs( ATT ) ), + SE_gap = sd( ATT[ trt != 1 ] ), + .groups = "drop" ) + + # attach R statistic for the treated tract. + rstattract <- rstatcalc %>% + filter(trt==1) %>% + dplyr::select( -!!as.name(tx_col), -trt ) + + # Make table of main results with permutation p-values and add + # indicator of which years are treated and which not. + main_results <- rstattract %>% + full_join( pvalues, by=time_col ) %>% + mutate( tx = ifelse( !!as.name(time_col) >= treat_year, 1, 0 ) ) + + + # Clean up column ordering and make look nice + main_results = main_results %>% + relocate( !!as.name(time_col), tx, Yobs ) %>% + dplyr::select( -RMSPE ) + + # Bundle and return all results + list( MDES_table = main_results, + RMSPEs = RMSPEs, + rstatcalc=rstatcalc ) +} + + +#' Calculate RMSPE for all units on lagged outcomes +#' +#' This calculates the RMSPE of the difference between estimated and +#' observed outcome, averaged across the pre-treatment years, for each +#' county +#' +#' @param lest Entity by year estimate estimates for all treatment and control entities +#' (original tx and all the comparisons as pseudo-treated). Columns +#' of entity ID (tx_col), treatment flag (trt), time period (time_col), +#' and estimated tx-synthetic control difference (ATT). +#' @param treat_year The time period in which the treatment was introduced. +#' @param tx_col The name to the column containing the treatment variable. +#' @param time_col The name of the column containing the time variable. +#' +#' @noRd +calculate_RMSPE = function( lest, treat_year, tx_col, time_col ) { + stopifnot( !is.null( lest$ATT ) ) + + RMSPEs = lest %>% filter( !!as.name(time_col) < treat_year ) %>% + group_by( !!as.name(tx_col) ) %>% + summarise( RMSPE = sqrt( mean( ATT^2 ) ), .groups = "drop" ) + + return(RMSPEs) +} + +#' Estimate robust SE +#' +#' Given a list of impact estimates (all assumed to be estimating a +#' target impact of 0), calculate the modified standard deviation by +#' looking an in inter-quartile range rather than the simple standard +#' deviation. This reduces the effect of small numbers of extreme +#' outliers, and focuses on central variation. This then gets +#' converted to a standard error (standard deviation of the results, +#' in effect.) +#' +#' @param placebo_estimates The list of impact estimates to use for +#' estimating the SE. +#' @param k Number of obs to drop from each tail (so 2k obs dropped +#' total) +#' @param beta Proportion of obs to drop from each tail. E.g., 95\% to +#' drop 5\% most extreme. Only one of beta or k can be non-null. +#' @param round_beta TRUE means adjust beta to the nearest integer +#' number of units to drop (the Howard method). FALSE means +#' interpolate quantiles using R's quantile function. +#' +#' @noRd +estimate_robust_SE = function( placebo_estimates, k = NULL, beta = NULL, + round_beta = FALSE ) { + + stopifnot("Either `k` or `beta` must be NULL." = is.null(k) != is.null( beta ) ) + + n = length( placebo_estimates ) + + if ( is.null( beta ) ) { + #alpha = (2*k-1) / (2*n) # Method A + alpha = k/( n-1 ) # Method B + beta = 1 - 2*alpha + q = sort( placebo_estimates )[ c(1+k, n - k) ] + } else { + alpha = (1 - beta)/2 + k = alpha * (n-1) # QUESTION: Why n-1 here???? + if ( round_beta ) { + k = pmax( 1, round( k ) ) + alpha = k/(n-1) + beta = 1 - 2*alpha + q = sort( placebo_estimates )[ c(1+k, n - k) ] + } else { + q = quantile( placebo_estimates, probs = c( alpha, 1 - alpha ) ) + } + } + + del = as.numeric( diff( q ) ) + z = -qnorm( alpha ) + SE_imp = del / (2*z) + + res = data.frame( beta = beta, k = k, + q_low = q[[1]], q_high = q[[2]], + range = del, z = z, SE_imp = SE_imp ) + res +} + + +#' Construct an organized dataframe with outcome data in long format +#' from augsynth object +#' +#' @noRd +get_long_data <- function( augsynth ) { + + wide_data <- augsynth$data + + tx_id <- augsynth$trt_unit + control_ids = which( wide_data$trt == 0 ) + + all_ids = 1:nrow(wide_data$X) + all_ids[ control_ids ] = rownames( augsynth$weights ) + all_ids[ which( wide_data$trt == 1 ) ] = tx_id + + df <- bind_cols(wide_data$X, wide_data$y) %>% + mutate(!!as.name(augsynth$unit_var) := all_ids) %>% + pivot_longer(!augsynth$unit_var, + names_to = augsynth$time_var, + values_to = 'Yobs', + ) %>% + mutate(!!as.name(augsynth$time_var) := as.numeric(!!as.name(augsynth$time_var)), + ever_Tx = ifelse(!!as.name(augsynth$unit_var) == augsynth$trt_unit, 1, 0)) + + df +} + + + +add_placebo_distribution <- function(augsynth) { + + # Run permutations + ests <- get_placebo_gaps(augsynth, att = FALSE) + time_cols = 2:ncol(ests) + + ests$trt = augsynth$data$trt + + lest = ests %>% + pivot_longer( cols = all_of( time_cols ), + names_to = augsynth$time_var, values_to = "Yhat" ) %>% + mutate(!!as.name(augsynth$time_var) := as.numeric(!!as.name(augsynth$time_var))) + + df <- get_long_data( augsynth ) + + ##### Make dataset of donor units with their weights ##### + units = dplyr::select( df, !!as.name(augsynth$unit_var), ever_Tx ) %>% + unique() + + tx_col = augsynth$unit_var + weights = data.frame( tx_col = rownames( augsynth$weights ), + weights = augsynth$weights, + stringsAsFactors = FALSE) + colnames(weights)[1] <- augsynth$unit_var + + units = merge( units, weights, by=augsynth$unit_var, all=TRUE ) + + # Zero weights for tx units. + units$weights[ units$ever_Tx == 1 ] = 0 + + # confirm that we have placebos for every entity over every observed time period + stopifnot("Placebos do not cover every entity over every observed time period." = (ncol(augsynth$data$X) + ncol(augsynth$data$y)) * nrow(augsynth$data$y) == nrow(lest)) + + lest = rename( lest, !!as.name(augsynth$unit_var) := ID ) + + nn = nrow(lest) + lest = left_join( lest, + df[ c(augsynth$unit_var, augsynth$time_var, "Yobs") ], # issue is that this is calling for original data + by = names(lest)[names(lest) %in% names(df[ c(augsynth$unit_var, augsynth$time_var, "Yobs") ])] ) + stopifnot( nrow( lest ) == nn ) + + + # Impact is difference between observed and imputed control-side outcome + lest$impact = lest$Yobs - lest$Yhat + + #### Make the actual treatment result information ##### + + # Get our observed series (the treated unit) + T_tract = filter( lest, trt == 1 ) + + # The raw donor pool average series + averages = df %>% + filter( ever_Tx == 0 ) %>% + group_by( !!as.name(augsynth$time_var) ) %>% + summarise( raw_average = mean( .data$Yobs ), + .groups = "drop" ) + T_tract = left_join( T_tract, averages, by = augsynth$time_var ) + + ##### Calculate ATT and MDES ##### + + lest = rename( lest, ATT = impact ) + t0 <- ncol(augsynth$data$X) + treat_year = augsynth$data$time[t0 + 1] + + res = calculate_MDES_table(lest, treat_year, augsynth$unit_var, augsynth$time_var) + + # Add RMSPE to donor list + units = left_join( units, res$RMSPEs, by = names(units)[names(units) %in% names(res$RMSPEs)]) + + MDES_table = mutate( res$MDES_table, + raw_average = T_tract$raw_average, + tx = ifelse( !!as.name(augsynth$time_var) >= treat_year, 1, 0 ) ) %>% + relocate( raw_average, .after = Yhat ) + + augsynth$results$permutations <- list( placebo_dist = res$rstatcalc, + MDES_table = MDES_table) + + return(augsynth) +} + + +#' Generate permutation plots +#' +#' @param results Results from calling augsynth() +#' +#' @param inf_type Inference type (takes a value of 'permutation' or +#' 'permutation_rstat') Type of inference algorithm. Inherits +#' inf_type from `object` or otherwise defaults to "conformal". +#' Options are +#' \itemize{ +#' \item{"If numeric"}{A multiple of the treated unit's RMSPE +#' above which donor units will be dropped} +#' \item{"If a character"}{The name or names of donor units to +#' be dropped based on the `unit` parameter +#' in the augsynth model} +#' } +#' @export +permutation_plot <- function(augsynth, inf_type = 'permutation') { + + if (!inf_type %in% c('permutation', 'permutation_rstat')) { + stop("Permutation plots are only available for `permutation` and `permutation_rstat` inference types") + } + + if(inf_type == 'permutation') { + measure = "ATT" + y_lab = "Estimate (ATT)" + } else { + measure = 'rstat' + y_lab = "Estimate (RMPSE-adjusted ATT)" + } + + if (is.summary.augsynth(augsynth)) { + placebo_dist <- augsynth$permutations$placebo_dist + } else if (is.null(augsynth$results$permutations)) { + augsynth <- add_placebo_distribution(augsynth) + placebo_dist <- augsynth$results$permutations$placebo_dist + } else { + placebo_dist <- augsynth$results$permutations$placebo_dist + } + + plot_df <- placebo_dist %>% + mutate(trt_status = factor(trt, levels = c(0, 1), labels = c('Control', 'Treatment'))) + + t0 <- ncol(augsynth$data$X) + treat_year = augsynth$data$time[t0 + 1] + + out_plot <- ggplot2::ggplot(plot_df, + aes(x = !!as.name(augsynth$time_var), y = !!as.name(measure), + color = trt_status, linetype = !!as.name(augsynth$unit_var)), size = 0.8) + + ggplot2::geom_line() + + ggplot2::geom_vline(lty = 2, xintercept = treat_year) + + ggplot2::geom_hline(lty = 2, yintercept = 0) + + ggplot2::scale_color_manual(values = c('Control' = 'gray', 'Treatment' = 'black')) + + ggplot2::scale_linetype_manual(values = rep('solid', length(unique(plot_df %>% pull(!!as.name(augsynth$unit_var)))))) + + ggplot2::labs(color = NULL, y = y_lab) + + ggplot2::guides(linetype = 'none') + + ggplot2::theme_bw() + + ggplot2::theme(legend.position = 'bottom') + + return(out_plot) +} + + + +#' Generate formatted outputs for statistical inference using permutation inference +#' +#' @param augsynth An augsynth object. +#' +#' @noRd +permutation_inf <- function(augsynth, inf_type) { + + t0 <- dim(augsynth$data$synth_data$Z0)[1] + tpost <- dim(augsynth$data$synth_data$Z0)[1] + + out <- list() + out$att <- augsynth$results$permutations$MDES_table$ATT + + SEg = NA + if (inf_type == 'permutation') { + SEg = augsynth$results$permutations$MDES_table$SE_gap + pval = augsynth$results$permutations$MDES_table$p_gap + } else if (inf_type == 'permutation_rstat') { + SEg = augsynth$results$permutations$MDES_table$SE_rstat + pval = augsynth$results$permutations$MDES_table$p_rstat + } + out$lb <- out$att + (qnorm(0.025) * SEg) + out$ub <- out$att + (qnorm(0.975) * SEg) + out$p_val <- pval + + out$lb[c(1:t0)] <- NA + out$ub[c(1:t0)] <- NA + out$p_val[c(1:t0)] <- NA + out$alpha <- 0.05 + + return(out) +} + + + +# Methods for interacting with the augsynth object + +#' Print function for augsynth +#' @param x augsynth object +#' @param ... Optional arguments +#' @export +print.augsynth <- function(x, ...) { + augsynth <- x + + ## straight from lm + cat("\nCall:\n", paste(deparse(augsynth$call), sep="\n", collapse="\n"), "\n", sep="") + cat( sprintf( " Fit to %d units and %d time points\n\n", n_unit(augsynth), n_time(augsynth) ) ) + ## print att estimates + tint <- ncol(augsynth$data$X) + ttotal <- tint + ncol(augsynth$data$y) + att_post <- predict(augsynth, att = T)[(tint + 1):ttotal] + + cat(paste("Average ATT Estimate: ", + format(round(mean(att_post),3), nsmall = 3), "\n\n", sep="")) +} + + + +#' Plot function for augsynth +#' @importFrom graphics plot +#' +#' +#' @param augsynth Augsynth or summary.augsynth object to be plotted +#' @param plot_type The stylized plot type to be returned. Options include +#' \itemize{ +#' \item{"estimate"}{The ATT and 95\% confidence interval} +#' \item{"estimate only"}{The ATT without a confidence interval} +#' \item{"outcomes"}{The level of the outcome variable for the treated and synthetic control units.} +#' \item{"outcomes raw average"}{The level of the outcome variable for the treated and synthetic control units, along with the raw average of the donor units.} +#' \item{"placebo"}{The ATTs resulting from placebo tests on the donor units.} } +#' @param cv If True, plot cross validation MSE against hyper-parameter, otherwise plot effects +#' @param inf_type Type of inference algorithm. Inherits inf_type from `object` or otherwise defaults to "conformal". Options are +#' \itemize{ +#' \item{"conformal"}{Conformal inference (default)} +#' \item{"jackknife+"}{Jackknife+ algorithm over time periods} +#' \item{"jackknife"}{Jackknife over units} +#' \item{"permutation"}{Placebo permutation, raw ATT} +#' \item{"permutation_rstat"}{Placebo permutation, RMSPE adjusted ATT} +#' \item{"None"}{Return ATT Estimate only} +#' } +#' @param ... Optional arguments for inference, for more details for each `inf_type` see +#' \itemize{ +#' \item{"conformal"}{`conformal_inf`} +#' \item{"jackknife+"}{`time_jackknife_plus`} +#' \item{"jackknife"}{`jackknife_se_single`} +#' \item{"permutation"}{`permutation_inf`} +#' } +#' @param ... Optional arguments +#' @export +plot.augsynth <- function(augsynth, + cv = FALSE, + plot_type = 'estimate', + inf_type = NULL, ...) { + + plot_augsynth_results( augsynth=augsynth, cv=cv, plot_type=plot_type, inf_type=inf_type, ... ) + +} + + + + +#' Methods for accessing details of augsynth result object (of class augsynth) +#' +#' +#' @param x augsynth result object +#' +#' @rdname augsynth_class +#' +#' @return is.augsynth: TRUE if object is a augsynth object. +#' +#' @export +is.augsynth <- function(x) { + inherits(x, "augsynth") +} + + + +#' +#' @return dim: Dimension of data as pair of (# units, # time points). +#' +#' @rdname augsynth_class +#' @export +#' +dim.augsynth <- function(x, ... ) { + n_unit = length( unique( x$raw_data[[ x$unit_var ]] ) ) + n_time = length( unique( x$raw_data[[ x$time_var ]] ) ) + return( c( n_unit, n_time ) ) +} + + + + +#' +#' @return Single number (of unique units). +#' +#' @rdname synth_class +#' @export +#' +n_unit <- function(x, ... ) { + UseMethod( "n_unit" ) +} + +#' @title Number of time points in fit data +#' +#' @rdname synth_class +#' @return Single number (of unique time points). +#' @export +n_time <- function(x, ... ) { + UseMethod( "n_time" ) +} + + + +#' @title Number of treated units in fit data +#' +#' @rdname synth_class +#' @return Single number (of number of treated units). +#' @export +n_treated <- function(x, ... ) { + UseMethod( "n_treated" ) +} + + + +#' +#' @return Single number (of unique units). +#' +#' @rdname augsynth_class +#' +#' @export +#' +n_unit.augsynth <- function(x, ... ) { + dim.augsynth(x)[[1]] +} + +#' @title Number of time points in augsynth +#' +#' @rdname augsynth_class +#' +#' @return Single number (of unique time points). +#' @export +n_time.augsynth <- function(x, ... ) { + dim.augsynth(x)[[2]] +} + + +#' +#' @rdname augsynth_class +#' +#' @return Number of treated units (always 1 for augsynth) +#' @export +n_treated.augsynth <- function(x, ... ) { + return( 1 ) +} + + +#' RMSPE for treated unit +#' +#' @param augsynth Augsynth object +#' @return RMSPE (Root mean squared predictive error) for the treated unit in pre-treatment era +#' +#' @export +RMSPE <- function( augsynth ) { + stopifnot( is.augsynth(augsynth) ) + + pd = predict( augsynth, att = TRUE ) + sqrt( mean( pd[1:ncol(augsynth$data$X)]^2 ) ) +} + + + + + +#' Get placebo distribution +#' +#' @param augsynth Augsynth object or summery object with permutation inference of some sort. +#' +#' @return Data frame holding the placebo distribution, one row per placebo unit and time point. +#' +#' @export +placebo_distribution <- function( augsynth ) { + inf_type = NA + if ( is.summary.augsynth(augsynth) ) { + inf_type = augsynth$inf_type + } else if ( is.augsynth(augsynth) ) { + augsynth <- summary( augsynth, inf_type = "permutation" ) + inf_type = augsynth$inf_type + } else { + stop( "Object must be an Augsynth object or summary object" ) + } + + if ( !is.null( inf_type ) && inf_type %in% c( "permutation", "permutation_rstat" ) ) { + return( augsynth$permutations$placebo_dist ) + } else { + stop( "Placebo distribution only available for permutation inference" ) + } +} + + + + + + + +#' Return a summary data frame for the treated unit +#' +#' @param augsynth Augsynth object of interest +#' +#' @return Dataframe of information about the treated unit, one row +#' per time point. This includes the measured outcome, predicted +#' outcome from the synthetic unit, the average of all donor units +#' (as reference, called `raw_average`), and the estimated impact +#' (`ATT`), and the r-statistic (ATT divided by RMSPE). +#' +#' @seealso [donor_table()] +#' @export +treated_table <- function(augsynth) { + + if ( is.summary.augsynth( augsynth ) ) { + return( augsynth$treated_table ) + } + + # Calculate the time series of the treated, the synthetic control, + # and the overall donor pool average + trt_index <- which(augsynth$data$trt == 1) + df <- bind_cols(augsynth$data$X, augsynth$data$y) + # synth_unit <- t(df[-trt_index, ]) %*% augsynth$weights + synth_unit <- predict(augsynth) + average_unit <- df[-trt_index, ] %>% colMeans() + treated_unit <- t(df[trt_index, ]) + lvls = tibble( + time = as.numeric( colnames(df) ), + Yobs = as.numeric( treated_unit ), + Yhat = as.numeric( synth_unit ), + raw_average = as.numeric( average_unit ) + ) + + #lvls <- df %>% + # group_by( !!sym(augsynth$time_var ), ever_Tx) %>% + # summarise( Yavg = mean( Yobs ), .groups="drop" ) %>% + # pivot_wider( names_from = ever_Tx, values_from = Yavg ) + #colnames(lvls)[2:3] <- c("raw_average", "Yobs") + + t0 <- ncol(augsynth$data$X) + tpost <- ncol(augsynth$data$y) + lvls$tx = rep( c(0,1), c( t0, tpost ) ) + #lvls$Yhat = predict( augsynth ) + lvls$ATT = lvls$Yobs - lvls$Yhat + lvls$rstat = lvls$ATT / sqrt( mean( lvls$ATT[ lvls$tx == 0 ]^2 ) ) + + lvls <- dplyr::relocate( lvls, + time, tx, Yobs, Yhat, raw_average, ATT, rstat ) + + return( lvls ) +} + + + + diff --git a/R/plotting_methods.R b/R/plotting_methods.R new file mode 100644 index 0000000..f26d471 --- /dev/null +++ b/R/plotting_methods.R @@ -0,0 +1,254 @@ + + + +#' Figure out what kind of quick-summary is needed given the desired +#' plot type. +#' +#' @noRd +get_right_summary <- function( augsynth, plot_type, inf_type ) { + + prior_inf = NULL + if ( !is.null(augsynth$results) ) { + prior_inf = augsynth$results$inf_type + } else { + prior_inf = "none" + } + + if ( plot_type=="estimates only" ) { + inf_type = prior_inf + } else if (plot_type == 'placebo') { + if ( prior_inf %in% c('permutation', 'permutation_rstat') ) { + inf_type = prior_inf + } else if ( is.null( inf_type ) && prior_inf == "none" ) { + # if the user specifies the "placebo" plot type without + # accompanying inference, default to placebo + inf_type = "permutation" + } else if (!inf_type %in% c('permutation', 'permutation_rstat')) { + message('Placebo plots are only available for permutation-based inference. The plot shows results from "inf_type = "permutation""') + inf_type = "permutation" + } + } else if (plot_type %in% c( "outcomes", "outcomes raw average" )) { + if ( is.null( inf_type ) ) { + inf_type = "none" + } + } else if ( is.null( prior_inf ) ) { + inf_type = "conformal" + } + + if ( is.null( inf_type ) || inf_type == "none" ) { + inf_type = prior_inf + } + + inf_type +} + + + + +#' Plot function for augsynth or summary.augsynth objects +#' +#' @importFrom graphics plot +#' +#' @param augsynth augsynth or summary.augsynth object to be plotted +#' @param plot_type The stylized plot type to be returned. Options include +#' \itemize{ +#' \item{"estimate"}{The ATT and 95\% confidence interval} +#' \item{"estimate only"}{The ATT without a confidence interval} +#' \item{"outcomes"}{The level of the outcome variable for the treated and synthetic control units.} +#' \item{"outcomes raw average"}{The level of the outcome variable for the treated and synthetic control units, along with the raw average of the donor units.} +#' \item{"placebo"}{The ATTs resulting from placebo tests on the donor units.} } +#' @param cv If True, plot cross validation MSE against hyper-parameter, otherwise plot effects +#' @param inf_type Type of inference algorithm. Inherits inf_type from `object` or otherwise defaults to "conformal". Options are +#' \itemize{ +#' \item{"conformal"}{Conformal inference (default)} +#' \item{"jackknife+"}{Jackknife+ algorithm over time periods} +#' \item{"jackknife"}{Jackknife over units} +#' \item{"permutation"}{Placebo permutation, raw ATT} +#' \item{"permutation_rstat"}{Placebo permutation, RMSPE adjusted ATT} +#' \item{"None"}{Return ATT Estimate only} +#' } +#' @param ... Optional arguments for inference, for more details for each `inf_type` see +#' \itemize{ +#' \item{"conformal"}{`conformal_inf`} +#' \item{"jackknife+"}{`time_jackknife_plus`} +#' \item{"jackknife"}{`jackknife_se_single`} +#' \item{"permutation"}{`permutation_inf`} +#' } +#' @param ... Optional arguments +plot_augsynth_results <- function( augsynth, + cv = FALSE, + plot_type = 'estimate', + inf_type = NULL, ...) { + + if ( !is.null( inf_type ) ) { + inf_type = tolower(inf_type) + stopifnot( inf_type %in% c('conformal', 'jackknife', 'jackknife+', 'permutation', 'permutation_rstat', 'none')) + } + + # Summarize object if needed. + if ( is.augsynth(augsynth) ) { + it <- get_right_summary(augsynth, plot_type, inf_type) + if ( it != "none" ) { + message("Plotting augsynth objects may slow execution time. For faster results, plot from an augsynth summary object using plot.summary.augsynth()") + } + augsynth = summary(augsynth, inf_type=it) + } + + if (cv == TRUE) { + errors = data.frame(lambdas = augsynth$lambdas, + errors = augsynth$lambda_errors, + errors_se = augsynth$lambda_errors_se) + p <- ggplot2::ggplot(errors, ggplot2::aes(x = lambdas, y = errors)) + + ggplot2::geom_point(size = 2) + + ggplot2::geom_errorbar( + ggplot2::aes(ymin = errors, + ymax = errors + errors_se), + width=0.2, size = 0.5) + p <- p + ggplot2::labs(title = bquote("Cross Validation MSE over " ~ lambda), + x = expression(lambda), y = "Cross Validation MSE", + parse = TRUE) + p <- p + ggplot2::scale_x_log10() + + # find minimum and min + 1se lambda to plot + min_lambda <- choose_lambda(augsynth$lambdas, + augsynth$lambda_errors, + augsynth$lambda_errors_se, + F) + min_1se_lambda <- choose_lambda(augsynth$lambdas, + augsynth$lambda_errors, + augsynth$lambda_errors_se, + T) + min_lambda_index <- which(augsynth$lambdas == min_lambda) + min_1se_lambda_index <- which(augsynth$lambdas == min_1se_lambda) + + p <- p + ggplot2::geom_point( + ggplot2::aes(x = min_lambda, + y = augsynth$lambda_errors[min_lambda_index]), + color = "gold") + p + ggplot2::geom_point( + ggplot2::aes(x = min_1se_lambda, + y = augsynth$lambda_errors[min_1se_lambda_index]), + color = "gold") + + ggplot2::theme_bw() + return(p) + } else if (plot_type == 'estimate only') { + p <- augsynth_plot_from_results(augsynth, ci = FALSE) + } else if (plot_type == 'estimate') { + p <- augsynth_plot_from_results(augsynth, ci = TRUE) + } else if (grepl('placebo', plot_type)) { + p <- permutation_plot(augsynth, inf_type = augsynth$inf_type) + } else if (plot_type == 'outcomes') { + p <- augsynth_outcomes_plot(augsynth, measure = 'synth') + } else if (plot_type == 'outcomes raw average') { + p <- augsynth_outcomes_plot(augsynth, measure = c('synth', 'average')) + } + return(p) +} + + + + + + +#' Plot function for summary function for augsynth +#' +#' @param augsynth Summary object +#' @param ... Optional arguments +#' +#' @noRd +augsynth_plot_from_results <- function(augsynth, + ci = TRUE, + ...) { + + pdat = NA + if ( is.augsynth(augsynth) ) { + pdat <- augsynth$results$att + } else { + # Summary object + pdat <- augsynth$att + } + + p <- pdat %>% + ggplot2::ggplot(ggplot2::aes(x = Time, y = Estimate)) + + if (ci) { + if(all(is.na(pdat$lower_bound))) { + p <- p + ggplot2::geom_ribbon(ggplot2::aes(ymin = Estimate - 2 * Std.Error, + ymax = Estimate + 2 * Std.Error), + alpha = 0.2) + } else { + p <- p + ggplot2::geom_ribbon(ggplot2::aes(ymin = lower_bound, + ymax = upper_bound), + alpha = 0.2) + } + + } + p <- p + ggplot2::geom_line() + + ggplot2::geom_vline(xintercept = augsynth$t_int, lty = 2) + + ggplot2::geom_hline(yintercept = 0, lty = 2) + + ggplot2::labs(x = augsynth$time_var) + + ggplot2::theme_bw() + + return(p) + +} + + + + + + + + +#' Plot the original level of the outcome variable for the treated +#' unit and its synthetic counterfactual +#' +#' @param augsynth Augsynth object or augsynth summary object to be plotted +#' @param measure Whether to plot the synthetic counterfactual or the +#' raw average of donor units. Can list both if desired. +#' +#' @noRd +augsynth_outcomes_plot <- function(augsynth, measure = c("synth", "average")) { + + if (!is.summary.augsynth(augsynth)) { + augsynth <- summary(augsynth) + } + + series = augsynth$treated_table + + all_y = c(series$Yobs, series$Yhat, series$raw_average) + max_y <- max(all_y) + min_y <- min(all_y) + + pt = which( series$tx == 1 )[[1]] + cut_time = (series$time[pt] + series$time[pt + 1]) / 2 + + p <- ggplot2::ggplot( series ) + + ggplot2::geom_line(aes(x = time, y = Yobs, linetype = as.character(augsynth$trt_unit))) + + if ('synth' %in% measure) { + p <- p + + ggplot2::geom_line(aes(x = time, y = Yhat, linetype = 'Synthetic counterfactual')) + } + + if ('average' %in% measure) { + p <- p + + ggplot2::geom_line(aes(x = time, y = raw_average, linetype = 'Donor raw average')) + } + + p <- p + + ggplot2::labs(linetype = NULL, + x = augsynth$time_var, + y = 'Outcome') + + ggplot2::ylim(min_y, max_y) + + ggplot2::theme_bw() + + ggplot2::geom_vline(xintercept = cut_time, linetype = 'dashed') + + ggplot2::theme(legend.position = 'bottom', + legend.key = ggplot2::element_rect(fill = scales::alpha("white", 0.5)), + legend.background = ggplot2::element_rect(fill = scales::alpha("white", 0)) + ) + + return(p) +} + + diff --git a/README.md b/README.md index f7a3736..f992fb4 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,11 @@ ## Overview This package implements the Augmented Synthetic Control Method (ASCM). +In particular, there are three types of ASCM implemented in this package: + +1. **single_augsynth**: The ASCM version of the classic synthetic control approach with a single treated unit. +2. **multisynth**: ASCM that estimates the treatment effect for multiple treated units with staggered adoption. +3. **augsynth_multiout**: ASCM for a single treated unit with multiple outcomes. For a more detailed description of the main functionality check out: - [the vignette for simultaneous adoption](https://github.com/ebenmichael/augsynth/blob/master/vignettes/singlesynth-vignette.md) diff --git a/man/RMSPE.Rd b/man/RMSPE.Rd new file mode 100644 index 0000000..2e6eaaa --- /dev/null +++ b/man/RMSPE.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/permutation.R +\name{RMSPE} +\alias{RMSPE} +\title{RMSPE for treated unit} +\usage{ +RMSPE(augsynth) +} +\arguments{ +\item{augsynth}{Augsynth object} +} +\value{ +RMSPE (Root mean squared predictive error) for the treated unit in pre-treatment era +} +\description{ +RMSPE for treated unit +} diff --git a/man/add_inference.Rd b/man/add_inference.Rd new file mode 100644 index 0000000..8345c4e --- /dev/null +++ b/man/add_inference.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/augsynth.R +\name{add_inference} +\alias{add_inference} +\title{Function to add inference to augsynth object} +\usage{ +add_inference(object, inf_type = "conformal", linear_effect = F, ...) +} +\arguments{ +\item{object}{augsynth object} + +\item{inf_type}{Type of inference algorithm. Options are +\itemize{ + \item{"conformal"}{Conformal inference (default)} + \item{"jackknife+"}{Jackknife+ algorithm over time periods} + \item{"jackknife"}{Jackknife over units} + \item{"permutation"}{Placebo permutation, raw ATT} + \item{"permutation_rstat"}{Placebo permutation, RMSPE adjusted ATT} +}} + +\item{linear_effect}{Boolean, whether to invert the conformal inference hypothesis test to get confidence intervals for a linear-in-time treatment effect: intercept + slope * time} + +\item{...}{Optional arguments for inference, for more details for each `inf_type` see +\itemize{ + \item{"conformal"}{`conformal_inf`} + \item{"jackknife+"}{`time_jackknife_plus`} + \item{"jackknife"}{`jackknife_se_single`} + \item{"permutation"}{`permutation_inf`} +}} +} +\description{ +Function to add inference to augsynth object +} diff --git a/man/augsynth-package.Rd b/man/augsynth-package.Rd index 737cc24..c6fdcfe 100644 --- a/man/augsynth-package.Rd +++ b/man/augsynth-package.Rd @@ -7,3 +7,7 @@ \description{ A package implementing the Augmented Synthetic Controls Method } +\author{ +\strong{Maintainer}: Eli Ben-Michael \email{ebenmichael@berkeley.edu} + +} diff --git a/man/augsynth.Rd b/man/augsynth.Rd index cd5d46a..722fa2a 100644 --- a/man/augsynth.Rd +++ b/man/augsynth.Rd @@ -15,7 +15,8 @@ augsynth(form, unit, time, data, t_int = NULL, ...) \item{data}{Panel data as dataframe} -\item{t_int}{Time of intervention (used for single-period treatment only)} +\item{t_int}{Time of intervention (used for single-period treatment +only)} \item{...}{Optional arguments \itemize{ @@ -23,7 +24,7 @@ augsynth(form, unit, time, data, t_int = NULL, ...) \itemize{ \item{"progfunc"}{What function to use to impute control outcomes: Ridge=Ridge regression (allows for standard errors), None=No outcome model, EN=Elastic Net, RF=Random Forest, GSYN=gSynth, MCP=MCPanel, CITS=CITS, CausalImpact=Bayesian structural time series with CausalImpact, seq2seq=Sequence to sequence learning with feedforward nets} \item{"scm"}{Whether the SCM weighting function is used} - \item{"fixedeff"}{Whether to include a unit fixed effect, default F } + \item{"fixedeff"}{Whether to include a unit fixed effect, default is FALSE } \item{"cov_agg"}{Covariate aggregation functions, if NULL then use mean with NAs omitted} } \item Multi period (staggered) augsynth @@ -39,12 +40,18 @@ augsynth(form, unit, time, data, t_int = NULL, ...) }} } \value{ -augsynth object that contains: +augsynth or multisynth object (depending on dispatch) that contains (among other things): \itemize{ \item{"weights"}{weights} \item{"data"}{Panel data as matrices} } } \description{ -Fit Augmented SCM +The general `augsynth()` method dispatches to `single_augsynth`, +`augsynth_multiout`, or `multisynth` based on the number of +outcomes and treatment times. See documentation for these methods +for further detail. +} +\seealso{ +`single_augsynth`, `augsynth_multiout`, `multisynth` } diff --git a/man/augsynth_class.Rd b/man/augsynth_class.Rd new file mode 100644 index 0000000..9d69709 --- /dev/null +++ b/man/augsynth_class.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/permutation.R +\name{is.augsynth} +\alias{is.augsynth} +\alias{dim.augsynth} +\alias{n_unit.augsynth} +\alias{n_time.augsynth} +\alias{n_treated.augsynth} +\title{Methods for accessing details of augsynth result object (of class augsynth)} +\usage{ +is.augsynth(x) + +\method{dim}{augsynth}(x, ...) + +\method{n_unit}{augsynth}(x, ...) + +\method{n_time}{augsynth}(x, ...) + +\method{n_treated}{augsynth}(x, ...) +} +\arguments{ +\item{x}{augsynth result object} +} +\value{ +is.augsynth: TRUE if object is a augsynth object. + +dim: Dimension of data as pair of (# units, # time points). + +Single number (of unique units). + +Single number (of unique time points). + +Number of treated units (always 1 for augsynth) +} +\description{ +Methods for accessing details of augsynth result object (of class augsynth) + +Number of time points in augsynth +} diff --git a/man/bootstrap_methods.Rd b/man/bootstrap_methods.Rd new file mode 100644 index 0000000..d6946ae --- /dev/null +++ b/man/bootstrap_methods.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{bootstrap_methods} +\alias{bootstrap_methods} +\alias{rdirichlet_b} +\alias{rmultinom_b} +\alias{rwild_b} +\title{Bootstrap Functions} +\usage{ +rdirichlet_b(n) + +rmultinom_b(n) + +rwild_b(n) +} +\arguments{ +\item{n}{Number of units} +} +\description{ +There are several helper functions used for bootstrap inference. +Each method returns a list of selection weights of length n, +which weights that sum to n. +} +\section{Functions}{ +\itemize{ +\item \code{rdirichlet_b()}: Bayesian bootstrap + +\item \code{rmultinom_b()}: Non-parametric bootstrap + +\item \code{rwild_b()}: Wild bootstrap (Mammen 1993) + +}} diff --git a/man/covariate_balance_table.Rd b/man/covariate_balance_table.Rd new file mode 100644 index 0000000..ce4b531 --- /dev/null +++ b/man/covariate_balance_table.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/calc_covariate_balance.R +\name{covariate_balance_table} +\alias{covariate_balance_table} +\title{Make covariate balance table.} +\usage{ +covariate_balance_table(ascm, pre_period = NULL) +} +\arguments{ +\item{ascm}{An augsynth result object from single_augsynth} + +\item{pre_period}{List of names of the pre-period timepoints to +calculate balance for. NULL means none.} +} +\description{ +Make a table comparing means of covariates in the treated group, +the raw control group, and the new weighted control group (the +synthetic control) +} diff --git a/man/donor_table.Rd b/man/donor_table.Rd new file mode 100644 index 0000000..3fdfa47 --- /dev/null +++ b/man/donor_table.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/donor_control.R +\name{donor_table} +\alias{donor_table} +\title{Return a summary data frame donor units used in the model with +their synthetic weights.} +\usage{ +donor_table(augsynth, include_RMSPE = TRUE, zap_weights = 1e-07) +} +\arguments{ +\item{augsynth}{Augsynth object to be plotted} + +\item{include_RMSPE}{Include RMSPEs in the table even if +permutation inference has not yet been conducted.} + +\item{zap_weights}{all weights smaller than this value will be set +to zero. Set to NULL to keep all weights.} +} +\description{ +If permutation inference has been conducted, table will also +include RMSPEs. This can be forced with include_RMSPE flag. +} +\details{ +If the augsynth object does not have permutation-based inference +results, the function will call that form of inference, in order to +calculate the RMSPEs for each donor unit in turn. +} diff --git a/man/make_V_matrix.Rd b/man/make_V_matrix.Rd deleted file mode 100644 index 30934d6..0000000 --- a/man/make_V_matrix.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fit_synth.R -\name{make_V_matrix} -\alias{make_V_matrix} -\title{Make a V matrix from a vector (or null)} -\usage{ -make_V_matrix(t0, V) -} -\description{ -Make a V matrix from a vector (or null) -} diff --git a/man/multisynth_class.Rd b/man/multisynth_class.Rd new file mode 100644 index 0000000..09719c5 --- /dev/null +++ b/man/multisynth_class.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multisynth_class.R +\name{dim.multisynth} +\alias{dim.multisynth} +\alias{n_unit.multisynth} +\alias{n_time.multisynth} +\alias{n_treated.multisynth} +\title{Number of units in multisynth} +\usage{ +\method{dim}{multisynth}(x, ...) + +\method{n_unit}{multisynth}(x, ...) + +\method{n_time}{multisynth}(x, ...) + +\method{n_treated}{multisynth}(x, ...) +} +\value{ +dim: Dimension of data as pair of (# units, # time points). + +Single number (of unique units). + +Single number (of unique time points). + +Number of treated units (always 1 for multisynth) +} +\description{ +Number of units in multisynth + +Number of time points in multisynth +} diff --git a/man/permutation_plot.Rd b/man/permutation_plot.Rd new file mode 100644 index 0000000..2f1dbab --- /dev/null +++ b/man/permutation_plot.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/permutation.R +\name{permutation_plot} +\alias{permutation_plot} +\title{Generate permutation plots} +\usage{ +permutation_plot(augsynth, inf_type = "permutation") +} +\arguments{ +\item{inf_type}{Inference type (takes a value of 'permutation' or +'permutation_rstat') Type of inference algorithm. Inherits +inf_type from `object` or otherwise defaults to "conformal". +Options are + \itemize{ + \item{"If numeric"}{A multiple of the treated unit's RMSPE + above which donor units will be dropped} + \item{"If a character"}{The name or names of donor units to + be dropped based on the `unit` parameter + in the augsynth model} + }} + +\item{results}{Results from calling augsynth()} +} +\description{ +Generate permutation plots +} diff --git a/man/placebo_distribution.Rd b/man/placebo_distribution.Rd new file mode 100644 index 0000000..ebcab05 --- /dev/null +++ b/man/placebo_distribution.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/permutation.R +\name{placebo_distribution} +\alias{placebo_distribution} +\title{Get placebo distribution} +\usage{ +placebo_distribution(augsynth) +} +\arguments{ +\item{augsynth}{Augsynth object or summery object with permutation inference of some sort.} +} +\value{ +Data frame holding the placebo distribution, one row per placebo unit and time point. +} +\description{ +Get placebo distribution +} diff --git a/man/plot.augsynth.Rd b/man/plot.augsynth.Rd index 0c8f052..156ae5c 100644 --- a/man/plot.augsynth.Rd +++ b/man/plot.augsynth.Rd @@ -1,18 +1,34 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/augsynth.R +% Please edit documentation in R/permutation.R \name{plot.augsynth} \alias{plot.augsynth} \title{Plot function for augsynth} \usage{ -\method{plot}{augsynth}(x, inf = T, cv = F, ...) +\method{plot}{augsynth}(augsynth, cv = FALSE, plot_type = "estimate", inf_type = NULL, ...) } \arguments{ -\item{x}{Augsynth object to be plotted} - -\item{inf}{Boolean, whether to get confidence intervals around the point estimates} +\item{augsynth}{Augsynth or summary.augsynth object to be plotted} \item{cv}{If True, plot cross validation MSE against hyper-parameter, otherwise plot effects} +\item{plot_type}{The stylized plot type to be returned. Options include +\itemize{ + \item{"estimate"}{The ATT and 95\% confidence interval} + \item{"estimate only"}{The ATT without a confidence interval} + \item{"outcomes"}{The level of the outcome variable for the treated and synthetic control units.} + \item{"outcomes raw average"}{The level of the outcome variable for the treated and synthetic control units, along with the raw average of the donor units.} + \item{"placebo"}{The ATTs resulting from placebo tests on the donor units.} }} + +\item{inf_type}{Type of inference algorithm. Inherits inf_type from `object` or otherwise defaults to "conformal". Options are +\itemize{ + \item{"conformal"}{Conformal inference (default)} + \item{"jackknife+"}{Jackknife+ algorithm over time periods} + \item{"jackknife"}{Jackknife over units} + \item{"permutation"}{Placebo permutation, raw ATT} + \item{"permutation_rstat"}{Placebo permutation, RMSPE adjusted ATT} + \item{"None"}{Return ATT Estimate only} +}} + \item{...}{Optional arguments} } \description{ diff --git a/man/plot.summary.augsynth.Rd b/man/plot.summary.augsynth.Rd index 5d04bab..8b86ca3 100644 --- a/man/plot.summary.augsynth.Rd +++ b/man/plot.summary.augsynth.Rd @@ -2,17 +2,28 @@ % Please edit documentation in R/augsynth.R \name{plot.summary.augsynth} \alias{plot.summary.augsynth} -\title{Plot function for summary function for augsynth} +\title{Plot results from summarized augsynth} \usage{ -\method{plot}{summary.augsynth}(x, inf = T, ...) +\method{plot}{summary.augsynth}(x, plot_type = "estimate", ...) } \arguments{ -\item{x}{Summary object} +\item{x}{summary.augsynth object} -\item{inf}{Boolean, whether to plot confidence intervals} +\item{plot_type}{The stylized plot type to be returned. Options include +\itemize{ + \item{"estimate"}{The ATT and 95\% confidence interval} + \item{"estimate only"}{The ATT without a confidence interval} + \item{"outcomes"}{The level of the outcome variable for the treated and synthetic control units.} + \item{"outcomes raw average"}{The level of the outcome variable for the treated and synthetic control units, along with the raw average of the donor units.} + \item{"placebo"}{The ATTs resulting from placebo tests on the donor units.} }} \item{...}{Optional arguments} } \description{ -Plot function for summary function for augsynth +Make a variety of plots, depending on plot_type. Default is to +plot impacts with associated uncertainty (if present). Other +options are estimates with no uncertainty ("estimate only"), the +level of outcome ("outcomes"), level of outcomes with the raw trend +as well ("outcomes raw average"), and the classic spagetti plot +("placebo"). } diff --git a/man/plot_augsynth_results.Rd b/man/plot_augsynth_results.Rd new file mode 100644 index 0000000..f67074f --- /dev/null +++ b/man/plot_augsynth_results.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting_methods.R +\name{plot_augsynth_results} +\alias{plot_augsynth_results} +\title{Plot function for augsynth or summary.augsynth objects} +\usage{ +plot_augsynth_results( + augsynth, + cv = FALSE, + plot_type = "estimate", + inf_type = NULL, + ... +) +} +\arguments{ +\item{augsynth}{augsynth or summary.augsynth object to be plotted} + +\item{cv}{If True, plot cross validation MSE against hyper-parameter, otherwise plot effects} + +\item{plot_type}{The stylized plot type to be returned. Options include +\itemize{ + \item{"estimate"}{The ATT and 95\% confidence interval} + \item{"estimate only"}{The ATT without a confidence interval} + \item{"outcomes"}{The level of the outcome variable for the treated and synthetic control units.} + \item{"outcomes raw average"}{The level of the outcome variable for the treated and synthetic control units, along with the raw average of the donor units.} + \item{"placebo"}{The ATTs resulting from placebo tests on the donor units.} }} + +\item{inf_type}{Type of inference algorithm. Inherits inf_type from `object` or otherwise defaults to "conformal". Options are +\itemize{ + \item{"conformal"}{Conformal inference (default)} + \item{"jackknife+"}{Jackknife+ algorithm over time periods} + \item{"jackknife"}{Jackknife over units} + \item{"permutation"}{Placebo permutation, raw ATT} + \item{"permutation_rstat"}{Placebo permutation, RMSPE adjusted ATT} + \item{"None"}{Return ATT Estimate only} +}} + +\item{...}{Optional arguments} +} +\description{ +Plot function for augsynth or summary.augsynth objects +} diff --git a/man/print.augsynth.Rd b/man/print.augsynth.Rd index 8686c2c..746da2f 100644 --- a/man/print.augsynth.Rd +++ b/man/print.augsynth.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/augsynth.R +% Please edit documentation in R/permutation.R \name{print.augsynth} \alias{print.augsynth} \title{Print function for augsynth} diff --git a/man/rdirichlet_b.Rd b/man/rdirichlet_b.Rd deleted file mode 100644 index 1a498da..0000000 --- a/man/rdirichlet_b.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/inference.R -\name{rdirichlet_b} -\alias{rdirichlet_b} -\title{Bayesian bootstrap} -\usage{ -rdirichlet_b(n) -} -\arguments{ -\item{n}{Number of units} -} -\description{ -Bayesian bootstrap -} diff --git a/man/rmultinom_b.Rd b/man/rmultinom_b.Rd deleted file mode 100644 index d6bf719..0000000 --- a/man/rmultinom_b.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/inference.R -\name{rmultinom_b} -\alias{rmultinom_b} -\title{Non-parametric bootstrap} -\usage{ -rmultinom_b(n) -} -\arguments{ -\item{n}{Number of units} -} -\description{ -Non-parametric bootstrap -} diff --git a/man/rwild_b.Rd b/man/rwild_b.Rd deleted file mode 100644 index 6503f78..0000000 --- a/man/rwild_b.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/inference.R -\name{rwild_b} -\alias{rwild_b} -\title{Wild bootstrap (Mammen 1993)} -\usage{ -rwild_b(n) -} -\arguments{ -\item{n}{Number of units} -} -\description{ -Wild bootstrap (Mammen 1993) -} diff --git a/man/single_augsynth.Rd b/man/single_augsynth.Rd index 85654d1..4c6b7bf 100644 --- a/man/single_augsynth.Rd +++ b/man/single_augsynth.Rd @@ -32,17 +32,19 @@ single_augsynth( ridge=Ridge regression (allows for standard errors), none=No outcome model, en=Elastic Net, RF=Random Forest, GSYN=gSynth, -mcp=MCPanel, +mcp=MCPanel, cits=Comparitive Interuppted Time Series causalimpact=Bayesian structural time series with CausalImpact} -\item{scm}{Whether the SCM weighting function is used} +\item{scm}{Whether the SCM weighting function is used. If FALSE, then package will fit the outcome model, but not calculate new donor weights to match pre-treatment covariates. Instead, each donor unit will be equally weighted. If TRUE, weights on donor pool will be calculated.} \item{fixedeff}{Whether to include a unit fixed effect, default F} \item{cov_agg}{Covariate aggregation functions, if NULL then use mean with NAs omitted} \item{...}{optional arguments for outcome model} + +\item{plot}{Whether or not to return a plot of the augsynth model} } \value{ augsynth object that contains: diff --git a/man/summary.augsynth.Rd b/man/summary.augsynth.Rd index 8d1c7cd..9e0f62f 100644 --- a/man/summary.augsynth.Rd +++ b/man/summary.augsynth.Rd @@ -4,29 +4,35 @@ \alias{summary.augsynth} \title{Summary function for augsynth} \usage{ -\method{summary}{augsynth}(object, inf = T, inf_type = "conformal", linear_effect = F, ...) +\method{summary}{augsynth}(object, inf_type = "conformal", ...) } \arguments{ \item{object}{augsynth object} -\item{inf}{Boolean, whether to get confidence intervals around the point estimates} +\item{inf_type}{Type of inference algorithm. If left NULL, inherits +inf_type from `object` or otherwise defaults to "conformal." +Options are + \itemize{ + \item{"conformal"}{Conformal inference (default)} + \item{"jackknife+"}{Jackknife+ algorithm over time periods} + \item{"jackknife"}{Jackknife over units} + \item{"permutation"}{Placebo permutation, raw ATT} + \item{"permutation_rstat"}{Placebo permutation, RMSPE adjusted ATT} + }} -\item{inf_type}{Type of inference algorithm. Options are -\itemize{ - \item{"conformal"}{Conformal inference (default)} - \item{"jackknife+"}{Jackknife+ algorithm over time periods} - \item{"jackknife"}{Jackknife over units} -}} - -\item{linear_effect}{Boolean, whether to invert the conformal inference hypothesis test to get confidence intervals for a linear-in-time treatment effect: intercept + slope * time} - -\item{...}{Optional arguments for inference, for more details for each `inf_type` see -\itemize{ - \item{"conformal"}{`conformal_inf`} - \item{"jackknife+"}{`time_jackknife_plus`} - \item{"jackknife"}{`jackknife_se_single`} -}} +\item{...}{Optional arguments for inference, for more details for +each `inf_type` see + \itemize{ + \item{"conformal"}{`conformal_inf`} + \item{"jackknife+"}{`time_jackknife_plus`} + \item{"jackknife"}{`jackknife_se_single`} + \item{"permutation", "permutation_rstat"}{`permutation_inf`} + }} } \description{ -Summary function for augsynth +Summary summarizes an augsynth result by (usually) adding an +inferential result, if that has not been calculated already, and +calculating a few other summary statistics such as estimated bias. +This method does this via `add_inference()`, if inference is +needed. } diff --git a/man/summary.augsynth_class.Rd b/man/summary.augsynth_class.Rd new file mode 100644 index 0000000..ebc852e --- /dev/null +++ b/man/summary.augsynth_class.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/augsynth.R +\name{is.summary.augsynth} +\alias{is.summary.augsynth} +\title{Methods for accessing details of summary.augsynth object} +\usage{ +is.summary.augsynth(x) +} +\arguments{ +\item{x}{summary.augsynth result object} +} +\value{ +is.summary.augsynth: TRUE if object is a augsynth object. +} +\description{ +Methods for accessing details of summary.augsynth object +} diff --git a/man/synth_class.Rd b/man/synth_class.Rd new file mode 100644 index 0000000..246bb5b --- /dev/null +++ b/man/synth_class.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/permutation.R +\name{n_unit} +\alias{n_unit} +\alias{n_time} +\alias{n_treated} +\title{Number of time points in fit data} +\usage{ +n_unit(x, ...) + +n_time(x, ...) + +n_treated(x, ...) +} +\value{ +Single number (of unique units). + +Single number (of unique time points). + +Single number (of number of treated units). +} +\description{ +Number of time points in fit data + +Number of treated units in fit data +} diff --git a/man/treated_table.Rd b/man/treated_table.Rd new file mode 100644 index 0000000..95d2b0e --- /dev/null +++ b/man/treated_table.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/permutation.R +\name{treated_table} +\alias{treated_table} +\title{Return a summary data frame for the treated unit} +\usage{ +treated_table(augsynth) +} +\arguments{ +\item{augsynth}{Augsynth object of interest} +} +\value{ +Dataframe of information about the treated unit, one row + per time point. This includes the measured outcome, predicted + outcome from the synthetic unit, the average of all donor units + (as reference, called `raw_average`), and the estimated impact + (`ATT`), and the r-statistic (ATT divided by RMSPE). +} +\description{ +Return a summary data frame for the treated unit +} +\seealso{ +[donor_table()] +} diff --git a/man/update_augsynth.Rd b/man/update_augsynth.Rd new file mode 100644 index 0000000..5d1b3ed --- /dev/null +++ b/man/update_augsynth.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/donor_control.R +\name{update_augsynth} +\alias{update_augsynth} +\title{Return a new augsynth object with specified donor units removed} +\usage{ +update_augsynth(augsynth, drop = 20) +} +\arguments{ +\item{augsynth}{Augsynth object to be plotted} + +\item{drop}{Drop donor units, based on pre-treatment RMSPE or unit +name(s). Default of 20 means drop units with an RMSPE 20x higher +than the treated unit. The `drop` parameter can also be a character vector of unit +IDs to drop.} +} +\description{ +Return a new augsynth object with specified donor units removed +} diff --git a/pkg.Rproj b/pkg.Rproj index d848a9f..bfa3107 100644 --- a/pkg.Rproj +++ b/pkg.Rproj @@ -5,8 +5,13 @@ SaveWorkspace: No AlwaysSaveHistory: Default EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 4 Encoding: UTF-8 +RnwWeave: Sweave +LaTeX: pdfLaTeX + AutoAppendNewline: Yes StripTrailingWhitespace: Yes diff --git a/tests/testthat/test-augsynth_plotting.R b/tests/testthat/test-augsynth_plotting.R new file mode 100644 index 0000000..d5862bc --- /dev/null +++ b/tests/testthat/test-augsynth_plotting.R @@ -0,0 +1,98 @@ +library( tidyverse ) +data(basque, package = "Synth") + +context("Test plotting features of single augsynth objects") + +basque <- basque %>% mutate(trt = case_when(year < 1975 ~ 0, + regionno != 17 ~ 0, + regionno == 17 ~ 1)) %>% + filter(regionno != 1) + +syn <- augsynth(gdpcap ~ trt, regionno, year, basque, progfunc = "Ridge", scm = T) + +test_that("All plot types for plot.summary.augsynth() are working", { + + for (it in c('conformal', 'jackknife', 'jackknife+', 'permutation', 'permutation_rstat')){ + s_syn <- summary(syn, inf_type = it) + for (pt in c('estimate only', 'estimate', 'placebo', 'outcomes', 'outcomes raw average')) { + if((!it %in% c('permutation', 'permutation_rstat')) & (pt == 'placebo')) { + expect_error(plot(s_syn, plot_type = pt)) + } else { + p <- plot(s_syn, plot_type = pt) + expect_true('ggplot' %in% class(p)) + } + } + } + +}) + +test_that("All plot types for plot.augsynth() are working", { + + for (it in c('conformal', 'jackknife', 'jackknife+', 'permutation', 'permutation_rstat')){ + for (pt in c('estimate only', 'estimate', 'placebo', 'outcomes', 'outcomes raw average')) { + p <- plot(syn, plot_type = pt, inf_type = it) + expect_true('ggplot' %in% class(p)) + } + } + +}) + + +test_that("General plotting functions do not crash", { + + syn <- single_augsynth(gdpcap ~ trt, regionno, year, 1975, basque, + progfunc="None", scm=T, fixedeff = F) + + # Testing some of the permutation plotting functions + tst_plt <- augsynth:::permutation_plot( syn, inf_type = "permutation_rstat" ) + expect_true( is.ggplot( tst_plt ) ) + + sum <- summary( syn, inf_type = "permutation" ) + expect_equal( sum$inf_type, "permutation" ) + + # Check that it will add inference on the fly + expect_message( auto_add_inf <- plot( syn, inf_type = "permutation_rstat" ) ) + expect_true( is.ggplot( auto_add_inf ) ) + + plt <- augsynth:::augsynth_outcomes_plot( sum ) + expect_true( is.ggplot( plt ) ) + + # use testthat to check print.augsynth.summary writes to console + expect_output( print( sum ), "Permutation inference" ) + + p0 <- plot( sum ) + p <- plot( sum, plot_type = "estimate" ) + expect_equal( p0, p ) + + pEO <- plot( sum, plot_type = "estimate only" ) + expect_message( + pEO1 <- plot( syn, plot_type = "estimate only", inf_type = "permutation" ) + ) + expect_equal( pEO, pEO1 ) + + po <- plot( sum, plot_type = "outcomes" ) + po1 = plot( syn, plot_type = "outcomes", inf_type = "permutation" ) + expect_equal( po, po1 ) + + praw <- plot( sum, plot_type = "outcomes raw average" ) + praw1 <- plot( syn, plot_type = "outcomes raw average", inf_type="permutation" ) + expect_equal( praw$data, praw1$data ) + + ppla <- plot( sum, plot_type = "placebo" ) + ppla1 <- plot( syn, plot_type = "placebo", inf_type="permutation" ) + expect_equal( ppla, ppla1 ) + + # These two plots should be the same + p2 <- plot( syn ) + expect_true( is.ggplot(p2 ) ) + + sump <- summary( syn ) + p3 <- plot( sump ) + expect_true( is.ggplot( p3 ) ) + + expect_output( print( sump ), "Conformal inference" ) + + dt = sump$donor_table + expect_equal( sum( dt$weight ), 1, tolerance = 0.000001 ) + +}) diff --git a/tests/testthat/test-donor_control.R b/tests/testthat/test-donor_control.R new file mode 100644 index 0000000..0432cdc --- /dev/null +++ b/tests/testthat/test-donor_control.R @@ -0,0 +1,116 @@ + + +set.seed(7393) +dat = augsynth:::make_synth_data( n_time = 10, n_U = 5, N = 12, long_form = TRUE, tx_impact = 2, tx_shift = 1 ) +# ggplot( dat, aes( time, Y, group=ID ) ) + geom_line() + +syn = augsynth( Y ~ Tx | X1, unit = ID, time = time, data = dat, fixedeff = TRUE, progfunc = "none" ) + +test_that("Donor control - high RMSPE multiple doesn't drop any units", { + syn2 <- update_augsynth(syn, drop = 1000) + expect_equal(nrow(donor_table(syn2)), 11) +}) + + + +test_that("Donor control drops the correct units based on RMSPE", { + + ssyn = summary( syn, inf_type="permutation" ) + if ( FALSE ) { + plot( ssyn, "outcomes raw average" ) + } + treated_table(syn) + ssyn + + dtable <- donor_table(syn) + dtable + nrow( dtable ) + trt_RMSPE <- add_inference(syn, inf_type = 'permutation')$results$permutations$placebo_dist %>% + filter(time < syn$t_int) %>% + filter(ID == 1) %>% + pull(RMSPE) %>% unique() + + drop_factor = 1.0 + drop_units <- dtable %>% + filter(RMSPE > trt_RMSPE * drop_factor) %>% + pull(ID) + drop_units + + syn3 <- update_augsynth(syn, drop = drop_factor) + d3 <- donor_table(syn3) + expect_true( all( !( drop_units %in% d3$ID ) ) ) + + # TODO: Why should the new donor table have 8 rows? I dropped + # that check in the following: + + #criteria <- (nrow(donor_table(syn3)) == 8) & (drop_units %in% $ID %>% all() == FALSE) + #expect_true(criteria) + + + # Check that we get the same model parameters from dropping units manually + dat_tmp <- dat %>% filter(!ID %in% drop_units) + syn_manual <- augsynth( Y ~ Tx | X1, unit = ID, time = time, data = dat_tmp, + fixedeff = TRUE, progfunc = "none" ) + s_syn_manual <- summary(syn_manual) + s_syn3 <- summary(syn3) + + expect_equal( s_syn_manual$att$Estimate, s_syn3$att$Estimate ) + + expect_equal( nrow(donor_table(syn_manual)), nrow(donor_table(syn3))) + + + # Check that dropping useless donors doesn't change anything + dd = setdiff( as.character( 1:12 ), dtable$ID ) + synX = update_augsynth( syn, drop = dd ) + expect_equal( syn$att$Estimate, synX$att$Estimate ) + expect_equal( nrow(donor_table(syn)), nrow(donor_table(synX))) + +}) + + + +test_that("Donor control drops the correct units based on unit names", { + drop_units <- c('2', '3', '4') + + syn4 <- update_augsynth(syn, drop = drop_units) + + expect_true( all( ! (drop_units %in% donor_table(syn4)$ID ) ) ) + + + # Check that we get the same model parameters from dropping units manually + dat_tmp <- dat %>% filter(!ID %in% drop_units) + syn_manual <- augsynth( Y ~ Tx | X1, unit = ID, time = time, + data = dat_tmp, fixedeff = TRUE, + progfunc = "none" ) + s_syn_manual <- summary(syn_manual) + s_syn4 <- summary(syn4) + + expect_equal( s_syn_manual$att$Estimate, s_syn4$att$Estimate ) + expect_equal( nrow(donor_table(syn_manual)), nrow(donor_table(syn4))) + + +}) + + +test_that("`update_augsynth` returns the same basic SCM in cases when updates are not applied", { + + syn1 <- augsynth(lngdpcapita ~ treated, fips, year_qtr, kansas, + progfunc = "None", scm = T) + syn2 <- update_augsynth(syn1, Inf) # dropping based on RMSPE multiple + syn3 <- update_augsynth(syn1, "") # dropping based on unit names + + # test that weights are the same + expect_equal(syn2$weights, syn3$weights) + expect_equal(syn1$weights, syn2$weights) + + # test that ATTs are the same + summ1 <- summary(syn1) + summ2 <- summary(syn2) + summ3 <- summary(syn3) + expect_equal(summ2$att['Estimate'], summ3$att['Estimate']) + expect_equal(summ1$att['Estimate'], summ2$att['Estimate']) +}) + + + + diff --git a/tests/testthat/test-permutation_inference.R b/tests/testthat/test-permutation_inference.R new file mode 100644 index 0000000..083a545 --- /dev/null +++ b/tests/testthat/test-permutation_inference.R @@ -0,0 +1,91 @@ + + +library( tidyverse ) + +data(basque, package="Synth") + +head( basque) + +#basque <- basque %>% +# dplyr::select( regionno, year, gdpcap ) %>% +# as_tibble() + +basque <- basque %>% + mutate(trt = case_when(year < 1975 ~ 0, + regionno != 17 ~ 0, + regionno == 17 ~ 1)) %>% + filter(regionno != 1) %>% + dplyr::select( -c( sec.agriculture:invest ) ) + +head( basque ) + +table( basque$trt, basque$regionno ) + + + +test_that( "MDES_table corresponds to default treatment table", { + + syn <- augsynth(gdpcap ~ trt, regionno, year, + data=basque, scm = TRUE) + + summ <- summary(syn, inf_type = 'permutation') + + mm <- summ$permutations$MDES_table[1:7] %>% + select( sort( names(.))) + mm + tt <- treated_table(syn) %>% + select( sort( names(.))) + tt + + expect_equal( as.data.frame(tt)[c("ATT","raw_average","Yhat", "tx")], + as.data.frame(mm)[c("ATT","raw_average","Yhat", "tx")] ) +} ) + +test_that( "Placebo distribtion works", { + + syn <- augsynth(gdpcap ~ trt, regionno, year, + data=basque, scm = TRUE, progfunc= "none" ) + tt <- treated_table(syn) + tt + + donor_table(syn) + + b2 = basque %>% + mutate( trt = 0 + (regionno == 7 ) * (year>=1975) ) + syn7 <- augsynth(gdpcap ~ trt, regionno, year, + data=b2, scm = TRUE, progfunc= "none") + + # These should be the same? Or close to the same? + # (They were not under ridge, probably due to the cross-validation procedure.) + gaps7 <- augsynth:::get_placebo_gaps(syn7) + g17 <- gaps7 %>% + dplyr::filter( ID == 17 ) %>% + pivot_longer( cols = `1955`:`1997`, + names_to = "year", + values_to = "est" ) + expect_equal( tt$ATT - g17$est, rep(0,nrow(tt)), tolerance = 0.000001 ) + + #as.numeric( gaps[ gaps$ID == 10, -1 ] ) + #- as.numeric( gaps7[ gaps7$ID == 10, -1 ] ) + + n_unit = length(unique(basque$regionno)) + expect_equal( nrow(gaps7), n_unit ) + n_year = length(unique(basque$year)) + + expect_equal( ncol(gaps7), 1 + n_year ) + expect_equal( dim( syn ), c( n_unit, n_year ) ) + + pc <- augsynth:::add_placebo_distribution( syn ) + rs <- pc$results$permutations + expect_equal( names(rs), c("placebo_dist", "MDES_table") ) + expect_equal( nrow( rs$placebo_dist ), n_year * n_unit ) + + expect_equal( syn$unit_var, "regionno" ) + expect_equal( syn$time_var, "year" ) +}) + + + + + + diff --git a/tests/testthat/test-summary.augsynth.R b/tests/testthat/test-summary.augsynth.R new file mode 100644 index 0000000..44e17be --- /dev/null +++ b/tests/testthat/test-summary.augsynth.R @@ -0,0 +1,45 @@ +library( augsynth ) +set.seed( 4040440 ) + +n_units <- 12 +n_time <- 10 + +dat = augsynth:::make_synth_data( n_time = n_time, n_U = 5, N = n_units, long_form = TRUE, tx_impact = 2, tx_shift = 1 ) +syn = augsynth( Y ~ Tx | X1 + X2 + X3 + X4 + X5, unit = ID, time = time, data = dat, progfunc="none") + +test_that( "covariate table works", { + cov = covariate_balance_table( syn ) + cov + expect_true( all( paste0("X", 1:5 ) %in% cov$variable ) ) + + expect_output( print( cov ), "variable.*Tx.*Co.*Raw") + + c2 = covariate_balance_table( syn, pre_period = c("1", "2") ) + c2 + aa <- dat$Y[ dat$ever_Tx & dat$time %in% c("1", "2") ] + expect_equal( c2$Tx[ c2$variable == "2" ], aa[[2]] ) + expect_equal( c2$Tx[ c2$variable == "1" ], aa[[1]] ) +}) + + +test_that("summary.augsynth works", { + + expect_output( print( syn ), glue::glue("Fit to {n_units} units and {n_time} time points.*Average ATT Estimate") ) + + sum = summary( syn ) + + # get donor table, add up number of units without weights and then check that the number in the summary is the same as that + n_donor <- sum$donor_table %>% filter(weight > 0) %>% nrow() + expect_output( print( sum ), glue::glue("{n_donor} donor units used with weights.")) + + s2 = summary( syn, inf_type = "jackknife+" ) + n_donor2 <- s2$donor_table %>% filter(weight > 0) %>% nrow() + expect_output( print( s2 ), + glue::glue("{n_donor2} donor units used with weights.*Avg Estimated Bias: .*Jackknife\\+ over time periods")) + + + s3 = summary( syn, inf_type = "none" ) + expect_output( print( s3 ), + "Inference type: None" ) + +}) diff --git a/tests/testthat/test_format.R b/tests/testthat/test_format.R index 58796e7..5ee6ff3 100644 --- a/tests/testthat/test_format.R +++ b/tests/testthat/test_format.R @@ -6,9 +6,9 @@ basque <- basque %>% mutate(trt = case_when(year < 1975 ~ 0, regionno != 17 ~0, regionno == 17 ~ 1)) %>% filter(regionno != 1) - + test_that("format_data creates matrices with the right dimensions", { - + dat <- format_data(quo(gdpcap), quo(trt), quo(regionno), quo(year),1975, basque) test_dim <- function(obj, d) { @@ -23,7 +23,7 @@ test_that("format_data creates matrices with the right dimensions", { test_that("format_synth creates matrices with the right dimensions", { - + dat <- format_data(quo(gdpcap), quo(trt), quo(regionno), quo(year),1975, basque) syn_dat <- format_synth(dat$X, dat$trt, dat$y) test_dim <- function(obj, d) { @@ -115,4 +115,4 @@ test_that("augsynth exits with error message if there are no never treated units TRUE ~ 0)) expect_error(augsynth(gdpcap ~ trt, regionno, year, basque2), "1996") - }) \ No newline at end of file + }) diff --git a/tests/testthat/test_general.R b/tests/testthat/test_general.R index ccacb36..fdffd93 100644 --- a/tests/testthat/test_general.R +++ b/tests/testthat/test_general.R @@ -1,5 +1,7 @@ + context("Generally testing the workflow for augsynth") +library( tidyverse ) library(Synth) data(basque) @@ -8,20 +10,47 @@ basque <- basque %>% mutate(trt = case_when(year < 1975 ~ 0, regionno == 17 ~ 1)) %>% filter(regionno != 1) +# fake_aug( gdpcap ~ trt, regionno, year, basque, progfunc="None", scm=T, t_int=1975 ) - test_that("SCM gives the right answer", { - syn <- augsynth(gdpcap ~ trt, regionno, year, basque, progfunc="None", scm=T, t_int=1975) + syn <- single_augsynth( gdpcap ~ trt, regionno, year, basque, + progfunc="None", scm=TRUE, t_int=1975) + sss = summary(syn, inf_type = 'none') ## average att estimate is as expected - expect_equal(-.3686, mean(summary(syn, inf = F)$att$Estimate), tolerance=1e-4) + expect_equal(-.3686, + mean(sss$att$Estimate), tolerance=1e-4) + expect_equal( syn$progfunc, "none" ) ## level of balance is as expected expect_equal(.377, syn$l2_imbalance, tolerance=1e-3) -} -) + expect_equal( dim( syn ), c( 17, 43 ) ) + expect_equal( n_unit( syn ), 17 ) + expect_equal( n_time( syn ), 43 ) + + # Try progfunc not defined. Get different answer? + syn2 <- single_augsynth(gdpcap ~ trt, regionno, year, basque, + scm=TRUE, t_int=1975) + + expect_equal( syn2$progfunc, "ridge" ) + ss = summary( syn2, inf_type = "none" ) + # different answers + expect_true( sd( ss$att$Estimate - sss$att$Estimate ) > 0.2 ) + + + # No SCM and no progfunc throws an error? + expect_error( sraw <- single_augsynth(gdpcap ~ trt, regionno, year, basque, + progfunc="none", scm=FALSE, t_int=1975) ) + + sraw2 <- single_augsynth(gdpcap ~ trt, regionno, year, basque, + progfunc="none", scm=TRUE, t_int=1975) + sumraw2 = summary( sraw2, inf_type="none" ) + sumraw2$donor_table + + +}) test_that("SCM finds the correct t_int and gives the right answer", { @@ -30,12 +59,12 @@ test_that("SCM finds the correct t_int and gives the right answer", { syn2 <- augsynth(gdpcap ~ trt, regionno, year, basque, progfunc = "None", scm = T, t_int = 1975) ## average att estimate is as expected - expect_equal(mean(summary(syn1, inf = F)$att$Estimate), - mean(summary(syn2, inf = F)$att$Estimate), tolerance=1e-4) - + expect_equal(mean(summary(syn1, inf_type = 'none')$att$Estimate), + mean(summary(syn2, inf_type = 'none')$att$Estimate), tolerance=1e-4) + ## level of balance is as expected expect_equal(syn1$l2_imbalance, syn2$l2_imbalance, tolerance=1e-3) - + } ) @@ -46,11 +75,12 @@ test_that("Ridge ASCM gives the right answer", { scm=T, lambda=8) ## average att estimate is as expected - expect_equal(-.3696, mean(summary(asyn, inf = F)$att$Estimate), tolerance=1e-3) + expect_equal(-.3696, mean(summary(asyn, inf_type = 'none')$att$Estimate), tolerance=1e-3) ## level of balance is as expected expect_equal(.373, asyn$l2_imbalance, tolerance=1e-3) + } ) @@ -59,21 +89,21 @@ test_that("Ridge ASCM gives the right answer", { test_that("SCM after residualizing covariates gives the right answer", { - covsyn_resid <- augsynth(gdpcap ~ trt | invest + popdens, - regionno, year, basque, - progfunc = "None", scm = T, - residualize = T) + covsyn_resid <- augsynth(gdpcap ~ trt | invest + popdens, + regionno, year, basque, + progfunc = "None", scm = T, + residualize = T) - ## average att estimate is as expected - expect_equal(-.1443, - mean(summary(covsyn_resid, inf = F)$att$Estimate), - tolerance = 1e-3) + ## average att estimate is as expected + expect_equal(-.1443, + mean(summary(covsyn_resid, inf_type = 'none')$att$Estimate), + tolerance = 1e-3) - ## level of balance is as expected - expect_equal(.3720, covsyn_resid$l2_imbalance, tolerance=1e-3) + ## level of balance is as expected + expect_equal(.3720, covsyn_resid$l2_imbalance, tolerance=1e-3) - # perfect auxiliary covariate balance - expect_equal(0, covsyn_resid$covariate_l2_imbalance, tolerance=1e-3) + # perfect auxiliary covariate balance + expect_equal(0, covsyn_resid$covariate_l2_imbalance, tolerance=1e-3) } ) @@ -81,13 +111,13 @@ test_that("SCM after residualizing covariates gives the right answer", { test_that("Ridge ASCM with covariates jointly gives the right answer", { covsyn_noresid <- augsynth(gdpcap ~ trt | invest + popdens, - regionno, year, basque, - progfunc = "None", scm = T, - residualize = F) + regionno, year, basque, + progfunc = "None", scm = T, + residualize = F) ## average att estimate is as expected expect_equal(-.3345, - mean(summary(covsyn_noresid, inf = F)$att$Estimate), + mean(summary(covsyn_noresid, inf_type = 'none')$att$Estimate), tolerance = 1e-3) ## level of balance is as expected @@ -104,14 +134,14 @@ test_that("Ridge ASCM with covariates jointly gives the right answer", { test_that("Ridge ASCM after residualizing covariates gives the right answer", { covascm_resid <- augsynth(gdpcap ~ trt | invest + popdens, - regionno, year, basque, - progfunc = "Ridge", scm = T, - lambda = 1, - residualize = T) + regionno, year, basque, + progfunc = "Ridge", scm = T, + lambda = 1, + residualize = T) ## average att estimate is as expected expect_equal(-.123, - mean(summary(covascm_resid, inf = F)$att$Estimate), + mean(summary(covascm_resid, inf_type = 'none')$att$Estimate), tolerance = 1e-3) ## level of balance is as expected @@ -126,14 +156,14 @@ test_that("Ridge ASCM after residualizing covariates gives the right answer", { test_that("Ridge ASCM with covariates jointly gives the right answer", { covascm_noresid <- augsynth(gdpcap ~ trt | invest + popdens, - regionno, year, basque, - progfunc = "Ridge", scm = T, - lambda = 1, - residualize = F) + regionno, year, basque, + progfunc = "Ridge", scm = T, + lambda = 1, + residualize = F) ## average att estimate is as expected expect_equal(-.267, - mean(summary(covascm_noresid, inf = F)$att$Estimate), + mean(summary(covascm_noresid, inf_type = 'none')$att$Estimate), tolerance = 1e-3) ## level of balance is as expected @@ -149,8 +179,9 @@ test_that("Ridge ASCM with covariates jointly gives the right answer", { test_that("Warning given when inputting an unused argument", { expect_warning( - augsynth(gdpcap ~ trt| invest + popdens, regionno, year, basque, - progfunc="Ridge", scm=T, lambda=8, t_int = 1975, - bad_param = "Unused input parameter"), + augsynth(gdpcap ~ trt| invest + popdens, regionno, year, basque, + progfunc="Ridge", scm=T, lambda=8, t_int = 1975, + bad_param = "Unused input parameter"), ) }) + diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R new file mode 100644 index 0000000..cbc3841 --- /dev/null +++ b/tests/testthat/test_inference.R @@ -0,0 +1,15 @@ + + + +context("Test inference features of single augsynth objects") + +basque <- basque %>% mutate(trt = case_when(year < 1975 ~ 0, + regionno != 17 ~0, + regionno == 17 ~ 1)) %>% + filter(regionno != 1) + +test_that("Default model doesn't contain inference", { + syn <- augsynth(gdpcap ~ trt, regionno, year, basque, progfunc="Ridge", scm = T) + expect_equivalent(is.null(syn$results), TRUE) +}) + diff --git a/tests/testthat/test_multisynth.R b/tests/testthat/test_multisynth.R index 79008a1..c87f14a 100644 --- a/tests/testthat/test_multisynth.R +++ b/tests/testthat/test_multisynth.R @@ -1,5 +1,6 @@ context("Generally testing the workflow for multisynth") +library( tidyverse ) library(Synth) data(basque) basque <- basque %>% mutate(trt = case_when(year < 1975 ~ 0, @@ -8,14 +9,16 @@ basque <- basque %>% mutate(trt = case_when(year < 1975 ~ 0, filter(regionno != 1) - + test_that("augsynth and multisynth give the same answer for a single treated unit and no augmentation", { syn <- single_augsynth(gdpcap ~ trt, regionno, year, 1975, basque, progfunc="None", scm=T, fixedeff = F) msyn <- multisynth(gdpcap ~ trt, regionno, year, basque, nu = 0, fixedeff = F, scm=T, eps_rel=1e-5, eps_abs=1e-5) - + + expect_equal( dim( msyn ), dim( syn ) ) + # weights are the same-ish expect_equal(c(syn$weights), c(msyn$weights[-16]), tolerance=3e-4) @@ -39,6 +42,10 @@ test_that("Pooling doesn't matter for a single treated unit", { allpool <- multisynth(gdpcap ~ trt, regionno, year, basque, nu = 1, scm=T, eps_rel=1e-5, eps_abs=1e-5) + expect_equal( dim( nopool ), dim( allpool ) ) + expect_equal( n_treated(nopool), n_treated(allpool) ) + expect_equal( n_treated(nopool), 1 ) + # weights are the same expect_equal(nopool$weights, allpool$weights) @@ -56,7 +63,7 @@ test_that("Pooling doesn't matter for a single treated unit", { - + test_that("Separate synth is the same as fitting separate synths", { @@ -66,23 +73,27 @@ test_that("Separate synth is the same as fitting separate synths", { filter(regionno != 1) - basque2 %>% filter(regionno != 16) %>% + basque2 %>% filter(regionno != 16) %>% single_augsynth(gdpcap ~ trt, regionno, year, 1975, ., progfunc="None", scm=T) -> scm17 - basque2 %>% filter(regionno != 17) %>% + basque2 %>% filter(regionno != 17) %>% single_augsynth(gdpcap ~ trt, regionno, year, 1975, ., progfunc="None", scm=T) -> scm16 - + msyn <- multisynth(gdpcap ~ trt, regionno, year, basque2, nu = 0, scm=T, eps_rel=1e-5, eps_abs=1e-5, fixedeff = F) - + + expect_equal( dim( scm17 ), c(16,43) ) + expect_equal( n_treated(scm17), 1 ) + expect_equal( n_treated(msyn), 2 ) + # weights are the same-ish sscm_weights <- unname(c(scm17$weights)) mscm_weights <- unname(c(msyn$weights[-c(15, 16), 2])) expect_equal(sscm_weights, mscm_weights, tolerance=3e-2) expect_equal(rownames(scm17$weights), rownames(as.matrix(msyn$weights[-c(15, 16), 2]))) # expect_equal(c(scm16$weights), c(msyn$weights[-c(15, 16), 1]), tolerance=3e-2) - + # estimates are the same-ish pred_msyn <- predict(msyn, att=F) pred_msyn <- pred_msyn[-nrow(pred_msyn), ] @@ -101,7 +112,7 @@ test_that("Limiting number of lags works", { expect_error( multisynth(gdpcap ~ trt, regionno, year, basque2, nu = 0, - scm=T, eps_rel=1e-5, eps_abs=1e-5, n_lags =3), + scm=T, eps_rel=1e-5, eps_abs=1e-5, n_lags = 3), NA ) } @@ -117,6 +128,14 @@ test_that("L2 imbalance computed correctly", { msyn <- multisynth(gdpcap ~ trt, regionno, year, basque2, scm=T, eps_rel=1e-5, eps_abs=1e-5) + expect_true( "weights" %in% names( msyn ) ) + + ntx = length( unique( basque2$regionno[ basque2$trt == 1 ] ) ) + expect_equal( dim( msyn$weights ), c( 17, ntx ) ) + + expect_true( "data" %in% names( msyn ) ) + #msyn$data + glbl <- sqrt(mean(msyn$imbalance[,1]^2)) ind <- sqrt(mean( apply(msyn$imbalance[, -1], 2, @@ -165,7 +184,7 @@ test_that("V matrix is the same for single and multi synth", { ) - + test_that("multisynth doesn't depend on unit order", { basque2 <- basque %>% mutate(trt = case_when(year < 1975 ~ 0, @@ -180,7 +199,7 @@ test_that("multisynth doesn't depend on unit order", { basque2 %>% arrange(desc(regionno)), nu = 0, fixedeff = F, scm=T, eps_rel=1e-5, eps_abs=1e-5) - + # weights are the same expect_equal(c(msyn$weights), c(msyn2$weights)) @@ -191,7 +210,7 @@ test_that("multisynth doesn't depend on unit order", { ) - + test_that("multisynth doesn't depend on time order", { basque2 <- basque %>% mutate(trt = case_when(year < 1975 ~ 0, @@ -206,7 +225,7 @@ test_that("multisynth doesn't depend on time order", { basque2 %>% arrange(desc(year)), nu = 0, fixedeff = F, scm=T, eps_rel=1e-5, eps_abs=1e-5) - + # weights are the same expect_equal(c(msyn$weights), c(msyn2$weights)) diff --git a/tests/testthat/test_outcome_models.R b/tests/testthat/test_outcome_models.R index 2eccdb9..cbe6c1b 100644 --- a/tests/testthat/test_outcome_models.R +++ b/tests/testthat/test_outcome_models.R @@ -10,7 +10,7 @@ basque <- basque %>% mutate(trt = case_when(year < 1975 ~ 0, filter(regionno != 1) - + test_that("Augmenting synth with glmnet runs", { if(!requireNamespace("glmnet", quietly = TRUE)) { @@ -24,7 +24,7 @@ test_that("Augmenting synth with glmnet runs", { ## should run because glmnet is installed expect_error(augsynth(gdpcap ~ trt, regionno, year, basque, progfunc="EN", scm=T), - NA) + NA) } ) @@ -43,7 +43,7 @@ test_that("Augmenting synth with random forest runs", { ## should run because randomForest is installed expect_error(augsynth(gdpcap ~ trt, regionno, year, basque, progfunc="RF", scm=T), - NA) + NA) } ) @@ -54,7 +54,7 @@ test_that("Augmenting synth with gsynth runs and produces the correct result", { if(!requireNamespace("gsynth", quietly = TRUE)) { ## should fail because gsynth isn't installed - expect_error(augsynth(gdpcap ~ trt, regionno, year, basque, + expect_error(augsynth(gdpcap ~ trt, regionno, year, basque, progfunc="GSYN", scm=T), "you must install the gsynth package") @@ -64,13 +64,13 @@ test_that("Augmenting synth with gsynth runs and produces the correct result", { ## should run because gsynth is installed expect_error( - augsynth(gdpcap ~ trt, regionno, year, basque, + augsynth(gdpcap ~ trt, regionno, year, basque, progfunc = "GSYN", scm = T, CV = 0, r = 4), NA) asyn_gsyn <- augsynth(gdpcap ~ trt, regionno, year, basque, progfunc = "GSYN", scm = F, CV = 0, r = 4) - expect_equal(summary(asyn_gsyn, inf = F)$average_att$Estimate, - -0.1444637, tolerance=1e-4) + expect_equal(summary(asyn_gsyn, inf_type = 'none')$average_att$Estimate, + -0.1444637, tolerance=1e-4) } ) @@ -85,10 +85,10 @@ test_that("Augmenting synth with MCPanel runs", { } else { ## should run because MCPanel is installed expect_error(augsynth(gdpcap ~ trt, regionno, year, basque, progfunc="MCP", scm=T), - NA) + NA) } - + } ) @@ -108,6 +108,6 @@ test_that("Augmenting synth with CausalImpact runs", { ## should run because CausalImpact is installed expect_error(augsynth(gdpcap ~ trt, regionno, year, basque, progfunc="CausalImpact", scm=T), - NA) + NA) } ) diff --git a/tests/testthat/test_unbalanced_multisynth.R b/tests/testthat/test_unbalanced_multisynth.R index 7cf8c9b..eebab67 100644 --- a/tests/testthat/test_unbalanced_multisynth.R +++ b/tests/testthat/test_unbalanced_multisynth.R @@ -122,11 +122,11 @@ test_that("Separate synth with missing control unit time drops control unit", { # drop a time period for unit 17 basque %>% filter(!regionno %in% c(18) | year != 1970) -> basque_mis - - msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, + + msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, nu = 0, scm=T, eps_rel=1e-8, eps_abs=1e-8) - msyn2 <- multisynth(gdpcap ~ trt, regionno, year, + msyn2 <- multisynth(gdpcap ~ trt, regionno, year, basque %>% filter(regionno != 18), nu = 0, scm=T, eps_rel=1e-8, eps_abs=1e-8) @@ -143,10 +143,10 @@ test_that("Separate synth with missing control unit only in post-treatment perio dat_format <- format_data_stag(quo(gdpcap), quo(trt), quo(regionno), quo(year), basque_mis) expect_true(nrow(dat_format$X) == nrow(dat_format$y)) - msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, + msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, nu = 0, scm=T, eps_rel=1e-8, eps_abs=1e-8) - msyn2 <- multisynth(gdpcap ~ trt, regionno, year, + msyn2 <- multisynth(gdpcap ~ trt, regionno, year, basque %>% filter(regionno != 18), nu = 0, scm=T, eps_rel=1e-8, eps_abs=1e-8) @@ -163,10 +163,10 @@ test_that("Separate synth with missing control unit only in pre-treatment period expect_true(nrow(dat_format$X) == nrow(dat_format$y)) - msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, + msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, nu = 0, scm=T, eps_rel=1e-8, eps_abs=1e-8) - msyn2 <- multisynth(gdpcap ~ trt, regionno, year, + msyn2 <- multisynth(gdpcap ~ trt, regionno, year, basque %>% filter(regionno != 18), nu = 0, scm=T, eps_rel=1e-8, eps_abs=1e-8) @@ -180,7 +180,7 @@ test_that("Multisynth with unbalanced panels runs", { basque %>% filter(!regionno %in% c(15, 17) | year != 1970) -> basque_mis - msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, + msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, scm=T, eps_rel=1e-8, eps_abs=1e-8) expect_error(summary(msyn), NA) @@ -193,7 +193,7 @@ test_that("Multisynth with unbalanced panels runs with missing post-treatment", basque %>% filter(!regionno %in% c(15, 17) | year != 1990) -> basque_mis - msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, + msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, scm=T, eps_rel=1e-8, eps_abs=1e-8) expect_error(summary(msyn), NA) @@ -207,8 +207,8 @@ test_that("Multisynth with unbalanced panels runs", { basque %>% filter(!regionno %in% c(15) | year != 1985) -> basque_mis - msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, + msyn <- multisynth(gdpcap ~ trt, regionno, year, basque_mis, scm=T, eps_rel=1e-8, eps_abs=1e-8) expect_error(summary(msyn), NA) -}) \ No newline at end of file +}) diff --git a/tobacco_replication/beer_troubles.R b/tobacco_replication/beer_troubles.R new file mode 100644 index 0000000..3d12c5b --- /dev/null +++ b/tobacco_replication/beer_troubles.R @@ -0,0 +1,4 @@ + + +# What is up with the beer variable + diff --git a/tobacco_replication/data/restofus_38.dta b/tobacco_replication/data/restofus_38.dta new file mode 100755 index 0000000..98cfdf3 Binary files /dev/null and b/tobacco_replication/data/restofus_38.dta differ diff --git a/tobacco_replication/data/restofus_50.dta b/tobacco_replication/data/restofus_50.dta new file mode 100755 index 0000000..0c79728 Binary files /dev/null and b/tobacco_replication/data/restofus_50.dta differ diff --git a/tobacco_replication/data/smoking_wkdata_39.dta b/tobacco_replication/data/smoking_wkdata_39.dta new file mode 100755 index 0000000..7c0cedd Binary files /dev/null and b/tobacco_replication/data/smoking_wkdata_39.dta differ diff --git a/tobacco_replication/inference_blogpost.Rmd b/tobacco_replication/inference_blogpost.Rmd new file mode 100644 index 0000000..86c7df2 --- /dev/null +++ b/tobacco_replication/inference_blogpost.Rmd @@ -0,0 +1,449 @@ +--- +title: Comparing inference approaches for the synthetic control method with the `augsynth` + package +author: "Miratrix & Delaney" +date: "2024-11-16" +output: + pdf_document: default + html_document: default +subtitle: Also replicating Abadie et al. (2010) +editor_options: + chunk_output_type: console +--- + +```{r setup, include = FALSE} + +knitr::opts_chunk$set( + collapse = FALSE, + message = FALSE, + warning = FALSE, + fig.align = 'center', + comment = "#>" +) + +## Install Synth if not already installed +# install.packages("Synth") + +library(kableExtra) +library(magrittr) +library(dplyr) +library(ggplot2) +library(reshape2) +library(tidyr) +library(rlang) + +library(foreign) +library(graphics) +library(Synth) +library(augsynth) + +## For the Abadie plots +replication_theme <- theme_bw(base_family = "Times New Roman") + + theme(panel.grid = element_blank(), + legend.background = element_rect(color = 'black'), + legend.spacing.y = unit(0.0, 'pt'), + legend.box.margin = margin(t = 5, r = 5, unit = 'pt'), + legend.position = c(0.81, 0.9)) + +set.seed(1234) +``` + + +There are many competing approaches to inference in the Synthetic Control world, and these approaches rely on a variety of different sets of assumptions that researchers can get quite agitated about. +In this post, we are going to compare some of these approaches by re-analyzing the famous "Tobacco example" originally laid out in [Abadie et al. (2010)](https://web.stanford.edu/~jhain/Paper/JASA2010.pdf) using some cool new features we recently added to the `augsynth` package. +This post also shows off our `augsynth` package, and in particular demonstrates how fitting and exploring synthetic control models can be easily done with only a few lines of code. + +As a tiny bit of background, the synthetic control approach has two major steps. +The first step is to generate the synthetic control. +Intuitively, this consists of using some sort of algorithm to reweight a set of possible "donor" units so that their weighted average "looks like" the target treated unit, usually with respect to the treated unit's pre-treatment outcome trajectory and possibly some other baseline covariates besides. +There are an ever-increasing variety of ways one might do this, which we will discuss a bit more below. +Once you have your synthetic control, you estimate treatment impacts as the difference in the treated outcome and the synthetic control outcome for each post-treatment time period. + +With synthetic control and consequent treatment impacts in hand, the second step is to figure out how to conduct inference: in the first step we deliberately manufactured a "synthetic" unit to look like our treated unit, and now we want to know if the difference in their behaviors post-treatment could be due to random chance, or something structural like a treatment effect. +Inference is a thorny problem since there is no easily well defined model or source of randomness to be seen! + + +## A note on different packages and approaches + +In terms of R packages for fitting a synthetic control, we focus on two contenders. +The first is the `Synth` package, which has the algorithm and approach outlined in the seminal Abadie et al. (2010). +The second is the newer `augsynth` package, that offers a variety of different methods for building a synthetic comparison, and in fact _does not_ offer the exact same fitting approach used in `Synth`. +Nevertheless, we can fairly closely mimic the spirit of the original `Synth` with `augsynth`, as measured by getting fairly close to the same results shown in Figures 2--7 in the 2010 paper. +The `augsynth` package also offers a wider array of inference approaches which, as of the latest update, includes the permutation approach advocated for by the original Synth papers. +Before diving into inference, we first discuss how Synth and augsynth are different in how they build the synthetic control. + +To use the `Synth` package, you specify a set of features (including a subset of pre-treatment outcome measures) you want to match on, and a separate set of pre-treatment "test" outcomes to assess performance on, and then the package will find the best weighting of the match features that produces the closest matched synthetic control, as measured by the specified pre-treatment "test" outcomes. +The package does this with a nested optimization of, for a given set of variable weights, finding the besdt weighting of the donor units, and then finding the best variable weights that gives the "best of the best" synthetic control. +This iterative procedure results in two sets of weights, the "V" weights, which weight the possible match variables, and the donor weights, which weight the donor units to give the final synthetic control. +This approach makes it easy to include both lagged outcomes *and* auxiliary covariates, as the V weights will essentially rescale covariates to optimize any tradeoffs between which covariates to prioritize when building a match. + +The `augsynth` package, by contrast, directly finds unit weights to minimize the mean squared difference between the synthetic control and the treated unit across all pre-treatment outcome periods and any additional covariates. +The scaling of match covariates can be done as a pre-processing step, but is not done within the package. +Instead of focusing on V weights, the `augsynth` package then allows for balance correction via a (usually ridge) regression step. +This second step is the "augmentation" part, and it can help regularize weights so, in the case of rich donor unit contexts, more donor units can be included in the synthetic match. +That said, the `augsynth` package can also just match on pre-treatment trends and not do any further adjustment, which is closer to the original `Synth` approach (but without being able to up- or down-weight some covariates or pre-treatment outcomes over others). + +A more careful replication of the original Synth analysis with augsynth can be found in the new vignette in the package. + + + + +```{r import tobacco data, include=FALSE} +# Get working data CA and 38 control states +Wk.data <- read.dta(here::here("./tobacco_replication/data/smoking_wkdata_39.dta")) + +tobacco <- Wk.data %>% + mutate(treated = ifelse((name == 'California') & + (year > 1988), 1, 0), + state = name, # rename something meaningful + state_index = index) %>% + select(cigsalepcap, treated, state, year, retprice, xxincome, K1_r_15_24, xxbeer) + +tobacco_70 <- tobacco %>% filter(year >= 1970) # subset data to 1970 forward to avoid issues due to missingess of outcomes +``` + + +## Obtaining our synthetic control + +The `augsynth` package divides the two steps into two function calls. +To get a synthetic control, we call `augsynth()`. +To get inference, we call `summary()`. + +Here we do our first step of getting our synthetic control. + +```{r run tobacco model using augsynth} +syn <- single_augsynth(form = cigsalepcap ~ treated | retprice + xxincome + K1_r_15_24, + unit = state, + time = year, t_int = 1988, + data = tobacco_70 ) +syn +``` + +The above default `augsynth()` call removes bias via a ridge regression adjustment on top of calculating the synthetic control weights. +You can prevent this, making the fitting procedure closer in spirit to the Abadie approach, by including `progfunc = "none"`. +However, in this case, dropping the bias removal gives results that deviate more strongly from the tobacco paper. +We hypothesize that the bias removal is serving the role of the Abadie approach's variable importance weights (the bias removal being a regression that will naturally weight some variables more than others). +In augsynth, without a bias removal step, all the pre-treatment outcomes and covariates are weighted equally (after standardization). + +We can plot our result with `plot(plot_type = 'outcomes')`: + +```{r fig.width=4.5, fig.height=4} +p <- plot(syn, plot_type = 'outcomes raw average') + + ggtitle("Replication of Abadie et al. (2010), Figure 2") +p +``` + +If we do not want the raw average as a comparison line, simply use "outcomes" rather than "outcomes raw average." + + +## Getting our inference + +We get our inference via the `summary()` method, specifying what inference approach to use. +Here we use the permutation approach with impacts standardized by pre-treatment RMSPE: + +```{r} +syn_rstat <- summary( syn, inf_type = 'permutation_rstat' ) +syn_rstat +``` + +The inference options include those initially included in the package (`conformal`, `jackknife`, `jackknife+`), and now two that follow Abadie's permutation approach (`permutation` and `permutation_rstat`). +These last two refit the syn model to each donor unit in turn, and use the collection of pseudo-impact estimates as a reference distribution to compare our observed treatment trajectory to. +The `permutation` method uses the raw impact estimates, and `permutation_rstat` is the more recent approach promoted by Abadie of standardizing the gaps by the pre-treatment root mean squared predictive error (RMSPE). + +We can plot and compare the confidence bands generated by our four different inference types (the "Jackknife" currently does not provide standard errors, so we omit it here): + +```{r plot augsynth models, echo=FALSE, cache=TRUE, fig.width=8, fig.height=4.5} + +get_all_inference <- function( syn ) { + all_results <- bind_rows( + Conformal = summary(syn, inf_type = 'conformal')$att, + #Jackknife = summary(syn, inf_type = 'jackknife')$att, + `Jackknife+` = summary(syn, inf_type = 'jackknife+')$att, + Permutation = summary(syn, inf_type = 'permutation')$att, + `Permutation (rstat)` = summary(syn, inf_type = 'permutation_rstat')$att, + .id = 'name') %>% + mutate(name = factor(name, + levels = c('Conformal', 'Jackknife', 'Jackknife+', + "Permutation", "Permutation (rstat)"), + ordered = TRUE)) + all_results +} + +all_results <- get_all_inference( syn ) +ggplot(all_results, aes(x = Time, y = Estimate, color = name)) + + geom_hline(yintercept = 0, linetype = 'solid') + + geom_vline(xintercept = syn$t_int, linetype = 'dashed') + + geom_line() + + geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound, fill = name), + alpha = 0.2, size = 0.1) + + facet_wrap(. ~ name) + + scale_x_continuous(breaks = seq(1970, 2010, 5)) + + labs(x = 'Year', y = 'Gap in per-capita cigarette sales\n(in packs)', + caption = 'ND replication: augsynth package') + + scale_color_manual(values = c('red', 'blue', 'darkgreen', + 'purple', 'orange'), + guide = 'none', aesthetics = c('color', 'fill')) + + theme_bw() + + theme(plot.caption = element_text(hjust = 0, color = 'darkseagreen', + size = 9)) +``` + +There are massive differences! +Conformal and Jackknife+ do not have widening uncertainty as time progresses---this is due to their assumption of, in effect, temporal exchangability. +In my mind this is a nonsensical assumption, and it leads to nonsensical results when extrapolating beyond the time of treatment. +The Jackknife+ seems to be extremely optimistic in terms of its inference, and does not pass face validity, in my mind. +The permutation approaches are more narrow towards the onset of treatment, and get wider as the placebo trajectories diverge--this seems sensible. +Interestingly, the width of these is on the order of the Conformal inference when you average across the span of time. +The RStat approach gives narrower intervals, likely by reducing the effect of outlier donor units that are hard to fit pre-treatment and diverge more post-treatment. +A tension here is whether we are standardizing our test statistic (like a studentized statistic for permutation testing) or artificially discounting the chance that a given unit may diverge markedly from the group. + +The critique against permutation inference is a good one: our reference distribution of placebo trajectories is useful only if we think it is reasonable to believe the scale of likely difference between the counterfactual for the treated unit and its synthetic control would indeed be "like" the differences we see for the donor units. +If our treated unit is different from the donor pool (which it likely is---it was the unit that got treatment, after all) then the reference distribution may not a good reference. +For example, if the treatment unit is harder to build a good synthetic control for than the donor units, then how the donor unit trajectories diverge from 0 after the time of treatment may not capture the true randomness (whatever that is) of the treatment unit at all. + +All of this said, the reference distribution does give us _some_ sense of what this kind of exercise is likely to do for the kind of data we are looking at. +And, given the other options, it seems much better than any provided alternative. + + + +## Where do the permutation confidence intervals come from? + +The confidence regions for the permutation approaches come from measuring the uncertainty of the placebo distribution at each time point. +We can look at the raw placebo trajectories as so: + +```{r fig.width=4.5, fig.height=4} +plot(syn_rstat, + plot_type = 'placebo', +) + ggtitle("Replication of Abadie et al. (2010), Figure 4") +``` + +We then use the standard deviation of the trajectories to estimate a rough "Standard Error." +We can also get a table of the placebo trajectories via the `placebo_distribution()` method: + +```{r show_perm_inference} +placebo_distribution( syn_rstat ) +``` + +# A second example: Kansas + +We can easily replicate the above comparison of the four inference types for the "kansas" data used in the primary vignette of this package (see that vignette and the original paper for more details on these data, and for how we fit the model).^[Full disclosure: we looked across a few different specifications from that vignette, and the inference is not particularly stable. We are trying here to give a reasonable faith choice, and we stand by it being an illustration of _something_ that can naturally happen. Here we control for covariates, and use the ridge regression adjustment.] + +```{r raw_kansas, cache=TRUE, echo=FALSE} +data(kansas) + +syn_k <- augsynth( lngdpcapita ~ treated | lngdpcapita + log(revstatecapita) + + log(revlocalcapita) + log(avgwklywagecapita) + + estabscapita + emplvlcapita, + fips, year_qtr, kansas, + progfunc = "ridge" ) + +all_results_k <- get_all_inference( syn_k ) + +ggplot(all_results_k, aes(x = Time, y = Estimate, color = name)) + + geom_hline(yintercept = 0, linetype = 'solid') + + geom_vline(xintercept = syn$t_int, linetype = 'dashed') + + geom_line() + + geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound, fill = name), + alpha = 0.2, lwd = 0.1) + + facet_wrap(. ~ name) + + scale_x_continuous(breaks = seq(1970, 2010, 5)) + + labs(x = 'Year', y = 'Impact' ) + + scale_color_manual(values = c('red', 'blue', 'darkgreen', + 'purple', 'orange'), + guide = 'none', aesthetics = c('color', 'fill')) + + theme_bw() + + theme(plot.caption = element_text(hjust = 0, color = 'darkseagreen', + size = 9)) + +``` + + +## Classic synthetic control + +We next explore not using the ridge adjustment, which is closer to Abadie's original conception, to see how that impacts the relative uncertainty of the different methods. +This is done by setting `progfunc = "None"`. + +```{r} +syn_k_classic <- augsynth( lngdpcapita ~ treated | lngdpcapita + log(revstatecapita) + + log(revlocalcapita) + log(avgwklywagecapita) + + estabscapita + emplvlcapita, + fips, year_qtr, kansas, + progfunc = "None" ) +``` + +We plot our envelopes: +```{r, echo=FALSE, cache=TRUE} +all_results_k_classic <- get_all_inference( syn_k_classic ) +ggplot(all_results_k_classic, aes(x = Time, y = Estimate, color = name)) + + geom_hline(yintercept = 0, linetype = 'solid') + + geom_vline(xintercept = syn$t_int, linetype = 'dashed') + + geom_line() + + geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound, fill = name), + alpha = 0.2, size = 0.1) + + facet_wrap(. ~ name) + + scale_x_continuous(breaks = seq(1970, 2010, 5)) + + labs(x = 'Year', y = 'Impact' ) + + scale_color_manual(values = c('red', 'blue', 'darkgreen', + 'purple', 'orange'), + guide = 'none', aesthetics = c('color', 'fill')) + + theme_bw() + + theme(plot.caption = element_text(hjust = 0, color = 'darkseagreen', + size = 9)) + +``` + +Here the raw permutation approach gives extremely wide intervals, likely due to extreme outliers in the donor pool. We can check this to see that we indeed have concerns: + +```{r fig.width=4.5, fig.height=4} +plot( syn_k_classic, plot_type = 'placebo' ) + + scale_y_continuous(limits = c(-0.25, 0.5) ) +``` + +If we drop those outliers, we may get a more reasonable set of intervals. +This is easy to do with the `augsynth` package's `update_augsynth()` method, which will exclude certain donor units and automatically recalculate our synthetic control. +We specify which donor units to drop via the `drop` argument. +If `drop` is passed as a numeric, then all donor units with an RMSPE greater than `drop` times the treated unit's pre-treatment RMSPE will be dropped, following the methods put forth by Abadie. +For example, setting `drop = 2` will exclude any units with a pre-treatment RMSPE twice as large as the treated unit. +The updated augsynth model inherits the formula and data structure of the original model. + +We try a factor of 2: + +```{r slimmed_kansas} +syn_k_classic_restr <- update_augsynth(syn_k_classic, drop = 2 ) +syn_k_classic_restr +``` + +We have dropped 15 states, and have a somewhat cleaned up plot: + +```{r fig.width=4.5, fig.height=4} +plot( syn_k_classic_restr, plot_type = 'placebo' ) + + scale_y_continuous(limits = c(-0.25, 0.5) ) +``` + +Our final comparative inference plot is then: +```{r, echo=FALSE, cache=TRUE} +all_results_k_classic_restr <- get_all_inference( syn_k_classic_restr ) + +ggplot(all_results_k_classic_restr, aes(x = Time, y = Estimate, color = name)) + + geom_hline(yintercept = 0, linetype = 'solid') + + geom_vline(xintercept = syn$t_int, linetype = 'dashed') + + geom_line() + + geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound, fill = name), + alpha = 0.2, lwd = 0.1) + + facet_wrap(. ~ name) + + scale_x_continuous(breaks = seq(1970, 2010, 5)) + + labs(x = 'Year', y = 'Kansas impact' ) + + scale_color_manual(values = c('red', 'blue', 'darkgreen', + 'purple', 'orange'), + guide = 'none', aesthetics = c('color', 'fill')) + + theme_bw() + + theme(plot.caption = element_text(hjust = 0, color = 'darkseagreen', + size = 9)) + +``` + + + +# Relative standard errors + +For both our examples, we can plot the widths of the confidence intervals. +To get a more rich picture, we will add in different specifications for our inference; in particular, we will use the ridge adjustment and no ridge adjustment. +For all specifications, we continue to adjust with our baseline covariates. + +```{r, echo=FALSE, cache=TRUE} + +# add in "ridge" for the restricted +syn_k <- augsynth( lngdpcapita ~ treated | lngdpcapita + log(revstatecapita) + + log(revlocalcapita) + log(avgwklywagecapita) + + estabscapita + emplvlcapita, + fips, year_qtr, kansas, + progfunc = "ridge" ) +all_results_k <- get_all_inference( syn_k ) + +drp = setdiff( kansas$fips, donor_table( syn_k_classic_restr )$fips ) +syn_k_restr <- update_augsynth( syn_k, + drop = as.character(drp) ) +all_results_k_restr <- get_all_inference( syn_k_restr ) + +syn_restr = update_augsynth( syn, drop = 2 ) +all_results_restr = get_all_inference( syn_restr ) + +# Stack tobacco and kansas (ridge) +tt = bind_rows( Tobacco = all_results, + Tobacco_restr = all_results_restr, + Kansas = all_results_k, + Kansas_restr = all_results_k_restr, + .id = "data" ) + +# Now generate and add in the "no ridge" specs +syn_classic <- augsynth(form = cigsalepcap ~ treated | retprice + xxincome + K1_r_15_24, + unit = state, + time = year, + data = tobacco_70, progfunc = "None" ) +all_results_classic <- get_all_inference( syn_classic ) + +syn_classic_restr <- update_augsynth( syn_classic, drop = 2 ) +all_results_classic_restr <- get_all_inference( syn_classic_restr ) + +tt2 = bind_rows( Tobacco = all_results_classic, + Tobacco_restr = all_results_classic_restr, + Kansas = all_results_k_classic, + Kansas_restr = all_results_k_classic_restr, + .id = "data" ) + +# Stack ridge and no ridge and drop NAs +tt_all = bind_rows( classic=tt2, ridge=tt, .id = "adjustment" ) %>% + mutate( width = upper_bound - lower_bound ) %>% + filter( !is.na( width ) ) %>% + separate( data, into = c("data", "restricted"), sep = "_", remove = FALSE ) %>% + mutate(restricted = ifelse( is.na(restricted), "Full", "Restricted" ) ) + +#head(tt_all) + +``` + +We get the following (Kansas on left, Tobacco on right) + +```{r final_SE, echo=FALSE, eval=TRUE, fig.show="hold", fig.align='default', fig.height=3.5, fig.width=3} +ggplot( tt_all %>% filter( data != "Tobacco" ), aes( Time, width, col=name ) ) + + facet_grid( adjustment ~ restricted ) + + geom_line( size = 1) + + theme_minimal() + + geom_hline( yintercept = 0 ) + + theme( legend.position = "none" ) + + scale_x_continuous(labels = function(x) substr(x, 3, 4)) # Last two digits + +ggplot( tt_all %>% filter( data == "Tobacco" ), aes( Time, width, col=name ) ) + + facet_grid( adjustment ~ restricted ) + + geom_line( size = 1) + + theme_minimal() + + geom_hline( yintercept = 0 ) + + labs( color = "Inf: " ) + + theme( legend.position = "none" ) + + scale_x_continuous(labels = function(x) substr(x, 3, 4)) # Last two digits +``` + +Across our scenarios, the permutation approach tends to be most conservative, and for Kansas, has an unclear trend for extrapolation. +Across our examples, the R-statistic does stabalize over the raw permutation, giving tighter inference. +Restricting the donor pool stabalizes even further in the case of Kansas, and maybe does a little for Tobacco. +For the R-statistic approach, we see a steady climb in the interval width as we extrapolate further and further; I like this aspect of this approach. + +The conformal widens under ridge adjustment for Kansas, but does not for the rest of the specifications. +It is also interesting how it is quite high for Tobacco and low for Kansas; it is driven by differnet structures of the data than the permutation approaches. + +The Jackknife+ gives the tightest intervals, and they do not widen; I find myself suspicious of this approach given this behavior as compared to the other approaches. +It seems overly aggressive, as compared to the other forms of uncertainty quantification. + +It would be interesting to see how these behaviors and trends were similar or different over even more datasets. +The `augsynth` package makes such explorations easy to do, and so we leave this for you. + + + +# Acknowledgements + +Thanks to the original designers of the `augsynth` package, in particular Eli Ben-Michael, for the support on the extensions used in this blog post. +Also thanks to Abadie for providing the data and original source code for the Synth package for the tobacco example. +This work was supported by the U.S. Department of Education, Institute for Education Sciences, Grant R305D200010. The opinions expressed are those of the authors and do not represent views of the Institute or the U.S. Department of Education. + diff --git a/tobacco_replication/tobacco_exploration.R b/tobacco_replication/tobacco_exploration.R new file mode 100644 index 0000000..d3ad190 --- /dev/null +++ b/tobacco_replication/tobacco_exploration.R @@ -0,0 +1,481 @@ + + +# Exploration of the tobacco data +# (C) 2024 Miratrix + +# This came out of the howard "best synth practices" project + + +## Install Synth if not already installed +# install.packages("Synth") + +library(kableExtra) +library(magrittr) +library(dplyr) +library(ggplot2) +library(reshape2) +library(tidyr) +library(rlang) + +library(foreign) +library(graphics) +library(Synth) +library(augsynth) + + +## For the Abadie plots +replication_theme <- theme_bw(base_family = "Times New Roman") + + theme(panel.grid = element_blank(), + legend.background = element_rect(color = 'black'), + legend.spacing.y = unit(0.0, 'pt'), + legend.box.margin = margin(t = 5, r = 5, unit = 'pt'), + legend.position = c(0.81, 0.9)) + +set.seed(1234) + + +# Get working data CA and 38 control states +Wk.data <- read.dta(here::here("./tobacco_replication/data/smoking_wkdata_39.dta")) + + + +time_tx = 1988 + + +# Look at the data ---- + +skimr::skim( Wk.data ) + +# How much do the predictors predict our outcome? +library( lme4 ) +Wk.data <- mutate( Wk.data, + year_c = year - 1988 ) + +M <- lmer( cigsalepcap ~ 1 + year_c + I(year_c^2) + retprice + xxincome + K1_r_15_24 + (1|index), + data = Wk.data ) + +arm::display( M ) + + +d1975 = filter( Wk.data, year==1975 ) +table( d1975$index ) + +filter( Wk.data, year == 1975 ) %>% + dplyr::select( cigsalepcap ) %>% + head() + +# name and index correspond - check +table( table( Wk.data$name, Wk.data$index ) ) + +al = filter( Wk.data, name == "Alabama") +al %>% + summarise( xxbeer = mean( xxbeer, na.rm=TRUE ), + rp = sd( retprice, na.rm=TRUE ), + xx = sd( xxincome, na.rm=TRUE ), + #cig1989 = cigsalepcap[ year == 1989 ], + cig1980 = length( cigsalepcap[ year == 1980 ] ), + cig1988 = length( cigsalepcap[ year == 1988 ] ), + cig1975 = length( cigsalepcap[ year == 1975 ] ), + .groups = "drop" ) + + +covs <- Wk.data %>% + #dplyr::filter( year %in% 1984:1988 ) %>% + group_by( name, index ) %>% + summarise( xxbeer = mean( xxbeer[ year %in% 1984:1988 ], na.rm=TRUE ), + rp = sd( retprice, na.rm=TRUE ), + xx = sd( xxincome, na.rm=TRUE ), + retprice = mean( retprice, na.rm=TRUE ), + xxincome = mean( xxincome, na.rm=TRUE ), + K1_r_15_24 = mean( K1_r_15_24, na.rm=TRUE ), + cig1989 = cigsalepcap[ year == 1989 ], + cig1995 = cigsalepcap[ year == 1989 ], + cig1980 = cigsalepcap[ year == 1980 ] , + + cig1988 = cigsalepcap[ year == 1988 ] , + cig1975 = cigsalepcap[ year == 1975 ] , + .groups = "drop" + ) +head( covs ) + + + +MRpre <- lm( cig1995 ~ cig1980 + cig1988 + cig1975, data=covs ) +MRcov <- lm( cig1995 ~ xxbeer + retprice + xxincome + K1_r_15_24, data=covs ) + +summary( MRpre ) +summary( MRcov ) + +MRfull <- lm( cig1995 ~ cig1980 + cig1988 + cig1975 + xxbeer + retprice + xxincome + K1_r_15_24, data=covs ) +summary( MRfull ) + +anova( MRpre, MRfull ) + + + +## ----reformat tobacco data using tidyverse --------------------------------------- +tobacco <- Wk.data %>% + mutate(treated = ifelse((name == 'California') & + (year > 1988), 1, 0), + state = name, # rename something meaningful + state_index = index) %>% + select(cigsalepcap, treated, state, year, retprice, xxincome, K1_r_15_24, xxbeer) + + +# subset data to 1970 forward to avoid issues due to missingess of outcomes +tobacco_70 <- tobacco %>% filter(year >= 1970) + + +tobacco %>% group_by( year ) %>% + summarise( prop.miss = mean( is.na( cigsalepcap ))) %>% + print( n=100 ) + +tobacco_55 <- tobacco %>% filter(year >= 1955) + + +# Plot of states outcome over time ---- + +ggplot( Wk.data, aes( year, cigsalepcap, group=index ) ) + + geom_line() + + scale_y_log10() + + +# Various plots of covariates over time ---- +ggplot( Wk.data, aes( year, K1_r_15_24, group=index ) ) + + geom_line() + +ggplot( Wk.data, aes( year, xxincome, group=index ) ) + + geom_line() + +ggplot( Wk.data, aes( year, retprice, group=index ) ) + + geom_line() + + scale_y_log10() + +ggplot( Wk.data, aes( year, cigsalepcap, group=index ) ) + + geom_line() + + scale_y_log10() + +ggplot( Wk.data, aes( year, xxbeer, group=index ) ) + + geom_line() + + facet_wrap( ~ name ) + + scale_y_log10() + +ggplot( Wk.data, aes( xxbeer ) ) + + geom_histogram() + + +# Windsorize the units to remove extreme outliers + + +windsorize_outliers <- function(column) { + Q1 <- quantile(column, 0.25, na.rm = TRUE) + Q3 <- quantile(column, 0.75, na.rm = TRUE) + IQR_value <- IQR(column, na.rm = TRUE) + + lower_bound <- Q1 - 2 * IQR_value + upper_bound <- Q3 + 2 * IQR_value + + # Windsorize outliers + column <- ifelse(column < lower_bound, lower_bound, + ifelse(column > upper_bound, upper_bound, column)) + return(column) +} + +replace_outliers_with_na <- function(column) { + Q1 <- quantile(column, 0.25, na.rm = TRUE) + Q3 <- quantile(column, 0.75, na.rm = TRUE) + IQR_value <- IQR(column, na.rm = TRUE) + + lower_bound <- Q1 - 2 * IQR_value + upper_bound <- Q3 + 2 * IQR_value + + prop = column < lower_bound | column > upper_bound + cat( glue::glue( "{deparse(substitute(column))} outliers %" ), + 100*mean(prop, na.rm=TRUE), "\n" ) + column[column < lower_bound | column > upper_bound] <- NA + return(column) +} + + +# Apply the outlier function to each specified column +data = Wk.data +data <- data %>% + mutate(across(c(xxincome, xxbeer, K1_r_15_24, + rate, cigsale, cigsalepcap, retprice, + pop_census), replace_outliers_with_na)) + + +ggplot( Wk.data, aes( xxbeer ) ) + + geom_histogram() + +ggplot( data, aes( xxbeer ) ) + + geom_histogram() + + +ggplot( Wk.data, aes( rate ) ) + + geom_histogram() + +ggplot( data, aes( rate ) ) + + geom_histogram() + + + + + +### Run Baseline Model + +dataprep.out <- dataprep( + foo = Wk.data, + predictors = c("retprice", "xxincome", "K1_r_15_24"), + predictors.op = c("mean"), + dependent = c('cigsalepcap'), + unit.variable = c('index'), + time.variable = c('year'), + special.predictors = list(list("xxbeer", 1984:1988, c("mean")), + list("cigsalepcap", 1988, c("mean")), + list("cigsalepcap", 1980, c("mean")), + list("cigsalepcap", 1975, c("mean")) + ), + treatment.identifier = 3, + controls.identifier = c(1:39)[-3], + time.predictors.prior = 1980:1988, + time.optimize.ssr = 1970:1988, + unit.names.variable = c('name'), + time.plot = 1970:2005 +) + + + + +### run synth + +if ( FALSE ) { + synth.out <- synth( + dataprep.out, + Pop.size = 300, + Max.generations = 1, + Unif.seed = 356987, + Int.seed = 627478, + Optim.method = "BFGS", + L.ipop = 0, + Maxiter.ipop = 1e10, + Margin.ipop = 0.05, + Sigf.ipop = 8, + genoud = TRUE + ) + saveRDS( synth.out, file = here::here( "tobacco_replication/Synth_result.rds" ) ) + +} + +synth.out = readRDS( here::here( "tobacco_replication/Synth_result.rds" ) ) + + +## ----Abadie Figure 2, echo=FALSE, include=FALSE, fig.height=4, fig.width=4.6----------------- +f2_plot_df <- cbind(rownames(dataprep.out$Y1plot), + dataprep.out$Y1plot, + dataprep.out$Y0plot %*% synth.out$solution.w) %>% + as_tibble() %>% + mutate_all(as.numeric) + +colnames(f2_plot_df) <- c('year', 'California', 'Synthetic California') + +f2_plot_df <- f2_plot_df %>% + reshape2::melt(id.vars = 'year', + variable.name = 'treat_group', + value.name = 'per_cap_cigs') + +path_plot <- ggplot(f2_plot_df, aes(x = year, y = per_cap_cigs, linetype = treat_group)) + + geom_line() + + geom_vline(linetype = 'dotted', xintercept = 1988) + + scale_linetype_manual(values = c('solid', 'dashed'), + breaks = c("California", "Synthetic California")) + + scale_y_continuous(breaks = seq(0, 141, 20), limits = c(0, 140)) + + scale_x_continuous(breaks = seq(1970, 2005, 5), limits = c(1970, 2005)) + + labs(linetype = NULL, + title = 'Abadie et al. (2010), Figure 2', + y = 'per-capita cigarette sales (in packs)') + + annotate('text', y = 41, x = 1987.5, label = 'Passage of Proposition 99 \u2192', + hjust = 1, color = 'black', size = 3, family = "Times New Roman") + + annotate('text', y = 0, x = 1970, label = 'ND replication: Synth package', + hjust = 0, color = 'darkseagreen', size = 3) + + replication_theme + +path_plot + + + + +## ----Abadie Figure 3, echo=FALSE, include=FALSE, fig.height=4, fig.width=4.6----------------- +gap <- dataprep.out$Y1plot - dataprep.out$Y0plot %*% synth.out$solution.w +year <- dataprep.out$tag$time.plot + +plot_df <- cbind(gap, year) %>% as_tibble() +colnames(plot_df) <- c('gap', 'year') + +gap_plot <- ggplot(plot_df, aes(x = year, y = gap)) + + geom_line() + + geom_vline(linetype = 'dotted', xintercept = 1988) + + geom_hline(linetype = 'dashed', yintercept = 0) + + scale_y_continuous(breaks = seq(-30, 30, 10), limits = c(-30, 30)) + + scale_x_continuous(breaks = seq(1970, 2005, 5), limits = c(1970, 2005)) + + labs(linetype = NULL, + title = 'Abadie et al. (2010), Figure 3', + y = 'gap in per-capita cigarette sales (in packs)') + + annotate('text', y = -25, x = 1987.5, label = 'Passage of Proposition 99 \u2192', + hjust = 1, color = 'black', size = 3, family = "Times New Roman") + + annotate('text', y = -30, x = 1970, label = 'ND replication: Synth package', + hjust = 0, color = 'darkseagreen', size = 3) + + replication_theme + +gap_plot + + + +## ----run tobacco model using augsynth-------------------------------------------------------- + +# Basic SCM with covariates +syn <- augsynth(form = cigsalepcap ~ treated | retprice + xxincome + K1_r_15_24, + unit = state, + time = year, + data = tobacco_70 ) + +sum = summary(syn, inf_type = 'permutation_rstat') + + +# Look at analysis with shifting initial year of treatment ---- + +do_year <- function( tob, yr ) { + tob <- tob %>% + dplyr::mutate( tx_yr = ifelse((state == 'California') & (year > yr), 1, 0) ) + + sum2 <- augsynth(form = cigsalepcap ~ tx_yr | retprice + xxincome + K1_r_15_24, + unit = state, + time = year, + data = tob ) %>% + summary(inf_type = 'permutation_rstat') + + sum2$att$tx_year = yr + sum2$att +} + +library( tidyverse ) +yrs <- map_df( 1975:1988, do_year, tob = tobacco_70 ) +#yrs$tx_year = as.factor(yrs$tx_year) +head( yrs ) + +ggplot( yrs, aes(x = Time, y = Estimate, color = tx_year, group=tx_year)) + + geom_hline(yintercept = 0, linetype = 'solid') + + geom_vline(xintercept = syn$t_int - 0.5, linetype = 'dashed') + + geom_line() + +# geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound, fill = name), alpha = 0.2, size = 0.1) + + #facet_wrap( ~ name) + + scale_x_continuous(breaks = seq(1970, 2010, 5)) + + labs(x = 'Year', y = 'Gap in per-capita cigarette sales\n(in packs)' ) + + theme_bw() + + theme(plot.caption = element_text(hjust = 0, color = 'darkseagreen', size = 9)) + + +head( yrs ) + +y85 = filter( yrs, tx_year %in% c( 1985, 1986, 1987, 1988 ) ) +ggplot( y85, aes(x = Time, y = Estimate)) + + geom_hline(yintercept = 0, linetype = 'solid') + + geom_vline(xintercept = syn$t_int - 0.5, linetype = 'dashed') + + geom_vline(xintercept = 1985 + 0.5, linetype = 'dashed') + + geom_line() + + geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound ), alpha = 0.2, size = 0.1) + + facet_wrap( ~ tx_year ) + + scale_x_continuous(breaks = seq(1970, 2010, 5)) + + labs(x = 'Year', y = 'Gap in per-capita cigarette sales\n(in packs)' ) + + theme_bw() + + theme(plot.caption = element_text(hjust = 0, color = 'darkseagreen', size = 9)) + + + +ggplot(all_results, aes(x = Time, y = Estimate, color = name)) + + geom_hline(yintercept = 0, linetype = 'solid') + + geom_vline(xintercept = syn$t_int - 0.5, linetype = 'dashed') + + geom_line() + + #geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound, fill = name), alpha = 0.2, size = 0.1) + + #facet_wrap( ~ name) + + scale_x_continuous(breaks = seq(1970, 2010, 5)) + + labs(x = 'Year', y = 'Gap in per-capita cigarette sales\n(in packs)' ) + + theme_bw() + + theme(plot.caption = element_text(hjust = 0, color = 'darkseagreen', size = 9)) + + + +## ----show_perm_inference--------------------------------------------------------------------- +syn_rstat = summary( syn, inf_type = "permutation_rstat" ) +placebo_distribution( syn_rstat ) + + +## ----fig.width=4.5, fig.height=3.25---------------------------------------------------------- +plot(syn) + labs(title = 'Conformal inference (default)') + + +## ----fig.width=4.5, fig.height=4------------------------------------------------------------- +plot(syn_rstat, + plot_type = 'placebo', + inf_type = 'permutation', +) + ggtitle("Replication of Abadie et al. (2010), Figure 4") + + +## ----fig.width=4.5, fig.height=3.25---------------------------------------------------------- +p <- plot(syn_rstat, plot_type = 'estimate only') + ggtitle("Replication of Abadie et al. (2010), Figure 3") + + +## ----Format figures 3 comparison, echo=FALSE, fig.show="hold", fig.align='default', fig.height=4, fig.width=4.6---- +gap_plot +p + + +## ----fig.width=4.5, fig.height=4------------------------------------------------------------- +p <- plot(syn_rstat, plot_type = 'outcomes') + + ggtitle("Replication of Abadie et al. (2010), Figure 2") + + +## ----Format figures 2 comparison, echo=FALSE, fig.show="hold", fig.align='default', fig.height=4, fig.width=4.6---- +path_plot +p + + +## ----Outcomes plot with average, echo=TRUE, fig.show="hold", fig.align='default', fig.height=4, fig.width=4.6---- +plot(syn_rstat, plot_type="outcomes raw average") + + +## -------------------------------------------------------------------------------------------- +donor_table(syn_rstat) %>% + arrange( -abs(weight) ) + + +## ----Figures 6 and 7, echo=TRUE, eval=TRUE, fig.show="hold", fig.align='default', fig.height=4.5, fig.width=4.6---- +update_augsynth(syn) %>% # drops units with >20x treated RMSPE by default + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo' ) + + ggtitle("Replication of Abadie et al. (2010), Figure 5") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, label = "Removes donors with 20x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) +update_augsynth(syn, drop = 5) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + + ggtitle("Replication of Abadie et al. (2010), Figure 6") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, label = "Removes donors with 5x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) +update_augsynth(syn, drop = 2) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + + ggtitle("Replication of Abadie et al. (2010), Figure 7") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, label = "Removes donors with 2x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) + + +## ----permutation without states, echo=TRUE, eval=TRUE, fig.show="hold", fig.align='default', fig.height=4.5, fig.width=4.6---- +drop_states <- c("Iowa", "Arizona", "Alabama", "Illinois", "Indiana", "Idaho", "Connecticut", + "New Mexico", "Texas", "Utah", "North Dakota", "South Dakota", "Vermont", + "Wisconsin", "West Virginia", "Wyoming", "Tennessee", "Pennsylvania") + +update_augsynth(syn, drop = drop_states) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + diff --git a/tobacco_replication/tobacco_replication.R b/tobacco_replication/tobacco_replication.R new file mode 100644 index 0000000..6140b8b --- /dev/null +++ b/tobacco_replication/tobacco_replication.R @@ -0,0 +1,268 @@ +## ----setup, include = FALSE------------------------------------------------------------------ + +knitr::opts_chunk$set( + collapse = FALSE, + message = FALSE, + warning = FALSE, + fig.align = 'center', + comment = "#>" +) + +## Install Synth if not already installed +# install.packages("Synth") + +library(kableExtra) +library(magrittr) +library(dplyr) +library(ggplot2) +library(reshape2) +library(tidyr) +library(rlang) + +library(foreign) +library(graphics) +library(Synth) +library(augsynth) +data(kansas) + +## For the Abadie plots +replication_theme <- theme_bw(base_family = "Times New Roman") + + theme(panel.grid = element_blank(), + legend.background = element_rect(color = 'black'), + legend.spacing.y = unit(0.0, 'pt'), + legend.box.margin = margin(t = 5, r = 5, unit = 'pt'), + legend.position = c(0.81, 0.9)) + +set.seed(1234) + + +## ----import tobacco data, include=FALSE------------------------------------------------------ +# Get working data CA and 38 control states +Wk.data <- read.dta(here::here("./tobacco_replication/data/smoking_wkdata_39.dta")) + +# load in "Rest of US" data (computed from 50 states) for figure 1 +RestUS.50 <- read.dta(here::here("./tobacco_replication/data/restofus_50.dta")) + +# this data is used for table 1, where we average the 38 control states +RestUS.38 <- read.dta(here::here("./tobacco_replication/data/restofus_38.dta")) + + +## ----Synth dataprep and model, echo=FALSE, cache=TRUE, include=FALSE------------------------- +### Run Baseline Model + +dataprep.out <- dataprep( + foo = Wk.data, + predictors = c("retprice", "xxincome", "K1_r_15_24"), + predictors.op = c("mean"), + dependent = c('cigsalepcap'), + unit.variable = c('index'), + time.variable = c('year'), + special.predictors = list(list("xxbeer", 1984:1988, c("mean")), + list("cigsalepcap", 1988, c("mean")), + list("cigsalepcap", 1980, c("mean")), + list("cigsalepcap", 1975, c("mean")) + ), + treatment.identifier = 3, + controls.identifier = c(1:39)[-3], + time.predictors.prior = 1980:1988, + time.optimize.ssr = 1970:1988, + unit.names.variable = c('name'), + time.plot = 1970:2005 +) + +### run synth + +synth.out <- synth( + dataprep.out, + Pop.size = 300, + Max.generations = 1, + Unif.seed = 356987, + Int.seed = 627478, + Optim.method = "BFGS", + L.ipop = 0, + Maxiter.ipop = 1e10, + Margin.ipop = 0.05, + Sigf.ipop = 8, + genoud = TRUE +) + + +## ----Abadie Figure 2, echo=FALSE, include=FALSE, fig.height=4, fig.width=4.6----------------- +f2_plot_df <- cbind(rownames(dataprep.out$Y1plot), + dataprep.out$Y1plot, + dataprep.out$Y0plot %*% synth.out$solution.w) %>% + as_tibble() %>% + mutate_all(as.numeric) + +colnames(f2_plot_df) <- c('year', 'California', 'Synthetic California') + +f2_plot_df <- f2_plot_df %>% + reshape2::melt(id.vars = 'year', + variable.name = 'treat_group', + value.name = 'per_cap_cigs') + +path_plot <- ggplot(f2_plot_df, aes(x = year, y = per_cap_cigs, linetype = treat_group)) + + geom_line() + + geom_vline(linetype = 'dotted', xintercept = 1988) + + scale_linetype_manual(values = c('solid', 'dashed'), + breaks = c("California", "Synthetic California")) + + scale_y_continuous(breaks = seq(0, 141, 20), limits = c(0, 140)) + + scale_x_continuous(breaks = seq(1970, 2005, 5), limits = c(1970, 2005)) + + labs(linetype = NULL, + title = 'Abadie et al. (2010), Figure 2', + y = 'per-capita cigarette sales (in packs)') + + annotate('text', y = 41, x = 1987.5, label = 'Passage of Proposition 99 \u2192', + hjust = 1, color = 'black', size = 3, family = "Times New Roman") + + annotate('text', y = 0, x = 1970, label = 'ND replication: Synth package', + hjust = 0, color = 'darkseagreen', size = 3) + + replication_theme + +path_plot + + +## ----Abadie Figure 3, echo=FALSE, include=FALSE, fig.height=4, fig.width=4.6----------------- +gap <- dataprep.out$Y1plot - dataprep.out$Y0plot %*% synth.out$solution.w +year <- dataprep.out$tag$time.plot + +plot_df <- cbind(gap, year) %>% as_tibble() +colnames(plot_df) <- c('gap', 'year') + +gap_plot <- ggplot(plot_df, aes(x = year, y = gap)) + + geom_line() + + geom_vline(linetype = 'dotted', xintercept = 1988) + + geom_hline(linetype = 'dashed', yintercept = 0) + + scale_y_continuous(breaks = seq(-30, 30, 10), limits = c(-30, 30)) + + scale_x_continuous(breaks = seq(1970, 2005, 5), limits = c(1970, 2005)) + + labs(linetype = NULL, + title = 'Abadie et al. (2010), Figure 3', + y = 'gap in per-capita cigarette sales (in packs)') + + annotate('text', y = -25, x = 1987.5, label = 'Passage of Proposition 99 \u2192', + hjust = 1, color = 'black', size = 3, family = "Times New Roman") + + annotate('text', y = -30, x = 1970, label = 'ND replication: Synth package', + hjust = 0, color = 'darkseagreen', size = 3) + + replication_theme + +gap_plot + + +## ----reformat tobacco data using tidyverse, echo=FALSE--------------------------------------- +tobacco <- Wk.data %>% + mutate(treated = ifelse((name == 'California') & + (year > 1988), 1, 0), + state = name, # rename something meaningful + state_index = index) %>% + select(cigsalepcap, treated, state, year, retprice, xxincome, K1_r_15_24, xxbeer) + +tobacco_70 <- tobacco %>% filter(year >= 1970) # subset data to 1970 forward to avoid issues due to missingess of outcomes + + +## ----run tobacco model using augsynth-------------------------------------------------------- +# Basic SCM with covariates +syn <- augsynth(form = cigsalepcap ~ treated | retprice + xxincome + K1_r_15_24, + unit = state, + time = year, + data = tobacco_70 + #progfunc = "none", scm=TRUE +) + + +## ----plot augsynth kansas models, echo=FALSE, cache=TRUE, fig.width=8, fig.height=4.5-------- +all_results <- bind_rows(Conformal = summary(syn, inf_type = 'conformal')$att, + Jackknife = summary(syn, inf_type = 'jackknife')$att, + `Jackknife+` = summary(syn, inf_type = 'jackknife+')$att, + Permutation = summary(syn, inf_type = 'permutation')$att, + `Permutation (rstat)` = summary(syn, inf_type = 'permutation_rstat')$att, + .id = 'name') %>% + mutate(name = factor(name, + levels = c('Conformal', 'Jackknife', 'Jackknife+', "Permutation", "Permutation (rstat)"), + ordered = T)) + +ggplot(all_results, aes(x = Time, y = Estimate, color = name)) + + geom_hline(yintercept = 0, linetype = 'solid') + + geom_vline(xintercept = syn$t_int, linetype = 'dashed') + + geom_line() + + geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound, fill = name), alpha = 0.2, size = 0.1) + + facet_wrap(. ~ name) + + scale_x_continuous(breaks = seq(1970, 2010, 5)) + + labs(x = 'Year', y = 'Gap in per-capita cigarette sales\n(in packs)', + caption = 'ND replication: augsynth package') + + scale_color_manual(values = c('red', 'blue', 'darkgreen', 'purple', 'orange'), + guide = 'none', aesthetics = c('color', 'fill')) + + theme_bw() + + theme(plot.caption = element_text(hjust = 0, color = 'darkseagreen', size = 9)) + + +## ----show_perm_inference--------------------------------------------------------------------- +syn_rstat = summary( syn, inf_type = "permutation_rstat" ) +placebo_distribution( syn_rstat ) + + +## ----fig.width=4.5, fig.height=3.25---------------------------------------------------------- +plot(syn) + labs(title = 'Conformal inference (default)') + + +## ----fig.width=4.5, fig.height=4------------------------------------------------------------- +plot(syn_rstat, + plot_type = 'placebo', + inf_type = 'permutation', +) + ggtitle("Replication of Abadie et al. (2010), Figure 4") + + +## ----fig.width=4.5, fig.height=3.25---------------------------------------------------------- +p <- plot(syn_rstat, plot_type = 'estimate only') + ggtitle("Replication of Abadie et al. (2010), Figure 3") + + +## ----Format figures 3 comparison, echo=FALSE, fig.show="hold", fig.align='default', fig.height=4, fig.width=4.6---- +gap_plot +p + + +## ----fig.width=4.5, fig.height=4------------------------------------------------------------- +p <- plot(syn_rstat, plot_type = 'outcomes') + + ggtitle("Replication of Abadie et al. (2010), Figure 2") + + +## ----Format figures 2 comparison, echo=FALSE, fig.show="hold", fig.align='default', fig.height=4, fig.width=4.6---- +path_plot +p + + +## ----Outcomes plot with average, echo=TRUE, fig.show="hold", fig.align='default', fig.height=4, fig.width=4.6---- +plot(syn_rstat, plot_type="outcomes raw average") + + +## -------------------------------------------------------------------------------------------- +donor_table(syn_rstat) %>% + arrange( -abs(weight) ) + + +## ----Figures 6 and 7, echo=TRUE, eval=TRUE, fig.show="hold", fig.align='default', fig.height=4.5, fig.width=4.6---- +update_augsynth(syn) %>% # drops units with >20x treated RMSPE by default + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo' ) + + ggtitle("Replication of Abadie et al. (2010), Figure 5") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, label = "Removes donors with 20x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) +update_augsynth(syn, drop = 5) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + + ggtitle("Replication of Abadie et al. (2010), Figure 6") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, label = "Removes donors with 5x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) +update_augsynth(syn, drop = 2) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + + ggtitle("Replication of Abadie et al. (2010), Figure 7") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, label = "Removes donors with 2x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) + + +## ----permutation without states, echo=TRUE, eval=TRUE, fig.show="hold", fig.align='default', fig.height=4.5, fig.width=4.6---- +drop_states <- c("Iowa", "Arizona", "Alabama", "Illinois", "Indiana", "Idaho", "Connecticut", + "New Mexico", "Texas", "Utah", "North Dakota", "South Dakota", "Vermont", + "Wisconsin", "West Virginia", "Wyoming", "Tennessee", "Pennsylvania") + +update_augsynth(syn, drop = drop_states) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + diff --git a/tobacco_replication/tobacco_replication.Rmd b/tobacco_replication/tobacco_replication.Rmd new file mode 100644 index 0000000..1cae702 --- /dev/null +++ b/tobacco_replication/tobacco_replication.Rmd @@ -0,0 +1,354 @@ +--- +title: "Replicating Abadie et al. (2010) with the `augsynth` package" +author: "Delaney & Miratrix" +date: "2024-11-16" +output: html_document +editor_options: + chunk_output_type: console +--- + + + + + +```{r setup, include = FALSE} +library( tidyverse ) +knitr::opts_chunk$set( + collapse = FALSE, + message = FALSE, + warning = FALSE, + fig.align = 'center', + comment = "#>" +) + +## Install Synth if not already installed +# install.packages("Synth") + +library(kableExtra) +library(magrittr) +library(dplyr) +library(ggplot2) +library(reshape2) +library(tidyr) +library(rlang) + +library(foreign) +library(graphics) +library(Synth) +library(augsynth) + +## For the Abadie plots +replication_theme <- theme_bw(base_family = "Times New Roman") + + theme(panel.grid = element_blank(), + legend.background = element_rect(color = 'black'), + legend.spacing.y = unit(0.0, 'pt'), + legend.box.margin = margin(t = 5, r = 5, unit = 'pt'), + legend.position = c(0.81, 0.9)) + +set.seed(1234) +``` + +This vignette replicates the analysis and figures in [Abadie et al. (2010)](https://web.stanford.edu/~jhain/Paper/JASA2010.pdf) using the `augsynth` package. +In particular, our recent extensions to the `augsynth` package streamline permutation-based inference and associated visualizations on behalf of the user. +In the following, we show how to easily generate results similar (but not entirely identical) to Figures 2-7 in the original paper. + +While the general intuition of Abadie et al.'s analysis is replicated in the `augsynth` package, it is worth noting that covariates and model parameters are handled somewhat differently than in the original analysis. In particular, unlike the `augsynth` package, the `Synth` package used in Abadie et al.'s analysis reweights a subset of the lagged outcomes and passed auxiliary covariates when generating synthetic weights. +The `augsynth` package, by contrast, directly balances _all_ pre-treatment outcomes along with auxillary covariates, which means it does not dynamically determine variable importance weights. + +In this vignette we also include `Synth` code to generate the reported Abadie et al.'s results with a high degree of fidelity (with the resulting `Synth`-based plots shown adjacent to analogous output from `augsynth` models) for comparison purposes. +We do note that we show _all_ the post-treatment years, while the original paper only shows the first 10 years post-treatment. + + +```{r import tobacco data, include=FALSE} +# Get working data CA and 38 control states +Wk.data <- read.dta(here::here("./tobacco_replication/data/smoking_wkdata_39.dta")) +``` + +```{r Synth dataprep and model, echo=FALSE, cache=TRUE, include=FALSE} +### Run Baseline Model + +# Notes +# `retprice` is the state average retail price of cigarettes (in cents) +# `K1_r_15_24` is the share of the state population aged 15-24 + +dataprep.out <- dataprep( + foo = Wk.data, + predictors = c("retprice", "xxincome", "K1_r_15_24"), + predictors.op = c("mean"), + dependent = c('cigsalepcap'), + unit.variable = c('index'), + time.variable = c('year'), + special.predictors = list(list("xxbeer", 1984:1988, c("mean")), + list("cigsalepcap", 1988, c("mean")), + list("cigsalepcap", 1980, c("mean")), + list("cigsalepcap", 1975, c("mean")) + ), + treatment.identifier = 3, + controls.identifier = c(1:39)[-3], + time.predictors.prior = 1980:1988, + time.optimize.ssr = 1970:1988, + unit.names.variable = c('name'), + time.plot = 1970:2005 +) + +### run synth + +synth.out <- synth( + dataprep.out, + Pop.size = 300, + Max.generations = 1, + Unif.seed = 356987, + Int.seed = 627478, + Optim.method = "BFGS", + L.ipop = 0, + Maxiter.ipop = 1e10, + Margin.ipop = 0.05, + Sigf.ipop = 8, + genoud = TRUE +) +``` + +```{r Abadie Figure 2, echo=FALSE, include=FALSE, fig.height=4, fig.width=4.6} +f2_plot_df <- cbind(rownames(dataprep.out$Y1plot), + dataprep.out$Y1plot, + dataprep.out$Y0plot %*% synth.out$solution.w) %>% + as_tibble() %>% + mutate_all(as.numeric) + +colnames(f2_plot_df) <- c('year', 'California', 'Synthetic California') + +f2_plot_df <- f2_plot_df %>% + reshape2::melt(id.vars = 'year', + variable.name = 'treat_group', + value.name = 'per_cap_cigs') + +path_plot <- ggplot(f2_plot_df, aes(x = year, y = per_cap_cigs, linetype = treat_group)) + + geom_line() + + geom_vline(linetype = 'dotted', xintercept = 1988) + + scale_linetype_manual(values = c('solid', 'dashed'), + breaks = c("California", "Synthetic California")) + + scale_y_continuous(breaks = seq(0, 141, 20), limits = c(0, 140)) + + scale_x_continuous(breaks = seq(1970, 2005, 5), limits = c(1970, 2005)) + + labs(linetype = NULL, + title = 'Abadie et al. (2010), Figure 2', + y = 'per-capita cigarette sales (in packs)') + + annotate('text', y = 41, x = 1987.5, label = 'Passage of Proposition 99 \u2192', + hjust = 1, color = 'black', size = 3, family = "Times New Roman") + + replication_theme + +path_plot +``` + +```{r Abadie Figure 3, echo=FALSE, include=FALSE, fig.height=4, fig.width=4.6} +gap <- dataprep.out$Y1plot - dataprep.out$Y0plot %*% synth.out$solution.w +year <- dataprep.out$tag$time.plot + +plot_df <- cbind(gap, year) %>% as_tibble() +colnames(plot_df) <- c('gap', 'year') + +gap_plot <- ggplot(plot_df, aes(x = year, y = gap)) + + geom_line() + + geom_vline(linetype = 'dotted', xintercept = 1988) + + geom_hline(linetype = 'dashed', yintercept = 0) + + scale_y_continuous(breaks = seq(-30, 30, 10), limits = c(-30, 30)) + + scale_x_continuous(breaks = seq(1970, 2005, 5), limits = c(1970, 2005)) + + labs(linetype = NULL, + title = 'Abadie et al. (2010), Figure 3', + y = 'gap in per-capita cigarette sales (in packs)') + + annotate('text', y = -25, x = 1987.5, label = 'Passage of Proposition 99 \u2192', + hjust = 1, color = 'black', size = 3, family = "Times New Roman") + + annotate('text', y = -30, x = 1970, label = 'ND replication: Synth package', + hjust = 0, color = 'darkseagreen', size = 3) + + replication_theme + +gap_plot +``` + +### Replication with augsynth + +When fitting our model, we found that using `progfunc = "none"` (meaning no ridge regression bias adjustment step, which would be seemingly more similar to Abadie) then the results deviate more strongly from the tobacco paper, but if we allow augsynth to do its default bias removal, we more closely replicate the paper. +Thus, we use ridge regresssion in this vignette, but then use the "R-statistic" permutation approahc from Abadie for inference: + +```{r reformat tobacco data using tidyverse, echo=FALSE} +tobacco <- Wk.data %>% + mutate(treated = ifelse((name == 'California') & + (year > 1988), 1, 0), + state = name, # rename something meaningful + state_index = index) %>% + select(cigsalepcap, treated, state, year, retprice, xxincome, K1_r_15_24, xxbeer) + +tobacco_70 <- tobacco %>% filter(year >= 1970) # subset data to 1970 forward to avoid issues due to missingess of outcomes + +# Make beer variable and rescale proportion young people +tobacco_70 <- tobacco_70 %>% group_by( state) %>% + mutate( beer = mean( xxbeer[ year %in% 1984:1988 ], na.rm=TRUE ), + retail = mean( retprice[ year %in% 1980:1988 ], na.rm=TRUE ), + K1_r_15_24 = 100 * K1_r_15_24 ) %>% + ungroup() + +``` + +```{r run tobacco model using augsynth} +# Basic SCM with covariates +syn <- augsynth(form = cigsalepcap ~ treated | retail + xxincome + K1_r_15_24 + beer, + unit = state, + time = year, + data = tobacco_70 + #progfunc = "none" +) +syn_rstat <- summary( syn, inf_type = 'permutation_rstat' ) +``` + +To get the outcome plot (Figure 2), call `plot(plot_type = 'outcomes')` on an `augsynth` object or summary object: + +```{r fig.width=4.5, fig.height=4} +p <- plot(syn_rstat, plot_type = 'outcomes raw average') + + ggtitle("Replication of Abadie et al. (2010), Figure 2") +``` + +```{r Format figures 2 comparison, echo=FALSE, fig.show="hold", fig.align='default', fig.height=4, fig.width=4.6} +path_plot +p +``` + +Note our version has a bit of "Figure 1" in it, with the raw outcome trajectory. +If desired, setting `plot_type = 'outcomes'` will drop the raw average of donor units and the synthetic counterfactual. + +To get Table 1, the comparison of the means of various covariates, we can use +```{r} +covariate_balance_table( syn, pre_period = c( 1975, 1980, 1988 ) ) %>% + knitr::kable( digits = 2 ) +``` + + +For Figure 3, we compare the estimated impact to that calculated in the original paper: + +```{r fig.width=4.5, fig.height=3.25} +p <- plot(syn_rstat, plot_type = 'estimate only') + + ggtitle("Replication of Abadie et al. (2010), Figure 3") + + scale_y_continuous(limits = c(-30, 30)) +``` + +```{r Format figures 3 comparison, echo=FALSE, fig.show="hold", fig.align='default', fig.height=4, fig.width=4.6} +gap_plot +p +``` + +We see a bit more choppyness than the Synth package. + + +We can finally plot our result, using `plot_type = "placebo"`, to obtain a plot showing each placebo trajectory: + +```{r fig.width=4.5, fig.height=4} +plot(syn_rstat, plot_type = 'placebo' ) + + ggtitle("Replication of Abadie et al. (2010), Figure 4") +``` + +We can get the raw trajectories via +```{r} +placebo_distribution( syn_rstat ) +``` + +To see what makes up our synthetic unit, we can use `donor_table()` to obtain a dataframe with the RMSPE and synthetic weight for each of the donor units. + +```{r} +donor_table(syn_rstat) %>% + arrange( -abs(weight) ) +``` + +California is apparently mainly a mix of Colorado and Connecticut, with a bit of Utah and Nevada thrown in. +The negative weights, and the lack of sparse weights, are from the ridge regresssion. +Without that second stage, these weights would generally be sparse, as in the original paper. +Here we drop the ridge step, and see what we get: + +```{r} +syn_classic <- augsynth(form = cigsalepcap ~ treated | retail + xxincome + K1_r_15_24 + beer, + unit = state, + time = year, + data = tobacco_70, + progfunc = "none" ) +donor_table(syn_classic) %>% + filter( weight!= 0 ) %>% + arrange( -abs(weight) ) +``` + +A small number of units. The weights differ from Abadie, as we are using an overall different strategy. + + +### Sensitivity checks by limiting the donor pool + +The augsynth models can be re-fitted after excluding certain donor units via the `update_augsynth()` method. +The main method is to set `drop` to a multiplier for the treatment unit RMSPE. +All units with a RMSPE larger than that value will be dropped: + +Here we replicate Figures 5, 6, and 7 from Abadie et al. (2010) by dropping donor units with pre-treatment RMSPE greater than 20, 5, and 2 times California's pre-treatment RMSPE, respectively: + +```{r Figures 6 and 7, echo=TRUE, eval=TRUE, fig.show="hold", fig.align='default', fig.height=4.5, fig.width=4.6} +update_augsynth(syn) %>% # drops units with >20x treated RMSPE by default + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo' ) + + ggtitle("Replication of Abadie et al. (2010), Figure 5") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, + label = "Removes donors with 20x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) +update_augsynth(syn, drop = 5) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + + ggtitle("Replication of Abadie et al. (2010), Figure 6") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, + label = "Removes donors with 5x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) +update_augsynth(syn, drop = 2) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + + ggtitle("Replication of Abadie et al. (2010), Figure 7") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, + label = "Removes donors with 2x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) +``` + +Alternatively, the names of units to be excluded can be passed in directly as characters using `drop`. + +```{r permutation without states, echo=TRUE, eval=TRUE, fig.show="hold", fig.align='default', fig.height=4.5, fig.width=4.6} +drop_states <- c("Iowa", "Arizona", "Alabama", "Illinois", "Indiana", "Idaho", "Connecticut", + "New Mexico", "Texas", "Utah", "North Dakota", "South Dakota", "Vermont", + "Wisconsin", "West Virginia", "Wyoming", "Tennessee", "Pennsylvania") + +update_augsynth(syn, drop = drop_states) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') +``` + + +# The RMSPE plot + +Finally, we can make Figure 8--interestingly, this does not replicate well, with CA no longer being an extreme outlier (although it still is in the tail of the distribution). +It looks like all of the outliers have pulled back substantially, vs. the original Figure 8. + +```{r} + +rms <- placebo_distribution( syn_rstat ) %>% + mutate( post = ifelse( year > 1988, "post", "pre" ), + Tx = ifelse( state == "California", "CA", "synth" ) ) %>% + group_by( state, post, Tx ) %>% + summarise( MSPE = mean( (Yhat - Yobs)^2 ), .groups = "drop" ) %>% + pivot_wider( names_from = post, values_from = MSPE ) %>% + mutate( ratio = post / pre ) + +ggplot( rms, aes( ratio, fill=Tx ) ) + + geom_dotplot( alpha = 0.5, method="histodot", binwidth = 2 ) + + ggtitle("RMSPE ratio of post to pre-treatment periods") + + scale_x_continuous( limits = c(0, 130 ), breaks = seq( 0,120,by=20) ) + + replication_theme +``` diff --git a/tobacco_replication/tobacco_replication.html b/tobacco_replication/tobacco_replication.html new file mode 100644 index 0000000..5114795 --- /dev/null +++ b/tobacco_replication/tobacco_replication.html @@ -0,0 +1,638 @@ + + + + + + + + + + + + + + + +Replicating Abadie et al. (2010) with the augsynth package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + +

This vignette replicates the analysis and figures in Abadie et +al. (2010) using the augsynth package. In particular, +our recent extensions to the augsynth package streamline +permutation-based inference and associated visualizations on behalf of +the user. In the following, we show how to easily generate results +similar (but not entirely identical) to Figures 2-7 in the original +paper.

+

While the general intuition of Abadie et al.’s analysis is replicated +in the augsynth package, it is worth noting that covariates +and model parameters are handled somewhat differently than in the +original analysis. In particular, unlike the augsynth +package, the Synth package used in Abadie et al.’s analysis +reweights a subset of the lagged outcomes and passed auxiliary +covariates when generating synthetic weights. The augsynth +package, by contrast, directly balances all pre-treatment +outcomes along with auxillary covariates, which means it does not +dynamically determine variable importance weights.

+

In this vignette we also include Synth code to generate +the reported Abadie et al.’s results with a high degree of fidelity +(with the resulting Synth-based plots shown adjacent to +analogous output from augsynth models) for comparison +purposes. We do note that we show all the post-treatment years, +while the original paper only shows the first 10 years +post-treatment.

+
+

Replication with augsynth

+

When fitting our model, we found that using +progfunc = "none" (meaning no ridge regression bias +adjustment step, which would be seemingly more similar to Abadie) then +the results deviate more strongly from the tobacco paper, but if we +allow augsynth to do its default bias removal, we more closely replicate +the paper. Thus, we use ridge regresssion in this vignette, but then use +the “R-statistic” permutation approahc from Abadie for inference:

+
# Basic SCM with covariates
+syn <- augsynth(form = cigsalepcap ~ treated | retail + xxincome + K1_r_15_24 + beer,
+                unit = state,
+                time = year,
+                data = tobacco_70
+                #progfunc = "none"
+)
+syn_rstat <- summary( syn, inf_type = 'permutation_rstat' )
+

To get the outcome plot (Figure 2), call +plot(plot_type = 'outcomes') on an augsynth +object or summary object:

+
p <- plot(syn_rstat, plot_type = 'outcomes raw average') + 
+    ggtitle("Replication of Abadie et al. (2010), Figure 2")
+

+

Note our version has a bit of “Figure 1” in it, with the raw outcome +trajectory. If desired, setting plot_type = 'outcomes' will +drop the raw average of donor units and the synthetic +counterfactual.

+

To get Table 1, the comparison of the means of various covariates, we +can use

+
covariate_balance_table( syn, pre_period = c( 1975, 1980, 1988 ) ) %>%
+    knitr::kable( digits = 2 )
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
variableTxCoRaw
retail89.4290.4087.27
xxincome10.039.979.79
K1_r_15_2417.8717.8817.83
beer24.2823.2523.66
1975127.10121.48136.93
1980120.20123.65138.09
198890.1096.35113.82
+

For Figure 3, we compare the estimated impact to that calculated in +the original paper:

+
p <- plot(syn_rstat, plot_type = 'estimate only') +
+    ggtitle("Replication of Abadie et al. (2010), Figure 3") +
+    scale_y_continuous(limits = c(-30, 30))
+

+

We see a bit more choppyness than the Synth package.

+

We can finally plot our result, using +plot_type = "placebo", to obtain a plot showing each +placebo trajectory:

+
plot(syn_rstat, plot_type = 'placebo' ) + 
+    ggtitle("Replication of Abadie et al. (2010), Figure 4")
+

+

We can get the raw trajectories via

+
placebo_distribution( syn_rstat )
+
#> # A tibble: 1,404 × 8
+#>    state     trt  year  Yhat  Yobs     ATT RMSPE   rstat
+#>    <chr>   <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>   <dbl>
+#>  1 Alabama     0  1970  91.5  89.8 -1.67    1.14 -1.47  
+#>  2 Alabama     0  1971  96.5  95.4 -1.06    1.14 -0.926 
+#>  3 Alabama     0  1972 100.  101.   1.08    1.14  0.941 
+#>  4 Alabama     0  1973 104.  103.  -1.15    1.14 -1.00  
+#>  5 Alabama     0  1974 108.  108.   0.0232  1.14  0.0203
+#>  6 Alabama     0  1975 112.  112.  -0.269   1.14 -0.236 
+#>  7 Alabama     0  1976 115.  116.   1.10    1.14  0.964 
+#>  8 Alabama     0  1977 117.  117.  -0.149   1.14 -0.130 
+#>  9 Alabama     0  1978 120.  123    2.62    1.14  2.29  
+#> 10 Alabama     0  1979 120.  121.   1.38    1.14  1.21  
+#> # ℹ 1,394 more rows
+

To see what makes up our synthetic unit, we can use +donor_table() to obtain a dataframe with the RMSPE and +synthetic weight for each of the donor units.

+
donor_table(syn_rstat) %>%
+    arrange( -abs(weight) )
+
#> # A tibble: 38 × 3
+#>    state        weight RMSPE
+#>    <chr>         <dbl> <dbl>
+#>  1 Colorado     0.545   2.68
+#>  2 Connecticut  0.363   2.69
+#>  3 Utah         0.106   2.70
+#>  4 Nevada       0.0935  5.11
+#>  5 Mississippi -0.0792  1.66
+#>  6 Arkansas    -0.0689  1.73
+#>  7 Virginia     0.0560  1.95
+#>  8 New Mexico   0.0522  1.43
+#>  9 Oklahoma    -0.0480  2.27
+#> 10 Tennessee   -0.0445  1.46
+#> # ℹ 28 more rows
+

California is apparently mainly a mix of Colorado and Connecticut, +with a bit of Utah and Nevada thrown in. The negative weights, and the +lack of sparse weights, are from the ridge regresssion. Without that +second stage, these weights would generally be sparse, as in the +original paper. For example:

+
syn_classic <- augsynth(form = cigsalepcap ~ treated | retail + xxincome + K1_r_15_24 + beer,
+                unit = state,
+                time = year,
+                data = tobacco_70,
+                progfunc = "none" )
+donor_table(syn_classic) %>%
+    filter( weight!= 0 ) %>%
+    arrange( -abs(weight) )
+
#> # A tibble: 5 × 3
+#>   state       weight RMSPE
+#>   <chr>        <dbl> <dbl>
+#> 1 Colorado    0.564   4.62
+#> 2 Connecticut 0.342   5.30
+#> 3 Utah        0.0508 24.4 
+#> 4 New Mexico  0.0274  3.58
+#> 5 Nevada      0.0156  9.17
+
+
+

Sensitivity checks by limiting the donor pool

+

The augsynth models can be re-fitted after excluding certain donor +units via the update_augsynth() method. The main method is +to set drop to a multiplier for the treatment unit RMSPE. +All units with a RMSPE larger than that value will be dropped:

+
update_augsynth(syn) %>% # drops units with >20x treated RMSPE by default
+    summary( inf_type = 'permutation' ) %>%
+    plot(plot_type = 'placebo' ) + 
+    ggtitle("Replication of Abadie et al. (2010), Figure 5") + ylim(-51, 91) + 
+    annotate('text', y = -48, x = 1970, 
+             label = "Removes donors with 20x California's  pre-treatment RMSPE",
+             hjust = 0, color = 'darkseagreen', size = 3) 
+update_augsynth(syn, drop = 5) %>% 
+    summary( inf_type = 'permutation' ) %>%
+    plot(plot_type = 'placebo') + 
+    ggtitle("Replication of Abadie et al. (2010), Figure 6") + ylim(-51, 91) + 
+    annotate('text', y = -48, x = 1970, 
+             label = "Removes donors with 5x California's pre-treatment RMSPE",
+             hjust = 0, color = 'darkseagreen', size = 3) 
+update_augsynth(syn, drop = 2) %>% 
+    summary( inf_type = 'permutation' ) %>%
+    plot(plot_type = 'placebo') + 
+    ggtitle("Replication of Abadie et al. (2010), Figure 7") + ylim(-51, 91) + 
+    annotate('text', y = -48, x = 1970, 
+             label = "Removes donors with 2x California's pre-treatment  RMSPE",
+             hjust = 0, color = 'darkseagreen', size = 3) 
+

+

Alternatively, the names of units to be excluded can be passed in +directly as characters using drop.

+
drop_states <- c("Iowa", "Arizona", "Alabama", "Illinois", "Indiana", "Idaho", "Connecticut", 
+                 "New Mexico", "Texas", "Utah", "North Dakota", "South Dakota", "Vermont", 
+                 "Wisconsin", "West Virginia", "Wyoming", "Tennessee", "Pennsylvania")
+
+update_augsynth(syn, drop = drop_states) %>% 
+    summary( inf_type = 'permutation' ) %>%
+    plot(plot_type = 'placebo') 
+

+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/tobacco_replication/tobacco_script.R b/tobacco_replication/tobacco_script.R new file mode 100644 index 0000000..9ae6011 --- /dev/null +++ b/tobacco_replication/tobacco_script.R @@ -0,0 +1,138 @@ + +# Script replicating Abadie 2010 using augsynth + +library( tidyverse ) +library(augsynth) + + +set.seed(1234) + + +## ----reformat tobacco data using tidyverse, echo=FALSE----------------------------- +Wk.data <- read.dta(here::here("./tobacco_replication/data/smoking_wkdata_39.dta")) +tobacco <- Wk.data %>% + mutate(treated = ifelse((name == 'California') & + (year > 1988), 1, 0), + state = name, # rename something meaningful + state_index = index) %>% + select(cigsalepcap, treated, state, year, retprice, xxincome, K1_r_15_24, xxbeer) + +tobacco_70 <- tobacco %>% filter(year >= 1970) # subset data to 1970 forward to avoid issues due to missingess of outcomes +rm( tobacco ) +rm( Wk.data ) + +# Make beer variable +tobacco_70 <- tobacco_70 %>% group_by( state) %>% + mutate( beer = mean( xxbeer[ year %in% 1984:1988 ], na.rm=TRUE ), + retail = mean( retprice[ year %in% 1980:1988 ], na.rm=TRUE ), + K1_r_15_24 = 100 * K1_r_15_24 ) %>% + ungroup() + + +## ----run tobacco model using augsynth---------------------------------------------- + +rs = rnorm( length( unique( tobacco_70$state ) ), sd=0.2 ) +names(rs) <- unique( tobacco_70$state ) +tobacco_70$fnk = tobacco_70$year - 1970 + rs[ tobacco_70$state ] +summary( tobacco_70$fnk ) +syn <- augsynth(form = cigsalepcap ~ treated | retail + xxincome + K1_r_15_24 + beer + fnk, + unit = state, + time = year, + data = tobacco_70 ) + +ggplot( tobacco_70, aes( year, retprice, group = state ) ) + + geom_line() +filter( tobacco_70, state == 'California' ) %>% + dplyr::select( year, retprice ) %>% + knitr::kable() +tobacco_70 %>% + group_by( state ) %>% + summarise( mn = mean( retprice ) ) + +covariate_balance_table( syn, pre_period = c( 1975, 1980, 1988 ) ) + +# Now inference +syn_rstat <- summary( syn, inf_type = 'permutation_rstat' ) + + + + + + +## ----fig.width=4.5, fig.height=4--------------------------------------------------- +p <- plot(syn_rstat, plot_type = 'outcomes raw average') + + ggtitle("Replication of Abadie et al. (2010), Figure 2") +p + + +## ---------------------------------------------------------------------------------- + + +## ----fig.width=4.5, fig.height=3.25------------------------------------------------ +p <- plot(syn_rstat, plot_type = 'estimate only') + + ggtitle("Replication of Abadie et al. (2010), Figure 3") + + scale_y_continuous(limits = c(-30, 30)) + +p + + +## ----fig.width=4.5, fig.height=4--------------------------------------------------- +plot(syn_rstat, plot_type = 'placebo' ) + + ggtitle("Replication of Abadie et al. (2010), Figure 4") + + +## ---------------------------------------------------------------------------------- +placebo_distribution( syn_rstat ) + + +## ---------------------------------------------------------------------------------- +donor_table(syn_rstat) %>% + arrange( -abs(weight) ) + + +## ---------------------------------------------------------------------------------- +syn_classic <- augsynth(form = cigsalepcap ~ treated | retprice + xxincome + K1_r_15_24, + unit = state, + time = year, + data = tobacco_70, + progfunc = "none" ) +donor_table(syn_classic) %>% + filter( weight!= 0 ) %>% + arrange( -abs(weight) ) + + +## ----Figures 6 and 7, echo=TRUE, eval=TRUE, fig.show="hold", fig.align='default', fig.height=4.5, fig.width=4.6---- +update_augsynth(syn) %>% # drops units with >20x treated RMSPE by default + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo' ) + + ggtitle("Replication of Abadie et al. (2010), Figure 5") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, + label = "Removes donors with 20x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) +update_augsynth(syn, drop = 5) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + + ggtitle("Replication of Abadie et al. (2010), Figure 6") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, + label = "Removes donors with 5x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) +update_augsynth(syn, drop = 2) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + + ggtitle("Replication of Abadie et al. (2010), Figure 7") + ylim(-51, 91) + + annotate('text', y = -48, x = 1970, + label = "Removes donors with 2x California's pre-treatment RMSPE", + hjust = 0, color = 'darkseagreen', size = 3) + + +## ----permutation without states, echo=TRUE, eval=TRUE, fig.show="hold", fig.align='default', fig.height=4.5, fig.width=4.6---- +drop_states <- c("Iowa", "Arizona", "Alabama", "Illinois", "Indiana", "Idaho", "Connecticut", + "New Mexico", "Texas", "Utah", "North Dakota", "South Dakota", "Vermont", + "Wisconsin", "West Virginia", "Wyoming", "Tennessee", "Pennsylvania") + +update_augsynth(syn, drop = drop_states) %>% + summary( inf_type = 'permutation' ) %>% + plot(plot_type = 'placebo') + + +rm( p )