From 23a3a80ba04c532e6fc6e5b41e0c7faa3b26a1b2 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Tue, 16 Dec 2025 10:59:11 -0600 Subject: [PATCH] Improvements for `sparse_model_matrix` --- ONGOING_ISSUES.md | 2 - R/r-language.R | 2 +- R/sparse_model_matrix.R | 172 +++++++++++++++++++++++++++------------- tests/fixest_tests.R | 9 ++- 4 files changed, 124 insertions(+), 61 deletions(-) diff --git a/ONGOING_ISSUES.md b/ONGOING_ISSUES.md index 9197bbcc..44474fdb 100644 --- a/ONGOING_ISSUES.md +++ b/ONGOING_ISSUES.md @@ -5,8 +5,6 @@ So far `fixest` objects are compatible with variances computed with the package `sandwich` with the following exceptions: - - `vcovHC` with `type = "HC4"`: The compatibility is only partial, i.e. it works for models without fixed-effects but breaks with fixed-effects estimations. This is because the hatvalues, needed for this kind of VCOV correction, are not defined when fixed-effects are used (actually I could make it compatible but the model would need to be reestimated with dummy variables, and this hardly squares with the idea of efficient fixed-effects estimation). - - `vcovBS`, i.e. bootstraped standard-errors: Absolutely not compatible so far, but it will become compatible later (hopefully). diff --git a/R/r-language.R b/R/r-language.R index ebd5b9c0..c1111949 100644 --- a/R/r-language.R +++ b/R/r-language.R @@ -187,7 +187,7 @@ fml_split_internal = function(fml, split.lhs = FALSE){ fml_split = function(fml, i, split.lhs = FALSE, text = FALSE, raw = FALSE){ - # I had to create that fonction to cope with the following formula: + # I had to create that function to cope with the following formula: # # y ~ x1 | fe1 | u ~ z # diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R index cd4d9e63..5a6123e4 100644 --- a/R/sparse_model_matrix.R +++ b/R/sparse_model_matrix.R @@ -111,18 +111,23 @@ sparse_model_matrix = function(object, data, type = "rhs", sample = "estimation" # Allows passage of formula to sparse_model_matrix. A bit inefficient, but it works. isFormula = FALSE - split_fml = NULL if (inherits(object, "formula")) { - split_fml = fml_split_internal(object) + fml_parts = get_fml_parts(object, parts_as_formula = TRUE) if (isTRUE(collin.rm)) { message("Formula passed to sparse_model_matrix with collin.rm == TRUE. Estimating feols with formula.") object = feols(object, data = data) - } else if (length(split_fml) == 3) { + } else if (!is.null(fml_parts$inst_fml)) { message("Formula passed to sparse_model_matrix with an iv formula. Estimating feols with formula.") object = feols(object, data = data) } else { isFormula = TRUE } + } + + # na.rm = FALSE doesn't work with type = "fixef" (which FE col gets NA?) + if (("fixef" %in% type & !na.rm)) { + # na.rm = TRUE + message("na.rm = FALSE doesn't work with type = 'fixef'. It has been set to TRUE.") } # Panel setup @@ -132,45 +137,26 @@ sparse_model_matrix = function(object, data, type = "rhs", sample = "estimation" # The formulas if (isFormula) { - fml_linear = split_fml[[1]] - - fml_0 = attr(stats::terms(fml_linear), "intercept") == 0 - fake_intercept = !is.null(split_fml[[2]]) | fml_0 + fml_parts = get_fml_parts(object, parts_as_formula = TRUE) } else { - fml_linear = formula(object, type = "linear") - - # we kick out the intercept if there is presence of fixed-effects - fml_0 = attr(stats::terms(fml_linear), "intercept") == 0 - fake_intercept = !is.null(object$fixef_vars) && !(!is.null(object$slope_flag) && all(object$slope_flag < 0)) - fake_intercept = fake_intercept | fml_0 - } - + fml_parts = get_fml_parts(formula(object), parts_as_formula = TRUE) + } + fml_0 = attr(stats::terms(fml_parts$exo_fml), "intercept") == 0 + # we kick out the intercept if there is presence of fixed-effects + fake_intercept = !is.null(fml_parts$fe_fml) | fml_0 res = list() if ("lhs" %in% type) { - lhs_text = deparse_long(fml_linear[[2]]) - lhs = eval(fml_linear[[2]], data) + lhs = eval(fml_parts$y_fml[[2]], data) lhs = Matrix::Matrix(lhs, sparse = TRUE, ncol = 1) - - colnames(lhs) = lhs_text + colnames(lhs) = deparse_long(fml_parts$y_fml[[2]]) res[["lhs"]] = lhs } if ("rhs" %in% type && !isTRUE(object$onlyFixef)) { - - fml = fml_linear - - if (isFormula && (length(split_fml) == 3)) { - fml_iv = split_fml[[3]] - fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) - } else if (isTRUE(object$iv)) { - fml_iv = object$fml_all$iv - fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) - } - + fml = .xpd(~ ..endo + ..rhs, ..rhs = fml_parts$exo_fml, ..endo = fml_parts$endo_fml) vars = attr(stats::terms(fml), "term.labels") - linear.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "rhs", add_intercept = !fake_intercept) @@ -180,18 +166,10 @@ sparse_model_matrix = function(object, data, type = "rhs", sample = "estimation" if ("fixef" %in% type) { - if (isFormula && (length(split_fml) < 2)) { - mat_FE = NULL - } else if (!isFormula & length(object$fixef_id) == 0) { + if (is.null(fml_parts$fe_fml)) { mat_FE = NULL } else { - - if (isFormula) { - fixef_fml = .xpd(~ ..fe, ..fe = split_fml[[2]]) - } else { - fixef_fml = object$fml_all$fixef - } - + fixef_fml = .xpd(~ ..fe, ..fe = fml_parts$fe_fml) fixef_terms_full = fixef_terms(fixef_fml) fe_vars = fixef_terms_full$fe_vars slope_var_list = fixef_terms_full$slope_vars_list @@ -334,7 +312,7 @@ sparse_model_matrix = function(object, data, type = "rhs", sample = "estimation" # IV # if ("iv.endo" %in% type) { - fml = object$iv_endo_fml + fml = .xpd(~ ..endo, ..endo = fml_parts$endo_fml) vars = attr(stats::terms(fml), "term.labels") endo.mat = vars_to_sparse_mat(vars = vars, data = data, object, collin.rm = collin.rm, type = "iv.endo") @@ -343,7 +321,7 @@ sparse_model_matrix = function(object, data, type = "rhs", sample = "estimation" } if ("iv.inst" %in% type) { - fml = object$fml_all$iv + fml = .xpd(~ ..inst, ..inst = fml_parts$inst_fml) vars = attr(stats::terms(fml), "term.labels") inst.mat = vars_to_sparse_mat(vars = vars, data = data, object, collin.rm = collin.rm, type = "iv.inst") @@ -352,8 +330,7 @@ sparse_model_matrix = function(object, data, type = "rhs", sample = "estimation" } if ("iv.exo" %in% type) { - - fml = object$fml_all$linear + fml = .xpd(~ ..exo, ..exo = fml_parts$exo_fml) vars = attr(stats::terms(fml), "term.labels") exo.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.exo", @@ -363,23 +340,18 @@ sparse_model_matrix = function(object, data, type = "rhs", sample = "estimation" } if ("iv.rhs1" %in% type) { - fml = object$fml - if (object$iv_stage == 2) { - fml_iv = object$fml_all$iv - fml = .xpd(..lhs ~ ..inst + ..rhs, ..lhs = fml[[2]], ..inst = fml_iv[[3]], ..rhs = fml[[3]]) - } - + fml = .xpd(~ ..inst + ..rhs, ..rhs = fml_parts$exo_fml, ..inst = fml_parts$inst_fml) vars = attr(stats::terms(fml), "term.labels") iv_rhs1 = vars_to_sparse_mat(vars = vars, data = data, object = object, - collin.rm = collin.rm, type = "iv.rhs1", - add_intercept = !fake_intercept) + collin.rm = collin.rm, type = "iv.rhs1", + add_intercept = !fake_intercept) res[["iv.rhs1"]] = iv_rhs1 } if ("iv.rhs2" %in% type) { # Second stage - if (!object$iv_stage == 2) { + if (!isFormula && !object$iv_stage == 2) { stop("In model.matrix, the type 'iv.rhs2' is only valid for second stage IV models. This estimation is the first stage.") } @@ -394,7 +366,7 @@ sparse_model_matrix = function(object, data, type = "rhs", sample = "estimation" # II) we create the variables fml = object$fml - fml = .xpd(..lhs ~ ..fit + ..rhs, ..lhs = fml[[2]], ..fit = fit_vars, ..rhs = fml[[3]]) + fml = .xpd(~ ..fit + ..rhs, ..fit = fit_vars, ..rhs = fml_parts$exo_fml) vars = attr(stats::terms(fml), "term.labels") iv_rhs2 = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.rhs2", @@ -700,7 +672,7 @@ vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, exo = lapply(object$iv_first_stage, function(x) names(stats::coef(x))) exo = unique(unlist(exo, use.names = FALSE)) - + inst = setdiff(exo, names(object$coefficients)) } @@ -733,7 +705,6 @@ vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, coef_names = names(object$coefficients) add_intercept = "(Intercept)" %in% coef_names } - keep = colnames(mat) %in% coef_names if (length(keep) == 0) { mat = NULL @@ -751,3 +722,90 @@ vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, return(mat) } + +get_fml_parts <- function(formula, parts_as_formula = FALSE) { + has_lhs <- !is_rhs_only(formula) + fml_split_tilde <- sm_fml_breaker(formula, "~") + + res <- list( + y_fml = NULL, + exo_fml = NULL, + fe_fml = NULL, + endo_fml = NULL, + inst_fml = NULL + ) + + # LHS + if (has_lhs) { + res$y_fml <- fml_split_tilde[[length(fml_split_tilde)]] + # Drop y and treat everything as RHS only formula + fml_split_tilde <- fml_split_tilde[-length(fml_split_tilde)] + } + + # OLS + if (length(fml_split_tilde) == 1) { + fml_split <- sm_fml_breaker(fml_split_tilde[[1]], "|") + if (length(fml_split) == 2) { + res$exo_fml <- fml_split[[2]] + res$fe_fml <- fml_split[[1]] + } else { + res$exo_fml <- fml_split[[1]] + } + } + + # IV + if (length(fml_split_tilde) == 2) { + res$inst_fml <- fml_split_tilde[[1]] + + # This works b/c we know there is an IV + fml_W_T_split <- sm_fml_breaker(fml_split_tilde[[2]], "|") + if (length(fml_W_T_split) == 3) { + res$endo_fml <- fml_W_T_split[[1]] + res$fe_fml <- fml_W_T_split[[2]] + res$exo_fml <- fml_W_T_split[[3]] + } else { + res$endo_fml <- fml_W_T_split[[1]] + res$exo_fml <- fml_W_T_split[[2]] + } + } + + if (parts_as_formula) { + res = lapply(res, function(rhs) { + if (is.null(rhs)) return(NULL) + fml <- ~. + fml[[2]] <- rhs + return(fml) + }) + } + return(res) +} + +is_rhs_only <- function(fml) { + # e.g. fml = ~ x + if (length(fml) == 2) { + return(TRUE) + } + # e.g. fml = ~ x | t ~ z + if (length(fml[[2]]) == 2) { + return(TRUE) + } + return(FALSE) +} + +# fml_breaker but works with RHS only +sm_fml_breaker <- function(fml, op) { + res <- list() + k <- 1 + while (is_operator(fml, op) & length(fml) == 3) { + res[[k]] <- fml[[3]] + k <- k + 1 + fml <- fml[[2]] + } + + if (length(fml) == 2) { + res[[k]] <- fml[[2]] + } else { + res[[k]] <- fml + } + res +} diff --git a/tests/fixest_tests.R b/tests/fixest_tests.R index cd46b457..d3d710fa 100644 --- a/tests/fixest_tests.R +++ b/tests/fixest_tests.R @@ -2579,6 +2579,8 @@ test(names(m_lhs_rhs_fixef), c("y1", "fit_x2", "x1", "species")) #### sparse_model_matrix #### #### +chunk("Sparse model matrix") + # unit tests: i(..., sparse = TRUE) x <- c(1, 1, 3, 1, 3) sp1 <- i(x, sparse = TRUE) @@ -2712,6 +2714,7 @@ test(nrow(sm_lag), nobs(res_lag)) # Interacted fixef res = feols(y1 ~ x1 + x2 + x3 | species^fe2, base) sm_ife = sparse_model_matrix(res, data = base, type = "fixef", collin.rm = FALSE) +test(ncol(sm_ife), 45) # fixef res = feols(y1 ~ x1 + x2 + x3 | species + fe2, base) @@ -2966,6 +2969,7 @@ names(base) = c("y", "x1", "x_endo_1", "x_inst_1", "fe") set.seed(2) base$x_inst_2 = 0.2 * base$y + 0.2 * base$x_endo_1 + rnorm(150, sd = 0.5) base$x_endo_2 = 0.2 * base$y - 0.2 * base$x_inst_1 + rnorm(150, sd = 0.5) +base$w = sample(c(0.5, 0.25), nrow(base), replace = TRUE) # Checking a basic estimation est_iv = feols(y ~ x1 | x_endo_1 + x_endo_2 ~ x_inst_1 + x_inst_2, base) @@ -2973,9 +2977,12 @@ est_iv = feols(y ~ x1 | x_endo_1 + x_endo_2 ~ x_inst_1 + x_inst_2, base) fitstat(est_iv, ~ f + ivf + ivf2 + wald + ivwald + ivwald2 + wh + sargan + rmse + g + n + ll + sq.cor + r2) est_fe = feols(y ~ x1 | fe, base) - fitstat(est_fe, ~ wf) +# https://github.com/lrberge/fixest/issues/584 +est_weighted = feols(y ~ x1 | fe, base, weights = ~w) +fitstat(est_weighted, ~ rmse) + # fitstat works with `split` and `lean` (https://github.com/lrberge/fixest/issues/566) est_split = feols(y ~ x1, base, fsplit = ~fe) est_split_lean = feols(y ~ x1, base, fsplit = ~fe, lean = TRUE)