Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(crosswalk_data)
export(get_available_crosswalks)
export(get_crosswalk)
export(list_nhgis_crosswalks)
33 changes: 23 additions & 10 deletions R/crosswalk_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,11 @@
#' about join quality, including the number of data rows not matching the crosswalk
#' and vice versa. For state-nested geographies (tract, county, block group, etc.),
#' also reports state-level concentration of unmatched rows. Set to FALSE to
#' suppress these messages.
#' suppress these messages. Automatically suppressed when `silent = TRUE`.
#' @param silent Logical. If `TRUE`, suppresses all informational messages and
#' warnings, including join quality diagnostics regardless of `show_join_quality`.
#' Defaults to `getOption("crosswalk.silent", FALSE)`. Set
#' `options(crosswalk.silent = TRUE)` to silence all calls by default.
#'
#' @return If `return_intermediate = FALSE` (default), a tibble with data summarized
#' to the final target geography.
Expand Down Expand Up @@ -162,7 +166,14 @@ crosswalk_data <- function(
count_columns = NULL,
non_count_columns = NULL,
return_intermediate = FALSE,
show_join_quality = TRUE) {
show_join_quality = TRUE,
silent = getOption("crosswalk.silent", FALSE)) {

old_opts <- options(crosswalk.silent = silent)
on.exit(options(old_opts), add = TRUE)

# When silent, suppress join quality regardless of show_join_quality
if (silent) show_join_quality <- FALSE

# Determine if we need to fetch the crosswalk
crosswalk_provided <- !is.null(crosswalk)
Expand All @@ -175,14 +186,15 @@ crosswalk_data <- function(
}

if (crosswalk_provided && geography_provided) {
warning(
cw_warning(
"Both 'crosswalk' and geography parameters provided. ",
"Using the provided 'crosswalk' and ignoring geography parameters.")
"Using the provided 'crosswalk' and ignoring geography parameters.",
call. = FALSE)
}

# Fetch crosswalk if not provided
if (!crosswalk_provided) {
message("Fetching crosswalk from ", source_geography, " to ", target_geography, "...")
cw_message("Fetching crosswalk from ", source_geography, " to ", target_geography, "...")
crosswalk <- get_crosswalk(
source_geography = source_geography,
target_geography = target_geography,
Expand Down Expand Up @@ -240,7 +252,7 @@ crosswalk_data <- function(
step_name <- names(crosswalk_list)[i]
step_crosswalk <- crosswalk_list[[i]]

message(stringr::str_c("Applying crosswalk step ", i, " of ", n_steps, "..."))
cw_message(stringr::str_c("Applying crosswalk step ", i, " of ", n_steps, "..."))

# Apply single crosswalk step
current_data <- apply_single_crosswalk(
Expand Down Expand Up @@ -460,7 +472,7 @@ format_join_quality_message <- function(join_quality, step_number, total_steps)
format(join_quality$n_data_unmatched, big.mark = ","),
" of ",
format(join_quality$n_data_total, big.mark = ","),
" data rows (",
" unique data GEOIDs (",
sprintf("%.1f%%", join_quality$pct_data_unmatched),
") did not match the crosswalk."
)
Expand Down Expand Up @@ -614,7 +626,7 @@ report_join_quality <- function(data, crosswalk, geoid_column, step_number = 1,
# Print messages if there are issues
messages <- format_join_quality_message(join_quality, step_number, total_steps)
if (length(messages) > 0) {
purrr::walk(messages, message)
purrr::walk(messages, cw_message)
}

return(join_quality)
Expand Down Expand Up @@ -648,9 +660,10 @@ apply_single_crosswalk <- function(

# Check if crosswalk is empty
if (nrow(crosswalk) == 0) {
warning(
cw_warning(
"Crosswalk is empty. If source geography is nested within target geography, ",
"consider aggregating your data directly instead.")
"consider aggregating your data directly instead.",
call. = FALSE)
return(tibble::tibble())
}

Expand Down
73 changes: 69 additions & 4 deletions R/get_crosswalk.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@
#' @param cache Directory path. Where to download the crosswalk to. If NULL (default),
#' crosswalk is returned but not saved to disk. Individual component crosswalks
#' are cached separately when provided.
#' @param silent Logical. If `TRUE`, suppresses all informational messages and
#' warnings. Defaults to `getOption("crosswalk.silent", FALSE)`. Set
#' `options(crosswalk.silent = TRUE)` to silence all calls by default.
#'
#' @return A list with a consistent structure:
#' \describe{
Expand Down Expand Up @@ -137,7 +140,11 @@ get_crosswalk <- function(
source_year = NULL,
target_year = NULL,
cache = NULL,
weight = "population") {
weight = "population",
silent = getOption("crosswalk.silent", FALSE)) {

old_opts <- options(crosswalk.silent = silent)
on.exit(options(old_opts), add = TRUE)

# Check for nested geographies (no crosswalk needed)
# Determine if years match (both NULL, or both non-NULL and equal)
Expand All @@ -150,10 +157,10 @@ get_crosswalk <- function(
(source_geography == "county" && target_geography == "core_based_statistical_area")

if (is_nested && years_match) {
warning(
cw_warning(
"The source geography is nested within the target geography and an empty result
will be returned. No crosswalk is needed to translate data between nested geographies;
simply aggregate your data to the desired geography.")
simply aggregate your data to the desired geography.", call. = FALSE)

# Return empty list structure for consistency
return(list(
Expand Down Expand Up @@ -418,6 +425,64 @@ and counties. The provided geography '", geography, "' is not supported.")}
return(result)
}

#' List All Available Crosswalk Combinations
#'
#' Returns a tibble of all source/target geography and year combinations
#' supported by `get_crosswalk()`.
#'
#' @return A tibble with columns: `source_geography`, `target_geography`,
#' `source_year`, `target_year`.
#' @export
get_available_crosswalks <- function() {

# 1. NHGIS: reuse list_nhgis_crosswalks(), select and coerce years to integer
nhgis <- list_nhgis_crosswalks() |>
dplyr::select(source_geography, target_geography, source_year, target_year) |>
dplyr::mutate(
source_year = as.integer(source_year),
target_year = as.integer(target_year))

# 2. Geocorr 2022: all pairwise combinations of 9 canonical geographies
geocorr_2022_geographies <- c(
"block", "block_group", "tract", "county", "place",
"zcta", "puma22", "cd118", "cd119")

geocorr_2022 <- tidyr::crossing(
source_geography = geocorr_2022_geographies,
target_geography = geocorr_2022_geographies) |>
dplyr::filter(source_geography != target_geography) |>
dplyr::mutate(
source_year = 2022L,
target_year = 2022L)

# 3. Geocorr 2018: all pairwise combinations of 9 canonical geographies
geocorr_2018_geographies <- c(
"block", "block_group", "tract", "county", "place",
"zcta", "puma12", "cd115", "cd116")

geocorr_2018 <- tidyr::crossing(
source_geography = geocorr_2018_geographies,
target_geography = geocorr_2018_geographies) |>
dplyr::filter(source_geography != target_geography) |>
dplyr::mutate(
source_year = 2018L,
target_year = 2018L)

# 4. CTData: 7 manually specified combinations (2020<->2022)
ctdata <- tibble::tibble(
source_geography = c("block", "block_group", "tract", "county",
"block", "block_group", "tract"),
target_geography = c("block", "block_group", "tract", "county",
"block", "block_group", "tract"),
source_year = c(rep(2020L, 4), rep(2022L, 3)),
target_year = c(rep(2022L, 4), rep(2020L, 3)))

# 5. Combine, deduplicate, and sort
dplyr::bind_rows(nhgis, geocorr_2022, geocorr_2018, ctdata) |>
dplyr::distinct() |>
dplyr::arrange(source_geography, target_geography, source_year, target_year)
}

utils::globalVariables(c(
"allocation_factor_source_to_target", "geoid", "label",
"allocation_factor_source_to_target", "geoid", "label",
"n_unmatched", "pct_of_unmatched", "state_abbr"))
6 changes: 3 additions & 3 deletions R/get_crosswalk_chain.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ get_crosswalk_chain <- function(
message = format_chain_plan_message(plan))

# Print the plan message
message(result$message)
cw_message(result$message)

# Handle case where no crosswalk is needed
if (nrow(plan$steps) > 0 && plan$steps$crosswalk_source[1] == "none") {
message("Returning empty crosswalk list since no transformation is needed.")
cw_message("Returning empty crosswalk list since no transformation is needed.")
return(result)
}

Expand All @@ -61,7 +61,7 @@ get_crosswalk_chain <- function(
step <- plan$steps[i, ]
step_name <- stringr::str_c("step_", i)

message(stringr::str_c("\nFetching ", step_name, ": ", step$description))
cw_message(stringr::str_c("\nFetching ", step_name, ": ", step$description))

crosswalk_i <- get_crosswalk_single(
source_geography = step$source_geography,
Expand Down
20 changes: 10 additions & 10 deletions R/get_ctdata_crosswalk.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ Only block, block_group, and tract geographies support the 2022 -> 2020 directio

# Check cache for full national crosswalk
if (file.exists(csv_path) & !is.null(cache)) {
message(stringr::str_c("Reading national ", source_year, "-", target_year, " crosswalk from cache."))
cw_message(stringr::str_c("Reading national ", source_year, "-", target_year, " crosswalk from cache."))
result <- readr::read_csv(
csv_path,
col_types = readr::cols(.default = readr::col_character(),
Expand Down Expand Up @@ -150,7 +150,7 @@ Only block, block_group, and tract geographies support the 2022 -> 2020 directio
return(result)
}

message("Constructing nationally comprehensive 2020-2022 crosswalk...")
cw_message("Constructing nationally comprehensive 2020-2022 crosswalk...")

# ===========================================================================
# STEP 1: Get all 2020 GEOIDs from NHGIS crosswalk (non-CT) or tidycensus (county)
Expand All @@ -167,7 +167,7 @@ Only block, block_group, and tract geographies support the 2022 -> 2020 directio

if (geography_standardized == "county") {
# For county, use tidycensus since NHGIS doesn't have county -> county crosswalks
message("Fetching all 2020 county GEOIDs via tidycensus...")
cw_message("Fetching all 2020 county GEOIDs via tidycensus...")

all_2020_geoids <- suppressMessages({
tidycensus::get_acs(
Expand All @@ -182,7 +182,7 @@ Only block, block_group, and tract geographies support the 2022 -> 2020 directio

} else {
# For block, block_group, tract: use NHGIS 2010 -> 2020 crosswalk
message(stringr::str_c(
cw_message(stringr::str_c(
"Fetching NHGIS ", nhgis_source_geog, " 2010 -> 2020 crosswalk to obtain all 2020 GEOIDs..."))

nhgis_crosswalk <- get_nhgis_crosswalk(
Expand All @@ -200,7 +200,7 @@ Only block, block_group, and tract geographies support the 2022 -> 2020 directio
dplyr::pull(target_geoid)
}

message(stringr::str_c(
cw_message(stringr::str_c(
"Found ", format(length(all_2020_geoids), big.mark = ","),
" non-CT 2020 ", geography_standardized, " GEOIDs."))

Expand All @@ -223,7 +223,7 @@ Only block, block_group, and tract geographies support the 2022 -> 2020 directio
# STEP 3: Get CT-specific crosswalk from CT Data Collaborative
# ===========================================================================

message("Fetching Connecticut crosswalk from CT Data Collaborative...")
cw_message("Fetching Connecticut crosswalk from CT Data Collaborative...")

if (geography_standardized == "block") {
raw_df <- readr::read_csv(ctdata_urls$block, show_col_types = FALSE)
Expand Down Expand Up @@ -326,7 +326,7 @@ Only block, block_group, and tract geographies support the 2022 -> 2020 directio
result <- dplyr::bind_rows(ct_crosswalk, non_ct_crosswalk) |>
dplyr::arrange(source_geoid)

message(stringr::str_c(
cw_message(stringr::str_c(
"National 2020-2022 crosswalk constructed: ",
format(nrow(ct_crosswalk), big.mark = ","), " CT records + ",
format(nrow(non_ct_crosswalk), big.mark = ","), " non-CT records = ",
Expand All @@ -337,7 +337,7 @@ Only block, block_group, and tract geographies support the 2022 -> 2020 directio
# ===========================================================================

if (is_reverse) {
message("Reversing crosswalk direction to 2022 -> 2020...")
cw_message("Reversing crosswalk direction to 2022 -> 2020...")

# For identity crosswalks (block, block_group, tract), simply swap columns
# Note: County is not supported for reverse direction (checked earlier)
Expand All @@ -363,10 +363,10 @@ Only block, block_group, and tract geographies support the 2022 -> 2020 directio
dir.create(cache_path, recursive = TRUE)
}
readr::write_csv(result, csv_path)
message(stringr::str_c("Cached to: ", csv_path))
cw_message(stringr::str_c("Cached to: ", csv_path))
}

message(stringr::str_c(
cw_message(stringr::str_c(
"National ", source_year, "-", target_year, " crosswalk constructed:
- Connecticut: CT Data Collaborative (https://github.com/CT-Data-Collaborative)
- Other states: Identity mapping derived from NHGIS 2010-2020 crosswalk"))
Expand Down
4 changes: 2 additions & 2 deletions R/get_geocorr_crosswalk.R
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ get_geocorr_crosswalk <- function(
if (file.exists(outpath) & !is.null(cache)) {
result = readr::read_csv(outpath, show_col_types = FALSE)

message("Reading file from cache.")
cw_message("Reading file from cache.")

# Attach metadata to cached result
attr(result, "crosswalk_metadata") <- list(
Expand All @@ -188,7 +188,7 @@ get_geocorr_crosswalk <- function(
base_url <- "https://mcdc.missouri.edu/cgi-bin/broker"

if (is.null(weight)) {
message("Setting the default crosswalk weighting variable to: population.")
cw_message("Setting the default crosswalk weighting variable to: population.")
weight = "population"
}

Expand Down
Loading