diff --git a/.Rbuildignore b/.Rbuildignore index 046520c..18a2201 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,10 +1,9 @@ ^.*\.Rproj$ ^\.Rproj\.user$ -new_flow_control.R test.R build.R .gitignore - ui_flow_control.R +ui_flow_control*.R heaping_breakpoints.R R/test2.R R/test_data_NOT_part_of_final_package.R diff --git a/DESCRIPTION b/DESCRIPTION index c8402a9..772e14b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ODAPbackend Type: Package Title: Standardize, Evaluate, and Adjust Demographic Data -Version: 00.03.008 -Date: 2025-10-15 +Version: 00.03.009 +Date: 2025-10-16 Authors@R: c( person("Tim", "Riffe", role = c("aut", "cre"), email = "tim.riffe@gmail.com", comment = c(ORCID = "0000-0002-2673-4622")), @@ -12,7 +12,7 @@ License: file LICENSE LazyLoad: yes LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.1), ggplot2 @@ -23,6 +23,10 @@ Suggests: Imports: DemoTools (>= 01.13.85), stringr, + codetools, + wpp2024, + forcats, + ungroup, dplyr, scales, readxl, diff --git a/NAMESPACE b/NAMESPACE index ed4a8f8..5185fe7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,23 +3,34 @@ export(capture_args) export(check_coherent) export(check_data) +export(check_data_generic) +export(check_data_graduate_time) +export(check_data_heaping) +export(check_data_lifetable) +export(check_data_opag) export(check_groupid) +export(check_has_numeric) export(check_heaping_general) export(check_heaping_user) export(check_interpolate) export(check_lower) export(check_missing_cols) +export(check_missing_cols_generic) +export(check_missing_cols_opag) export(check_nas) +export(check_nlx_if_present) export(check_numeric) +export(check_numeric_opag) export(check_redundant) export(check_rows) export(check_sequential) export(check_sex) -export(check_smooth) +export(check_sex_opag) export(create_groupid) export(extension_check) export(graduate_auto) export(graduate_auto_5) +export(graduate_time) export(interp_linear) 
export(interpolate) export(lt_check) @@ -28,7 +39,6 @@ export(lt_flexible_chunk) export(lt_plot) export(lt_summary) export(lt_summary_chunk) -export(movepop) export(odap_opag) export(plot_compare_rates) export(plot_histogram) @@ -40,7 +50,6 @@ export(plot_smooth_compare) export(pyramid) export(read_data) export(smooth1d) -export(smooth1d_chunk) export(smooth_flexible) export(smooth_flexible_chunk) importFrom(DemoTools,OPAG) @@ -55,6 +64,7 @@ importFrom(DemoTools,graduate) importFrom(DemoTools,graduate_mono) importFrom(DemoTools,graduate_uniform) importFrom(DemoTools,groupAges) +importFrom(DemoTools,int2age) importFrom(DemoTools,is_abridged) importFrom(DemoTools,is_age_coherent) importFrom(DemoTools,is_age_redundant) @@ -81,6 +91,7 @@ importFrom(cowplot,plot_grid) importFrom(dplyr,.data) importFrom(dplyr,across) importFrom(dplyr,arrange) +importFrom(dplyr,as_tibble) importFrom(dplyr,bind_rows) importFrom(dplyr,case_match) importFrom(dplyr,case_when) @@ -101,6 +112,7 @@ importFrom(dplyr,last) importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,n) +importFrom(dplyr,pick) importFrom(dplyr,pull) importFrom(dplyr,reframe) importFrom(dplyr,rename) @@ -108,6 +120,7 @@ importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarize) importFrom(dplyr,ungroup) +importFrom(forcats,as_factor) importFrom(furrr,future_map_dfr) importFrom(future,multisession) importFrom(future,plan) @@ -126,9 +139,11 @@ importFrom(ggplot2,ggplot) importFrom(ggplot2,ggtitle) importFrom(ggplot2,guide_legend) importFrom(ggplot2,labs) +importFrom(ggplot2,scale_color_brewer) importFrom(ggplot2,scale_color_discrete) importFrom(ggplot2,scale_color_manual) importFrom(ggplot2,scale_fill_brewer) +importFrom(ggplot2,scale_linetype_manual) importFrom(ggplot2,scale_x_continuous) importFrom(ggplot2,scale_x_log10) importFrom(ggplot2,scale_y_continuous) @@ -140,6 +155,7 @@ importFrom(ggplot2,theme_minimal) importFrom(grDevices,gray) importFrom(magrittr,"%>%") importFrom(mgcv,gam) 
+importFrom(mgcv,s) importFrom(parallelly,availableCores) importFrom(purrr,map) importFrom(purrr,map2) @@ -159,10 +175,12 @@ importFrom(scales,comma) importFrom(scales,label_log) importFrom(scales,pretty_breaks) importFrom(signal,interp1) +importFrom(stats,gaussian) importFrom(stats,loess) importFrom(stats,loess.control) importFrom(stats,na.omit) importFrom(stats,predict) +importFrom(stats,qlogis) importFrom(stats,smooth.spline) importFrom(stats,splinefun) importFrom(stats,supsmu) @@ -171,6 +189,7 @@ importFrom(stringr,str_detect) importFrom(stringr,str_flatten) importFrom(tibble,as_tibble) importFrom(tibble,lst) +importFrom(tibble,rownames_to_column) importFrom(tibble,tibble) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) @@ -180,6 +199,7 @@ importFrom(tidyselect,all_of) importFrom(tidyselect,any_of) importFrom(tidyselect,everything) importFrom(tidyselect,matches) +importFrom(ungroup,pclm) importFrom(utils,data) importFrom(utils,globalVariables) importFrom(utils,installed.packages) diff --git a/R/basepop.R b/R/basepop.R new file mode 100644 index 0000000..4f241a6 --- /dev/null +++ b/R/basepop.R @@ -0,0 +1,39 @@ + + +# TODO: +# make wrapper following the same input-output format as smooth_flexible(), +# modified to the needs of basepop. + +# We either have 5-year-in - 5-year out, single-year-in - single-year-out, +# or abridged-in - abridged out. + +# for the case of 5-year-in 5-year out, one can either graduate to single, +# run basepop_single(), then group to 5-year, or one can graduate to single (pclm), +# abridge ages, then run basepop_five(). Let's do this second one; example seen +# in DemoTools documentation of basepop_five() + +# age regrouping should be handled elsewhere. +# TR recommends first graduating and then running basepop, +# not the other way around + +# Two main DemoTools functions to use are basepop_five() and basepop_single(). 
+ +#remotes::install_github("timriffe/DemoTools") + +# The trick here is in showing how a user might actually practically give their +# own inputs rather than relying on the default wpp. + + + + + + + + + + + + + + + diff --git a/R/basepop_single_sketch.R b/R/basepop_single_sketch.R deleted file mode 100644 index b45ea80..0000000 --- a/R/basepop_single_sketch.R +++ /dev/null @@ -1,140 +0,0 @@ - -# package building ignores this script but load_all() does not, -# so I wrap it. -do_this <- FALSE -if (do_this){ -library(tidyverse) -library(wpp2024) -library(DemoTools) -# ------ -data(popAge1dt) -data(mx1dt) -# Jan 1 2000 female pop; -# note: 1999 means dec 31, 1999, so we treat as jan 1, 2000. -popF <- - popAge1dt |> - filter(country_code == 900, - year == 1999) |> - select(year, age, pop = popF) |> - mutate(year = year + 1, - cohort = year - age - 1) - -mxF <- mx1dt |> - filter(country_code == 900, - between(year, 1990, 1999)) |> - as_tibble() |> - select(year, age, mx = mxF) |> - # could try to warp to PC shape here, - # but uncertain infants. Maybe using - # an a0 assumption it'd be doable. 
- - # need cohorts to structure reverse survival - mutate(cohort = year - age - 1, - age_int = 1, - ax = if_else(age == 0, - lt_rule_1a0_ak(M0 = mx, Sex = "f"), - .5), - qx = lt_id_ma_q(nMx = mx, nax = ax, AgeInt = age_int)) |> - arrange(cohort, age) |> - group_by(cohort) |> - mutate(lx = lt_id_q_l(nqx = qx, radix = 1), - dx = lx * qx, - Lx = lx - dx * (1-ax), - SxRev = Lx / lead(Lx, default = 1)) |> - ungroup() |> - arrange(year, age) |> - group_by(year) |> - mutate( - lxp = lt_id_q_l(nqx = qx, radix = 1), - dxp = lxp * qx, - Lxp = lxp - dxp * (1-ax), - SxRev = if_else(year == max(year), - Lxp / lead(Lxp), - SxRev)) |> - ungroup() |> - filter(age < 100) |> - select(cohort, year, age, SxRev) |> - arrange(cohort,-age) |> - group_by(cohort) |> - # inflation for reverse-surviving - mutate(inflation_factor = cumprod(SxRev)) - -# get exposure via reverse survival -expF <- - popF |> - select(cohort, pop) |> - right_join(mxF,by=join_by(cohort)) |> - mutate(pop_hat = pop * inflation_factor) |> - select(year, age, pop = pop_hat) |> - bind_rows(popF |> select(year,age,pop)) |> - filter(between(age, 15,50)) |> - arrange(age, year) |> - mutate(pop_1p1 = lead(pop), - exposure = (pop + pop_1p1) / 2) |> - filter(age<50) - -# each location year sums to 100% -data(percentASFR1dt) -# use TFR as period scalar -data(tfr1dt) - -# get asfr: -asfr <- - left_join(percentASFR1dt,tfr1dt,by=join_by(year, country_code,name)) |> - filter(country_code == 900, - between(year, 1990,1999)) |> - mutate(asfr = pasfr / 100 * tfr) - -Bt <- - left_join(expF, asfr |> select(-name), by = join_by(year, age)) |> - filter(year < 2000) |> - mutate(Bx = asfr * exposure) |> - group_by(year) |> - summarize(B = sum(Bx)) |> - mutate(age = 2000-year-1) |> - select(-year) - -# this now needs to survive forward until year 2000 - -SRB <- 1.05 - -# this chunk might have a misalignment, and still needs -# some care -pop_hat<- - mx1dt |> - filter(country_code == 900) |> - as_tibble() |> - select(year, age, mx = mxF) |> 
- # could try to warp to PC shape here, - # but uncertain infants. Maybe using - # an a0 assumption it'd be doable. - - # need cohorts to structure reverse survival - mutate(cohort = year - age - 1, - age_int = 1, - ax = if_else(age == 0, - lt_rule_1a0_ak(M0 = mx, Sex = "f"), - .5), - qx = lt_id_ma_q(nMx = mx, nax = ax, AgeInt = age_int)) |> - filter(between(cohort,1990,1999), - between(year,1990,2000), - age < 10) |> - arrange(cohort, age) |> - group_by(cohort) |> - mutate(lx = lt_id_q_l(nqx = qx, radix = 1), - dx = lx * qx, - Lx = lx - dx * (1-ax)) |> - filter(year == max(year)) |> - select(age, Lx) |> - left_join(Bt, by = join_by(age)) |> - mutate(pop_hat = Lx * B, - popF = pop_hat * 1 / (1+SRB), - popM = pop_hat * SRB / (1+SRB)) - -# reasonable comparison! -pop_hat |> - select(age, pop = popF) |> - ggplot(aes(x = age, y = pop)) + - geom_line() + - geom_line(data = popF |> filter(age < 10),color = "red") -} \ No newline at end of file diff --git a/R/checkers.R b/R/checkers.R index 5ca3cd9..936805b 100644 --- a/R/checkers.R +++ b/R/checkers.R @@ -131,8 +131,6 @@ check_numeric <- function(data) { #' data = data) #' -# Should we even ask some of these? Like if there is no AgeInt the reader will not work. - check_missing_cols <- function(data) { # TR: DemoTools has the function age2int() to infer age intervals from an Age vector, # to technically we don't need it. We do however need ages specified as lower bounds of @@ -482,25 +480,25 @@ check_lower <- function(data) { check_sex <- function(data) { if (any(str_detect("Sex", names(data)))) { - lvls <- unique(data$Sex) - - if (all(lvls %in% c("Male", "Female", "Total"))) { + lvls <- tolower(unique(data$Sex)) + + if (all(lvls %in% c("male", "female", "total"))) { message <- NA_character_ - + } else { - message <- "If Sex variable is given, it should be coded with either `Male`, `Female`, or `Total`." - + message <- "If Sex variable is given, it should be coded with either `male`, `female`, or `total`." 
+ } - + } else { message <- NA_character_ - + } - + res <- data.frame(check = "check sex", message = message) - + res$pass <- ifelse(!is.na(message), "Fail", "Pass") - + return(res) } @@ -557,9 +555,663 @@ check_data <- function(data) { # Combine all the check results - result <- do.call(rbind, list(ch1, ch2, ch3, - ch4, ch5, ch6, + result <- do.call(rbind, list(ch1, ch2, ch3, + ch4, ch5, ch6, ch7, ch8, ch9)) return(result) } + + +#' @title `check_missing_cols_generic` +#' @description Check if Age column exists. For generic validation that doesn't require specific value columns. +#' @param data tibble. A tibble with at minimum an Age column. +#' @return A data.frame with validation results containing columns: `check`, `message`, `pass`. +#' @export +check_missing_cols_generic <- function(data) { + missing_cols <- setdiff("Age", names(data)) + + if (length(missing_cols) > 0) { + message <- "The Age column is missing from the data. The calculations are halted." + } else { + message <- NA_character_ + } + + res <- data.frame(check = "check missing columns", message = message) + res$pass <- ifelse(!is.na(message), "Fail", "Pass") + return(res) +} + + +#' @title `check_has_numeric` +#' @description Check if the data has at least one numeric column besides Age. +#' @param data tibble. A tibble with demographic data. +#' @return A data.frame with validation results containing columns: `check`, `message`, `pass`. +#' @export +check_has_numeric <- function(data) { + # Get numeric columns excluding Age and internal columns + exclude_cols <- c("Age", "AgeInt", ".id", ".id_label") + numeric_cols <- names(data)[sapply(data, is.numeric)] + value_cols <- setdiff(numeric_cols, exclude_cols) + + if (length(value_cols) == 0) { + message <- "No numeric value columns found. Please include at least one numeric column besides Age." 
+ } else { + message <- NA_character_ + } + + res <- data.frame(check = "check has numeric", message = message) + res$pass <- ifelse(!is.na(message), "Fail", "Pass") + return(res) +} + + +#' @title `check_data_generic` +#' @description Generic data validation for modules that accept any numeric column (heaping, smoothing, graduation). +#' Requires only Age column and at least one numeric value column. Runs all standard age-related checks. +#' @param data tibble. A tibble with Age column and at least one numeric value column. +#' @return A data.frame with validation results containing columns: `check`, `message`, `pass`. +#' @export +#' @examples +#' library(tibble) +#' data <- tibble( +#' Age = c(0, 1, seq(5, 100, by = 5)), +#' births = runif(22, 1000, 10000), +#' population = runif(22, 50000, 100000) +#' ) +#' check_data_generic(data = data) +#' +# TODO: this should be a wrapper to the module-specific checkers, +# i.e. taking a second argument (module?). +check_data_generic <- function(data) { + + # Ensure '.id' column exists + if (!(".id" %in% colnames(data))) { + data$.id <- "all" + } + + # Split data by '.id', apply checks, and combine results + split_data <- split(data, data$.id) + + # Generic checks - only require Age and at least one numeric column + ch1 <- do.call(rbind, lapply(split_data, check_missing_cols_generic)) + ch2 <- do.call(rbind, lapply(split_data, check_has_numeric)) + ch3 <- do.call(rbind, lapply(split_data, check_rows)) + ch4 <- do.call(rbind, lapply(split_data, check_nas)) + ch5 <- do.call(rbind, lapply(split_data, check_coherent)) + ch6 <- do.call(rbind, lapply(split_data, check_sequential)) + ch7 <- do.call(rbind, lapply(split_data, check_redundant)) + ch8 <- do.call(rbind, lapply(split_data, check_lower)) + + # Combine all the check results + result <- do.call(rbind, list(ch1, ch2, ch3, ch4, ch5, ch6, ch7, ch8)) + + return(result) +} + + +#' @title `check_numeric_opag` +#' @description Check if the numeric columns for ODAP data (`Age`, 
`pop`) are numeric. +#' @param data tibble. A tibble containing population data for ODAP analysis. +#' @return A data.frame with 3 columns: `check` - the name of the test, `message` - The error message with corresponding information generated if the test is failed (if the test is passed evaluates to NA), `pass` - binary result of the test either "Fail" or "Pass". +#' @importFrom purrr map +#' @importFrom stringr str_flatten +#' @export +#' @examples +#' library(tibble) +#' data <- tibble( +#' Age = 0:100, +#' pop = c(9544406, 7471790, rep(1000000, 99)), +#' name = "India", +#' country_code = 356, +#' sex = "M", +#' year = 1971 +#' ) +#' +#' check_numeric_opag(data = data) +#' +check_numeric_opag <- function(data) { + # For ODAP, we check Age and pop columns + data_check <- subset(data, select = c("Age", "pop")) + + isnumeric <- data_check |> + map(~ is.numeric(.)) |> + unlist() + + if (sum(isnumeric) < ncol(data_check)) { + message <- paste0( + "Please check the input data. Every column should be numeric, while columns ", + str_flatten(names(data_check)[!isnumeric], collapse = ", "), + " are not." + ) + } else { + message <- NA_character_ + } + + data.frame( + check = "check_numeric_opag", + message = message, + pass = ifelse(is.na(message), "Pass", "Fail") + ) +} + + +#' @title `check_missing_cols_opag` +#' @description Check if any of the crucial columns are missing from ODAP population data: (`Age`, `pop`). +#' @param data tibble. A tibble containing population data for ODAP analysis. +#' @return A data.frame with 3 columns: `check` - the name of the test, `message` - The error message with corresponding information generated if the test is failed (if the test is passed evaluates to NA), `pass` - binary result of the test either "Fail" or "Pass". 
+#' @importFrom stringr str_c str_flatten +#' @export +#' @examples +#' library(tibble) +#' data <- tibble( +#' Age = 0:100, +#' pop = c(9544406, 7471790, rep(1000000, 99)), +#' name = "India", +#' country_code = 356, +#' sex = "M", +#' year = 1971 +#' ) +#' +#' check_missing_cols_opag(data = data) +#' +check_missing_cols_opag <- function(data) { + # For ODAP, we need Age and pop (population counts) + # Optional columns: name, sex, year, country_code, nLx + missing_cols <- setdiff(c("Age", "pop"), names(data)) + + if (length(missing_cols) > 0) { + message <- str_c( + "The following columns are missing from the data: ", + str_flatten(missing_cols, collapse = ", "), + ". The ODAP calculations require Age and pop (population counts)." + ) + } else { + message <- NA_character_ + } + + data.frame( + check = "check_missing_cols_opag", + message = message, + pass = ifelse(is.na(message), "Pass", "Fail") + ) +} + + +#' @title `check_nlx_if_present` +#' @description Check if nLx column is present and valid (numeric, no NAs). +#' @param data tibble. A tibble containing population data for ODAP analysis. +#' @return A data.frame with 3 columns: `check` - the name of the test, `message` - The error message with corresponding information generated if the test is failed (if the test is passed evaluates to NA), `pass` - binary result of the test either "Fail" or "Pass". 
+#' @export +#' @examples +#' library(tibble) +#' data <- tibble( +#' Age = 0:100, +#' pop = c(9544406, 7471790, rep(1000000, 99)), +#' nLx = runif(101, 50000, 100000) +#' ) +#' +#' check_nlx_if_present(data = data) +#' +check_nlx_if_present <- function(data) { + # Check for both nLx and nlx (lowercase) since column names may be converted + has_nlx <- "nLx" %in% names(data) + has_nlx_lower <- "nlx" %in% names(data) + + if (has_nlx || has_nlx_lower) { + # Use the correct column name + col_name <- if (has_nlx) "nLx" else "nlx" + + # Check if numeric + if (!is.numeric(data[[col_name]])) { + return(data.frame( + check = "check_nlx_numeric", + message = "Column 'nLx' must be numeric.", + pass = "Fail" + )) + } + + # Check for NAs + if (any(is.na(data[[col_name]]))) { + return(data.frame( + check = "check_nlx_missing", + message = "Column 'nLx' contains missing values.", + pass = "Fail" + )) + } + + return(data.frame( + check = "check_nlx", + message = NA_character_, + pass = "Pass" + )) + } + + # No nLx column - this is fine, will use WPP data + return(data.frame( + check = "check_nlx", + message = NA_character_, + pass = "Pass" + )) +} + + +#' @title `check_sex_opag` +#' @description Checks sex variable for OPAG analysis. Unlike general check_sex, +#' this does NOT allow 'Total' since OPAG requires sex-specific mortality data +#' from WPP. Accepts various formats: M/F, m/f, Male/Female, male/female, males/females. +#' @param data tibble. A tibble generated by the output of the read_data function. +#' @return A data.frame with validation results containing columns: `check`, `message`, `pass`. 
+#' @export +#' @examples +#' library(tibble) +#' data <- tibble(Age = 0:100, +#' pop = c(9544406, 7471790, rep(1000000, 99)), +#' Sex = "m") +#' check_sex_opag(data = data) +#' +check_sex_opag <- function(data) { + if (any(str_detect("Sex", names(data)))) { + lvls <- unique(data$Sex) + # Normalize sex values for comparison (handle m/f/male/female variants) + lvls_lower <- tolower(lvls) + valid_male <- lvls_lower %in% c("m", "male", "males") + valid_female <- lvls_lower %in% c("f", "female", "females") + + if (all(valid_male | valid_female)) { + message <- NA_character_ + } else { + message <- "If Sex variable is given, it should be coded as `male` or `female` (or `Male`/`Female`, `m`/`f`). `Total` is not supported for OPAG." + } + } else { + message <- NA_character_ + } + + res <- data.frame(check = "check sex", message = message) + res$pass <- ifelse(!is.na(message), "Fail", "Pass") + return(res) +} + + +#' @title `check_data_opag` +#' @description Upper level function that checks ODAP population data quality. Checks if the columns are numeric, if any of the columns is missing, if there is insufficient rows, if there are missing data entries, if ages do not start with 0, and also if ages are coherent, sequential and not redundant. +#' @param data tibble. A tibble containing population data for ODAP analysis with at minimum Age and pop columns. +#' @return A data.frame with validation results containing columns: `check`, `message`, `pass`. 
+#' @export +#' @examples +#' library(tibble) +#' data <- tibble( +#' Age = 0:100, +#' pop = c(9544406, 7471790, rep(1000000, 99)), +#' name = "India", +#' country_code = 356, +#' sex = "M", +#' year = 1971 +#' ) +#' +#' check_data_opag(data = data) +#' +check_data_opag <- function(data) { + + # Ensure '.id' column exists + if (!(".id" %in% colnames(data))) { + data$.id <- "all" + } + + # Extract unique .id values + id <- unique(data$.id) + + # Perform checks + # Split data by '.id', apply checks, and combine results + split_data <- split(data, data$.id) + + # ODAP-specific checks (using pop instead of Deaths/Exposures) + ch1 <- do.call(rbind, lapply(split_data, check_numeric_opag)) + ch2 <- do.call(rbind, lapply(split_data, check_missing_cols_opag)) + ch3 <- do.call(rbind, lapply(split_data, check_rows)) + ch4 <- do.call(rbind, lapply(split_data, check_nas)) + ch5 <- do.call(rbind, lapply(split_data, check_coherent)) + ch6 <- do.call(rbind, lapply(split_data, check_sequential)) + ch7 <- do.call(rbind, lapply(split_data, check_redundant)) + ch8 <- do.call(rbind, lapply(split_data, check_lower)) + ch9 <- do.call(rbind, lapply(split_data, check_sex_opag)) + ch10 <- do.call(rbind, lapply(split_data, check_nlx_if_present)) + + # Combine all the check results + result <- do.call(rbind, list(ch1, ch2, ch3, + ch4, ch5, ch6, + ch7, ch8, ch9, ch10)) + + return(result) +} + + +#' Checks input data for lifetable reconstruction +#' @title Check consistency of lifetable input data +#' @description +#' Validates a data frame intended for lifetable reconstruction. +#' The function checks whether the input contains at least one column that can be used to reconstruct a full lifetable and performs a set of structural and value-based validations. +#' If no grouping variable `.id` is present, one is automatically created with a single level `"all"`. 
+#' @details +#' The function searches for at least one of the following columns (in order of preference): +#' \itemize{ +#' \item \code{nMx} +#' \item \code{nlx} +#' \item \code{nqx} +#' \item \code{npx} +#' \item \code{ndx} +#' \item \code{nLx} +#' } +#' The first available column is used for validation checks. The following checks are performed: +#' \itemize{ +#' \item Numeric type validation +#' \item Missing value detection +#' \item Non-negativity constraint +#' } +#' If an \code{Age} column is present, additional checks are performed: +#' \itemize{ +#' \item Age coherence (numeric validity) +#' \item Redundancy detection +#' \item Sequential ordering +#' } +#' Validation is performed separately for each \code{.id} group. +#' @param data A data frame containing lifetable-related columns. Must include at least one of: \code{nMx}, \code{nlx}, \code{nqx}, \code{npx}, \code{ndx}, or \code{nLx}. Optionally may contain an \code{Age} column and a grouping column \code{.id}. +#' @importFrom DemoTools age2int is_age_coherent is_age_redundant is_age_sequential +#' @return +#' A data frame summarizing validation results for each group. 
The returned object contains: +#' \describe{ +#' \item{check}{Name of the validation performed} +#' \item{message}{Error message if the check fails, otherwise \code{NA}} +#' \item{pass}{Either `"Pass"` or `"Fail"`} +#' \item{.id}{Group identifier} +#' } +#' +#' @examples +#' library(readr) +#' fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +#' data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +#' data_in$nMx <- data_in$Deaths / data_in$Exposures +#' data_in <- data_in[data_in$`.id` == 1, ] +#' check_data_lifetable(data_in) +#' +#' @seealso +#' \code{\link{age2int}}, +#' \code{\link{is_age_coherent}}, +#' \code{\link{is_age_redundant}}, +#' \code{\link{is_age_sequential}} +#' +#' @export +#' + +check_data_lifetable <- function(data) { + + # Ensure '.id' column exists + if (!(".id" %in% names(data))) { + data$.id <- "all" + } + + # We can reconstruct lifetable from any of these columns + # We do not need ax strictly speaking, we cannot reconstruct from Tx and ex + # NOTE: I arranged columns in an order of preference from the best to the worst + # i.e. I prefer to construct LT from nMx, then from nlx etc. + # now I can choose the one column to check very easily + lt_cols <- c("nMx", "nlx", "nqx", "npx", "ndx", "nLx") + present <- intersect(lt_cols, names(data))[1] + # If at least one column is present we can build a lifetable + # let's perform a series of checks + if(is.na(present)) { + + stop( + "Your data does not contain columns that can be used to reconstruct a full lifetable.\n", + "It must contain at least one of the following: ", + paste(lt_cols, collapse = ", "), + call. 
= FALSE + ) + + } + + split_data <- split(data, data$.id) + + check_one <- function(d) { + + # choose column to work with + x <- d[[present]] + + # Perform checks + checks <- c( + check_numeric = !is.numeric(x), + check_missing = any(is.na(x)), + check_negative = any(x < 0, na.rm = TRUE) + ) + + messages <- c( + check_numeric = sprintf("Column %s must be numeric", present), + check_missing = sprintf("Column %s should not contain missing values", present), + check_negative = sprintf("All values in column %s should be >= 0", present) + ) + + out <- data.frame( + check = paste0(names(checks)," (", present, ")"), + message = ifelse(checks, messages, NA_character_)) + + out$pass <- ifelse(!is.na(out$message), "Fail", "Pass") + + if("Age" %in% names(d)) { + + Age <- d$Age + AgeInt <- age2int(Age) + + checks_age <- c( + check_coherent = !is_age_coherent(Age, AgeInt), + check_redundant = is_age_redundant(Age, AgeInt), + check_sequential = !is_age_sequential(Age) + ) + + messages_age <- c( + check_coherent = "Age values must be numeric", + check_redundant = "Age values are redundant", + check_sequential = "Age values should be sequential" + ) + + out_age <- data.frame( + check = names(checks_age), + message = ifelse(checks_age, messages_age, NA_character_)) + + out_age$pass <- ifelse(!is.na(out_age$message), "Fail", "Pass") + + out <- rbind(out, out_age) + + } + + return(out) + + } + + res <- + do.call(rbind, lapply(names(split_data), \(id) { + cbind(check_one(split_data[[id]]), .id = id) + })) + + return(res) + +} + + +#' Validate input data for age heaping analysis +#' @title Check consistency of data for age heaping diagnostics +#' @description +#' Validates a data frame prior to performing age heaping analysis. +#' The function checks whether a specified variable contains valid numeric values and ensures that required structural conditions are satisfied. +#' If a grouping variable \code{.id} is not present, one is automatically created with a single level `"all"`. 
+#' Validation is performed separately for each \code{.id} group. +#' @param data A data frame containing the variable to be evaluated. +#' Optionally may contain a grouping column \code{.id}. +#' @param X A character string specifying the name of the variable to be checked (e.g., `"Deaths"`). +#' @details +#' For the selected variable \code{X}, the following checks are performed: +#' \itemize{ +#' \item Numeric type validation +#' \item Missing value detection +#' \item Infinite value detection +#' \item Strict positivity constraint (all values must be > 0) +#' \item Presence of an \code{Age} column (required for age heaping analysis) +#' } +#' Each validation is conducted within levels of \code{.id}. +#' @return +#' A data frame summarizing validation results for each group. +#' The returned object contains: +#' \describe{ +#' \item{check}{Name of the validation performed} +#' \item{message}{Error message if the check fails, otherwise \code{NA}} +#' \item{pass}{Either `"Pass"` or `"Fail"`} +#' \item{.id}{Group identifier} +#' } +#' @examples +#' library(readr) +#' fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +#' data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +#' data_in <- data_in[data_in$`.id` == 1, ] +#' check_data_heaping(data = data_in, X = "Deaths") +#' +#' @export +#' +check_data_heaping <- function(data, X) { + + # Ensure '.id' column exists + if (!(".id" %in% names(data))) { + data$.id <- "all" + } + + # Perform checks + # Split data by '.id', apply checks, and combine results + split_data <- split(data, data$.id) + + check_one <- function(d) { + + x <- d[[X]] + checks <- c(check_numeric = !is.numeric(x), + check_missing = any(is.na(x)), + check_infinite = any(is.infinite(x)), + check_negative = any(x <= 0, na.rm = TRUE), + check_age = !("Age" %in% names(d)) + ) + + messages <- c( + check_numeric = sprintf("Column %s must be numeric", X), + check_missing = sprintf("Column %s should not contain 
missing values", X), + check_infinite = sprintf("Column %s should not contain infinite values", X), + check_negative = sprintf("All values in column %s must be > 0", X), + check_age = "Age column is required to check age heaping" + ) + + out <- data.frame( + check = names(checks), + message = ifelse(checks, messages, NA_character_)) + + out$pass <- ifelse(!is.na(out$message), "Fail", "Pass") + + return(out) + + } + + res <- + do.call(rbind, lapply(names(split_data), \(id) { + cbind(check_one(split_data[[id]]), .id = id) + })) + + return(res) + +} + +#' Validate input data for time graduation +#' @description +#' Performs structural and numerical validation checks for time-series data used in graduation or smoothing procedures. The function verifies that time points are numeric, unique, sequential, and evenly spaced, and that the associated values are numeric, finite, and strictly positive. +#' Validation is performed separately for each `.id` group. If `.id` is `NULL`, all observations are treated as belonging to a single group. +#' @param value Numeric vector of values to be graduated (e.g., life expectancy). +#' @param time Numeric vector of time points corresponding to `value` (e.g., calendar years). +#' @param .id Optional grouping variable. If provided, checks are performed separately by group. +#' @importFrom tibble tibble +#' @importFrom rlang %||% +#' @return +#' A data frame summarizing validation results. 
Contains: +#' \itemize{ +#' \item `check` – name of the validation performed +#' \item `message` – error message if the check fails, otherwise `NA` +#' \item `pass` – `"Pass"` or `"Fail"` +#' \item `.id` – group identifier +#' } +#' +#' @examples +#' check_data_graduate_time( +#' value = c(70.1, 70.4, 70.8), +#' time = c(2000, 2005, 2010) +#' ) +#' +#' @export +#' +# TODO, should take a data arg rather than building a tibble +check_data_graduate_time <- function(value, time, .id = NULL) { + + data <- tibble( + value = value, + time = time, + .id = .id %||% "all" + ) + + split_data <- split(data, data$.id) + + check_one <- function(d) { + + value <- d$value + time <- d$time + + time_diff <- diff(sort(time)) + + checks <- c( + check_time_numeric = !is.numeric(time), + check_value_numeric = !is.numeric(value), + check_time_missing = any(is.na(time)), + check_value_missing = any(is.na(value)), + check_time_infinite = any(is.infinite(time)), + check_value_infinite = any(is.infinite(value)), + check_time_positive = any(time <= 0, na.rm = TRUE), + check_value_positive = any(value <= 0, na.rm = TRUE), + check_time_unique = length(time) != length(unique(time)), + check_time_sequential = !all(time == sort(time)), + check_time_regular = length(unique(time_diff)) > 1 + ) + + messages <- c( + check_time_numeric = "Time variable must be numeric", + check_value_numeric = "Value variable must be numeric", + check_time_missing = "Time variable contains missing values", + check_value_missing = "Value variable contains missing values", + check_time_infinite = "Time variable contains infinite values", + check_value_infinite = "Value variable contains infinite values", + check_time_positive = "Time values must be > 0", + check_value_positive = "Value values must be > 0", + check_time_unique = "Time values must be unique", + check_time_sequential = "Time values must be sequential and ordered", + check_time_regular = "Time intervals must be regular (equal spacing)" + ) + + out <- 
data.frame( + check = names(checks), + message = ifelse(checks, messages, NA_character_) + ) + + out$pass <- ifelse(!is.na(out$message), "Fail", "Pass") + + return(out) + } + + res <- do.call( + rbind, + lapply(names(split_data), function(id) { + cbind(check_one(split_data[[id]]), .id = id) + }) + ) + + return(res) +} + diff --git a/R/graduation.R b/R/graduation.R index 488fbca..08718a2 100644 --- a/R/graduation.R +++ b/R/graduation.R @@ -1,3 +1,323 @@ +#' Graduate grouped time data to single-year estimates +#' Expands grouped time data (e.g., 5-year intervals) into single-year values using various graduation methods. The function supports smoothing, monotone interpolation, uniform splitting, cubic regression splines, penalized composite link models (PCLM), and LOESS smoothing. +#' +#' Available methods: +#' \itemize{ +#' \item `"uniform"` — distributes totals evenly within each interval +#' \item `"mono"` — monotone cubic interpolation (Hyman filtered spline) +#' \item `"smooth_spline"` — smoothing spline via \code{stats::smooth.spline} +#' \item `"cubic"` — cubic regression spline via \code{mgcv::gam} +#' \item `"loess"` — local polynomial smoothing via \code{stats::loess} +#' \item `"pclm"` — penalized composite link model via \code{ungroup::pclm} +#' } +#' +#' Block totals are preserved for all smoothing-based methods by rescaling +#' predictions within each interval to match original grouped totals. +#' @param data_in Data frame containing grouped time data. +#' @param method Graduation method. One of \code{"loess"}, \code{"smooth_spline"}, \code{"cubic"}, \code{"mono"}, \code{"uniform"}, \code{"pclm"}. +#' @param X Name of the time variable (character string). +#' @param Y Name of the grouped value variable (character string). +#' @param timeInt Width of grouping interval (default = 5). 
+#' +#' @importFrom stats smooth.spline loess loess.control predict splinefun +#' @importFrom mgcv gam s +#' @importFrom ungroup pclm +#' @importFrom tibble rownames_to_column +#' @importFrom dplyr mutate +#' @importFrom magrittr %>% +#' @importFrom DemoTools int2age +#' @importFrom stats gaussian +#' @details +#' If column \code{.id} exists in \code{data_in}, graduation is performed separately within each subgroup. Otherwise all observations are treated as a single group. +#' For spline-based methods, predicted single-year values are rescaled +#' within each interval so that grouped totals are exactly preserved. +#' The PCLM method relies on the \pkg{ungroup} package and may internally rescale small counts to avoid negative fitted values. +#' +#' @return +#' A data frame with columns: +#' \itemize{ +#' \item \code{time} — single-year time values +#' \item \code{value} — graduated single-year estimates +#' \item \code{.id} — subgroup identifier +#' } +#' +#' @examples +#' df <- data.frame( +#' Year = seq(1950, 2015, by = 5), +#' Deaths = c(403564.9, 426012.0, 466215.9, 523753.8, 560874.1, +#' 545608.3, 555335.9, 594584.6, 608425.3, 638167.3, +#' 655438.6, 689386.4, 740519.0, 804439.3) +#' ) +#' +#' graduate_time( +#' df, +#' method = "pclm", +#' X = "Year", +#' Y = "Deaths", +#' timeInt = 5 +#' ) +#' +#' @export +#' + +graduate_time <- function(data_in, + method = c("loess", "smooth_spline", "cubic", "mono", "uniform", "pclm"), + X, + Y, + timeInt = 5) { + # uniform graduation + graduate_uniform_time <- function(Value, + time, + timeInt = 5) { + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + if (is_single(time)) { + names(Value) <- time + return(Value) + } + + out <- rep(Value / timeInt, each = timeInt) + names(out) <- min(time):(min(time) + length(out) - 1) + + return(out) + + } + + # mono graduation + 
graduate_mono_time <- function (Value, time, timeInt = 5) { + + timeInt <- rep(timeInt, times = length(time)) + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + if (is_single(time)) { + names(Value) <- time + return(Value) + } + + timePred <- c(min(time), cumsum(timeInt) + min(time)) + y <- c(0, cumsum(Value)) + timeS <- min(time):(sum(timeInt) + min(time)) + y1 <- splinefun(y ~ timePred, method = "hyman")(timeS) + out <- diff(y1) + names(out) <- timeS[-length(timeS)] + time1 <- min(time):(min(time) + length(out) - 1) + names(out) <- time1 + return(out) + } + + # pclm routine using ungroup package + graduate_pclm_time <- function(Value, time, timeInt = 5) { + + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + + timeInt <- rep(timeInt, times = length(time)) + a1 <- seq(from = min(time), to = c(max(time) + (unique(timeInt) - 1)), by = 1) + ind0 <- Value == 0 + have0s <- any(ind0) + + if (have0s) { + cat("\n0s detected in Value, replacing with .01\n") + Value[ind0] <- 0.01 + } + + A <- pclm(x = time, + y = Value, + nlast = unique(timeInt)) + fac <- 1 + for(i in 1:3) { + + if(any(A$fitted < 0)) { + + fac <- 10 ^ i + A <- pclm(x = time, + y = Value * fac, + nlast = unique(timeInt)) + + } else { + + break + + } + } + + if(any(A$fitted < 0)) { + cat("\nCareful, results of PCLM produced some negatives. 
\n \nWe tried rescaling inputs by as much as", + fac, "\nbut alas it wasn't enough.\n") + } + if(fac > 1) { + cat("\nPossible small counts issue with these data and the PCLM method\nIt seems to have worked without producing negatives when fitting Value is scaled by", + fac, "\nCouldn't hurt to eyeball results!\n") + } + + B <- A$fitted / fac + a1.fitted <- A$bin.definition$output$breaks["left", ] + names(B) <- a1.fitted + return(B) + + } + + # cubic spline + graduate_cubic_time <- function(Value, time, timeInt = 5) { + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + + a1 <- seq(from = min(time), to = c(max(time) + (unique(timeInt) - 1)), by = 1) + + # Smooth totals (shape only) + z2 <- gam( + Value ~ s(time, bs = "cr"), + family = gaussian(), + method = "REML" + ) + + pred <- predict( + z2, + newdata = data.frame(time = a1) + ) + + # Enforce totals blockwise + out <- pred + for(i in seq_along(Value)) { + idx <- ((i - 1) * unique(timeInt) + 1):(i * unique(timeInt)) + out[idx] <- out[idx] * Value[i] / sum(out[idx]) + } + + return(out) + } + + # smooth_spline graduation method (spline) + graduate_smooth_spline_time <- function(Value, time, timeInt = 5) { + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + + a1 <- seq(from = min(time), to = c(max(time) + (unique(timeInt) - 1)), by = 1) + z <- smooth.spline(time, Value, spar = 0.6) + + pred <- predict(z, a1)$y + + # Enforce totals blockwise + out <- pred + for (i in seq_along(Value)) { + idx <- ((i - 1) * unique(timeInt) + 1):(i * unique(timeInt)) + out[idx] <- out[idx] * Value[i] / sum(out[idx]) + } + + return(out) + } + + + # loess graduation + graduate_loess_time <- function(Value, time, timeInt = 5) { + + if 
(missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + + a1 <- seq(from = min(time), to = c(max(time) + (unique(timeInt) - 1)), by = 1) + z1 <- loess(Value ~ time, + control = loess.control(surface = "direct")) + + pred <- predict(z1, newdata = data.frame(time = a1)) + + # rescale each 5-year block + out <- pred + for (i in seq_along(Value)) { + idx <- ((i - 1) * unique(timeInt) + 1):(i * unique(timeInt)) + out[idx] <- out[idx] * Value[i] / sum(out[idx]) + } + + return(out) + } + + method <- match.arg(method) + + # Ensure '.id' column exists + if (!(".id" %in% names(data_in))) { + data_in$.id <- "all" + } + + split_data <- split(data_in, data_in$.id) + + for_one <- function(d) { + + time <- d[[X]] + Value <- d[[Y]] + + res <- switch(method, + "smooth_spline" = graduate_smooth_spline_time( Value, time, timeInt = timeInt), + "loess" = graduate_loess_time( Value, time, timeInt = timeInt), + "cubic" = graduate_cubic_time( Value, time, timeInt = timeInt), + "mono" = graduate_mono_time( Value, time, timeInt = timeInt), + "uniform" = graduate_uniform_time(Value, time, timeInt = timeInt), + "pclm" = graduate_pclm_time( Value, time, timeInt = timeInt) + ) + + + res <- as.data.frame(res) %>% + rownames_to_column() %>% + set_names(c("time", "value")) %>% + mutate("value" = as.numeric(.data$value), + "time" = as.numeric(.data$time)) + + return(res) + + } + + out <- do.call(rbind, lapply(names(split_data), \(id) { + cbind(for_one(split_data[[id]]), .id = id) + })) + + return(out) + +} + #' @title `smooth_flexible` #' @description This is a wrapper around `smooth_flexible_chunk` that allows us to work with `.id` variable and create the output for corresponding groups. 
Smoothes population or death counts using various methods from the `DemoTools` package and paragraph five from "Method protocol for evaluating census population data by age and sex". #' @param data_in tibble or data.frame. A tibble with two numeric columns - population or death counts with supported names: `Pop`, `Population`, `Exp`, `Exposures` or `Deaths`, and corresponding numeric `Age` - provided in single age intervals, 5-year age intervals, or abridged age format e.g. with ages 0, 1, 5, etc. @@ -26,7 +346,7 @@ #' "single_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath) |> -#' dplyr::select(-1) +#' select(-1) #' ex1 <- smooth_flexible( #' data_in, #' variable = "Exposures", @@ -114,13 +434,13 @@ smooth_flexible <- function(data_in, # nms <- results |> - dplyr::select(all_of(c(".id", by_args))) |> + select(all_of(c(".id", by_args))) |> unite("one", everything(), sep = "_") |> pull(.data$one) smoothed_data <- results |> mutate(data = map(.data$result, ~ .x[["data"]]))|> - dplyr::select(-c(.data$result)) |> + select(-c(.data$result)) |> unnest(.data$data) figures <- results$result |> @@ -141,11 +461,13 @@ smooth_flexible <- function(data_in, #' @param constrain_infants logical. A scalar that indicates whether the infant proportions should be constrained or left as is. Defaults to `TRUE`. #' @param u5m numeric. Under-five mortality rate. Defaults to NULL. #' @param Sex character. Either `m` for males, `f` for females, or `t` for total (default). Please note that in case this parameter is not explicitly provided, the function will scan the data for the column with the corresponding name and use its levels automatically. +#' @param i18n Optional internationalization object for translating plot labels and titles. Defaults to NULL. #' @importFrom dplyr mutate group_by filter pull select summarise #' @importFrom tibble tibble #' @importFrom rlang := !! 
.data #' @importFrom DemoTools is_single is_abridged check_heaping_bachi groupAges ageRatioScore mav graduate_mono calcAgeAbr age2int graduate_uniform names2age lt_rule_4m0_D0 lt_rule_4m0_m0 -#' @return data_out. A list with two elements: `data_out` - a tibble with two numeric columns - smoothed counts for the chosen variable and `Age` - chosen age grouping, and `arguments` - a list of arguments used in the function. +#' @importFrom ggplot2 ggplot aes geom_line geom_point scale_color_manual scale_linetype_manual labs theme_minimal theme element_text +#' @return data_out. A list with three elements: `data_out` - a tibble with two numeric columns - smoothed counts for the chosen variable and `Age` - chosen age grouping, `plot` - a ggplot2 object comparing original vs graduated data, and `arguments` - a list of arguments used in the function. #' @export #' @examples #' library(readr) @@ -155,7 +477,7 @@ smooth_flexible <- function(data_in, #' "five_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath) |> -#' dplyr::select(-1) |> +#' select(-1) |> #' filter(.id == 1) #' #' ex1 <- graduate_auto(data_in, @@ -169,12 +491,15 @@ smooth_flexible <- function(data_in, #' ex1$arguments #' +# only can be called for counts and ages +# no need for rates and TIME graduate_auto <- function(data_in, age_out = c("single", "abridged", "5-year"), variable = NULL, u5m = NULL, Sex = c("t", "f", "m"), - constrain_infants = TRUE) { + constrain_infants = TRUE, + i18n = NULL) { ## f_args <- capture_args() age_out <- match.arg(age_out, c("single", "abridged", "5-year")) @@ -501,18 +826,117 @@ graduate_auto <- function(data_in, # (3) possibly constrain infants if(age_out %in% c("abridged", "single") & age_in %in% c("abridged", "single") & constrain_infants) { - + v_child <- data_out[data_out$Age < 5, variable, drop = TRUE] vN <- sum(v_child) v_child[1] <- pct_fst_ages[1] * vN v_child[-1] <- (1 - pct_fst_ages[1]) * vN * v_child[-1] / sum(v_child[-1]) data_out[data_out$Age 
< 5, variable] <- v_child - + } - - return(list(data_out = data_out - # arguments = f_args + # Pre-translate all UI text for plots (to avoid scope issues in nested functions) + cat(sprintf("[GRADUATION] Pre-translating text | i18n_null=%s\n", is.null(i18n))) + graduation_text <- translate_text("Graduation", i18n) + age_text <- translate_text("Age", i18n) + original_text <- translate_text("Original", i18n) + graduated_text <- translate_text("Graduated", i18n) + type_text <- translate_text("Type", i18n) + variable_text <- translate_text(variable, i18n) + + cat(sprintf("[GRADUATION] Translations: graduation='%s', age='%s', original='%s', graduated='%s'\n", + graduation_text, age_text, original_text, graduated_text)) + + # Create plot comparing original vs graduated data + plot_obj <- NULL + tryCatch({ + # Combine original and graduated data + original_df <- data.frame( + Age = data_in$Age, + Value = data_in[[variable]], + Type = original_text + ) + + graduated_df <- data.frame( + Age = data_out$Age, + Value = data_out[[variable]], + Type = graduated_text + ) + + # Check if we're converting from single ages to grouped ages + # If so, normalize graduated values for fair visual comparison + input_is_single <- is_single(data_in$Age) + output_is_grouped <- !is_single(data_out$Age) && + (age_out %in% c("5-year", "abridged")) + + if (input_is_single && output_is_grouped) { + # For visual comparison, divide grouped values by typical interval width + # This shows average per single age rather than sum + age_intervals <- diff(c(data_out$Age, max(data_out$Age) + 5)) + graduated_df$Value <- graduated_df$Value / age_intervals + cat(sprintf("[GRADUATION] Normalized graduated values for plot (single->grouped conversion)\n")) + } + + # Rename columns to translated versions for hover tooltips + names(original_df)[names(original_df) == "Age"] <- age_text + names(original_df)[names(original_df) == "Value"] <- variable_text + names(original_df)[names(original_df) == "Type"] <- type_text 
+ + names(graduated_df)[names(graduated_df) == "Age"] <- age_text + names(graduated_df)[names(graduated_df) == "Value"] <- variable_text + names(graduated_df)[names(graduated_df) == "Type"] <- type_text + + plot_data <- rbind(original_df, graduated_df) + + # Create named vector for colors using translated keys + color_values <- c("black", "blue") + names(color_values) <- c(original_text, graduated_text) + + # Build plot using .data[[]] for runtime evaluation + plot_obj <- ggplot(plot_data, aes(x = .data[[age_text]], y = .data[[variable_text]], + color = .data[[type_text]], + linetype = .data[[type_text]])) + + geom_line(linewidth = 1) + + geom_point(data = plot_data[plot_data[[type_text]] == original_text, ], size = 2) + + scale_color_manual(name = type_text, values = color_values) + + scale_linetype_manual(name = type_text, values = c("solid", "solid")) + + labs(x = age_text, y = variable_text, + title = paste0(graduation_text, ": ", variable_text)) + + theme_minimal(base_size = 14) + + theme( + legend.position = "bottom", + plot.title = element_text(face = "bold", hjust = 0.5) + ) + + cat(sprintf("[GRADUATION] Plot created successfully\n")) + }, error = function(e) { + cat(sprintf("[GRADUATION] Plot creation error: %s\n", e$message)) + }) + + # Normalize data_out for download to ensure matching totals + # This is different from plot normalization - here we preserve total sums + data_out_normalized <- data_out + + # Check if we're converting from single ages to grouped ages + input_is_single <- is_single(data_in$Age) + output_is_grouped <- !is_single(data_out$Age) && + (age_out %in% c("5-year", "abridged")) + + if (input_is_single && output_is_grouped) { + # Calculate scaling factor to preserve total sum + total_original <- sum(data_in[[variable]], na.rm = TRUE) + total_graduated <- sum(data_out[[variable]], na.rm = TRUE) + + if (total_graduated > 0) { + scaling_factor <- total_original / total_graduated + data_out_normalized[[variable]] <- data_out[[variable]] * 
scaling_factor + cat(sprintf("[GRADUATION] Applied normalization factor %.4f to preserve totals in downloaded data\n", scaling_factor)) + } + } + + return(list(data_out = data_out_normalized, + plot = plot_obj + # arguments = f_args )) } @@ -534,7 +958,7 @@ graduate_auto <- function(data_in, #' "five_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath) |> -#' dplyr::select(-1) |> +#' select(-1) |> #' filter(.id == 1) #' #' graduate_auto_5(data_in, "Exposures")$Exposures @@ -672,7 +1096,7 @@ graduate_auto_5 <- function(dat_5, #' "abridged_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath) |> -#' dplyr::select(-1) |> +#' select(-1) |> #' filter(.id == 1) #' #' ex1 <- smooth_flexible_chunk( @@ -689,6 +1113,17 @@ graduate_auto_5 <- function(dat_5, #' ex1$figure$figure #' ex1$arguments + +# if the input is time then user provides vector 2011:2020 +# another copy for graduate_time +# graduate time with pclm and mono, graduate uniform +# "pclm", "mono", "uniform" +# do not demand age, time can be various intervals +# times should be unique, and sequential and make sense, and coherent 2005:2010, 2008:2011 +# no overlap (unique) +# use the logic of age coherent +# use the same plotting logic + smooth_flexible_chunk <- function(data_in, variable = "Deaths", age_out = c("single", "abridged", "5-year"), @@ -895,10 +1330,17 @@ smooth_flexible_chunk <- function(data_in, } if(any(data5[, variable, drop = TRUE] < 0)) { - stop( - "Check your input data or consider changing the selected rough method. - Current smoothing process is returning negative values." 
- ) + neg_idx <- which(data5[, variable, drop = TRUE] < 0) + neg_ages <- paste(data5$Age[neg_idx], collapse = ", ") + neg_vals <- paste(round(data5[neg_idx, variable, drop = TRUE], 2), collapse = ", ") + + stop(paste0( + "Smoothing produced negative values for '", variable, "'\n", + "Method '", rough_method, "' produced negative values at ages: ", neg_ages, "\n", + "Values: ", neg_vals, "\n", + "This typically happens when data has extreme discontinuities (e.g., large jumps between age groups)\n", + "Try using 'auto' method instead, or check your input data for irregularities" + )) } # HERE @@ -1239,8 +1681,8 @@ smooth_flexible_chunk <- function(data_in, #' "abridged_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath, show_col_types = FALSE) |> -#' dplyr::select(-1) |> -#' dplyr::filter(.id == 1) +#' select(-1) |> +#' filter(.id == 1) #' #' data_out <- smooth_flexible_chunk( #' data_in, @@ -1272,13 +1714,8 @@ plot_smooth_compare <- function(data_in, # Translate the variable name for display if translation exists variable_display <- translate_text(variable, i18n) age_text <- translate_text("Age", i18n) - # TR: kludge 2025-10-14 in order to avoid having two columns called Age - # in data_out when the language is English, which breaks ggplot code. 
- age_text <- paste0(" ",age_text," ") - # same - variable_display <- paste0(" ",variable_display," ") title_text <- paste(translate_text("Original (black) and adjusted (red)", i18n), variable_display, translate_text("data", i18n)) - + data_in <- data_in |> mutate( single = is_single(.data$Age), @@ -1296,16 +1733,8 @@ plot_smooth_compare <- function(data_in, age_label = case_when(.data$Age == max(.data$Age) ~ paste0(max(.data$Age), "+"), TRUE ~ paste0("[", .data$Age, ",", .data$Age + .data$AgeInt, ")")), plot_y = !!sym(variable) / .data$AgeInt) - - # Translate column names for display - names(data_in)[names(data_in) == "age_mid"] <- age_text - names(data_out)[names(data_out) == "age_mid"] <- age_text - - names(data_in)[names(data_in) == "plot_y"] <- variable_display - names(data_out)[names(data_out) == "plot_y"] <- variable_display - - sym_variable_display <- sym(variable_display) - sym_age_text <- sym(age_text) + sym_variable_display <- sym("plot_y") + sym_age_text <- sym("age_mid") figure <- ggplot() + @@ -1336,4 +1765,3 @@ plot_smooth_compare <- function(data_in, - diff --git a/R/input_diagnostics.R b/R/input_diagnostics.R index 5f3460d..e229c1c 100644 --- a/R/input_diagnostics.R +++ b/R/input_diagnostics.R @@ -1,83 +1,337 @@ -#' @title `check_heaping_general` -#' @description Check the age heaping for 5 or 1 year data. -#' @param data data.frame. User file from the read_data command with the minimum data on `Exposures`, `Death` and `Age`. Data can be both in 5 and 1 year age intervals -#' @param y character.Variable name for which the heaping should be checked `Deaths` or `Exposures`. 
-#' @return A data.frame with 2 columns `method` - the method used for age heaping evaluation and `result` - the resulting heaping measure -#' @importFrom stringr str_detect -#' @importFrom dplyr bind_rows left_join join_by -#' @importFrom rlang .data -#' @importFrom DemoTools check_heaping_roughness check_heaping_bachi check_heaping_myers check_heaping_sawtooth -#' @export -#' @examples -#' \dontrun{ -#' check_heaping_general( -#' data = data, -#' y = "Exposures") + +# check_heaping_general <- function(data, y) { +# +# tbl <- tibble("level" = c(c("Highly accurate", "Fairly accurate", +# "Approximate", "Rough","Very rough")), +# color = c("#ffffb2","#fecc5c","#fd8d3c","#f03b20","#bd0026")) +# +# has_single <- is_single(data$Age) +# +# if(has_single) { +# +# # go with +# bachi <- check_heaping_bachi(subset(data, select = y, drop = TRUE), +# data$Age, +# OAG = TRUE, +# method = "pasex") +# +# myers <- check_heaping_myers(subset(data, select = y, drop = TRUE), data$Age) +# +# res <- tibble("age scale" = "single", +# "method" = c("bachi", "myers"), +# "result" = c(bachi, myers)) |> +# mutate("level" = cut(.data$result, +# breaks = c(0,1,2,5,15,80), +# labels = c("Highly accurate", "Fairly accurate", "Approximate", +# "Rough","Very rough")) |> as.character()) |> +# left_join(tbl, by = join_by("level")) +# +# } +# tblr <- tibble("method" = "roughness", +# "level" = c(1.03)) +# # 5-year methods +# roughness <- check_heaping_roughness(subset(data, select = y, drop = TRUE), data$Age, ageMin = 30) +# sawtooth <- check_heaping_sawtooth( subset(data, select = y, drop = TRUE), data$Age, ageMin = 30) +# +# r <- cut(roughness, breaks = c(0,.1,.2,.5,1.5,10), +# labels = c("Highly accurate", "Fairly accurate", "Approximate", +# "Rough","Very rough")) |> as.character() +# +# s <- cut(sawtooth, breaks = c(1,1.03,1.1,1.5,3,10), +# labels = c("Highly accurate", "Fairly accurate", "Approximate", +# "Rough","Very rough")) |> as.character() +# +# res5 <- tibble('age scale' = "5-year", +# 
"method" = c("roughness", "sawtooth"), +# "result" = c(roughness, sawtooth), +# "level" = c(r, s)) |> +# left_join(tbl, by = join_by("level")) +# +# if (has_single){ +# +# out <- bind_rows(res, res5) +# +# } else { +# +# out <- res5 +# +# } +# +# return(out) +# +# } + +# add argument rate or proportion, units +# and add log, sqrt, logit +# Scale +# check_heaping_general <- function(data, y) { +# +# if(!(".id" %in% colnames(data))) { +# data <- data |> +# mutate(.id = "all") +# } +# +# labels <- c( +# "Highly accurate", +# "Fairly accurate", +# "Approximate", +# "Rough", +# "Very rough" +# ) +# +# tbl <- tibble("level" = labels, +# color = c("#ffffb2", +# "#fecc5c", +# "#fd8d3c", +# "#f03b20", +# "#bd0026")) +# +# has_single <- is_single(unique(data$Age)) +# +# if(has_single) { +# +# res <- data |> +# select("Age", all_of(y), ".id") |> +# group_nest(.data$`.id`) |> +# mutate("bachi" = map(data, ~ +# check_heaping_bachi( +# Value = pull(.x, !!sym(y)), +# Age = .x$Age, +# OAG = TRUE, +# method = "pasex")), +# "myers" = map(data, ~ +# check_heaping_myers( +# Value = pull(.x, !!sym(y)), +# Age = .x$Age)), +# "age scale" = "single") |> +# select(".id", "bachi", "myers", "age scale") |> +# unnest(c("bachi", "myers")) |> +# pivot_longer(-c(".id", "age scale"), +# names_to = "method", +# values_to = "result") |> +# group_by(.data$`.id`) |> +# mutate("level" = cut(.data$result, +# breaks = c(0, 1, 2, 5, 15, 80), +# labels = labels), +# "level" = as.character(.data$level)) |> +# left_join(tbl, by = c("level")) +# +# } +# +# # 5-year methods +# classify <- function(x, method, breaks, labels) { +# tibble( +# result = x, +# method = method, +# level = cut(x, breaks = breaks, labels = labels) +# ) +# } +# +# res5 <- data |> +# select("Age", all_of(y), ".id") |> +# group_nest(.data$`.id`) |> +# mutate( +# "roughness" = map(data, ~ +# check_heaping_roughness( +# Value = pull(.x, !!sym(y)), +# Age = .x$Age, +# ageMin = 30)), +# "sawtooth" = map(data, ~ +# check_heaping_sawtooth( +# 
Value = pull(.x, !!sym(y)), +# Age = .x$Age, +# ageMin = 30)), +# "res" = map2(.x = .data$roughness, +# .y = .data$sawtooth, ~ +# bind_rows( +# classify( +# x = .x, +# method = "roughness", +# breaks = c(0, .1, .2, .5, 1.5, 10), +# labels = labels), +# classify( +# x = .y, +# method = "sawtooth", +# breaks = c(1, 1.03, 1.1, 1.5, 3, 10), +# labels = labels))), +# "age scale" = "5-year") |> +# select(-c("data", "roughness", "sawtooth")) |> +# unnest("res") |> +# left_join(tbl, by = join_by("level")) +# +# if(has_single) { +# +# out <- bind_rows(res, res5) +# +# } else { +# +# out <- res5 +# +# } +# +# return(out) +# +# } + + +#' General age-heaping diagnostics +#' @description +#' Computes age-heaping and age-irregularity indices from age-specific counts, rates, or proportions. The set of diagnostics depends on the age scale: digit-preference indices are used for single-year ages, while irregularity indices are used for grouped ages. +#' @param data A data frame containing at least `Age` and the variable `y`. If `.id` is present, diagnostics are computed by group; otherwise all observations are treated as one group. +#' @param y Character string giving the name of the age-specific variable. +#' @param units Type of input data. One of `"count"`, `"rate"`, or `"proportion"`. +#' @importFrom tibble tibble +#' @importFrom dplyr full_join mutate left_join join_by bind_rows +#' @importFrom DemoTools check_heaping_bachi check_heaping_myers check_heaping_roughness check_heaping_sawtooth +#' @details +#' * Single-year ages: Bachi and Myers digit-preference indices. +#' * Grouped ages: roughness and sawtooth irregularity indices. +#' For `"count"` data, values are internally converted to proportions. +#' For `"rate"` and `"proportion"` data, values are normalized to sum to one +#' to ensure comparability across ages. 
+#' @return +#' A data frame with one row per index and group, containing: +#' \itemize{ +#' \item `method` – diagnostic index name +#' \item `result` – numeric value +#' \item `level` – qualitative data-quality category +#' \item `color` – associated color code +#' \item `age scale` – `"single"` or `"5-year"` +#' \item `.id` – group identifier #' } +#' @seealso +#' \code{\link{check_heaping_bachi}}, +#' \code{\link{check_heaping_myers}}, +#' \code{\link{check_heaping_roughness}}, +#' \code{\link{check_heaping_sawtooth}} +#' @references +#' Bachi (1951); Myers (1940); United Nations (1955). +#' @examples +#' library(readr) +#' fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +#' data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +#' data_in$nMx <- data_in$Deaths / data_in$Exposures +#' data_in <- data_in[data_in$.id == 1, ] +#' check_heaping_general(data = data_in, y = "Deaths", units = "count") +#' check_heaping_general(data = data_in, y = "nMx", units = "rate") +#' @export #' -check_heaping_general <- function(data, y) { +check_heaping_general <- function(data, y, units = c("count", "rate", "proportion")) { - tbl <- tibble("level" = c(c("Highly accurate", "Fairly accurate", - "Approximate", "Rough","Very rough")), - color = c("#ffffb2","#fecc5c","#fd8d3c","#f03b20","#bd0026")) + units <- match.arg(units) - has_single <- is_single(data$Age) + stopifnot(is.character(y), length(y) == 1) + stopifnot(all(c("Age", y) %in% names(data))) - if(has_single) { - - # go with - bachi <- check_heaping_bachi(subset(data, select = y, drop = TRUE), - data$Age, - OAG = TRUE, - method = "pasex") - - myers <- check_heaping_myers(subset(data, select = y, drop = TRUE), data$Age) - - res <- tibble("age scale" = "single", - "method" = c("bachi", "myers"), - "result" = c(bachi, myers)) |> - mutate("level" = cut(.data$result, - breaks = c(0,1,2,5,15,80), - labels = c("Highly accurate", "Fairly accurate", "Approximate", - "Rough","Very 
rough")) |> as.character()) |> - left_join(tbl, by = join_by("level")) - - } - tblr <- tibble("method" = "roughness", - "level" = c(1.03)) - # 5-year methods - roughness <- check_heaping_roughness(subset(data, select = y, drop = TRUE), data$Age, ageMin = 30) - sawtooth <- check_heaping_sawtooth( subset(data, select = y, drop = TRUE), data$Age, ageMin = 30) - - r <- cut(roughness, breaks = c(0,.1,.2,.5,1.5,10), - labels = c("Highly accurate", "Fairly accurate", "Approximate", - "Rough","Very rough")) |> as.character() + # Ensure '.id' column exists + if (!(".id" %in% names(data))) { + data$.id <- "all" + } + + # keep only columns we need + data <- data |> + select(c('.id', "Age", all_of(y))) - s <- cut(sawtooth, breaks = c(1,1.03,1.1,1.5,3,10), - labels = c("Highly accurate", "Fairly accurate", "Approximate", - "Rough","Very rough")) |> as.character() + # Perform checks + # Split data by '.id', apply checks, and combine results + split_data <- split(data, data$.id) + labels <- c("Highly accurate", "Fairly accurate", "Approximate", "Rough", "Very rough") + color <- c("#ffffb2", "#fecc5c", "#fd8d3c", "#f03b20", "#bd0026") + tbl <- tibble("level" = labels, + "color" = color) - res5 <- tibble('age scale' = "5-year", - "method" = c("roughness", "sawtooth"), - "result" = c(roughness, sawtooth), - "level" = c(r, s)) |> - left_join(tbl, by = join_by("level")) + check_one <- function(d = split_data) { + + Y <- d[[y]] + Age <- d$Age + has_single <- is_single(Age) + + if (units != "count") { + + Y <- Y / sum(Y) + + } - if (has_single){ + roughness <- check_heaping_roughness( + Value = Y, + Age = Age, + ageMin = 30) - out <- bind_rows(res, res5) + sawtooth <- check_heaping_sawtooth( + Value = Y, + Age = Age, + ageMin = 30) - } else { + roughness <- tibble("method" = "roughness", + "result" = roughness) |> + mutate("level" = cut(.data$result, + breaks = c(0, .1, .2, .5, 1.5, 10), + labels = labels), + "level" = as.character(.data$level)) - out <- res5 + sawtooth <- 
tibble("method" = "sawtooth", + "result" = sawtooth) |> + mutate("level" = cut(.data$result, + breaks = c(1, 1.03, 1.1, 1.5, 3, 10), + labels = labels), + "level" = as.character(.data$level)) + + out5 <- roughness |> + full_join(sawtooth, by = join_by("method", "result", "level")) |> + mutate("age scale" = "5-year") |> + left_join(tbl, by = c("level")) + + if(has_single) { + + bachi <- check_heaping_bachi( + Value = Y, + Age = Age, + OAG = TRUE, + method = "pasex") + + myers <- check_heaping_myers( + Value = Y, + Age = Age) + + out <- tibble("method" = c("bachi", "myers"), + "result" = c(bachi, myers), + "age scale" = "single") |> + mutate("level" = cut(.data$result, + breaks = c(0, 1, 2, 5, 15, 80), + labels = labels), + "level" = as.character(.data$level)) |> + left_join(tbl, by = c("level")) + + out <- bind_rows(out, out5) + + } else { + + out <- out5 + + } + + if(units %in% c("rate", "proportion")) { + + out <- out[!out$method == "roughness", ] + + } + + return(out) } - return(out) + res <- + do.call(rbind, lapply(names(split_data), \(id) { + cbind(check_one(split_data[[id]]), .id = id) + })) + + return(res) } + + #' @title `check_heaping_user` #' @description Check the age heaping for 5 or 1 year data, but this time give user control over minimum and maximum evaluation age. #' @param data data.frame. User file from the read_data command with the minimum data on Exposures, Death and Age. 
Data can be both in 5 and 1 year age intervals diff --git a/R/lifetables.R b/R/lifetables.R index c3e3ce7..921bfae 100644 --- a/R/lifetables.R +++ b/R/lifetables.R @@ -642,31 +642,35 @@ lt_summary_chunk <- function(data_out, i18n = NULL) { p_20_65 <- l65 / l20 q_20_65 <- 1 - p_20_65 - out <- tibble(e0, + label_vec <- c("e_0", + translate_text("Median", i18n), + translate_text("Mode", i18n), + "e_65", + "\\sigma_0", + "\\sigma_{10}", + "IQR", + "{}_{45}q_{20}") + + message_vec <- c(translate_text("Life Expectancy at Birth", i18n), + translate_text("Median Age at Death", i18n), + translate_text("Modal Age at Death", i18n), + translate_text("Remaining Life Expectancy at Age 65", i18n), + translate_text("Lifespan Variation Calculated as Standard Deviation of Age at Death", i18n), + translate_text("Standard Deviation of Remaining Lifespan Conditional on Survival to Age 10", i18n), + translate_text("Interquartile Range of Age at Death Distribution", i18n), + translate_text("Conditional Probability of Death Between Ages 20 and 65", i18n)) + + out <- tibble(e0, Median = median_age, Mode = mod_age, - e65, - sd0 = S[1], + e65, + sd0 = S[1], sd10 = S[11], IQR, - q_20_65) |> - pivot_longer(everything(), names_to = "measure", values_to = "value") |> - mutate(label = c("e_0", - translate_text("Median", i18n), - translate_text("Mode", i18n), - "e_65", - "\\sigma_0", - "\\sigma_{10}", - "IQR", - "{}_{45}q_{20}"), - message = c(translate_text("Life Expectancy at Birth", i18n), - translate_text("Median Age at Death", i18n), - translate_text("Modal Age at Death", i18n), - translate_text("Remaining Life Expectancy at Age 65", i18n), - translate_text("Lifespan Variation Calculated as Standard Deviation of Age at Death", i18n), - translate_text("Standard Deviation of Remaining Lifespan Conditional on Survival to Age 10", i18n), - translate_text("Interquartile Range of Age at Death Distribution", i18n), - translate_text("Conditional Probability of Death Between Ages 20 and 65", i18n))) + 
q_20_65) |> + pivot_longer(everything(), names_to = "measure", values_to = "value") |> + mutate(label = label_vec, + message = message_vec) # Note from Jorge C: pleassseee let's leave the order as is. If we change this order # then I have to change the order (not adding any other columns or anything) in the diff --git a/R/movepop.R b/R/movepop.R deleted file mode 100644 index f8a7dfd..0000000 --- a/R/movepop.R +++ /dev/null @@ -1,393 +0,0 @@ -# movepop <- function(initial_date, -# desired_date, -# male_pop, -# female_pop, -# male_mx, -# female_mx, -# asfr, -# annual_net_migrants = 0, -# age_groups = NULL, -# age_format = "auto") { -# -# # age format -# age_format <- match.arg(age_format, c("single_year", "five_year", "auto")) -# -# # check if all incoming data have same number of ages -# # Validate input lengths -# if(length(unique( -# c(length(male_pop), length(female_pop), length(male_mx), length(female_mx)) -# )) != 1) { -# stop("All population and mortality vectors must have the same length") -# } -# -# n_ages <- length(male_pop) -# -# # Auto-detect age format -# # stop if age format is not single or five year -# if (age_format == "auto") { -# -# age_format <- case_when( -# n_ages == 18 ~ "five_year", -# n_ages >= 101 ~ "single_year", -# TRUE ~ NA_character_ -# ) -# -# if(is.na(age_format)) { -# -# stop("Cannot auto-detect age format") -# -# } -# -# } -# -# -# # groupAges() -# -# # Default age groups -# if(is.null(age_groups)) { -# -# if(age_format == "five_year") { -# -# ge_groups <- c( -# "0-1", "1-4", "5-9", "10-14", "15-19", "20-24", "25-29", -# "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", -# "65-69", "70-74", "75-79", "80+") -# } -# -# if(age_format == "single_year") { -# -# age_groups <- c(as.character(0:99), "100+") -# -# } -# -# -# if(!age_format %in% c("single_year", "five_year")) { -# -# age_groups <- c(as.character(0:(n_ages - 2)), paste0(n_ages - 1, "+")) -# -# } -# -# } -# -# -# -# # Reproductive indices (fertile ages) -# 
repro_indices <- if (age_format == "five_year") 5:11 else { -# switch(as.character(length(asfr)), -# `35` = 16:50, `40` = 11:50, `45` = 11:55, -# stop("ASFR must have 35, 40, or 45 values for single-year format")) -# } -# -# # Births -# births <- asfr * female_pop[repro_indices] -# birth_age_groups <- if (age_format == "five_year") c("15-19","20-24","25-29","30-34","35-39","40-44","45-49") else -# as.character(if (length(asfr)==35) 15:49 else if (length(asfr)==40) 10:49 else 10:54) -# total_births <- sum(births) -# -# # Deaths -# deaths <- male_mx * male_pop + female_mx * female_pop -# total_deaths <- sum(deaths) -# -# # Vital rates -# total_pop_initial <- sum(male_pop) + sum(female_pop) -# crude_birth_rate <- total_births / total_pop_initial -# crude_death_rate <- total_deaths / total_pop_initial -# net_migration_rate <- annual_net_migrants / total_pop_initial -# growth_rate <- crude_birth_rate - crude_death_rate + net_migration_rate -# tfr <- if (age_format == "five_year") 5*sum(asfr) else sum(asfr) -# -# # Project population -# time_diff <- desired_date - initial_date -# total_pop_projected <- round(total_pop_initial * exp(growth_rate * time_diff)) -# adjustment_factor <- total_pop_projected / total_pop_initial -# male_pop_projected <- round(adjustment_factor * male_pop) -# female_pop_projected <- round(adjustment_factor * female_pop) -# both_sexes_projected <- male_pop_projected + female_pop_projected -# sex_ratio <- male_pop_projected / female_pop_projected -# -# # Age summary -# age_summary <- tibble( -# age_group = age_groups, -# both_sexes = both_sexes_projected, -# male = male_pop_projected, -# female = female_pop_projected, -# sex_ratio = round(sex_ratio,6) -# ) -# -# # Special summaries -# all_ages <- tibble( -# age_group = "All ages", -# both_sexes = sum(both_sexes_projected), -# male = sum(male_pop_projected), -# female = sum(female_pop_projected), -# sex_ratio = sum(male_pop_projected)/sum(female_pop_projected) -# ) -# -# special_summaries <- 
list(all_ages = all_ages) -# -# if (age_format == "five_year") { -# special_summaries$under_5 <- tibble( -# age_group = "0-4", -# both_sexes = sum(both_sexes_projected[1:2]), -# male = sum(male_pop_projected[1:2]), -# female = sum(female_pop_projected[1:2]), -# sex_ratio = sum(male_pop_projected[1:2])/sum(female_pop_projected[1:2]) -# ) -# } else { -# df <- tibble( -# age = 0:(length(both_sexes_projected)-1), -# both_sexes = both_sexes_projected, -# male = male_pop_projected, -# female = female_pop_projected -# ) %>% -# mutate(age_group = case_when( -# age <= 4 ~ "0-4", -# age >= 100 ~ "100+", -# TRUE ~ paste0(floor(age/5)*5,"-",floor(age/5)*5+4) -# )) %>% -# group_by(age_group) %>% -# summarise( -# both_sexes = sum(both_sexes), -# male = sum(male), -# female = sum(female), -# .groups="drop" -# ) %>% -# mutate(sex_ratio = male/female) -# special_summaries$age_groups_5yr <- df -# } -# -# # Assemble results -# results <- list( -# initial_date = initial_date, -# desired_date = desired_date, -# time_interval = time_diff, -# age_format = age_format, -# crude_birth_rate = crude_birth_rate, -# crude_death_rate = crude_death_rate, -# net_migration_rate = net_migration_rate, -# growth_rate = growth_rate, -# total_fertility_rate = tfr, -# initial_population = total_pop_initial, -# projected_population = total_pop_projected, -# adjustment_factor = adjustment_factor, -# births_by_age = tibble(age_group=birth_age_groups, births=births, asfr=asfr), -# total_births = total_births, -# deaths_by_age = tibble(age_group=age_groups, deaths=deaths, male_mx=male_mx, female_mx=female_mx), -# total_deaths = total_deaths, -# population_projection = if(age_format=="five_year") bind_rows(all_ages,special_summaries$under_5,age_summary) else age_summary, -# special_summaries = special_summaries, -# annual_net_migrants = annual_net_migrants -# ) -# -# return(results) -# } - - - -#' Project Population Between Two Dates Using Age-Specific Rates -#' -#' The `movepop()` function projects a 
population from an initial date to a desired date -#' using age-specific fertility (ASFR) and mortality rates. It supports single-year or -#' five-year age group formats and optionally accounts for net migration. -#' The function produces both projected population by age and sex and summary statistics. -#' -#' @param initial_date Numeric or Date. The starting year/date of the population. -#' @param desired_date Numeric or Date. The year/date to which the population is projected. -#' @param male_pop Numeric vector of male population counts by age. -#' @param female_pop Numeric vector of female population counts by age. -#' @param male_mx Numeric vector of male mortality rates by age. -#' @param female_mx Numeric vector of female mortality rates by age. -#' @param asfr Numeric vector of age-specific fertility rates corresponding to female ages. -#' @param annual_net_migrants Numeric scalar for annual net migration to include in growth (default 0). -#' @param age_groups Optional character vector of age group labels. If `NULL`, default labels are assigned. -#' @param age_format Character specifying age structure: `"single_year"`, `"five_year"`, or `"auto"` (default `"auto"`). When `"auto"`, the function infers format from the length of population vectors. -#' -#' @details -#' - The function ensures that population and mortality vectors have equal lengths. -#' - If `age_format` is `"auto"`, lengths of 18 imply `"five_year"` and lengths ≥101 imply `"single_year"`. -#' - Age groups are automatically generated if not provided, and ASFR values are aligned starting at the age group `"15"` (or equivalent). -#' - Projected populations are computed using exponential growth based on crude birth rate, crude death rate, and net migration rate. -#' - The function returns both age-specific projected populations and summary statistics. 
-#' -#' @return A list with three elements: -#' \describe{ -#' \item{\code{initial_summaries}}{A tibble summarising total births, deaths, crude rates, growth rate, total population, and adjustment factor.} -#' \item{\code{projected_summaries}}{A tibble summarising projected total population by sex and overall sex ratio.} -#' \item{\code{data}}{A tibble with age-specific populations, ASFR, births, deaths, projected populations by sex, both-sexes totals, and sex ratios.} -#' } -#' -#' @examples -#' \dontrun{ -#'male_pop <- c(48875, 164390, 173551, 130297, 101143, 73615, 60594, 55175, -#' 49530, 46562, 39028, 27837, 22110, 18066, 15340, 13318, -#'#' 12002, 6424) -#' -#'female_pop <- c(47105, 159546, 168760, 119437, 92080, 70515, 58801, 53381, -#' 46757, 41164, 33811, 24121, 19315, 16319, 14058, 12302, -#' 11047, 5922) -#' -#'male_mx <- c(0.12427, 0.01639, 0.00274, 0.00167, 0.00251, 0.00380, 0.00382, -#' 0.00442, 0.00506, 0.00663, 0.00872, 0.01240, 0.01783, 0.02700, -#' 0.04126, 0.06785, 0.11287, 0.21015) -#' -#'female_mx <- c(0.11050, 0.01577, 0.00254, 0.00159, 0.00232, 0.00304, 0.00344, -#' 0.00370, 0.00418, 0.00492, 0.00592, 0.00831, 0.01182, 0.01942, -#' 0.03221, 0.05669, 0.09771, 0.19385) -#' -#'asfr <- c(0.199, 0.478, 0.418, 0.321, 0.163, 0.071, 0.028) -#' -#' res <- movepop( -#'initial_date = 1973.58, -#'desired_date = 1973.50, -#'male_pop = male_pop, -#'female_pop = female_pop, -#'male_mx = male_mx, -#'female_mx = female_mx, -#'asfr = asfr, -#'annual_net_migrants = -50000, -#'age_format = "five_year" -#' -#' res$initial_summaries -#' res$projected_summaries -#' head(res$data) -#' } -#' -#' @importFrom dplyr mutate summarise case_when -#' @importFrom stringr str_detect -#' @importFrom magrittr %>% -#' @importFrom tibble tibble -#' @importFrom rlang .data -#' @export - - - -# make age mandatory arg. 
remove age_format, rename arguments -movepop <- function(initial_date, - desired_date, - male_pop, - female_pop, - male_mx, - female_mx, - asfr, - annual_net_migrants = 0, - age_groups = NULL, - age_format = "auto") { - - # age format - age_format <- match.arg(age_format, c("single_year", "five_year", "auto")) - - lengths_vec <- c( - length(male_pop), - length(female_pop), - length(male_mx), - length(female_mx) - ) - - if(length(unique(lengths_vec)) != 1) { - stop("All population and mortality vectors must have the same length (ages)") - } - - # collect - data <- tibble( - male_pop = male_pop, - female_pop = female_pop, - male_mx = male_mx, - female_mx = female_mx - ) - - n_ages <- lengths_vec[1] - - # Auto-detect age format - # stop if age format is not single or five year - if(age_format == "auto") { - - age_format <- case_when( - n_ages == 18 ~ "five_year", - n_ages %in% c(81, 86) | n_ages >= 101 ~ "single_year", - TRUE ~ NA_character_ - ) - - } - - if(is.na(age_format)) { - - stop("Cannot auto-detect age format") - - } - - # Default age groups - if(is.null(age_groups)) { - - # for 5-year age groups - if(age_format == "five_year") { - - age_groups <- c( - "0-1", "1-4", "5-9", "10-14", "15-19", "20-24", "25-29", - "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", - "65-69", "70-74", "75-79", "80+") - } else { - - # for single age groups - - age_groups <- c(as.character(0:(n_ages - 2)), paste0(n_ages - 1, "+")) - - if(!length(asfr) %in% c(35, 40, 45)) { - - stop("ASFR must have 35, 40, or 45 values for single-year format") - - } - } - } - - # asfr join - vec <- vector("numeric", length = nrow(data)) - ind <- which(str_detect(age_groups, "15")) - vec[ind:(ind + length(asfr) - 1)] <- asfr - - data <- data %>% - mutate(age = age_groups, - asfr = vec, - births = asfr * female_pop, - deaths = male_mx * male_pop + female_mx * female_pop) - - summaries <- data %>% - summarise(total_births = sum(.data$births), - total_deaths = sum(.data$deaths), - 
total_pop_initial = sum(male_pop) + sum(female_pop), - crude_birth_rate = .data$total_births / .data$total_pop_initial, - crude_death_rate = .data$total_deaths / .data$total_pop_initial, - net_migration_rate = annual_net_migrants / .data$total_pop_initial, - growth_rate = .data$crude_birth_rate - .data$crude_death_rate + .data$net_migration_rate, - age_format = age_format, - tfr = ifelse(age_format == "five_year", 5 * sum(asfr), sum(asfr)), - desired_date = desired_date, - initial_date = initial_date, - time_diff = desired_date - initial_date, - total_pop_projected = .data$total_pop_initial * exp(.data$growth_rate * .data$time_diff), - adjustment_factor = .data$total_pop_projected / .data$total_pop_initial) - - - projected_data <- data %>% - mutate(male_pop_projected = summaries$adjustment_factor * male_pop, - female_pop_projected = summaries$adjustment_factor * female_pop, - both_sexes_projected = .data$male_pop_projected + .data$female_pop_projected, - sex_ratio = .data$male_pop_projected / .data$female_pop_projected - ) - - summaries_proj <- projected_data %>% - summarise(age_group = "All ages", - both_sexes = sum(.data$both_sexes_projected), - male = sum(.data$male_pop_projected), - female = sum(.data$female_pop_projected), - sex_ratio = sum(.data$male_pop_projected) / sum(.data$female_pop_projected)) - - - # Assemble results - results <- list( - initial_summaries = summaries, - projected_summaries = summaries_proj, - data = projected_data - ) - - return(results) -} diff --git a/R/odap.R b/R/odap.R index a76a785..a1561ed 100644 --- a/R/odap.R +++ b/R/odap.R @@ -15,17 +15,17 @@ #' @param country_code Numeric vector of country codes to filter by (default \code{356}). #' @param year Numeric vector of years to filter by (default \code{1971}). #' @param sex Character scalar indicating sex to filter by, e.g., \code{"M"} or \code{"F"} (default \code{"M"}). 
-#' @importFrom dplyr filter arrange across select mutate group_by reframe ungroup full_join group_nest +#' @param i18n Optional internationalization object for translating plot labels and titles (default \code{NULL}). +#' @importFrom dplyr filter arrange across select mutate group_by reframe ungroup full_join group_nest pick #' @importFrom tibble as_tibble #' @importFrom DemoTools lt_single_mx OPAG groupAges #' @importFrom tidyr pivot_longer #' @importFrom ggplot2 ggplot geom_point geom_line aes scale_color_manual labs theme_minimal theme element_text #' @importFrom tidyselect all_of any_of #' @importFrom purrr map map2 -#' @importFrom rlang .data %||% +#' @importFrom rlang .data %||% sym #' @importFrom magrittr %>% #' @importFrom utils installed.packages data globalVariables -#' @export #' #' @details #' The function: @@ -46,10 +46,7 @@ #' \item{\code{figures}}{A named list of \code{ggplot2} objects showing original vs. redistributed populations.} #' } #' -#' @seealso \code{\link[DemoTools]{OPAG}}, \code{\link{link[DemoTools]{lt_single_mx}}, \code{\link{\link[DemoTools]{groupAges}} -#' #' @examples -#' \dontrun{ #' library(dplyr) #' #' data_in <- tibble( @@ -68,7 +65,9 @@ #' #' # Plot original vs redistributed population #' print(res$figures$India) -#' } +#' @export +#' + odap_opag <- function(data_in = NULL, Age_fit = c(60, 70), AgeInt_fit = c(10, 10), @@ -76,22 +75,23 @@ odap_opag <- function(data_in = NULL, OAnew = 100, method = c("mono", "pclm", "uniform"), nLx = NULL, - # we want for user to be able to choose country + # we want for user to be able to choose country # by name AND/OR code name = "India", country_code = 356, # Here we indicate the sex and year to choose from # latest wpp if needed year = 1971, - sex = "M" + sex = "M", + i18n = NULL ) { # chosen method method <- match.arg(method, c("mono", "pclm", "uniform")) - - - # Helper: conditional filtering for user defined variables - # e.g. 
if sex exits and provided by user we use it in filtering + + + # conditional filtering for user defined variables + # e.g. if sex exists and is provided by the user we use it in filtering conditional_filter <- function(df, col, values) { if(!is.null(values) && col %in% names(df)) { @@ -120,17 +120,23 @@ odap_opag <- function(data_in = NULL, # introducing uniform names # adding group columns and id if missing # ---------------------------------------------------------------------------- # - + + data_in <- as_tibble(data_in) names(data_in) <- tolower(names(data_in)) - + + if(!"sex" %in% names(data_in)) data_in$sex <- sex if(!"country_code" %in% names(data_in)) data_in$country_code <- country_code %||% 1 if(!"name" %in% names(data_in)) data_in$name <- name %||% "country" + if(!"year" %in% names(data_in)) data_in$year <- year %||% 2020 + if(!".id" %in% names(data_in)) { - - data_in <- create_groupid(data_in, keys = c("name", "sex", "year", "country_code")) - + + # Only use keys that exist in the data + available_keys <- intersect(c("name", "sex", "year", "country_code"), names(data_in)) + data_in <- create_groupid(data_in, keys = available_keys) + } # ---------------------------------------------------------------------------- # @@ -144,20 +150,25 @@ odap_opag <- function(data_in = NULL, # check if nLx column is in the names of data_in - if("nLx" %in% names(data_in)) { - - nLx <- data_in %>% + # Note: column names were converted to lowercase at line 131, so check for "nlx" + if("nlx" %in% names(data_in)) { + + nLx <- data_in %>% conditional_filter("country_code", country_code) %>% conditional_filter("name", name) %>% conditional_filter("year", year) %>% - conditional_filter("sex", sex) %>% - select(any_of(c("name", "country_code", "sex", "year", "age", "nLx"))) - + conditional_filter("sex", sex) %>% + select(any_of(c("name", "country_code", "sex", "year", "age", "nlx"))) + # Rename nlx back to nLx 
for consistency with downstream code + if("nlx" %in% names(nLx)) { + names(nLx)[names(nLx) == "nlx"] <- "nLx" + } + } if(is.null(nLx)) { - + installed_wpp <- grep("^wpp\\d{4}$", rownames(installed.packages()), value = TRUE) if(length(installed_wpp) == 0) stop("No wpp package installed.") @@ -171,36 +182,38 @@ odap_opag <- function(data_in = NULL, } + # add env solution suppressPackageStartupMessages(library(latest_wpp, character.only = TRUE)) - data("mx1dt", package = latest_wpp) + + env <- new.env() + data("mx1dt", package = latest_wpp, envir = env) - nLx <- mx1dt %>% + nLx <- env$mx1dt %>% as_tibble() %>% - select(-.data$mxB) %>% - conditional_filter("country_code", .env$country_code) %>% - conditional_filter("name", .env$name) %>% - conditional_filter("year", .env$year) %>% + select(-c("mxB")) %>% + conditional_filter("country_code", country_code) %>% + conditional_filter("name", name) %>% + conditional_filter("year", year) %>% pivot_longer( - cols = c(.data$mxM, .data$mxF), - names_to = "sex", + cols = c("mxM", "mxF"), + names_to = "sex", values_to = "mx" ) %>% mutate(sex = substr(.data$sex, 3, 3)) %>% - conditional_filter("sex", .env$sex) + conditional_filter("sex", sex) - group_vars <- intersect(c("name","country_code","sex","year"), names(nLx)) - - nLx <- nLx %>% - group_by(across(all_of(intersect(c("name", "country_code", "sex", "year"), names(.))))) %>% + nLx <- nLx %>% + group_by(across(any_of(c("name", "country_code", "sex", "year")))) %>% reframe( - lt_single_mx(nMx = .data$mx, Age = .data$age), .groups = "drop") %>% - select(.data$name, - .data$country_code, - .data$sex, - .data$year, - age = .data$Age, - .data$AgeInt, - .data$nLx) + lt_single_mx(nMx = .data$mx, + Age = .data$age), .groups = "drop") %>% + select("name", + "country_code", + "sex", + "year", + "age" = "Age", + "AgeInt", + "nLx") # # nLx <- nLx %>% # as_tibble() %>% @@ -219,42 +232,44 @@ odap_opag <- function(data_in = NULL, # select("name", "country_code", "sex", "year", age = "Age", 
"AgeInt", "nLx") } else { - + nLx <- as_tibble(nLx) %>% conditional_filter("country_code", country_code) %>% conditional_filter("name", name) %>% conditional_filter("year", year) %>% - conditional_filter("sex", sex) %>% - select(any_of(c("name", "country_code", "sex", "year", "age", "nLx"))) - + conditional_filter("sex", sex) %>% + select(any_of(c("name", "country_code", "sex", "year", "age", "AgeInt", "nLx"))) + } # can be changed to is_single if needed age_diff <- diff(sort(unique(data_in$age))) unique_diff <- unique(na.omit(age_diff)) # NA removed in case of strange OAG coding - + if(length(unique_diff) == 1) { - + age_spacing <- unique_diff - + } else { - + stop("Mixed or irregular age spacing: ", paste(unique_diff, collapse = ", ")) - + } # --- Group nLx ages if needed --- # if(age_spacing %in% c(5, 10)) { - + + nLx <- nLx %>% - group_by(across(all_of(intersect(c("name", "country_code", "sex", "year"), names(.))))) %>% - reframe( - age = seq(0, max(age), by = age_spacing), - nLx = groupAges(.data$nLx, Age = .data$age, N = age_spacing), - .groups = "drop" - ) + group_by(across(any_of(c("name", "country_code", "sex", "year")))) %>% + reframe({ + dat <- pick(everything()) + age_vals <- seq(0, max(dat$age), by = age_spacing) + nLx_vals <- groupAges(Value = dat$nLx, Age = dat$age, N = age_spacing) + tibble(age = age_vals, nLx = nLx_vals) + }) + } - # ---------------------------------------------------------------------------- # # Part 3: Now we have user data and reference data and we can use the OPAG function @@ -264,9 +279,9 @@ odap_opag <- function(data_in = NULL, # match row by row with nLx. 
This makes it easier to work with data further data_in <- conditional_arrange(data_in, c("name", "country_code", "sex", "year", "age")) nLx <- conditional_arrange(nLx, c("name", "country_code", "sex", "year", "age")) - - nLx$.id_label <- data_in$.id_label - nLx$.id <- data_in$.id + + # Note: .id and .id_label will be added via full_join below + # No need to manually assign here since nLx may have different row count after filtering # result <- data_in %>% # full_join(nLx) %>% @@ -318,8 +333,23 @@ odap_opag <- function(data_in = NULL, # }) # ) - result <- data_in %>% - full_join(nLx) %>% + + # Pre-translate all UI text for plots (to avoid scope issues in map2) + title_text <- translate_text("Population Redistribution (OPAG Method)", i18n) + age_text <- translate_text("Age", i18n) + pop_text <- translate_text("Population", i18n) + original_text <- translate_text("Original", i18n) + redistributed_text <- translate_text("Redistributed", i18n) + type_text <- translate_text("Type", i18n) + + + # Join only on 'age' to attach nLx values + # Don't join on sex/name/year/country_code because: + # 1. Those are either grouping variables (user's data structure) + # 2. 
Or WPP metadata (which mortality table was used) + # The user selected ONE mortality table to apply to ALL groups + result <- data_in %>% + full_join(nLx, by = "age") %>% group_nest(.data$.id, .data$.id_label) %>% mutate( results = map(.data$data, ~ { @@ -327,28 +357,49 @@ odap_opag <- function(data_in = NULL, Pop_vals <- .x$pop nLx_vals <- .x$nLx OPAG( - Pop = Pop_vals, - Age_Pop = Age_vals, - nLx = nLx_vals, - Age_nLx = Age_vals, - Age_fit = .env$Age_fit, - AgeInt_fit = .env$AgeInt_fit, - Redistribute_from = .env$Redistribute_from, - OAnew = .env$OAnew, - method = .env$method + Pop = Pop_vals, + Age_Pop = Age_vals, + nLx = nLx_vals, + Age_nLx = Age_vals, + Age_fit = Age_fit, + AgeInt_fit = AgeInt_fit, + Redistribute_from = Redistribute_from, + OAnew = OAnew, + method = method ) }), - plots = map2(.data$data, .data$results, ~ { - old <- tibble(pop = .x$pop, age = .x$age) %>% - filter(.data$age > .env$Redistribute_from) - new <- tibble(pop = .y$Pop_out, age = .y$Age_out) %>% - filter(.data$age > .env$Redistribute_from) - + plots = map2(.data$data, .data$results, function(.x, .y) { + old <- tibble(pop = .x$pop, + age = .x$age) %>% + filter(.data$age > Redistribute_from) + new <- tibble(pop = .y$Pop_out, + age = .y$Age_out) %>% + filter(.data$age > Redistribute_from) + + # Note: translated text variables (title_text, age_text, etc.) 
+ # are captured from parent scope where i18n is available + + + # Rename columns to translated versions for hover tooltips (following lifetable pattern) + names(old)[names(old) == "age"] <- age_text + names(old)[names(old) == "pop"] <- pop_text + old[[type_text]] <- original_text + + names(new)[names(new) == "age"] <- age_text + names(new)[names(new) == "pop"] <- pop_text + new[[type_text]] <- redistributed_text + + + # Create named vector for colors using translated keys + color_values <- c("black", "red") + names(color_values) <- c(original_text, redistributed_text) + + # Build plot using .data[[]] for runtime evaluation (NOT !!sym()) ggplot() + - geom_point(data = old, aes(x = .data$age, y = .data$pop, color = "Old"), size = 2) + - geom_line(data = new, aes(x = .data$age, y = .data$pop, color = "New"), linewidth = 1) + - scale_color_manual(name = "Population", values = c("Old" = "black", "New" = "red")) + - labs(x = "Age", y = "Population") + + geom_point(data = old, aes(x = .data[[age_text]], y = .data[[pop_text]], color = .data[[type_text]]), size = 2) + + geom_line( data = new, aes(x = .data[[age_text]], y = .data[[pop_text]], color = .data[[type_text]]), linewidth = 1) + + scale_color_manual(name = type_text, values = color_values) + + labs(x = age_text, y = pop_text, title = title_text) + theme_minimal(base_size = 14) + theme( legend.position = "bottom", diff --git a/R/plots.R b/R/plots.R index a451b83..5a9e41d 100644 --- a/R/plots.R +++ b/R/plots.R @@ -474,9 +474,10 @@ abs_and_comma <- function(x, ...) 
{ pyramid <- function(data, y) { data |> - filter(.data$Sex %in% c("Male", "Female")) |> - mutate(Sex = factor(.data$Sex, levels = c("Male", "Female"))) |> - mutate(!!y := ifelse(.data$Sex == "Male", -(!!sym(y)), !!sym(y))) |> # done + mutate(Sex = tolower(.data$Sex)) |> + filter(.data$Sex %in% c("male", "female")) |> + mutate(Sex = factor(.data$Sex, levels = c("male", "female"))) |> + mutate(!!y := ifelse(.data$Sex == "male", -(!!sym(y)), !!sym(y))) |> # done ggplot(aes(x = .data$Age, y = (.data[[y]] / .data$AgeInt), fill = .data$Sex, width = .data$AgeInt)) + geom_bar(stat = "identity") + @@ -926,7 +927,8 @@ plot_initial_data <- function(data, i18n = NULL) { if (length(sexes) > 1) { data <- data |> - filter(.data$Sex %in% c("Male", "Female")) + mutate(Sex = tolower(.data$Sex)) |> + filter(.data$Sex %in% c("male", "female")) warning("Currently plots only handle single-sex data") diff --git a/R/point5.R b/R/point5.R deleted file mode 100644 index 77c1ed9..0000000 --- a/R/point5.R +++ /dev/null @@ -1,312 +0,0 @@ -#' ODAP–OPAG Mortality and Population Redistribution Analysis -#' -#' This function prepares population data and mortality life table data (`nLx`) to perform age redistribution using the OPAG (Old-Age Population Age Group) method. -#' It supports flexible input with country name/code, sex, and year filters, and handles both user-provided and WPP standard mortality data. -#' The function outputs adjusted population estimates along with diagnostic plots comparing original vs. redistributed populations. -#' -#' @param data_in A data frame or tibble of population counts with columns including \code{age}, \code{pop}, and optionally \code{name}, \code{sex}, \code{year}, and \code{country_code}. -#' @param Age_fit Numeric vector of two ages defining the age range for fitting redistribution (default \code{c(60, 70)}). -#' @param AgeInt_fit Numeric vector of two age interval widths corresponding to \code{Age_fit} (default \code{c(10, 10)}). 
-#' @param Redistribute_from Numeric scalar age threshold above which population redistribution occurs (default \code{80}). -#' @param OAnew Numeric scalar indicating the new open-age group upper bound (default \code{100}). -#' @param method Character scalar specifying the redistribution method; one of \code{"mono"}, \code{"pclm"}, or \code{"uniform"} (default \code{"mono"}). -#' @param nLx Optional mortality life table data frame containing columns \code{age}, \code{nLx}, and grouping columns. If \code{NULL}, mortality data is pulled from the latest installed WPP package. -#' @param name Character vector of country names to filter by (default \code{"India"}). -#' @param country_code Numeric vector of country codes to filter by (default \code{356}). -#' @param year Numeric vector of years to filter by (default \code{1971}). -#' @param sex Character scalar indicating sex to filter by, e.g., \code{"M"} or \code{"F"} (default \code{"M"}). -#' -#' @details -#' The function: -#' \itemize{ -#' \item Standardizes and validates population input data. -#' \item Automatically retrieves and filters WPP mortality data (\code{nLx}) if not supplied. -#' \item Checks age interval consistency and aggregates mortality data if necessary. -#' \item Applies the OPAG redistribution method per group (e.g. country, year, sex). -#' \item Generates diagnostic plots comparing original and redistributed population counts. -#' } -#' -#' The function uses internal helpers such as \code{conditional_filter()}, -#' \code{conditional_arrange()}, and \code{create_groupid()} to standardize and align data. -#' -#' @return A list with two elements: -#' \describe{ -#' \item{\code{data_out}}{A named list of OPAG redistribution results by group.} -#' \item{\code{figures}}{A named list of \code{ggplot2} objects showing original vs. 
redistributed populations.} -#' } -#' -#' @seealso \code{\link{OPAG}}, \code{\link{lt_single_mx}}, \code{\link{groupAges}} -#' -#' @examples -#' \dontrun{ -#' library(dplyr) -#' -#' data_in <- tibble( -#' age = seq(0, 100, by = 5), -#' pop = runif(21, 10000, 100000), -#' name = "India", -#' country_code = 356, -#' sex = "M", -#' year = 1971 -#' ) -#' -#' res <- odap_opag(data_in, Redistribute_from = 80) -#' -#' # View redistributed data -#' res$data_out$India -#' -#' # Plot original vs redistributed population -#' print(res$figures$India) -#' } -#' -#' @importFrom dplyr filter arrange across select mutate group_by reframe ungroup full_join group_nest -#' @importFrom tibble as_tibble -#' @importFrom DemoTools lt_single_mx OPAG groupAges -#' @importFrom tidyr pivot_longer -#' @importFrom ggplot2 ggplot geom_point geom_line aes scale_color_manual labs theme_minimal theme element_text -#' @importFrom tidyselect all_of any_of -#' @importFrom purrr map map2 -#' @importFrom rlang .data %||% -#' @importFrom magrittr %>% -#' @export - -# !!!!!!!!!! 
# Working name. Outstanding TODOs from review:
# - warn when only pre-2022 wpp packages are installed (no single ages there)
# - keep worked examples / ODAP flow control in a separate, build-ignored script
# - consider checking for an nLx column in data_in earlier
odap_opag <- function(data_in = NULL,
                      Age_fit = c(60, 70),
                      AgeInt_fit = c(10, 10),
                      # highest age at least 10 years below the max and divisible by 10
                      Redistribute_from = 80,
                      OAnew = 100,
                      method = c("mono", "pclm", "uniform"),
                      nLx = NULL,
                      # user may select the country by name AND/OR code
                      name = "India",
                      country_code = 356,
                      # sex and year picked from the latest wpp when needed
                      year = 1971,
                      sex = "M") {

  # resolve the redistribution method ("mono" by default)
  method <- match.arg(method)

  # Helper: filter on `col` only when the user supplied values AND the
  # column exists in `df`; otherwise pass the data through untouched.
  conditional_filter <- function(df, col, values) {
    if (!is.null(values) && col %in% names(df)) {
      df %>% filter(.data[[col]] %in% values)
    } else {
      df
    }
  }

  # Helper: arrange by whichever of `cols` actually exist in `df`.
  conditional_arrange <- function(df, cols) {
    cols_exist <- intersect(cols, names(df))
    if (length(cols_exist) > 0) {
      df %>% arrange(across(all_of(cols_exist)))
    } else {
      df
    }
  }

  # ---------------------------------------------------------------------- #
  # Part 1: standardize the input data:
  #   uniform (lower-case) names, group columns and .id when missing
  # ---------------------------------------------------------------------- #
  data_in <- as_tibble(data_in)
  names(data_in) <- tolower(names(data_in))

  if (!"sex" %in% names(data_in)) data_in$sex <- sex
  if (!"country_code" %in% names(data_in)) data_in$country_code <- country_code %||% 1
  if (!"name" %in% names(data_in)) data_in$name <- name %||% "country"
  if (!".id" %in% names(data_in)) {
    data_in <- create_groupid(data_in, keys = c("name", "sex", "year", "country_code"))
  }

  # ---------------------------------------------------------------------- #
  # Part 2: reference mortality (nLx).
  #   priority: nLx column in data_in > user-supplied nLx argument >
  #   single-age mx from the latest installed wpp package.
  #   nLx is afterwards grouped to the age spacing of data_in when needed.
  # ---------------------------------------------------------------------- #
  if ("nLx" %in% names(data_in)) {
    nLx <- data_in %>%
      conditional_filter("country_code", country_code) %>%
      conditional_filter("name", name) %>%
      conditional_filter("year", year) %>%
      conditional_filter("sex", sex) %>%
      select(any_of(c("name", "country_code", "sex", "year", "age", "nLx")))
  }

  if (is.null(nLx)) {

    installed_wpp <- grep("^wpp\\d{4}$", rownames(installed.packages()), value = TRUE)

    if (length(installed_wpp) == 0) stop("No wpp package installed.")

    latest_wpp <- sort(installed_wpp, decreasing = TRUE)[1]

    # BUG FIX: the version guard previously parsed the hard-coded string
    # "wpp2024" (always >= 2022), so the warning could never fire; test the
    # actually detected latest version instead.
    if (parse_number(latest_wpp) < 2022) {
      warning("No single ages are available in wpp versions earlier than wpp2022.
              Consider updating the wpp package or change to five year solution.")
    }

    # Load mx1dt into this function's environment instead of attaching the
    # wpp package (avoids library() side effects and global-env pollution).
    mx1dt <- NULL
    data("mx1dt", package = latest_wpp, envir = environment())

    nLx <- mx1dt %>%
      as_tibble() %>%
      select(-mxB) %>%
      conditional_filter("country_code", country_code) %>%
      conditional_filter("name", name) %>%
      conditional_filter("year", year) %>%
      pivot_longer(c(mxM, mxF),
                   names_to = "sex",
                   values_to = "mx") %>%
      mutate(sex = substr(sex, 3, 3)) %>%
      conditional_filter("sex", sex) %>%
      group_by(across(all_of(intersect(c("name", "country_code", "sex", "year"), names(.))))) %>%
      reframe(lt_single_mx(nMx = mx, Age = age)) %>%
      ungroup() %>%
      select(name:year, age = Age, AgeInt, nLx)

  } else {

    nLx <- as_tibble(nLx) %>%
      conditional_filter("country_code", country_code) %>%
      conditional_filter("name", name) %>%
      conditional_filter("year", year) %>%
      conditional_filter("sex", sex) %>%
      select(any_of(c("name", "country_code", "sex", "year", "age", "nLx")))

  }

  # Age spacing of the input (NAs dropped in case of odd OAG coding).
  # Could be replaced by is_single() if needed.
  age_diff <- diff(sort(unique(data_in$age)))
  unique_diff <- unique(na.omit(age_diff))

  if (length(unique_diff) == 1) {
    age_spacing <- unique_diff
  } else {
    stop("Mixed or irregular age spacing: ", paste(unique_diff, collapse = ", "))
  }

  # --- Group nLx ages if needed --- #
  if (age_spacing %in% c(5, 10)) {

    nLx <- nLx %>%
      group_by(across(all_of(intersect(c("name", "country_code", "sex", "year"), names(.))))) %>%
      # BUG FIX: reframe() has no `.groups` argument (it always returns an
      # ungrouped tibble); the old `.groups = "drop"` silently created a
      # literal `.groups` column in the output.
      reframe(
        age = seq(0, max(age), by = age_spacing),
        nLx = groupAges(nLx, Age = age, N = age_spacing)
      )
  }

  # ---------------------------------------------------------------------- #
  # Part 3: run OPAG per group and build comparison figures
  # ---------------------------------------------------------------------- #

  # Arrange both datasets identically so data_in rows align with nLx rows.
  data_in <- conditional_arrange(data_in, c("name", "country_code", "sex", "year", "age"))
  nLx <- conditional_arrange(nLx, c("name", "country_code", "sex", "year", "age"))

  # NOTE(review): this assumes data_in and nLx have the same number of rows
  # after arranging -- TODO confirm / add an explicit check.
  nLx$.id_label <- data_in$.id_label
  nLx$.id <- data_in$.id

  result <- data_in %>%
    full_join(nLx) %>%
    group_nest(.id, .id_label) %>%
    mutate(
      results = map(data, ~ {
        Age_vals <- unique(.x$age)
        Pop_vals <- .x$pop
        nLx_vals <- .x$nLx

        OPAG(
          Pop = Pop_vals,
          Age_Pop = Age_vals,
          nLx = nLx_vals,
          Age_nLx = Age_vals,
          Age_fit = Age_fit,
          AgeInt_fit = AgeInt_fit,
          Redistribute_from = Redistribute_from,
          OAnew = OAnew,
          method = method
        )
      }),

      plots = map2(data, results, ~ {
        old <- tibble(pop = .x$pop, age = .x$age) %>%
          filter(age > Redistribute_from)

        new <- tibble(pop = .y$Pop_out, age = .y$Age_out) %>%
          filter(age > Redistribute_from)

        # figure comparing redistributed counts against the original
        ggplot() +
          geom_point(data = old, aes(x = age, y = pop, color = "Old"), size = 2) +
          geom_line( data = new, aes(x = age, y = pop, color = "New"), linewidth = 1) +
          scale_color_manual(name = "Population",
                             values = c("Old" = "black",
                                        "New" = "red")) +
          labs(
            x = "Age",
            y = "Population"
          ) +
          theme_minimal(base_size = 14) +
          theme(
            legend.position = "bottom",
            plot.title = element_text(face = "bold", hjust = 0.5)
          )
      })
    )

  results <- result$results
  names(results) <- result$.id_label
  figures <- result$plots
  names(figures) <- names(results)

  list(data_out = results,
       figures = figures)
}
**single-year ages**. The function reconstructs -#' births from observed female populations of reproductive age, age-specific -#' fertility rates (ASFR), survival ratios, and the sex ratio at birth (SRB). -#' Births are then distributed by sex and survived to ages 0–9, replacing -#' reported counts in those ages. -#' -#' @details -#' This function is analogous to [basepop_five()], but works with **single-year** -#' age groups instead of 5-year age groups. Female exposures aged 15–49 are -#' reverse-survived one year to estimate births across three anchor dates -#' (`refDate - c(0.5, 2.5, 7.5)`). Births are then split by sex using the SRB -#' and survived to obtain reconstructed populations for ages 0–9. Older ages -#' remain unchanged. -#' -#' @param location Optional. Country or region name/ISO code. Used if inputs are -#' to be automatically downloaded from WPP. -#' @param refDate Numeric or `Date`. Reference date of the population count (e.g., -#' census year and month, converted internally to decimal date). -#' @param Age Integer vector. Lower bounds of single-year ages (if `NULL`, inferred -#' from input names or length of `Females_single`). -#' @param Females_single Numeric vector. Female population counts by single year -#' of age at `refDate`. -#' @param Males_single Optional numeric vector. Male population counts by single year -#' of age at `refDate`. -#' @param nLxFemale,nLxMale Life table person-years lived by age and sex. Can be -#' provided directly or downloaded if `location` is supplied. -#' @param nLxDatesIn Numeric vector of input dates corresponding to `nLxFemale` -#' and `nLxMale`. Defaults to `refDate - c(0.5, 9.5)`. -#' @param AsfrMat Matrix or data.frame of age-specific fertility rates (ASFR), -#' with ages 15–49 in rows and dates in columns. -#' @param AsfrDatesIn Numeric vector of input dates for `AsfrMat`. Defaults to -#' `refDate - c(0.5, 9.5)`. -#' @param SRB Optional numeric vector of sex ratios at birth (males per female). 
-#' If `NULL`, values are downloaded when `location` is provided. -#' @param SRBDatesIn Numeric vector of input dates for `SRB`. Defaults to the -#' three anchor dates `refDate - c(0.5, 2.5, 7.5)`. -#' @param radix Life table radix (default = 1). If `NULL`, inferred from `nLx`. -#' @param verbose Logical (default = `TRUE`). Print assumptions and progress. -#' @param ... Further arguments passed to [interp()]. -#' -#' @return A list with components: -#' \describe{ -#' \item{Females_adjusted}{Numeric vector of reconstructed female counts (ages 0–9 adjusted).} -#' \item{Males_adjusted}{Numeric vector of reconstructed male counts (ages 0–9 adjusted).} -#' \item{Females_single}{Original female counts (input).} -#' \item{Males_single}{Original male counts (input).} -#' \item{nLxf}{Interpolated female life table values.} -#' \item{nLxm}{Interpolated male life table values.} -#' \item{Asfr}{Interpolated ASFR by age and date.} -#' \item{Exposure_female}{Estimated female exposures aged 15–49 at anchor dates.} -#' \item{Bt}{Estimated total births at each anchor date.} -#' \item{SRB}{Sex ratio at birth used in the reconstruction.} -#' \item{Age}{Age vector used.} -#' } -#' -#' @seealso [basepop_five()] -#' -#' @examples -#' \dontrun{ -#' pop <- basepop_single( -#' location = "Sweden", -#' refDate = 2000, -#' Females_single = SwedenFemales, -#' Males_single = SwedenMales -#' ) -#' } -#' -#' @export - - - -# example setup to run the function -method = "linear" -refDate <- 1986 -location <- "Brazil" -pop_female_single <- fertestr::FetchPopWpp2019(location, - refDate, - ages = 0:100, - sex = "female") - -Age <- pop_female_single$ages - -Males_single <- pop_female_single$pop -Females_single <- fertestr::FetchPopWpp2019(location, - refDate, - ages = 0:100, - sex = "male")$pop -nLxFemale = NULL -nLxMale = NULL -nLxDatesIn = NULL -AsfrMat = NULL -AsfrDatesIn = NULL -SRB = NULL -SRBDatesIn = NULL -radix = 100000 -verbose = TRUE - -# !!!!!!!!!!!!!!!! 
NOTE: Two versions of the function -# Currenly the second one that is provided below is used -# !!!!!!!!!!!!!!!! IMPORTANT: I have not checked this function for robust work -# just with minimal examples -# If you like the implementation, I will make the final version in 1-2 days - - -# basepop_single <- function(location = NULL, -# refDate, -# Age = NULL, -# Females_single, -# Males_single = NULL, -# nLxFemale = NULL, -# nLxMale = NULL, -# nLxDatesIn = NULL, -# AsfrMat = NULL, -# AsfrDatesIn = NULL, -# ..., -# SRB = NULL, -# SRBDatesIn = NULL, -# radix = 100000, -# verbose = TRUE) { -# -# options(basepop_verbose = verbose) -# on.exit(options(basepop_verbose = NULL)) -# refDate <- dec.date(refDate) -# -# # 1) Age setup -# if (!is.null(Age)) { -# -# stopifnot(is_single(Age)) # user must define this helper: check contiguous 0,1,2,... -# stopifnot(length(Age) == length(Females_single)) -# -# } else { -# -# if (!is.null(names(Females_single))) { -# Age <- as.integer(names(Females_single)) -# } else { -# if (verbose) -# cat("Assuming age groups are single years starting at 0\n") -# Age <- 0:(length(Females_single) - 1) -# } -# } -# -# -# # 2) Default input dates -# if (is.null(nLxDatesIn)) { -# -# nLxDatesIn <- refDate - c(0.5, 9.5) -# -# if (verbose) { -# cat( -# "Assuming the two prior dates for the nLx matrix to be: ", -# paste0(nLxDatesIn, collapse = ", "), -# "\n" -# ) -# } -# } -# -# -# if (is.null(AsfrDatesIn)) { -# AsfrDatesIn <- refDate - c(0.5, 9.5) -# if (verbose) { -# cat( -# "Assuming the two prior dates for the Asfr matrix to be: ", -# paste0(AsfrDatesIn, collapse = ", "), -# "\n" -# ) -# } -# } -# -# # 3) Assign names -# names(Females_single) <- Age -# names(Males_single) <- Age -# -# # 4) Download or prepare nLx, ASFR, SRB -# nLxFemale <- download_Lx( -# nLx = nLxFemale, -# location = location, -# gender = "female", -# nLxDatesIn = nLxDatesIn -# ) -# -# nLxMale <- download_Lx( -# nLx = nLxMale, -# location = location, -# gender = "male", -# nLxDatesIn = 
nLxDatesIn -# ) -# -# if(is.null(radix)) { -# radix <- lt_infer_radix_from_1L0(nLxMale[1, 1]) -# -# if (verbose) -# cat("Setting radix to lx = ", -# radix, -# ". Can be overwritten with `radix` argument\n") -# -# } -# -# # downloadAsfr(Asfrmat = NULL, -# # location = location, -# # AsfrDatesIn = AsfrDatesIn) -# -# # check this maybe wrong -# # the only one place -# AsfrMat <- download_Asfr(AsfrMat, -# location = location, -# AsfrDatesIn = AsfrDatesIn) %>% -# filter(between(age, 15, 49)) -# -# # anchor dates: ages 0-9 -# DatesOut <- refDate - ((0:9) + 0.5) -# -# SRBDatesIn <- if(!is.null(SRBDatesIn)) { -# -# SRBDatesIn -# -# } else { -# -# DatesOut -# -# } -# -# SRB <- download_SRB(SRB, -# location, -# DatesOut = SRBDatesIn, -# verbose = verbose) -# -# # 5) Checks -# AllArgs <- as.list(environment()) -# -# # ArgsCheck(AllArgs) -# # -# lower_bound <- abs(min(nLxDatesIn) - min(DatesOut)) -# upper_bound <- abs(max(nLxDatesIn) - max(DatesOut)) -# -# if (lower_bound > 5 || upper_bound > 5) { -# stop("nLxDatesIn extrapolation > 5 years not allowed") -# } -# -# nLxf <- interp(nLxFemale[,-c(1:4)], -# datesIn = nLxDatesIn, -# datesOut = DatesOut, -# ...) # ... everywhere -# -# nLxm <- interp(nLxMale[,-c(1:4)], -# datesIn = nLxDatesIn, -# datesOut = DatesOut, -# ...) -# -# lower_bound <- abs(min(AsfrDatesIn) - min(DatesOut)) -# upper_bound <- abs(max(AsfrDatesIn) - max(DatesOut)) -# -# if (lower_bound > 5 || upper_bound > 5) { -# stop("AsfrDatesIn extrapolation > 5 years not allowed") -# } -# -# Asfr <- interp(AsfrMat[,-c(1:2, 5)], -# datesIn = AsfrDatesIn, -# datesOut = DatesOut, -# ...) 
-# -# # 6) Female exposures by reverse survival and interpolation -# ages_repro <- 15:49 -# Fcurrent <- Females_single[as.character(ages_repro)] -# -# # reverse survival one-year steps -# Ft_minus_1 <- Fcurrent -# -# for(a in (16:49)) { -# -# Ft_minus_1[as.character(a - 1)] <- Fcurrent[as.character(a)] * -# nLxf[as.character(a - 1), which(DatesOut == (refDate - 0.5))] / -# nLxf[as.character(a), which(DatesOut == (refDate - 0.5))] -# -# } -# -# # Not sure which version to use -# # exposures: interpolate between F(t) and F(t-1) -# # fExpos <- sapply(DatesOut, function(s) { -# # -# # weight <- (refDate - s) / 1 # always 0.5 at midpoints -# # -# # (1 - weight) * Fcurrent + weight * Ft_minus_1[names(Fcurrent)] -# # }) -# # colnames(fExpos) <- as.character(DatesOut) -# # -# -# # I like this weights more -# fExpos <- sapply(DatesOut, function(s) { -# weight <- (refDate - s) / (refDate - min(DatesOut)) # normalized to [0,1] -# (1 - weight) * Fcurrent + weight * Ft_minus_1[names(Fcurrent)] -# }) -# colnames(fExpos) <- as.character(DatesOut) -# -# -# # 7) Births at each anchor -# Bt <- colSums(fExpos * Asfr) -# -# -# # 8) Split by sex and compute adjusted counts -# PF <- 1 / (SRB + 1) -# Females_out <- Females_single -# Males_out <- Males_single -# -# -# for (x in 0:9) { -# -# d <- x + 1 # match age with DatesOut index -# Females_out[as.character(x)] <- Bt[d] * PF[d] * nLxf[x + 1, d] / radix -# Males_out[as.character(x)] <- Bt[d] * (1 - PF[d]) * nLxm[x + 1, d] / radix -# -# } -# -# -# age_groups <- list( -# "0" = 1, # Bt[1], PF[1], nLx[,1] -# "1:4" = 2, # Bt[2], PF[2], nLx[,2] -# "5:9" = 3 # Bt[3], PF[3], nLx[,3] -# ) -# -# for (group in names(age_groups)) { -# d <- age_groups[[group]] # anchor index -# ages <- eval(parse(text = group)) # expand e.g. 
# ------------------------------------------------------------------------- #
# Cohort-component base population reconstruction with single-year ages.
# Analogous to basepop_five(), but on a single-year age grid: female
# exposures 15-49 are reverse-survived one year, multiplied by interpolated
# ASFR to estimate births at three anchor dates, split by the SRB, and
# survived forward to replace reported counts at ages 0-9. Older ages are
# returned unchanged.
basepop_single <- function(location = NULL,
                           refDate,
                           Age = NULL,
                           Females_single = NULL,
                           Males_single = NULL,
                           nLxFemale = NULL,
                           nLxMale = NULL,
                           nLxDatesIn = NULL,
                           AsfrMat = NULL,
                           AsfrDatesIn = NULL,
                           ...,
                           SRB = NULL,
                           SRBDatesIn = NULL,
                           radix = 1,
                           verbose = TRUE) {

  # BUG FIX: restore whatever value basepop_verbose had before the call,
  # instead of unconditionally NULL-ing it on exit.
  old_opt <- options(basepop_verbose = verbose)
  on.exit(options(old_opt), add = TRUE)
  refDate <- dec.date(refDate)

  ## 1) Age setup: validate user ages or infer single years from names/length
  if (!is.null(Age)) {
    stopifnot(is_single(Age))
    stopifnot(length(Age) == length(Females_single))
  } else {
    if (!is.null(names(Females_single))) {
      Age <- as.integer(names(Females_single))
    } else {
      if (verbose)
        cat("Assuming age groups are single years starting at 0\n")
      Age <- 0:(length(Females_single) - 1)
    }
  }

  ## 2) Default input dates
  if (is.null(nLxDatesIn)) {
    nLxDatesIn <- refDate - c(0.5, 9.5)
    if (verbose) cat("Assuming nLx input dates: ", paste0(nLxDatesIn, collapse = ", "), "\n")
  }

  if (is.null(AsfrDatesIn)) {
    AsfrDatesIn <- refDate - c(0.5, 9.5)
    if (verbose) cat("Assuming ASFR input dates: ", paste0(AsfrDatesIn, collapse = ", "), "\n")
  }

  ## 3) Name the count vectors by age
  names(Females_single) <- Age
  if (!is.null(Males_single)) names(Males_single) <- Age

  ## 4) Download or prepare inputs
  nLxFemale <- download_Lx(nLxFemale, location, "female", nLxDatesIn)
  nLxMale <- download_Lx(nLxMale, location, "male", nLxDatesIn)

  # infer the radix from the life table only when explicitly requested (radix = NULL)
  if (is.null(radix)) {
    radix <- lt_infer_radix_from_1L0(nLxMale[1, 1])
    if (verbose) cat("Setting radix to lx = ", radix, "\n")
  }

  AsfrMat <- download_Asfr(AsfrMat, location, AsfrDatesIn)

  ## Anchor dates: 3 points, mirroring basepop_five()
  DatesOut <- refDate - c(0.5, 2.5, 7.5)

  if (is.null(SRBDatesIn)) SRBDatesIn <- DatesOut
  SRB <- download_SRB(SRB, location, SRBDatesIn, verbose)

  ## 5) Extrapolation guards
  if (abs(min(nLxDatesIn) - min(DatesOut)) > 5 ||
      abs(max(nLxDatesIn) - max(DatesOut)) > 5) {
    stop("nLxDatesIn extrapolation > 5 years not allowed")
  }

  # forward `...` to interp(), as documented in the roxygen header
  nLxf <- interp(nLxFemale[, -c(1:4)], datesIn = nLxDatesIn, datesOut = DatesOut, ...)
  nLxm <- interp(nLxMale[, -c(1:4)], datesIn = nLxDatesIn, datesOut = DatesOut, ...)

  if (abs(min(AsfrDatesIn) - min(DatesOut)) > 5 ||
      abs(max(AsfrDatesIn) - max(DatesOut)) > 5) {
    stop("AsfrDatesIn extrapolation > 5 years not allowed")
  }

  # keep reproductive ages only; drop the id columns before interpolation
  AsfrMat <- AsfrMat %>%
    filter(age %in% 15:49) %>%
    select(-c(country_code, name, age))

  # NOTE(review): open design questions carried over from the draft:
  # - births and female exposures could be taken from wpp directly rather
  #   than reconstructed from the user population;
  # - cohort Sx (product over cohort diagonals of year/age Sx) could be used
  #   to survive births forward to the census in a single loop;
  # - SRB is a named vector; verify positional indexing below stays aligned.

  Asfr <- interp(AsfrMat, datesIn = AsfrDatesIn, datesOut = DatesOut, ...)

  ## 6) Female exposures (reproductive ages 15-49)
  ages_repro <- 15:49
  Fcurrent <- Females_single[as.character(ages_repro)]

  # one-year reverse survival
  Ft_minus_1 <- Fcurrent
  for (a in 16:49) {
    Ft_minus_1[as.character(a - 1)] <- Fcurrent[as.character(a)] *
      nLxf[as.character(a - 1), 1] /
      nLxf[as.character(a), 1]
  }

  # exposures: linear interpolation between F(t) and F(t-1)
  # NOTE(review): weight = (refDate - s) exceeds 1 for the 2.5- and 7.5-year
  # anchors, so this extrapolates rather than interpolates -- TODO confirm.
  fExpos <- sapply(DatesOut, function(s) {
    weight <- (refDate - s) / 1
    (1 - weight) * Fcurrent + weight * Ft_minus_1[names(Fcurrent)]
  })
  colnames(fExpos) <- as.character(DatesOut)

  ## 7) Births at each anchor date
  Bt <- colSums(fExpos * Asfr)

  ## 8) Split births by sex and adjust ages 0-9
  PF <- 1 / (SRB + 1)  # proportion female at birth
  Females_out <- Females_single
  Males_out <- Males_single

  # BUG FIX: replaced eval(parse(text = ...)) with explicit age vectors;
  # element d of the list is the anchor index used for those ages
  # (0 -> anchor 1, 1:4 -> anchor 2, 5:9 -> anchor 3).
  anchor_ages <- list(0L, 1:4, 5:9)

  for (d in seq_along(anchor_ages)) {
    for (x in anchor_ages[[d]]) {
      Females_out[as.character(x)] <- Bt[d] * PF[d] * nLxf[x + 1, d] / radix
      Males_out[as.character(x)] <- Bt[d] * (1 - PF[d]) * nLxm[x + 1, d] / radix
    }
  }

  ## 9) Return
  list(
    Females_adjusted = Females_out,
    Males_adjusted = Males_out,
    Females_single = Females_single,
    Males_single = Males_single,
    nLxf = nLxf,
    nLxm = nLxm,
    Asfr = Asfr,
    Exposure_female = fExpos,
    Bt = Bt,
    SRB = SRB,
    Age = Age
  )
}
# Remake of DemoTools::downloadnLx(): detects the installed wpp version and
# supports single-year or 5-year age output. Returns the user-supplied nLx
# unchanged when one is given.
#
# @param nLx optional user-supplied nLx table; returned as-is when non-NULL.
# @param location country name or numeric country code (required otherwise).
# @param gender "both", "male" or "female".
# @param nLxDatesIn numeric vector of dates to interpolate nLx to.
# @param method interpolation method passed to interp().
# @param output "5-year" (aggregated) or "single" age groups.
# @param ... further arguments passed to interp().
download_Lx <- function(nLx = NULL,
                        location = NULL,
                        gender = NULL,
                        nLxDatesIn = NULL,
                        method = "linear",
                        output = "5-year",
                        ...) {

  verbose <- getOption("basepop_verbose", TRUE)

  # user-supplied nLx wins: nothing to download
  if (!is.null(nLx)) {
    return(nLx)
  }

  # a location is mandatory for the download path
  if (is.null(location)) {
    stop("You need to provide a location to download the data for nLx")
  }

  # detect installed wpp versions and use the latest one
  installed_wpp <- grep("^wpp\\d{4}$", rownames(installed.packages()), value = TRUE)
  if (length(installed_wpp) == 0) {
    stop("No wpp package installed.")
  }
  latest_wpp <- sort(installed_wpp, decreasing = TRUE)[1]

  # BUG FIX: load mx1dt into this function's environment; data() defaults to
  # the global environment, which a package function must not modify.
  mx1dt <- NULL
  data("mx1dt", package = latest_wpp, envir = environment())

  # use warning() (not cat) when requested dates fall outside the wpp range
  if (any(nLxDatesIn < 1950, nLxDatesIn > (parse_number(latest_wpp) + 1))) {
    warning("Careful, choosing beyond range 1950-", parse_number(latest_wpp),
            call. = FALSE)
  }

  # resolve the location code; a misspelled name yields NA
  if (is.numeric(location)) {
    location_code <- location
  } else {
    location_code <- mx1dt %>%
      subset(name %in% location, select = country_code) %>%
      unique() %>%
      as.numeric()
  }

  if (verbose) {
    cat(
      paste0(
        "Downloading nLx data for ", location,
        ", years ", paste(nLxDatesIn, collapse = ", "),
        ", gender ", gender
      ),
      sep = "\n"
    )
  }

  # map the gender string to the single-letter code used in mx1dt columns
  # (lookup in a named vector replaces the nested scalar ifelse chain)
  sex_code <- unname(c(both = "b", female = "f", male = "m")[tolower(gender)])

  stopifnot(`Invalid sex name, please set it to 'both', 'male' or 'female'` = !is.na(sex_code))

  # interpolate mx to the requested dates, then build nLx via lt_single_mx()
  out <- mx1dt %>%
    as_tibble() %>%
    filter(country_code %in% location_code,
           year < parse_number(latest_wpp) + 1) %>%
    pivot_longer(-c(country_code, name, year, age),
                 names_to = "sex",
                 values_to = "mx") %>%
    mutate(sex = str_remove(sex, "mx"), sex = tolower(sex)) %>%
    subset(sex %in% sex_code) %>%
    pivot_wider(names_from = year, values_from = mx) %>%
    select(-age) %>%
    group_nest(country_code, name, sex) %>%
    # interpolate to the requested dates
    mutate(data = map(
      data,
      ~ interp(
        .x,
        as.numeric(names(.x)),
        as.numeric(nLxDatesIn),
        extrap = TRUE,
        method = method,
        ...
      ) %>%
        as_tibble()
    )) %>%
    unnest(data) %>%
    pivot_longer(-c(country_code, name, sex),
                 names_to = "year",
                 values_to = "mx") %>%
    group_nest(country_code, name, sex, year) %>%
    # calculate the lifetable per country/sex/year
    mutate(data = map(data, ~ lt_single_mx(nMx = .x$mx) %>%
                        select(age = Age, nLx))) %>%
    unnest(data) %>%
    # wide format: one column per year
    pivot_wider(names_from = year,
                values_from = nLx)

  # aggregate to 5-year age groups when requested
  if (output == "5-year") {
    out <- out %>%
      mutate(age = (age %/% 5) * 5) %>% # floor to nearest 5-year lower bound
      group_by(country_code, name, sex, age) %>%
      summarise(across(where(is.numeric), ~ sum(., na.rm = TRUE)), .groups = "drop")
  }

  out
}
# Remake of DemoTools::downloadAsfr(): builds age-specific fertility rates
# (asfr = pasfr/100 * tfr) from the latest installed wpp package,
# interpolated to the requested dates; optionally aggregated to 5-year age
# groups with nLx weighting. Returns the user-supplied matrix unchanged when
# one is given.
#
# @param Asfrmat optional user-supplied ASFR matrix; returned as-is when non-NULL.
# @param location country name or numeric country code (required otherwise).
# @param AsfrDatesIn numeric vector of dates to interpolate ASFR to.
# @param method interpolation method passed to interp().
# @param output "5-year" (nLx-weighted aggregation) or "single" age groups.
# @param ... further arguments passed to interp().
download_Asfr <- function(Asfrmat = NULL,
                          location = NULL,
                          AsfrDatesIn = NULL,
                          method = "linear",
                          output = "5-year",
                          ...) {

  verbose <- getOption("basepop_verbose", TRUE)

  # user-supplied ASFR wins: nothing to download
  if (!is.null(Asfrmat)) {
    return(Asfrmat)
  }

  if (is.null(location)) {
    stop("You need to provide a location to download the data for Asfrmat")
  }

  # detect installed wpp versions and use the latest one
  installed_wpp <- grep("^wpp\\d{4}$", rownames(installed.packages()), value = TRUE)
  if (length(installed_wpp) == 0) {
    stop("No wpp package installed.")
  }
  latest_wpp <- sort(installed_wpp, decreasing = TRUE)[1]

  # BUG FIX: load the datasets into this function's environment; data()
  # defaults to the global environment, which a package function must not
  # modify.
  percentASFR1dt <- NULL
  tfr1 <- NULL
  data("percentASFR1dt", package = latest_wpp, envir = environment())
  data("tfr1", package = latest_wpp, envir = environment())

  # use warning() (not cat) when requested dates fall outside the wpp range
  if (any(AsfrDatesIn < 1950, AsfrDatesIn > (parse_number(latest_wpp) + 1))) {
    warning("Careful, choosing beyond range 1950-", parse_number(latest_wpp),
            call. = FALSE)
  }

  # resolve the location code; a misspelled name yields NA
  if (is.numeric(location)) {
    location_code <- location
  } else {
    location_code <- percentASFR1dt %>%
      as_tibble() %>%
      subset(name %in% location, select = country_code) %>%
      unique() %>%
      as.numeric()
  }

  if (verbose) {
    cat(paste0(
      "Downloading ASFR data for ", location,
      ", years ", paste(AsfrDatesIn, collapse = ", ")
    ),
    sep = "\n")
  }

  # reproductive ages as stored in percentASFR1dt, reattached after unnest
  age <- unique(percentASFR1dt$age)

  tfr <- tfr1 %>%
    as_tibble() %>%
    pivot_longer(-c(country_code, name),
                 names_to = "year",
                 values_to = "tfr") %>%
    subset(country_code %in% location_code &
             year < parse_number(latest_wpp) + 1) %>%
    mutate(year = as.integer(year))

  out <- percentASFR1dt %>%
    as_tibble() %>%
    subset(country_code %in% location_code &
             year < parse_number(latest_wpp) + 1) %>%
    left_join(tfr) %>%
    # asfr = age share of TFR (pasfr is in percent)
    mutate(asfr = (pasfr / 100) * tfr) %>%
    select(-c(pasfr, tfr)) %>%
    # wide format: one column per year
    pivot_wider(names_from = year,
                values_from = asfr) %>%
    select(-age) %>%
    group_nest(country_code, name) %>%
    # interpolate to the requested dates
    mutate(data = map(
      data,
      ~ interp(
        .x,
        as.numeric(names(.x)),
        as.numeric(AsfrDatesIn),
        extrap = TRUE,
        method = method,
        ...
      ) %>%
        as_tibble()
    )) %>%
    unnest(data) %>%
    # NOTE(review): relies on row order matching the original age sequence
    # within each location -- TODO confirm.
    mutate(age = age)

  # aggregate to 5-year groups, weighting asfr by female nLx exposure
  if (output == "5-year") {

    nLx <- download_Lx(location = location,
                       gender = "female",
                       nLxDatesIn = AsfrDatesIn,
                       method = "linear",
                       output = "single") %>%
      select(-sex) %>%
      pivot_longer(-c(country_code, name, age),
                   names_to = "year",
                   values_to = "nLx")

    out <- out %>%
      pivot_longer(-c(country_code, name, age),
                   names_to = "year",
                   values_to = "asfr") %>%
      left_join(nLx) %>%
      filter(!is.na(asfr)) %>%
      mutate(age = (age %/% 5) * 5) %>%
      group_by(country_code, name, year, age) %>%
      summarise(
        asfr = sum(asfr * nLx) / sum(nLx),
        .groups = "drop"
      ) %>%
      pivot_wider(names_from = year,
                  values_from = asfr)
  }

  out
}
# Remake of DemoTools::downloadSRB(): returns a named numeric vector of sex
# ratios at birth (males per female), one element per requested date.
# Resolution order: user-supplied SRB (recycled over DatesOut) > wpp data for
# the location > default SRB (~1.046, implied by a 48.86% female birth share).
#
# @param SRB optional numeric SRB value(s); recycled over DatesOut when given.
# @param location country name or numeric country code.
# @param DatesOut numeric vector of (decimal) dates to return SRB for.
# @param verbose print progress messages?
download_SRB <- function(SRB = NULL,
                         location,
                         DatesOut,
                         verbose = TRUE) {

  # default SRB implied by a 48.86% share of female births
  SRB_default <- round((1 - 0.4886) / 0.4886, 3)

  # DatesOut must be non-empty
  if (length(DatesOut) < 1) {
    stop("DatesOut must contain at least one date.")
  }

  # user-supplied SRB wins: recycle it over the requested dates
  if (!is.null(SRB)) {
    SRB <- stats::setNames(rep(SRB, length.out = length(DatesOut)), DatesOut)
    return(SRB)
  }

  installed_wpp <- grep("^wpp\\d{4}$", rownames(installed.packages()), value = TRUE)
  if (length(installed_wpp) == 0) stop("No WPP package installed.")
  latest_wpp <- sort(installed_wpp, decreasing = TRUE)[1]

  # BUG FIX: load sexRatio1 into this function's environment; data() defaults
  # to the global environment, which a package function must not modify.
  sexRatio1 <- NULL
  data("sexRatio1", package = latest_wpp, envir = environment())

  # resolve the location code; a misspelled name yields NA
  if (is.numeric(location)) {
    location_code <- location
  } else {
    location_code <- sexRatio1 %>%
      as_tibble() %>%
      subset(name %in% location, select = country_code) %>%
      unique() %>%
      as.numeric()
  }

  # unknown location: fall back to the default SRB for every date
  if (is.na(location_code)) {
    if (verbose) cat(location, "not available in wpp. Using default SRB:", SRB_default, "\n")
    return(stats::setNames(rep(SRB_default, length(DatesOut)), DatesOut))
  }

  if (verbose) {
    cat(paste0("\nDownloading SRB for ", location,
               " for years ", round(min(DatesOut), 1), " to ", round(max(DatesOut), 1), "\n"))
  }

  dt <- sexRatio1 %>%
    as_tibble() %>%
    filter(country_code == location_code) %>%
    pivot_longer(-c(country_code, name), names_to = "year", values_to = "SRB") %>%
    mutate(year = as.numeric(year)) %>%
    filter(year %in% floor(DatesOut))

  years_srb <- dt$year
  SRB <- stats::setNames(dt$SRB, years_srb)

  # fill years missing from wpp with the mean of the available SRBs
  DatesOut_floor <- floor(DatesOut)
  yrs_present <- DatesOut_floor %in% years_srb
  if (any(!yrs_present)) {
    # BUG FIX: when NO requested year is available, mean() over zero values
    # is NaN and would silently poison every filled entry; fall back to the
    # default SRB in that case.
    mean_srb <- if (any(yrs_present)) {
      mean(SRB[as.character(DatesOut_floor[yrs_present])])
    } else {
      SRB_default
    }
    SRB <- c(SRB, stats::setNames(rep(mean_srb, sum(!yrs_present)), DatesOut_floor[!yrs_present]))
  }

  # order by year, then relabel with the exact requested dates
  SRB <- SRB[order(as.numeric(names(SRB)))]
  SRB <- stats::setNames(SRB, DatesOut)

  SRB
}
-#' @param method character. Smoothing method desired. options `"supsmu"` (default),`"lowess"`,`"loess"`,`"cubicspline"`,`"gam-tp"`,`"gam-ps"` -#' @param smoothing_par smoothing parameter, interpretation varies by method, but higher always means smoother.Default 1 -#' @param xout numeric vector of coordinates to predict for. Defaults to original unique coordinates. -#' @param xname name of variable containing x coordinate, default `x` -#' @param yname name of variable containing y coordinate, default `y` -#' @importFrom stats supsmu smooth.spline loess predict loess.control -#' @importFrom dplyr case_when +#' One–Dimensional Smoothing Wrapper for Age or Time Series +#' \code{smooth1d()} provides a unified interface for smoothing one–dimensional demographic or time–series data (e.g., age schedules or time trends). +#' The function supports multiple smoothing engines including \code{supsmu}, \code{lowess}, \code{loess}, cubic splines, and GAM-based methods. +#' Optional transformations (log, sqrt, logit) can be applied prior to smoothing. +#' The function automatically handles grouped data via a \code{.id} column. +#' @param data_in A data.frame or tibble containing the input data. +#' @param units Character string. One of: +#' \itemize{ +#' \item \code{"count"} +#' \item \code{"rate"} +#' \item \code{"proportion"} +#' } +#' @param X Character string. Name of the independent variable (typically \code{"Age"} or \code{"Time"}). +#' @param Y Character string. Name of the dependent variable. 
+#' @param scale Optional transformation applied before smoothing: +#' \itemize{ +#' \item \code{"log"} +#' \item \code{"sqrt"} +#' \item \code{"logit"} +#' \item \code{"none"} +#' } +#' @param method Smoothing engine: +#' \itemize{ +#' \item \code{"supsmu"} +#' \item \code{"lowess"} +#' \item \code{"loess"} +#' \item \code{"cubicspline"} +#' \item \code{"gam-tp"} (thin plate regression spline) +#' \item \code{"gam-ps"} (P-spline) +#' } +#' @param smoothing_par Numeric smoothing parameter. Interpretation depends on method. +#' @param xout Optional vector of evaluation points. Defaults to unique values of \code{X}. +#' @importFrom dplyr mutate group_split bind_rows case_when +#' @importFrom dplyr as_tibble +#' @importFrom purrr map +#' @importFrom rlang sym +#' @importFrom tibble tibble +#' @importFrom ggplot2 ggplot geom_point geom_line theme theme_bw +#' @importFrom ggplot2 element_text scale_x_continuous scale_y_continuous +#' @importFrom ggplot2 scale_color_brewer labs +#' @importFrom scales pretty_breaks +#' @importFrom stats supsmu loess smooth.spline predict qlogis +#' @importFrom mgcv gam s #' @importFrom signal interp1 -#' @importFrom mgcv gam +#' @importFrom magrittr %>% +#' @importFrom forcats as_factor +#' @return A named list with: +#' \itemize{ +#' \item \code{result} — Tibble containing smoothed values +#' \item \code{plot} — ggplot object comparing raw and smoothed values +#' } +#' +#' @details +#' If the input data does not contain a \code{.id} column, a default group \code{"all"} is created. +#' Weights default to 1 if not supplied. +#' Transformations are applied before smoothing but not automatically back-transformed. 
+#' +#' @examples +#' # Example 1: Age smoothing of mortality rates (log scale) +#' library(readr) +#' +#' fpath <- system.file( +#' "extdata", +#' "single_hmd_spain.csv.gz", +#' package = "ODAPbackend" +#' ) +#' +#' data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +#' data_in$nMx <- data_in$Deaths / data_in$Exposures +#' data_in <- data_in[data_in$.id == 1, ] +#' names(data_in) <- c(".id", "Time", "Age", "Sex", +#' "Deaths", "Exposures", "nMx") +#' +#' z <- smooth1d( +#' data_in = data_in, +#' units = "rate", +#' X = "Age", +#' Y = "nMx", +#' method = "supsmu", +#' scale = "log" +#' ) +#' +#' z$result +#' z$plot +#' +#' # Example 2: Time smoothing of counts +#' df <- data.frame( +#' Time = seq(1950, 2015, by = 5), +#' Deaths = c(403564.9, 426012.0, 466215.9, 523753.8, +#' 560874.1, 545608.3, 555335.9, 594584.6, +#' 608425.3, 638167.3, 655438.6, 689386.4, +#' 740519.0, 804439.3) +#' ) +#' +#' z <- smooth1d( +#' data_in = df, +#' units = "count", +#' X = "Time", +#' Y = "Deaths", +#' method = "supsmu", +#' scale = "none" +#' ) +#' +#' z$result +#' z$plot +#' #' @export #' -smooth1d_chunk <- function(data_in., - method = c("supsmu", "lowess", "loess", - "cubicspline", "gam-tp", "gam-ps")[1], +smooth1d <- function(data_in, + units = c("count", "rate", "proportion"), + X = c("Age", "Time"), + Y = c("Exposures"), + scale = c("log", "sqrt", "logit", "none"), + method = c("supsmu", "lowess", "loess", "cubicspline", "gam-tp", "gam-ps"), smoothing_par = 1, - xout = data_in.[["x"]], - xname = "x", - yname = "y"){ + xout = unique(data_in[[X]])) { - x <- data_in.[[xname]] - y <- data_in.[[yname]] - w <- data_in.[["weights"]] + units <- match.arg(units) + scale <- match.arg(scale) + X <- match.arg(X) - if (method == "supsmu"){ - smoothing_par <- case_when(smoothing_par < 0 ~ 0, - smoothing_par > 10 ~ 10, - TRUE ~ smoothing_par) - - fit <- supsmu(x = x, - y = y, - wt = w, - span = "cv", - bass = smoothing_par) - # 
---------------------------------------------------------------# - # TR: would be bettwe to follow this function with interpolate() # - # instead of fixing this option? # - # Then again maybe keep it cuz most methods have predict()? # - # ---------------------------------------------------------------# - pred <- interp1(fit$x, - fit$y, - xi = xout, - method = 'pchip', - extrap = TRUE) - data_out <- tibble(!!xname := xout, !!yname := pred) - } - if (method == "lowess"){ - fit <- loess(y ~ x, - weights = w, - span = smoothing_par, - degree = 1, - control = loess.control(surface = "direct")) - pred <- predict(fit, newdata = data.frame(x = xout)) - data_out <- tibble(!!xname := xout, !!yname := pred) + # Ensure '.id' column exists + if (!(".id" %in% names(data_in))) { + data_in$.id <- "all" } - if (method == "loess"){ - fit <- loess(y ~ x, - weights = w, - span = smoothing_par, - degree = 2, - control = loess.control(surface = "direct")) - pred <- predict(fit, newdata = data.frame(x = xout)) - data_out <- tibble(!!xname := xout, !!yname := pred) + if (!"weights" %in% colnames(data_in)){ + data_in <- data_in |> + mutate(weights = 1) } - if (method == "cubicspline") { - if (is.numeric(smoothing_par)){ - fit <- smooth.spline(x = x, - y = y, - w = w, - spar = smoothing_par) - } else { - fit <- smooth.spline(x = x, - y = y, - w = w, - cv = FALSE) - } - - pred <- predict(fit, x = data.frame(x = xout)) - data_out <- data.frame(pred) - colnames(data_out) <- c(xname, yname) - } + # here transformation occurs + # we give warning if it is strange - # covers gam-tp and gam-ps - if (grepl(method, pattern = "gam")) { - smoothing_par <- ifelse(is.numeric(smoothing_par), smoothing_par, 1) - lil_data <- tibble(x = x, y = y, w = w) - bs <- gsub(method, pattern = "gam-", replacement = "") - fit <- mgcv::gam(y ~ + s(x, m = smoothing_par), - bs = bs, - data = lil_data, - weights = w, - method = "REML", - select = TRUE, - family = "gaussian") - pred <- predict(fit, newdata = data.frame(x = 
xout)) - data_out <- tibble(!!xname := xout, !!yname := pred) + if(units == "count" && scale != "none") { + + warning("You are applying transformation to counts which is strange") + - return(data_out) + split_data <- data_in %>% + mutate( + {{ Y }} := { + + y <- !!sym(Y) + + switch(scale, + log = log(y), + sqrt = sqrt(y), + logit = qlogis(y), + none = y + ) + }, .by = ".id" + ) %>% + group_split(.data$`.id`) -} - -#' @title `smooth1d` -#' @description Smooth a univariate time series, optionally using `weights.` Choose between the super-smoother (`"supsmu"`) method, loess (`"lowess"` or `"loess"`) , smoothing splines (`"cubicsplines"`), thin-plate splines (`"gam-tp"`), or p-splines (`"gam-ps"`). Input data may have multiple observations per x coordinate. Output can be for arbitrary x-coordinates (`xout`). If grouping variables have been declared, then we use the same parameters for each subset. -#' @details `"supsmu"` method takes a `smoothing_par` between 0 and 10. `"lowess"` and -#' @param data_in `data.frame` with x and y coordinates to fit to. Optionally with `weights` -#' @param method character. Smoothing method desired. options `"supsmu"` (default),`"lowess"`,`"loess"`,`"cubicspline"`,`"gam-tp"`,`"gam-ps"` -#' @param smoothing_par smoothing parameter, interpretation varies by method, but higher always means smoother.Default 1 -#' @param xout numeric vector of coordinates to predict for. Defaults to original unique coordinates. 
-#' @param xname name of variable containing x coordinate, default `x` -#' @param yname name of variable containing y coordinate, default `y` -#' @importFrom dplyr mutate filter reframe bind_rows ungroup .data -#' @importFrom purrr map_lgl -#' @importFrom tidyselect all_of -#' @importFrom tidyr unnest -#' @importFrom rlang set_names .data -#' @export -#' @examples -#' library(tibble) -#' x = c(1950.77104722793, 1951.77104722793, 1952.77104722793, 1952.77104722793, -#' 1953.77104722793, 1954.77104722793, 1955.77104722793, 1956.77104722793, -#' 1957.77104722793, 1957.77104722793, 1958.77104722793, 1959.77104722793, -#' 1963.5, 1964.5, 1965.5, 1966.5, 1967.5, 1968.5, 1969.5, 1970.5, -#' 1970.5, 1971.5, 1971.5, 1972.5, 1972.5, 1973.5, 1973.5, 1974.5, -#' 1974.5, 1975.5, 1975.5, 1976.5, 1976.5, 1977.5, 1977.5, 1978.5, -#' 1978.5, 1979.5, 1979.5, 1980.5, 1980.5, 1981.5, 1981.5, 1982.5, -#' 1982.5, 1983.5, 1983.5, 1984.5, 1984.5, 1985.5, 1985.5, 1986.5, -#' 1986.5, 1987.5, 1987.5, 1988.5, 1988.5, 1989.5, 1989.5, 1990.5, -#' 1990.5, 1991.5, 1991.5, 1992.5, 1992.5, 1993.5, 1993.5, 1994.5, -#' 1994.5, 1995.5, 1995.5, 1996.5, 1996.5, 1997.5, 1997.5, 1998.5, -#' 1998.5, 1998.5, 1999.5, 1999.5, 1999.5, 2000.5, 2000.5, 2000.5, -#' 2001.5, 2001.5, 2001.5, 2001.5, 2002.5, 2002.5, 2002.5, 2003.5, -#' 2003.5, 2003.5, 2004.5, 2004.5, 2005.5, 2005.5, 2006.5, 2006.5, -#' 2007.5, 2007.5, 2008.5, 2008.5, 2009.5, 2009.5, 2010.5, 2010.5, -#' 2011.5, 2012.5, 2013.5, 2014.5, 2015.5, 2016.5, 2017.5) -#' -#' y = c(5.28312492370605, 5.4010272026062, 5.55507040023804, 5.52694797515869, -#' 5.65941572189331, 5.81246614456177, 5.95277643203735, 6.07998991012573, -#' 6.20043277740479, 6.23209381103516, 6.4145884513855, 6.44994592666626, -#' 5.77428722381592, 6.09462738037109, 6.31580305099487, 5.68929624557495, -#' 6.34508848190308, 5.67744398117065, 5.6477165222168, 5.12978315353394, -#' 4.83979654312134, 5.40941429138184, 4.93997049331665, 5.06586456298828, -#' 4.68799591064453, 4.31546640396118, 
4.18193292617798, 3.75800633430481, -#' 3.82137632369995, 3.17197370529175, 3.41054058074951, 3.08278703689575, -#' 3.04342699050903, 3.01940298080444, 2.9705445766449, 2.96306347846985, -#' 2.88018417358398, 2.72204685211182, 2.72689270973206, 3.43744516372681, -#' 2.88990616798401, 3.12944483757019, 3.18246674537659, 3.20358324050903, -#' 3.0853967666626, 3.38533687591553, 3.18455958366394, 3.14047956466675, -#' 3.08752226829529, 3.15941309928894, 3.09168982505798, 3.22912931442261, -#' 2.95333743095398, 3.05898070335388, 2.91993451118469, 3.29801154136658, -#' 2.93581032752991, 3.18667984008789, 2.92741537094116, 3.10476756095886, -#' 2.86983323097229, 3.31816387176514, 2.87090802192688, 3.57006907463074, -#' 3.20188307762146, 3.38520860671997, 3.1142041683197, 3.05472731590271, -#' 2.88588929176331, 2.85394668579102, 2.69696426391602, 2.71265292167664, -#' 2.49935364723206, 2.55814361572266, 2.37988924980164, 2.35105299949646, -#' 2.32249999046326, 2.21962690353394, 2.46929597854614, 2.43600010871887, -#' 2.6223669052124, 2.31005716323853, 2.33249998092651, 2.408358335495, -#' 2.39718627929688, 2.38599991798401, 3, 2.51987075805664, 2.1131649017334, -#' 2.13849997520447, 2.04657125473022, 2.06265759468079, 2.10050010681152, -#' 1.98469293117523, 2.09120869636536, 2.2427921295166, 1.97930490970612, -#' 2.15000796318054, 2.06816005706787, 2.18898129463196, 1.77942955493927, -#' 1.99617087841034, 1.89081299304962, 2.0644690990448, 1.85119438171387, -#' 2.03594541549683, 1.83351850509644, 2.02167296409607, 1.89562964439392, -#' 1.89205574989319, 1.85763072967529, 1.68259692192078, 1.69228148460388, -#' 1.56845271587372, 1.37076985836029) -#' data_in <- tibble(x=x,y=y,w=1) -#' xout <- 1950:2018 + .5 -#' \dontrun{ -#' plot(data_in$x, data_in$y) -#' lines(xout, smooth1d(data_in, method = "supsmu", -#' xout = xout, smoothing_par = .2)$y, -#' col = "red") -#' lines(xout, smooth1d(data_in, method = "loess", -#' xout = xout, smoothing_par = .2)$y, -#' col = "blue", lty = 
"82") -#' lines(xout, smooth1d(data_in, method = "gam-ps", xout = xout)$y, -#' col = "forestgreen" ,lty="82") -#' lines(xout, smooth1d(data_in, method = "cubicspline", -#' xout = xout, smoothing_par =1)$y, -#' col = "purple", lty="22") -#' } -#' - -smooth1d <- function(data_in, - method = c("supsmu", "lowess", "loess", - "cubicspline", "gam-tp", "gam-ps")[1], - smoothing_par = 1, - xout = data_in[["x"]] |> unique(), - xname = "x", - yname = "y"){ - if (!(".id" %in% colnames(data_in))) { - data_in <- data_in |> - mutate(.id = "all") - } - if (!"weights" %in% colnames(data_in)){ - data_in <- data_in |> - mutate(weights = 1) + for_one <- function(data) { + + x <- data[[X]] + y <- data[[Y]] + w <- data[["weights"]] + + if (method == "supsmu") { + + smoothing_par <- case_when(smoothing_par < 0 ~ 0, + smoothing_par > 10 ~ 10, + TRUE ~ smoothing_par) + + fit <- supsmu(x = x, + y = y, + wt = w, + span = "cv", + bass = smoothing_par) + # ---------------------------------------------------------------# + # TR: would be better to follow this function with interpolate() # + # instead of fixing this option? # + # Then again maybe keep it cuz most methods have predict()? 
# + # ---------------------------------------------------------------# + pred <- interp1(fit$x, + fit$y, + xi = xout, + method = 'pchip', + extrap = TRUE) + data_out <- tibble(!!X := xout, !!Y := pred) + } + + if (method %in% c("lowess", "loess")) { + + degree_val <- ifelse(method == "lowess", 1, 2) + + fit <- loess(y ~ x, + weights = w, + span = smoothing_par, + degree = degree_val, + control = loess.control(surface = "direct")) + pred <- predict(fit, newdata = data.frame(x = xout)) + data_out <- tibble(!!X := xout, !!Y := pred) + } + + if (method == "cubicspline") { + if (is.numeric(smoothing_par)){ + fit <- smooth.spline(x = x, + y = y, + w = w, + spar = smoothing_par) + } else { + fit <- smooth.spline(x = x, + y = y, + w = w, + cv = FALSE) + } + + pred <- predict(fit, x = data.frame(x = xout)) + data_out <- data.frame(pred) + names(data_out) <- c(X, Y) + data_out <- as_tibble(data_out) + } + + # covers gam-tp and gam-ps + if (grepl(method, pattern = "gam")) { + + smoothing_par <- ifelse(is.numeric(smoothing_par), smoothing_par, 1) + lil_data <- tibble(x = x, y = y, w = w) + bs <- gsub(method, pattern = "gam-", replacement = "") + fit <- gam(y ~ + s(x, m = smoothing_par), + bs = bs, + data = lil_data, + weights = w, + method = "REML", + select = TRUE, + family = "gaussian") + pred <- predict(fit, newdata = data.frame(x = xout)) + data_out <- tibble(!!X := xout, !!Y := pred) + } + + return(data_out) } - data_out <- data_in |> - ungroup() |> - reframe( - smooth1d_chunk(data_in. 
= .data, - method = method, - smoothing_par = smoothing_par, - xout = xout, - xname = xname, - yname = yname), - .by = all_of(c(".id")) - ) #|> - #set_names(c(".id", by_args, "data")) + res <- split_data %>% + map(~ for_one(.x)) %>% + bind_rows(.id = ".id") - # data_out <- data_out |> - # filter(map_lgl(data, is.data.frame))|> - # unnest(.data$data) - return(data_out) + compare <- split_data %>% + bind_rows(.id = ".id") -} - -#' @title `check_smooth` -#' @description Creates a plot of original data as points and smooth result as line for each `.id`. -#' @param data_out a data.frame or tibble. The data.frame output of the `interpolate` function. -#' @param data_in a data.frame or tibble. The original data provided by user -#' @return A figure of original data as points and smooth result as line for each `.id` -#' @importFrom dplyr mutate select summarise group_by -#' @importFrom scales pretty_breaks -#' @importFrom ggplot2 ggplot geom_point geom_line theme_minimal aes theme element_text scale_x_continuous scale_y_continuous -#' @export -#' @examples -#' library(tibble) -#' x = c(1950.77104722793, 1951.77104722793, 1952.77104722793, 1952.77104722793, -#' 1953.77104722793, 1954.77104722793, 1955.77104722793, 1956.77104722793, -#' 1957.77104722793, 1957.77104722793, 1958.77104722793, 1959.77104722793, -#' 1963.5, 1964.5, 1965.5, 1966.5, 1967.5, 1968.5, 1969.5, 1970.5, -#' 1970.5, 1971.5, 1971.5, 1972.5, 1972.5, 1973.5, 1973.5, 1974.5, -#' 1974.5, 1975.5, 1975.5, 1976.5, 1976.5, 1977.5, 1977.5, 1978.5, -#' 1978.5, 1979.5, 1979.5, 1980.5, 1980.5, 1981.5, 1981.5, 1982.5, -#' 1982.5, 1983.5, 1983.5, 1984.5, 1984.5, 1985.5, 1985.5, 1986.5, -#' 1986.5, 1987.5, 1987.5, 1988.5, 1988.5, 1989.5, 1989.5, 1990.5, -#' 1990.5, 1991.5, 1991.5, 1992.5, 1992.5, 1993.5, 1993.5, 1994.5, -#' 1994.5, 1995.5, 1995.5, 1996.5, 1996.5, 1997.5, 1997.5, 1998.5, -#' 1998.5, 1998.5, 1999.5, 1999.5, 1999.5, 2000.5, 2000.5, 2000.5, -#' 2001.5, 2001.5, 2001.5, 2001.5, 2002.5, 2002.5, 2002.5, 2003.5, 
-#' 2003.5, 2003.5, 2004.5, 2004.5, 2005.5, 2005.5, 2006.5, 2006.5, -#' 2007.5, 2007.5, 2008.5, 2008.5, 2009.5, 2009.5, 2010.5, 2010.5, -#' 2011.5, 2012.5, 2013.5, 2014.5, 2015.5, 2016.5, 2017.5) -#' -#' y = c(5.28312492370605, 5.4010272026062, 5.55507040023804, 5.52694797515869, -#' 5.65941572189331, 5.81246614456177, 5.95277643203735, 6.07998991012573, -#' 6.20043277740479, 6.23209381103516, 6.4145884513855, 6.44994592666626, -#' 5.77428722381592, 6.09462738037109, 6.31580305099487, 5.68929624557495, -#' 6.34508848190308, 5.67744398117065, 5.6477165222168, 5.12978315353394, -#' 4.83979654312134, 5.40941429138184, 4.93997049331665, 5.06586456298828, -#' 4.68799591064453, 4.31546640396118, 4.18193292617798, 3.75800633430481, -#' 3.82137632369995, 3.17197370529175, 3.41054058074951, 3.08278703689575, -#' 3.04342699050903, 3.01940298080444, 2.9705445766449, 2.96306347846985, -#' 2.88018417358398, 2.72204685211182, 2.72689270973206, 3.43744516372681, -#' 2.88990616798401, 3.12944483757019, 3.18246674537659, 3.20358324050903, -#' 3.0853967666626, 3.38533687591553, 3.18455958366394, 3.14047956466675, -#' 3.08752226829529, 3.15941309928894, 3.09168982505798, 3.22912931442261, -#' 2.95333743095398, 3.05898070335388, 2.91993451118469, 3.29801154136658, -#' 2.93581032752991, 3.18667984008789, 2.92741537094116, 3.10476756095886, -#' 2.86983323097229, 3.31816387176514, 2.87090802192688, 3.57006907463074, -#' 3.20188307762146, 3.38520860671997, 3.1142041683197, 3.05472731590271, -#' 2.88588929176331, 2.85394668579102, 2.69696426391602, 2.71265292167664, -#' 2.49935364723206, 2.55814361572266, 2.37988924980164, 2.35105299949646, -#' 2.32249999046326, 2.21962690353394, 2.46929597854614, 2.43600010871887, -#' 2.6223669052124, 2.31005716323853, 2.33249998092651, 2.408358335495, -#' 2.39718627929688, 2.38599991798401, 3, 2.51987075805664, 2.1131649017334, -#' 2.13849997520447, 2.04657125473022, 2.06265759468079, 2.10050010681152, -#' 1.98469293117523, 2.09120869636536, 
2.2427921295166, 1.97930490970612, -#' 2.15000796318054, 2.06816005706787, 2.18898129463196, 1.77942955493927, -#' 1.99617087841034, 1.89081299304962, 2.0644690990448, 1.85119438171387, -#' 2.03594541549683, 1.83351850509644, 2.02167296409607, 1.89562964439392, -#' 1.89205574989319, 1.85763072967529, 1.68259692192078, 1.69228148460388, -#' 1.56845271587372, 1.37076985836029) -#' data_in <- tibble(x = x, -#' y = y, -#' w = 1) -#' xout <- 1950:2018 + .5 -#' -#' data_out <- smooth1d(data_in, method = "gam-ps", xout = xout) -#' -#' check_smooth(data_in, data_out) -#' - -check_smooth <- function(data_in, - data_out) { + compare$.id <- as_factor(compare$.id) + res$.id <- as_factor(res$.id) - if (!(".id" %in% colnames(data_in))) { - data_in <- data_in |> - mutate(.id = "all") - } + fig <- ggplot() + + geom_point( + aes(x = pull(compare, !!sym(X)), + y = pull(compare, !!sym(Y)), + color = pull(compare, ".id") + )) + + geom_line( + aes(x = pull(res, !!sym(X)), + y = pull(res, !!sym(Y)), + color = pull(res, ".id") + )) + + theme( + legend.position = "bottom", + legend.title = element_text(face = "bold"), + axis.text = element_text(color = "black") + ) + + scale_x_continuous(breaks = pretty_breaks()) + + scale_y_continuous(breaks = pretty_breaks()) + + labs( + x = NULL, + y = NULL + ) + + scale_color_brewer( + palette = "Set1", + name = "Group" + ) + + theme_bw() + + return(list(result = res, + plot = fig)) - ggplot() + - geom_point(data = data_in, aes(x = .data$x, - y = .data$y, - color = .data$.id)) + - geom_line(data = data_out, aes(x = .data$x, - y = .data$y, - color = .data$.id)) + - theme_minimal() + - theme(legend.position = "bottom", - axis.text = element_text(color = "black")) + - scale_x_continuous(breaks = pretty_breaks()) + - scale_y_continuous(breaks = pretty_breaks()) -} +} \ No newline at end of file diff --git a/build.R b/build.R index f98133b..e2003e6 100644 --- a/build.R +++ b/build.R @@ -1,3 +1,4 @@ library(devtools) document() +check() diff --git 
a/man/check_data_generic.Rd b/man/check_data_generic.Rd new file mode 100644 index 0000000..f6b3fee --- /dev/null +++ b/man/check_data_generic.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_data_generic} +\alias{check_data_generic} +\title{\code{check_data_generic}} +\usage{ +check_data_generic(data) +} +\arguments{ +\item{data}{tibble. A tibble with Age column and at least one numeric value column.} +} +\value{ +A data.frame with validation results containing columns: \code{check}, \code{message}, \code{pass}. +} +\description{ +Generic data validation for modules that accept any numeric column (heaping, smoothing, graduation). +Requires only Age column and at least one numeric value column. Runs all standard age-related checks. +} +\examples{ +library(tibble) +data <- tibble( + Age = c(0, 1, seq(5, 100, by = 5)), + births = runif(22, 1000, 10000), + population = runif(22, 50000, 100000) +) +check_data_generic(data = data) + +} diff --git a/man/check_data_graduate_time.Rd b/man/check_data_graduate_time.Rd new file mode 100644 index 0000000..9fedfe5 --- /dev/null +++ b/man/check_data_graduate_time.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_data_graduate_time} +\alias{check_data_graduate_time} +\title{Validate input data for time graduation} +\usage{ +check_data_graduate_time(value, time, .id = NULL) +} +\arguments{ +\item{value}{Numeric vector of values to be graduated (e.g., life expectancy).} + +\item{time}{Numeric vector of time points corresponding to \code{value} (e.g., calendar years).} + +\item{.id}{Optional grouping variable. If provided, checks are performed separately by group.} +} +\value{ +A data frame summarizing validation results. 
Contains: +\itemize{ +\item \code{check} – name of the validation performed +\item \code{message} – error message if the check fails, otherwise \code{NA} +\item \code{pass} – \code{"Pass"} or \code{"Fail"} +\item \code{.id} – group identifier +} +} +\description{ +Performs structural and numerical validation checks for time-series data used in graduation or smoothing procedures. The function verifies that time points are numeric, unique, sequential, and evenly spaced, and that the associated values are numeric, finite, and strictly positive. +Validation is performed separately for each \code{.id} group. If \code{.id} is \code{NULL}, all observations are treated as belonging to a single group. +} +\examples{ +check_data_graduate_time( + value = c(70.1, 70.4, 70.8), + time = c(2000, 2005, 2010) +) + +} diff --git a/man/check_data_heaping.Rd b/man/check_data_heaping.Rd new file mode 100644 index 0000000..58b85f9 --- /dev/null +++ b/man/check_data_heaping.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_data_heaping} +\alias{check_data_heaping} +\title{Check consistency of data for age heaping diagnostics} +\usage{ +check_data_heaping(data, X) +} +\arguments{ +\item{data}{A data frame containing the variable to be evaluated. +Optionally may contain a grouping column \code{.id}.} + +\item{X}{A character string specifying the name of the variable to be checked (e.g., \code{"Deaths"}).} +} +\value{ +A data frame summarizing validation results for each group. +The returned object contains: +\describe{ +\item{check}{Name of the validation performed} +\item{message}{Error message if the check fails, otherwise \code{NA}} +\item{pass}{Either \code{"Pass"} or \code{"Fail"}} +\item{.id}{Group identifier} +} +} +\description{ +Validates a data frame prior to performing age heaping analysis. 
+The function checks whether a specified variable contains valid numeric values and ensures that required structural conditions are satisfied. +If a grouping variable \code{.id} is not present, one is automatically created with a single level \code{"all"}. +Validation is performed separately for each \code{.id} group. +} +\details{ +Validate input data for age heaping analysis + +For the selected variable \code{X}, the following checks are performed: +\itemize{ +\item Numeric type validation +\item Missing value detection +\item Infinite value detection +\item Strict positivity constraint (all values must be > 0) +\item Presence of an \code{Age} column (required for age heaping analysis) +} +Each validation is conducted within levels of \code{.id}. +} +\examples{ +library(readr) +fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +data_in <- data_in[data_in$`.id` == 1, ] +check_data_heaping(data = data_in, X = "Deaths") + +} diff --git a/man/check_data_lifetable.Rd b/man/check_data_lifetable.Rd new file mode 100644 index 0000000..65d67b3 --- /dev/null +++ b/man/check_data_lifetable.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_data_lifetable} +\alias{check_data_lifetable} +\title{Check consistency of lifetable input data} +\usage{ +check_data_lifetable(data) +} +\arguments{ +\item{data}{A data frame containing lifetable-related columns. Must include at least one of: \code{nMx}, \code{nlx}, \code{nqx}, \code{npx}, \code{ndx}, or \code{nLx}. Optionally may contain an \code{Age} column and a grouping column \code{.id}.} +} +\value{ +A data frame summarizing validation results for each group. 
The returned object contains: +\describe{ +\item{check}{Name of the validation performed} +\item{message}{Error message if the check fails, otherwise \code{NA}} +\item{pass}{Either \code{"Pass"} or \code{"Fail"}} +\item{.id}{Group identifier} +} +} +\description{ +Validates a data frame intended for lifetable reconstruction. +The function checks whether the input contains at least one column that can be used to reconstruct a full lifetable and performs a set of structural and value-based validations. +If no grouping variable \code{.id} is present, one is automatically created with a single level \code{"all"}. +} +\details{ +Checks input data for lifetable reconstruction + +The function searches for at least one of the following columns (in order of preference): +\itemize{ +\item \code{nMx} +\item \code{nlx} +\item \code{nqx} +\item \code{npx} +\item \code{ndx} +\item \code{nLx} +} +The first available column is used for validation checks. The following checks are performed: +\itemize{ +\item Numeric type validation +\item Missing value detection +\item Non-negativity constraint +} +If an \code{Age} column is present, additional checks are performed: +\itemize{ +\item Age coherence (numeric validity) +\item Redundancy detection +\item Sequential ordering +} +Validation is performed separately for each \code{.id} group. 
+} +\examples{ +library(readr) +fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +data_in$nMx <- data_in$Deaths / data_in$Exposures +data_in <- data_in[data_in$`.id` == 1, ] +check_data_lifetable(data_in) + +} +\seealso{ +\code{\link{age2int}}, +\code{\link{is_age_coherent}}, +\code{\link{is_age_redundant}}, +\code{\link{is_age_sequential}} +} diff --git a/man/check_data_opag.Rd b/man/check_data_opag.Rd new file mode 100644 index 0000000..c3bdb97 --- /dev/null +++ b/man/check_data_opag.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_data_opag} +\alias{check_data_opag} +\title{\code{check_data_opag}} +\usage{ +check_data_opag(data) +} +\arguments{ +\item{data}{tibble. A tibble containing population data for ODAP analysis with at minimum Age and pop columns.} +} +\value{ +A data.frame with validation results containing columns: \code{check}, \code{message}, \code{pass}. +} +\description{ +Upper level function that checks ODAP population data quality. Checks if the columns are numeric, if any of the columns is missing, if there is insufficient rows, if there are missing data entries, if ages do not start with 0, and also if ages are coherent, sequential and not redundant. 
+} +\examples{ +library(tibble) +data <- tibble( + Age = 0:100, + pop = c(9544406, 7471790, rep(1000000, 99)), + name = "India", + country_code = 356, + sex = "M", + year = 1971 +) + +check_data_opag(data = data) + +} diff --git a/man/check_has_numeric.Rd b/man/check_has_numeric.Rd new file mode 100644 index 0000000..696361a --- /dev/null +++ b/man/check_has_numeric.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_has_numeric} +\alias{check_has_numeric} +\title{\code{check_has_numeric}} +\usage{ +check_has_numeric(data) +} +\arguments{ +\item{data}{tibble. A tibble with demographic data.} +} +\value{ +A data.frame with validation results containing columns: \code{check}, \code{message}, \code{pass}. +} +\description{ +Check if the data has at least one numeric column besides Age. +} diff --git a/man/check_heaping_general.Rd b/man/check_heaping_general.Rd index b67b4ec..7f36c73 100644 --- a/man/check_heaping_general.Rd +++ b/man/check_heaping_general.Rd @@ -2,26 +2,55 @@ % Please edit documentation in R/input_diagnostics.R \name{check_heaping_general} \alias{check_heaping_general} -\title{\code{check_heaping_general}} +\title{General age-heaping diagnostics} \usage{ -check_heaping_general(data, y) +check_heaping_general(data, y, units = c("count", "rate", "proportion")) } \arguments{ -\item{data}{data.frame. User file from the read_data command with the minimum data on \code{Exposures}, \code{Death} and \code{Age}. Data can be both in 5 and 1 year age intervals} +\item{data}{A data frame containing at least \code{Age} and the variable \code{y}. If \code{.id} is present, diagnostics are computed by group; otherwise all observations are treated as one group.} -\item{y}{character.Variable name for which the heaping should be checked \code{Deaths} or \code{Exposures}.} +\item{y}{Character string giving the name of the age-specific variable.} + +\item{units}{Type of input data. 
One of \code{"count"}, \code{"rate"}, or \code{"proportion"}.} } \value{ -A data.frame with 2 columns \code{method} - the method used for age heaping evaluation and \code{result} - the resulting heaping measure +A data frame with one row per index and group, containing: +\itemize{ +\item \code{method} – diagnostic index name +\item \code{result} – numeric value +\item \code{level} – qualitative data-quality category +\item \code{color} – associated color code +\item \verb{age scale} – \code{"single"} or \code{"5-year"} +\item \code{.id} – group identifier +} } \description{ -Check the age heaping for 5 or 1 year data. +Computes age-heaping and age-irregularity indices from age-specific counts, rates, or proportions. The set of diagnostics depends on the age scale: digit-preference indices are used for single-year ages, while irregularity indices are used for grouped ages. +} +\details{ +\itemize{ +\item Single-year ages: Bachi and Myers digit-preference indices. +\item Grouped ages: roughness and sawtooth irregularity indices. +For \code{"count"} data, values are internally converted to proportions. +For \code{"rate"} and \code{"proportion"} data, values are normalized to sum to one +to ensure comparability across ages. +} } \examples{ -\dontrun{ -check_heaping_general( - data = data, - y = "Exposures") +library(readr) +fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +data_in$nMx <- data_in$Deaths / data_in$Exposures +data_in <- data_in[data_in$.id == 1, ] +check_heaping_general(data = data_in, y = "Deaths", units = "count") +check_heaping_general(data = data_in, y = "nMx", units = "rate") } - +\references{ +Bachi (1951); Myers (1940); United Nations (1955). 
+} +\seealso{ +\code{\link[DemoTools]{check_heaping_bachi}}, +\code{\link[DemoTools]{check_heaping_myers}}, +\code{\link[DemoTools]{check_heaping_roughness}}, +\code{\link[DemoTools]{check_heaping_sawtooth}} } diff --git a/man/check_missing_cols_generic.Rd b/man/check_missing_cols_generic.Rd new file mode 100644 index 0000000..27325ba --- /dev/null +++ b/man/check_missing_cols_generic.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_missing_cols_generic} +\alias{check_missing_cols_generic} +\title{\code{check_missing_cols_generic}} +\usage{ +check_missing_cols_generic(data) +} +\arguments{ +\item{data}{tibble. A tibble with at minimum an Age column.} +} +\value{ +A data.frame with validation results containing columns: \code{check}, \code{message}, \code{pass}. +} +\description{ +Check if Age column exists. For generic validation that doesn't require specific value columns. +} diff --git a/man/check_missing_cols_opag.Rd b/man/check_missing_cols_opag.Rd new file mode 100644 index 0000000..3d65045 --- /dev/null +++ b/man/check_missing_cols_opag.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_missing_cols_opag} +\alias{check_missing_cols_opag} +\title{\code{check_missing_cols_opag}} +\usage{ +check_missing_cols_opag(data) +} +\arguments{ +\item{data}{tibble. A tibble containing population data for ODAP analysis.} +} +\value{ +A data.frame with 3 columns: \code{check} - the name of the test, \code{message} - The error message with corresponding information generated if the test is failed (if the test is passed evaluates to NA), \code{pass} - binary result of the test either "Fail" or "Pass". +} +\description{ +Check if any of the crucial columns are missing from ODAP population data: (\code{Age}, \code{pop}).
+} +\examples{ +library(tibble) +data <- tibble( + Age = 0:100, + pop = c(9544406, 7471790, rep(1000000, 99)), + name = "India", + country_code = 356, + sex = "M", + year = 1971 +) + +check_missing_cols_opag(data = data) + +} diff --git a/man/check_nlx_if_present.Rd b/man/check_nlx_if_present.Rd new file mode 100644 index 0000000..c217e89 --- /dev/null +++ b/man/check_nlx_if_present.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_nlx_if_present} +\alias{check_nlx_if_present} +\title{\code{check_nlx_if_present}} +\usage{ +check_nlx_if_present(data) +} +\arguments{ +\item{data}{tibble. A tibble containing population data for ODAP analysis.} +} +\value{ +A data.frame with 3 columns: \code{check} - the name of the test, \code{message} - The error message with corresponding information generated if the test is failed (if the test is passed evaluates to NA), \code{pass} - binary result of the test either "Fail" or "Pass". +} +\description{ +Check if nLx column is present and valid (numeric, no NAs). +} +\examples{ +library(tibble) +data <- tibble( + Age = 0:100, + pop = c(9544406, 7471790, rep(1000000, 99)), + nLx = runif(101, 50000, 100000) +) + +check_nlx_if_present(data = data) + +} diff --git a/man/check_numeric_opag.Rd b/man/check_numeric_opag.Rd new file mode 100644 index 0000000..55521e1 --- /dev/null +++ b/man/check_numeric_opag.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_numeric_opag} +\alias{check_numeric_opag} +\title{\code{check_numeric_opag}} +\usage{ +check_numeric_opag(data) +} +\arguments{ +\item{data}{tibble. 
A tibble containing population data for ODAP analysis.} +} +\value{ +A data.frame with 3 columns: \code{check} - the name of the test, \code{message} - The error message with corresponding information generated if the test is failed (if the test is passed evaluates to NA), \code{pass} - binary result of the test either "Fail" or "Pass". +} +\description{ +Check if the numeric columns for ODAP data (\code{Age}, \code{pop}) are numeric. +} +\examples{ +library(tibble) +data <- tibble( + Age = 0:100, + pop = c(9544406, 7471790, rep(1000000, 99)), + name = "India", + country_code = 356, + sex = "M", + year = 1971 +) + +check_numeric_opag(data = data) + +} diff --git a/man/check_sex_opag.Rd b/man/check_sex_opag.Rd new file mode 100644 index 0000000..9f6e728 --- /dev/null +++ b/man/check_sex_opag.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_sex_opag} +\alias{check_sex_opag} +\title{\code{check_sex_opag}} +\usage{ +check_sex_opag(data) +} +\arguments{ +\item{data}{tibble. A tibble generated by the output of the read_data function.} +} +\value{ +A data.frame with validation results containing columns: \code{check}, \code{message}, \code{pass}. +} +\description{ +Checks sex variable for OPAG analysis. Unlike general check_sex, +this does NOT allow 'Total' since OPAG requires sex-specific mortality data +from WPP. Accepts various formats: M/F, m/f, Male/Female, male/female, males/females. 
+} +\examples{ +library(tibble) +data <- tibble(Age = 0:100, + pop = c(9544406, 7471790, rep(1000000, 99)), + Sex = "m") +check_sex_opag(data = data) + +} diff --git a/man/check_smooth.Rd b/man/check_smooth.Rd deleted file mode 100644 index 0405bb3..0000000 --- a/man/check_smooth.Rd +++ /dev/null @@ -1,77 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/smoothing1d.R -\name{check_smooth} -\alias{check_smooth} -\title{\code{check_smooth}} -\usage{ -check_smooth(data_in, data_out) -} -\arguments{ -\item{data_in}{a data.frame or tibble. The original data provided by user} - -\item{data_out}{a data.frame or tibble. The data.frame output of the \code{interpolate} function.} -} -\value{ -A figure of original data as points and smooth result as line for each \code{.id} -} -\description{ -Creates a plot of original data as points and smooth result as line for each \code{.id}. -} -\examples{ -library(tibble) -x = c(1950.77104722793, 1951.77104722793, 1952.77104722793, 1952.77104722793, - 1953.77104722793, 1954.77104722793, 1955.77104722793, 1956.77104722793, - 1957.77104722793, 1957.77104722793, 1958.77104722793, 1959.77104722793, - 1963.5, 1964.5, 1965.5, 1966.5, 1967.5, 1968.5, 1969.5, 1970.5, - 1970.5, 1971.5, 1971.5, 1972.5, 1972.5, 1973.5, 1973.5, 1974.5, - 1974.5, 1975.5, 1975.5, 1976.5, 1976.5, 1977.5, 1977.5, 1978.5, - 1978.5, 1979.5, 1979.5, 1980.5, 1980.5, 1981.5, 1981.5, 1982.5, - 1982.5, 1983.5, 1983.5, 1984.5, 1984.5, 1985.5, 1985.5, 1986.5, - 1986.5, 1987.5, 1987.5, 1988.5, 1988.5, 1989.5, 1989.5, 1990.5, - 1990.5, 1991.5, 1991.5, 1992.5, 1992.5, 1993.5, 1993.5, 1994.5, - 1994.5, 1995.5, 1995.5, 1996.5, 1996.5, 1997.5, 1997.5, 1998.5, - 1998.5, 1998.5, 1999.5, 1999.5, 1999.5, 2000.5, 2000.5, 2000.5, - 2001.5, 2001.5, 2001.5, 2001.5, 2002.5, 2002.5, 2002.5, 2003.5, - 2003.5, 2003.5, 2004.5, 2004.5, 2005.5, 2005.5, 2006.5, 2006.5, - 2007.5, 2007.5, 2008.5, 2008.5, 2009.5, 2009.5, 2010.5, 2010.5, - 2011.5, 2012.5, 2013.5, 
2014.5, 2015.5, 2016.5, 2017.5) - -y = c(5.28312492370605, 5.4010272026062, 5.55507040023804, 5.52694797515869, - 5.65941572189331, 5.81246614456177, 5.95277643203735, 6.07998991012573, - 6.20043277740479, 6.23209381103516, 6.4145884513855, 6.44994592666626, - 5.77428722381592, 6.09462738037109, 6.31580305099487, 5.68929624557495, - 6.34508848190308, 5.67744398117065, 5.6477165222168, 5.12978315353394, - 4.83979654312134, 5.40941429138184, 4.93997049331665, 5.06586456298828, - 4.68799591064453, 4.31546640396118, 4.18193292617798, 3.75800633430481, - 3.82137632369995, 3.17197370529175, 3.41054058074951, 3.08278703689575, - 3.04342699050903, 3.01940298080444, 2.9705445766449, 2.96306347846985, - 2.88018417358398, 2.72204685211182, 2.72689270973206, 3.43744516372681, - 2.88990616798401, 3.12944483757019, 3.18246674537659, 3.20358324050903, - 3.0853967666626, 3.38533687591553, 3.18455958366394, 3.14047956466675, - 3.08752226829529, 3.15941309928894, 3.09168982505798, 3.22912931442261, - 2.95333743095398, 3.05898070335388, 2.91993451118469, 3.29801154136658, - 2.93581032752991, 3.18667984008789, 2.92741537094116, 3.10476756095886, - 2.86983323097229, 3.31816387176514, 2.87090802192688, 3.57006907463074, - 3.20188307762146, 3.38520860671997, 3.1142041683197, 3.05472731590271, - 2.88588929176331, 2.85394668579102, 2.69696426391602, 2.71265292167664, - 2.49935364723206, 2.55814361572266, 2.37988924980164, 2.35105299949646, - 2.32249999046326, 2.21962690353394, 2.46929597854614, 2.43600010871887, - 2.6223669052124, 2.31005716323853, 2.33249998092651, 2.408358335495, - 2.39718627929688, 2.38599991798401, 3, 2.51987075805664, 2.1131649017334, - 2.13849997520447, 2.04657125473022, 2.06265759468079, 2.10050010681152, - 1.98469293117523, 2.09120869636536, 2.2427921295166, 1.97930490970612, - 2.15000796318054, 2.06816005706787, 2.18898129463196, 1.77942955493927, - 1.99617087841034, 1.89081299304962, 2.0644690990448, 1.85119438171387, - 2.03594541549683, 1.83351850509644, 
2.02167296409607, 1.89562964439392, - 1.89205574989319, 1.85763072967529, 1.68259692192078, 1.69228148460388, - 1.56845271587372, 1.37076985836029) -data_in <- tibble(x = x, - y = y, - w = 1) -xout <- 1950:2018 + .5 - -data_out <- smooth1d(data_in, method = "gam-ps", xout = xout) - -check_smooth(data_in, data_out) - -} diff --git a/man/graduate_auto.Rd b/man/graduate_auto.Rd index 0bed047..979e55f 100644 --- a/man/graduate_auto.Rd +++ b/man/graduate_auto.Rd @@ -10,7 +10,8 @@ graduate_auto( variable = NULL, u5m = NULL, Sex = c("t", "f", "m"), - constrain_infants = TRUE + constrain_infants = TRUE, + i18n = NULL ) } \arguments{ @@ -25,9 +26,11 @@ graduate_auto( \item{Sex}{character. Either \code{m} for males, \code{f} for females, or \code{t} for total (default). Please note that in case this parameter is not explicitly provided, the function will scan the data for the column with the corresponding name and use its levels automatically.} \item{constrain_infants}{logical. A scalar that indicates whether the infant proportions should be constrained or left as is. Defaults to \code{TRUE}.} + +\item{i18n}{Optional internationalization object for translating plot labels and titles. Defaults to NULL.} } \value{ -data_out. A list with two elements: \code{data_out} - a tibble with two numeric columns - smoothed counts for the chosen variable and \code{Age} - chosen age grouping, and \code{arguments} - a list of arguments used in the function. +data_out. A list with three elements: \code{data_out} - a tibble with two numeric columns - smoothed counts for the chosen variable and \code{Age} - chosen age grouping, \code{plot} - a ggplot2 object comparing original vs graduated data, and \code{arguments} - a list of arguments used in the function. } \description{ Smoothed population or death counts with moving averages. The method was adopted from paragraph five of the "Method protocol for evaluating census population data by age and sex". 
@@ -40,7 +43,7 @@ fpath <- system.file("extdata", "five_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath) |> - dplyr::select(-1) |> + select(-1) |> filter(.id == 1) ex1 <- graduate_auto(data_in, diff --git a/man/graduate_auto_5.Rd b/man/graduate_auto_5.Rd index 81fcb1f..ab6ec32 100644 --- a/man/graduate_auto_5.Rd +++ b/man/graduate_auto_5.Rd @@ -26,7 +26,7 @@ fpath <- system.file("extdata", "five_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath) |> - dplyr::select(-1) |> + select(-1) |> filter(.id == 1) graduate_auto_5(data_in, "Exposures")$Exposures diff --git a/man/graduate_time.Rd b/man/graduate_time.Rd new file mode 100644 index 0000000..4ab61e9 --- /dev/null +++ b/man/graduate_time.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/graduation.R +\name{graduate_time} +\alias{graduate_time} +\title{Graduate grouped time data to single-year estimates +Expands grouped time data (e.g., 5-year intervals) into single-year values using various graduation methods. The function supports smoothing, monotone interpolation, uniform splitting, cubic regression splines, penalized composite link models (PCLM), and LOESS smoothing.} +\usage{ +graduate_time( + data_in, + method = c("loess", "smooth_spline", "cubic", "mono", "uniform", "pclm"), + X, + Y, + timeInt = 5 +) +} +\arguments{ +\item{data_in}{Data frame containing grouped time data.} + +\item{method}{Graduation method. 
One of \code{"loess"}, \code{"smooth_spline"}, \code{"cubic"}, \code{"mono"}, \code{"uniform"}, \code{"pclm"}.} + +\item{X}{Name of the time variable (character string).} + +\item{Y}{Name of the grouped value variable (character string).} + +\item{timeInt}{Width of grouping interval (default = 5).} +} +\value{ +A data frame with columns: +\itemize{ +\item \code{time} — single-year time values +\item \code{value} — graduated single-year estimates +\item \code{.id} — subgroup identifier +} +} +\description{ +Available methods: +\itemize{ +\item \code{"uniform"} — distributes totals evenly within each interval +\item \code{"mono"} — monotone cubic interpolation (Hyman filtered spline) +\item \code{"smooth_spline"} — smoothing spline via \code{stats::smooth.spline} +\item \code{"cubic"} — cubic regression spline via \code{mgcv::gam} +\item \code{"loess"} — local polynomial smoothing via \code{stats::loess} +\item \code{"pclm"} — penalized composite link model via \code{ungroup::pclm} +} +} +\details{ +Block totals are preserved for all smoothing-based methods by rescaling +predictions within each interval to match original grouped totals. + +If column \code{.id} exists in \code{data_in}, graduation is performed separately within each subgroup. Otherwise all observations are treated as a single group. +For spline-based methods, predicted single-year values are rescaled +within each interval so that grouped totals are exactly preserved. +The PCLM method relies on the \pkg{ungroup} package and may internally rescale small counts to avoid negative fitted values. 
+} +\examples{ +df <- data.frame( + Year = seq(1950, 2015, by = 5), + Deaths = c(403564.9, 426012.0, 466215.9, 523753.8, 560874.1, + 545608.3, 555335.9, 594584.6, 608425.3, 638167.3, + 655438.6, 689386.4, 740519.0, 804439.3) +) + +graduate_time( + df, + method = "pclm", + X = "Year", + Y = "Deaths", + timeInt = 5 +) + +} diff --git a/man/movepop.Rd b/man/movepop.Rd deleted file mode 100644 index 51f8dee..0000000 --- a/man/movepop.Rd +++ /dev/null @@ -1,100 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/movepop.R -\name{movepop} -\alias{movepop} -\title{Project Population Between Two Dates Using Age-Specific Rates} -\usage{ -movepop( - initial_date, - desired_date, - male_pop, - female_pop, - male_mx, - female_mx, - asfr, - annual_net_migrants = 0, - age_groups = NULL, - age_format = "auto" -) -} -\arguments{ -\item{initial_date}{Numeric or Date. The starting year/date of the population.} - -\item{desired_date}{Numeric or Date. The year/date to which the population is projected.} - -\item{male_pop}{Numeric vector of male population counts by age.} - -\item{female_pop}{Numeric vector of female population counts by age.} - -\item{male_mx}{Numeric vector of male mortality rates by age.} - -\item{female_mx}{Numeric vector of female mortality rates by age.} - -\item{asfr}{Numeric vector of age-specific fertility rates corresponding to female ages.} - -\item{annual_net_migrants}{Numeric scalar for annual net migration to include in growth (default 0).} - -\item{age_groups}{Optional character vector of age group labels. If \code{NULL}, default labels are assigned.} - -\item{age_format}{Character specifying age structure: \code{"single_year"}, \code{"five_year"}, or \code{"auto"} (default \code{"auto"}). 
When \code{"auto"}, the function infers format from the length of population vectors.} -} -\value{ -A list with three elements: -\describe{ -\item{\code{initial_summaries}}{A tibble summarising total births, deaths, crude rates, growth rate, total population, and adjustment factor.} -\item{\code{projected_summaries}}{A tibble summarising projected total population by sex and overall sex ratio.} -\item{\code{data}}{A tibble with age-specific populations, ASFR, births, deaths, projected populations by sex, both-sexes totals, and sex ratios.} -} -} -\description{ -The \code{movepop()} function projects a population from an initial date to a desired date -using age-specific fertility (ASFR) and mortality rates. It supports single-year or -five-year age group formats and optionally accounts for net migration. -The function produces both projected population by age and sex and summary statistics. -} -\details{ -\itemize{ -\item The function ensures that population and mortality vectors have equal lengths. -\item If \code{age_format} is \code{"auto"}, lengths of 18 imply \code{"five_year"} and lengths ≥101 imply \code{"single_year"}. -\item Age groups are automatically generated if not provided, and ASFR values are aligned starting at the age group \code{"15"} (or equivalent). -\item Projected populations are computed using exponential growth based on crude birth rate, crude death rate, and net migration rate. -\item The function returns both age-specific projected populations and summary statistics. 
-} -} -\examples{ -\dontrun{ -male_pop <- c(48875, 164390, 173551, 130297, 101143, 73615, 60594, 55175, - 49530, 46562, 39028, 27837, 22110, 18066, 15340, 13318, -#' 12002, 6424) - -female_pop <- c(47105, 159546, 168760, 119437, 92080, 70515, 58801, 53381, - 46757, 41164, 33811, 24121, 19315, 16319, 14058, 12302, - 11047, 5922) - -male_mx <- c(0.12427, 0.01639, 0.00274, 0.00167, 0.00251, 0.00380, 0.00382, - 0.00442, 0.00506, 0.00663, 0.00872, 0.01240, 0.01783, 0.02700, - 0.04126, 0.06785, 0.11287, 0.21015) - -female_mx <- c(0.11050, 0.01577, 0.00254, 0.00159, 0.00232, 0.00304, 0.00344, - 0.00370, 0.00418, 0.00492, 0.00592, 0.00831, 0.01182, 0.01942, - 0.03221, 0.05669, 0.09771, 0.19385) - -asfr <- c(0.199, 0.478, 0.418, 0.321, 0.163, 0.071, 0.028) - -res <- movepop( -initial_date = 1973.58, -desired_date = 1973.50, -male_pop = male_pop, -female_pop = female_pop, -male_mx = male_mx, -female_mx = female_mx, -asfr = asfr, -annual_net_migrants = -50000, -age_format = "five_year" - -res$initial_summaries -res$projected_summaries -head(res$data) -} - -} diff --git a/man/odap_opag.Rd b/man/odap_opag.Rd index 00c5da4..bca9cf4 100644 --- a/man/odap_opag.Rd +++ b/man/odap_opag.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/odap.R, R/point5.R +% Please edit documentation in R/odap.R \name{odap_opag} \alias{odap_opag} \title{ODAP–OPAG Mortality and Population Redistribution Analysis} @@ -15,21 +15,8 @@ odap_opag( name = "India", country_code = 356, year = 1971, - sex = "M" -) - -odap_opag( - data_in = NULL, - Age_fit = c(60, 70), - AgeInt_fit = c(10, 10), - Redistribute_from = 80, - OAnew = 100, - method = c("mono", "pclm", "uniform"), - nLx = NULL, - name = "India", - country_code = 356, - year = 1971, - sex = "M" + sex = "M", + i18n = NULL ) } \arguments{ @@ -54,14 +41,10 @@ odap_opag( \item{year}{Numeric vector of years to filter by (default \code{1971}).} \item{sex}{Character scalar indicating sex to filter by, e.g., 
\code{"M"} or \code{"F"} (default \code{"M"}).} + +\item{i18n}{Optional internationalization object for translating plot labels and titles (default \code{NULL}).} } \value{ -A list with two elements: -\describe{ -\item{\code{data_out}}{A named list of OPAG redistribution results by group.} -\item{\code{figures}}{A named list of \code{ggplot2} objects showing original vs. redistributed populations.} -} - A list with two elements: \describe{ \item{\code{data_out}}{A named list of OPAG redistribution results by group.} @@ -69,10 +52,6 @@ A list with two elements: } } \description{ -This function prepares population data and mortality life table data (\code{nLx}) to perform age redistribution using the OPAG (Old-Age Population Age Group) method. -It supports flexible input with country name/code, sex, and year filters, and handles both user-provided and WPP standard mortality data. -The function outputs adjusted population estimates along with diagnostic plots comparing original vs. redistributed populations. - This function prepares population data and mortality life table data (\code{nLx}) to perform age redistribution using the OPAG (Old-Age Population Age Group) method. It supports flexible input with country name/code, sex, and year filters, and handles both user-provided and WPP standard mortality data. The function outputs adjusted population estimates along with diagnostic plots comparing original vs. redistributed populations. @@ -87,23 +66,10 @@ The function: \item Generates diagnostic plots comparing original and redistributed population counts. } -The function uses internal helpers such as \code{conditional_filter()}, -\code{conditional_arrange()}, and \code{create_groupid()} to standardize and align data. - -The function: -\itemize{ -\item Standardizes and validates population input data. -\item Automatically retrieves and filters WPP mortality data (\code{nLx}) if not supplied. 
-\item Checks age interval consistency and aggregates mortality data if necessary. -\item Applies the OPAG redistribution method per group (e.g. country, year, sex). -\item Generates diagnostic plots comparing original and redistributed population counts. -} - The function uses internal helpers such as \code{conditional_filter()}, \code{conditional_arrange()}, and \code{create_groupid()} to standardize and align data. } \examples{ -\dontrun{ library(dplyr) data_in <- tibble( @@ -123,34 +89,3 @@ res$data_out$India # Plot original vs redistributed population print(res$figures$India) } -\dontrun{ -library(dplyr) - -data_in <- tibble( - age = seq(0, 100, by = 5), - pop = runif(21, 10000, 100000), - name = "India", - country_code = 356, - sex = "M", - year = 1971 -) - -res <- odap_opag(data_in, Redistribute_from = 80) - -# View redistributed data -res$data_out$India - -# Plot original vs redistributed population -print(res$figures$India) -} - -} -\seealso{ -<<<<<<< HEAD -\code{\link{OPAG}}, \code{\link{lt_single_mx}}, \code{\link{groupAges}} - -\code{\link{OPAG}}, \code{\link{lt_single_mx}}, \code{\link{groupAges}} -======= -\code{\link[DemoTools]{OPAG}}, \code{\link{link[DemoTools]{lt_single_mx}}, \code{\link{\link[DemoTools]{groupAges}} ->>>>>>> 2f4af5b16613027e611f0591eedc863278910965 -} diff --git a/man/plot_smooth_compare.Rd b/man/plot_smooth_compare.Rd index 11db1a8..2e72669 100644 --- a/man/plot_smooth_compare.Rd +++ b/man/plot_smooth_compare.Rd @@ -29,8 +29,8 @@ fpath <- system.file("extdata", "abridged_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath, show_col_types = FALSE) |> - dplyr::select(-1) |> - dplyr::filter(.id == 1) + select(-1) |> + filter(.id == 1) data_out <- smooth_flexible_chunk( data_in, diff --git a/man/smooth1d.Rd b/man/smooth1d.Rd index ba2f3b0..fc41572 100644 --- a/man/smooth1d.Rd +++ b/man/smooth1d.Rd @@ -2,99 +2,125 @@ % Please edit documentation in R/smoothing1d.R \name{smooth1d} \alias{smooth1d} 
-\title{\code{smooth1d}} +\title{One–Dimensional Smoothing Wrapper for Age or Time Series +\code{smooth1d()} provides a unified interface for smoothing one–dimensional demographic or time–series data (e.g., age schedules or time trends). +The function supports multiple smoothing engines including \code{supsmu}, \code{lowess}, \code{loess}, cubic splines, and GAM-based methods. +Optional transformations (log, sqrt, logit) can be applied prior to smoothing. +The function automatically handles grouped data via a \code{.id} column.} \usage{ smooth1d( data_in, - method = c("supsmu", "lowess", "loess", "cubicspline", "gam-tp", "gam-ps")[1], + units = c("count", "rate", "proportion"), + X = c("Age", "Time"), + Y = c("Exposures"), + scale = c("log", "sqrt", "logit", "none"), + method = c("supsmu", "lowess", "loess", "cubicspline", "gam-tp", "gam-ps"), smoothing_par = 1, - xout = unique(data_in[["x"]]), - xname = "x", - yname = "y" + xout = unique(data_in[[X]]) ) } \arguments{ -\item{data_in}{\code{data.frame} with x and y coordinates to fit to. Optionally with \code{weights}} +\item{data_in}{A data.frame or tibble containing the input data.} -\item{method}{character. Smoothing method desired. options \code{"supsmu"} (default),\code{"lowess"},\code{"loess"},\code{"cubicspline"},\code{"gam-tp"},\code{"gam-ps"}} +\item{units}{Character string. One of: +\itemize{ +\item \code{"count"} +\item \code{"rate"} +\item \code{"proportion"} +}} -\item{smoothing_par}{smoothing parameter, interpretation varies by method, but higher always means smoother.Default 1} +\item{X}{Character string. Name of the independent variable (typically \code{"Age"} or \code{"Time"}).} -\item{xout}{numeric vector of coordinates to predict for. Defaults to original unique coordinates.} +\item{Y}{Character string. 
Name of the dependent variable.} -\item{xname}{name of variable containing x coordinate, default \code{x}} +\item{scale}{Optional transformation applied before smoothing: +\itemize{ +\item \code{"log"} +\item \code{"sqrt"} +\item \code{"logit"} +\item \code{"none"} +}} -\item{yname}{name of variable containing y coordinate, default \code{y}} +\item{method}{Smoothing engine: +\itemize{ +\item \code{"supsmu"} +\item \code{"lowess"} +\item \code{"loess"} +\item \code{"cubicspline"} +\item \code{"gam-tp"} (thin plate regression spline) +\item \code{"gam-ps"} (P-spline) +}} + +\item{smoothing_par}{Numeric smoothing parameter. Interpretation depends on method.} + +\item{xout}{Optional vector of evaluation points. Defaults to unique values of \code{X}.} +} +\value{ +A named list with: +\itemize{ +\item \code{result} — Tibble containing smoothed values +\item \code{plot} — ggplot object comparing raw and smoothed values +} } \description{ -Smooth a univariate time series, optionally using \code{weights.} Choose between the super-smoother (\code{"supsmu"}) method, loess (\code{"lowess"} or \code{"loess"}) , smoothing splines (\code{"cubicsplines"}), thin-plate splines (\code{"gam-tp"}), or p-splines (\code{"gam-ps"}). Input data may have multiple observations per x coordinate. Output can be for arbitrary x-coordinates (\code{xout}). If grouping variables have been declared, then we use the same parameters for each subset. +One–Dimensional Smoothing Wrapper for Age or Time Series +\code{smooth1d()} provides a unified interface for smoothing one–dimensional demographic or time–series data (e.g., age schedules or time trends). +The function supports multiple smoothing engines including \code{supsmu}, \code{lowess}, \code{loess}, cubic splines, and GAM-based methods. +Optional transformations (log, sqrt, logit) can be applied prior to smoothing. +The function automatically handles grouped data via a \code{.id} column. 
} \details{ -\code{"supsmu"} method takes a \code{smoothing_par} between 0 and 10. \code{"lowess"} and +If the input data does not contain a \code{.id} column, a default group \code{"all"} is created. +Weights default to 1 if not supplied. +Transformations are applied before smoothing but not automatically back-transformed. } \examples{ -library(tibble) -x = c(1950.77104722793, 1951.77104722793, 1952.77104722793, 1952.77104722793, - 1953.77104722793, 1954.77104722793, 1955.77104722793, 1956.77104722793, - 1957.77104722793, 1957.77104722793, 1958.77104722793, 1959.77104722793, - 1963.5, 1964.5, 1965.5, 1966.5, 1967.5, 1968.5, 1969.5, 1970.5, - 1970.5, 1971.5, 1971.5, 1972.5, 1972.5, 1973.5, 1973.5, 1974.5, - 1974.5, 1975.5, 1975.5, 1976.5, 1976.5, 1977.5, 1977.5, 1978.5, - 1978.5, 1979.5, 1979.5, 1980.5, 1980.5, 1981.5, 1981.5, 1982.5, - 1982.5, 1983.5, 1983.5, 1984.5, 1984.5, 1985.5, 1985.5, 1986.5, - 1986.5, 1987.5, 1987.5, 1988.5, 1988.5, 1989.5, 1989.5, 1990.5, - 1990.5, 1991.5, 1991.5, 1992.5, 1992.5, 1993.5, 1993.5, 1994.5, - 1994.5, 1995.5, 1995.5, 1996.5, 1996.5, 1997.5, 1997.5, 1998.5, - 1998.5, 1998.5, 1999.5, 1999.5, 1999.5, 2000.5, 2000.5, 2000.5, - 2001.5, 2001.5, 2001.5, 2001.5, 2002.5, 2002.5, 2002.5, 2003.5, - 2003.5, 2003.5, 2004.5, 2004.5, 2005.5, 2005.5, 2006.5, 2006.5, - 2007.5, 2007.5, 2008.5, 2008.5, 2009.5, 2009.5, 2010.5, 2010.5, - 2011.5, 2012.5, 2013.5, 2014.5, 2015.5, 2016.5, 2017.5) +# Example 1: Age smoothing of mortality rates (log scale) +library(readr) -y = c(5.28312492370605, 5.4010272026062, 5.55507040023804, 5.52694797515869, - 5.65941572189331, 5.81246614456177, 5.95277643203735, 6.07998991012573, - 6.20043277740479, 6.23209381103516, 6.4145884513855, 6.44994592666626, - 5.77428722381592, 6.09462738037109, 6.31580305099487, 5.68929624557495, - 6.34508848190308, 5.67744398117065, 5.6477165222168, 5.12978315353394, - 4.83979654312134, 5.40941429138184, 4.93997049331665, 5.06586456298828, - 4.68799591064453, 4.31546640396118, 
4.18193292617798, 3.75800633430481, - 3.82137632369995, 3.17197370529175, 3.41054058074951, 3.08278703689575, - 3.04342699050903, 3.01940298080444, 2.9705445766449, 2.96306347846985, - 2.88018417358398, 2.72204685211182, 2.72689270973206, 3.43744516372681, - 2.88990616798401, 3.12944483757019, 3.18246674537659, 3.20358324050903, - 3.0853967666626, 3.38533687591553, 3.18455958366394, 3.14047956466675, - 3.08752226829529, 3.15941309928894, 3.09168982505798, 3.22912931442261, - 2.95333743095398, 3.05898070335388, 2.91993451118469, 3.29801154136658, - 2.93581032752991, 3.18667984008789, 2.92741537094116, 3.10476756095886, - 2.86983323097229, 3.31816387176514, 2.87090802192688, 3.57006907463074, - 3.20188307762146, 3.38520860671997, 3.1142041683197, 3.05472731590271, - 2.88588929176331, 2.85394668579102, 2.69696426391602, 2.71265292167664, - 2.49935364723206, 2.55814361572266, 2.37988924980164, 2.35105299949646, - 2.32249999046326, 2.21962690353394, 2.46929597854614, 2.43600010871887, - 2.6223669052124, 2.31005716323853, 2.33249998092651, 2.408358335495, - 2.39718627929688, 2.38599991798401, 3, 2.51987075805664, 2.1131649017334, - 2.13849997520447, 2.04657125473022, 2.06265759468079, 2.10050010681152, - 1.98469293117523, 2.09120869636536, 2.2427921295166, 1.97930490970612, - 2.15000796318054, 2.06816005706787, 2.18898129463196, 1.77942955493927, - 1.99617087841034, 1.89081299304962, 2.0644690990448, 1.85119438171387, - 2.03594541549683, 1.83351850509644, 2.02167296409607, 1.89562964439392, - 1.89205574989319, 1.85763072967529, 1.68259692192078, 1.69228148460388, - 1.56845271587372, 1.37076985836029) -data_in <- tibble(x=x,y=y,w=1) -xout <- 1950:2018 + .5 -\dontrun{ - plot(data_in$x, data_in$y) - lines(xout, smooth1d(data_in, method = "supsmu", - xout = xout, smoothing_par = .2)$y, - col = "red") - lines(xout, smooth1d(data_in, method = "loess", - xout = xout, smoothing_par = .2)$y, - col = "blue", lty = "82") - lines(xout, smooth1d(data_in, method = "gam-ps", xout = 
xout)$y, - col = "forestgreen" ,lty="82") - lines(xout, smooth1d(data_in, method = "cubicspline", - xout = xout, smoothing_par =1)$y, - col = "purple", lty="22") -} +fpath <- system.file( + "extdata", + "single_hmd_spain.csv.gz", + package = "ODAPbackend" +) + +data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +data_in$nMx <- data_in$Deaths / data_in$Exposures +data_in <- data_in[data_in$.id == 1, ] +names(data_in) <- c(".id", "Time", "Age", "Sex", + "Deaths", "Exposures", "nMx") + +z <- smooth1d( + data_in = data_in, + units = "rate", + X = "Age", + Y = "nMx", + method = "supsmu", + scale = "log" +) + +z$result +z$plot + +# Example 2: Time smoothing of counts +df <- data.frame( + Time = seq(1950, 2015, by = 5), + Deaths = c(403564.9, 426012.0, 466215.9, 523753.8, + 560874.1, 545608.3, 555335.9, 594584.6, + 608425.3, 638167.3, 655438.6, 689386.4, + 740519.0, 804439.3) +) + +z <- smooth1d( + data_in = df, + units = "count", + X = "Time", + Y = "Deaths", + method = "supsmu", + scale = "none" +) + +z$result +z$plot } diff --git a/man/smooth1d_chunk.Rd b/man/smooth1d_chunk.Rd deleted file mode 100644 index 0028484..0000000 --- a/man/smooth1d_chunk.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/smoothing1d.R -\name{smooth1d_chunk} -\alias{smooth1d_chunk} -\title{\code{smooth1d_chunk}} -\usage{ -smooth1d_chunk( - data_in., - method = c("supsmu", "lowess", "loess", "cubicspline", "gam-tp", "gam-ps")[1], - smoothing_par = 1, - xout = data_in.[["x"]], - xname = "x", - yname = "y" -) -} -\arguments{ -\item{data_in.}{\code{data.frame} with x and y coordinates to fit to. Optionally with \code{weights}. Should refer to a single subset of data.} - -\item{method}{character. Smoothing method desired. 
options \code{"supsmu"} (default),\code{"lowess"},\code{"loess"},\code{"cubicspline"},\code{"gam-tp"},\code{"gam-ps"}} - -\item{smoothing_par}{smoothing parameter, interpretation varies by method, but higher always means smoother.Default 1} - -\item{xout}{numeric vector of coordinates to predict for. Defaults to original unique coordinates.} - -\item{xname}{name of variable containing x coordinate, default \code{x}} - -\item{yname}{name of variable containing y coordinate, default \code{y}} -} -\description{ -Smooth a univariate time series, optionally using \code{weights.} Choose between the super-smoother (\code{"supsmu"}) method, loess (\code{"lowess"} or \code{"loess"}) , smoothing splines (\code{"cubicsplines"}), thin-plate splines (\code{"gam-tp"}), or p-splines (\code{"gam-ps"}). Input data may have multiple observations per x coordinate. Output can be for arbitrary x-coordinates (\code{xout}). -} -\details{ -\code{"supsmu"} method takes a \code{smoothing_par} between 0 and 10. \code{"lowess"} and -} diff --git a/man/smooth_flexible.Rd b/man/smooth_flexible.Rd index e5fde45..fc17fce 100644 --- a/man/smooth_flexible.Rd +++ b/man/smooth_flexible.Rd @@ -54,7 +54,7 @@ fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath) |> - dplyr::select(-1) + select(-1) ex1 <- smooth_flexible( data_in, variable = "Exposures", diff --git a/man/smooth_flexible_chunk.Rd b/man/smooth_flexible_chunk.Rd index 86f4adf..440b363 100644 --- a/man/smooth_flexible_chunk.Rd +++ b/man/smooth_flexible_chunk.Rd @@ -51,7 +51,7 @@ fpath <- system.file("extdata", "abridged_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath) |> - dplyr::select(-1) |> + select(-1) |> filter(.id == 1) ex1 <- smooth_flexible_chunk( diff --git a/ui_flow_control_graduate_standalone.R b/ui_flow_control_graduate_standalone.R new file mode 100644 index 0000000..66d7fde --- /dev/null +++ b/ui_flow_control_graduate_standalone.R @@ -0,0 +1,27 @@ + 
+# TODO
+
+# this should show the setup for a graduation-only standalone module.
+# here there are two families, depending on whether we are graduating
+# time or graduating age. They should be in the same module, but which
+# backend function is called should depend on whether the user has given
+# time as the abscissa or age. The main reason why we have two functions
+# rather than a single wrapper is that the methods lists are different
+# for age and time.
+
+# see graduate_time()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ui_flow_control_graduation.R b/ui_flow_control_graduation.R
index f7deaac..f8d433c 100644
--- a/ui_flow_control_graduation.R
+++ b/ui_flow_control_graduation.R
@@ -24,11 +24,11 @@ output <- smooth_flexible(data_in,
 i18n = NULL # ui arg
 )
 
-# plots currently br
-output$data_out # valid output
-output$figures # broken due to duplicated Age column, breaks ggplot code
+output$data_out # valid output
+# figures are separate for each .id
+print(output$figures[[1]]$figure)
 
 args(smooth_flexible)
 
 # user-specified arguments:
 
diff --git a/ui_flow_control_heaping.R b/ui_flow_control_heaping.R
index 0ce1e6a..7f1ecc0 100644
--- a/ui_flow_control_heaping.R
+++ b/ui_flow_control_heaping.R
@@ -1,3 +1,5 @@
+# TODO update this to show operations on grouped data w .id
+
 devtools::load_all()
 
 library(dplyr)
diff --git a/ui_flow_control_smoothing.R b/ui_flow_control_smoothing.R
index 72667eb..1d3498c 100644
--- a/ui_flow_control_smoothing.R
+++ b/ui_flow_control_smoothing.R
@@ -1,4 +1,4 @@
-
+# This script is in progress, pending some directions
 devtools::load_all()
 
 library(dplyr)