From 38bd95614c7803df4c703088740fe14244bb4e8e Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastanedaa@worldbank.org>
Date: Wed, 25 Mar 2026 22:16:06 -0400
Subject: [PATCH] fix issue with population in refy

---
 R/create_lkups.R            |   7 +-
 R/fgt_cumsum.R              | 224 ++++++++++++++++++++----------------
 inst/TMP/TMP_data_testing.R |  11 +-
 3 files changed, 136 insertions(+), 106 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 325055bb..d96edd84 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -370,7 +370,12 @@ create_lkups <- function(data_dir, versions) {
 
     # --- END inclussion of CMD data.
 
-    refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ]
+    refy_lkup <- refy_lkup[
+      reporting_year %in% lineup_years$lineup_years,
+    ][
+      # Filter out rows with missing or zero population
+      !is.na(reporting_pop) & reporting_pop > 0
+    ]
 
     gv(
       refy_lkup,
diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
index 8128b64f..823e3c6f 100644
--- a/R/fgt_cumsum.R
+++ b/R/fgt_cumsum.R
@@ -10,7 +10,6 @@
 #' @return List with elements: DT (data.table of all surveys, with id_rl) and g (GRP object for grouping by id_rl).
 #' @keywords internal
 format_lfst <- \(lfst, dict) {
-
   DT <- rbindlist(lfst, fill = TRUE)
 
   # Convert to factors (is it faster?)
@@ -18,20 +17,15 @@ format_lfst <- \(lfst, dict) {
     DT[, index := as.integer(index)]
   }
 
-  out <- encode_pairs(DT = DT,
-                      dict = dict,
-                      drop_labels = TRUE)
+  out <- encode_pairs(DT = DT, dict = dict, drop_labels = TRUE)
 
   ## Grouping ----------
-  g <- GRP(out, ~ id_rl, sort = FALSE)
+  g <- GRP(out, ~id_rl, sort = FALSE)
 
-  list(DT = out,
-       g = g)
+  list(DT = out, g = g)
 }
 
 
-
-
 #' Compute total population by survey and reporting level
 #'
 #' Sums the weights for each (id, reporting_level) group in the combined survey data.
@@ -43,11 +37,12 @@ format_lfst <- \(lfst, dict) {
 get_total_pop <- \(LDTg) {
   list2env(LDTg, envir = environment())
   rm(LDTg)
-  add_vars(g[["groups"]],
-           get_vars(DT, c("weight")) |>
-           fsum(g)) |>
-    setnames(old = "weight",
-             new =  "W")
+  add_vars(
+    g[["groups"]],
+    get_vars(DT, c("weight")) |>
+      fsum(g)
+  ) |>
+    setnames(old = "weight", new = "W")
 }
 
 #' Compute FGT and Watts indices for all groups and poverty lines
@@ -60,67 +55,74 @@ get_total_pop <- \(LDTg) {
 #' @param drop_vars Logical, if TRUE returns only summary columns.
 #' @return data.table with FGT and Watts measures by group and poverty line.
 #' @keywords internal
-fgt_cumsum <- \(LDTg, tpop, povline,
-                drop_vars = TRUE) {
+fgt_cumsum <- \(LDTg, tpop, povline, drop_vars = TRUE) {
   list2env(LDTg, envir = environment())
   rm(LDTg)
 
   # Temporal values to be added to the data.table
-  tz      <- pmax(povline, 1e-12)
-  tz2     <- pmax(povline^2, 1e-16)
-  tlogz   <- log(tz)
+  tz <- pmax(povline, 1e-12)
+  tz2 <- pmax(povline^2, 1e-16)
+  tlogz <- log(tz)
 
   # 1) Compute cutpoint index for each z, using ONLY non-zero rows for welfare
   #    -> findInterval(povline, welfare) returns values in 0..N (never N+1)
-  ID <- DT[index > 0L,
-           {
-             idx <- findInterval(povline, welfare, left.open = TRUE)
-             # 2) Attach z, z2, logz in-group (no replication/copies)
-             data.table(index = idx,
-                        z     = tz,
-                        z2    = tz2,
-                        logz  = tlogz)
-           },
-           by = id_rl]
+  ID <- DT[
+    index > 0L,
+    {
+      idx <- findInterval(povline, welfare, left.open = TRUE)
+      # 2) Attach z, z2, logz in-group (no replication/copies)
+      data.table(index = idx, z = tz, z2 = tz2, logz = tlogz)
+    },
+    by = id_rl
+  ]
 
   # 3) Minimal cumulative view (shallow column subset; avoids copying DT)
-  DT_min <- get_vars(DT,
-                     c("id_rl", "index", "cw", "cwy", "cwy2", "cwylog"))
+  DT_min <- get_vars(DT, c("id_rl", "index", "cw", "cwy", "cwy2", "cwylog"))
 
   # 4) join cutpoints to cumulatives (index==0 hits the already-present zero row)
   CS <- join(
     x = ID,
     y = DT_min,
-    on   = c("id_rl","index"),
-    how  = "left",
-    validate = "m:1",          # many cutpoints -> 1 cumulative row
+    on = c("id_rl", "index"),
+    how = "left",
+    validate = "m:1", # many cutpoints -> 1 cumulative row
     drop.dup.cols = "y",
-    verbose = 0) |>
-  # 5) Bring total population W
-    join(tpop,
-         on = "id_rl",
-         how = "left",
-         validate = "m:1",
-         drop.dup.cols = "y",
-         verbose = 0) |>
+    verbose = 0
+  ) |>
+    # 5) Bring total population W
+    join(
+      tpop,
+      on = "id_rl",
+      how = "left",
+      validate = "m:1",
+      drop.dup.cols = "y",
+      verbose = 0
+    ) |>
     setorder(id_rl, index)
 
-
   # 6) Compute measures (vectorized). Small clamps for numerical safety.
   CS[, `:=`(
-    headcount        = cw / W,
-    poverty_gap      = (z * cw - cwy) / (z * W),
+    headcount = cw / W,
+    poverty_gap = (z * cw - cwy) / (z * W),
     poverty_severity = (z2 * cw - 2 * z * cwy + cwy2) / (z2 * W),
-    watts            = (logz * cw - cwylog) / W
+    watts = (logz * cw - cwylog) / W
   )]
 
   setnames(CS, "z", "povline")
   if (!drop_vars) {
     return(CS)
   }
-  get_vars(CS, c("id_rl", "povline", "headcount",
-                 "poverty_gap", "poverty_severity", "watts"))
-
+  get_vars(
+    CS,
+    c(
+      "id_rl",
+      "povline",
+      "headcount",
+      "poverty_gap",
+      "poverty_severity",
+      "watts"
+    )
+  )
 }
 
 
@@ -140,25 +142,27 @@ fgt_cumsum <- \(LDTg, tpop, povline,
 #' @return data.table with columns id, reporting_level, and code.
 #' @keywords internal
 build_pair_dict <- function(lkup, fill_gaps = TRUE) {
-
   FT <- if (fill_gaps) {
     lkup$refy_lkup[, .(country_code, reporting_year, reporting_level)]
   } else {
-    lkup$svy_lkup[, .(country_code, reporting_year, reporting_level)]
-  } |>
-    funique()
+    {
+      lkup$svy_lkup[, .(country_code, reporting_year, reporting_level)]
+    } |>
+      funique()
+  }
 
-  FT[, id := paste0(country_code, "_", reporting_year)
-     ][, c("country_code", "reporting_year") := NULL]
+  FT[, id := paste0(country_code, "_", reporting_year)][,
+    c("country_code", "reporting_year") := NULL
+  ]
 
   cols <- c("id", "reporting_level")
   dict <- unique(FT[, ..cols])
 
   # deterministic code order
-  setorderv(dict, cols, order = 1L)     # radix by default
-  dict[, code := as.integer(.I)]        # fast in DT
-  setkeyv(dict, cols)                   # fast key lookups when needed
-  setindexv(dict, "code")               # index on code
+  setorderv(dict, cols, order = 1L) # radix by default
+  dict[, code := as.integer(.I)] # fast in DT
+  setkeyv(dict, cols) # fast key lookups when needed
+  setindexv(dict, "code") # index on code
   dict
 }
 
@@ -181,18 +185,20 @@ build_pair_dict <- function(lkup, fill_gaps = TRUE) {
 #' @param verbose Integer, verbosity level.
 #' @return data.table with code column added.
 #' @keywords internal
-encode_pairs <- function(DT, dict,
-                         id_col = "id", level_col = "reporting_level",
-                         code_col = "id_rl",
-                         drop_labels = FALSE,
-                         strict = TRUE,
-                         verbose = 0L) {
-
+encode_pairs <- function(
+  DT,
+  dict,
+  id_col = "id",
+  level_col = "reporting_level",
+  code_col = "id_rl",
+  drop_labels = FALSE,
+  strict = TRUE,
+  verbose = 0L
+) {
   stopifnot(is.data.table(DT), is.data.table(dict))
   cols <- c(id_col, level_col)
   stopifnot(all(cols %in% names(DT)), all(c(cols, "code") %in% names(dict)))
 
-
   out <- join(
     x = DT,
     y = dict,
@@ -203,7 +209,9 @@ encode_pairs <- function(DT, dict,
     verbose = verbose
   )
   # Ensure it's a data.table (join usually preserves)
-  if (!is.data.table(out)) setDT(out)
+  if (!is.data.table(out)) {
+    setDT(out)
+  }
 
   # Rename 'code' -> code_col if needed
   if (code_col != "code" && "code" %in% names(out)) {
@@ -223,7 +231,9 @@ encode_pairs <- function(DT, dict,
     }
   }
 
-  if (drop_labels) out[, (cols) := NULL]
+  if (drop_labels) {
+    out[, (cols) := NULL]
+  }
   out
 }
 
@@ -245,26 +255,29 @@ encode_pairs <- function(DT, dict,
 #' @param verbose Integer, verbosity level.
 #' @return data.table with id and reporting_level columns added.
 #' @keywords internal
-decode_pairs <- function(DT, dict,
-                         code_col = "id_rl",
-                         id_col = "id",
-                         level_col = "reporting_level",
-                         keep_code = FALSE,
-                         add_true_vars = TRUE,
-                         verbose = 0L) {
+decode_pairs <- function(
+  DT,
+  dict,
+  code_col = "id_rl",
+  id_col = "id",
+  level_col = "reporting_level",
+  keep_code = FALSE,
+  add_true_vars = TRUE,
+  verbose = 0L
+) {
   stopifnot(exprs = {
     is.data.table(DT)
     is.data.table(dict)
-    })
+  })
   stopifnot(exprs = {
     code_col %in% names(DT)
     all(c("code", id_col, level_col) %in% names(dict))
-    })
+  })
 
   out <- join(
     x = DT,
     y = dict,
-    on = setNames("code", code_col),   # map DT[code_col] to dict$code
+    on = setNames("code", code_col), # map DT[code_col] to dict$code
     how = "left",
     drop.dup.cols = "y",
     validate = "m:1",
@@ -274,13 +287,16 @@ decode_pairs <- function(DT, dict,
 
   if (add_true_vars) {
     out[, `:=`(
-        country_code   = gsub("(.+)(_.+)", "\\1", id),
-        reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id))
-      )][,
-         id := NULL]
+      country_code = gsub("(.+)(_.+)", "\\1", id),
+      reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id))
+    )][,
+      id := NULL
+    ]
   }
 
-  if (!keep_code) out[, (code_col) := NULL]
+  if (!keep_code) {
+    out[, (code_col) := NULL]
+  }
   out
 }
 
@@ -298,8 +314,12 @@ decode_pairs <- function(DT, dict,
 #' @param level_col Name of reporting level column.
 #' @return Updated data.table dictionary.
 #' @keywords internal
-update_pair_dict <- function(dict, DT,
-                             id_col = "id", level_col = "reporting_level") {
+update_pair_dict <- function(
+  dict,
+  DT,
+  id_col = "id",
+  level_col = "reporting_level"
+) {
   stopifnot(is.data.table(dict), is.data.table(DT))
   cols <- c(id_col, level_col)
   stopifnot(all(c(cols, "code") %in% names(dict)), all(cols %in% names(DT)))
@@ -317,7 +337,6 @@ update_pair_dict <- function(dict, DT,
 }
 
 
-
 #' Load survey data from file list
 #'
 #' Reads a list of survey files (e.g., .fst) and returns a named list of data.tables, each with an id column.
@@ -326,8 +345,7 @@ update_pair_dict <- function(dict, DT,
 #' @param input_list Character vector of file paths (from create_full_list()).
 #' @return Named list of data.tables, each with an id column.
 #' @keywords internal
-load_list_refy <- \(input_list){
-
+load_list_refy <- \(input_list) {
   id_names <- input_list |>
     fs::path_file() |>
     fs::path_ext_remove()
@@ -338,17 +356,25 @@ load_list_refy <- \(input_list){
     base::seq_along
   }
 
-
-  lfst <- lapply(seq_flex(input_list),
-                 \(i) {
-                   x <- input_list[i]
-                   idn <- fs::path_file(x) |>
-                     fs::path_ext_remove()
-                   fst::read_fst(x, as.data.table = TRUE) |>
-                     _[, id := idn]
-                 }) |>
+  lfst <- lapply(seq_flex(input_list), \(i) {
+    x <- input_list[i]
+    idn <- fs::path_file(x) |>
+      fs::path_ext_remove()
+    tryCatch(
+      fst::read_fst(x, as.data.table = TRUE) |>
+        _[, id := idn],
+      error = \(e) {
+        cli::cli_abort(
+          c(
+            "Failed to read file at index {.val {i}}: {.file {x}}",
+            "x" = conditionMessage(e)
+          ),
+          parent = e
+        )
+      }
+    )
+  }) |>
     setNames(id_names)
 
   lfst
 }
-
diff --git a/inst/TMP/TMP_data_testing.R b/inst/TMP/TMP_data_testing.R
index 0e9e7272..0f8f6dd8 100644
--- a/inst/TMP/TMP_data_testing.R
+++ b/inst/TMP/TMP_data_testing.R
@@ -28,7 +28,7 @@ fg <- pip(country = "ALL", lkup = lkup, fill_gaps = TRUE)
 
 ago <- pip(country = "AGO", lkup = lkup, fill_gaps = TRUE)
 
-wb <- pip_agg(group_by = "wb", lkup = lkup)
+wb <- pip_agg(group_by = "wb", lkup = lkup, povline = 3)
 
 
 # cache most common queries --------------
@@ -50,19 +50,18 @@ cl <- pip(
 fg <- pip(country = "ALL", lkup = lkup, fill_gaps = TRUE, povline = povlines)
 
 wb <- pip_agg(group_by = "wb", lkup = lkup, povline = povlines)
-
+pushoverr::pushover("finished inter caching")
 
 # Debugging --------------
 
 # remove columns where all obs ar NAs
 
-ttt <- pip(country = "AUS", lkup = lkup, fill_gaps = FALSE, povline = 3)
+ttt <- pip(country = "MWI", lkup = lkup, fill_gaps = FALSE, povline = 3)
 head(ttt)
 
 
-debugonce(pipapi:::add_spl)
-col <- pip(country = "COL", lkup = lkup, fill_gaps = TRUE, povline = 3)
-head(col)
+qqq <- pip(country = "MWI", lkup = lkup, fill_gaps = TRUE, povline = 3)
+head(qqq)
 
 
 dt <- dt[, .SD, .SDcols = \(x) !all(is.na(x))]