Function To Cast InferenceData Into tidy_draws Format #36

Open

wants to merge 88 commits into base: main

Changes from 36 commits

Commits (88)
3865001
initial commit for this PR; begin skeleton experimentation file
AFg6K7h4fhy2 Oct 28, 2024
763355e
some unfinished experimentation code; priority status change high to …
AFg6K7h4fhy2 Oct 28, 2024
44e7fe2
add first semi-failed attempt at converting entire idata object to ti…
AFg6K7h4fhy2 Oct 30, 2024
31c7b72
add attempt at option 2
AFg6K7h4fhy2 Oct 30, 2024
9a87902
slightly modify spread draws example
AFg6K7h4fhy2 Nov 4, 2024
c632ae8
more minor changes to tidy draws notebook
AFg6K7h4fhy2 Nov 4, 2024
123ad51
light edits during DHM convo
AFg6K7h4fhy2 Nov 7, 2024
a3c2d17
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Nov 25, 2024
f44a6ee
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Nov 25, 2024
cb883e3
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Nov 25, 2024
df922d4
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Nov 26, 2024
21968be
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Dec 5, 2024
7dcd7d3
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Dec 9, 2024
718ba85
a DB conversion attempt
AFg6K7h4fhy2 Dec 12, 2024
7394d4d
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Dec 16, 2024
4a77d50
begin references file; create external program folder
AFg6K7h4fhy2 Dec 16, 2024
4c18634
add csv ignoring when converting to csv
AFg6K7h4fhy2 Dec 16, 2024
9be91b4
further edits; problem with dates
AFg6K7h4fhy2 Dec 16, 2024
0884c3d
partial attempt edits
AFg6K7h4fhy2 Dec 16, 2024
0dd9616
minor reconsideration of unpivot pathways
AFg6K7h4fhy2 Dec 16, 2024
2a0f2b9
minor update, tidydraws replicator
AFg6K7h4fhy2 Dec 18, 2024
c746e3b
minor cleaning of existing notebook code
AFg6K7h4fhy2 Dec 18, 2024
e4e8e0f
some of the reimplementation attempt
AFg6K7h4fhy2 Jan 16, 2025
c5ed832
testing of conversion between date intervals and dataframe
AFg6K7h4fhy2 Jan 17, 2025
e9e2aab
small further edit
AFg6K7h4fhy2 Jan 17, 2025
9a27187
add pyrenew inference file; some code mods
AFg6K7h4fhy2 Jan 17, 2025
cc20975
remove extraneous notebooks
AFg6K7h4fhy2 Jan 21, 2025
4a8a0ae
remove more idata to tidy code
AFg6K7h4fhy2 Jan 21, 2025
0c4301a
create tidy draws dataframe dict
AFg6K7h4fhy2 Jan 21, 2025
814f68c
error comment
AFg6K7h4fhy2 Jan 21, 2025
beba8f3
remove earlier code version
AFg6K7h4fhy2 Jan 22, 2025
b12ffc2
begin port of notebook into codebase
AFg6K7h4fhy2 Jan 22, 2025
d85f49c
update docstring
AFg6K7h4fhy2 Jan 22, 2025
85c0dbf
add some of the notebook
AFg6K7h4fhy2 Jan 22, 2025
786d2e4
some minor notebook edits
AFg6K7h4fhy2 Jan 22, 2025
4c4e6dd
further refine notebook
AFg6K7h4fhy2 Jan 22, 2025
e949796
update func name for clarity
AFg6K7h4fhy2 Jan 28, 2025
ab0beb6
remove historical scripts
AFg6K7h4fhy2 Jan 28, 2025
27e7ada
tidybayes API
AFg6K7h4fhy2 Jan 28, 2025
b06f6e6
unfinished edits addressing several comments
AFg6K7h4fhy2 Jan 28, 2025
42acd05
docstring change
AFg6K7h4fhy2 Jan 28, 2025
2ed4c29
more corrective edits
AFg6K7h4fhy2 Jan 28, 2025
0ad7c24
another corrective edit
AFg6K7h4fhy2 Jan 28, 2025
9452acd
check for invalid groups
AFg6K7h4fhy2 Jan 29, 2025
325aac4
some extranous code removal
AFg6K7h4fhy2 Jan 30, 2025
462d57b
add partial, in-need-of-edits tests
AFg6K7h4fhy2 Jan 30, 2025
dae049f
comment test to prevent fail; add nesting of groups
AFg6K7h4fhy2 Jan 30, 2025
5e4cd63
comment test to prevent fail; add nesting of groups
AFg6K7h4fhy2 Jan 30, 2025
5c28831
small update to showcase script
AFg6K7h4fhy2 Jan 30, 2025
d246a44
uncomment failing tests; run pre-commit
AFg6K7h4fhy2 Feb 3, 2025
496c65c
attempt fix pre-commit issues
AFg6K7h4fhy2 Feb 3, 2025
2c34af7
pre-commit edits
AFg6K7h4fhy2 Feb 3, 2025
dba6e7a
more prudent usage of ruff; pre-commit fixes
AFg6K7h4fhy2 Feb 4, 2025
e1cdbb9
revert version edits from black error
AFg6K7h4fhy2 Feb 4, 2025
9884aa9
use selectors; finally, nice to figure that out
AFg6K7h4fhy2 Feb 4, 2025
90f5484
fix some conflicts with daily to epiweekly
AFg6K7h4fhy2 Feb 4, 2025
683a7bf
Merge branch 'main' into 18-function-to-cast-inferencedata-into-tidy_…
AFg6K7h4fhy2 Feb 4, 2025
3547ab6
fix pre-commit errors
AFg6K7h4fhy2 Feb 4, 2025
2999ec3
add pivot to make life easier; not sure if to aggregate by first
AFg6K7h4fhy2 Feb 4, 2025
04d6a50
further debugging edits
AFg6K7h4fhy2 Feb 4, 2025
cdbe464
draws and iterations debugged
AFg6K7h4fhy2 Feb 4, 2025
987c273
revert versioning in test yaml
AFg6K7h4fhy2 Feb 4, 2025
8361c36
update location table; add united states data; update descriptions in…
AFg6K7h4fhy2 Feb 5, 2025
41a8136
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Feb 5, 2025
bb55bf4
remove extraneous united states parquet call
AFg6K7h4fhy2 Feb 5, 2025
1cdef9d
Update forecasttools/idata_to_tidy.py
AFg6K7h4fhy2 Feb 6, 2025
519c048
Update forecasttools/idata_to_tidy.py
AFg6K7h4fhy2 Feb 6, 2025
1019ac0
fix docstring; fix chain equation
AFg6K7h4fhy2 Feb 6, 2025
92c34bd
revert ensure listlike import
AFg6K7h4fhy2 Feb 6, 2025
b3f6367
remove tab ignoral
AFg6K7h4fhy2 Feb 6, 2025
84bb99e
switch from melt to pivot
AFg6K7h4fhy2 Feb 6, 2025
1a94da4
lightweight change to dev deps
AFg6K7h4fhy2 Feb 10, 2025
7a737f6
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Feb 10, 2025
740f9c8
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Feb 11, 2025
b3544e5
commented change; was examining results
AFg6K7h4fhy2 Feb 14, 2025
3c9609c
Merge remote-tracking branch 'origin/main' into 18-function-to-cast-i…
AFg6K7h4fhy2 Feb 14, 2025
ca7c340
have draw calculation take into account row count
AFg6K7h4fhy2 Mar 12, 2025
9b2cdc9
remove extraneous metaflow
AFg6K7h4fhy2 Mar 12, 2025
84ec6e1
some of the tests using simple idata class
AFg6K7h4fhy2 Mar 13, 2025
07e83a9
additional test
AFg6K7h4fhy2 Mar 13, 2025
238c80c
Update forecasttools/idata_to_tidy.py
AFg6K7h4fhy2 Mar 13, 2025
27436cd
revert to original aggregate function argument value
AFg6K7h4fhy2 Mar 13, 2025
70219ad
add base posterior predictive test for pyrenew idata
AFg6K7h4fhy2 Mar 14, 2025
60e0b5a
change test path; capture aggregate function error
AFg6K7h4fhy2 Mar 14, 2025
61849e0
change col search method from strip to re
AFg6K7h4fhy2 Mar 18, 2025
ba2847e
add lambda rename rather than dictionary comprehension
AFg6K7h4fhy2 Mar 18, 2025
15d920b
remove comment
AFg6K7h4fhy2 Mar 18, 2025
0604de1
update pre-commit config file to remove loop binding linting at reque…
AFg6K7h4fhy2 Mar 24, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -181,3 +181,4 @@ DS_Store
_book/
_book
render_test_idata_general_time_representation_files
*.csv
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -69,12 +69,12 @@ repos:
################################################################################
# PYTHON
################################################################################
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.10.0
hooks:
- id: black
args: ["--line-length", "79"]
language_version: python3
# - repo: https://github.com/psf/black-pre-commit-mirror
# rev: 24.10.0
# hooks:
# - id: black
# args: ["--line-length", "79"]
# language_version: python3
- repo: https://github.com/PyCQA/isort
rev: 5.13.2
hooks:
24 changes: 24 additions & 0 deletions assets/external/idata_csv_to_tidy.R
@@ -0,0 +1,24 @@
arviz_split <- function(x) {
x %>%
select(-distribution) %>%
split(f = as.factor(x$distribution))
}

pyrenew_samples <-
read_csv(inference_data_path) %>%
rename_with(\(varname) str_remove_all(varname, "\\(|\\)|\\'|(, \\d+)")) |>
rename(
.chain = chain,
.iteration = draw
) |>
mutate(across(c(.chain, .iteration), \(x) as.integer(x + 1))) |>
mutate(
.draw = tidybayes:::draw_from_chain_and_iteration_(.chain, .iteration),
.after = .iteration
) |>
pivot_longer(-starts_with("."),
names_sep = ", ",
names_to = c("distribution", "name")
) |>
arviz_split() |>
map(\(x) pivot_wider(x, names_from = name) |> tidy_draws())
Binary file added assets/external/inference_data_1.nc
85 changes: 85 additions & 0 deletions assets/external/inference_data_tidy.R
@@ -0,0 +1,85 @@
format_split_text <- function(x, concat_char = "|") {
group <- x[1]
non_group <- x[-1]
pre_bracket <- stringr::str_extract(non_group[1], "^.*(?=\\[)")
if (is.na(pre_bracket)) {
formatted_text <- glue::glue("{group}{concat_char}{non_group}")
} else {
bracket_contents <- non_group[-1] |>
stringr::str_replace_all("\\s", "_") |>
stringr::str_c(collapse = ",")
formatted_text <- glue::glue(
"{group}{concat_char}{pre_bracket}[{bracket_contents}]"
)
}
formatted_text
}

#' Convert InferenceData column names to tidy column names
#'
#' InferenceData column names for scalar variables are of the form
#' `"('group', 'var_name')"`, while column names for array variables are of the
#' form `"('group', 'var_name[i,j]', 'i_name', 'j_name')"`.
#' This function converts these column names to a format that is useful for
#' creating tidy_draws data frames.
#' `"('group', 'var_name')"` becomes `"group|var_name"`
#' `"('group', 'var_name[i,j]', 'i_name', 'j_name')"` becomes
#' `"group|var_name[i_name, j_name]"`
#'
#' @param column_names A character vector of InferenceData column names
#'
#' @return A character vector of tidy column names
#' @examples
#' forecasttools:::idata_names_to_tidy_names(c(
#' "('group', 'var_name')",
#' "group|var_name[i_name, j_name]"
#' ))
idata_names_to_tidy_names <- function(column_names) {
column_names |>
stringr::str_remove_all("^\\(|\\)$") |>
# remove opening and closing parentheses
stringr::str_split(", ") |>
purrr::map(\(x) stringr::str_remove_all(x, "^\\'|\\'$")) |>
# remove opening and closing quotes
purrr::map(\(x) stringr::str_remove_all(x, '\\"')) |> # remove double quotes
purrr::map_chr(format_split_text) # reformat groups and brackets
}

#' Convert InferenceData DataFrame to nested tibble of tidy_draws
#'
#' @param idata InferenceData DataFrame (the result of calling
#' arviz.InferenceData.to_dataframe in Python)
#'
#' @return A nested tibble, with columns group and data. Each element of data is
#' a tidy_draws data frame
#' @export

inferencedata_to_tidy_draws <- function(idata) {
idata |>
dplyr::rename(
.chain = chain,
.iteration = draw
) |>
dplyr::rename_with(idata_names_to_tidy_names,
.cols = -tidyselect::starts_with(".")
) |>
dplyr::mutate(dplyr::across(
c(.chain, .iteration),
\(x) as.integer(x + 1) # convert to 1-indexed
)) |>
dplyr::mutate(
.draw = tidybayes:::draw_from_chain_and_iteration_(.chain, .iteration),
.after = .iteration
) |>
tidyr::pivot_longer(-starts_with("."),
names_sep = "\\|",
names_to = c("group", "name")
) |>
dplyr::group_by(group) |>
tidyr::nest() |>
dplyr::mutate(data = purrr::map(data, \(x) {
tidyr::drop_na(x) |>
tidyr::pivot_wider(names_from = name) |>
tidybayes::tidy_draws()
}))
}
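The helper above expects its input to have the column-name format produced by `arviz.InferenceData.to_dataframe()` in Python, i.e. stringified tuples such as `"('posterior', 'alpha')"`. As a rough sketch (not part of this PR), the CSV consumed by these R scripts could be produced along the following lines; the output path is a placeholder:

```python
# Sketch: export an InferenceData object so its column names follow the
# "('group', 'var_name', ...)" convention parsed by
# idata_names_to_tidy_names() above. The output path is a placeholder.
import arviz as az

idata = az.from_netcdf("assets/external/inference_data_1.nc")

# to_dataframe() yields a pandas DataFrame with one row per (chain, draw),
# plain "chain" and "draw" columns, and tuple-valued column names such as
# ('posterior', 'alpha'); writing it to CSV stringifies those tuples into
# the quoted form described in the docstring above.
idata.to_dataframe().to_csv("inference_data.csv", index=False)
```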
17 changes: 17 additions & 0 deletions assets/misc/references.bib
@@ -0,0 +1,17 @@

# jax
@software{jax2018github,
author = {James Bradbury and Roy Frostig and Peter Hawkins and Matthew James Johnson and Chris Leary and Dougal Maclaurin and George Necula and Adam Paszke and Jake Vander{P}las and Skye Wanderman-{M}ilne and Qiao Zhang},
title = {{JAX}: composable transformations of {P}ython+{N}um{P}y programs},
url = {http://github.com/jax-ml/jax},
version = {0.3.13},
year = {2018},
}

# numpyro
@article{phan2019composable,
title={Composable Effects for Flexible and Accelerated Probabilistic Programming in NumPyro},
author={Phan, Du and Pradhan, Neeraj and Jankowiak, Martin},
journal={arXiv preprint arXiv:1912.11554},
year={2019}
}
2 changes: 2 additions & 0 deletions forecasttools/__init__.py
@@ -4,6 +4,7 @@
import polars as pl

from forecasttools.daily_to_epiweekly import df_aggregate_to_epiweekly
from forecasttools.idata_to_tidy import convert_idata_forecast_to_tidydraws
from forecasttools.idata_w_dates_to_df import (
add_time_coords_to_idata_dimension,
add_time_coords_to_idata_dimensions,
@@ -95,4 +96,5 @@
"generate_time_range_for_dim",
"validate_iter_has_expected_types",
"ensure_listlike",
"convert_idata_forecast_to_tidydraws"
]
71 changes: 71 additions & 0 deletions forecasttools/idata_to_tidy.py
@@ -0,0 +1,71 @@
"""
Contains functions for interfacing between
the tidy-verse and arviz, which includes
the conversion of idata objects (and hence
their groups) in tidy-usable objects.
"""

import re

import arviz as az
import polars as pl


def convert_idata_forecast_to_tidydraws(
idata: az.InferenceData,
groups: list[str]
) -> dict[str, pl.DataFrame]:
"""
Creates a dictionary of polars DataFrames
from the groups of an arviz InferenceData
object; each DataFrame, when written to csv
and read into R, is tidy-usable.

Parameters
----------
idata : az.InferenceData
An InferenceData object generated
from a numpyro forecast. Typically
has the groups observed_data and
posterior_predictive.
groups : list[str]
A list of groups belonging to the
idata object.

Returns
-------
dict[str, pl.DataFrame]
A dictionary mapping each requested group of
the idata to a tidy-usable polars DataFrame.
"""
tidy_dfs = {}
idata_df = idata.to_dataframe()
for group in groups:
group_columns = [
col for col in idata_df.columns
if isinstance(col, tuple) and col[0] == group
]
meta_columns = ["chain", "draw"]
group_df = idata_df[meta_columns + group_columns]
group_df.columns = [
col[1] if isinstance(col, tuple) else col
for col in group_df.columns
]
group_pols_df = pl.from_pandas(group_df)
value_columns = [col for col in group_pols_df.columns if col not in meta_columns]
group_pols_df = group_pols_df.melt(
id_vars=meta_columns,
value_vars=value_columns,
variable_name="variable",
value_name="value"
)
group_pols_df = group_pols_df.with_columns(
pl.col("variable").map_elements(
lambda x: re.sub(r"\[.*\]", "", x)).alias("variable")
)
group_pols_df = group_pols_df.with_columns(
((pl.col("draw") - 1) % group_pols_df["draw"].n_unique() + 1).alias(".iteration")
)
group_pols_df = group_pols_df.rename({"chain": ".chain", "draw": ".draw"})
tidy_dfs[group] = group_pols_df.select([".chain", ".draw", ".iteration", "variable", "value"])
return tidy_dfs
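A minimal usage sketch of the new function (not part of the diff), assuming the bundled `inference_data_1.nc` asset; the output file names are illustrative:

```python
# Sketch: convert a posterior_predictive group into a tidy-usable polars
# DataFrame and write it to CSV for downstream use with tidybayes in R.
# File paths here are illustrative, not part of this PR.
import arviz as az
import forecasttools

idata = az.from_netcdf("assets/external/inference_data_1.nc")
tidy_dfs = forecasttools.convert_idata_forecast_to_tidydraws(
    idata=idata,
    groups=["posterior_predictive"],
)
for group, df in tidy_dfs.items():
    # each value has .chain, .draw, .iteration, variable, and value columns
    df.write_csv(f"{group}.csv")
```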
156 changes: 156 additions & 0 deletions notebooks/pyrenew_dates_to_tidy.qmd
@@ -0,0 +1,156 @@
---
title: Add Dates To Pyrenew Idata And Use In Tidy-Verse
format: gfm
engine: jupyter
---

_The following notebook illustrates adding dates to an external `idata` object and then demonstrates using the resulting tidy-usable output in R._

__Load In Packages And External Pyrenew InferenceData Object__

```{python}
#| echo: true

import forecasttools
import arviz as az
import xarray as xr
import os
import subprocess
import tempfile
from datetime import date, timedelta

xr.set_options(display_expand_data=False, display_expand_attrs=False)


pyrenew_idata_path = "../assets/external/inference_data_1.nc"
pyrenew_idata = az.from_netcdf(pyrenew_idata_path)
pyrenew_idata
```

__Define Groups To Save And Convert__

```{python}
#| echo: true

pyrenew_groups = ["posterior_predictive"]
tidy_usable_groups = forecasttools.convert_idata_forecast_to_tidydraws(
idata=pyrenew_idata,
groups=pyrenew_groups
)

# show variables
print(tidy_usable_groups["posterior_predictive"]["variable"].unique())

# show output
tidy_usable_groups
```

__Demonstrate Adding Time To Pyrenew InferenceData__

```{python}
#| echo: true

start_time_as_dt = date(2022, 8, 1) # arbitrary

pyrenew_target_var = pyrenew_idata["posterior_predictive"]["observed_hospital_admissions"]
print(pyrenew_target_var)

pyrenew_var_w_dates = forecasttools.generate_time_range_for_dim(
start_time_as_dt=start_time_as_dt,
variable_data=pyrenew_target_var,
dimension="observed_hospital_admissions_dim_0",
time_step=timedelta(days=1),
)
print(pyrenew_var_w_dates[:5], type(pyrenew_var_w_dates[0]))
```

__Add Dates To Pyrenew InferenceData__

```{python}
#| echo: true

pyrenew_idata_w_dates = forecasttools.add_time_coords_to_idata_dimension(
idata=pyrenew_idata,
group="posterior_predictive",
variable="observed_hospital_admissions",
dimension="observed_hospital_admissions_dim_0",
start_date_iso=start_time_as_dt,
time_step=timedelta(days=1),
)

print(pyrenew_idata_w_dates["posterior_predictive"]["observed_hospital_admissions"]["observed_hospital_admissions_dim_0"])
pyrenew_idata_w_dates
```

__Again Convert The Dated Pyrenew InferenceData To Tidy-Usable__


```{python}

pyrenew_groups = ["posterior_predictive"]
tidy_usable_groups_w_dates = forecasttools.convert_idata_forecast_to_tidydraws(
idata=pyrenew_idata_w_dates,
groups=pyrenew_groups
)
tidy_usable_groups_w_dates
```

__Examine The Dataframe In The Tidyverse__

```{python}
def light_r_runner(r_code: str) -> None:
"""
Run R code from Python as a temp file.
"""
with tempfile.NamedTemporaryFile(suffix=".R", delete=False) as temp_r_file:
temp_r_file.write(r_code.encode("utf-8"))
temp_r_file_path = temp_r_file.name
try:
subprocess.run(["Rscript", temp_r_file_path], check=True)
except subprocess.CalledProcessError as e:
print(f"R script failed with error: {e}")
finally:
os.remove(temp_r_file_path)


for key, tidy_df in tidy_usable_groups_w_dates.items():
file_name = f"{key}.csv"
if not os.path.exists(file_name):
tidy_df.write_csv(file_name)
print(f"Saved {file_name}")


r_code_to_verify_tibble = """
library(magrittr)
library(tidyverse)
library(tidybayes)

csv_files <- c("posterior_predictive.csv")

for (csv_file in csv_files) {
tibble_data <- read_csv(csv_file)

print(paste("Tibble from", csv_file))
print(tibble_data)

tidy_data <- tibble_data %>%
tidybayes::tidy_draws()
print(tidy_data)
}
"""
light_r_runner(r_code_to_verify_tibble)
```

The output of the last cell is:

```
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Error: To use a data frame directly with `tidy_draws()`, it must already be a
tidy-format data frame of draws: it must have integer-like `.chain`
`.iteration`, and `.draw` columns with one row per draw.

The `.draw` column in the input data frame has more than one row per draw
(its values are not unique).
Execution halted
```
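The failure is expected given the current conversion: the long-format output carries one row per (chain, iteration, variable), so the renamed `draw` column repeats and `tidy_draws()` rejects it, since `.draw` must uniquely index each (chain, iteration) pair (uniqueness additionally requires one row per draw, i.e. pivoting the variables wide, as the R helpers do). The tidybayes convention, implemented by `draw_from_chain_and_iteration_()` in the R helpers above, appears to be `.draw = (.chain - 1) * n_iterations + .iteration`. The following is only a hedged polars sketch of recomputing such a column, assuming `.chain` and `.iteration` are already 1-indexed:

```python
# Sketch only: recompute a unique .draw index from .chain and .iteration,
# assuming both are already 1-indexed. Intended to mirror the tidybayes
# chain-and-iteration convention referenced in the R helpers above.
import polars as pl


def add_unique_draw_index(df: pl.DataFrame) -> pl.DataFrame:
    # number of iterations per chain, inferred from the data
    n_iterations = df[".iteration"].n_unique()
    return df.with_columns(
        ((pl.col(".chain") - 1) * n_iterations + pl.col(".iteration")).alias(".draw")
    )
```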