From 8dc34dfb56078f5992ebda178b93ada7ae17cde8 Mon Sep 17 00:00:00 2001 From: chasem Date: Fri, 17 Apr 2026 15:41:09 -0500 Subject: [PATCH 1/7] adding performance logging --- configure_logger.py | 88 ++++++++++++------- docs/development.md | 23 ++++- production.yml | 4 +- tfbpshiny/__main__.py | 14 +++ tfbpshiny/app.py | 18 +++- tfbpshiny/modules/binding/server/workspace.py | 64 +++++++++----- .../modules/comparison/server/workspace.py | 52 +++++++---- .../modules/perturbation/server/workspace.py | 59 +++++++++---- .../select_datasets/server/workspace.py | 38 +++++++- tfbpshiny/utils/profiler.py | 48 ++++++++++ tfbpshiny/utils/vdb_init.py | 52 ++++++++--- 11 files changed, 354 insertions(+), 106 deletions(-) create mode 100644 tfbpshiny/utils/profiler.py diff --git a/configure_logger.py b/configure_logger.py index 0fb62a0..c8acb22 100644 --- a/configure_logger.py +++ b/configure_logger.py @@ -2,6 +2,8 @@ from enum import Enum from typing import Literal +_PROFILE_FORMAT = "%(message)s" + class LogLevel(Enum): DEBUG = logging.DEBUG @@ -31,34 +33,22 @@ def from_string(cls, level_str: str) -> Enum: def configure_logger( name: str, - level: int = logging.DEBUG, # Use int type hint here + level: int = logging.DEBUG, format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s", handler_type: Literal["console", "file"] = "console", log_file: str = "tfbpmodeling.log", ) -> logging.Logger: """ - Configures a logger. - - :param name: Name of the logger - :type name: str - :param level: Logging level, must be one of logging.DEBUG, - logging.INFO, logging.WARNING, logging.ERROR - :type level: int - :param format: Logging format - :type format: str - :param handler_type: Type of handler, either 'console' or 'file' - :type handler_type: Literal["console", "file"] - :param log_file: Path to log file, required if handler_type is 'file'. - Default is 'tfbpmodeling.log' - :type log_file: str - - :return: Configured logger + Configure a logger with a single handler. 
+ + :param name: Logger name. + :param level: Logging level (``logging.DEBUG``, ``logging.INFO``, etc.). + :param format: Log record format string. + :param handler_type: Destination — ``"console"`` or ``"file"``. + :param log_file: Path used when ``handler_type="file"``. + :returns: Configured logger. :rtype: logging.Logger - - :raises ValueError: If any of the parameters have invalid datatypes - - example usage: - >>> logger = configure_logger("my_logger", level=logging.INFO) + :raises ValueError: If any parameter is invalid. """ if not isinstance(name, str): @@ -69,7 +59,7 @@ def configure_logger( raise ValueError("Invalid logging level") if not isinstance(format, str): raise ValueError("format must be a string") - if handler_type not in ["console", "file"]: + if handler_type not in ("console", "file"): raise ValueError("handler_type must be 'console' or 'file'") if handler_type == "file" and not log_file: raise ValueError("log_file must be specified for file handler") @@ -77,22 +67,52 @@ def configure_logger( logger = logging.getLogger(name) logger.setLevel(level) - # Remove all handlers associated with the logger object to avoid duplicate logs - for handler in logger.handlers[:]: - logger.removeHandler(handler) + # Remove existing handlers to avoid duplicates on re-configuration. + for h in logger.handlers[:]: + logger.removeHandler(h) if handler_type == "console": - handler = logging.StreamHandler() - elif handler_type == "file": - if not log_file: - raise ValueError("log_file must be specified for file handler") - handler = logging.FileHandler(log_file) + handler: logging.Handler = logging.StreamHandler() else: - raise ValueError("Invalid handler_type. 
Must be 'console' or 'file'.") + handler = logging.FileHandler(log_file) handler.setLevel(level) - formatter = logging.Formatter(format) - handler.setFormatter(formatter) + handler.setFormatter(logging.Formatter(format)) logger.addHandler(handler) return logger + + +def configure_profile_logger( + handler_type: Literal["console", "file"] = "console", + level: int = logging.DEBUG, + log_file: str = "tfbpshiny_profile.log", + enabled: bool = True, +) -> logging.Logger: + """ + Configure and return the ``"profiler"`` logger for timing instrumentation. + + The profiler logger uses a bare ``%(message)s`` format because all + structure is embedded in the message by + :func:`tfbpshiny.utils.profiler.profile_span`. + It never propagates to the root or ``"shiny"`` logger. + + :param handler_type: Destination for profile records — ``"console"`` or ``"file"``. + :param level: Log level; ignored (set to ``CRITICAL``) when ``enabled=False``. + :param log_file: Path used when ``handler_type="file"``. + :param enabled: When ``False``, silences the logger by setting its level + to ``CRITICAL``. + :returns: Configured ``"profiler"`` logger. + :rtype: logging.Logger + + """ + effective_level = level if enabled else logging.CRITICAL + logger = configure_logger( + "profiler", + level=effective_level, + format=_PROFILE_FORMAT, + handler_type=handler_type, + log_file=log_file, + ) + logger.propagate = False + return logger diff --git a/docs/development.md b/docs/development.md index a0a02f3..ca330d6 100644 --- a/docs/development.md +++ b/docs/development.md @@ -45,4 +45,25 @@ callbacks are necessary, then either a different plotting library or a custom implementation of some sort will likely be necessary. Would be nice to explore other options, including custom implementations of plots using d3 and then creating the shiny widget manually. With AI, that might be more achievable (for me at least) -than it would be otherwise b/c we can give the AI the docs for all three as context. 
\ No newline at end of file +than it would be otherwise b/c we can give the AI the docs for all three as context. + +There is both a general log and profiling log + +Both loggers write to stdout/stderr, Docker's awslogs driver captures everything into the shinyapp stream under /tfbpshiny/production. To later separate PROFILE lines from main log lines when parsing: + +```{python} +import pandas as pd + +df = pd.read_csv("exported.log", sep="|", header=None, skipinitialspace=True, + names=["marker","timestamp","elapsed_s","op","module","dataset","context"]) +profile = df[df["marker"].str.strip() == "PROFILE"] +``` + + +Or in CloudWatch Logs Insights: + +```{raw} +fields @timestamp, @message +| filter @message like /^PROFILE/ +| parse @message "PROFILE | * | * | * | * | * | *" as timestamp, elapsed_s, op, module, dataset, context +``` \ No newline at end of file diff --git a/production.yml b/production.yml index ede0f0d..44f2335 100644 --- a/production.yml +++ b/production.yml @@ -12,6 +12,8 @@ services: - HF_HOME=/hf-cache - HF_TOKEN - DOCKER_ENV + - TFBPSHINY_LOG_HANDLER=console + - TFBPSHINY_PROFILE_HANDLER=console volumes: - hf_cache:/hf-cache expose: @@ -62,7 +64,7 @@ services: driver: awslogs options: awslogs-group: /tfbpshiny/production - awslogs-region: us-east-1 + awslogs-region: us-east-2 awslogs-stream: traefik awslogs-create-group: "true" restart: unless-stopped diff --git a/tfbpshiny/__main__.py b/tfbpshiny/__main__.py index 3b4e78c..cb16872 100644 --- a/tfbpshiny/__main__.py +++ b/tfbpshiny/__main__.py @@ -38,6 +38,18 @@ def make_parser() -> argparse.ArgumentParser: choices=["console", "file"], help="Set log handler type", ) + parser.add_argument( + "--profile-handler", + type=str, + default="console", + choices=["console", "file"], + help="Handler for the profiler logger", + ) + parser.add_argument( + "--no-profile", + action="store_true", + help="Disable the profiler logger entirely", + ) subparsers = parser.add_subparsers(dest="command", 
required=True) @@ -78,6 +90,8 @@ def main() -> None: # convert to int when configuring the logger os.environ["TFBPSHINY_LOG_LEVEL"] = str(log_level.value) os.environ["TFBPSHINY_LOG_HANDLER"] = args.log_handler + os.environ["TFBPSHINY_PROFILE_HANDLER"] = args.profile_handler + os.environ["TFBPSHINY_PROFILE_ENABLED"] = "0" if args.no_profile else "1" args.func(args) diff --git a/tfbpshiny/app.py b/tfbpshiny/app.py index 60e4eed..7068c21 100644 --- a/tfbpshiny/app.py +++ b/tfbpshiny/app.py @@ -9,7 +9,7 @@ from dotenv import load_dotenv from shiny import App, reactive, render, ui -from configure_logger import configure_logger +from configure_logger import configure_logger, configure_profile_logger from tfbpshiny.components import github_badge, nav_button from tfbpshiny.modules.binding.server import ( binding_sidebar_server, @@ -60,6 +60,14 @@ log_file=log_file, ) +profile_logger = configure_profile_logger( + handler_type=cast( + Literal["console", "file"], + os.getenv("TFBPSHINY_PROFILE_HANDLER", "console"), + ), + enabled=os.getenv("TFBPSHINY_PROFILE_ENABLED", "1") == "1", +) + # instantiate the virtualDB and compute app-level dataset metadata virtualdb_config = os.getenv( @@ -67,7 +75,9 @@ ) hf_token: str | None = os.getenv("HF_TOKEN") logger.info(f"Loading VirtualDB with config: {virtualdb_config}") -vdb, app_datasets = initialize_data(virtualdb_config, hf_token) +vdb, app_datasets = initialize_data( + virtualdb_config, hf_token, profile_logger=profile_logger +) app_ui = ui.page_fillable( ui.include_css((Path(__file__).parent / "app.css").resolve()), @@ -121,6 +131,7 @@ def app_server(input: Any, output: Any, session: Any) -> None: dataset_filters=dataset_filters, vdb=vdb, logger=logger, + profile_logger=profile_logger, ) corr_type, col_preference = binding_sidebar_server( @@ -138,6 +149,7 @@ def app_server(input: Any, output: Any, session: Any) -> None: dataset_filters=dataset_filters, vdb=vdb, logger=logger, + profile_logger=profile_logger, ) corr_type_p, 
col_preference_p = perturbation_sidebar_server( @@ -155,6 +167,7 @@ def app_server(input: Any, output: Any, session: Any) -> None: dataset_filters=dataset_filters, vdb=vdb, logger=logger, + profile_logger=profile_logger, ) top_n, effect_threshold, pvalue_threshold, facet_by = comparison_sidebar_server( @@ -175,6 +188,7 @@ def app_server(input: Any, output: Any, session: Any) -> None: facet_by=facet_by, vdb=vdb, logger=logger, + profile_logger=profile_logger, ) # set the active module when a nav button is clicked diff --git a/tfbpshiny/modules/binding/server/workspace.py b/tfbpshiny/modules/binding/server/workspace.py index fcb731b..652b9b8 100644 --- a/tfbpshiny/modules/binding/server/workspace.py +++ b/tfbpshiny/modules/binding/server/workspace.py @@ -16,6 +16,7 @@ get_measurement_column, regulator_scatter_sql, ) +from tfbpshiny.utils.profiler import profile_span from tfbpshiny.utils.vdb_init import get_regulator_display_name @@ -30,6 +31,7 @@ def binding_workspace_server( dataset_filters: reactive.Value[dict[str, Any]], vdb: VirtualDB, logger: Logger, + profile_logger: Logger, ) -> None: """ Render the binding correlation rows: pairwise distributions @@ -107,17 +109,24 @@ def _all_corr_data() -> dict[tuple[str, str], pd.DataFrame]: logger.debug( f"Correlating {db_a}({col_a}) vs {db_b}({col_b}) ({method})" ) - result[(db_a, db_b)] = corr_pair_sql( - vdb, - db_a, - col_a, - filters.get(db_a), - db_b, - col_b, - filters.get(db_b), - method, - prefix=f"p{i}_", - ) + with profile_span( + profile_logger, + "vdb.query", + module="binding", + dataset=f"{db_a}x{db_b}", + context="_all_corr_data", + ): + result[(db_a, db_b)] = corr_pair_sql( + vdb, + db_a, + col_a, + filters.get(db_a), + db_b, + col_b, + filters.get(db_b), + method, + prefix=f"p{i}_", + ) except Exception as exc: logger.error( f"Failed to correlate {db_a} vs {db_b}: {exc}", exc_info=True @@ -249,14 +258,16 @@ def distributions_plot() -> ui.Tag: " });" "})();" ) - return ui.HTML( - to_html( - fig, - 
include_plotlyjs="cdn", - full_html=False, - post_script=post_script, + with profile_span( + profile_logger, + "plot.build", + module="binding", + context="distributions_plot", + ): + html = to_html( + fig, include_plotlyjs="cdn", full_html=False, post_script=post_script ) - ) + return ui.HTML(html) @render.ui def regulator_selector() -> ui.Tag: @@ -385,7 +396,13 @@ def _strip_reg(f: dict | None) -> dict | None: reg, idx, ) - merged = vdb.query(scatter_sql, **scatter_params) + with profile_span( + profile_logger, + "vdb.query", + module="binding", + context="scatter", + ): + merged = vdb.query(scatter_sql, **scatter_params) logger.debug( f"scatter {db_a}/{db_b} reg={reg!r} " f"rows={len(merged)} fa={fa!r} fb={fb!r}" @@ -447,9 +464,16 @@ def _strip_reg(f: dict | None) -> dict | None: width=400, height=400, ) + with profile_span( + profile_logger, + "plot.build", + module="binding", + context="regulator_plot", + ): + plot_html = to_html(fig, include_plotlyjs=False, full_html=False) plot_divs.append( ui.div( - ui.HTML(to_html(fig, include_plotlyjs=False, full_html=False)), + ui.HTML(plot_html), style="flex: 0 0 auto;", ) ) diff --git a/tfbpshiny/modules/comparison/server/workspace.py b/tfbpshiny/modules/comparison/server/workspace.py index f71eb0a..a872cb6 100644 --- a/tfbpshiny/modules/comparison/server/workspace.py +++ b/tfbpshiny/modules/comparison/server/workspace.py @@ -20,6 +20,7 @@ PERTURBATION_LABEL_MAP, topn_responsive_ratio, ) +from tfbpshiny.utils.profiler import profile_span from tfbpshiny.utils.vdb_init import get_regulator_display_name # color palettes @@ -70,6 +71,7 @@ def comparison_workspace_server( facet_by: Callable[[], str], vdb: VirtualDB, logger: Logger, + profile_logger: Logger, ) -> None: """Render the Top-N by Binding workspace plot.""" @@ -133,19 +135,26 @@ def _topn_data() -> pd.DataFrame: p_filters = ( None if p_cfg.get("hackett_time_filter") else filters.get(p_db) ) - result = topn_responsive_ratio( - vdb=vdb, - binding_view=b_db, - 
perturbation_view=p_db, - top_n=n, - effect_threshold=eff, - pvalue_threshold=pval, - binding_filters=filters.get(b_db), - perturbation_filters=p_filters, - param_prefix=f"{b_db}_{p_db}", - **b_cfg, - **p_cfg, - ) + with profile_span( + profile_logger, + "vdb.query", + module="comparison", + dataset=f"{b_db}x{p_db}", + context="_topn_data", + ): + result = topn_responsive_ratio( + vdb=vdb, + binding_view=b_db, + perturbation_view=p_db, + top_n=n, + effect_threshold=eff, + pvalue_threshold=pval, + binding_filters=filters.get(b_db), + perturbation_filters=p_filters, + param_prefix=f"{b_db}_{p_db}", + **b_cfg, + **p_cfg, + ) assert isinstance(result, pd.DataFrame) result["binding_source"] = b_label result["perturbation_source"] = p_label @@ -162,7 +171,13 @@ def _topn_data() -> pd.DataFrame: if not results: return pd.DataFrame() - out = pd.concat(results, ignore_index=True) + with profile_span( + profile_logger, + "df.concat", + module="comparison", + context="_topn_data", + ): + out = pd.concat(results, ignore_index=True) out["percent_responsive"] = out["responsive_ratio"] * 100 return out @@ -268,7 +283,14 @@ def topn_plot() -> ui.Tag: legend_title=legend_title, margin=dict(l=50, r=20, t=80, b=30), ) - return ui.HTML(to_html(fig, include_plotlyjs="cdn", full_html=False)) + with profile_span( + profile_logger, + "plot.build", + module="comparison", + context="topn_plot", + ): + html = to_html(fig, include_plotlyjs="cdn", full_html=False) + return ui.HTML(html) __all__ = ["comparison_workspace_server"] diff --git a/tfbpshiny/modules/perturbation/server/workspace.py b/tfbpshiny/modules/perturbation/server/workspace.py index d799979..5218b98 100644 --- a/tfbpshiny/modules/perturbation/server/workspace.py +++ b/tfbpshiny/modules/perturbation/server/workspace.py @@ -16,6 +16,7 @@ get_measurement_column, regulator_scatter_sql, ) +from tfbpshiny.utils.profiler import profile_span from tfbpshiny.utils.vdb_init import get_regulator_display_name @@ -30,6 +31,7 @@ def 
perturbation_workspace_server( dataset_filters: reactive.Value[dict[str, Any]], vdb: VirtualDB, logger: Logger, + profile_logger: Logger, ) -> None: """ Render the perturbation correlation rows: pairwise distributions @@ -105,17 +107,24 @@ def _all_corr_data() -> dict[tuple[str, str], pd.DataFrame]: logger.debug( f"Correlating {db_a}({col_a}) vs {db_b}({col_b}) ({method})" ) - result[(db_a, db_b)] = corr_pair_sql( - vdb, - db_a, - col_a, - filters.get(db_a), - db_b, - col_b, - filters.get(db_b), - method, - prefix=f"p{i}_", - ) + with profile_span( + profile_logger, + "vdb.query", + module="perturbation", + dataset=f"{db_a}x{db_b}", + context="_all_corr_data", + ): + result[(db_a, db_b)] = corr_pair_sql( + vdb, + db_a, + col_a, + filters.get(db_a), + db_b, + col_b, + filters.get(db_b), + method, + prefix=f"p{i}_", + ) except Exception as exc: logger.error( f"Failed to correlate {db_a} vs {db_b}: {exc}", exc_info=True @@ -247,14 +256,19 @@ def distributions_plot() -> ui.Tag: " });" "})();" ) - return ui.HTML( - to_html( + with profile_span( + profile_logger, + "plot.build", + module="perturbation", + context="distributions_plot", + ): + html = to_html( fig, include_plotlyjs="cdn", full_html=False, post_script=post_script, ) - ) + return ui.HTML(html) @render.ui def regulator_selector() -> ui.Tag: @@ -383,7 +397,13 @@ def _strip_reg(f: dict | None) -> dict | None: reg, idx, ) - merged = vdb.query(scatter_sql, **scatter_params) + with profile_span( + profile_logger, + "vdb.query", + module="perturbation", + context="scatter", + ): + merged = vdb.query(scatter_sql, **scatter_params) logger.debug( f"scatter {db_a}/{db_b} reg={reg!r} " f"rows={len(merged)} fa={fa!r} fb={fb!r}" @@ -445,9 +465,16 @@ def _strip_reg(f: dict | None) -> dict | None: width=400, height=400, ) + with profile_span( + profile_logger, + "plot.build", + module="perturbation", + context="regulator_plot", + ): + plot_html = to_html(fig, include_plotlyjs=False, full_html=False) plot_divs.append( ui.div( 
- ui.HTML(to_html(fig, include_plotlyjs=False, full_html=False)), + ui.HTML(plot_html), style="flex: 0 0 auto;", ) ) diff --git a/tfbpshiny/modules/select_datasets/server/workspace.py b/tfbpshiny/modules/select_datasets/server/workspace.py index 90042e8..43e7b8d 100644 --- a/tfbpshiny/modules/select_datasets/server/workspace.py +++ b/tfbpshiny/modules/select_datasets/server/workspace.py @@ -24,6 +24,7 @@ diagonal_cell_modal_ui, off_diagonal_cell_modal_ui, ) +from tfbpshiny.utils.profiler import profile_span from tfbpshiny.utils.ratelimit import debounce from tfbpshiny.utils.vdb_init import HIDDEN_FILTER_FIELDS @@ -38,6 +39,7 @@ def select_datasets_workspace_server( dataset_filters: reactive.Value[dict[str, Any]], vdb: VirtualDB, logger: Logger, + profile_logger: Logger, ) -> None: """Render the sample-count matrix for all active datasets.""" @@ -83,12 +85,26 @@ def _matrix_data() -> dict[str, Any]: db_filters = filters.get(db_name) sql, params = regulator_locus_tags_query(db_name, db_filters) - reg_df = vdb.query(sql, **params) + with profile_span( + profile_logger, + "vdb.query", + module="select_datasets", + dataset=db_name, + context="_matrix_data", + ): + reg_df = vdb.query(sql, **params) regulators = set(reg_df["regulator_locus_tag"].dropna().astype(str)) regulator_sets[db_name] = regulators sql, params = sample_count_query(db_name, db_filters) - n_samples = int(vdb.query(sql, **params).iloc[0, 0]) + with profile_span( + profile_logger, + "vdb.query", + module="select_datasets", + dataset=db_name, + context="_matrix_data", + ): + n_samples = int(vdb.query(sql, **params).iloc[0, 0]) diagonal[db_name] = {"regulators": len(regulators), "samples": n_samples} @@ -105,8 +121,22 @@ def _matrix_data() -> dict[str, Any]: sql_b, params_b = sample_count_query( db_b, filters.get(db_b), restrict_to_regulators=common_list ) - n_a = int(vdb.query(sql_a, **params_a).iloc[0, 0]) - n_b = int(vdb.query(sql_b, **params_b).iloc[0, 0]) + with profile_span( + profile_logger, + 
"vdb.query", + module="select_datasets", + dataset=f"{db_a}x{db_b}", + context="_matrix_data", + ): + n_a = int(vdb.query(sql_a, **params_a).iloc[0, 0]) + with profile_span( + profile_logger, + "vdb.query", + module="select_datasets", + dataset=f"{db_a}x{db_b}", + context="_matrix_data", + ): + n_b = int(vdb.query(sql_b, **params_b).iloc[0, 0]) cross_dataset[(db_a, db_b)] = { "common_regulators": len(common), diff --git a/tfbpshiny/utils/profiler.py b/tfbpshiny/utils/profiler.py new file mode 100644 index 0000000..91b0a30 --- /dev/null +++ b/tfbpshiny/utils/profiler.py @@ -0,0 +1,48 @@ +"""Timing context manager for the profiler logger.""" + +from __future__ import annotations + +import time +from collections.abc import Generator +from contextlib import contextmanager +from datetime import datetime, timezone +from logging import Logger + + +@contextmanager +def profile_span( + logger: Logger, + op: str, + module: str = "", + dataset: str = "", + context: str = "", +) -> Generator[None]: + """ + Time a block of code and emit one fixed-schema pipe-delimited record to *logger*. + + The record always has exactly seven pipe-separated columns so it can be read + without regex using ``pandas.read_csv(sep="|")``: + + :: + + PROFILE | timestamp | elapsed_s | op | module | dataset | context + + :param logger: The ``"profiler"`` logger. + :param op: Operation label, e.g. ``"vdb.query"`` or ``"plot.build"``. + :param module: Shiny module name, e.g. ``"binding"``. Empty for init-time spans. + :param dataset: Dataset(s) involved, e.g. ``"harbison"`` or ``"harbison x rossi"``. + :param context: Function or plot name providing location, e.g. ``"_all_corr_data"``. 
+ + """ + t0 = time.perf_counter() + try: + yield + finally: + elapsed = time.perf_counter() - t0 + ts = datetime.now(timezone.utc).isoformat(timespec="milliseconds") + logger.debug( + " | ".join(["PROFILE", ts, f"{elapsed:.4f}", op, module, dataset, context]) + ) + + +__all__ = ["profile_span"] diff --git a/tfbpshiny/utils/vdb_init.py b/tfbpshiny/utils/vdb_init.py index de62974..e177be1 100644 --- a/tfbpshiny/utils/vdb_init.py +++ b/tfbpshiny/utils/vdb_init.py @@ -2,11 +2,15 @@ from __future__ import annotations +import logging from dataclasses import dataclass +from logging import Logger import pandas as pd from labretriever import VirtualDB +from tfbpshiny.utils.profiler import profile_span + # Metadata fields to suppress from the filter UI, keyed by db_name. # Use "*" for fields hidden across all datasets; use the db_name key for # dataset-specific exclusions. The effective hidden set for a given dataset @@ -140,7 +144,9 @@ def _filter_hackett_views(vdb: VirtualDB) -> None: ) -def ensure_hackett_analysis_set(vdb: VirtualDB) -> None: +def ensure_hackett_analysis_set( + vdb: VirtualDB, profile_logger: Logger | None = None +) -> None: """ Build the ``hackett_analysis_set`` table and permanently filter the ``hackett_meta`` and ``hackett`` views to include only those samples. @@ -148,9 +154,12 @@ def ensure_hackett_analysis_set(vdb: VirtualDB) -> None: Safe to call multiple times; uses ``CREATE OR REPLACE``. :param vdb: The application VirtualDB instance. + :param profile_logger: Optional profiler logger; timing is skipped when ``None``. 
""" - vdb._conn.execute(_HACKETT_ANALYSIS_SET_SQL) + _pl = profile_logger or logging.getLogger("profiler") + with profile_span(_pl, "init.hackett_set"): + vdb._conn.execute(_HACKETT_ANALYSIS_SET_SQL) _filter_hackett_views(vdb) @@ -174,7 +183,9 @@ def ensure_hackett_analysis_set(vdb: VirtualDB) -> None: """ -def _build_regulator_display_names(vdb: VirtualDB) -> None: +def _build_regulator_display_names( + vdb: VirtualDB, profile_logger: Logger | None = None +) -> None: """ Build the ``regulator_display_names`` DuckDB table from all dataset meta views. @@ -184,6 +195,7 @@ def _build_regulator_display_names(vdb: VirtualDB) -> None: non-empty symbol different from the tag is present; otherwise it equals the tag. :param vdb: The application VirtualDB instance. + :param profile_logger: Optional profiler logger; timing is skipped when ``None``. """ db_names = [ @@ -201,12 +213,15 @@ def _build_regulator_display_names(vdb: VirtualDB) -> None: table=_REGULATOR_DISPLAY_NAME_TABLE, union_sql=union_sql, ) - vdb._conn.execute(sql) + _pl = profile_logger or logging.getLogger("profiler") + with profile_span(_pl, "init.regulator_table"): + vdb._conn.execute(sql) def get_regulator_display_name( vdb: VirtualDB, locus_tags: list[str] | None = None, + profile_logger: Logger | None = None, ) -> pd.DataFrame: """ Return a DataFrame of regulator display names from the pre-built lookup table. 
@@ -219,13 +234,22 @@ def get_regulator_display_name( :rtype: pandas.DataFrame """ - if locus_tags is None: - return vdb._conn.execute(f"SELECT * FROM {_REGULATOR_DISPLAY_NAME_TABLE}").df() - return vdb._conn.execute( - f"SELECT * FROM {_REGULATOR_DISPLAY_NAME_TABLE} " - f"WHERE regulator_locus_tag = ANY(?)", - [locus_tags], - ).df() + _pl = profile_logger or logging.getLogger("profiler") + with profile_span( + _pl, + "vdb.execute", + dataset=_REGULATOR_DISPLAY_NAME_TABLE, + context="get_regulator_display_name", + ): + if locus_tags is None: + return vdb._conn.execute( + f"SELECT * FROM {_REGULATOR_DISPLAY_NAME_TABLE}" + ).df() + return vdb._conn.execute( + f"SELECT * FROM {_REGULATOR_DISPLAY_NAME_TABLE} " + f"WHERE regulator_locus_tag = ANY(?)", + [locus_tags], + ).df() @dataclass @@ -252,19 +276,21 @@ class AppDatasets: def initialize_data( virtualdb_config: str, hf_token: str | None = None, + profile_logger: Logger | None = None, ) -> tuple[VirtualDB, AppDatasets]: """ Construct the VirtualDB, run one-time setup, and compute app-level dataset metadata. :param virtualdb_config: Path to the VirtualDB YAML config file. :param hf_token: Optional HuggingFace token for private repo access. + :param profile_logger: Optional profiler logger for timing instrumentation. :returns: Tuple of ``(vdb, app_datasets)``. 
:rtype: tuple[VirtualDB, AppDatasets] """ vdb = VirtualDB(virtualdb_config, token=hf_token) - ensure_hackett_analysis_set(vdb) - _build_regulator_display_names(vdb) + ensure_hackett_analysis_set(vdb, profile_logger=profile_logger) + _build_regulator_display_names(vdb, profile_logger=profile_logger) condition_cols: dict[str, list[str]] = {} upstream_cols: dict[str, list[str]] = {} From 09f863b9b9d9d1274b462431a4f3f0a35ad34144 Mon Sep 17 00:00:00 2001 From: chasem Date: Fri, 17 Apr 2026 16:09:48 -0500 Subject: [PATCH 2/7] adding session id to track unique visits --- tfbpshiny/app.py | 9 +++++ tfbpshiny/modules/binding/server/workspace.py | 5 +++ .../modules/comparison/server/workspace.py | 4 ++ .../modules/perturbation/server/workspace.py | 5 +++ .../select_datasets/server/workspace.py | 5 +++ tfbpshiny/utils/profiler.py | 39 +++++++++++++++++-- 6 files changed, 63 insertions(+), 4 deletions(-) diff --git a/tfbpshiny/app.py b/tfbpshiny/app.py index 7068c21..c73634f 100644 --- a/tfbpshiny/app.py +++ b/tfbpshiny/app.py @@ -41,6 +41,7 @@ selection_matrix_ui, selection_sidebar_ui, ) +from tfbpshiny.utils.profiler import log_session_event from tfbpshiny.utils.vdb_init import initialize_data if not os.getenv("DOCKER_ENV"): @@ -110,6 +111,10 @@ def app_server(input: Any, output: Any, session: Any) -> None: """Create shared reactive state and call all module servers.""" + sid: str = session.id + log_session_event(profile_logger, "START", sid) + session.on_ended(lambda: log_session_event(profile_logger, "END", sid)) + # this stores the name of the currently active module, ie # "home", "selection", "binding", "perturbation", or "comparison" active_module: reactive.Value[str] = reactive.value("home") @@ -132,6 +137,7 @@ def app_server(input: Any, output: Any, session: Any) -> None: vdb=vdb, logger=logger, profile_logger=profile_logger, + session_id=sid, ) corr_type, col_preference = binding_sidebar_server( @@ -150,6 +156,7 @@ def app_server(input: Any, output: Any, session: 
Any) -> None: vdb=vdb, logger=logger, profile_logger=profile_logger, + session_id=sid, ) corr_type_p, col_preference_p = perturbation_sidebar_server( @@ -168,6 +175,7 @@ def app_server(input: Any, output: Any, session: Any) -> None: vdb=vdb, logger=logger, profile_logger=profile_logger, + session_id=sid, ) top_n, effect_threshold, pvalue_threshold, facet_by = comparison_sidebar_server( @@ -189,6 +197,7 @@ def app_server(input: Any, output: Any, session: Any) -> None: vdb=vdb, logger=logger, profile_logger=profile_logger, + session_id=sid, ) # set the active module when a nav button is clicked diff --git a/tfbpshiny/modules/binding/server/workspace.py b/tfbpshiny/modules/binding/server/workspace.py index 652b9b8..9a8a5b3 100644 --- a/tfbpshiny/modules/binding/server/workspace.py +++ b/tfbpshiny/modules/binding/server/workspace.py @@ -32,6 +32,7 @@ def binding_workspace_server( vdb: VirtualDB, logger: Logger, profile_logger: Logger, + session_id: str = "", ) -> None: """ Render the binding correlation rows: pairwise distributions @@ -115,6 +116,7 @@ def _all_corr_data() -> dict[tuple[str, str], pd.DataFrame]: module="binding", dataset=f"{db_a}x{db_b}", context="_all_corr_data", + session_id=session_id, ): result[(db_a, db_b)] = corr_pair_sql( vdb, @@ -263,6 +265,7 @@ def distributions_plot() -> ui.Tag: "plot.build", module="binding", context="distributions_plot", + session_id=session_id, ): html = to_html( fig, include_plotlyjs="cdn", full_html=False, post_script=post_script @@ -401,6 +404,7 @@ def _strip_reg(f: dict | None) -> dict | None: "vdb.query", module="binding", context="scatter", + session_id=session_id, ): merged = vdb.query(scatter_sql, **scatter_params) logger.debug( @@ -469,6 +473,7 @@ def _strip_reg(f: dict | None) -> dict | None: "plot.build", module="binding", context="regulator_plot", + session_id=session_id, ): plot_html = to_html(fig, include_plotlyjs=False, full_html=False) plot_divs.append( diff --git 
a/tfbpshiny/modules/comparison/server/workspace.py b/tfbpshiny/modules/comparison/server/workspace.py index a872cb6..28e0c8d 100644 --- a/tfbpshiny/modules/comparison/server/workspace.py +++ b/tfbpshiny/modules/comparison/server/workspace.py @@ -72,6 +72,7 @@ def comparison_workspace_server( vdb: VirtualDB, logger: Logger, profile_logger: Logger, + session_id: str = "", ) -> None: """Render the Top-N by Binding workspace plot.""" @@ -141,6 +142,7 @@ def _topn_data() -> pd.DataFrame: module="comparison", dataset=f"{b_db}x{p_db}", context="_topn_data", + session_id=session_id, ): result = topn_responsive_ratio( vdb=vdb, @@ -176,6 +178,7 @@ def _topn_data() -> pd.DataFrame: "df.concat", module="comparison", context="_topn_data", + session_id=session_id, ): out = pd.concat(results, ignore_index=True) out["percent_responsive"] = out["responsive_ratio"] * 100 @@ -288,6 +291,7 @@ def topn_plot() -> ui.Tag: "plot.build", module="comparison", context="topn_plot", + session_id=session_id, ): html = to_html(fig, include_plotlyjs="cdn", full_html=False) return ui.HTML(html) diff --git a/tfbpshiny/modules/perturbation/server/workspace.py b/tfbpshiny/modules/perturbation/server/workspace.py index 5218b98..ebfc249 100644 --- a/tfbpshiny/modules/perturbation/server/workspace.py +++ b/tfbpshiny/modules/perturbation/server/workspace.py @@ -32,6 +32,7 @@ def perturbation_workspace_server( vdb: VirtualDB, logger: Logger, profile_logger: Logger, + session_id: str = "", ) -> None: """ Render the perturbation correlation rows: pairwise distributions @@ -113,6 +114,7 @@ def _all_corr_data() -> dict[tuple[str, str], pd.DataFrame]: module="perturbation", dataset=f"{db_a}x{db_b}", context="_all_corr_data", + session_id=session_id, ): result[(db_a, db_b)] = corr_pair_sql( vdb, @@ -261,6 +263,7 @@ def distributions_plot() -> ui.Tag: "plot.build", module="perturbation", context="distributions_plot", + session_id=session_id, ): html = to_html( fig, @@ -402,6 +405,7 @@ def _strip_reg(f: dict | 
None) -> dict | None: "vdb.query", module="perturbation", context="scatter", + session_id=session_id, ): merged = vdb.query(scatter_sql, **scatter_params) logger.debug( @@ -470,6 +474,7 @@ def _strip_reg(f: dict | None) -> dict | None: "plot.build", module="perturbation", context="regulator_plot", + session_id=session_id, ): plot_html = to_html(fig, include_plotlyjs=False, full_html=False) plot_divs.append( diff --git a/tfbpshiny/modules/select_datasets/server/workspace.py b/tfbpshiny/modules/select_datasets/server/workspace.py index 43e7b8d..14e05b4 100644 --- a/tfbpshiny/modules/select_datasets/server/workspace.py +++ b/tfbpshiny/modules/select_datasets/server/workspace.py @@ -40,6 +40,7 @@ def select_datasets_workspace_server( vdb: VirtualDB, logger: Logger, profile_logger: Logger, + session_id: str = "", ) -> None: """Render the sample-count matrix for all active datasets.""" @@ -91,6 +92,7 @@ def _matrix_data() -> dict[str, Any]: module="select_datasets", dataset=db_name, context="_matrix_data", + session_id=session_id, ): reg_df = vdb.query(sql, **params) regulators = set(reg_df["regulator_locus_tag"].dropna().astype(str)) @@ -103,6 +105,7 @@ def _matrix_data() -> dict[str, Any]: module="select_datasets", dataset=db_name, context="_matrix_data", + session_id=session_id, ): n_samples = int(vdb.query(sql, **params).iloc[0, 0]) @@ -127,6 +130,7 @@ def _matrix_data() -> dict[str, Any]: module="select_datasets", dataset=f"{db_a}x{db_b}", context="_matrix_data", + session_id=session_id, ): n_a = int(vdb.query(sql_a, **params_a).iloc[0, 0]) with profile_span( @@ -135,6 +139,7 @@ def _matrix_data() -> dict[str, Any]: module="select_datasets", dataset=f"{db_a}x{db_b}", context="_matrix_data", + session_id=session_id, ): n_b = int(vdb.query(sql_b, **params_b).iloc[0, 0]) diff --git a/tfbpshiny/utils/profiler.py b/tfbpshiny/utils/profiler.py index 91b0a30..ece6fc5 100644 --- a/tfbpshiny/utils/profiler.py +++ b/tfbpshiny/utils/profiler.py @@ -16,22 +16,25 @@ def 
profile_span( module: str = "", dataset: str = "", context: str = "", + session_id: str = "", ) -> Generator[None]: """ Time a block of code and emit one fixed-schema pipe-delimited record to *logger*. - The record always has exactly seven pipe-separated columns so it can be read + The record always has exactly eight pipe-separated columns so it can be read without regex using ``pandas.read_csv(sep="|")``: :: - PROFILE | timestamp | elapsed_s | op | module | dataset | context + PROFILE | timestamp | elapsed_s | op | module | dataset | context | session_id :param logger: The ``"profiler"`` logger. :param op: Operation label, e.g. ``"vdb.query"`` or ``"plot.build"``. :param module: Shiny module name, e.g. ``"binding"``. Empty for init-time spans. :param dataset: Dataset(s) involved, e.g. ``"harbison"`` or ``"harbison x rossi"``. :param context: Function or plot name providing location, e.g. ``"_all_corr_data"``. + :param session_id: Shiny session ID, used to correlate spans and count concurrent + users. Empty for init-time spans that run outside a session. """ t0 = time.perf_counter() @@ -41,8 +44,36 @@ def profile_span( elapsed = time.perf_counter() - t0 ts = datetime.now(timezone.utc).isoformat(timespec="milliseconds") logger.debug( - " | ".join(["PROFILE", ts, f"{elapsed:.4f}", op, module, dataset, context]) + " | ".join( + [ + "PROFILE", + ts, + f"{elapsed:.4f}", + op, + module, + dataset, + context, + session_id, + ] + ) ) -__all__ = ["profile_span"] +def log_session_event(logger: Logger, event: str, session_id: str) -> None: + """ + Emit a SESSION_START or SESSION_END record to *logger*. + + Record format (five pipe-separated columns):: + + SESSION | timestamp | event | session_id + + :param logger: The ``"profiler"`` logger. + :param event: ``"START"`` or ``"END"``. + :param session_id: Shiny session ID. 
+ + """ + ts = datetime.now(timezone.utc).isoformat(timespec="milliseconds") + logger.debug(" | ".join(["SESSION", ts, event, session_id])) + + +__all__ = ["profile_span", "log_session_event"] From 2f3b1bb5ada8b5696482f7395375641c78a7cdbb Mon Sep 17 00:00:00 2001 From: chasem Date: Fri, 17 Apr 2026 16:28:27 -0500 Subject: [PATCH 3/7] updating terraform to add cloudwatch dashboard --- terraform/cloudwatch.tf | 166 ++++++++++++++++++++++++++++++++++++++++ terraform/main.tf | 10 +++ 2 files changed, 176 insertions(+) create mode 100644 terraform/cloudwatch.tf diff --git a/terraform/cloudwatch.tf b/terraform/cloudwatch.tf new file mode 100644 index 0000000..1a234d0 --- /dev/null +++ b/terraform/cloudwatch.tf @@ -0,0 +1,166 @@ +locals { + log_group = "/tfbpshiny/production" + log_stream = "shinyapp" + namespace = "TFBPShiny" +} + +# --------------------------------------------------------------------------- +# Metric filters — extract numeric values from pipe-delimited log records +# --------------------------------------------------------------------------- + +# Extracts elapsed_s from every PROFILE record. +# Pattern captures the second numeric field after "PROFILE | | ". +resource "aws_cloudwatch_log_metric_filter" "profile_latency" { + name = "tfbpshiny-profile-latency" + log_group_name = local.log_group + pattern = "[marker=PROFILE, ts, elapsed_s, op, module, dataset, context, sid]" + + metric_transformation { + name = "ProfileLatencySeconds" + namespace = local.namespace + value = "$elapsed_s" + default_value = 0 + unit = "Seconds" + } +} + +# Counts SESSION START records — one per new user connection. 
+resource "aws_cloudwatch_log_metric_filter" "session_start" { + name = "tfbpshiny-session-start" + log_group_name = local.log_group + pattern = "[marker=SESSION, ts, event=START, sid]" + + metric_transformation { + name = "SessionStart" + namespace = local.namespace + value = "1" + default_value = 0 + unit = "Count" + } +} + +# --------------------------------------------------------------------------- +# Alarms +# --------------------------------------------------------------------------- + +# Fires when avg vdb query latency exceeds 30 s over a 15-minute window. +# Threshold is intentionally high while the app is known to be slow — +# lower it toward 5–10 s once query performance improves. +resource "aws_cloudwatch_metric_alarm" "high_latency" { + alarm_name = "tfbpshiny-high-latency" + alarm_description = "Avg PROFILE latency > 30 s over 15 min — investigate vdb query bottlenecks." + namespace = local.namespace + metric_name = "ProfileLatencySeconds" + statistic = "Average" + period = 900 # 15 minutes + evaluation_periods = 1 + threshold = 30 + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" +} + +# Fires when more than 20 sessions start within a 5-minute window. +# Adjust threshold based on observed normal traffic once the app is live. +resource "aws_cloudwatch_metric_alarm" "high_session_count" { + alarm_name = "tfbpshiny-high-session-count" + alarm_description = "More than 20 new sessions in 5 min — unexpected traffic spike." 
+ namespace = local.namespace + metric_name = "SessionStart" + statistic = "Sum" + period = 300 # 5 minutes + evaluation_periods = 1 + threshold = 20 + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" +} + +# --------------------------------------------------------------------------- +# Dashboard +# --------------------------------------------------------------------------- + +resource "aws_cloudwatch_dashboard" "tfbpshiny" { + dashboard_name = "tfbpshiny-production" + + dashboard_body = jsonencode({ + widgets = [ + { + type = "log" + x = 0 + y = 0 + width = 24 + height = 6 + properties = { + title = "Avg and max latency by operation" + region = var.aws_region + view = "table" + logGroupNames = [local.log_group] + queryString ="fields @timestamp, @message | filter @message like /^PROFILE/ | parse @message 'PROFILE | * | * | * | * | * | * | *' as ts, elapsed_s, op, module, dataset, context, sid | stats avg(elapsed_s) as avg_s, max(elapsed_s) as max_s, count() as n by op, module, context | sort avg_s desc" + } + }, + { + type = "log" + x = 0 + y = 6 + width = 12 + height = 6 + properties = { + title = "Concurrent sessions started (5-min bins)" + region = var.aws_region + view = "timeSeries" + logGroupNames = [local.log_group] + queryString ="fields @timestamp, @message | filter @message like /^SESSION/ | filter @message like /START/ | parse @message 'SESSION | * | * | *' as ts, event, sid | stats count(sid) as sessions_started by bin(5m) | sort @timestamp asc" + } + }, + { + type = "log" + x = 12 + y = 6 + width = 12 + height = 6 + properties = { + title = "Unique visitors per day" + region = var.aws_region + view = "table" + logGroupNames = [local.log_group] + queryString ="fields @timestamp, @message | filter @message like /^SESSION/ | filter @message like /START/ | parse @message 'SESSION | * | * | *' as ts, event, sid | stats count_distinct(sid) as unique_visitors by datefloor(@timestamp, 1d) | sort datefloor_timestamp asc" + } + 
}, + { + type = "log" + x = 0 + y = 12 + width = 24 + height = 6 + properties = { + title = "Latency by operation over time (5-min bins)" + region = var.aws_region + view = "timeSeries" + logGroupNames = [local.log_group] + queryString ="fields @timestamp, @message | filter @message like /^PROFILE/ | parse @message 'PROFILE | * | * | * | * | * | * | *' as ts, elapsed_s, op, module, dataset, context, sid | stats avg(elapsed_s) as avg_s by bin(5m), op | sort @timestamp asc" + } + }, + { + type = "alarm" + x = 0 + y = 18 + width = 12 + height = 3 + properties = { + title = "High latency alarm" + alarms = [aws_cloudwatch_metric_alarm.high_latency.arn] + } + }, + { + type = "alarm" + x = 12 + y = 18 + width = 12 + height = 3 + properties = { + title = "High session count alarm" + alarms = [aws_cloudwatch_metric_alarm.high_session_count.arn] + } + }, + ] + }) +} diff --git a/terraform/main.tf b/terraform/main.tf index 7c841aa..20cf7e6 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -102,6 +102,16 @@ resource "aws_instance" "tfbpshiny" { user_data = file("${path.module}/user_data.sh") + # NOTE: ami and user_data are pinned via ignore_changes. This means that + # if the Amazon Linux 2023 AMI is updated or user_data.sh is modified, + # terraform apply will NOT replace the instance. This is intentional — + # both changes force instance termination, which would take down production. + # To deploy a new AMI or updated user_data, terminate the instance manually, + # remove this lifecycle block, and run terraform apply to reprovision. 
+ lifecycle { + ignore_changes = [ami, user_data] + } + tags = { Name = "tfbpshiny-production" } From 8d18802889b1b9da024e73fc802777c908a0209d Mon Sep 17 00:00:00 2001 From: chasem Date: Fri, 17 Apr 2026 16:51:42 -0500 Subject: [PATCH 4/7] removing traefik from shiny log --- production.yml | 2 +- terraform/cloudwatch.tf | 152 +++++++++++++++++++++++++++++----------- 2 files changed, 114 insertions(+), 40 deletions(-) diff --git a/production.yml b/production.yml index 44f2335..2585d92 100644 --- a/production.yml +++ b/production.yml @@ -63,7 +63,7 @@ services: logging: driver: awslogs options: - awslogs-group: /tfbpshiny/production + awslogs-group: /tfbpshiny/production/traefik awslogs-region: us-east-2 awslogs-stream: traefik awslogs-create-group: "true" diff --git a/terraform/cloudwatch.tf b/terraform/cloudwatch.tf index 1a234d0..e0b3c0a 100644 --- a/terraform/cloudwatch.tf +++ b/terraform/cloudwatch.tf @@ -1,34 +1,79 @@ locals { - log_group = "/tfbpshiny/production" - log_stream = "shinyapp" - namespace = "TFBPShiny" + log_group_app = "/tfbpshiny/production" + log_group_traefik = "/tfbpshiny/production/traefik" + namespace = "TFBPShiny" +} + +# --------------------------------------------------------------------------- +# Log groups +# --------------------------------------------------------------------------- +# Declaring these explicitly so Terraform owns retention/lifecycle. If either +# already exists in the account, import it with: +# terraform import aws_cloudwatch_log_group.app /tfbpshiny/production +# terraform import aws_cloudwatch_log_group.traefik /tfbpshiny/production/traefik +# +# NOTE: After applying, update the Docker awslogs driver for the Traefik +# container to use awslogs-group=/tfbpshiny/production/traefik so its noisy +# network-discovery warnings stop polluting the Shiny app's log group. 
+ +resource "aws_cloudwatch_log_group" "app" { + name = local.log_group_app + retention_in_days = 30 +} + +resource "aws_cloudwatch_log_group" "traefik" { + name = local.log_group_traefik + retention_in_days = 14 } # --------------------------------------------------------------------------- # Metric filters — extract numeric values from pipe-delimited log records # --------------------------------------------------------------------------- +# Bracket-pattern filters tokenize on whitespace, so every pipe character is +# its own token (p1..p7). Field positions below reflect that. +# +# Record format emitted by the app: +# PROFILE | | | | | | | +# SESSION | | START|END | +# +# default_value is intentionally omitted. If it were set to 0, every minute +# without a PROFILE event would emit a zero, dragging the latency average +# toward zero and suppressing the alarm during low traffic. -# Extracts elapsed_s from every PROFILE record. -# Pattern captures the second numeric field after "PROFILE | | ". +# All PROFILE records — used for dashboard widgets that show latency across +# every operation. Not used for the vdb.query-specific alarm. resource "aws_cloudwatch_log_metric_filter" "profile_latency" { name = "tfbpshiny-profile-latency" - log_group_name = local.log_group - pattern = "[marker=PROFILE, ts, elapsed_s, op, module, dataset, context, sid]" + log_group_name = aws_cloudwatch_log_group.app.name + pattern = "[marker=PROFILE, p1, ts, p2, elapsed_s, p3, op, p4, module, p5, dataset, p6, context, p7, sid]" metric_transformation { - name = "ProfileLatencySeconds" - namespace = local.namespace - value = "$elapsed_s" - default_value = 0 - unit = "Seconds" + name = "ProfileLatencySeconds" + namespace = local.namespace + value = "$elapsed_s" + unit = "Seconds" + } +} + +# Narrow filter that only matches op=vdb.query. Drives the latency alarm. 
+resource "aws_cloudwatch_log_metric_filter" "vdb_query_latency" { + name = "tfbpshiny-vdb-query-latency" + log_group_name = aws_cloudwatch_log_group.app.name + pattern = "[marker=PROFILE, p1, ts, p2, elapsed_s, p3, op=vdb.query, p4, module, p5, dataset, p6, context, p7, sid]" + + metric_transformation { + name = "VdbQueryLatencySeconds" + namespace = local.namespace + value = "$elapsed_s" + unit = "Seconds" } } # Counts SESSION START records — one per new user connection. resource "aws_cloudwatch_log_metric_filter" "session_start" { name = "tfbpshiny-session-start" - log_group_name = local.log_group - pattern = "[marker=SESSION, ts, event=START, sid]" + log_group_name = aws_cloudwatch_log_group.app.name + pattern = "[marker=SESSION, p1, ts, p2, event=START, p3, sid]" metric_transformation { name = "SessionStart" @@ -42,19 +87,19 @@ resource "aws_cloudwatch_log_metric_filter" "session_start" { # --------------------------------------------------------------------------- # Alarms # --------------------------------------------------------------------------- - -# Fires when avg vdb query latency exceeds 30 s over a 15-minute window. -# Threshold is intentionally high while the app is known to be slow — -# lower it toward 5–10 s once query performance improves. -resource "aws_cloudwatch_metric_alarm" "high_latency" { - alarm_name = "tfbpshiny-high-latency" - alarm_description = "Avg PROFILE latency > 30 s over 15 min — investigate vdb query bottlenecks." +# Fires when avg vdb.query latency exceeds 5 s over 15 min (3 × 5-min periods). +# Raise the threshold to 10–30 s while the app is known to be slow; drop it +# toward 2–5 s once query performance improves. +resource "aws_cloudwatch_metric_alarm" "high_vdb_query_latency" { + alarm_name = "tfbpshiny-high-vdb-query-latency" + alarm_description = "Avg vdb.query latency > 5 s over 15 min — investigate vector DB bottlenecks." 
namespace = local.namespace - metric_name = "ProfileLatencySeconds" + metric_name = "VdbQueryLatencySeconds" statistic = "Average" - period = 900 # 15 minutes - evaluation_periods = 1 - threshold = 30 + period = 300 + evaluation_periods = 3 + datapoints_to_alarm = 3 + threshold = 5 comparison_operator = "GreaterThanThreshold" treat_missing_data = "notBreaching" } @@ -67,7 +112,7 @@ resource "aws_cloudwatch_metric_alarm" "high_session_count" { namespace = local.namespace metric_name = "SessionStart" statistic = "Sum" - period = 300 # 5 minutes + period = 300 evaluation_periods = 1 threshold = 20 comparison_operator = "GreaterThanThreshold" @@ -77,10 +122,13 @@ resource "aws_cloudwatch_metric_alarm" "high_session_count" { # --------------------------------------------------------------------------- # Dashboard # --------------------------------------------------------------------------- +# All Logs Insights queries target only the app log group (Traefik lives +# elsewhere now), so the PROFILE/SESSION filters aren't competing with +# reverse-proxy noise. Parse patterns use the exact pipe-with-spaces format +# the app emits; if you ever change that format, update these in lockstep. 
resource "aws_cloudwatch_dashboard" "tfbpshiny" { dashboard_name = "tfbpshiny-production" - dashboard_body = jsonencode({ widgets = [ { @@ -93,8 +141,14 @@ resource "aws_cloudwatch_dashboard" "tfbpshiny" { title = "Avg and max latency by operation" region = var.aws_region view = "table" - logGroupNames = [local.log_group] - queryString ="fields @timestamp, @message | filter @message like /^PROFILE/ | parse @message 'PROFILE | * | * | * | * | * | * | *' as ts, elapsed_s, op, module, dataset, context, sid | stats avg(elapsed_s) as avg_s, max(elapsed_s) as max_s, count() as n by op, module, context | sort avg_s desc" + logGroupNames = [local.log_group_app] + queryString = <<-EOT + fields @timestamp, @message + | filter @message like /^PROFILE \|/ + | parse @message "PROFILE | * | * | * | * | * | * | *" as ts, elapsed_s, op, module, dataset, context, sid + | stats avg(elapsed_s) as avg_s, max(elapsed_s) as max_s, count() as n by op, module, context + | sort avg_s desc + EOT } }, { @@ -104,11 +158,18 @@ resource "aws_cloudwatch_dashboard" "tfbpshiny" { width = 12 height = 6 properties = { - title = "Concurrent sessions started (5-min bins)" + title = "Sessions started (5-min bins)" region = var.aws_region view = "timeSeries" - logGroupNames = [local.log_group] - queryString ="fields @timestamp, @message | filter @message like /^SESSION/ | filter @message like /START/ | parse @message 'SESSION | * | * | *' as ts, event, sid | stats count(sid) as sessions_started by bin(5m) | sort @timestamp asc" + logGroupNames = [local.log_group_app] + queryString = <<-EOT + fields @timestamp, @message + | filter @message like /^SESSION \|/ + | parse @message "SESSION | * | * | *" as ts, event, sid + | filter event = "START" + | stats count(*) as sessions_started by bin(5m) + | sort @timestamp asc + EOT } }, { @@ -120,9 +181,16 @@ resource "aws_cloudwatch_dashboard" "tfbpshiny" { properties = { title = "Unique visitors per day" region = var.aws_region - view = "table" - logGroupNames = 
[local.log_group] - queryString ="fields @timestamp, @message | filter @message like /^SESSION/ | filter @message like /START/ | parse @message 'SESSION | * | * | *' as ts, event, sid | stats count_distinct(sid) as unique_visitors by datefloor(@timestamp, 1d) | sort datefloor_timestamp asc" + view = "bar" + logGroupNames = [local.log_group_app] + queryString = <<-EOT + fields @timestamp, @message + | filter @message like /^SESSION \|/ + | parse @message "SESSION | * | * | *" as ts, event, sid + | filter event = "START" + | stats count_distinct(sid) as unique_visitors by bin(1d) + | sort @timestamp asc + EOT } }, { @@ -135,8 +203,14 @@ resource "aws_cloudwatch_dashboard" "tfbpshiny" { title = "Latency by operation over time (5-min bins)" region = var.aws_region view = "timeSeries" - logGroupNames = [local.log_group] - queryString ="fields @timestamp, @message | filter @message like /^PROFILE/ | parse @message 'PROFILE | * | * | * | * | * | * | *' as ts, elapsed_s, op, module, dataset, context, sid | stats avg(elapsed_s) as avg_s by bin(5m), op | sort @timestamp asc" + logGroupNames = [local.log_group_app] + queryString = <<-EOT + fields @timestamp, @message + | filter @message like /^PROFILE \|/ + | parse @message "PROFILE | * | * | * | * | * | * | *" as ts, elapsed_s, op, module, dataset, context, sid + | stats avg(elapsed_s) as avg_s by bin(5m), op + | sort @timestamp asc + EOT } }, { @@ -146,8 +220,8 @@ resource "aws_cloudwatch_dashboard" "tfbpshiny" { width = 12 height = 3 properties = { - title = "High latency alarm" - alarms = [aws_cloudwatch_metric_alarm.high_latency.arn] + title = "vdb.query latency alarm" + alarms = [aws_cloudwatch_metric_alarm.high_vdb_query_latency.arn] } }, { From 39c27679bf72f2d04fd4e7b3651ed33ed49ce9d5 Mon Sep 17 00:00:00 2001 From: chasem Date: Fri, 17 Apr 2026 17:00:09 -0500 Subject: [PATCH 5/7] trying to get cloudwatch dashboard working --- terraform/cloudwatch.tf | 76 ++++++++++++++----------------------- 
tfbpshiny/utils/profiler.py | 54 +++++++++++++++----------- 2 files changed, 61 insertions(+), 69 deletions(-) diff --git a/terraform/cloudwatch.tf b/terraform/cloudwatch.tf index e0b3c0a..78af86e 100644 --- a/terraform/cloudwatch.tf +++ b/terraform/cloudwatch.tf @@ -11,10 +11,6 @@ locals { # already exists in the account, import it with: # terraform import aws_cloudwatch_log_group.app /tfbpshiny/production # terraform import aws_cloudwatch_log_group.traefik /tfbpshiny/production/traefik -# -# NOTE: After applying, update the Docker awslogs driver for the Traefik -# container to use awslogs-group=/tfbpshiny/production/traefik so its noisy -# network-discovery warnings stop polluting the Shiny app's log group. resource "aws_cloudwatch_log_group" "app" { name = local.log_group_app @@ -27,44 +23,41 @@ resource "aws_cloudwatch_log_group" "traefik" { } # --------------------------------------------------------------------------- -# Metric filters — extract numeric values from pipe-delimited log records +# Metric filters — JSON format, no tokenization ambiguity # --------------------------------------------------------------------------- -# Bracket-pattern filters tokenize on whitespace, so every pipe character is -# its own token (p1..p7). Field positions below reflect that. +# Records emitted by the app are single-line JSON objects: +# PROFILE: {"event":"PROFILE","ts":"...","elapsed_s":0.1234,"op":"...","module":"...","dataset":"...","context":"...","session_id":"..."} +# SESSION: {"event":"SESSION","ts":"...","lifecycle":"START"|"END","session_id":"..."} # -# Record format emitted by the app: -# PROFILE | | | | | | | -# SESSION | | START|END | +# CloudWatch JSON metric filters match on field values and extract numeric +# fields directly, with no whitespace-tokenization issues. # -# default_value is intentionally omitted. 
If it were set to 0, every minute -# without a PROFILE event would emit a zero, dragging the latency average -# toward zero and suppressing the alarm during low traffic. +# default_value is intentionally omitted on latency filters. Setting it to 0 +# would drag the average toward zero during quiet periods and suppress alarms. -# All PROFILE records — used for dashboard widgets that show latency across -# every operation. Not used for the vdb.query-specific alarm. resource "aws_cloudwatch_log_metric_filter" "profile_latency" { name = "tfbpshiny-profile-latency" log_group_name = aws_cloudwatch_log_group.app.name - pattern = "[marker=PROFILE, p1, ts, p2, elapsed_s, p3, op, p4, module, p5, dataset, p6, context, p7, sid]" + pattern = "{ $.event = \"PROFILE\" }" metric_transformation { name = "ProfileLatencySeconds" namespace = local.namespace - value = "$elapsed_s" + value = "$.elapsed_s" unit = "Seconds" } } -# Narrow filter that only matches op=vdb.query. Drives the latency alarm. +# Narrow filter for vdb.query only — drives the latency alarm. 
resource "aws_cloudwatch_log_metric_filter" "vdb_query_latency" { name = "tfbpshiny-vdb-query-latency" log_group_name = aws_cloudwatch_log_group.app.name - pattern = "[marker=PROFILE, p1, ts, p2, elapsed_s, p3, op=vdb.query, p4, module, p5, dataset, p6, context, p7, sid]" + pattern = "{ $.event = \"PROFILE\" && $.op = \"vdb.query\" }" metric_transformation { name = "VdbQueryLatencySeconds" namespace = local.namespace - value = "$elapsed_s" + value = "$.elapsed_s" unit = "Seconds" } } @@ -73,7 +66,7 @@ resource "aws_cloudwatch_log_metric_filter" "vdb_query_latency" { resource "aws_cloudwatch_log_metric_filter" "session_start" { name = "tfbpshiny-session-start" log_group_name = aws_cloudwatch_log_group.app.name - pattern = "[marker=SESSION, p1, ts, p2, event=START, p3, sid]" + pattern = "{ $.event = \"SESSION\" && $.lifecycle = \"START\" }" metric_transformation { name = "SessionStart" @@ -87,25 +80,22 @@ resource "aws_cloudwatch_log_metric_filter" "session_start" { # --------------------------------------------------------------------------- # Alarms # --------------------------------------------------------------------------- -# Fires when avg vdb.query latency exceeds 5 s over 15 min (3 × 5-min periods). -# Raise the threshold to 10–30 s while the app is known to be slow; drop it -# toward 2–5 s once query performance improves. +# Raise threshold while the app is known to be slow; lower toward 2-5 s +# once query performance improves. resource "aws_cloudwatch_metric_alarm" "high_vdb_query_latency" { alarm_name = "tfbpshiny-high-vdb-query-latency" - alarm_description = "Avg vdb.query latency > 5 s over 15 min — investigate vector DB bottlenecks." + alarm_description = "Avg vdb.query latency > 30 s over 15 min — investigate query bottlenecks." 
namespace = local.namespace metric_name = "VdbQueryLatencySeconds" statistic = "Average" period = 300 evaluation_periods = 3 datapoints_to_alarm = 3 - threshold = 5 + threshold = 30 comparison_operator = "GreaterThanThreshold" treat_missing_data = "notBreaching" } -# Fires when more than 20 sessions start within a 5-minute window. -# Adjust threshold based on observed normal traffic once the app is live. resource "aws_cloudwatch_metric_alarm" "high_session_count" { alarm_name = "tfbpshiny-high-session-count" alarm_description = "More than 20 new sessions in 5 min — unexpected traffic spike." @@ -122,10 +112,8 @@ resource "aws_cloudwatch_metric_alarm" "high_session_count" { # --------------------------------------------------------------------------- # Dashboard # --------------------------------------------------------------------------- -# All Logs Insights queries target only the app log group (Traefik lives -# elsewhere now), so the PROFILE/SESSION filters aren't competing with -# reverse-proxy noise. Parse patterns use the exact pipe-with-spaces format -# the app emits; if you ever change that format, update these in lockstep. +# Logs Insights auto-discovers JSON fields, so no parse directive needed. +# Queries filter on $.event to separate PROFILE from SESSION records. 
resource "aws_cloudwatch_dashboard" "tfbpshiny" { dashboard_name = "tfbpshiny-production" @@ -143,9 +131,8 @@ resource "aws_cloudwatch_dashboard" "tfbpshiny" { view = "table" logGroupNames = [local.log_group_app] queryString = <<-EOT - fields @timestamp, @message - | filter @message like /^PROFILE \|/ - | parse @message "PROFILE | * | * | * | * | * | * | *" as ts, elapsed_s, op, module, dataset, context, sid + fields @timestamp, op, module, dataset, context, elapsed_s + | filter event = "PROFILE" | stats avg(elapsed_s) as avg_s, max(elapsed_s) as max_s, count() as n by op, module, context | sort avg_s desc EOT @@ -163,10 +150,8 @@ resource "aws_cloudwatch_dashboard" "tfbpshiny" { view = "timeSeries" logGroupNames = [local.log_group_app] queryString = <<-EOT - fields @timestamp, @message - | filter @message like /^SESSION \|/ - | parse @message "SESSION | * | * | *" as ts, event, sid - | filter event = "START" + fields @timestamp, session_id + | filter event = "SESSION" and lifecycle = "START" | stats count(*) as sessions_started by bin(5m) | sort @timestamp asc EOT @@ -184,11 +169,9 @@ resource "aws_cloudwatch_dashboard" "tfbpshiny" { view = "bar" logGroupNames = [local.log_group_app] queryString = <<-EOT - fields @timestamp, @message - | filter @message like /^SESSION \|/ - | parse @message "SESSION | * | * | *" as ts, event, sid - | filter event = "START" - | stats count_distinct(sid) as unique_visitors by bin(1d) + fields @timestamp, session_id + | filter event = "SESSION" and lifecycle = "START" + | stats count_distinct(session_id) as unique_visitors by bin(1d) | sort @timestamp asc EOT } @@ -205,9 +188,8 @@ resource "aws_cloudwatch_dashboard" "tfbpshiny" { view = "timeSeries" logGroupNames = [local.log_group_app] queryString = <<-EOT - fields @timestamp, @message - | filter @message like /^PROFILE \|/ - | parse @message "PROFILE | * | * | * | * | * | * | *" as ts, elapsed_s, op, module, dataset, context, sid + fields @timestamp, op, elapsed_s + | filter event 
= "PROFILE"
+                | stats avg(elapsed_s) as avg_s by bin(5m), op
+                | sort @timestamp asc
+              EOT
diff --git a/tfbpshiny/utils/profiler.py b/tfbpshiny/utils/profiler.py
index ece6fc5..fa971c3 100644
--- a/tfbpshiny/utils/profiler.py
+++ b/tfbpshiny/utils/profiler.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import json
 import time
 from collections.abc import Generator
 from contextlib import contextmanager
@@ -19,19 +20,18 @@ def profile_span(
     session_id: str = "",
 ) -> Generator[None]:
     """
-    Time a block of code and emit one fixed-schema pipe-delimited record to *logger*.
+    Time a block of code and emit one structured JSON record to *logger*.
 
-    The record always has exactly eight pipe-separated columns so it can be read
-    without regex using ``pandas.read_csv(sep="|")``:
+    Each record has the following keys::
 
-    ::
-
-        PROFILE | timestamp | elapsed_s | op | module | dataset | context | session_id
+        {"event": "PROFILE", "ts": "...", "elapsed_s": 0.1234,
+         "op": "...", "module": "...", "dataset": "...",
+         "context": "...", "session_id": "..."}
 
     :param logger: The ``"profiler"`` logger.
     :param op: Operation label, e.g. ``"vdb.query"`` or ``"plot.build"``.
    :param module: Shiny module name, e.g. ``"binding"``. Empty for init-time spans.
-    :param dataset: Dataset(s) involved, e.g. ``"harbison"`` or ``"harbison x rossi"``.
+    :param dataset: Dataset(s) involved, e.g. ``"harbisonxrossi"``.
     :param context: Function or plot name providing location, e.g. ``"_all_corr_data"``.
     :param session_id: Shiny session ID, used to correlate spans and count concurrent
        users. Empty for init-time spans that run outside a session. 
@@ -44,28 +44,29 @@ def profile_span( elapsed = time.perf_counter() - t0 ts = datetime.now(timezone.utc).isoformat(timespec="milliseconds") logger.debug( - " | ".join( - [ - "PROFILE", - ts, - f"{elapsed:.4f}", - op, - module, - dataset, - context, - session_id, - ] + json.dumps( + { + "event": "PROFILE", + "ts": ts, + "elapsed_s": round(elapsed, 4), + "op": op, + "module": module, + "dataset": dataset, + "context": context, + "session_id": session_id, + } ) ) def log_session_event(logger: Logger, event: str, session_id: str) -> None: """ - Emit a SESSION_START or SESSION_END record to *logger*. + Emit a session lifecycle record to *logger*. - Record format (five pipe-separated columns):: + Each record has the following keys:: - SESSION | timestamp | event | session_id + {"event": "SESSION", "ts": "...", + "lifecycle": "START"|"END", "session_id": "..."} :param logger: The ``"profiler"`` logger. :param event: ``"START"`` or ``"END"``. @@ -73,7 +74,16 @@ def log_session_event(logger: Logger, event: str, session_id: str) -> None: """ ts = datetime.now(timezone.utc).isoformat(timespec="milliseconds") - logger.debug(" | ".join(["SESSION", ts, event, session_id])) + logger.debug( + json.dumps( + { + "event": "SESSION", + "ts": ts, + "lifecycle": event, + "session_id": session_id, + } + ) + ) __all__ = ["profile_span", "log_session_event"] From 0e6af754bb7cf8e4d03290c74a41fe3ec29b24a5 Mon Sep 17 00:00:00 2001 From: chasem Date: Fri, 17 Apr 2026 17:12:47 -0500 Subject: [PATCH 6/7] giving up and did cloudwatch dashboard by hand --- terraform/cloudwatch.tf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/terraform/cloudwatch.tf b/terraform/cloudwatch.tf index 78af86e..1b79dde 100644 --- a/terraform/cloudwatch.tf +++ b/terraform/cloudwatch.tf @@ -1,3 +1,7 @@ +# NOTE! THIS isn't working to create the dashboard. 
But I could paste in queries +# by hand to each panel and that did work so the logging is producing correct +# values + locals { log_group_app = "/tfbpshiny/production" log_group_traefik = "/tfbpshiny/production/traefik" From 91eb249beab6488203b7e6a593507749f490467c Mon Sep 17 00:00:00 2001 From: chasem Date: Tue, 21 Apr 2026 09:30:55 -0500 Subject: [PATCH 7/7] sorting imports --- tfbpshiny/modules/binding/server/workspace.py | 1 - tfbpshiny/modules/perturbation/server/workspace.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tfbpshiny/modules/binding/server/workspace.py b/tfbpshiny/modules/binding/server/workspace.py index 5e7db1a..4d6f5d3 100644 --- a/tfbpshiny/modules/binding/server/workspace.py +++ b/tfbpshiny/modules/binding/server/workspace.py @@ -17,7 +17,6 @@ get_measurement_column, regulator_scatter_sql, ) - from tfbpshiny.utils.profiler import profile_span from tfbpshiny.utils.sample_conditions import fetch_sample_condition_map from tfbpshiny.utils.vdb_init import AppDatasets, get_regulator_display_name diff --git a/tfbpshiny/modules/perturbation/server/workspace.py b/tfbpshiny/modules/perturbation/server/workspace.py index f24eafe..e7f9baa 100644 --- a/tfbpshiny/modules/perturbation/server/workspace.py +++ b/tfbpshiny/modules/perturbation/server/workspace.py @@ -17,7 +17,6 @@ get_measurement_column, regulator_scatter_sql, ) - from tfbpshiny.utils.profiler import profile_span from tfbpshiny.utils.sample_conditions import fetch_sample_condition_map from tfbpshiny.utils.vdb_init import AppDatasets, get_regulator_display_name