From db924d1016dd9ba3fe88dadc9936572a4cfebabd Mon Sep 17 00:00:00 2001
From: Jeff <jeffzhu6969@gmail.com>
Date: Tue, 7 Apr 2026 14:12:46 +0200
Subject: [PATCH 1/3] Merge docker.sh and docker_original.sh into single
 startup script

Embed live_sync integration directly in the master_template.yml
docker_cln_sh template so each client startup kit produces a single
docker.sh with all flags.  _injectLiveSyncIntoStartupKits.sh now only
copies the helper files (sync.conf, build_heartbeat.sh, live_sync.sh)
instead of creating a wrapper that delegates to docker_original.sh.

Live sync auto-starts for --local_training (foreground, killed on exit)
and --start_client (nohup daemon).  All other modes are unchanged.
If live_sync.sh is not present the hooks are a graceful no-op.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 docker_config/master_template.yml             |  54 +++++++++
 .../build/_injectLiveSyncIntoStartupKits.sh   | 112 +-----------------
 2 files changed, 58 insertions(+), 108 deletions(-)

diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml
index be980911..d04b7ed5 100644
--- a/docker_config/master_template.yml
+++ b/docker_config/master_template.yml
@@ -803,6 +803,57 @@ docker_cln_sh: |
      ENV_VARS+=" --env LOG_DATASET_DETAILS=1"
   fi
 
+  # ── Live Sync Integration ──────────────────────────────────────────
+  # live_sync.sh is co-located in the startup directory (injected by
+  # _injectLiveSyncIntoStartupKits.sh) and syncs training artifacts to
+  # the central monitoring server for --local_training and --start_client
+  # modes.  If live_sync.sh is not present the functions are a no-op.
+  KIT_ROOT="$(cd "$DIR/.." && pwd)"  # absolute path of the directory above $DIR
+  SYNC_STATE_DIR="$DIR/.mediswarm_sync"  # holds the swarm PID file and the daemon log
+  SITE_NAME_RESOLVED="{~~client_name~~}"  # template placeholder; presumably replaced when the kit is generated
+
+  _start_live_sync() {  # usage: _start_live_sync local|swarm
+    local mode="$1"
+    if [ ! -f "$DIR/live_sync.sh" ]; then return; fi  # helper not installed: do nothing
+    mkdir -p "$SYNC_STATE_DIR"
+
+    if [ "$mode" = "local" ]; then  # background job of this shell; PID saved for _stop_live_sync
+      "$DIR/live_sync.sh" \
+        --mode local \
+        --site-name "$SITE_NAME_RESOLVED" \
+        --kit-root "$KIT_ROOT" \
+        --startup-dir "$DIR" \
+        --scratch-dir "${MY_SCRATCH_DIR:-}" &
+      LIVE_SYNC_PID=$!
+    elif [ "$mode" = "swarm" ]; then  # detached nohup daemon, guarded by a PID file
+      local pid_file="$SYNC_STATE_DIR/swarm_sync.pid"
+      if [ -f "$pid_file" ]; then
+        local old_pid
+        old_pid="$(cat "$pid_file" 2>/dev/null || true)"
+        if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then  # a process with the recorded PID exists
+          echo "Live sync daemon already running (PID $old_pid)"
+          return 0
+        fi
+      fi
+      nohup "$DIR/live_sync.sh" \
+        --mode swarm \
+        --site-name "$SITE_NAME_RESOLVED" \
+        --kit-root "$KIT_ROOT" \
+        --startup-dir "$DIR" \
+        --scratch-dir "${MY_SCRATCH_DIR:-}" \
+        > "$SYNC_STATE_DIR/live_sync_daemon.log" 2>&1 < /dev/null &
+      echo $! > "$pid_file"  # record the daemon PID for the next invocation's liveness check
+      echo "Started live sync daemon (PID $(cat "$pid_file"))"
+    fi
+  }
+
+  _stop_live_sync() {  # stop the local-mode sync job if LIVE_SYNC_PID is set and still alive
+    if [ -n "${LIVE_SYNC_PID:-}" ] && kill -0 "$LIVE_SYNC_PID" 2>/dev/null; then
+      kill "$LIVE_SYNC_PID" || true
+      wait "$LIVE_SYNC_PID" || true  # reap the job; its exit status is deliberately ignored
+    fi
+  }
+
   # Execution modes
   if [ -n "$DUMMY_TRAINING" ]; then
       docker run --rm $TTY_OPT $DOCKER_OPTIONS $ENV_VARS --env TRAINING_MODE=local_training $DOCKER_IMAGE \
@@ -815,10 +866,13 @@ docker_cln_sh: |
 
   elif [ -n "$LOCAL_TRAINING" ]; then
       echo "[INFO] Local training using job: $JOB_NAME"
+      trap _stop_live_sync EXIT INT TERM
+      _start_live_sync local
       docker run --rm $TTY_OPT $DOCKER_OPTIONS $ENV_VARS --env TRAINING_MODE=local_training --env NUM_EPOCHS=100 $DOCKER_IMAGE \
       /bin/bash -c "/MediSwarm/application/jobs/${JOB_NAME}/app/custom/main.py"
 
   elif [ -n "$START_CLIENT" ]; then
+      _start_live_sync swarm
       docker run -d -t --restart=on-failure:5 \
       --health-cmd="nvidia-smi > /dev/null 2>&1 || exit 1" \
       --health-interval=120s --health-start-period=180s --health-retries=3 \
diff --git a/scripts/build/_injectLiveSyncIntoStartupKits.sh b/scripts/build/_injectLiveSyncIntoStartupKits.sh
index 4ad1a17f..94d5c6a5 100755
--- a/scripts/build/_injectLiveSyncIntoStartupKits.sh
+++ b/scripts/build/_injectLiveSyncIntoStartupKits.sh
@@ -37,116 +37,12 @@ find "$TARGET_FOLDER" -mindepth 1 -maxdepth 1 -type d | while read -r KIT_DIR; d
 
   chmod +x "$STARTUP_DIR/build_heartbeat.sh" "$STARTUP_DIR/live_sync.sh"
 
-  if [ ! -f "$STARTUP_DIR/docker_original.sh" ]; then
-    mv "$ORIGINAL_DOCKER_SH" "$STARTUP_DIR/docker_original.sh"
-    chmod +x "$STARTUP_DIR/docker_original.sh"
+  # Clean up legacy docker_original.sh wrapper if present from a previous build
+  if [ -f "$STARTUP_DIR/docker_original.sh" ]; then
+    rm -f "$STARTUP_DIR/docker_original.sh"
   fi
 
-  cat > "$STARTUP_DIR/docker.sh" <<'EOF'
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-STARTUP_DIR="$SCRIPT_DIR"
-KIT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
-ORIGINAL="$SCRIPT_DIR/docker_original.sh"
-STATE_DIR="$STARTUP_DIR/.mediswarm_sync"
-mkdir -p "$STATE_DIR"
-
-MODE=""
-SCRATCHDIR=""
-SITE_NAME_FROM_ARGS=""
-
-parse_args() {
-  while [ "$#" -gt 0 ]; do
-    case "$1" in
-      --local_training)
-        MODE="local"
-        shift
-        ;;
-      --start_client)
-        MODE="swarm"
-        shift
-        ;;
-      --scratch_dir)
-        SCRATCHDIR="${2:-}"
-        shift 2
-        ;;
-      --site_name)
-        SITE_NAME_FROM_ARGS="${2:-}"
-        shift 2
-        ;;
-      *)
-        shift
-        ;;
-    esac
-  done
-}
-
-parse_args "$@"
-
-SITE_NAME_FALLBACK="$(basename "$KIT_ROOT")"
-if [ -n "$SITE_NAME_FROM_ARGS" ]; then
-  export SITE_NAME="$SITE_NAME_FROM_ARGS"
-elif [ -z "${SITE_NAME:-}" ]; then
-  export SITE_NAME="$SITE_NAME_FALLBACK"
-fi
-
-start_local_sync() {
-  "$SCRIPT_DIR/live_sync.sh" \
-    --mode local \
-    --site-name "$SITE_NAME" \
-    --kit-root "$KIT_ROOT" \
-    --startup-dir "$STARTUP_DIR" \
-    --scratch-dir "${SCRATCHDIR:-}" &
-  SYNC_PID=$!
-}
-
-stop_local_sync() {
-  if [ -n "${SYNC_PID:-}" ] && kill -0 "$SYNC_PID" >/dev/null 2>&1; then
-    kill "$SYNC_PID" || true
-    wait "$SYNC_PID" || true
-  fi
-}
-
-start_swarm_sync_daemon() {
-  local pid_file="$STATE_DIR/swarm_sync.pid"
-
-  if [ -f "$pid_file" ]; then
-    old_pid="$(cat "$pid_file" 2>/dev/null || true)"
-    if [ -n "$old_pid" ] && kill -0 "$old_pid" >/dev/null 2>&1; then
-      echo "Live sync daemon already running with PID $old_pid"
-      return 0
-    fi
-  fi
-
-  nohup "$SCRIPT_DIR/live_sync.sh" \
-    --mode swarm \
-    --site-name "$SITE_NAME" \
-    --kit-root "$KIT_ROOT" \
-    --startup-dir "$STARTUP_DIR" \
-    --scratch-dir "${SCRATCHDIR:-}" \
-    > "$STATE_DIR/live_sync_daemon.log" 2>&1 < /dev/null &
-
-  echo $! > "$pid_file"
-  echo "Started live sync daemon with PID $(cat "$pid_file")"
-}
-
-if [ "$MODE" = "local" ]; then
-  trap stop_local_sync EXIT INT TERM
-  start_local_sync
-  exec "$ORIGINAL" "$@"
-elif [ "$MODE" = "swarm" ]; then
-  start_swarm_sync_daemon
-  exec "$ORIGINAL" "$@"
-else
-  exec "$ORIGINAL" "$@"
-fi
-EOF
-
-  chmod +x "$STARTUP_DIR/docker.sh"
-
-  echo "Patched $STARTUP_DIR/docker.sh"
+  echo "Injected live sync helpers into $STARTUP_DIR"
 done
 
 echo "Live sync injection finished"
\ No newline at end of file

From 888d341e6da00442398fac559f4d33a855a1e40b Mon Sep 17 00:00:00 2001
From: Jeff <jeffzhu6969@gmail.com>
Date: Tue, 7 Apr 2026 15:11:07 +0200
Subject: [PATCH 2/3] Enhance live monitor and live_sync: status inference,
 filters, downloads, version tracking

- server_tools/app.py: Major overhaul of the MediSwarm Live Monitor webviewer
  - Add filter bar (site, mode, status, job_id) with default sort by newest
  - Add status inference (stale >5min, finished >1hr, heartbeat_final.json wins)
  - Add file download endpoint for all run artifacts
  - Add job grouping for swarm runs
  - Add kit version column from heartbeat data
  - Add training summary extraction (best val metrics, epoch count, FL rounds)
  - Add TensorBoard metric parsing and inline charts via tbparse
  - Add enriched detail page with full file inventory, checkpoints, models cards
  - Add stats bar with running/finished/stale/site counts
  - Add server-side file paths with download buttons

- kit_live_sync/build_heartbeat.sh: Add kit_version field extracted from docker.sh
  MEDISWARM_VERSION baked in at build time

- kit_live_sync/live_sync.sh: Fix duplicate entries and empty heartbeat fields
  - Export SCRATCHDIR before calling build_heartbeat.sh so run_dir is populated
  - Track current run and finalize old runs with heartbeat_final.json when a new
    local training run starts (prevents stale "running" entries)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 kit_live_sync/build_heartbeat.sh |   9 +
 kit_live_sync/live_sync.sh       |  35 ++
 server_tools/app.py              | 904 +++++++++++++++++++++++++------
 3 files changed, 787 insertions(+), 161 deletions(-)

diff --git a/kit_live_sync/build_heartbeat.sh b/kit_live_sync/build_heartbeat.sh
index 90f744a9..7f38c871 100755
--- a/kit_live_sync/build_heartbeat.sh
+++ b/kit_live_sync/build_heartbeat.sh
@@ -11,6 +11,14 @@ OUT_FILE="${7:-/tmp/mediswarm_heartbeat.json}"
 
 timestamp="$(date -u +%FT%TZ)"
 
+# Extract kit version from docker.sh; quotes/backslashes are stripped so the heartbeat JSON stays valid
+kit_version=""
+docker_sh="${KIT_ROOT:+$KIT_ROOT/startup/docker.sh}"
+if [ -n "$docker_sh" ] && [ -f "$docker_sh" ]; then
+  kit_version="$(grep -oP '(?<=MEDISWARM_VERSION=)\S+' "$docker_sh" 2>/dev/null | head -1 | tr -d "\"'\\\\" || true)"
+  [ -n "$kit_version" ] || kit_version="$(grep -oP '(?<=jefftud/odelia:)\S+' "$docker_sh" 2>/dev/null | head -1 | tr -d "\"'\\\\" || true)"
+fi
+
 log_file=""
 console_file=""
 global_model=""
@@ -49,6 +57,7 @@ cat > "$OUT_FILE" <<EOF
   "run_name": "$RUN_NAME",
   "timestamp": "$timestamp",
   "status": "$STATUS",
+  "kit_version": "$kit_version",
   "kit_root": "$KIT_ROOT",
   "log_file": "$log_file",
   "console_file": "$console_file",
diff --git a/kit_live_sync/live_sync.sh b/kit_live_sync/live_sync.sh
index da1de208..cfa79bc2 100755
--- a/kit_live_sync/live_sync.sh
+++ b/kit_live_sync/live_sync.sh
@@ -32,6 +32,9 @@ mkdir -p "$STATE_DIR"
 LAST_CKPT_SYNC_FILE="$STATE_DIR/${MODE}_last_ckpt_sync_ts"
 touch "$LAST_CKPT_SYNC_FILE"
 
+# Track the run we are currently syncing so we can finalize it if a new run appears
+CURRENT_LOCAL_RUN=""
+
 ssh_cmd() {
   ssh ${SSH_OPTS} "$@"
 }
@@ -86,12 +89,42 @@ build_remote_dir() {
   printf '%s/%s/%s/%s' "$REMOTE_BASE" "$SITE_NAME" "$MODE" "$run_id"
 }
 
+_finalize_local_run() {
+  # Upload a "finished" heartbeat_final.json for run $1, then sync that run's directory once more
+  local old_run="$1"
+  [ -n "$old_run" ] || return 0  # empty run name: nothing to finalize
+
+  local old_remote_dir old_hb_final old_run_dir
+  old_remote_dir="$(build_remote_dir "$old_run")"
+  old_hb_final="$STATE_DIR/local_heartbeat_final_${old_run}.json"
+  export SCRATCHDIR  # make SCRATCHDIR visible to the build_heartbeat.sh child process
+  "$SCRIPT_DIR/build_heartbeat.sh" "$SITE_NAME" "local" "$KIT_ROOT" "" "$old_run" "finished" "$old_hb_final" >/dev/null
+  rsync_cmd "$old_hb_final" "${REMOTE_USER}@${REMOTE_HOST}:${old_remote_dir}/heartbeat_final.json" || true
+
+  # Final sync of the old run's artifacts (scratch-dir location is preferred over the startup dir)
+  old_run_dir=""
+  if [ -n "$SCRATCHDIR" ] && [ -d "$SCRATCHDIR/runs/$SITE_NAME/$old_run" ]; then
+    old_run_dir="$SCRATCHDIR/runs/$SITE_NAME/$old_run"
+  elif [ -d "$STARTUP_DIR/runs/$SITE_NAME/$old_run" ]; then
+    old_run_dir="$STARTUP_DIR/runs/$SITE_NAME/$old_run"
+  fi
+  if [ -n "$old_run_dir" ]; then  # skipped when the run directory exists in neither location
+    rsync_cmd "$old_run_dir/" "${REMOTE_USER}@${REMOTE_HOST}:${old_remote_dir}/run_dir/" || true
+  fi
+}
+
 sync_local() {
   local run_name run_dir remote_dir hb_file now last
 
   run_name="$(find_latest_local_run_name || true)"
   [ -n "$run_name" ] || return 0
 
+  # If the run changed (new training started), finalize the old one
+  if [ -n "$CURRENT_LOCAL_RUN" ] && [ "$CURRENT_LOCAL_RUN" != "$run_name" ]; then
+    _finalize_local_run "$CURRENT_LOCAL_RUN" || true
+  fi
+  CURRENT_LOCAL_RUN="$run_name"
+
   # Determine run_dir: check scratch dir first, fall back to startup dir
   run_dir=""
   if [ -n "$SCRATCHDIR" ] && [ -d "$SCRATCHDIR/runs/$SITE_NAME/$run_name" ]; then
@@ -105,6 +138,7 @@ sync_local() {
   ensure_remote_dir "$remote_dir"
 
   hb_file="$STATE_DIR/local_heartbeat.json"
+  export SCRATCHDIR
   "$SCRIPT_DIR/build_heartbeat.sh" "$SITE_NAME" "local" "$KIT_ROOT" "" "$run_name" "running" "$hb_file" >/dev/null
 
   rsync_cmd "$hb_file" "${REMOTE_USER}@${REMOTE_HOST}:${remote_dir}/heartbeat.json" || true
@@ -210,6 +244,7 @@ final_sync() {
 
       remote_dir="$(build_remote_dir "$run_name")"
       hb_file="$STATE_DIR/local_heartbeat_final.json"
+      export SCRATCHDIR
       "$SCRIPT_DIR/build_heartbeat.sh" "$SITE_NAME" "local" "$KIT_ROOT" "" "$run_name" "finished" "$hb_file" >/dev/null
       rsync_cmd "$hb_file" "${REMOTE_USER}@${REMOTE_HOST}:${remote_dir}/heartbeat_final.json" || true
       if [ -n "$run_dir" ]; then
diff --git a/server_tools/app.py b/server_tools/app.py
index a4feba5a..40050bed 100644
--- a/server_tools/app.py
+++ b/server_tools/app.py
@@ -2,6 +2,16 @@
 
 Serves a styled web UI that displays live training status, metrics charts,
 and artifact links for all sites synced by live_sync to /srv/mediswarm/live/.
+
+Features:
+- Filters by site, mode, status, job_id
+- Default sort by timestamp (newest first)
+- Status inference (stale running → stale, very old → presumed finished)
+- Server-side file paths with download links
+- Job grouping for swarm runs
+- Kit version column
+- TensorBoard metric parsing (via tbparse)
+- Enriched detail page with full artifact inventory
 """
 
 from pathlib import Path
@@ -15,8 +25,8 @@
 
 from html import escape as html_escape
 
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse
+from fastapi import FastAPI, HTTPException, Query
+from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse, FileResponse
 
 # Optional: TensorBoard event parsing via tbparse
 try:
@@ -37,28 +47,17 @@
 
 
 def _safe_segment(value: str) -> str:
-    """Validate that a URL path segment is safe (no traversal, no slashes).
-
-    Raises HTTPException 400 if the segment looks like a traversal attempt
-    or contains characters that could escape the BASE directory.
-    """
     if not value or not _SAFE_SEGMENT_RE.match(value) or ".." in value:
         raise HTTPException(status_code=400, detail="Invalid path segment")
     return value
 
 
 def _resolve_run_dir(site: str, mode: str, run_id: str) -> Path:
-    """Build and validate a run directory path under BASE.
-
-    Ensures the resolved path is strictly under BASE to prevent traversal.
-    Uses os.path.commonpath for robust containment checking.
-    """
     site = _safe_segment(site)
     mode = _safe_segment(mode)
     run_id = _safe_segment(run_id)
     run_dir = (BASE / site / mode / run_id).resolve()
     base_resolved = BASE.resolve()
-    # Verify that the resolved path is actually under BASE
     try:
         common = os.path.commonpath([str(base_resolved), str(run_dir)])
     except ValueError:
@@ -84,6 +83,7 @@ def _resolve_run_dir(site: str, mode: str, run_id: str) -> Path:
   --orange: #e67e22;
   --gray: #95a5a6;
   --red: #c0392b;
+  --purple: #8e44ad;
   --border: #dfe6e9;
   --text: #2d3436;
   --text-light: #636e72;
@@ -97,33 +97,62 @@ def _resolve_run_dir(site: str, mode: str, run_id: str) -> Path:
 header h1 { font-size: 1.4rem; font-weight: 600; }
 header .meta { font-size: 0.82rem; color: var(--gray); }
 header .meta a { color: var(--gray); text-decoration: underline; margin-left: 1rem; }
-main { max-width: 1400px; margin: 1.5rem auto; padding: 0 1rem; }
+main { max-width: 1600px; margin: 1.5rem auto; padding: 0 1rem; }
+
+/* Filter bar */
+.filter-bar { display: flex; flex-wrap: wrap; gap: 0.6rem; margin-bottom: 1.2rem;
+              align-items: center; }
+.filter-bar label { font-size: 0.82rem; font-weight: 600; color: var(--text-light); }
+.filter-bar select, .filter-bar input { font-size: 0.82rem; padding: 4px 10px;
+  border: 1px solid var(--border); border-radius: 6px; background: var(--card); }
+.filter-bar .filter-group { display: flex; align-items: center; gap: 0.3rem; }
+.filter-bar .btn-small { display: inline-block; padding: 4px 12px; border-radius: 6px;
+  background: var(--accent); color: #fff; font-size: 0.8rem; text-decoration: none;
+  cursor: pointer; border: none; }
+.filter-bar .btn-small:hover { background: #16213e; }
+
 table { width: 100%; border-collapse: collapse; background: var(--card);
         border-radius: 8px; overflow: hidden; box-shadow: 0 1px 4px rgba(0,0,0,0.08); }
 th { background: var(--accent); color: #fff; text-align: left;
-     padding: 0.7rem 0.9rem; font-size: 0.82rem; text-transform: uppercase;
-     letter-spacing: 0.04em; }
-td { padding: 0.65rem 0.9rem; border-bottom: 1px solid var(--border);
-     font-size: 0.88rem; vertical-align: top; }
+     padding: 0.7rem 0.9rem; font-size: 0.78rem; text-transform: uppercase;
+     letter-spacing: 0.04em; cursor: pointer; user-select: none; white-space: nowrap; }
+th:hover { background: #16213e; }
+th .sort-arrow { font-size: 0.7rem; margin-left: 0.3rem; }
+td { padding: 0.55rem 0.9rem; border-bottom: 1px solid var(--border);
+     font-size: 0.85rem; vertical-align: top; }
 tr:nth-child(even) td { background: #f9fafb; }
 tr:hover td { background: #eef2f7; }
 .badge { display: inline-block; padding: 2px 10px; border-radius: 12px;
-         font-size: 0.78rem; font-weight: 600; color: #fff; }
+         font-size: 0.75rem; font-weight: 600; color: #fff; }
 .badge-running { background: var(--green); }
 .badge-finished { background: var(--blue); }
 .badge-unknown { background: var(--gray); }
 .badge-error { background: var(--red); }
-.artifact { font-size: 0.8rem; color: var(--text-light); }
+.badge-stale { background: var(--orange); }
+.artifact { font-size: 0.78rem; color: var(--text-light); }
 .artifact .yes { color: var(--green); font-weight: 600; }
 .artifact .no { color: var(--gray); }
 a { color: var(--accent); text-decoration: none; }
 a:hover { text-decoration: underline; }
-.links a { margin-right: 0.7rem; font-size: 0.82rem; }
-.run-id { font-family: var(--mono); font-size: 0.78rem; word-break: break-all; }
+.links a { margin-right: 0.5rem; font-size: 0.8rem; }
+.run-id { font-family: var(--mono); font-size: 0.75rem; word-break: break-all; }
 .run-name { font-weight: 500; }
 .age-stale { color: var(--orange); }
 .age-dead { color: var(--red); }
 .empty { text-align: center; padding: 3rem; color: var(--gray); }
+.version { font-family: var(--mono); font-size: 0.75rem; color: var(--text-light); }
+.job-id { font-family: var(--mono); font-size: 0.72rem; color: var(--purple); }
+
+/* Job group header */
+.job-group-row td { background: #e8e4f0 !important; font-weight: 600;
+  font-size: 0.82rem; color: var(--purple); padding: 0.5rem 0.9rem; }
+
+/* Summary stats */
+.stats-bar { display: flex; gap: 1rem; margin-bottom: 1rem; flex-wrap: wrap; }
+.stat-card { background: var(--card); border-radius: 8px; padding: 0.6rem 1.2rem;
+  box-shadow: 0 1px 3px rgba(0,0,0,0.06); display: flex; align-items: center; gap: 0.5rem; }
+.stat-card .stat-num { font-size: 1.4rem; font-weight: 700; color: var(--accent); }
+.stat-card .stat-label { font-size: 0.78rem; color: var(--text-light); }
 
 /* Detail page */
 .detail-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 1.5rem; margin-top: 1.2rem; }
@@ -135,15 +164,28 @@ def _resolve_run_dir(site: str, mode: str, run_id: str) -> Path:
             overflow-x: auto; font-size: 0.78rem; line-height: 1.5; max-height: 400px;
             overflow-y: auto; }
 .card table { box-shadow: none; }
-.card table th { background: var(--accent); }
-.kv-table td:first-child { font-weight: 600; white-space: nowrap; width: 160px; }
+.card table th { background: var(--accent); cursor: default; }
+.kv-table td:first-child { font-weight: 600; white-space: nowrap; width: 180px; }
+.kv-table td:last-child { font-family: var(--mono); font-size: 0.82rem; word-break: break-all; }
 .btn { display: inline-block; padding: 6px 14px; border-radius: 6px;
        background: var(--accent); color: #fff; font-size: 0.82rem;
-       text-decoration: none; margin-right: 0.5rem; }
+       text-decoration: none; margin-right: 0.5rem; margin-bottom: 0.3rem; }
 .btn:hover { background: #16213e; text-decoration: none; }
-.chart-container { position: relative; width: 100%; height: 320px; }
+.btn-download { background: var(--green); }
+.btn-download:hover { background: #1e8449; }
+.chart-container { position: relative; width: 100%; height: 350px; }
 .breadcrumb { font-size: 0.85rem; margin-bottom: 1rem; color: var(--text-light); }
 .breadcrumb a { color: var(--accent); }
+
+/* File list in detail */
+.file-list { list-style: none; }
+.file-list li { padding: 0.3rem 0; border-bottom: 1px solid #f0f0f0; font-size: 0.82rem;
+  display: flex; align-items: center; gap: 0.5rem; }
+.file-list li:last-child { border-bottom: none; }
+.file-icon { font-size: 0.9rem; }
+.file-path { font-family: var(--mono); font-size: 0.78rem; color: var(--text-light);
+  word-break: break-all; }
+.file-size { font-size: 0.75rem; color: var(--gray); white-space: nowrap; }
 """
 
 
@@ -155,11 +197,12 @@ def _status_badge(status: str) -> str:
         cls = "badge-finished"
     elif status in ("error", "failed"):
         cls = "badge-error"
+    elif status == "stale":
+        cls = "badge-stale"
     return f'<span class="badge {cls}">{html_escape(status)}</span>'
 
 
 def _age_class(age_str: str) -> str:
-    """Return a CSS class for stale/dead heartbeats."""
     try:
         secs = int(age_str.rstrip("s"))
     except (ValueError, AttributeError):
@@ -171,7 +214,7 @@ def _age_class(age_str: str) -> str:
     return ""
 
 
-def _html_page(title: str, body: str, *, refresh: int = 0) -> str:
+def _html_page(title: str, body: str, *, refresh: int = 0, extra_head: str = "") -> str:
     refresh_tag = (
         f'<meta http-equiv="refresh" content="{refresh}">' if refresh else ""
     )
@@ -184,6 +227,7 @@ def _html_page(title: str, body: str, *, refresh: int = 0) -> str:
   {refresh_tag}
   <title>{safe_title}</title>
   <style>{CSS}</style>
+  {extra_head}
 </head>
 <body>
 {body}
@@ -191,6 +235,17 @@ def _html_page(title: str, body: str, *, refresh: int = 0) -> str:
 </html>"""
 
 
+def _format_size(size_bytes: int) -> str:
+    """Render a byte count as B / KB / MB / GB using 1024-based units."""
+    if size_bytes < 1024:
+        return f"{size_bytes} B"
+    if size_bytes < 1024 ** 2:
+        return f"{size_bytes / 1024:.1f} KB"
+    if size_bytes < 1024 ** 3:
+        return f"{size_bytes / 1024 ** 2:.1f} MB"
+    return f"{size_bytes / 1024 ** 3:.2f} GB"
+
+
 # ---------------------------------------------------------------------------
 # Data helpers
 # ---------------------------------------------------------------------------
@@ -212,13 +267,25 @@ def parse_age(ts: str) -> str:
             return f"{secs}s"
         if secs < 3600:
             return f"{secs // 60}m {secs % 60}s"
-        return f"{secs // 3600}h {(secs % 3600) // 60}m"
+        if secs < 86400:
+            return f"{secs // 3600}h {(secs % 3600) // 60}m"
+        return f"{secs // 86400}d {(secs % 86400) // 3600}h"
     except Exception:
         return "unknown"
 
 
+def _age_seconds(ts: str) -> int:
+    """Seconds elapsed since ISO timestamp *ts*; -1 when it is empty or unusable."""
+    try:
+        if not ts:
+            return -1
+        stamp = datetime.fromisoformat(ts.replace("Z", "+00:00"))
+        return int((datetime.now(timezone.utc) - stamp).total_seconds())
+    except Exception:
+        return -1
+
+
 def _read_heartbeat(run_dir: Path) -> dict[str, Any]:
-    """Read the best available heartbeat file (prefer final over live)."""
     for name in ["heartbeat_final.json", "heartbeat.json"]:
         p = run_dir / name
         if p.exists():
@@ -229,8 +296,36 @@ def _read_heartbeat(run_dir: Path) -> dict[str, Any]:
     return {}
 
 
+def _infer_status(hb: dict[str, Any], run_dir: Path) -> str:
+    """Derive the status shown in the UI from a heartbeat and the run directory.
+
+    Precedence (first match wins):
+    1. A readable heartbeat_final.json -> its "status" (falls back to hb's status)
+    2. hb says "running" and its timestamp is more than 1 hour old -> "finished"
+    3. hb says "running" and its timestamp is more than 5 minutes old -> "stale"
+    4. Anything else -> hb's "status", or "unknown" when the key is missing
+    """
+    final_path = run_dir / "heartbeat_final.json"
+    final_present = final_path.exists()
+    reported = hb.get("status", "unknown")
+
+    # An explicit final heartbeat overrides any age-based guess.
+    if final_present:
+        try:
+            return json.loads(final_path.read_text()).get("status", reported)
+        except Exception:
+            pass  # unreadable final heartbeat: fall through to the age rules
+
+    if reported != "running":
+        return reported
+
+    elapsed = _age_seconds(hb.get("timestamp", ""))
+    if elapsed > 3600:
+        return "finished"
+    return "stale" if elapsed > 300 else reported
+
+
 def _find_csv_files(run_dir: Path) -> list[str]:
-    """Find class-probability CSV files under run_dir/."""
     rd = run_dir / "run_dir"
     if not rd.exists():
         return []
@@ -240,13 +335,49 @@ def _find_csv_files(run_dir: Path) -> list[str]:
 
 
 def _find_tb_events(run_dir: Path) -> list[Path]:
-    """Find TensorBoard event files under run_dir/."""
     rd = run_dir / "run_dir"
     if not rd.exists():
         return []
     return sorted(rd.rglob("events.out.tfevents*"))
 
 
+def _find_checkpoints(run_dir: Path) -> list[dict[str, Any]]:
+    """List every *.ckpt below run_dir/run_dir/ (sorted by path) with metadata."""
+    artifact_root = run_dir / "run_dir"
+    if not artifact_root.exists():
+        return []
+    return [
+        {
+            "name": ckpt.name,
+            "rel_path": str(ckpt.relative_to(run_dir)),
+            "size": ckpt.stat().st_size if ckpt.exists() else 0,
+            "server_path": str(ckpt),
+        }
+        for ckpt in sorted(artifact_root.rglob("*.ckpt"))
+    ]
+
+
+def _find_all_files(run_dir: Path) -> list[dict[str, Any]]:
+    """Collect name, relative path, size, UTC mtime and server path of each file."""
+    inventory: list[dict[str, Any]] = []
+    if not run_dir.exists():
+        return inventory
+    for entry in (p for p in sorted(run_dir.rglob("*")) if p.is_file()):
+        try:
+            info = entry.stat()
+            modified = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
+            inventory.append({
+                "name": entry.name,
+                "rel_path": str(entry.relative_to(run_dir)),
+                "size": info.st_size,
+                "mtime": modified.isoformat(),
+                "server_path": str(entry),
+            })
+        except Exception:
+            pass  # stat failed (e.g. file vanished): leave it out of the listing
+    return inventory
+
+
 def rows() -> list[dict[str, Any]]:
     out: list[dict[str, Any]] = []
     if not BASE.exists():
@@ -258,6 +389,23 @@ def rows() -> list[dict[str, Any]]:
                 hb = _read_heartbeat(run_dir)
                 ts = hb.get("timestamp", "")
                 age = parse_age(ts)
+                status = _infer_status(hb, run_dir)
+                csv_files = _find_csv_files(run_dir)
+                tb_events = _find_tb_events(run_dir)
+                checkpoints = _find_checkpoints(run_dir)
+
+                # Count total files and size
+                total_files = 0
+                total_size = 0
+                rd = run_dir / "run_dir"
+                if rd.exists():
+                    for f in rd.rglob("*"):
+                        if f.is_file():
+                            total_files += 1
+                            try:
+                                total_size += f.stat().st_size
+                            except Exception:
+                                pass
 
                 out.append(
                     {
@@ -266,20 +414,28 @@ def rows() -> list[dict[str, Any]]:
                         "run_id": run_dir.name,
                         "run_name": hb.get("run_name", ""),
                         "job_id": hb.get("job_id", ""),
-                        "status": hb.get("status", "unknown"),
+                        "status": status,
+                        "raw_status": hb.get("status", "unknown"),
                         "timestamp": ts,
                         "age": age,
+                        "age_seconds": _age_seconds(ts),
+                        "kit_version": hb.get("kit_version", ""),
                         "has_console": (run_dir / "nohup.out").exists()
                         or (run_dir / "local_training_console_output.txt").exists(),
                         "has_log": (run_dir / "log.txt").exists(),
-                        "last_ckpt": bool(hb.get("last_ckpt")),
-                        "epoch_ckpt": bool(hb.get("epoch_ckpt")),
-                        "global_model": bool(hb.get("global_model")),
-                        "best_global_model": bool(hb.get("best_global_model")),
-                        "csv_files": _find_csv_files(run_dir),
-                        "tb_events": bool(_find_tb_events(run_dir)),
+                        "has_global_model": (run_dir / "FL_global_model.pt").exists(),
+                        "has_best_model": (run_dir / "best_FL_global_model.pt").exists(),
+                        "checkpoints": len(checkpoints),
+                        "csv_files": csv_files,
+                        "tb_events": len(tb_events),
+                        "total_files": total_files,
+                        "total_size": total_size,
+                        "server_path": str(run_dir),
                     }
                 )
+
+    # Sort by timestamp descending (newest first)
+    out.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
     return out
 
 
@@ -293,11 +449,10 @@ def rows() -> list[dict[str, Any]]:
 
 
 def parse_console_metrics(text: str) -> dict[str, Any]:
-    """Extract epoch-level ACC and AUC_ROC from console output."""
     data: dict[str, dict[int, dict[str, float]]] = {}
     for m in _EPOCH_RE.finditer(text):
         epoch = int(m.group(1))
-        phase = m.group(2)  # train / val / test
+        phase = m.group(2)
         acc = float(m.group(3))
         auc = float(m.group(4))
         data.setdefault(phase, {})[epoch] = {"acc": acc, "auc_roc": auc}
@@ -325,15 +480,65 @@ def _get_console_text(site: str, mode: str, run_id: str) -> str:
     return ""
 
 
+def _extract_training_summary(text: str) -> dict[str, Any]:
+    """Summarise console output: epoch count, best val epoch, last train metrics, rounds."""
+    summary: dict[str, Any] = {}
+
+    # Highest epoch index matched by _EPOCH_RE (the +1 presumes 0-based epochs).
+    epoch_ids = [int(hit[0]) for hit in _EPOCH_RE.findall(text)]
+    if epoch_ids:
+        last_epoch = max(epoch_ids)
+        summary["total_epochs"] = last_epoch + 1
+        summary["last_epoch"] = last_epoch
+
+    # Validation line with the highest AUC_ROC (the earliest one wins a tie).
+    val_hits = re.findall(
+        r"Epoch\s+(\d+)\s*-\s*val\s+ACC:\s*([\d.]+),\s*AUC_ROC:\s*([\d.]+)", text
+    )
+    if val_hits:
+        epoch, acc, auc = max(val_hits, key=lambda hit: float(hit[2]))
+        summary["best_val_epoch"] = int(epoch)
+        summary["best_val_acc"] = float(acc)
+        summary["best_val_auc_roc"] = float(auc)
+
+    # Metrics from the last training line found in the text.
+    train_hits = re.findall(
+        r"Epoch\s+(\d+)\s*-\s*train\s+ACC:\s*([\d.]+),\s*AUC_ROC:\s*([\d.]+)", text
+    )
+    if train_hits:
+        _, acc, auc = train_hits[-1]
+        summary["final_train_acc"] = float(acc)
+        summary["final_train_auc_roc"] = float(auc)
+
+    # Largest number following "Round"/"round" anywhere in the text (swarm mode).
+    rounds = [int(num) for num in re.findall(r"(?:Round|round)\s+(\d+)", text)]
+    if rounds:
+        summary["total_rounds"] = max(rounds)
+
+    return summary
+
+
 # ---------------------------------------------------------------------------
 # Index page
 # ---------------------------------------------------------------------------
 
 
 @app.get("/", response_class=HTMLResponse)
-def index():
+def index(
+    site_filter: str = Query("", alias="site"),
+    mode_filter: str = Query("", alias="mode"),
+    status_filter: str = Query("", alias="status"),
+    job_filter: str = Query("", alias="job"),
+    group_by_job: bool = Query(False, alias="group"),
+):
     r = rows()
-    now_str = datetime.now().strftime("%H:%M:%S")
+    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    # Collect unique values for filters
+    all_sites = sorted({x["site"] for x in r})
+    all_modes = sorted({x["mode"] for x in r})
+    all_statuses = sorted({x["status"] for x in r})
+    all_jobs = sorted({x["job_id"] for x in r if x["job_id"]})
 
     if not r:
         body = f"""
@@ -345,64 +550,105 @@ def index():
 <main><div class="empty">No training runs found under {BASE}</div></main>"""
         return _html_page("MediSwarm Monitor", body, refresh=30)
 
+    # Apply filters
+    filtered = r
+    if site_filter:
+        filtered = [x for x in filtered if x["site"] == site_filter]
+    if mode_filter:
+        filtered = [x for x in filtered if x["mode"] == mode_filter]
+    if status_filter:
+        filtered = [x for x in filtered if x["status"] == status_filter]
+    if job_filter:
+        filtered = [x for x in filtered if x["job_id"] == job_filter]
+
+    # Stats
+    n_total = len(filtered)
+    n_running = sum(1 for x in filtered if x["status"] == "running")
+    n_finished = sum(1 for x in filtered if x["status"] == "finished")
+    n_stale = sum(1 for x in filtered if x["status"] == "stale")
+    n_sites = len({x["site"] for x in filtered})
+
+    stats_html = f"""
+<div class="stats-bar">
+  <div class="stat-card"><span class="stat-num">{n_total}</span><span class="stat-label">Total Runs</span></div>
+  <div class="stat-card"><span class="stat-num">{n_running}</span><span class="stat-label">Running</span></div>
+  <div class="stat-card"><span class="stat-num">{n_finished}</span><span class="stat-label">Finished</span></div>
+  <div class="stat-card"><span class="stat-num">{n_stale}</span><span class="stat-label">Stale</span></div>
+  <div class="stat-card"><span class="stat-num">{n_sites}</span><span class="stat-label">Sites</span></div>
+</div>"""
+
+    # Filter bar
+    def _select_opts(name: str, values: list[str], current: str) -> str:
+        opts = '<option value="">All</option>'
+        for v in values:
+            sel = " selected" if v == current else ""
+            opts += f'<option value="{html_escape(v)}"{sel}>{html_escape(v)}</option>'
+        return f'<select name="{name}" onchange="this.form.submit()">{opts}</select>'
+
+    group_checked = " checked" if group_by_job else ""
+    filter_html = f"""
+<form class="filter-bar" method="get" action="/">
+  <div class="filter-group">
+    <label>Site:</label> {_select_opts("site", all_sites, site_filter)}
+  </div>
+  <div class="filter-group">
+    <label>Mode:</label> {_select_opts("mode", all_modes, mode_filter)}
+  </div>
+  <div class="filter-group">
+    <label>Status:</label> {_select_opts("status", all_statuses, status_filter)}
+  </div>
+  <div class="filter-group">
+    <label>Job:</label> {_select_opts("job", all_jobs, job_filter)}
+  </div>
+  <div class="filter-group">
+    <label><input type="checkbox" name="group" value="true"{group_checked}
+      onchange="this.form.submit()"> Group by Job</label>
+  </div>
+  <a href="/" class="btn-small">Clear Filters</a>
+</form>"""
+
+    # Build table rows
     table_rows = []
-    for x in r:
-        # Links
-        links = []
-        detail = f"/detail/{x['site']}/{x['mode']}/{x['run_id']}"
-        links.append(f'<a class="btn" href="{detail}">Details</a>')
-        links.append(
-            f"<a href=\"/heartbeat/{x['site']}/{x['mode']}/{x['run_id']}\">heartbeat</a>"
-        )
-        if x["has_console"]:
-            label = "nohup" if x["mode"] == "swarm" else "console"
-            links.append(
-                f"<a href=\"/console/{x['site']}/{x['mode']}/{x['run_id']}\">{label}</a>"
+
+    if group_by_job and not job_filter:
+        # Group swarm runs by job_id, local runs standalone
+        from collections import OrderedDict
+
+        job_groups: OrderedDict[str, list[dict]] = OrderedDict()
+        standalone: list[dict] = []
+
+        for x in filtered:
+            if x["job_id"]:
+                job_groups.setdefault(x["job_id"], []).append(x)
+            else:
+                standalone.append(x)
+
+        for job_id, items in job_groups.items():
+            n_items = len(items)
+            sites = ", ".join(sorted({x["site"] for x in items}))
+            statuses = {x["status"] for x in items}
+            job_status = (
+                "running"
+                if "running" in statuses
+                else ("stale" if "stale" in statuses else "finished")
             )
-        if x["has_log"]:
-            links.append(
-                f"<a href=\"/log/{x['site']}/{x['mode']}/{x['run_id']}\">log</a>"
+            run_name = items[0].get("run_name", "") if items else ""
+            table_rows.append(
+                f"""<tr class="job-group-row"><td colspan="10">
+                Job: <span class="job-id">{html_escape(job_id)}</span>
+                &nbsp;&middot;&nbsp; {n_items} client(s): {html_escape(sites)}
+                &nbsp;&middot;&nbsp; {_status_badge(job_status)}
+                {f' &middot; {html_escape(run_name)}' if run_name else ''}
+                </td></tr>"""
             )
+            for x in items:
+                table_rows.append(_build_table_row(x))
 
-        # Artifacts
-        arts = []
-        if x["last_ckpt"]:
-            arts.append('<span class="yes">last.ckpt</span>')
-        if x["epoch_ckpt"]:
-            arts.append('<span class="yes">epoch.ckpt</span>')
-        if x["global_model"]:
-            arts.append('<span class="yes">FL_global</span>')
-        if x["best_global_model"]:
-            arts.append('<span class="yes">best_FL</span>')
-        if x["csv_files"]:
-            arts.append(f'<span class="yes">{len(x["csv_files"])} CSV</span>')
-        if x["tb_events"]:
-            arts.append('<span class="yes">TFEvents</span>')
-        if not arts:
-            arts.append('<span class="no">none</span>')
-
-        # Run display
-        run_display = ""
-        if x["run_name"]:
-            run_display = f'<span class="run-name">{html_escape(x["run_name"])}</span><br>'
-        run_display += f'<span class="run-id">{html_escape(x["run_id"])}</span>'
-
-        age_cls = _age_class(x["age"])
-        age_td = (
-            f'<span class="{age_cls}">{x["age"]}</span>' if age_cls else x["age"]
-        )
-
-        table_rows.append(
-            f"""<tr>
-  <td>{html_escape(x['site'])}</td>
-  <td>{html_escape(x['mode'])}</td>
-  <td>{run_display}</td>
-  <td>{_status_badge(x['status'])}</td>
-  <td>{age_td}</td>
-  <td class="artifact">{' &middot; '.join(arts)}</td>
-  <td class="links">{' '.join(links)}</td>
-</tr>"""
-        )
+        for x in standalone:
+            table_rows.append(_build_table_row(x))
+    else:
+        for x in filtered:
+            table_rows.append(_build_table_row(x))
 
     body = f"""
 <header>
@@ -412,10 +658,12 @@ def index():
     &middot; <a href="/api/runs">API</a></div>
 </header>
 <main>
+{stats_html}
+{filter_html}
 <table>
 <thead><tr>
   <th>Site</th><th>Mode</th><th>Run</th><th>Status</th><th>Age</th>
-  <th>Artifacts</th><th>Links</th>
+  <th>Version</th><th>Artifacts</th><th>Size</th><th>Server Path</th><th>Links</th>
 </tr></thead>
 <tbody>
 {''.join(table_rows)}
@@ -425,6 +673,78 @@ def index():
     return _html_page("MediSwarm Monitor", body, refresh=30)
 
 
+def _build_table_row(x: dict[str, Any]) -> str:
+    """Render one run record as a ``<tr>`` for the index table.
+
+    ``x`` is one element of ``rows()``.  Link URLs are built from the
+    percent-encoded site, mode and run id; the job id is cut to eight
+    characters, with ``...`` appended only when something was cut.
+    """
+    from urllib.parse import quote
+
+    # Encode every component so reserved characters cannot break the href.
+    run_url = "/".join(quote(str(x[k]), safe="") for k in ("site", "mode", "run_id"))
+    links = [f'<a class="btn" href="/detail/{run_url}">Details</a>']
+    if x["has_console"]:
+        label = "nohup" if x["mode"] == "swarm" else "console"
+        links.append(f'<a href="/console/{run_url}">{label}</a>')
+    if x["has_log"]:
+        links.append(f'<a href="/log/{run_url}">log</a>')
+
+    # Artifacts
+    arts = []
+    if x["checkpoints"]:
+        arts.append(f'<span class="yes">{x["checkpoints"]} ckpt</span>')
+    if x["has_global_model"]:
+        arts.append('<span class="yes">FL_global</span>')
+    if x["has_best_model"]:
+        arts.append('<span class="yes">best_FL</span>')
+    if x["csv_files"]:
+        arts.append(f'<span class="yes">{len(x["csv_files"])} CSV</span>')
+    if x["tb_events"]:
+        arts.append(f'<span class="yes">{x["tb_events"]} TFE</span>')
+    if not arts:
+        arts.append('<span class="no">none</span>')
+
+    # Run display
+    run_display = ""
+    if x["run_name"]:
+        run_display = f'<span class="run-name">{html_escape(x["run_name"])}</span><br>'
+    run_display += f'<span class="run-id">{html_escape(x["run_id"])}</span>'
+    if x["job_id"]:
+        job_id = x["job_id"]
+        short_id = job_id if len(job_id) <= 8 else f"{job_id[:8]}..."
+        run_display += f'<br><span class="job-id">job: {html_escape(short_id)}</span>'
+
+    age_cls = _age_class(x["age"])
+    age_td = f'<span class="{age_cls}">{x["age"]}</span>' if age_cls else x["age"]
+
+    version = (
+        f'<span class="version">{html_escape(x["kit_version"])}</span>'
+        if x["kit_version"]
+        else '<span class="no">-</span>'
+    )
+
+    size_str = _format_size(x["total_size"]) if x["total_size"] else "-"
+    server_path = (
+        f'<span class="file-path" title="{html_escape(x["server_path"])}">'
+        f'{html_escape(x["server_path"][-40:])}</span>'
+    )
+
+    return f"""<tr>
+  <td>{html_escape(x['site'])}</td>
+  <td>{html_escape(x['mode'])}</td>
+  <td>{run_display}</td>
+  <td>{_status_badge(x['status'])}</td>
+  <td>{age_td}</td>
+  <td>{version}</td>
+  <td class="artifact">{' &middot; '.join(arts)}</td>
+  <td>{size_str}</td>
+  <td>{server_path}</td>
+  <td class="links">{' '.join(links)}</td>
+</tr>"""
+
+
 # ---------------------------------------------------------------------------
 # Detail page
 # ---------------------------------------------------------------------------
@@ -434,58 +754,166 @@ def index():
 def detail(site: str, mode: str, run_id: str):
     run_dir = _resolve_run_dir(site, mode, run_id)
     hb = _read_heartbeat(run_dir)
+    status = _infer_status(hb, run_dir)
     console_text = _get_console_text(site, mode, run_id)
     metrics = parse_console_metrics(console_text)
     csv_files = _find_csv_files(run_dir)
-    has_tb = bool(_find_tb_events(run_dir))
-
-    # Heartbeat info card
+    tb_events = _find_tb_events(run_dir)
+    checkpoints = _find_checkpoints(run_dir)
+    all_files = _find_all_files(run_dir)
+    training_summary = _extract_training_summary(console_text)
+
+    # -- Heartbeat info card --
+    hb_display_keys = [
+        ("site_name", "Site Name"),
+        ("mode", "Mode"),
+        ("job_id", "Job ID"),
+        ("run_name", "Run Name"),
+        ("timestamp", "Last Heartbeat"),
+        ("status", "Raw Status"),
+        ("kit_version", "Kit Version"),
+        ("kit_root", "Kit Root (client)"),
+        ("run_dir", "Run Dir (client)"),
+        ("log_file", "Log File (client)"),
+        ("console_file", "Console File (client)"),
+        ("global_model", "Global Model (client)"),
+        ("best_global_model", "Best Global Model (client)"),
+        ("last_ckpt", "Last Checkpoint (client)"),
+        ("epoch_ckpt", "Epoch Checkpoint (client)"),
+        ("tb_file", "TensorBoard File (client)"),
+    ]
     hb_rows = ""
-    for key in [
-        "site_name",
-        "mode",
-        "job_id",
-        "run_name",
-        "timestamp",
-        "status",
-        "run_dir",
-        "last_ckpt",
-        "epoch_ckpt",
-        "global_model",
-        "best_global_model",
-        "tb_file",
-    ]:
+    # Add inferred status first
+    hb_rows += f"<tr><td>Effective Status</td><td>{_status_badge(status)}</td></tr>\n"
+    for key, label in hb_display_keys:
         val = hb.get(key, "")
         if val:
-            hb_rows += f"<tr><td>{html_escape(str(key))}</td><td>{html_escape(str(val))}</td></tr>\n"
+            hb_rows += f"<tr><td>{html_escape(label)}</td><td>{html_escape(str(val))}</td></tr>\n"
+    # Add age
+    ts = hb.get("timestamp", "")
+    if ts:
+        hb_rows += f"<tr><td>Heartbeat Age</td><td>{parse_age(ts)}</td></tr>\n"
     if not hb_rows:
         hb_rows = '<tr><td colspan="2">No heartbeat data available</td></tr>'
 
-    # CSV links
-    csv_links = ""
+    # -- Training summary card --
+    summary_html = ""
+    if training_summary:
+        summary_rows = ""
+        if "total_epochs" in training_summary:
+            summary_rows += (
+                f"<tr><td>Total Epochs</td><td>{training_summary['total_epochs']}</td></tr>"
+            )
+        if "best_val_epoch" in training_summary:
+            summary_rows += (
+                f"<tr><td>Best Validation</td>"
+                f"<td>Epoch {training_summary['best_val_epoch']} &mdash; "
+                f"ACC: {training_summary['best_val_acc']:.4f}, "
+                f"AUC_ROC: {training_summary['best_val_auc_roc']:.4f}</td></tr>"
+            )
+        if "final_train_acc" in training_summary:
+            summary_rows += (
+                f"<tr><td>Final Training</td>"
+                f"<td>ACC: {training_summary['final_train_acc']:.4f}, "
+                f"AUC_ROC: {training_summary['final_train_auc_roc']:.4f}</td></tr>"
+            )
+        if "total_rounds" in training_summary:
+            summary_rows += (
+                f"<tr><td>FL Rounds</td><td>{training_summary['total_rounds']}</td></tr>"
+            )
+        if summary_rows:
+            summary_html = f"""
+<div class="card">
+  <h2>Training Summary</h2>
+  <table class="kv-table"><tbody>{summary_rows}</tbody></table>
+</div>"""
+
+    # -- Server-side files card --
+    file_list_html = ""
+    if all_files:
+        items = ""
+        for f in all_files:
+            size = _format_size(f["size"])
+            dl_url = f"/download/{site}/{mode}/{run_id}/{f['rel_path']}"
+            icon = "📄"
+            if f["name"].endswith(".csv"):
+                icon = "📊"
+            elif f["name"].endswith(".ckpt") or f["name"].endswith(".pt"):
+                icon = "📦"
+            elif f["name"].endswith(".json") or f["name"].endswith(".yaml"):
+                icon = "📝"
+            elif "tfevents" in f["name"]:
+                icon = "📈"
+            items += (
+                f'<li><span class="file-icon">{icon}</span>'
+                f'<span style="flex:1;">'
+                f'<a href="{dl_url}" title="Download">{html_escape(f["rel_path"])}</a>'
+                f'<br><span class="file-path">{html_escape(f["server_path"])}</span>'
+                f'</span>'
+                f'<span class="file-size">{size}</span>'
+                f'<a class="btn btn-download" href="{dl_url}" '
+                f'style="font-size:0.72rem;padding:2px 8px;">Download</a>'
+                f'</li>'
+            )
+        total_size = _format_size(sum(f["size"] for f in all_files))
+        file_list_html = f"""
+<div class="card" style="grid-column: 1 / -1;">
+  <h2>Server Files ({len(all_files)} files, {total_size} total)</h2>
+  <p style="margin-bottom:0.5rem;font-size:0.82rem;color:var(--text-light);">
+    Server directory: <code>{html_escape(str(run_dir))}</code></p>
+  <ul class="file-list">{items}</ul>
+</div>"""
+
+    # -- CSV links card --
+    csv_links_html = ""
     if csv_files:
         csv_items = "".join(
-            f'<li><a href="/csv/{site}/{mode}/{run_id}/{f}">{html_escape(f)}</a></li>'
+            f'<li><a href="/csv/{site}/{mode}/{run_id}/{f}">{html_escape(f)}</a>'
+            f' &middot; <a class="btn btn-download" '
+            f'href="/download/{site}/{mode}/{run_id}/run_dir/{f}" '
+            f'style="font-size:0.72rem;padding:2px 8px;">Download</a></li>'
             for f in csv_files
         )
-        csv_links = f"<ul>{csv_items}</ul>"
-    else:
-        csv_links = "<p>No CSV result files found.</p>"
+        csv_links_html = f"""
+<div class="card">
+  <h2>Result CSVs</h2>
+  <ul class="file-list">{csv_items}</ul>
+</div>"""
+
+    # -- Checkpoints card --
+    ckpt_html = ""
+    if checkpoints:
+        ckpt_items = "".join(
+            f'<li><span class="file-icon">📦</span>'
+            f'<span style="flex:1;">{html_escape(c["name"])}'
+            f'<br><span class="file-path">{html_escape(c["server_path"])}</span></span>'
+            f'<span class="file-size">{_format_size(c["size"])}</span>'
+            f'<a class="btn btn-download" '
+            f'href="/download/{site}/{mode}/{run_id}/{c["rel_path"]}" '
+            f'style="font-size:0.72rem;padding:2px 8px;">Download</a>'
+            f'</li>'
+            for c in checkpoints
+        )
+        ckpt_html = f"""
+<div class="card">
+  <h2>Checkpoints ({len(checkpoints)})</h2>
+  <ul class="file-list">{ckpt_items}</ul>
+</div>"""
 
-    # Console snippet (last 200 lines)
+    # -- Console snippet (last 300 lines) --
     console_lines = console_text.strip().split("\n")
-    console_tail = "\n".join(console_lines[-200:]) if console_lines else "No output."
-    # Escape HTML in console output
-    console_tail = (
+    console_tail = "\n".join(console_lines[-300:]) if console_lines else "No output."
+    console_tail_escaped = (
         console_tail.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
     )
+    console_len = len(console_lines)
 
-    # Chart section
+    # -- Chart section --
     chart_html = ""
     if metrics["epochs"]:
         chart_html = f"""
 <div class="card" style="grid-column: 1 / -1;">
-  <h2>Training Metrics</h2>
+  <h2>Training Metrics (from console)</h2>
   <div class="chart-container"><canvas id="metricsChart"></canvas></div>
 </div>
 <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
@@ -506,7 +934,7 @@ def detail(site: str, mode: str, run_id: str):
     borderColor: colors[key] || '#636e72',
     backgroundColor: 'transparent',
     tension: 0.3,
-    pointRadius: 3,
+    pointRadius: 2,
     borderWidth: 2
   }});
 }}
@@ -527,23 +955,124 @@ def detail(site: str, mode: str, run_id: str):
 }});
 </script>"""
 
-    # TensorBoard metrics link
+    # -- TensorBoard metrics --
     tb_html = ""
-    if has_tb and HAS_TBPARSE:
-        tb_html = f"""
+    if tb_events and HAS_TBPARSE:
+        # Parse and display inline
+        try:
+            reader = SummaryReader(str(tb_events[0].parent))
+            df = reader.scalars
+            tags = sorted(df["tag"].unique()) if len(df) > 0 else []
+            tb_data: dict[str, Any] = {}
+            for tag in tags:
+                subset = df[df["tag"] == tag].sort_values("step")
+                tb_data[tag] = {
+                    "steps": subset["step"].tolist(),
+                    "values": subset["value"].tolist(),
+                }
+            if tb_data:
+                tb_html = f"""
+<div class="card" style="grid-column: 1 / -1;">
+  <h2>TensorBoard Metrics ({len(tags)} tags)</h2>
+  <div class="chart-container"><canvas id="tbChart"></canvas></div>
+  <div style="margin-top:0.5rem;">
+    <a class="btn" href="/tb_metrics/{site}/{mode}/{run_id}">Raw JSON</a>
+  </div>
+</div>
+<script>
+const tbData = {json.dumps(tb_data)};
+const tbCtx = document.getElementById('tbChart').getContext('2d');
+const tbColors = ['#27ae60','#2980b9','#8e44ad','#e67e22','#c0392b',
+                  '#f39c12','#1abc9c','#e74c3c','#3498db','#9b59b6'];
+const tbDatasets = [];
+let colorIdx = 0;
+for (const [tag, data] of Object.entries(tbData)) {{
+  tbDatasets.push({{
+    label: tag,
+    data: data.steps.map((s, i) => ({{ x: s, y: data.values[i] }})),
+    borderColor: tbColors[colorIdx % tbColors.length],
+    backgroundColor: 'transparent',
+    tension: 0.3,
+    pointRadius: 1,
+    borderWidth: 2,
+    showLine: true
+  }});
+  colorIdx++;
+}}
+new Chart(tbCtx, {{
+  type: 'scatter',
+  data: {{ datasets: tbDatasets }},
+  options: {{
+    responsive: true,
+    maintainAspectRatio: false,
+    scales: {{
+      x: {{ title: {{ display: true, text: 'Step' }} }},
+      y: {{ title: {{ display: true, text: 'Value' }} }}
+    }},
+    plugins: {{
+      legend: {{ position: 'top' }}
+    }}
+  }}
+}});
+</script>"""
+        except Exception as e:
+            tb_html = f"""
 <div class="card">
   <h2>TensorBoard Metrics</h2>
-  <p>TensorBoard events available.
-     <a class="btn" href="/tb_metrics/{site}/{mode}/{run_id}">View raw JSON</a></p>
+  <p>Error parsing TensorBoard events: {html_escape(str(e))}</p>
+  <a class="btn" href="/tb_metrics/{site}/{mode}/{run_id}">Try raw JSON</a>
 </div>"""
-    elif has_tb:
-        tb_html = """
+    elif tb_events:
+        tb_html = f"""
 <div class="card">
   <h2>TensorBoard Metrics</h2>
-  <p>TensorBoard events found but <code>tbparse</code> is not installed.
-     Install with <code>pip install tbparse</code> to enable parsing.</p>
+  <p>Found {len(tb_events)} TensorBoard event file(s).</p>
+  <p><code>tbparse</code> is {'installed' if HAS_TBPARSE else
+  'not installed &mdash; install with <code>pip install tbparse</code> to enable parsing'}.</p>
 </div>"""
 
+    # -- Models card --
+    model_html = ""
+    model_files = []
+    for mname in [
+        "FL_global_model.pt",
+        "best_FL_global_model.pt",
+        "last_global_model.ckpt",
+    ]:
+        mp = run_dir / mname
+        if not mp.exists():
+            mp = run_dir / "run_dir" / mname
+        if mp.exists():
+            model_files.append(
+                {
+                    "name": mname,
+                    "size": mp.stat().st_size,
+                    "path": str(mp),
+                    "rel_path": str(mp.relative_to(run_dir)),
+                }
+            )
+    if model_files:
+        model_items = "".join(
+            f'<li><span class="file-icon">🧠</span>'
+            f'<span style="flex:1;">{html_escape(m["name"])}'
+            f'<br><span class="file-path">{html_escape(m["path"])}</span></span>'
+            f'<span class="file-size">{_format_size(m["size"])}</span>'
+            f'<a class="btn btn-download" '
+            f'href="/download/{site}/{mode}/{run_id}/{m["rel_path"]}" '
+            f'style="font-size:0.72rem;padding:2px 8px;">Download</a>'
+            f"</li>"
+            for m in model_files
+        )
+        model_html = f"""
+<div class="card">
+  <h2>Models</h2>
+  <ul class="file-list">{model_items}</ul>
+</div>"""
+
+    log_btn = ""
+    if (run_dir / "log.txt").exists():
+        log_btn = f'<a class="btn" href="/log/{site}/{mode}/{run_id}">Full NVFlare Log</a>'
+
     body = f"""
 <header>
   <h1>MediSwarm Live Monitor</h1>
@@ -556,30 +1085,70 @@ def detail(site: str, mode: str, run_id: str):
 
 <div class="detail-grid">
   <div class="card">
-    <h2>Heartbeat</h2>
+    <h2>Heartbeat &amp; Status</h2>
     <table class="kv-table"><tbody>{hb_rows}</tbody></table>
-  </div>
-
-  <div class="card">
-    <h2>Artifacts &amp; CSVs</h2>
-    {csv_links}
     <div style="margin-top:0.8rem;">
-      <a class="btn" href="/heartbeat/{site}/{mode}/{run_id}">Raw heartbeat</a>
-      <a class="btn" href="/console/{site}/{mode}/{run_id}">Full console</a>
-      {"<a class='btn' href='/log/" + site + "/" + mode + "/" + run_id + "'>Full log</a>" if (run_dir / "log.txt").exists() else ""}
+      <a class="btn" href="/heartbeat/{site}/{mode}/{run_id}">Raw Heartbeat JSON</a>
     </div>
   </div>
 
+  {summary_html}
+  {csv_links_html}
+  {ckpt_html}
+  {model_html}
+
   {chart_html}
   {tb_html}
+  {file_list_html}
 
   <div class="card" style="grid-column: 1 / -1;">
-    <h2>Console Output (last 200 lines)</h2>
-    <pre>{console_tail}</pre>
+    <h2>Console Output (last 300 of {console_len} lines)</h2>
+    <div style="margin-bottom:0.5rem;">
+      <a class="btn" href="/console/{site}/{mode}/{run_id}">Full Console Output</a>
+      {log_btn}
+    </div>
+    <pre>{console_tail_escaped}</pre>
   </div>
 </div>
 </main>"""
-    return _html_page(f"{html_escape(site)}/{html_escape(mode)}/{html_escape(run_id)} — MediSwarm", body)
+    return _html_page(
+        f"{html_escape(site)}/{html_escape(mode)}/{html_escape(run_id)} — MediSwarm",
+        body,
+    )
+
+
+# ---------------------------------------------------------------------------
+# File download endpoint
+# ---------------------------------------------------------------------------
+
+
+@app.get("/download/{site}/{mode}/{run_id}/{file_path:path}")
+def download_file(site: str, mode: str, run_id: str, file_path: str):
+    """Serve one file from a run directory as ``application/octet-stream``.
+
+    ``file_path`` is relative to the directory returned by
+    ``_resolve_run_dir``.  Raises HTTP 400 when it contains a ``..`` segment
+    or resolves outside that directory, and 404 when it is not a file.
+    """
+    root = _resolve_run_dir(site, mode, run_id).resolve()
+
+    # Match ".." as a whole segment; a substring test rejects names like "a..b".
+    if ".." in Path(file_path).parts:
+        raise HTTPException(status_code=400, detail="Invalid file path")
+
+    target = (root / file_path).resolve()
+    try:
+        target.relative_to(root)
+    except ValueError:
+        raise HTTPException(status_code=400, detail="Invalid path") from None
+    if not target.is_file():
+        raise HTTPException(status_code=404, detail="File not found")
+
+    return FileResponse(
+        path=str(target),
+        filename=target.name,
+        media_type="application/octet-stream",
+    )
 
 
 # ---------------------------------------------------------------------------
@@ -615,14 +1184,12 @@ def log(site: str, mode: str, run_id: str):
 
 @app.get("/metrics/{site}/{mode}/{run_id}", response_class=JSONResponse)
 def metrics(site: str, mode: str, run_id: str):
-    """Return parsed training metrics from console output as JSON."""
     text = _get_console_text(site, mode, run_id)
     return parse_console_metrics(text)
 
 
 @app.get("/tb_metrics/{site}/{mode}/{run_id}", response_class=JSONResponse)
 def tb_metrics(site: str, mode: str, run_id: str):
-    """Return TensorBoard scalar metrics as JSON (requires tbparse)."""
     if not HAS_TBPARSE:
         return JSONResponse(
             {"error": "tbparse is not installed"}, status_code=501
@@ -630,11 +1197,12 @@ def tb_metrics(site: str, mode: str, run_id: str):
 
     validated_dir = _resolve_run_dir(site, mode, run_id)
     run_dir = validated_dir / "run_dir"
-    events = sorted(run_dir.rglob("events.out.tfevents*")) if run_dir.exists() else []
+    events = (
+        sorted(run_dir.rglob("events.out.tfevents*")) if run_dir.exists() else []
+    )
     if not events:
         return {"scalars": []}
 
-    # Parse the directory containing events
     try:
         reader = SummaryReader(str(events[0].parent))
         df = reader.scalars
@@ -654,15 +1222,12 @@ def tb_metrics(site: str, mode: str, run_id: str):
 
 @app.get("/csv/{site}/{mode}/{run_id}/{filename}", response_class=HTMLResponse)
 def csv_view(site: str, mode: str, run_id: str, filename: str):
-    """Render a CSV file as a styled HTML table."""
-    # Sanitize filename to prevent directory traversal
     safe_name = Path(filename).name
     if not safe_name or safe_name != filename or ".." in filename or "/" in filename:
         return HTMLResponse("<p>Invalid filename</p>", status_code=400)
 
     validated_dir = _resolve_run_dir(site, mode, run_id)
     rd = validated_dir / "run_dir"
-    # Search recursively for the file
     matches = list(rd.rglob(safe_name)) if rd.exists() else []
     if not matches:
         return HTMLResponse("<p>File not found</p>", status_code=404)
@@ -680,7 +1245,7 @@ def csv_view(site: str, mode: str, run_id: str, filename: str):
 
     th = "".join(f"<th>{html_escape(h)}</th>" for h in headers)
     trs = ""
-    for row in data_rows[:500]:  # limit display to 500 rows
+    for row in data_rows[:500]:
         tds = "".join(f"<td>{html_escape(cell)}</td>" for cell in row)
         trs += f"<tr>{tds}</tr>\n"
 
@@ -690,6 +1255,8 @@ def csv_view(site: str, mode: str, run_id: str, filename: str):
         else ""
     )
 
+    dl_url = f"/download/{site}/{mode}/{run_id}/run_dir/{safe_name}"
+
     body = f"""
 <header>
   <h1>MediSwarm Live Monitor</h1>
@@ -703,6 +1270,10 @@ def csv_view(site: str, mode: str, run_id: str, filename: str):
 </div>
 <div class="card">
   <h2>{html_escape(safe_name)}</h2>
+  <div style="margin-bottom:0.5rem;">
+    <a class="btn btn-download" href="{dl_url}">Download CSV</a>
+    <span class="file-path">Server: {html_escape(str(csv_path))}</span>
+  </div>
   {truncated}
   <div style="overflow-x:auto;">
   <table><thead><tr>{th}</tr></thead><tbody>{trs}</tbody></table>
@@ -719,19 +1290,30 @@ def csv_view(site: str, mode: str, run_id: str, filename: str):
 
 @app.get("/api/runs", response_class=JSONResponse)
 def api_runs():
-    """Return all runs as JSON."""
     return rows()
 
 
 @app.get("/api/metrics/{site}/{mode}/{run_id}", response_class=JSONResponse)
 def api_metrics(site: str, mode: str, run_id: str):
-    """Return parsed training metrics as JSON (alias for /metrics/)."""
     text = _get_console_text(site, mode, run_id)
     return parse_console_metrics(text)
 
 
 @app.get("/api/heartbeat/{site}/{mode}/{run_id}", response_class=JSONResponse)
 def api_heartbeat(site: str, mode: str, run_id: str):
-    """Return heartbeat JSON directly."""
     run_dir = _resolve_run_dir(site, mode, run_id)
     return _read_heartbeat(run_dir)
+
+
+@app.get("/api/files/{site}/{mode}/{run_id}", response_class=JSONResponse)
+def api_files(site: str, mode: str, run_id: str):
+    """List the files of one run: ``_find_all_files`` on its resolved directory."""
+    resolved_dir = _resolve_run_dir(site, mode, run_id)
+    return _find_all_files(resolved_dir)
+
+
+@app.get("/api/summary/{site}/{mode}/{run_id}", response_class=JSONResponse)
+def api_summary(site: str, mode: str, run_id: str):
+    """Summarise a run by parsing its console text with ``_extract_training_summary``."""
+    console_text = _get_console_text(site, mode, run_id)
+    return _extract_training_summary(console_text)

From 5267f8912e758dab90f0b501ab24ef21bdadf8fb Mon Sep 17 00:00:00 2001
From: Jeff <jeffzhu6969@gmail.com>
Date: Tue, 7 Apr 2026 15:17:34 +0200
Subject: [PATCH 3/3] Fix TensorBoard chart not rendering: load Chart.js in
 head when any chart is present

Chart.js CDN script was only included inside the console metrics chart
block, so TensorBoard charts would try to use Chart() without the
library loaded when console metrics were absent.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 server_tools/app.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/server_tools/app.py b/server_tools/app.py
index 40050bed..17165b5b 100644
--- a/server_tools/app.py
+++ b/server_tools/app.py
@@ -916,7 +916,6 @@ def detail(site: str, mode: str, run_id: str):
   <h2>Training Metrics (from console)</h2>
   <div class="chart-container"><canvas id="metricsChart"></canvas></div>
 </div>
-<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
 <script>
 const metricsData = {json.dumps(metrics)};
 const ctx = document.getElementById('metricsChart').getContext('2d');
@@ -1111,9 +1110,14 @@ def detail(site: str, mode: str, run_id: str):
   </div>
 </div>
 </main>"""
+    # Include Chart.js if any chart is rendered
+    needs_chartjs = bool(chart_html) or bool(tb_html and HAS_TBPARSE and tb_events)
+    chartjs_head = '<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>' if needs_chartjs else ""
+
     return _html_page(
         f"{html_escape(site)}/{html_escape(mode)}/{html_escape(run_id)} — MediSwarm",
         body,
+        extra_head=chartjs_head,
     )