KatherLab · Ultimate-Storm · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/application/jobs/challenge_1DivideAndConquer/app/config/config_fed_client.conf b/application/jobs/challenge_1DivideAndConquer/app/config/config_fed_client.conf
@@ -42,10 +42,10 @@
           learn_task_timeout = 7200
           # time to wait for learn task to abort gracefully (5 min)
           learn_task_abort_timeout = 300
-          # time for task acknowledgment (2 min)
-          learn_task_ack_timeout = 120
-          # time for final result acknowledgment (5 min)
-          final_result_ack_timeout = 300
+          # time for task acknowledgment incl. model weight streaming (10 min)
+          learn_task_ack_timeout = 600
+          # time for final result acknowledgment (10 min)
+          final_result_ack_timeout = 600
 
           # ids must map to corresponding components
           persistor_id = "persistor"

diff --git a/application/jobs/challenge_1DivideAndConquer/app/config/config_fed_server.conf b/application/jobs/challenge_1DivideAndConquer/app/config/config_fed_server.conf
@@ -17,12 +17,12 @@ workflows = [
     args {
       # can also set aggregation clients and train clients, see class for all available args
       num_rounds = 20
-      # time for clients to configure and start (10 min)
-      start_task_timeout = 600
-      # max time without any progress before declaring failure (2 hours)
-      progress_timeout = 7200
-      # time for clients to acknowledge configuration (5 min)
-      configure_task_timeout = 300
+      # time for clients to configure and start (15 min)
+      start_task_timeout = 900
+      # max time without any progress before declaring failure (4 hours)
+      progress_timeout = 14400
+      # time for clients to acknowledge configuration (10 min)
+      configure_task_timeout = 600
       # interval for clients to report status (5 min)
       max_status_report_interval = 300
     }

diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml
@@ -803,6 +803,57 @@ docker_cln_sh: |
      ENV_VARS+=" --env LOG_DATASET_DETAILS=1"
   fi
 
+  # ── Live Sync Integration ──────────────────────────────────────────
+  # live_sync.sh is co-located in the startup directory (injected by
+  # _injectLiveSyncIntoStartupKits.sh) and syncs training artifacts to
+  # the central monitoring server for --local_training and --start_client
+  # modes.  If live_sync.sh is not present the functions are a no-op.
+  KIT_ROOT="$(cd "$DIR/.." && pwd)"
+  SYNC_STATE_DIR="$DIR/.mediswarm_sync"
+  SITE_NAME_RESOLVED="{~~client_name~~}"
+
+  _start_live_sync() {
+    local mode="$1"
+    if [ ! -f "$DIR/live_sync.sh" ]; then return; fi
+    mkdir -p "$SYNC_STATE_DIR"
+
+    if [ "$mode" = "local" ]; then
+      "$DIR/live_sync.sh" \
+        --mode local \
+        --site-name "$SITE_NAME_RESOLVED" \
+        --kit-root "$KIT_ROOT" \
+        --startup-dir "$DIR" \
+        --scratch-dir "${MY_SCRATCH_DIR:-}" &
+      LIVE_SYNC_PID=$!
+    elif [ "$mode" = "swarm" ]; then
+      local pid_file="$SYNC_STATE_DIR/swarm_sync.pid"
+      if [ -f "$pid_file" ]; then
+        local old_pid
+        old_pid="$(cat "$pid_file" 2>/dev/null || true)"
+        if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
+          echo "Live sync daemon already running (PID $old_pid)"
+          return 0
+        fi
+      fi
+      nohup "$DIR/live_sync.sh" \
+        --mode swarm \
+        --site-name "$SITE_NAME_RESOLVED" \
+        --kit-root "$KIT_ROOT" \
+        --startup-dir "$DIR" \
+        --scratch-dir "${MY_SCRATCH_DIR:-}" \
+        > "$SYNC_STATE_DIR/live_sync_daemon.log" 2>&1 < /dev/null &
+      echo $! > "$pid_file"
+      echo "Started live sync daemon (PID $(cat "$pid_file"))"
+    fi
+  }
+
+  _stop_live_sync() {
+    if [ -n "${LIVE_SYNC_PID:-}" ] && kill -0 "$LIVE_SYNC_PID" 2>/dev/null; then
+      kill "$LIVE_SYNC_PID" || true
+      wait "$LIVE_SYNC_PID" || true
+    fi
+  }
+
   # Execution modes
   if [ -n "$DUMMY_TRAINING" ]; then
       docker run --rm $TTY_OPT $DOCKER_OPTIONS $ENV_VARS --env TRAINING_MODE=local_training $DOCKER_IMAGE \
@@ -815,10 +866,13 @@ docker_cln_sh: |
 
   elif [ -n "$LOCAL_TRAINING" ]; then
       echo "[INFO] Local training using job: $JOB_NAME"
+      trap _stop_live_sync EXIT INT TERM
+      _start_live_sync local
       docker run --rm $TTY_OPT $DOCKER_OPTIONS $ENV_VARS --env TRAINING_MODE=local_training --env NUM_EPOCHS=100 $DOCKER_IMAGE \
       /bin/bash -c "/MediSwarm/application/jobs/${JOB_NAME}/app/custom/main.py"
 
   elif [ -n "$START_CLIENT" ]; then
+      _start_live_sync swarm
       docker run -d -t --restart=on-failure:5 \
       --health-cmd="nvidia-smi > /dev/null 2>&1 || exit 1" \
       --health-interval=120s --health-start-period=180s --health-retries=3 \

diff --git a/kit_live_sync/build_heartbeat.sh b/kit_live_sync/build_heartbeat.sh
@@ -11,6 +11,14 @@ OUT_FILE="${7:-/tmp/mediswarm_heartbeat.json}"
 
 timestamp="$(date -u +%FT%TZ)"
 
+# Extract kit version from docker.sh (baked in at build time)
+kit_version=""
+docker_sh="${KIT_ROOT:+$KIT_ROOT/startup/docker.sh}"
+if [ -n "$docker_sh" ] && [ -f "$docker_sh" ]; then
+  kit_version="$(grep -oP '(?<=MEDISWARM_VERSION=)\S+' "$docker_sh" 2>/dev/null | head -1 || true)"
+  [ -n "$kit_version" ] || kit_version="$(grep -oP '(?<=jefftud/odelia:)\S+' "$docker_sh" 2>/dev/null | head -1 || true)"
+fi
+
 log_file=""
 console_file=""
 global_model=""
@@ -49,6 +57,7 @@ cat > "$OUT_FILE" <<EOF
   "run_name": "$RUN_NAME",
   "timestamp": "$timestamp",
   "status": "$STATUS",
+  "kit_version": "$kit_version",
   "kit_root": "$KIT_ROOT",
   "log_file": "$log_file",
   "console_file": "$console_file",

diff --git a/kit_live_sync/live_sync.sh b/kit_live_sync/live_sync.sh
@@ -32,6 +32,9 @@ mkdir -p "$STATE_DIR"
 LAST_CKPT_SYNC_FILE="$STATE_DIR/${MODE}_last_ckpt_sync_ts"
 touch "$LAST_CKPT_SYNC_FILE"
 
+# Track the run we are currently syncing so we can finalize it if a new run appears
+CURRENT_LOCAL_RUN=""
+
 ssh_cmd() {
   ssh ${SSH_OPTS} "$@"
 }
@@ -86,12 +89,42 @@ build_remote_dir() {
   printf '%s/%s/%s/%s' "$REMOTE_BASE" "$SITE_NAME" "$MODE" "$run_id"
 }
 
+_finalize_local_run() {
+  # Write a heartbeat_final.json for a given run_name and mark it "finished"
+  local old_run="$1"
+  [ -n "$old_run" ] || return 0
+
+  local old_remote_dir old_hb_final old_run_dir
+  old_remote_dir="$(build_remote_dir "$old_run")"
+  old_hb_final="$STATE_DIR/local_heartbeat_final_${old_run}.json"
+  export SCRATCHDIR
+  "$SCRIPT_DIR/build_heartbeat.sh" "$SITE_NAME" "local" "$KIT_ROOT" "" "$old_run" "finished" "$old_hb_final" >/dev/null
+  rsync_cmd "$old_hb_final" "${REMOTE_USER}@${REMOTE_HOST}:${old_remote_dir}/heartbeat_final.json" || true
+
+  # Final sync of the old run's artifacts
+  old_run_dir=""
+  if [ -n "$SCRATCHDIR" ] && [ -d "$SCRATCHDIR/runs/$SITE_NAME/$old_run" ]; then
+    old_run_dir="$SCRATCHDIR/runs/$SITE_NAME/$old_run"
+  elif [ -d "$STARTUP_DIR/runs/$SITE_NAME/$old_run" ]; then
+    old_run_dir="$STARTUP_DIR/runs/$SITE_NAME/$old_run"
+  fi
+  if [ -n "$old_run_dir" ]; then
+    rsync_cmd "$old_run_dir/" "${REMOTE_USER}@${REMOTE_HOST}:${old_remote_dir}/run_dir/" || true
+  fi
+}
+
 sync_local() {
   local run_name run_dir remote_dir hb_file now last
 
   run_name="$(find_latest_local_run_name || true)"
   [ -n "$run_name" ] || return 0
 
+  # If the run changed (new training started), finalize the old one
+  if [ -n "$CURRENT_LOCAL_RUN" ] && [ "$CURRENT_LOCAL_RUN" != "$run_name" ]; then
+    _finalize_local_run "$CURRENT_LOCAL_RUN" || true
+  fi
+  CURRENT_LOCAL_RUN="$run_name"
+
   # Determine run_dir: check scratch dir first, fall back to startup dir
   run_dir=""
   if [ -n "$SCRATCHDIR" ] && [ -d "$SCRATCHDIR/runs/$SITE_NAME/$run_name" ]; then
@@ -105,6 +138,7 @@ sync_local() {
   ensure_remote_dir "$remote_dir"
 
   hb_file="$STATE_DIR/local_heartbeat.json"
+  export SCRATCHDIR
   "$SCRIPT_DIR/build_heartbeat.sh" "$SITE_NAME" "local" "$KIT_ROOT" "" "$run_name" "running" "$hb_file" >/dev/null
 
   rsync_cmd "$hb_file" "${REMOTE_USER}@${REMOTE_HOST}:${remote_dir}/heartbeat.json" || true
@@ -210,6 +244,7 @@ final_sync() {
 
       remote_dir="$(build_remote_dir "$run_name")"
       hb_file="$STATE_DIR/local_heartbeat_final.json"
+      export SCRATCHDIR
       "$SCRIPT_DIR/build_heartbeat.sh" "$SITE_NAME" "local" "$KIT_ROOT" "" "$run_name" "finished" "$hb_file" >/dev/null
       rsync_cmd "$hb_file" "${REMOTE_USER}@${REMOTE_HOST}:${remote_dir}/heartbeat_final.json" || true
       if [ -n "$run_dir" ]; then

diff --git a/scripts/build/_injectLiveSyncIntoStartupKits.sh b/scripts/build/_injectLiveSyncIntoStartupKits.sh
@@ -37,116 +37,12 @@ find "$TARGET_FOLDER" -mindepth 1 -maxdepth 1 -type d | while read -r KIT_DIR; d
 
   chmod +x "$STARTUP_DIR/build_heartbeat.sh" "$STARTUP_DIR/live_sync.sh"
 
-  if [ ! -f "$STARTUP_DIR/docker_original.sh" ]; then
-    mv "$ORIGINAL_DOCKER_SH" "$STARTUP_DIR/docker_original.sh"
-    chmod +x "$STARTUP_DIR/docker_original.sh"
+  # Clean up legacy docker_original.sh wrapper if present from a previous build
+  if [ -f "$STARTUP_DIR/docker_original.sh" ]; then
+    rm -f "$STARTUP_DIR/docker_original.sh"
   fi
 
-  cat > "$STARTUP_DIR/docker.sh" <<'EOF'
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-STARTUP_DIR="$SCRIPT_DIR"
-KIT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
-ORIGINAL="$SCRIPT_DIR/docker_original.sh"
-STATE_DIR="$STARTUP_DIR/.mediswarm_sync"
-mkdir -p "$STATE_DIR"
-
-MODE=""
-SCRATCHDIR=""
-SITE_NAME_FROM_ARGS=""
-
-parse_args() {
-  while [ "$#" -gt 0 ]; do
-    case "$1" in
-      --local_training)
-        MODE="local"
-        shift
-        ;;
-      --start_client)
-        MODE="swarm"
-        shift
-        ;;
-      --scratch_dir)
-        SCRATCHDIR="${2:-}"
-        shift 2
-        ;;
-      --site_name)
-        SITE_NAME_FROM_ARGS="${2:-}"
-        shift 2
-        ;;
-      *)
-        shift
-        ;;
-    esac
-  done
-}
-
-parse_args "$@"
-
-SITE_NAME_FALLBACK="$(basename "$KIT_ROOT")"
-if [ -n "$SITE_NAME_FROM_ARGS" ]; then
-  export SITE_NAME="$SITE_NAME_FROM_ARGS"
-elif [ -z "${SITE_NAME:-}" ]; then
-  export SITE_NAME="$SITE_NAME_FALLBACK"
-fi
-
-start_local_sync() {
-  "$SCRIPT_DIR/live_sync.sh" \
-    --mode local \
-    --site-name "$SITE_NAME" \
-    --kit-root "$KIT_ROOT" \
-    --startup-dir "$STARTUP_DIR" \
-    --scratch-dir "${SCRATCHDIR:-}" &
-  SYNC_PID=$!
-}
-
-stop_local_sync() {
-  if [ -n "${SYNC_PID:-}" ] && kill -0 "$SYNC_PID" >/dev/null 2>&1; then
-    kill "$SYNC_PID" || true
-    wait "$SYNC_PID" || true
-  fi
-}
-
-start_swarm_sync_daemon() {
-  local pid_file="$STATE_DIR/swarm_sync.pid"
-
-  if [ -f "$pid_file" ]; then
-    old_pid="$(cat "$pid_file" 2>/dev/null || true)"
-    if [ -n "$old_pid" ] && kill -0 "$old_pid" >/dev/null 2>&1; then
-      echo "Live sync daemon already running with PID $old_pid"
-      return 0
-    fi
-  fi
-
-  nohup "$SCRIPT_DIR/live_sync.sh" \
-    --mode swarm \
-    --site-name "$SITE_NAME" \
-    --kit-root "$KIT_ROOT" \
-    --startup-dir "$STARTUP_DIR" \
-    --scratch-dir "${SCRATCHDIR:-}" \
-    > "$STATE_DIR/live_sync_daemon.log" 2>&1 < /dev/null &
-
-  echo $! > "$pid_file"
-  echo "Started live sync daemon with PID $(cat "$pid_file")"
-}
-
-if [ "$MODE" = "local" ]; then
-  trap stop_local_sync EXIT INT TERM
-  start_local_sync
-  exec "$ORIGINAL" "$@"
-elif [ "$MODE" = "swarm" ]; then
-  start_swarm_sync_daemon
-  exec "$ORIGINAL" "$@"
-else
-  exec "$ORIGINAL" "$@"
-fi
-EOF
-
-  chmod +x "$STARTUP_DIR/docker.sh"
-
-  echo "Patched $STARTUP_DIR/docker.sh"
+  echo "Injected live sync helpers into $STARTUP_DIR"
 done
 
 echo "Live sync injection finished"
diff --git a/scripts/ci/runIntegrationTests.sh b/scripts/ci/runIntegrationTests.sh
@@ -732,12 +732,25 @@ cleanup_synthetic_data () {
 }
 
 
+# Helper: remove a directory that may contain root-owned files created by
+# Docker containers.  Falls back to a disposable Alpine container when the
+# regular `rm` fails (which happens on the CI runner because NVFlare runs
+# as root inside its container and bind-mounts the workspace).
+_rm_rf () {
+    local dir="$1"
+    [ -z "$dir" ] && return 0
+    [ ! -e "$dir" ] && return 0
+    rm -rf "$dir" 2>/dev/null || \
+        docker run --rm -v "$(cd "$(dirname "$dir")" && pwd)":/ws alpine \
+            rm -rf "/ws/$(basename "$dir")"
+}
+
 cleanup_temporary_data () {
     echo "[Cleanup] Removing synthetic data directory, scratch directory, dummy workspace ..."
-    rm -rf "$SYNTHETIC_DATA_DIR"
-    rm -rf "$STAMP_SYNTHETIC_DATA_DIR"
-    rm -rf "$SCRATCH_DIR"
-    rm -rf "$PROJECT_DIR"
+    _rm_rf "$SYNTHETIC_DATA_DIR"
+    _rm_rf "$STAMP_SYNTHETIC_DATA_DIR"
+    _rm_rf "$SCRATCH_DIR"
+    _rm_rf "$PROJECT_DIR"
 }