Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@
learn_task_timeout = 7200
# time to wait for learn task to abort gracefully (5 min)
learn_task_abort_timeout = 300
# time for task acknowledgment (2 min)
learn_task_ack_timeout = 120
# time for final result acknowledgment (5 min)
final_result_ack_timeout = 300
# time for task acknowledgment incl. model weight streaming (10 min)
learn_task_ack_timeout = 600
# time for final result acknowledgment (10 min)
final_result_ack_timeout = 600

# ids must map to corresponding components
persistor_id = "persistor"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ workflows = [
args {
# can also set aggregation clients and train clients, see class for all available args
num_rounds = 20
# time for clients to configure and start (10 min)
start_task_timeout = 600
# max time without any progress before declaring failure (2 hours)
progress_timeout = 7200
# time for clients to acknowledge configuration (5 min)
configure_task_timeout = 300
# time for clients to configure and start (15 min)
start_task_timeout = 900
# max time without any progress before declaring failure (4 hours)
progress_timeout = 14400
# time for clients to acknowledge configuration (10 min)
configure_task_timeout = 600
# interval for clients to report status (5 min)
max_status_report_interval = 300
}
Expand Down
54 changes: 54 additions & 0 deletions docker_config/master_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,57 @@ docker_cln_sh: |
ENV_VARS+=" --env LOG_DATASET_DETAILS=1"
fi

# ── Live Sync Integration ──────────────────────────────────────────
# live_sync.sh is co-located in the startup directory (injected by
# _injectLiveSyncIntoStartupKits.sh) and syncs training artifacts to
# the central monitoring server for --local_training and --start_client
# modes. If live_sync.sh is not present the functions are a no-op.
KIT_ROOT="$(cd "$DIR/.." && pwd)"
SYNC_STATE_DIR="$DIR/.mediswarm_sync"
SITE_NAME_RESOLVED="{~~client_name~~}"

_start_live_sync() {
local mode="$1"
if [ ! -f "$DIR/live_sync.sh" ]; then return; fi
mkdir -p "$SYNC_STATE_DIR"

if [ "$mode" = "local" ]; then
"$DIR/live_sync.sh" \
--mode local \
--site-name "$SITE_NAME_RESOLVED" \
--kit-root "$KIT_ROOT" \
--startup-dir "$DIR" \
--scratch-dir "${MY_SCRATCH_DIR:-}" &
LIVE_SYNC_PID=$!
elif [ "$mode" = "swarm" ]; then
local pid_file="$SYNC_STATE_DIR/swarm_sync.pid"
if [ -f "$pid_file" ]; then
local old_pid
old_pid="$(cat "$pid_file" 2>/dev/null || true)"
if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
echo "Live sync daemon already running (PID $old_pid)"
return 0
fi
fi
nohup "$DIR/live_sync.sh" \
--mode swarm \
--site-name "$SITE_NAME_RESOLVED" \
--kit-root "$KIT_ROOT" \
--startup-dir "$DIR" \
--scratch-dir "${MY_SCRATCH_DIR:-}" \
> "$SYNC_STATE_DIR/live_sync_daemon.log" 2>&1 < /dev/null &
echo $! > "$pid_file"
echo "Started live sync daemon (PID $(cat "$pid_file"))"
fi
}

_stop_live_sync() {
if [ -n "${LIVE_SYNC_PID:-}" ] && kill -0 "$LIVE_SYNC_PID" 2>/dev/null; then
kill "$LIVE_SYNC_PID" || true
wait "$LIVE_SYNC_PID" || true
fi
}

# Execution modes
if [ -n "$DUMMY_TRAINING" ]; then
docker run --rm $TTY_OPT $DOCKER_OPTIONS $ENV_VARS --env TRAINING_MODE=local_training $DOCKER_IMAGE \
Expand All @@ -815,10 +866,13 @@ docker_cln_sh: |

elif [ -n "$LOCAL_TRAINING" ]; then
echo "[INFO] Local training using job: $JOB_NAME"
trap _stop_live_sync EXIT INT TERM
_start_live_sync local
docker run --rm $TTY_OPT $DOCKER_OPTIONS $ENV_VARS --env TRAINING_MODE=local_training --env NUM_EPOCHS=100 $DOCKER_IMAGE \
/bin/bash -c "/MediSwarm/application/jobs/${JOB_NAME}/app/custom/main.py"

elif [ -n "$START_CLIENT" ]; then
_start_live_sync swarm
docker run -d -t --restart=on-failure:5 \
--health-cmd="nvidia-smi > /dev/null 2>&1 || exit 1" \
--health-interval=120s --health-start-period=180s --health-retries=3 \
Expand Down
9 changes: 9 additions & 0 deletions kit_live_sync/build_heartbeat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ OUT_FILE="${7:-/tmp/mediswarm_heartbeat.json}"

timestamp="$(date -u +%FT%TZ)"

# Extract kit version from docker.sh (baked in at build time)
kit_version=""
docker_sh="${KIT_ROOT:+$KIT_ROOT/startup/docker.sh}"
if [ -n "$docker_sh" ] && [ -f "$docker_sh" ]; then
kit_version="$(grep -oP '(?<=MEDISWARM_VERSION=)\S+' "$docker_sh" 2>/dev/null | head -1 || true)"
[ -n "$kit_version" ] || kit_version="$(grep -oP '(?<=jefftud/odelia:)\S+' "$docker_sh" 2>/dev/null | head -1 || true)"
fi

log_file=""
console_file=""
global_model=""
Expand Down Expand Up @@ -49,6 +57,7 @@ cat > "$OUT_FILE" <<EOF
"run_name": "$RUN_NAME",
"timestamp": "$timestamp",
"status": "$STATUS",
"kit_version": "$kit_version",
"kit_root": "$KIT_ROOT",
"log_file": "$log_file",
"console_file": "$console_file",
Expand Down
35 changes: 35 additions & 0 deletions kit_live_sync/live_sync.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ mkdir -p "$STATE_DIR"
LAST_CKPT_SYNC_FILE="$STATE_DIR/${MODE}_last_ckpt_sync_ts"
touch "$LAST_CKPT_SYNC_FILE"

# Track the run we are currently syncing so we can finalize it if a new run appears
CURRENT_LOCAL_RUN=""

ssh_cmd() {
ssh ${SSH_OPTS} "$@"
}
Expand Down Expand Up @@ -86,12 +89,42 @@ build_remote_dir() {
printf '%s/%s/%s/%s' "$REMOTE_BASE" "$SITE_NAME" "$MODE" "$run_id"
}

_finalize_local_run() {
# Write a heartbeat_final.json for a given run_name and mark it "finished"
local old_run="$1"
[ -n "$old_run" ] || return 0

local old_remote_dir old_hb_final old_run_dir
old_remote_dir="$(build_remote_dir "$old_run")"
old_hb_final="$STATE_DIR/local_heartbeat_final_${old_run}.json"
export SCRATCHDIR
"$SCRIPT_DIR/build_heartbeat.sh" "$SITE_NAME" "local" "$KIT_ROOT" "" "$old_run" "finished" "$old_hb_final" >/dev/null
rsync_cmd "$old_hb_final" "${REMOTE_USER}@${REMOTE_HOST}:${old_remote_dir}/heartbeat_final.json" || true

# Final sync of the old run's artifacts
old_run_dir=""
if [ -n "$SCRATCHDIR" ] && [ -d "$SCRATCHDIR/runs/$SITE_NAME/$old_run" ]; then
old_run_dir="$SCRATCHDIR/runs/$SITE_NAME/$old_run"
elif [ -d "$STARTUP_DIR/runs/$SITE_NAME/$old_run" ]; then
old_run_dir="$STARTUP_DIR/runs/$SITE_NAME/$old_run"
fi
if [ -n "$old_run_dir" ]; then
rsync_cmd "$old_run_dir/" "${REMOTE_USER}@${REMOTE_HOST}:${old_remote_dir}/run_dir/" || true
fi
}

sync_local() {
local run_name run_dir remote_dir hb_file now last

run_name="$(find_latest_local_run_name || true)"
[ -n "$run_name" ] || return 0

# If the run changed (new training started), finalize the old one
if [ -n "$CURRENT_LOCAL_RUN" ] && [ "$CURRENT_LOCAL_RUN" != "$run_name" ]; then
_finalize_local_run "$CURRENT_LOCAL_RUN" || true
fi
CURRENT_LOCAL_RUN="$run_name"

# Determine run_dir: check scratch dir first, fall back to startup dir
run_dir=""
if [ -n "$SCRATCHDIR" ] && [ -d "$SCRATCHDIR/runs/$SITE_NAME/$run_name" ]; then
Expand All @@ -105,6 +138,7 @@ sync_local() {
ensure_remote_dir "$remote_dir"

hb_file="$STATE_DIR/local_heartbeat.json"
export SCRATCHDIR
"$SCRIPT_DIR/build_heartbeat.sh" "$SITE_NAME" "local" "$KIT_ROOT" "" "$run_name" "running" "$hb_file" >/dev/null

rsync_cmd "$hb_file" "${REMOTE_USER}@${REMOTE_HOST}:${remote_dir}/heartbeat.json" || true
Expand Down Expand Up @@ -210,6 +244,7 @@ final_sync() {

remote_dir="$(build_remote_dir "$run_name")"
hb_file="$STATE_DIR/local_heartbeat_final.json"
export SCRATCHDIR
"$SCRIPT_DIR/build_heartbeat.sh" "$SITE_NAME" "local" "$KIT_ROOT" "" "$run_name" "finished" "$hb_file" >/dev/null
rsync_cmd "$hb_file" "${REMOTE_USER}@${REMOTE_HOST}:${remote_dir}/heartbeat_final.json" || true
if [ -n "$run_dir" ]; then
Expand Down
112 changes: 4 additions & 108 deletions scripts/build/_injectLiveSyncIntoStartupKits.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,116 +37,12 @@ find "$TARGET_FOLDER" -mindepth 1 -maxdepth 1 -type d | while read -r KIT_DIR; d

chmod +x "$STARTUP_DIR/build_heartbeat.sh" "$STARTUP_DIR/live_sync.sh"

if [ ! -f "$STARTUP_DIR/docker_original.sh" ]; then
mv "$ORIGINAL_DOCKER_SH" "$STARTUP_DIR/docker_original.sh"
chmod +x "$STARTUP_DIR/docker_original.sh"
# Clean up legacy docker_original.sh wrapper if present from a previous build
if [ -f "$STARTUP_DIR/docker_original.sh" ]; then
rm -f "$STARTUP_DIR/docker_original.sh"
fi

cat > "$STARTUP_DIR/docker.sh" <<'EOF'
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STARTUP_DIR="$SCRIPT_DIR"
KIT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ORIGINAL="$SCRIPT_DIR/docker_original.sh"
STATE_DIR="$STARTUP_DIR/.mediswarm_sync"
mkdir -p "$STATE_DIR"

MODE=""
SCRATCHDIR=""
SITE_NAME_FROM_ARGS=""

parse_args() {
while [ "$#" -gt 0 ]; do
case "$1" in
--local_training)
MODE="local"
shift
;;
--start_client)
MODE="swarm"
shift
;;
--scratch_dir)
SCRATCHDIR="${2:-}"
shift 2
;;
--site_name)
SITE_NAME_FROM_ARGS="${2:-}"
shift 2
;;
*)
shift
;;
esac
done
}

parse_args "$@"

SITE_NAME_FALLBACK="$(basename "$KIT_ROOT")"
if [ -n "$SITE_NAME_FROM_ARGS" ]; then
export SITE_NAME="$SITE_NAME_FROM_ARGS"
elif [ -z "${SITE_NAME:-}" ]; then
export SITE_NAME="$SITE_NAME_FALLBACK"
fi

start_local_sync() {
"$SCRIPT_DIR/live_sync.sh" \
--mode local \
--site-name "$SITE_NAME" \
--kit-root "$KIT_ROOT" \
--startup-dir "$STARTUP_DIR" \
--scratch-dir "${SCRATCHDIR:-}" &
SYNC_PID=$!
}

stop_local_sync() {
if [ -n "${SYNC_PID:-}" ] && kill -0 "$SYNC_PID" >/dev/null 2>&1; then
kill "$SYNC_PID" || true
wait "$SYNC_PID" || true
fi
}

start_swarm_sync_daemon() {
local pid_file="$STATE_DIR/swarm_sync.pid"

if [ -f "$pid_file" ]; then
old_pid="$(cat "$pid_file" 2>/dev/null || true)"
if [ -n "$old_pid" ] && kill -0 "$old_pid" >/dev/null 2>&1; then
echo "Live sync daemon already running with PID $old_pid"
return 0
fi
fi

nohup "$SCRIPT_DIR/live_sync.sh" \
--mode swarm \
--site-name "$SITE_NAME" \
--kit-root "$KIT_ROOT" \
--startup-dir "$STARTUP_DIR" \
--scratch-dir "${SCRATCHDIR:-}" \
> "$STATE_DIR/live_sync_daemon.log" 2>&1 < /dev/null &

echo $! > "$pid_file"
echo "Started live sync daemon with PID $(cat "$pid_file")"
}

if [ "$MODE" = "local" ]; then
trap stop_local_sync EXIT INT TERM
start_local_sync
exec "$ORIGINAL" "$@"
elif [ "$MODE" = "swarm" ]; then
start_swarm_sync_daemon
exec "$ORIGINAL" "$@"
else
exec "$ORIGINAL" "$@"
fi
EOF

chmod +x "$STARTUP_DIR/docker.sh"

echo "Patched $STARTUP_DIR/docker.sh"
echo "Injected live sync helpers into $STARTUP_DIR"
done

echo "Live sync injection finished"
21 changes: 17 additions & 4 deletions scripts/ci/runIntegrationTests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -732,12 +732,25 @@ cleanup_synthetic_data () {
}


# Helper: remove a directory that may contain root-owned files created by
# Docker containers. Falls back to a disposable Alpine container when the
# regular `rm` fails (which happens on the CI runner because NVFlare runs
# as root inside its container and bind-mounts the workspace).
_rm_rf () {
local dir="$1"
[ -z "$dir" ] && return 0
[ ! -e "$dir" ] && return 0
rm -rf "$dir" 2>/dev/null || \
docker run --rm -v "$(cd "$(dirname "$dir")" && pwd)":/ws alpine \
rm -rf "/ws/$(basename "$dir")"
}

cleanup_temporary_data () {
echo "[Cleanup] Removing synthetic data directory, scratch directory, dummy workspace ..."
rm -rf "$SYNTHETIC_DATA_DIR"
rm -rf "$STAMP_SYNTHETIC_DATA_DIR"
rm -rf "$SCRATCH_DIR"
rm -rf "$PROJECT_DIR"
_rm_rf "$SYNTHETIC_DATA_DIR"
_rm_rf "$STAMP_SYNTHETIC_DATA_DIR"
_rm_rf "$SCRATCH_DIR"
_rm_rf "$PROJECT_DIR"
}


Expand Down
Loading
Loading