diff --git a/.github/workflows/odelia-deploy-test.yml b/.github/workflows/odelia-deploy-test.yml new file mode 100644 index 00000000..4438d87c --- /dev/null +++ b/.github/workflows/odelia-deploy-test.yml @@ -0,0 +1,55 @@ +name: ODELIA Deploy Test + +on: + workflow_dispatch: # Manual trigger + schedule: + - cron: '0 2 * * 1' # Weekly Monday 2am UTC + +permissions: + contents: read + +concurrency: + group: odelia-deploy-test + cancel-in-progress: false # Don't cancel long-running swarm training + +jobs: + deploy-test: + # Must run on Cosmos — it orchestrates dl0/dl2/dl3 via SSH. + # dl0, dl2, dl3 are also self-hosted runners but they are the remote + # *clients* managed by Cosmos, so they must NOT pick up this job. + # Add the "cosmos" label to the Cosmos runner in: + # GitHub → repo Settings → Actions → Runners → Edit + runs-on: [self-hosted, cosmos] + timeout-minutes: 1440 # 24 hours (6 models x ~2-4h each) + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 0 + + - name: Build Docker image + startup kits + run: | + ./scripts/build/buildDockerImageAndStartupKits.sh \ + -p application/provision/project_deploy_test_4site.yml \ + --use-docker-cache + + - name: Run all 6 models deploy test + run: | + ./scripts/deploy/run_deploy_test.sh \ + --all \ + --conf deploy_sites_4node_test.conf + + - name: Upload results + uses: actions/upload-artifact@v4 + if: always() + with: + name: deploy-test-results + path: workspace/deploy_test_results/ + + - name: Kill orphaned containers (cleanup) + if: always() + run: | + # Stop and remove any lingering containers from this test run + docker ps --format '{{.Names}}' | grep -E 'odelia|stamp|nvflare' | xargs -r docker kill 2>/dev/null || true + docker ps -a --format '{{.Names}}' | grep -E 'odelia|stamp|nvflare' | xargs -r docker rm -f 2>/dev/null || true diff --git a/.gitignore b/.gitignore index 3ed9cc73..2ad63b72 100644 --- a/.gitignore +++ b/.gitignore @@ -188,6 +188,7 @@ docker_config/torch_home_cache # 
Deployment credentials (contains passwords — never commit) deploy_sites.conf +deploy_sites_4node_test.conf # Build output and logs challenge_docker_generation.txt diff --git a/README.md b/README.md index 02cab581..c8332dc9 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ the ODELIA consortium. [![PR Tests](https://github.com/KatherLab/MediSwarm/actions/workflows/pr-test.yaml/badge.svg)](https://github.com/KatherLab/MediSwarm/actions/workflows/pr-test.yaml) [![Build](https://github.com/KatherLab/MediSwarm/actions/workflows/update-apt-versions.yml/badge.svg)](https://github.com/KatherLab/MediSwarm/actions/workflows/update-apt-versions.yml) +[![Deploy Test](https://github.com/KatherLab/MediSwarm/actions/workflows/odelia-deploy-test.yml/badge.svg)](https://github.com/KatherLab/MediSwarm/actions/workflows/odelia-deploy-test.yml) ## What is MediSwarm? diff --git a/application/provision/project_deploy_test_4site.yml b/application/provision/project_deploy_test_4site.yml new file mode 100644 index 00000000..4212ad9e --- /dev/null +++ b/application/provision/project_deploy_test_4site.yml @@ -0,0 +1,44 @@ +api_version: 3 +name: odelia_deploy_test___REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS___model_test +description: ODELIA 4-site deploy test across Tailscale VPN (Cosmos, dl0, dl2, dl3) + +participants: + - name: dl3.tud.de + type: server + org: TUD + fed_learn_port: 8002 + admin_port: 8003 + - name: UMCU_1 + type: client + org: TUD + - name: RUMC_1 + type: client + org: TUD + - name: MHA_1 + type: client + org: TUD + - name: CAM_1 + type: client + org: TUD + - name: jiefu.zhu@tu-dresden.de + type: admin + org: TUD + role: project_admin + +builders: + - path: nvflare.lighter.impl.workspace.WorkspaceBuilder + args: + template_file: master_template.yml + - path: nvflare.lighter.impl.template.TemplateBuilder + - path: nvflare.lighter.impl.static_file.StaticFileBuilder + args: + config_folder: config + scheme: http + docker_image: 
jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__ + overseer_agent: + path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent + overseer_exists: false + args: + sp_end_point: dl3.tud.de:8002:8003 + - path: nvflare.lighter.impl.cert.CertBuilder + - path: nvflare.lighter.impl.signature.SignatureBuilder diff --git a/scripts/deploy/distribute_data.sh b/scripts/deploy/distribute_data.sh new file mode 100755 index 00000000..41221150 --- /dev/null +++ b/scripts/deploy/distribute_data.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env bash +# ============================================================================ +# distribute_data.sh — One-time data distribution for 4-site ODELIA deploy test +# +# Copies institution data from dl3 (source of truth) to dl0 and dl2 via +# Cosmos as an intermediary. Cosmos already has all data locally. +# +# Data source: dl3:/mnt/swarm_alpha/Odelia_challange/ODELIA_Challenge_unilateral/ +# Layout per institution: {INSTITUTION}/data_unilateral/ + metadata_unilateral/ +# +# Target layout after distribution: +# Cosmos: /mnt/sda1/ODELIA_Challenge_unilateral/{UMCU_1,UKA_1}/ (already present) +# dl0: /mnt/scratch/odelia_data/RUMC_1/ (copied from dl3) +# dl2: /mnt/scratch/odelia_data/MHA_1/ (copied from dl3) +# dl3: /mnt/swarm_alpha/.../CAM_1/ (already present) +# +# Usage: +# ./scripts/deploy/distribute_data.sh [--dry-run] +# ============================================================================ + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)"

# ── Colors ─────────────────────────────────────────────────────────────────
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'

# Logging helpers: colored, labeled output. err() writes to stderr.
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
ok()   { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err()  { echo -e "${RED}[ERROR]${NC} $*" >&2; }
step() { echo -e "\n${BOLD}=== $* ===${NC}"; }

# --dry-run: report what would be copied without touching any machine.
DRY_RUN=false
if [[ "${1:-}" == "--dry-run" ]]; then
  DRY_RUN=true
  warn "DRY RUN MODE — no files will be copied"
fi

# ── Configuration ──────────────────────────────────────────────────────────
# SECURITY FIX: the SSH passwords were previously hard-coded here in plain
# text (and therefore committed to version control — the very thing the
# repo's .gitignore comment forbids). They must now be supplied via
# environment variables; ${VAR:?msg} aborts with a clear error when a
# variable is unset or empty. Any credential that was ever committed should
# be rotated.

# Source machine (dl3) — has all institution data
DL3_HOST="100.126.224.113"
DL3_USER="swarm"
DL3_PASS="${DL3_PASS:?Set DL3_PASS in the environment - never hard-code SSH passwords}"
# NOTE(review): "Odelia_challange" looks misspelled but appears to be the
# actual directory name on dl3 — confirm before "fixing" the path.
DL3_DATA_ROOT="/mnt/swarm_alpha/Odelia_challange/ODELIA_Challenge_unilateral"

# Target machines
DL0_HOST="100.127.161.36"
DL0_USER="swarm"
DL0_PASS="${DL0_PASS:?Set DL0_PASS in the environment - never hard-code SSH passwords}"
DL0_DATA_DIR="/mnt/dlhd0/odelia_data"

DL2_HOST="100.64.251.72"
DL2_USER="swarm"
DL2_PASS="${DL2_PASS:?Set DL2_PASS in the environment - never hard-code SSH passwords}"
DL2_DATA_DIR="/mnt/sda1/odelia_data"

# Cosmos local data (already present)
COSMOS_DATA_DIR="/mnt/sda1/ODELIA_Challenge_unilateral"

# NOTE(review): StrictHostKeyChecking=no disables MITM protection; presumably
# acceptable only because all hosts are reached over the private Tailscale
# mesh — confirm this assumption.
SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"

# ── Check dependencies ─────────────────────────────────────────────────────
if ! command -v sshpass &>/dev/null; then
  err "sshpass is required. 
Install with: sudo apt-get install sshpass" + exit 1 +fi + +# ── Verify source data exists on dl3 ────────────────────────────────────── +step "Verifying source data on dl3 ($DL3_HOST)" + +for inst in CAM_1 MHA_1 RUMC_1 UKA_1 UMCU_1; do + if sshpass -p "$DL3_PASS" ssh $SSH_OPTS "$DL3_USER@$DL3_HOST" \ + "test -d '$DL3_DATA_ROOT/$inst/data_unilateral'"; then + ok " $inst/data_unilateral exists on dl3" + else + err " $inst/data_unilateral NOT FOUND on dl3" + exit 1 + fi +done + +# ── Verify Cosmos local data ────────────────────────────────────────────── +step "Verifying local data on Cosmos" + +for inst in UMCU_1 UKA_1; do + if [[ -d "$COSMOS_DATA_DIR/$inst/data_unilateral" ]]; then + ok " $inst/data_unilateral exists locally" + else + err " $inst/data_unilateral NOT FOUND at $COSMOS_DATA_DIR/$inst/" + exit 1 + fi +done + +# ── Helper: copy data from dl3 to a target machine via Cosmos ───────────── +# Strategy: SSH into dl3 and run scp FROM dl3 TO the target machine directly. +# This avoids downloading to Cosmos and re-uploading (which would be slow for +# large NIfTI datasets). 
+copy_data_via_dl3() { + local institution=$1 + local target_host=$2 + local target_user=$3 + local target_pass=$4 + local target_dir=$5 + + info "Copying $institution from dl3 → $target_host:$target_dir/$institution/" + + if $DRY_RUN; then + info " [DRY RUN] Would create $target_dir on $target_host" + info " [DRY RUN] Would scp $DL3_DATA_ROOT/$institution → $target_host:$target_dir/" + return + fi + + # Create target directory on destination machine + sshpass -p "$target_pass" ssh $SSH_OPTS "$target_user@$target_host" \ + "mkdir -p '$target_dir'" + + # Check if data already exists on target + if sshpass -p "$target_pass" ssh $SSH_OPTS "$target_user@$target_host" \ + "test -d '$target_dir/$institution/data_unilateral'"; then + warn " $institution already exists on $target_host — skipping (delete manually to re-copy)" + return + fi + + # SSH into dl3 and scp from there to the target machine + # Note: dl3 must be able to reach the target via Tailscale IPs + sshpass -p "$DL3_PASS" ssh $SSH_OPTS "$DL3_USER@$DL3_HOST" \ + "sshpass -p '$target_pass' scp -r $SSH_OPTS '$DL3_DATA_ROOT/$institution' '$target_user@$target_host:$target_dir/'" + + # Verify the copy + if sshpass -p "$target_pass" ssh $SSH_OPTS "$target_user@$target_host" \ + "test -d '$target_dir/$institution/data_unilateral'"; then + ok " $institution copied successfully to $target_host" + else + err " Failed to copy $institution to $target_host" + exit 1 + fi +} + +# ── Copy MHA_1 → dl2 ───────────────────────────────────────────────────── +step "Distributing MHA_1 to dl2 ($DL2_HOST)" +copy_data_via_dl3 "MHA_1" "$DL2_HOST" "$DL2_USER" "$DL2_PASS" "$DL2_DATA_DIR" + +# ── Copy RUMC_1 → dl0 ──────────────────────────────────────────────────── +step "Distributing RUMC_1 to dl0 ($DL0_HOST)" +copy_data_via_dl3 "RUMC_1" "$DL0_HOST" "$DL0_USER" "$DL0_PASS" "$DL0_DATA_DIR" + +# ── Summary ─────────────────────────────────────────────────────────────── +step "Data Distribution Summary" +echo "" +echo " Cosmos 
(localhost): $COSMOS_DATA_DIR/{UMCU_1,UKA_1} [local, pre-existing]" +echo " dl0 ($DL0_HOST): $DL0_DATA_DIR/RUMC_1 [copied from dl3]" +echo " dl2 ($DL2_HOST): $DL2_DATA_DIR/MHA_1 [copied from dl3]" +echo " dl3 ($DL3_HOST): $DL3_DATA_ROOT/CAM_1 [source, pre-existing]" +echo "" + +if $DRY_RUN; then + warn "DRY RUN complete — no files were copied" +else + ok "Data distribution complete!" +fi diff --git a/scripts/deploy/run_deploy_test.sh b/scripts/deploy/run_deploy_test.sh new file mode 100755 index 00000000..d6b1dd90 --- /dev/null +++ b/scripts/deploy/run_deploy_test.sh @@ -0,0 +1,713 @@ +#!/usr/bin/env bash +# ============================================================================ +# run_deploy_test.sh — Orchestrate multi-site ODELIA deploy tests +# +# Runs all 6 ODELIA models through a full federated training + evaluation +# cycle across 4 physical machines connected via Tailscale VPN. +# +# For each model: +# 1. (Re)start the NVFlare server on Cosmos +# 2. Start clients on all 4 sites with the correct MODEL_NAME +# 3. Submit the corresponding training job +# 4. Poll for training completion (Server runner finished) +# 5. Stop all containers +# 6. Evaluate the final global model on UKA_1 (held-out test site) +# 7. Record pass/fail + metrics +# +# Usage: +# ./scripts/deploy/run_deploy_test.sh --all --conf deploy_sites_4node_test.conf +# ./scripts/deploy/run_deploy_test.sh --model MST --job ODELIA_ternary_classification --conf deploy_sites_4node_test.conf +# +# The Docker image and startup kits must be built BEFORE running this script: +# ./scripts/build/buildDockerImageAndStartupKits.sh -p application/provision/project_deploy_test_4site.yml +# ============================================================================ + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" + +# ── Colors ───────────────────────────────────────────────────────────────── +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +BOLD='\033[1m' +NC='\033[0m' + +info() { echo -e "${BLUE}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +err() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +step() { echo -e "\n${BOLD}=== $* ===${NC}"; } + +# ── Parse arguments ──────────────────────────────────────────────────────── +RUN_ALL=false +SINGLE_MODEL="" +SINGLE_JOB="" +CONF_FILE="" +SKIP_BUILD=false +TIMEOUT_MINUTES=240 # Per-model training timeout (4 hours) + +while [[ $# -gt 0 ]]; do + case $1 in + --all) RUN_ALL=true ;; + --model) SINGLE_MODEL="$2"; shift ;; + --job) SINGLE_JOB="$2"; shift ;; + --conf) CONF_FILE="$2"; shift ;; + --skip-build) SKIP_BUILD=true ;; + --timeout) TIMEOUT_MINUTES="$2"; shift ;; + -h|--help) + echo "Usage: $0 [--all | --model NAME --job JOB_DIR] --conf CONF_FILE [--skip-build] [--timeout MINUTES]" + exit 0 + ;; + *) + err "Unknown argument: $1" + exit 1 + ;; + esac + shift +done + +# Resolve conf file path +if [[ -z "$CONF_FILE" ]]; then + # Default to deploy_sites_4node_test.conf in repo root + CONF_FILE="$REPO_ROOT/deploy_sites_4node_test.conf" +fi +if [[ ! 
-f "$CONF_FILE" ]]; then + # Try relative to repo root + if [[ -f "$REPO_ROOT/$CONF_FILE" ]]; then + CONF_FILE="$REPO_ROOT/$CONF_FILE" + else + err "Configuration file not found: $CONF_FILE" + exit 1 + fi +fi + +# Validate arguments +if [[ "$RUN_ALL" == false && -z "$SINGLE_MODEL" ]]; then + err "Must specify --all or --model NAME --job JOB_DIR" + exit 1 +fi + +# ── Load configuration ───────────────────────────────────────────────────── +# shellcheck source=/dev/null +source "$CONF_FILE" + +VERSION=$("$REPO_ROOT/scripts/build/getVersionNumber.sh") +DOCKER_IMAGE="jefftud/odelia:$VERSION" + +PROJECT_NAME=$(grep "^name: " "$REPO_ROOT/$PROJECT_FILE" \ + | sed 's/^name: //' \ + | sed "s/__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__/$VERSION/") +WORKSPACE_DIR="$REPO_ROOT/workspace/$PROJECT_NAME" + +SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR" + +# Results directory +RESULTS_DIR="$REPO_ROOT/workspace/deploy_test_results" +mkdir -p "$RESULTS_DIR" + +# ── All 6 ODELIA models ─────────────────────────────────────────────────── +# Format: JOB_DIR:MODEL_NAME +ALL_MODELS=( + "ODELIA_ternary_classification:MST" + "challenge_1DivideAndConquer:1DivideAndConquer" + "challenge_2BCN_AIM:2BCN_AIM" + "challenge_3agaldran:3agaldran" + "challenge_4abmil:4LME_ABMIL" + "challenge_5pimed:5Pimed" +) + +# ── Evaluation configuration ────────────────────────────────────────────── +# UKA_1 is the held-out test site — data is on Cosmos +EVAL_SITE_NAME="UKA_1" +EVAL_DATA_DIR="/mnt/sda1/ODELIA_Challenge_unilateral" +EVAL_SCRATCH_DIR="/mnt/scratch/deploy_test_eval" + +# ── Helper functions (same pattern as deploy_and_test.sh) ────────────────── + +site_var() { + local site=$1 var=$2 + local full_var="${site}_${var}" + echo "${!full_var}" +} + +remote_exec() { + local site=$1; shift + local host user pass + host=$(site_var "$site" HOST) + user=$(site_var "$site" USER) + pass=$(site_var "$site" PASS) + + if [[ "$host" == "localhost" ]]; 
then + # Local execution (for Cosmos) + eval "$@" + else + sshpass -p "$pass" ssh $SSH_OPTS "$user@$host" "$@" + fi +} + +remote_copy() { + local site=$1 src=$2 dst=$3 + local host user pass + host=$(site_var "$site" HOST) + user=$(site_var "$site" USER) + pass=$(site_var "$site" PASS) + + if [[ "$host" == "localhost" ]]; then + cp "$src" "$dst" + else + sshpass -p "$pass" scp $SSH_OPTS "$src" "$user@$host:$dst" + fi +} + +find_latest_prod() { + if [[ ! -d "$WORKSPACE_DIR" ]]; then + err "Workspace not found: $WORKSPACE_DIR" + err "Run buildDockerImageAndStartupKits.sh first." + exit 1 + fi + ls -d "$WORKSPACE_DIR"/prod_* 2>/dev/null | sort -V | tail -n 1 +} + +# ── Stop all containers ─────────────────────────────────────────────────── +stop_all() { + info "Stopping all NVFlare containers..." + + # Stop local containers + local local_containers + local_containers=$(docker ps --format '{{.Names}}' | grep -E "odelia_swarm|nvflare" || true) + if [[ -n "$local_containers" ]]; then + echo "$local_containers" | xargs docker kill 2>/dev/null || true + echo "$local_containers" | xargs docker rm -f 2>/dev/null || true + fi + + # Stop remote containers + for site in "${SITES[@]}"; do + local host + host=$(site_var "$site" HOST) + if [[ "$host" == "localhost" ]]; then + continue # Already handled above + fi + remote_exec "$site" \ + "docker ps --format '{{.Names}}' | grep -E 'odelia_swarm|nvflare' | xargs -r docker kill 2>/dev/null; \ + docker ps -a --format '{{.Names}}' | grep -E 'odelia_swarm|nvflare' | xargs -r docker rm -f 2>/dev/null" \ + 2>/dev/null || warn " Could not stop containers on $host" + done + + # Wait for containers to fully stop + sleep 5 + ok "All containers stopped" +} + +# ── Deploy startup kits ─────────────────────────────────────────────────── +deploy_kits() { + local prod_dir + prod_dir=$(find_latest_prod) + info "Deploying startup kits from: $prod_dir" + + for site in "${SITES[@]}"; do + local site_name host deploy_dir + site_name=$(site_var "$site" 
SITE_NAME) + host=$(site_var "$site" HOST) + deploy_dir=$(site_var "$site" DEPLOY_DIR) + + local zip_file="$prod_dir/${site_name}_${VERSION}.zip" + if [[ ! -f "$zip_file" ]]; then + # Some site names in the startup kit may differ — try the zip + zip_file=$(ls "$prod_dir"/${site_name}*.zip 2>/dev/null | head -1 || true) + if [[ -z "$zip_file" ]]; then + err "Startup kit not found for $site_name in $prod_dir" + exit 1 + fi + fi + + if [[ "$host" == "localhost" ]]; then + mkdir -p "$deploy_dir" + cp "$zip_file" "$deploy_dir/" + cd "$deploy_dir" && rm -rf "${site_name}" && unzip -qo "$(basename "$zip_file")" + cd "$REPO_ROOT" + else + remote_exec "$site" "mkdir -p '$deploy_dir'" + remote_copy "$site" "$zip_file" "$deploy_dir/" + remote_exec "$site" "cd '$deploy_dir' && rm -rf '${site_name}' && unzip -qo '$(basename "$zip_file")'" + fi + ok " Deployed $site_name to $host:$deploy_dir/" + done + + # Also deploy the server and admin kits locally + local server_name="${SERVER_NAME:-dl3.tud.de}" + local server_zip="$prod_dir/${server_name}_${VERSION}.zip" + if [[ ! -f "$server_zip" ]]; then + server_zip=$(ls "$prod_dir"/${server_name}*.zip 2>/dev/null | head -1 || true) + fi + if [[ -n "$server_zip" && -f "$server_zip" ]]; then + local deploy_base + deploy_base=$(site_var "$(echo "${SITES[0]}")" DEPLOY_DIR) + mkdir -p "$deploy_base" + cp "$server_zip" "$deploy_base/" + cd "$deploy_base" && rm -rf "$server_name" && unzip -qo "$(basename "$server_zip")" + cd "$REPO_ROOT" + ok " Deployed server kit ($server_name) locally" + fi + + local admin_zip="$prod_dir/${ADMIN_USER}_${VERSION}.zip" + if [[ ! 
-f "$admin_zip" ]]; then + admin_zip=$(ls "$prod_dir"/${ADMIN_USER}*.zip 2>/dev/null | head -1 || true) + fi + if [[ -n "$admin_zip" && -f "$admin_zip" ]]; then + local deploy_base + deploy_base=$(site_var "$(echo "${SITES[0]}")" DEPLOY_DIR) + cp "$admin_zip" "$deploy_base/" + cd "$deploy_base" && rm -rf "$ADMIN_USER" && unzip -qo "$(basename "$admin_zip")" + cd "$REPO_ROOT" + ok " Deployed admin kit ($ADMIN_USER) locally" + fi +} + +# ── Start server ────────────────────────────────────────────────────────── +start_server() { + local prod_dir deploy_base server_name server_startup + prod_dir=$(find_latest_prod) + deploy_base=$(site_var "$(echo "${SITES[0]}")" DEPLOY_DIR) + server_name="${SERVER_NAME:-dl3.tud.de}" + server_startup="$deploy_base/$server_name/startup" + + if [[ ! -d "$server_startup" ]]; then + # Fall back to prod_dir + server_startup="$prod_dir/$server_name/startup" + fi + + if [[ ! -d "$server_startup" ]]; then + err "Server startup kit not found at $server_startup" + exit 1 + fi + + info "Starting server from: $server_startup" + cd "$server_startup" + ./docker.sh --no_pull --start_server + cd "$REPO_ROOT" + + info "Waiting 15s for server to initialize..." 
+ sleep 15 + + if docker ps --format '{{.Names}}' | grep -qE "odelia_swarm|nvflare"; then + ok "Server container is running" + else + warn "Server container not detected — it may still be starting" + fi +} + +# ── Start clients ───────────────────────────────────────────────────────── +start_clients() { + local model_name="${1:-}" + local model_flag="" + if [[ -n "$model_name" ]]; then + model_flag="--model_name '$model_name'" + info "Starting clients with MODEL_NAME=$model_name" + fi + + for site in "${SITES[@]}"; do + local site_name host deploy_dir datadir scratchdir gpu + site_name=$(site_var "$site" SITE_NAME) + host=$(site_var "$site" HOST) + deploy_dir=$(site_var "$site" DEPLOY_DIR) + datadir=$(site_var "$site" DATADIR) + scratchdir=$(site_var "$site" SCRATCHDIR) + gpu=$(site_var "$site" GPU) + + info "Starting client: $site_name @ $host" + + if [[ "$host" == "localhost" ]]; then + cd "$deploy_dir/$site_name/startup" + export SITE_NAME="$site_name" + export DATADIR="$datadir" + export SCRATCHDIR="$scratchdir" + eval "./docker.sh --data_dir '$datadir' --scratch_dir '$scratchdir' --GPU '$gpu' $model_flag --start_client" + cd "$REPO_ROOT" + else + remote_exec "$site" \ + "cd '$deploy_dir/$site_name/startup' && \ + export SITE_NAME='$site_name' && \ + export DATADIR='$datadir' && \ + export SCRATCHDIR='$scratchdir' && \ + ./docker.sh --data_dir '$datadir' --scratch_dir '$scratchdir' --GPU '$gpu' $model_flag --start_client" + fi + + ok " Client started: $site_name" + done + + ok "All clients started" +} + +# ── Submit job ──────────────────────────────────────────────────────────── +submit_job() { + local job_name="$1" + local deploy_base + deploy_base=$(site_var "$(echo "${SITES[0]}")" DEPLOY_DIR) + local admin_startup="$deploy_base/$ADMIN_USER/startup" + + if [[ ! -d "$admin_startup" ]]; then + local prod_dir + prod_dir=$(find_latest_prod) + admin_startup="$prod_dir/$ADMIN_USER/startup" + fi + + if [[ ! 
-d "$admin_startup" ]]; then + err "Admin startup kit not found" + exit 1 + fi + + local job_path="MediSwarm/application/jobs/$job_name" + info "Submitting job: $job_name (path: $job_path)" + + # Generate expect script + local expect_script + expect_script=$(mktemp /tmp/mediswarm_deploy_test_XXXXXX.exp) + cat > "$expect_script" < " +send "submit_job $job_path\r" +expect "> " +send "list_jobs\r" +expect "> " +send "bye\r" +expect eof +EXPECT_EOF + chmod +x "$expect_script" + + cd "$admin_startup" + expect -f "$expect_script" || true + cd "$REPO_ROOT" + + rm -f "$expect_script" + ok "Job submitted: $job_name" +} + +# ── Wait for training completion ────────────────────────────────────────── +wait_for_completion() { + local model_name="$1" + local timeout_minutes="${2:-$TIMEOUT_MINUTES}" + local deploy_base + deploy_base=$(site_var "$(echo "${SITES[0]}")" DEPLOY_DIR) + local server_name="${SERVER_NAME:-dl3.tud.de}" + local server_log="$deploy_base/$server_name/startup/nohup.out" + + if [[ ! -f "$server_log" ]]; then + # Fall back to prod dir + local prod_dir + prod_dir=$(find_latest_prod) + server_log="$prod_dir/$server_name/startup/nohup.out" + fi + + local max_attempts=$(( timeout_minutes * 2 )) # Check every 30 seconds + local attempt=0 + + info "Waiting for training to complete: $model_name (timeout: ${timeout_minutes}min, checking every 30s)" + info "Server log: $server_log" + + while [[ $attempt -lt $max_attempts ]]; do + if [[ -f "$server_log" ]] && grep -q 'Server runner finished\.' "$server_log" 2>/dev/null; then + ok "Training completed for $model_name! (after $((attempt * 30))s)" + return 0 + fi + + # Also check if the server container died + if ! docker ps --format '{{.Names}}' | grep -qE "odelia_swarm|nvflare"; then + # Container is gone — check if it completed + if [[ -f "$server_log" ]] && grep -q 'Server runner finished\.' 
"$server_log" 2>/dev/null; then + ok "Training completed for $model_name (container exited cleanly)" + return 0 + fi + warn "Server container is no longer running — training may have failed" + return 1 + fi + + attempt=$((attempt + 1)) + sleep 30 + done + + err "Timeout after ${timeout_minutes}min waiting for $model_name training to complete" + return 1 +} + +# ── Evaluate model ──────────────────────────────────────────────────────── +evaluate_model() { + local model_name="$1" + local job_name="$2" + + step "Evaluating $model_name on $EVAL_SITE_NAME" + + # Find the workspace with checkpoints + # After swarm training, checkpoints are in the client's scratch dir + # The NVFlare global model is saved as FL_global_model.pt and best_FL_global_model.pt + local prod_dir + prod_dir=$(find_latest_prod) + + mkdir -p "$EVAL_SCRATCH_DIR" + + info "Running predict.py with --workspace on $prod_dir" + info " MODEL_NAME=$model_name" + info " SITE_NAME=$EVAL_SITE_NAME" + info " DATA_DIR=$EVAL_DATA_DIR" + + # Run evaluation inside Docker container using the same image + local eval_output_dir="$RESULTS_DIR/${model_name}_evaluation" + mkdir -p "$eval_output_dir" + + # Use predict.py with the workspace to discover checkpoints + # The predict.py script runs natively (not in Docker) since it imports from the repo + export DATA_DIR="$EVAL_DATA_DIR" + export SITE_NAME="$EVAL_SITE_NAME" + export SCRATCH_DIR="$EVAL_SCRATCH_DIR" + export MODEL_NAME="$model_name" + + local eval_result=0 + python3 "$REPO_ROOT/scripts/evaluation/predict.py" \ + --workspace "$prod_dir" \ + --model-name "$model_name" \ + --output-dir "$eval_output_dir" \ + --best-only \ + --split test \ + 2>&1 | tee "$eval_output_dir/predict_stdout.log" || eval_result=$? 
+ + if [[ $eval_result -eq 0 ]]; then + ok "Evaluation completed for $model_name" + return 0 + else + err "Evaluation failed for $model_name (exit code: $eval_result)" + return 1 + fi +} + +# ── Record result ───────────────────────────────────────────────────────── +record_result() { + local model_name="$1" + local job_name="$2" + local train_status="$3" # pass or fail + local eval_status="$4" # pass, fail, or skipped + local duration_seconds="$5" + + local result_file="$RESULTS_DIR/deploy_test_${model_name}.json" + local timestamp + timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + + cat > "$result_file" < /dev/null # summary prints go to stdout + + echo "" + echo "────────────────────────────────────────" + echo -e "Total: $total | ${GREEN}Passed: $passed${NC} | ${RED}Failed: $failed${NC}" + echo "" + + # Write machine-readable summary + cat > "$summary_file" </dev/null; then + err "Missing required tool: $cmd" + exit 1 + fi +done + +# Verify workspace exists +if [[ ! -d "$WORKSPACE_DIR" ]]; then + err "Workspace not found: $WORKSPACE_DIR" + err "Build startup kits first:" + err " ./scripts/build/buildDockerImageAndStartupKits.sh -p $PROJECT_FILE" + exit 1 +fi + +# Deploy startup kits to all sites (once — shared across all models) +step "Deploying startup kits to all sites" +deploy_kits + +if [[ "$RUN_ALL" == true ]]; then + # Run all 6 models sequentially + step "Running deploy test for all 6 ODELIA models" + echo "" + + for model_spec in "${ALL_MODELS[@]}"; do + IFS=':' read -r job_name model_name <<< "$model_spec" + run_single_model "$job_name" "$model_name" + done + + # Generate summary + generate_summary +else + # Run a single model + if [[ -z "$SINGLE_JOB" ]]; then + err "Must specify --job JOB_DIR when using --model" + exit 1 + fi + run_single_model "$SINGLE_JOB" "$SINGLE_MODEL" +fi