Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,7 @@ tests/results/
challenge_model_test/

runs
*.bak
*.bak

# Duke benchmark output
duke_results/
28 changes: 18 additions & 10 deletions deploy_and_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,11 @@ PROJECT_NAME=$(grep "^name: " "$SCRIPT_DIR/$PROJECT_FILE" \
| sed "s/__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__/$VERSION/")
WORKSPACE_DIR="$SCRIPT_DIR/workspace/$PROJECT_NAME"

# All sites to deploy to (add more here if needed)
SITES=(MHA RSH)
# All sites to deploy to — configured in deploy_sites.conf via SITES=()
# Falls back to (MHA RSH) if deploy_sites.conf doesn't define SITES.
if [[ -z "${SITES+x}" || ${#SITES[@]} -eq 0 ]]; then
SITES=(MHA RSH)
fi

# SSH options for sshpass
SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"
Expand Down Expand Up @@ -216,7 +219,8 @@ cmd_start_server() {

local prod_dir
prod_dir=$(find_latest_prod)
local server_startup="$prod_dir/dl3.tud.de/startup"
local server_name="${SERVER_NAME:-dl3.tud.de}"
local server_startup="$prod_dir/$server_name/startup"

if [[ ! -d "$server_startup" ]]; then
err "Server startup kit not found: $server_startup"
Expand All @@ -232,7 +236,7 @@ cmd_start_server() {
sleep 10

# Verify
if docker ps --format '{{.Names}}' | grep -q "odelia_swarm_server"; then
if docker ps --format '{{.Names}}' | grep -qE "odelia_swarm|nvflare"; then
ok "Server container is running"
else
warn "Server container not detected — it may still be starting"
Expand Down Expand Up @@ -320,7 +324,7 @@ cmd_status() {

echo ""
info "Local containers:"
docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' | grep -E "odelia|NAMES" || echo " (none)"
docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' | grep -E "odelia|stamp|nvflare|NAMES" || echo " (none)"

check_dependencies

Expand All @@ -332,7 +336,7 @@ cmd_status() {
echo ""
info "$site ($site_name @ $host):"
remote_exec "$site" \
"docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null | grep -E 'odelia|NAMES' || echo ' (none)'" \
"docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null | grep -E 'odelia|stamp|nvflare|NAMES' || echo ' (none)'" \
2>/dev/null || warn " Could not connect to $host"
done
}
Expand All @@ -341,7 +345,7 @@ cmd_logs() {
local target="${1:-}"
if [[ -z "$target" ]]; then
err "Usage: ./deploy_and_test.sh logs <site>"
echo " Sites: MHA, RSH, server"
echo " Sites: ${SITES[*]}, server"
exit 1
fi

Expand All @@ -350,7 +354,8 @@ cmd_logs() {
if [[ "$target" == "SERVER" ]]; then
local prod_dir
prod_dir=$(find_latest_prod)
local log_file="$prod_dir/dl3.tud.de/startup/nohup.out"
local server_name="${SERVER_NAME:-dl3.tud.de}"
local log_file="$prod_dir/$server_name/startup/nohup.out"
if [[ -f "$log_file" ]]; then
step "Server logs (last 50 lines)"
tail -50 "$log_file"
Expand Down Expand Up @@ -393,7 +398,7 @@ cmd_stop() {
info "Stopping local containers..."
# Kill all odelia containers locally
local local_containers
local_containers=$(docker ps --format '{{.Names}}' | grep "odelia_swarm" || true)
local_containers=$(docker ps --format '{{.Names}}' | grep -E "odelia_swarm|stamp|nvflare" || true)
if [[ -n "$local_containers" ]]; then
echo "$local_containers" | xargs docker kill 2>/dev/null || true
ok "Stopped local containers"
Expand All @@ -411,7 +416,7 @@ cmd_stop() {
echo ""
info "Stopping containers on $site ($host)..."
remote_exec "$site" \
"docker ps --format '{{.Names}}' | grep 'odelia_swarm' | xargs -r docker kill 2>/dev/null || true" \
"docker ps --format '{{.Names}}' | grep -E 'odelia_swarm|stamp|nvflare' | xargs -r docker kill 2>/dev/null || true" \
2>/dev/null || warn " Could not connect to $host"
ok " Stopped containers on $site"
done
Expand Down Expand Up @@ -466,6 +471,9 @@ usage() {
echo " $0 submit challenge_3agaldran # Submit a different job"
echo " $0 logs MHA # Check MHA logs"
echo " $0 stop # Kill everything"
echo ""
echo "Sites are configured in deploy_sites.conf via SITES=(SITE1 SITE2 ...)."
echo "Server name is configured via SERVER_NAME=dl3.tud.de (default)."
}

COMMAND="${1:-}"
Expand Down
75 changes: 75 additions & 0 deletions deploy_sites.conf.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# MediSwarm Deployment Site Configuration
# Copy this file to deploy_sites.conf and fill in your credentials.
# deploy_sites.conf is in .gitignore — it should NEVER be committed.
#
# This file is sourced by deploy_and_test.sh, so it must remain valid
# bash syntax (no spaces around '=', arrays via NAME=(...)).
#
# Usage:
# cp deploy_sites.conf.example deploy_sites.conf
# vim deploy_sites.conf # fill in passwords and paths
# ./deploy_and_test.sh all

# ── Sites to deploy to ────────────────────────────────────────────
# List the short names of all client sites. Each name must have
# corresponding <NAME>_HOST, <NAME>_USER, etc. variables below.
# If SITES is left undefined, deploy_and_test.sh falls back to (MHA RSH).
SITES=(MHA RSH)

# ── Server ─────────────────────────────────────────────────────────
# The FQDN of the NVFlare server (must match the provision YAML).
SERVER_NAME=dl3.tud.de

# ── Defaults ───────────────────────────────────────────────────────
# PROJECT_FILE: provision YAML used to generate startup kits.
# DEFAULT_JOB:  job submitted when no job name is given on the CLI.
# ADMIN_USER:   NVFlare admin identity used to submit jobs.
PROJECT_FILE=application/provision/project_Challenge_test.yml
DEFAULT_JOB=challenge_1DivideAndConquer
ADMIN_USER=jiefu.zhu@tu-dresden.de

# ── Site: MHA ──────────────────────────────────────────────────────
# Per-site variables:
#   <NAME>_HOST        SSH host (IP or FQDN) of the site machine
#   <NAME>_USER/_PASS  SSH credentials (used via sshpass)
#   <NAME>_SITE_NAME   NVFlare client name — must match the provision YAML
#   <NAME>_DATADIR     dataset root on the remote machine
#   <NAME>_SCRATCHDIR  writable scratch/tmp directory on the remote machine
#   <NAME>_DEPLOY_DIR  where the startup kit is copied on the remote machine
#   <NAME>_GPU         GPU selector — presumably passed to docker --gpus;
#                      verify against deploy_and_test.sh before changing format
MHA_HOST=172.24.4.91
MHA_USER=odelia
MHA_PASS='CHANGEME'
MHA_SITE_NAME=MHA_1
MHA_DATADIR=/home/odelia/MediSwarm/data
MHA_SCRATCHDIR=/home/odelia/MediSwarm/data/MHA_1/tmp
MHA_DEPLOY_DIR=/home/odelia/Odelia
MHA_GPU="device=0"

# ── Site: RSH ──────────────────────────────────────────────────────
# NOTE(review): RSH_DATADIR/RSH_SCRATCHDIR carry a trailing slash while the
# MHA paths do not — confirm the consuming script tolerates both forms.
RSH_HOST=172.24.4.71
RSH_USER=asoro
RSH_PASS='CHANGEME'
RSH_SITE_NAME=RSH_1
RSH_DATADIR=/home/asoro/odelia/RSH/
RSH_SCRATCHDIR=/home/asoro/odelia/RSH/scratch/
RSH_DEPLOY_DIR=/home/asoro/Odelia
RSH_GPU="device=0"

# ── Site: DL0 (Duke Benchmark) ────────────────────────────────────
# Uncomment and configure for Duke dataset benchmarks on dl0/dl2/dl3.
# Add DL0 DL2 DL3 to SITES=() above when using these.
#
# DL0_HOST=dl0.tud.de
# DL0_USER=swarm
# DL0_PASS='CHANGEME'
# DL0_SITE_NAME=TUD_1
# DL0_DATADIR=/data/duke/TUD_1
# DL0_SCRATCHDIR=/scratch/duke/TUD_1
# DL0_DEPLOY_DIR=/home/swarm/MediSwarm
# DL0_GPU="device=0"

# ── Site: DL2 (Duke Benchmark) ────────────────────────────────────
# DL2_HOST=dl2.tud.de
# DL2_USER=swarm
# DL2_PASS='CHANGEME'
# DL2_SITE_NAME=TUD_2
# DL2_DATADIR=/data/duke/TUD_2
# DL2_SCRATCHDIR=/scratch/duke/TUD_2
# DL2_DEPLOY_DIR=/home/swarm/MediSwarm
# DL2_GPU="device=0"

# ── Site: DL3 (Duke Benchmark — also runs server) ─────────────────
# DL3 runs the NVFlare server as well as a client, so its GPU selector
# defaults to a different device than the other sites.
# DL3_HOST=dl3.tud.de
# DL3_USER=swarm
# DL3_PASS='CHANGEME'
# DL3_SITE_NAME=TUD_3
# DL3_DATADIR=/data/duke/TUD_3
# DL3_SCRATCHDIR=/scratch/duke/TUD_3
# DL3_DEPLOY_DIR=/home/swarm/MediSwarm
# DL3_GPU="device=1"
119 changes: 119 additions & 0 deletions docs/DUKE_BENCHMARK_RESULTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Duke Breast MRI Benchmark Results

## Overview

This document records benchmark results for MediSwarm federated learning on the
[Duke Breast MRI](https://doi.org/10.7937/TCIA.e3sv-re93) dataset across the
TUD compute cluster (dl0, dl2, dl3).

The Duke dataset is a public collection of dynamic contrast-enhanced (DCE) MRI
sequences used for ternary classification of breast lesions:

| Class | Label | Description |
|-------|-------|-------------|
| 0 | Benign | Benign lesion |
| 1 | Malignant (non-PCR) | Malignant, no pathological complete response |
| 2 | Malignant (PCR) | Malignant, pathological complete response |

## Infrastructure

| Machine | Role | GPU | Dataset Partition |
|---------|------|-----|-------------------|
| dl3.tud.de | Server + Client (TUD_3) | NVIDIA A100 | ~33% of Duke |
| dl0.tud.de | Client (TUD_1) | NVIDIA A100 | ~33% of Duke |
| dl2.tud.de | Client (TUD_2) | NVIDIA A100 | ~33% of Duke |

**Provision file:** `application/provision/project_DUKE_test.yml`

## How to Reproduce

```bash
# 1. Configure deploy_sites.conf with DL0/DL2/DL3 credentials
# (see deploy_sites.conf.example for template)

# 2. Run the full benchmark pipeline
./scripts/evaluation/run_duke_benchmark.sh \
--project application/provision/project_DUKE_test.yml \
--job ODELIA_ternary_classification \
--local-epochs 10

# 3. Or run individual phases:
./scripts/evaluation/run_duke_benchmark.sh --skip-swarm # local only
./scripts/evaluation/run_duke_benchmark.sh --skip-local # swarm only
./scripts/evaluation/run_duke_benchmark.sh --collect-only # just gather results
```

## Results

> **Status:** Pending first run. Update this section after completing the benchmark.

### Swarm Training (Federated)

| Metric | Value |
|--------|-------|
| Aggregation | FedAvg (InTimeAccumulateWeightedAggregator) |
| Rounds | TBD |
| Clients | 3 (dl0, dl2, dl3) |
| Model | TBD |
| Best AUC-ROC (macro) | TBD |
| Best Accuracy | TBD |
| Training time | TBD |

### Local Training (Single-Site Benchmark)

| Model | ACC | AUC-ROC | F1 (macro) | Params | Time/epoch |
|-------|-----|---------|------------|--------|------------|
| ResNet10 | - | - | - | - | - |
| ResNet18 | - | - | - | - | - |
| ResNet34 | - | - | - | - | - |
| ResNet50 | - | - | - | - | - |
| MST | - | - | - | - | - |
| Swin3D | - | - | - | - | - |

### Swarm vs Local Comparison

| Training Mode | Best Model | AUC-ROC | ACC | Notes |
|---------------|-----------|---------|-----|-------|
| Local (single site) | - | - | - | - |
| Swarm (3 sites) | - | - | - | - |

## Analysis

> To be completed after benchmark run.

### Key Questions

1. **Does federated training improve over local?** Compare the swarm AUC-ROC
with that of the best single-site local model.
2. **Which model architecture works best?** Review `benchmark_results.json` for
the local comparison table.
3. **How does data heterogeneity affect convergence?** Check per-round metrics
across sites.
4. **Is FedProx beneficial?** If tested, compare FedAvg vs FedProx convergence
and final metrics.

## Files

Results from benchmark runs are saved under `duke_results/<timestamp>/`:

```
duke_results/
20250405_143000/
benchmark_config.json # Run configuration
swarm/
predictions/ # predict.py output
<server>/<job_id>/
app_TUD_1/
best_FL_global_model.pt
FL_global_model.pt
app_TUD_2/
...
local/
benchmark_results.json # benchmark_models.py output
benchmark_output.log
ResNet18/ # Per-model checkpoints
MST/
...
```

Note: `duke_results/` is in `.gitignore` — results are not committed to the repo.
45 changes: 45 additions & 0 deletions scripts/evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Scripts for evaluating and comparing MediSwarm model performance.
|--------|---------|
| `predict.py` | Run prediction on external test datasets using trained swarm models |
| `benchmark_models.py` | Benchmark all MediSwarm models on a consistent train/val/test split |
| `run_duke_benchmark.sh` | End-to-end Duke dataset benchmark: build, deploy, train, collect, evaluate |
| `plot_aurocs_from_classprob_csvs.py` | Compute and plot AUROCs from class probability CSV files produced during training |
| `parse_logs_and_plot.py` | Parse training logs and plot convergence curves (legacy) |

Expand Down Expand Up @@ -236,6 +237,50 @@ The script automatically verifies:

---

## `run_duke_benchmark.sh`

End-to-end benchmark pipeline for the Duke Breast MRI dataset on the TUD compute cluster (dl0, dl2, dl3). Orchestrates the full workflow: Docker build, push, deploy, swarm training, result collection, and local model benchmarking.

### Prerequisites

- `deploy_sites.conf` configured with DL0/DL2/DL3 entries (see `deploy_sites.conf.example`)
- `sshpass` and `expect` installed
- Duke dataset available on each site
- GPU available on each site

### Usage

```bash
# Full pipeline (build, deploy, train swarm, benchmark local):
./run_duke_benchmark.sh

# Swarm only (skip local benchmark):
./run_duke_benchmark.sh --skip-local

# Local benchmark only (skip swarm):
./run_duke_benchmark.sh --skip-swarm

# Collect results from a previous swarm run:
./run_duke_benchmark.sh --collect-only

# Custom models and epochs:
./run_duke_benchmark.sh --models MST ResNet18 Swin3D --local-epochs 10

# Dry run (print configuration only):
./run_duke_benchmark.sh --dry-run
```

### Output

Results are saved to `duke_results/<timestamp>/`:
- `benchmark_config.json` -- Run configuration for reproducibility
- `swarm/` -- Collected checkpoints and prediction CSVs from swarm training
- `local/` -- `benchmark_results.json` from `benchmark_models.py`

See `docs/DUKE_BENCHMARK_RESULTS.md` for the results template and analysis.

---

## `parse_logs_and_plot.py` (Legacy)

Parses training console logs to extract AUC-ROC values and plots convergence curves for swarm vs. local training across sites.
Expand Down
Loading
Loading