diff --git a/.gitignore b/.gitignore
index 57f880a5..3ed9cc73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -195,4 +195,7 @@ tests/results/
 challenge_model_test/
 
 runs
-*.bak
\ No newline at end of file
+*.bak
+
+# Duke benchmark output
+duke_results/
\ No newline at end of file
diff --git a/deploy_and_test.sh b/deploy_and_test.sh
index 796d2e91..dfb73b04 100755
--- a/deploy_and_test.sh
+++ b/deploy_and_test.sh
@@ -59,8 +59,11 @@ PROJECT_NAME=$(grep "^name: " "$SCRIPT_DIR/$PROJECT_FILE" \
     | sed "s/__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__/$VERSION/")
 WORKSPACE_DIR="$SCRIPT_DIR/workspace/$PROJECT_NAME"
 
-# All sites to deploy to (add more here if needed)
-SITES=(MHA RSH)
+# All sites to deploy to — configured in deploy_sites.conf via SITES=()
+# Falls back to (MHA RSH) if deploy_sites.conf doesn't define SITES.
+if [[ -z "${SITES+x}" || ${#SITES[@]} -eq 0 ]]; then
+    SITES=(MHA RSH)
+fi
 
 # SSH options for sshpass
 SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"
@@ -216,7 +219,8 @@ cmd_start_server() {
 
     local prod_dir
     prod_dir=$(find_latest_prod)
-    local server_startup="$prod_dir/dl3.tud.de/startup"
+    local server_name="${SERVER_NAME:-dl3.tud.de}"
+    local server_startup="$prod_dir/$server_name/startup"
 
     if [[ ! -d "$server_startup" ]]; then
         err "Server startup kit not found: $server_startup"
@@ -232,7 +236,7 @@ cmd_start_server() {
     sleep 10
 
     # Verify
-    if docker ps --format '{{.Names}}' | grep -q "odelia_swarm_server"; then
+    if docker ps --format '{{.Names}}' | grep -qE "odelia_swarm|nvflare"; then
         ok "Server container is running"
     else
         warn "Server container not detected — it may still be starting"
@@ -320,7 +324,7 @@ cmd_status() {
 
     echo ""
     info "Local containers:"
-    docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' | grep -E "odelia|NAMES" || echo "  (none)"
+    docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' | grep -E "odelia|stamp|nvflare|NAMES" || echo "  (none)"
 
     check_dependencies
 
@@ -332,7 +336,7 @@ cmd_status() {
         echo ""
         info "$site ($site_name @ $host):"
         remote_exec "$site" \
-            "docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null | grep -E 'odelia|NAMES' || echo '  (none)'" \
+            "docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null | grep -E 'odelia|stamp|nvflare|NAMES' || echo '  (none)'" \
             2>/dev/null || warn "  Could not connect to $host"
     done
 }
@@ -341,7 +345,7 @@ cmd_logs() {
     local target="${1:-}"
     if [[ -z "$target" ]]; then
         err "Usage: ./deploy_and_test.sh logs <site>"
-        echo "  Sites: MHA, RSH, server"
+        echo "  Sites: ${SITES[*]}, server"
         exit 1
     fi
 
@@ -350,7 +354,8 @@ cmd_logs() {
     if [[ "$target" == "SERVER" ]]; then
         local prod_dir
         prod_dir=$(find_latest_prod)
-        local log_file="$prod_dir/dl3.tud.de/startup/nohup.out"
+        local server_name="${SERVER_NAME:-dl3.tud.de}"
+        local log_file="$prod_dir/$server_name/startup/nohup.out"
         if [[ -f "$log_file" ]]; then
             step "Server logs (last 50 lines)"
             tail -50 "$log_file"
@@ -393,7 +398,7 @@ cmd_stop() {
     info "Stopping local containers..."
     # Kill all odelia containers locally
     local local_containers
-    local_containers=$(docker ps --format '{{.Names}}' | grep "odelia_swarm" || true)
+    local_containers=$(docker ps --format '{{.Names}}' | grep -E "odelia_swarm|stamp|nvflare" || true)
     if [[ -n "$local_containers" ]]; then
         echo "$local_containers" | xargs docker kill 2>/dev/null || true
         ok "Stopped local containers"
@@ -411,7 +416,7 @@ cmd_stop() {
         echo ""
         info "Stopping containers on $site ($host)..."
         remote_exec "$site" \
-            "docker ps --format '{{.Names}}' | grep 'odelia_swarm' | xargs -r docker kill 2>/dev/null || true" \
+            "docker ps --format '{{.Names}}' | grep -E 'odelia_swarm|stamp|nvflare' | xargs -r docker kill 2>/dev/null || true" \
             2>/dev/null || warn "  Could not connect to $host"
         ok "  Stopped containers on $site"
     done
@@ -466,6 +471,9 @@ usage() {
     echo "  $0 submit challenge_3agaldran          # Submit a different job"
     echo "  $0 logs MHA                            # Check MHA logs"
     echo "  $0 stop                                # Kill everything"
+    echo ""
+    echo "Sites are configured in deploy_sites.conf via SITES=(SITE1 SITE2 ...)."
+    echo "Server name is configured via SERVER_NAME=dl3.tud.de (default)."
 }
 
 COMMAND="${1:-}"
diff --git a/deploy_sites.conf.example b/deploy_sites.conf.example
new file mode 100644
index 00000000..b46e796a
--- /dev/null
+++ b/deploy_sites.conf.example
@@ -0,0 +1,75 @@
+# MediSwarm Deployment Site Configuration
+# Copy this file to deploy_sites.conf and fill in your credentials.
+# deploy_sites.conf is in .gitignore — it should NEVER be committed.
+#
+# Usage:
+#   cp deploy_sites.conf.example deploy_sites.conf
+#   vim deploy_sites.conf   # fill in passwords and paths
+#   ./deploy_and_test.sh all
+
+# ── Sites to deploy to ────────────────────────────────────────────
+# List the short names of all client sites. Each name must have
+# corresponding <NAME>_HOST, <NAME>_USER, etc. variables below.
+SITES=(MHA RSH)
+
+# ── Server ─────────────────────────────────────────────────────────
+# The FQDN of the NVFlare server (must match the provision YAML).
+SERVER_NAME=dl3.tud.de
+
+# ── Defaults ───────────────────────────────────────────────────────
+PROJECT_FILE=application/provision/project_Challenge_test.yml
+DEFAULT_JOB=challenge_1DivideAndConquer
+ADMIN_USER=jiefu.zhu@tu-dresden.de
+
+# ── Site: MHA ──────────────────────────────────────────────────────
+MHA_HOST=172.24.4.91
+MHA_USER=odelia
+MHA_PASS='CHANGEME'
+MHA_SITE_NAME=MHA_1
+MHA_DATADIR=/home/odelia/MediSwarm/data
+MHA_SCRATCHDIR=/home/odelia/MediSwarm/data/MHA_1/tmp
+MHA_DEPLOY_DIR=/home/odelia/Odelia
+MHA_GPU="device=0"
+
+# ── Site: RSH ──────────────────────────────────────────────────────
+RSH_HOST=172.24.4.71
+RSH_USER=asoro
+RSH_PASS='CHANGEME'
+RSH_SITE_NAME=RSH_1
+RSH_DATADIR=/home/asoro/odelia/RSH/
+RSH_SCRATCHDIR=/home/asoro/odelia/RSH/scratch/
+RSH_DEPLOY_DIR=/home/asoro/Odelia
+RSH_GPU="device=0"
+
+# ── Site: DL0 (Duke Benchmark) ────────────────────────────────────
+# Uncomment and configure for Duke dataset benchmarks on dl0/dl2/dl3.
+# Add DL0 DL2 DL3 to SITES=() above when using these.
+#
+# DL0_HOST=dl0.tud.de
+# DL0_USER=swarm
+# DL0_PASS='CHANGEME'
+# DL0_SITE_NAME=TUD_1
+# DL0_DATADIR=/data/duke/TUD_1
+# DL0_SCRATCHDIR=/scratch/duke/TUD_1
+# DL0_DEPLOY_DIR=/home/swarm/MediSwarm
+# DL0_GPU="device=0"
+
+# ── Site: DL2 (Duke Benchmark) ────────────────────────────────────
+# DL2_HOST=dl2.tud.de
+# DL2_USER=swarm
+# DL2_PASS='CHANGEME'
+# DL2_SITE_NAME=TUD_2
+# DL2_DATADIR=/data/duke/TUD_2
+# DL2_SCRATCHDIR=/scratch/duke/TUD_2
+# DL2_DEPLOY_DIR=/home/swarm/MediSwarm
+# DL2_GPU="device=0"
+
+# ── Site: DL3 (Duke Benchmark — also runs server) ─────────────────
+# DL3_HOST=dl3.tud.de
+# DL3_USER=swarm
+# DL3_PASS='CHANGEME'
+# DL3_SITE_NAME=TUD_3
+# DL3_DATADIR=/data/duke/TUD_3
+# DL3_SCRATCHDIR=/scratch/duke/TUD_3
+# DL3_DEPLOY_DIR=/home/swarm/MediSwarm
+# DL3_GPU="device=1"
diff --git a/docs/DUKE_BENCHMARK_RESULTS.md b/docs/DUKE_BENCHMARK_RESULTS.md
new file mode 100644
index 00000000..5f024476
--- /dev/null
+++ b/docs/DUKE_BENCHMARK_RESULTS.md
@@ -0,0 +1,119 @@
+# Duke Breast MRI Benchmark Results
+
+## Overview
+
+This document records benchmark results for MediSwarm federated learning on the
+[Duke Breast MRI](https://doi.org/10.7937/TCIA.e3sv-re93) dataset across the
+TUD compute cluster (dl0, dl2, dl3).
+
+The Duke dataset is a public collection of dynamic contrast-enhanced (DCE) MRI
+sequences used for ternary classification of breast lesions:
+
+| Class | Label | Description |
+|-------|-------|-------------|
+| 0 | Benign | Benign lesion |
+| 1 | Malignant (non-PCR) | Malignant, no pathological complete response |
+| 2 | Malignant (PCR) | Malignant, pathological complete response |
+
+## Infrastructure
+
+| Machine | Role | GPU | Dataset Partition |
+|---------|------|-----|-------------------|
+| dl3.tud.de | Server + Client (TUD_3) | NVIDIA A100 | ~33% of Duke |
+| dl0.tud.de | Client (TUD_1) | NVIDIA A100 | ~33% of Duke |
+| dl2.tud.de | Client (TUD_2) | NVIDIA A100 | ~33% of Duke |
+
+**Provision file:** `application/provision/project_DUKE_test.yml`
+
+## How to Reproduce
+
+```bash
+# 1. Configure deploy_sites.conf with DL0/DL2/DL3 credentials
+#    (see deploy_sites.conf.example for template)
+
+# 2. Run the full benchmark pipeline
+./scripts/evaluation/run_duke_benchmark.sh \
+    --project application/provision/project_DUKE_test.yml \
+    --job ODELIA_ternary_classification \
+    --local-epochs 10
+
+# 3. Or run individual phases:
+./scripts/evaluation/run_duke_benchmark.sh --skip-build --skip-deploy --skip-swarm   # local only
+./scripts/evaluation/run_duke_benchmark.sh --skip-local    # swarm only
+./scripts/evaluation/run_duke_benchmark.sh --collect-only   # just gather results
+```
+
+## Results
+
+> **Status:** Pending first run. Update this section after completing the benchmark.
+
+### Swarm Training (Federated)
+
+| Metric | Value |
+|--------|-------|
+| Aggregation | FedAvg (InTimeAccumulateWeightedAggregator) |
+| Rounds | TBD |
+| Clients | 3 (dl0, dl2, dl3) |
+| Model | TBD |
+| Best AUC-ROC (macro) | TBD |
+| Best Accuracy | TBD |
+| Training time | TBD |
+
+### Local Training (Single-Site Benchmark)
+
+| Model | ACC | AUC-ROC | F1 (macro) | Params | Time/epoch |
+|-------|-----|---------|------------|--------|------------|
+| ResNet10 | - | - | - | - | - |
+| ResNet18 | - | - | - | - | - |
+| ResNet34 | - | - | - | - | - |
+| ResNet50 | - | - | - | - | - |
+| MST | - | - | - | - | - |
+| Swin3D | - | - | - | - | - |
+
+### Swarm vs Local Comparison
+
+| Training Mode | Best Model | AUC-ROC | ACC | Notes |
+|---------------|-----------|---------|-----|-------|
+| Local (single site) | - | - | - | - |
+| Swarm (3 sites) | - | - | - | - |
+
+## Analysis
+
+> To be completed after benchmark run.
+
+### Key Questions
+
+1. **Does federated training improve over local?** Compare swarm AUC-ROC with
+   the best single-site local model.
+2. **Which model architecture works best?** Review `benchmark_results.json` for
+   the local comparison table.
+3. **How does data heterogeneity affect convergence?** Check per-round metrics
+   across sites.
+4. **Is FedProx beneficial?** If tested, compare FedAvg vs FedProx convergence
+   and final metrics.
+
+## Files
+
+Results from benchmark runs are saved under `duke_results/<timestamp>/`:
+
+```
+duke_results/
+  20250405_143000/
+    benchmark_config.json       # Run configuration
+    swarm/
+      predictions/              # predict.py output
+      <server>/<job_id>/
+        app_TUD_1/
+          best_FL_global_model.pt
+          FL_global_model.pt
+        app_TUD_2/
+          ...
+    local/
+      benchmark_results.json    # benchmark_models.py output
+      benchmark_output.log
+      ResNet18/                 # Per-model checkpoints
+      MST/
+      ...
+```
+
+Note: `duke_results/` is in `.gitignore` — results are not committed to the repo.
diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md
index d42f7f5c..b924b29b 100644
--- a/scripts/evaluation/README.md
+++ b/scripts/evaluation/README.md
@@ -8,6 +8,7 @@ Scripts for evaluating and comparing MediSwarm model performance.
 |--------|---------|
 | `predict.py` | Run prediction on external test datasets using trained swarm models |
 | `benchmark_models.py` | Benchmark all MediSwarm models on a consistent train/val/test split |
+| `run_duke_benchmark.sh` | End-to-end Duke dataset benchmark: build, deploy, train, collect, evaluate |
 | `plot_aurocs_from_classprob_csvs.py` | Compute and plot AUROCs from class probability CSV files produced during training |
 | `parse_logs_and_plot.py` | Parse training logs and plot convergence curves (legacy) |
 
@@ -236,6 +237,50 @@ The script automatically verifies:
 
 ---
 
+## `run_duke_benchmark.sh`
+
+End-to-end benchmark pipeline for the Duke Breast MRI dataset on the TUD compute cluster (dl0, dl2, dl3). Orchestrates the full workflow: Docker build, push, deploy, swarm training, result collection, and local model benchmarking.
+
+### Prerequisites
+
+- `deploy_sites.conf` configured with DL0/DL2/DL3 entries (see `deploy_sites.conf.example`)
+- `sshpass` and `expect` installed
+- Duke dataset available on each site
+- GPU available on each site
+
+### Usage
+
+```bash
+# Full pipeline (build, deploy, train swarm, benchmark local):
+./run_duke_benchmark.sh
+
+# Swarm only (skip local benchmark):
+./run_duke_benchmark.sh --skip-local
+
+# Skip swarm training and collection (build/deploy still run; add --skip-build --skip-deploy for local only):
+./run_duke_benchmark.sh --skip-swarm
+
+# Collect results from a previous swarm run:
+./run_duke_benchmark.sh --collect-only
+
+# Custom models and epochs:
+./run_duke_benchmark.sh --models MST ResNet18 Swin3D --local-epochs 10
+
+# Dry run (print configuration only):
+./run_duke_benchmark.sh --dry-run
+```
+
+### Output
+
+Results are saved to `duke_results/<timestamp>/`:
+- `benchmark_config.json` -- Run configuration for reproducibility
+- `swarm/` -- Collected checkpoints and prediction CSVs from swarm training
+- `local/` -- `benchmark_results.json` from `benchmark_models.py`
+
+See `docs/DUKE_BENCHMARK_RESULTS.md` for the results template and analysis.
+
+---
+
 ## `parse_logs_and_plot.py` (Legacy)
 
 Parses training console logs to extract AUC-ROC values and plots convergence curves for swarm vs. local training across sites.
diff --git a/scripts/evaluation/run_duke_benchmark.sh b/scripts/evaluation/run_duke_benchmark.sh
new file mode 100755
index 00000000..9cdeeede
--- /dev/null
+++ b/scripts/evaluation/run_duke_benchmark.sh
@@ -0,0 +1,298 @@
+#!/usr/bin/env bash
+# ============================================================================
+# run_duke_benchmark.sh — End-to-end Duke Breast MRI benchmark on dl0/dl2/dl3
+#
+# Orchestrates: build -> push -> deploy -> train (swarm) -> collect results ->
+#               run benchmark_models.py -> summarise
+#
+# Usage:
+#   ./run_duke_benchmark.sh [options]
+#
+# Options:
+#   --project FILE     Provision YAML, relative to the repo root (default: application/provision/project_DUKE_test.yml)
+#   --job JOB          Job to submit (default: ODELIA_ternary_classification)
+#   --models M1 M2..   Models for local benchmark (default: all built-in)
+#   --local-epochs N   Epochs for local benchmark_models.py (default: 5)
+#   --skip-build       Skip Docker build (use existing image)
+#   --skip-deploy      Skip deployment (already deployed)
+#   --skip-swarm       Skip swarm training and result collection (build/deploy still run unless skipped too)
+#   --skip-local       Skip local benchmark (only run swarm)
+#   --collect-only     Only collect results from a previous swarm run
+#   --output-dir DIR   Directory for results (default: <repo root>/duke_results)
+#   --dry-run          Print what would be done without executing
+#   -h, --help         Show this help
+#
+# Prerequisites:
+#   - deploy_sites.conf configured with DL0/DL2/DL3 entries and
+#     SITES=(DL0 DL2 DL3)
+#   - sshpass, expect installed
+#   - Duke dataset available on each site at the configured DATADIR paths
+#   - GPU available on each site
+#
+# The script uses deploy_and_test.sh for the swarm pipeline and
+# benchmark_models.py for local (single-site) model comparison.
+# ============================================================================
+
+set -euo pipefail  # exit on command failure, unset variable, or a failed pipeline stage
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute directory of this script
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"  # two levels above SCRIPT_DIR
+DEPLOY_SCRIPT="$PROJECT_ROOT/deploy_and_test.sh"  # driver used for build/push/deploy/start/submit
+
+# ── Defaults (the option that overrides each value is named on its line) ───
+PROJECT_FILE="application/provision/project_DUKE_test.yml"  # --project; path relative to PROJECT_ROOT
+JOB_NAME="ODELIA_ternary_classification"  # --job; passed to "deploy_and_test.sh submit"
+LOCAL_EPOCHS=5  # --local-epochs; forwarded to benchmark_models.py as --max-epochs
+OUTPUT_DIR="$PROJECT_ROOT/duke_results"  # --output-dir; parent of the per-run timestamped directory
+SKIP_BUILD=false  # --skip-build
+SKIP_DEPLOY=false  # --skip-deploy
+SKIP_SWARM=false  # --skip-swarm
+SKIP_LOCAL=false  # --skip-local
+COLLECT_ONLY=false  # --collect-only
+DRY_RUN=false  # --dry-run; print the configuration and exit
+BENCHMARK_MODELS=()  # --models; when empty, no --models flag is passed to benchmark_models.py
+
+# ── Colors ─────────────────────────────────────────────────────────────────
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+BOLD='\033[1m'
+NC='\033[0m'
+# Log helpers: colored [DUKE] tag plus the message; err writes to stderr, the others to stdout.
+info()  { printf '%b\n' "${BLUE}[DUKE]${NC} $*"; }
+ok()    { printf '%b\n' "${GREEN}[DUKE]${NC} $*"; }
+warn()  { printf '%b\n' "${YELLOW}[DUKE]${NC} $*"; }
+err()   { printf '%b\n' "${RED}[DUKE]${NC} $*" >&2; }
+step()  { printf '\n%b\n' "${BOLD}═══ DUKE BENCHMARK: $* ═══${NC}"; }
+
+# ── Parse Arguments ────────────────────────────────────────────────────────
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --project)      PROJECT_FILE="${2:?--project requires a value}"; shift 2 ;;
+        --job)          JOB_NAME="${2:?--job requires a value}"; shift 2 ;;
+        --models)       shift; while [[ $# -gt 0 && "$1" != --* ]]; do
+                            BENCHMARK_MODELS+=("$1"); shift
+                        done ;;
+        --local-epochs) LOCAL_EPOCHS="${2:?--local-epochs requires a value}"; shift 2 ;;
+        --skip-build)   SKIP_BUILD=true; shift ;;
+        --skip-deploy)  SKIP_DEPLOY=true; shift ;;
+        --skip-swarm)   SKIP_SWARM=true; shift ;;
+        --skip-local)   SKIP_LOCAL=true; shift ;;
+        --collect-only) COLLECT_ONLY=true; SKIP_BUILD=true; SKIP_DEPLOY=true; SKIP_LOCAL=true; shift ;;  # collection only
+        --output-dir)   OUTPUT_DIR="${2:?--output-dir requires a value}"; shift 2 ;;
+        --dry-run)      DRY_RUN=true; shift ;;
+        -h|--help)      head -n 35 "$0" | tail -n +2 | sed -E 's/^# ?//'; exit 0 ;;  # print the header comment
+        *) err "Unknown option: $1"; exit 1 ;;
+    esac
+done
+if [[ ! "$LOCAL_EPOCHS" =~ ^(0|[1-9][0-9]*)$ ]]; then  # value is written unquoted into benchmark_config.json
+    err "--local-epochs must be a non-negative integer, got: $LOCAL_EPOCHS"; exit 1
+fi
+
+# ── Sanity Checks ──────────────────────────────────────────────────────────
+# The deploy driver and the provision YAML must both exist before any work starts.
+[[ -f "$DEPLOY_SCRIPT" ]] || {
+    err "deploy_and_test.sh not found at: $DEPLOY_SCRIPT"
+    exit 1
+}
+[[ -f "$PROJECT_ROOT/$PROJECT_FILE" ]] || {
+    err "Project file not found: $PROJECT_ROOT/$PROJECT_FILE"
+    exit 1
+}
+
+# ── Summary ────────────────────────────────────────────────────────────────
+# Print the effective settings first so that --dry-run shows exactly what would run.
+step "Configuration"
+info "Project file:   $PROJECT_FILE"
+info "Job:            $JOB_NAME"
+info "Local epochs:   $LOCAL_EPOCHS"
+info "Output dir:     $OUTPUT_DIR"
+info "Skip build:     $SKIP_BUILD"
+info "Skip deploy:    $SKIP_DEPLOY"
+info "Skip swarm:     $SKIP_SWARM"
+info "Skip local:     $SKIP_LOCAL"
+info "Collect only:   $COLLECT_ONLY"
+case ${#BENCHMARK_MODELS[@]} in
+    0) info "Models:         (all built-in)" ;;
+    *) info "Models:         ${BENCHMARK_MODELS[*]}" ;;
+esac
+
+# --dry-run stops here, before any directory is created or any phase is started.
+[[ "$DRY_RUN" != true ]] || {
+    ok "Dry run — exiting."; exit 0
+}
+
+mkdir -p "$OUTPUT_DIR"
+
+# ── Timestamp ──────────────────────────────────────────────────────────────
+TIMESTAMP=$(date "+%Y%m%d_%H%M%S")  # e.g. 20250405_143000; names this run's directory
+RESULTS_DIR="$OUTPUT_DIR/$TIMESTAMP"  # each run writes into its own subdirectory
+mkdir -p "$RESULTS_DIR"
+info "Results directory: $RESULTS_DIR"
+
+# Record the run parameters plus git commit, branch and version for reproducibility
+cat > "$RESULTS_DIR/benchmark_config.json" <<CONFEOF  # unquoted delimiter: $VARS and $(...) below are expanded
+{
+    "timestamp": "$TIMESTAMP",
+    "project_file": "$PROJECT_FILE",
+    "job_name": "$JOB_NAME",
+    "local_epochs": $LOCAL_EPOCHS,
+    "git_commit": "$(git -C "$PROJECT_ROOT" rev-parse HEAD)",
+    "git_branch": "$(git -C "$PROJECT_ROOT" branch --show-current)",
+    "version": "$("$PROJECT_ROOT/scripts/build/getVersionNumber.sh")"
+}
+CONFEOF
+
+# ── Phase 1: Build & Push ─────────────────────────────────────────────────
+if [[ "$SKIP_BUILD" == false && "$COLLECT_ONLY" == false ]]; then
+    step "Phase 1: Build Docker image"
+    (
+        export PROJECT_FILE  # visible only inside this subshell
+        "$DEPLOY_SCRIPT" build
+        "$DEPLOY_SCRIPT" push  # inside the subshell so push sees the same PROJECT_FILE as build
+    )
+else
+    info "Skipping build/push"
+fi
+
+# ── Phase 2: Deploy ───────────────────────────────────────────────────────
+# Deployment is skipped on request (--skip-deploy) and whenever --collect-only is set.
+if [[ "$SKIP_DEPLOY" != false || "$COLLECT_ONLY" != false ]]; then
+    info "Skipping deploy"
+else
+    step "Phase 2: Deploy to sites"
+    # PROJECT_FILE is handed to deploy_and_test.sh through the environment
+    # of this single command; it is not exported in the current shell.
+    PROJECT_FILE="$PROJECT_FILE" "$DEPLOY_SCRIPT" deploy
+fi
+
+# ── Phase 3: Swarm Training ───────────────────────────────────────────────
+if [[ "$SKIP_SWARM" != false || "$COLLECT_ONLY" != false ]]; then
+    info "Skipping swarm training"
+else
+    step "Phase 3: Swarm training"
+    # Each deploy_and_test.sh call receives PROJECT_FILE through its own environment.
+    info "Starting server..."
+    PROJECT_FILE="$PROJECT_FILE" "$DEPLOY_SCRIPT" start-server
+
+    info "Starting clients..."
+    PROJECT_FILE="$PROJECT_FILE" "$DEPLOY_SCRIPT" start-clients
+    # Fixed pause so the clients can register before the job is submitted.
+    info "Waiting 30s for clients to register..."
+    sleep 30
+
+    info "Submitting job: $JOB_NAME"
+    PROJECT_FILE="$PROJECT_FILE" "$DEPLOY_SCRIPT" submit "$JOB_NAME"
+    # The messages below tell the user how to follow the submitted job.
+    ok "Swarm training submitted. Monitor with:"
+    info "  ./deploy_and_test.sh status"
+    info "  ./deploy_and_test.sh logs <SITE>"
+    echo ""
+    warn "Wait for training to complete before running --collect-only."
+    warn "Training typically takes 2-8 hours depending on dataset size and rounds."
+fi
+
+# ── Phase 4: Collect Swarm Results ─────────────────────────────────────────
+if [[ "$COLLECT_ONLY" == true || "$SKIP_SWARM" == false ]]; then
+    step "Phase 4: Collect swarm results"
+
+    SWARM_RESULTS_DIR="$RESULTS_DIR/swarm"
+    mkdir -p "$SWARM_RESULTS_DIR"
+
+    # Locate the workspace: project name from the provision YAML, version placeholder filled in
+    VERSION=$("$PROJECT_ROOT/scripts/build/getVersionNumber.sh")
+    PROJECT_NAME=$(grep "^name: " "$PROJECT_ROOT/$PROJECT_FILE" \
+        | sed 's/^name: //' \
+        | sed "s/__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__/$VERSION/")
+    WORKSPACE_DIR="$PROJECT_ROOT/workspace/$PROJECT_NAME"
+
+    if [[ -d "$WORKSPACE_DIR" ]]; then
+        # Newest prod_* by version sort; "|| true" keeps a no-match ls (pipefail) from aborting under set -e
+        PROD_DIR=$(ls -d "$WORKSPACE_DIR"/prod_* 2>/dev/null | sort -V | tail -n 1 || true)
+        if [[ -n "$PROD_DIR" ]]; then
+            info "Collecting from: $PROD_DIR"
+
+            # Copy checkpoint files, keeping their path relative to the prod directory
+            find "$PROD_DIR" -name "FL_global_model.pt" -o -name "best_FL_global_model.pt" | while IFS= read -r ckpt; do
+                rel_path="${ckpt#"$PROD_DIR/"}"
+                dest_dir="$SWARM_RESULTS_DIR/$(dirname "$rel_path")"
+                mkdir -p "$dest_dir"
+                cp "$ckpt" "$dest_dir/"
+                info "  Collected: $rel_path"
+            done
+
+            # Copy CSV outputs with the same relative layout
+            find "$PROD_DIR" -name "*.csv" | while IFS= read -r csv; do
+                rel_path="${csv#"$PROD_DIR/"}"
+                dest_dir="$SWARM_RESULTS_DIR/$(dirname "$rel_path")"
+                mkdir -p "$dest_dir"
+                cp "$csv" "$dest_dir/"
+            done
+
+            # Run predict.py (if present) against the prod workspace; a failure only warns
+            if [[ -f "$SCRIPT_DIR/predict.py" ]]; then
+                info "Running prediction evaluation on swarm checkpoints..."
+                python3 "$SCRIPT_DIR/predict.py" \
+                    --workspace "$PROD_DIR" \
+                    --best-only \
+                    --ensemble \
+                    --output-dir "$SWARM_RESULTS_DIR/predictions" \
+                    2>&1 | tee "$SWARM_RESULTS_DIR/predict_output.log" || \
+                    warn "predict.py failed — check $SWARM_RESULTS_DIR/predict_output.log"
+            fi
+
+            ok "Swarm results collected at: $SWARM_RESULTS_DIR"
+        else
+            warn "No prod_* directories found in $WORKSPACE_DIR"
+        fi
+    else
+        warn "Workspace not found: $WORKSPACE_DIR"
+        warn "Swarm training may not have completed yet."
+    fi
+fi
+
+# ── Phase 5: Local Benchmark ──────────────────────────────────────────────
+if [[ "$SKIP_LOCAL" != false ]]; then
+    info "Skipping local benchmark"
+else
+    step "Phase 5: Local model benchmark"
+
+    LOCAL_RESULTS_DIR="$RESULTS_DIR/local"
+    mkdir -p "$LOCAL_RESULTS_DIR"
+
+    # Arguments are kept in an array so each value stays a single word when expanded.
+    BENCHMARK_ARGS=(--max-epochs "$LOCAL_EPOCHS")
+    BENCHMARK_ARGS+=(--output "$LOCAL_RESULTS_DIR/benchmark_results.json")
+    # --models is forwarded only when the user listed models explicitly.
+    if (( ${#BENCHMARK_MODELS[@]} > 0 )); then
+        BENCHMARK_ARGS+=(--models "${BENCHMARK_MODELS[@]}")
+    fi
+
+    info "Running benchmark_models.py with args: ${BENCHMARK_ARGS[*]}"
+    # Output is mirrored to a log file; a failing benchmark only produces a warning.
+    if ! python3 "$SCRIPT_DIR/benchmark_models.py" "${BENCHMARK_ARGS[@]}" \
+            2>&1 | tee "$LOCAL_RESULTS_DIR/benchmark_output.log"; then
+        warn "benchmark_models.py failed — check $LOCAL_RESULTS_DIR/benchmark_output.log"
+    fi
+
+    ok "Local benchmark results at: $LOCAL_RESULTS_DIR"
+fi
+
+# ── Phase 6: Summary ──────────────────────────────────────────────────────
+# Final report: list every file under the results directory with its size, then the next steps.
+step "Summary"
+echo
+info "Results directory: $RESULTS_DIR"
+info "Contents:"
+find "$RESULTS_DIR" -type f | sort | while read -r result_file; do
+    human_size=$(du -h "$result_file" | cut -f1)
+    printf '  %s  %s\n' "$human_size" "${result_file#"$RESULTS_DIR/"}"
+done
+
+echo
+ok "Duke benchmark pipeline complete!"
+info "Next steps:"
+info "  1. Review results in $RESULTS_DIR/"
+info "  2. Compare swarm vs local performance"
+info "  3. Update docs/DUKE_BENCHMARK_RESULTS.md with findings"