diff --git a/.gitignore b/.gitignore index 57f880a5..3ed9cc73 100644 --- a/.gitignore +++ b/.gitignore @@ -195,4 +195,7 @@ tests/results/ challenge_model_test/ runs -*.bak \ No newline at end of file +*.bak + +# Duke benchmark output +duke_results/ \ No newline at end of file diff --git a/deploy_and_test.sh b/deploy_and_test.sh index 796d2e91..dfb73b04 100755 --- a/deploy_and_test.sh +++ b/deploy_and_test.sh @@ -59,8 +59,11 @@ PROJECT_NAME=$(grep "^name: " "$SCRIPT_DIR/$PROJECT_FILE" \ | sed "s/__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__/$VERSION/") WORKSPACE_DIR="$SCRIPT_DIR/workspace/$PROJECT_NAME" -# All sites to deploy to (add more here if needed) -SITES=(MHA RSH) +# All sites to deploy to — configured in deploy_sites.conf via SITES=() +# Falls back to (MHA RSH) if deploy_sites.conf doesn't define SITES. +if [[ -z "${SITES+x}" || ${#SITES[@]} -eq 0 ]]; then + SITES=(MHA RSH) +fi # SSH options for sshpass SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR" @@ -216,7 +219,8 @@ cmd_start_server() { local prod_dir prod_dir=$(find_latest_prod) - local server_startup="$prod_dir/dl3.tud.de/startup" + local server_name="${SERVER_NAME:-dl3.tud.de}" + local server_startup="$prod_dir/$server_name/startup" if [[ ! 
-d "$server_startup" ]]; then err "Server startup kit not found: $server_startup" @@ -232,7 +236,7 @@ cmd_start_server() { sleep 10 # Verify - if docker ps --format '{{.Names}}' | grep -q "odelia_swarm_server"; then + if docker ps --format '{{.Names}}' | grep -qE "odelia_swarm|nvflare"; then ok "Server container is running" else warn "Server container not detected — it may still be starting" @@ -320,7 +324,7 @@ cmd_status() { echo "" info "Local containers:" - docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' | grep -E "odelia|NAMES" || echo " (none)" + docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' | grep -E "odelia|stamp|nvflare|NAMES" || echo " (none)" check_dependencies @@ -332,7 +336,7 @@ cmd_status() { echo "" info "$site ($site_name @ $host):" remote_exec "$site" \ - "docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null | grep -E 'odelia|NAMES' || echo ' (none)'" \ + "docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null | grep -E 'odelia|stamp|nvflare|NAMES' || echo ' (none)'" \ 2>/dev/null || warn " Could not connect to $host" done } @@ -341,7 +345,7 @@ cmd_logs() { local target="${1:-}" if [[ -z "$target" ]]; then err "Usage: ./deploy_and_test.sh logs " - echo " Sites: MHA, RSH, server" + echo " Sites: ${SITES[*]}, server" exit 1 fi @@ -350,7 +354,8 @@ cmd_logs() { if [[ "$target" == "SERVER" ]]; then local prod_dir prod_dir=$(find_latest_prod) - local log_file="$prod_dir/dl3.tud.de/startup/nohup.out" + local server_name="${SERVER_NAME:-dl3.tud.de}" + local log_file="$prod_dir/$server_name/startup/nohup.out" if [[ -f "$log_file" ]]; then step "Server logs (last 50 lines)" tail -50 "$log_file" @@ -393,7 +398,7 @@ cmd_stop() { info "Stopping local containers..." 
# Kill all odelia containers locally local local_containers - local_containers=$(docker ps --format '{{.Names}}' | grep "odelia_swarm" || true) + local_containers=$(docker ps --format '{{.Names}}' | grep -E "odelia_swarm|stamp|nvflare" || true) if [[ -n "$local_containers" ]]; then echo "$local_containers" | xargs docker kill 2>/dev/null || true ok "Stopped local containers" @@ -411,7 +416,7 @@ cmd_stop() { echo "" info "Stopping containers on $site ($host)..." remote_exec "$site" \ - "docker ps --format '{{.Names}}' | grep 'odelia_swarm' | xargs -r docker kill 2>/dev/null || true" \ + "docker ps --format '{{.Names}}' | grep -E 'odelia_swarm|stamp|nvflare' | xargs -r docker kill 2>/dev/null || true" \ 2>/dev/null || warn " Could not connect to $host" ok " Stopped containers on $site" done @@ -466,6 +471,9 @@ usage() { echo " $0 submit challenge_3agaldran # Submit a different job" echo " $0 logs MHA # Check MHA logs" echo " $0 stop # Kill everything" + echo "" + echo "Sites are configured in deploy_sites.conf via SITES=(SITE1 SITE2 ...)." + echo "Server name is configured via SERVER_NAME=dl3.tud.de (default)." } COMMAND="${1:-}" diff --git a/deploy_sites.conf.example b/deploy_sites.conf.example new file mode 100644 index 00000000..b46e796a --- /dev/null +++ b/deploy_sites.conf.example @@ -0,0 +1,75 @@ +# MediSwarm Deployment Site Configuration +# Copy this file to deploy_sites.conf and fill in your credentials. +# deploy_sites.conf is in .gitignore — it should NEVER be committed. +# +# Usage: +# cp deploy_sites.conf.example deploy_sites.conf +# vim deploy_sites.conf # fill in passwords and paths +# ./deploy_and_test.sh all + +# ── Sites to deploy to ──────────────────────────────────────────── +# List the short names of all client sites. Each name must have +# corresponding _HOST, _USER, etc. variables below. +SITES=(MHA RSH) + +# ── Server ───────────────────────────────────────────────────────── +# The FQDN of the NVFlare server (must match the provision YAML). 
+SERVER_NAME=dl3.tud.de + +# ── Defaults ─────────────────────────────────────────────────────── +PROJECT_FILE=application/provision/project_Challenge_test.yml +DEFAULT_JOB=challenge_1DivideAndConquer +ADMIN_USER=jiefu.zhu@tu-dresden.de + +# ── Site: MHA ────────────────────────────────────────────────────── +MHA_HOST=172.24.4.91 +MHA_USER=odelia +MHA_PASS='CHANGEME' +MHA_SITE_NAME=MHA_1 +MHA_DATADIR=/home/odelia/MediSwarm/data +MHA_SCRATCHDIR=/home/odelia/MediSwarm/data/MHA_1/tmp +MHA_DEPLOY_DIR=/home/odelia/Odelia +MHA_GPU="device=0" + +# ── Site: RSH ────────────────────────────────────────────────────── +RSH_HOST=172.24.4.71 +RSH_USER=asoro +RSH_PASS='CHANGEME' +RSH_SITE_NAME=RSH_1 +RSH_DATADIR=/home/asoro/odelia/RSH/ +RSH_SCRATCHDIR=/home/asoro/odelia/RSH/scratch/ +RSH_DEPLOY_DIR=/home/asoro/Odelia +RSH_GPU="device=0" + +# ── Site: DL0 (Duke Benchmark) ──────────────────────────────────── +# Uncomment and configure for Duke dataset benchmarks on dl0/dl2/dl3. +# Add DL0 DL2 DL3 to SITES=() above when using these. 
+# +# DL0_HOST=dl0.tud.de +# DL0_USER=swarm +# DL0_PASS='CHANGEME' +# DL0_SITE_NAME=TUD_1 +# DL0_DATADIR=/data/duke/TUD_1 +# DL0_SCRATCHDIR=/scratch/duke/TUD_1 +# DL0_DEPLOY_DIR=/home/swarm/MediSwarm +# DL0_GPU="device=0" + +# ── Site: DL2 (Duke Benchmark) ──────────────────────────────────── +# DL2_HOST=dl2.tud.de +# DL2_USER=swarm +# DL2_PASS='CHANGEME' +# DL2_SITE_NAME=TUD_2 +# DL2_DATADIR=/data/duke/TUD_2 +# DL2_SCRATCHDIR=/scratch/duke/TUD_2 +# DL2_DEPLOY_DIR=/home/swarm/MediSwarm +# DL2_GPU="device=0" + +# ── Site: DL3 (Duke Benchmark — also runs server) ───────────────── +# DL3_HOST=dl3.tud.de +# DL3_USER=swarm +# DL3_PASS='CHANGEME' +# DL3_SITE_NAME=TUD_3 +# DL3_DATADIR=/data/duke/TUD_3 +# DL3_SCRATCHDIR=/scratch/duke/TUD_3 +# DL3_DEPLOY_DIR=/home/swarm/MediSwarm +# DL3_GPU="device=1" diff --git a/docs/DUKE_BENCHMARK_RESULTS.md b/docs/DUKE_BENCHMARK_RESULTS.md new file mode 100644 index 00000000..5f024476 --- /dev/null +++ b/docs/DUKE_BENCHMARK_RESULTS.md @@ -0,0 +1,119 @@ +# Duke Breast MRI Benchmark Results + +## Overview + +This document records benchmark results for MediSwarm federated learning on the +[Duke Breast MRI](https://doi.org/10.7937/TCIA.e3sv-re93) dataset across the +TUD compute cluster (dl0, dl2, dl3). 
+ +The Duke dataset is a public collection of dynamic contrast-enhanced (DCE) MRI +sequences used for ternary classification of breast lesions: + +| Class | Label | Description | +|-------|-------|-------------| +| 0 | Benign | Benign lesion | +| 1 | Malignant (non-PCR) | Malignant, no pathological complete response | +| 2 | Malignant (PCR) | Malignant, pathological complete response | + +## Infrastructure + +| Machine | Role | GPU | Dataset Partition | +|---------|------|-----|-------------------| +| dl3.tud.de | Server + Client (TUD_3) | NVIDIA A100 | ~33% of Duke | +| dl0.tud.de | Client (TUD_1) | NVIDIA A100 | ~33% of Duke | +| dl2.tud.de | Client (TUD_2) | NVIDIA A100 | ~33% of Duke | + +**Provision file:** `application/provision/project_DUKE_test.yml` + +## How to Reproduce + +```bash +# 1. Configure deploy_sites.conf with DL0/DL2/DL3 credentials +# (see deploy_sites.conf.example for template) + +# 2. Run the full benchmark pipeline +./scripts/evaluation/run_duke_benchmark.sh \ + --project application/provision/project_DUKE_test.yml \ + --job ODELIA_ternary_classification \ + --local-epochs 10 + +# 3. Or run individual phases: +./scripts/evaluation/run_duke_benchmark.sh --skip-swarm # local only +./scripts/evaluation/run_duke_benchmark.sh --skip-local # swarm only +./scripts/evaluation/run_duke_benchmark.sh --collect-only # just gather results +``` + +## Results + +> **Status:** Pending first run. Update this section after completing the benchmark. 
+ +### Swarm Training (Federated) + +| Metric | Value | +|--------|-------| +| Aggregation | FedAvg (InTimeAccumulateWeightedAggregator) | +| Rounds | TBD | +| Clients | 3 (dl0, dl2, dl3) | +| Model | TBD | +| Best AUC-ROC (macro) | TBD | +| Best Accuracy | TBD | +| Training time | TBD | + +### Local Training (Single-Site Benchmark) + +| Model | ACC | AUC-ROC | F1 (macro) | Params | Time/epoch | +|-------|-----|---------|------------|--------|------------| +| ResNet10 | - | - | - | - | - | +| ResNet18 | - | - | - | - | - | +| ResNet34 | - | - | - | - | - | +| ResNet50 | - | - | - | - | - | +| MST | - | - | - | - | - | +| Swin3D | - | - | - | - | - | + +### Swarm vs Local Comparison + +| Training Mode | Best Model | AUC-ROC | ACC | Notes | +|---------------|-----------|---------|-----|-------| +| Local (single site) | - | - | - | - | +| Swarm (3 sites) | - | - | - | - | + +## Analysis + +> To be completed after benchmark run. + +### Key Questions + +1. **Does federated training improve over local?** Compare swarm AUC-ROC with + the best single-site local model. +2. **Which model architecture works best?** Review `benchmark_results.json` for + the local comparison table. +3. **How does data heterogeneity affect convergence?** Check per-round metrics + across sites. +4. **Is FedProx beneficial?** If tested, compare FedAvg vs FedProx convergence + and final metrics. + +## Files + +Results from benchmark runs are saved under `duke_results/<timestamp>/`: + +``` +duke_results/ + 20250405_143000/ + benchmark_config.json # Run configuration + swarm/ + predictions/ # predict.py output + // + app_TUD_1/ + best_FL_global_model.pt + FL_global_model.pt + app_TUD_2/ + ... + local/ + benchmark_results.json # benchmark_models.py output + benchmark_output.log + ResNet18/ # Per-model checkpoints + MST/ + ... +``` + +Note: `duke_results/` is in `.gitignore` — results are not committed to the repo. 
diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index d42f7f5c..b924b29b 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -8,6 +8,7 @@ Scripts for evaluating and comparing MediSwarm model performance. |--------|---------| | `predict.py` | Run prediction on external test datasets using trained swarm models | | `benchmark_models.py` | Benchmark all MediSwarm models on a consistent train/val/test split | +| `run_duke_benchmark.sh` | End-to-end Duke dataset benchmark: build, deploy, train, collect, evaluate | | `plot_aurocs_from_classprob_csvs.py` | Compute and plot AUROCs from class probability CSV files produced during training | | `parse_logs_and_plot.py` | Parse training logs and plot convergence curves (legacy) | @@ -236,6 +237,50 @@ The script automatically verifies: --- +## `run_duke_benchmark.sh` + +End-to-end benchmark pipeline for the Duke Breast MRI dataset on the TUD compute cluster (dl0, dl2, dl3). Orchestrates the full workflow: Docker build, push, deploy, swarm training, result collection, and local model benchmarking. 
+ +### Prerequisites + +- `deploy_sites.conf` configured with DL0/DL2/DL3 entries (see `deploy_sites.conf.example`) +- `sshpass` and `expect` installed +- Duke dataset available on each site +- GPU available on each site + +### Usage + +```bash +# Full pipeline (build, deploy, train swarm, benchmark local): +./run_duke_benchmark.sh + +# Swarm only (skip local benchmark): +./run_duke_benchmark.sh --skip-local + +# Local benchmark only (skip swarm): +./run_duke_benchmark.sh --skip-swarm + +# Collect results from a previous swarm run: +./run_duke_benchmark.sh --collect-only + +# Custom models and epochs: +./run_duke_benchmark.sh --models MST ResNet18 Swin3D --local-epochs 10 + +# Dry run (print configuration only): +./run_duke_benchmark.sh --dry-run +``` + +### Output + +Results are saved to `duke_results/<timestamp>/`: +- `benchmark_config.json` -- Run configuration for reproducibility +- `swarm/` -- Collected checkpoints and prediction CSVs from swarm training +- `local/` -- `benchmark_results.json` from `benchmark_models.py` + +See `docs/DUKE_BENCHMARK_RESULTS.md` for the results template and analysis. + +--- + ## `parse_logs_and_plot.py` (Legacy) Parses training console logs to extract AUC-ROC values and plots convergence curves for swarm vs. local training across sites. 
diff --git a/scripts/evaluation/run_duke_benchmark.sh b/scripts/evaluation/run_duke_benchmark.sh new file mode 100755 index 00000000..9cdeeede --- /dev/null +++ b/scripts/evaluation/run_duke_benchmark.sh @@ -0,0 +1,298 @@ +#!/usr/bin/env bash +# ============================================================================ +# run_duke_benchmark.sh — End-to-end Duke Breast MRI benchmark on dl0/dl2/dl3 +# +# Orchestrates: build -> push -> deploy -> train (swarm) -> collect results -> +# run benchmark_models.py -> summarise +# +# Usage: +# ./run_duke_benchmark.sh [options] +# +# Options: +# --project FILE Project provision YAML (default: project_DUKE_test.yml) +# --job JOB Job to submit (default: ODELIA_ternary_classification) +# --models M1 M2.. Models for local benchmark (default: all built-in) +# --local-epochs N Epochs for local benchmark_models.py (default: 5) +# --skip-build Skip Docker build (use existing image) +# --skip-deploy Skip deployment (already deployed) +# --skip-swarm Skip swarm training (only run local benchmark) +# --skip-local Skip local benchmark (only run swarm) +# --collect-only Only collect results from a previous swarm run +# --output-dir DIR Directory for results (default: ./duke_results) +# --dry-run Print what would be done without executing +# -h, --help Show this help +# +# Prerequisites: +# - deploy_sites.conf configured with DL0/DL2/DL3 entries and +# SITES=(DL0 DL2 DL3) +# - sshpass, expect installed +# - Duke dataset available on each site at the configured DATADIR paths +# - GPU available on each site +# +# The script uses deploy_and_test.sh for the swarm pipeline and +# benchmark_models.py for local (single-site) model comparison. +# ============================================================================ + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" +DEPLOY_SCRIPT="$PROJECT_ROOT/deploy_and_test.sh" + +# ── Defaults ─────────────────────────────────────────────────────────────── +PROJECT_FILE="application/provision/project_DUKE_test.yml" +JOB_NAME="ODELIA_ternary_classification" +LOCAL_EPOCHS=5 +OUTPUT_DIR="$PROJECT_ROOT/duke_results" +SKIP_BUILD=false +SKIP_DEPLOY=false +SKIP_SWARM=false +SKIP_LOCAL=false +COLLECT_ONLY=false +DRY_RUN=false +BENCHMARK_MODELS=() + +# ── Colors ───────────────────────────────────────────────────────────────── +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +BOLD='\033[1m' +NC='\033[0m' + +info() { echo -e "${BLUE}[DUKE]${NC} $*"; } +ok() { echo -e "${GREEN}[DUKE]${NC} $*"; } +warn() { echo -e "${YELLOW}[DUKE]${NC} $*"; } +err() { echo -e "${RED}[DUKE]${NC} $*" >&2; } +step() { echo -e "\n${BOLD}═══ DUKE BENCHMARK: $* ═══${NC}"; } + +# ── Parse Arguments ──────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + --project) PROJECT_FILE="$2"; shift 2 ;; + --job) JOB_NAME="$2"; shift 2 ;; + --models) shift; while [[ $# -gt 0 && "$1" != --* ]]; do + BENCHMARK_MODELS+=("$1"); shift + done ;; + --local-epochs) LOCAL_EPOCHS="$2"; shift 2 ;; + --skip-build) SKIP_BUILD=true; shift ;; + --skip-deploy) SKIP_DEPLOY=true; shift ;; + --skip-swarm) SKIP_SWARM=true; shift ;; + --skip-local) SKIP_LOCAL=true; shift ;; + --collect-only) COLLECT_ONLY=true; SKIP_BUILD=true; SKIP_DEPLOY=true; shift ;; + --output-dir) OUTPUT_DIR="$2"; shift 2 ;; + --dry-run) DRY_RUN=true; shift ;; + -h|--help) + head -n 35 "$0" | tail -n +2 | sed 's/^# \?//' + exit 0 + ;; + *) err "Unknown option: $1"; exit 1 ;; + esac +done + +# ── Sanity Checks ────────────────────────────────────────────────────────── +if [[ ! -f "$DEPLOY_SCRIPT" ]]; then + err "deploy_and_test.sh not found at: $DEPLOY_SCRIPT" + exit 1 +fi + +if [[ ! 
-f "$PROJECT_ROOT/$PROJECT_FILE" ]]; then + err "Project file not found: $PROJECT_ROOT/$PROJECT_FILE" + exit 1 +fi + +# ── Summary ──────────────────────────────────────────────────────────────── +step "Configuration" +info "Project file: $PROJECT_FILE" +info "Job: $JOB_NAME" +info "Local epochs: $LOCAL_EPOCHS" +info "Output dir: $OUTPUT_DIR" +info "Skip build: $SKIP_BUILD" +info "Skip deploy: $SKIP_DEPLOY" +info "Skip swarm: $SKIP_SWARM" +info "Skip local: $SKIP_LOCAL" +info "Collect only: $COLLECT_ONLY" +if [[ ${#BENCHMARK_MODELS[@]} -gt 0 ]]; then + info "Models: ${BENCHMARK_MODELS[*]}" +else + info "Models: (all built-in)" +fi + +if [[ "$DRY_RUN" == true ]]; then + ok "Dry run — exiting." + exit 0 +fi + +mkdir -p "$OUTPUT_DIR" + +# ── Timestamp ────────────────────────────────────────────────────────────── +TIMESTAMP=$(date "+%Y%m%d_%H%M%S") +RESULTS_DIR="$OUTPUT_DIR/$TIMESTAMP" +mkdir -p "$RESULTS_DIR" +info "Results directory: $RESULTS_DIR" + +# Save configuration for reproducibility +cat > "$RESULTS_DIR/benchmark_config.json" <" + echo "" + warn "Wait for training to complete before running --collect-only." + warn "Training typically takes 2-8 hours depending on dataset size and rounds." 
+else + info "Skipping swarm training" +fi + +# ── Phase 4: Collect Swarm Results ───────────────────────────────────────── +if [[ "$COLLECT_ONLY" == true || "$SKIP_SWARM" == false ]]; then + step "Phase 4: Collect swarm results" + + SWARM_RESULTS_DIR="$RESULTS_DIR/swarm" + mkdir -p "$SWARM_RESULTS_DIR" + + # Find workspace with checkpoints + VERSION=$("$PROJECT_ROOT/scripts/build/getVersionNumber.sh") + PROJECT_NAME=$(grep "^name: " "$PROJECT_ROOT/$PROJECT_FILE" \ + | sed 's/^name: //' \ + | sed "s/__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__/$VERSION/") + WORKSPACE_DIR="$PROJECT_ROOT/workspace/$PROJECT_NAME" + + if [[ -d "$WORKSPACE_DIR" ]]; then + # Find latest prod directory + PROD_DIR=$(ls -d "$WORKSPACE_DIR"/prod_* 2>/dev/null | sort -V | tail -n 1) + if [[ -n "$PROD_DIR" ]]; then + info "Collecting from: $PROD_DIR" + + # Copy checkpoint files + find "$PROD_DIR" -name "FL_global_model.pt" -o -name "best_FL_global_model.pt" | while read -r ckpt; do + rel_path="${ckpt#"$PROD_DIR/"}" + dest_dir="$SWARM_RESULTS_DIR/$(dirname "$rel_path")" + mkdir -p "$dest_dir" + cp "$ckpt" "$dest_dir/" + info " Collected: $rel_path" + done + + # Copy CSV outputs + find "$PROD_DIR" -name "*.csv" | while read -r csv; do + rel_path="${csv#"$PROD_DIR/"}" + dest_dir="$SWARM_RESULTS_DIR/$(dirname "$rel_path")" + mkdir -p "$dest_dir" + cp "$csv" "$dest_dir/" + done + + # Run predict.py on collected checkpoints if available + if [[ -f "$SCRIPT_DIR/predict.py" ]]; then + info "Running prediction evaluation on swarm checkpoints..." 
+ python3 "$SCRIPT_DIR/predict.py" \ + --workspace "$PROD_DIR" \ + --best-only \ + --ensemble \ + --output-dir "$SWARM_RESULTS_DIR/predictions" \ + 2>&1 | tee "$SWARM_RESULTS_DIR/predict_output.log" || \ + warn "predict.py failed — check $SWARM_RESULTS_DIR/predict_output.log" + fi + + ok "Swarm results collected at: $SWARM_RESULTS_DIR" + else + warn "No prod_* directories found in $WORKSPACE_DIR" + fi + else + warn "Workspace not found: $WORKSPACE_DIR" + warn "Swarm training may not have completed yet." + fi +fi + +# ── Phase 5: Local Benchmark ────────────────────────────────────────────── +if [[ "$SKIP_LOCAL" == false ]]; then + step "Phase 5: Local model benchmark" + + LOCAL_RESULTS_DIR="$RESULTS_DIR/local" + mkdir -p "$LOCAL_RESULTS_DIR" + + BENCHMARK_ARGS=( + --max-epochs "$LOCAL_EPOCHS" + --output "$LOCAL_RESULTS_DIR/benchmark_results.json" + ) + + if [[ ${#BENCHMARK_MODELS[@]} -gt 0 ]]; then + BENCHMARK_ARGS+=(--models "${BENCHMARK_MODELS[@]}") + fi + + info "Running benchmark_models.py with args: ${BENCHMARK_ARGS[*]}" + + python3 "$SCRIPT_DIR/benchmark_models.py" "${BENCHMARK_ARGS[@]}" \ + 2>&1 | tee "$LOCAL_RESULTS_DIR/benchmark_output.log" || \ + warn "benchmark_models.py failed — check $LOCAL_RESULTS_DIR/benchmark_output.log" + + ok "Local benchmark results at: $LOCAL_RESULTS_DIR" +else + info "Skipping local benchmark" +fi + +# ── Phase 6: Summary ────────────────────────────────────────────────────── +step "Summary" + +echo "" +info "Results directory: $RESULTS_DIR" +info "Contents:" +find "$RESULTS_DIR" -type f | sort | while read -r f; do + size=$(du -h "$f" | cut -f1) + echo " $size ${f#"$RESULTS_DIR/"}" +done + +echo "" +ok "Duke benchmark pipeline complete!" +info "Next steps:" +info " 1. Review results in $RESULTS_DIR/" +info " 2. Compare swarm vs local performance" +info " 3. Update docs/DUKE_BENCHMARK_RESULTS.md with findings"