Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,7 @@ tests/results/
challenge_model_test/

runs
*.bak
*.bak

# Duke benchmark output
duke_results/
28 changes: 18 additions & 10 deletions deploy_and_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,11 @@ PROJECT_NAME=$(grep "^name: " "$SCRIPT_DIR/$PROJECT_FILE" \
| sed "s/__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__/$VERSION/")
WORKSPACE_DIR="$SCRIPT_DIR/workspace/$PROJECT_NAME"

# All sites to deploy to (add more here if needed)
SITES=(MHA RSH)
# All sites to deploy to — configured in deploy_sites.conf via SITES=()
# Falls back to (MHA RSH) if deploy_sites.conf doesn't define SITES.
if [[ -z "${SITES+x}" || ${#SITES[@]} -eq 0 ]]; then
SITES=(MHA RSH)
fi

# SSH options for sshpass
SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"
Expand Down Expand Up @@ -216,7 +219,8 @@ cmd_start_server() {

local prod_dir
prod_dir=$(find_latest_prod)
local server_startup="$prod_dir/dl3.tud.de/startup"
local server_name="${SERVER_NAME:-dl3.tud.de}"
local server_startup="$prod_dir/$server_name/startup"

if [[ ! -d "$server_startup" ]]; then
err "Server startup kit not found: $server_startup"
Expand All @@ -232,7 +236,7 @@ cmd_start_server() {
sleep 10

# Verify
if docker ps --format '{{.Names}}' | grep -q "odelia_swarm_server"; then
if docker ps --format '{{.Names}}' | grep -qE "odelia_swarm|nvflare"; then
ok "Server container is running"
else
warn "Server container not detected — it may still be starting"
Expand Down Expand Up @@ -320,7 +324,7 @@ cmd_status() {

echo ""
info "Local containers:"
docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' | grep -E "odelia|NAMES" || echo " (none)"
docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' | grep -E "odelia|stamp|nvflare|NAMES" || echo " (none)"

check_dependencies

Expand All @@ -332,7 +336,7 @@ cmd_status() {
echo ""
info "$site ($site_name @ $host):"
remote_exec "$site" \
"docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null | grep -E 'odelia|NAMES' || echo ' (none)'" \
"docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null | grep -E 'odelia|stamp|nvflare|NAMES' || echo ' (none)'" \
2>/dev/null || warn " Could not connect to $host"
done
}
Expand All @@ -341,7 +345,7 @@ cmd_logs() {
local target="${1:-}"
if [[ -z "$target" ]]; then
err "Usage: ./deploy_and_test.sh logs <site>"
echo " Sites: MHA, RSH, server"
echo " Sites: ${SITES[*]}, server"
exit 1
fi

Expand All @@ -350,7 +354,8 @@ cmd_logs() {
if [[ "$target" == "SERVER" ]]; then
local prod_dir
prod_dir=$(find_latest_prod)
local log_file="$prod_dir/dl3.tud.de/startup/nohup.out"
local server_name="${SERVER_NAME:-dl3.tud.de}"
local log_file="$prod_dir/$server_name/startup/nohup.out"
if [[ -f "$log_file" ]]; then
step "Server logs (last 50 lines)"
tail -50 "$log_file"
Expand Down Expand Up @@ -393,7 +398,7 @@ cmd_stop() {
info "Stopping local containers..."
# Kill all odelia containers locally
local local_containers
local_containers=$(docker ps --format '{{.Names}}' | grep "odelia_swarm" || true)
local_containers=$(docker ps --format '{{.Names}}' | grep -E "odelia_swarm|stamp|nvflare" || true)
if [[ -n "$local_containers" ]]; then
echo "$local_containers" | xargs docker kill 2>/dev/null || true
ok "Stopped local containers"
Expand All @@ -411,7 +416,7 @@ cmd_stop() {
echo ""
info "Stopping containers on $site ($host)..."
remote_exec "$site" \
"docker ps --format '{{.Names}}' | grep 'odelia_swarm' | xargs -r docker kill 2>/dev/null || true" \
"docker ps --format '{{.Names}}' | grep -E 'odelia_swarm|stamp|nvflare' | xargs -r docker kill 2>/dev/null || true" \
2>/dev/null || warn " Could not connect to $host"
ok " Stopped containers on $site"
done
Expand Down Expand Up @@ -466,6 +471,9 @@ usage() {
echo " $0 submit challenge_3agaldran # Submit a different job"
echo " $0 logs MHA # Check MHA logs"
echo " $0 stop # Kill everything"
echo ""
echo "Sites are configured in deploy_sites.conf via SITES=(SITE1 SITE2 ...)."
echo "Server name is configured via SERVER_NAME=dl3.tud.de (default)."
}

COMMAND="${1:-}"
Expand Down
75 changes: 75 additions & 0 deletions deploy_sites.conf.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# MediSwarm Deployment Site Configuration
# Copy this file to deploy_sites.conf and fill in your credentials.
# deploy_sites.conf is in .gitignore — it should NEVER be committed.
#
# This file is sourced by deploy_and_test.sh, so it must remain valid
# bash syntax (no spaces around '=', arrays via NAME=(...)).
#
# Usage:
# cp deploy_sites.conf.example deploy_sites.conf
# vim deploy_sites.conf # fill in passwords and paths
# ./deploy_and_test.sh all

# ── Sites to deploy to ────────────────────────────────────────────
# List the short names of all client sites. Each name must have
# corresponding <NAME>_HOST, <NAME>_USER, etc. variables below.
# If SITES is left undefined, deploy_and_test.sh falls back to (MHA RSH).
SITES=(MHA RSH)

# ── Server ─────────────────────────────────────────────────────────
# The FQDN of the NVFlare server (must match the provision YAML).
SERVER_NAME=dl3.tud.de

# ── Defaults ───────────────────────────────────────────────────────
# PROJECT_FILE: provision YAML used to generate startup kits.
# DEFAULT_JOB:  job submitted when no job name is given on the CLI.
# ADMIN_USER:   NVFlare admin identity used to submit jobs.
PROJECT_FILE=application/provision/project_Challenge_test.yml
DEFAULT_JOB=challenge_1DivideAndConquer
ADMIN_USER=jiefu.zhu@tu-dresden.de

# ── Site: MHA ──────────────────────────────────────────────────────
# Per-site variables:
#   <NAME>_HOST        SSH host (IP or FQDN) of the site machine
#   <NAME>_USER/_PASS  SSH credentials (used via sshpass)
#   <NAME>_SITE_NAME   NVFlare client name — must match the provision YAML
#   <NAME>_DATADIR     dataset root on the remote machine
#   <NAME>_SCRATCHDIR  writable scratch/tmp directory on the remote machine
#   <NAME>_DEPLOY_DIR  where the startup kit is copied on the remote machine
#   <NAME>_GPU         GPU selector — presumably passed to docker --gpus;
#                      verify against deploy_and_test.sh before changing format
MHA_HOST=172.24.4.91
MHA_USER=odelia
MHA_PASS='CHANGEME'
MHA_SITE_NAME=MHA_1
MHA_DATADIR=/home/odelia/MediSwarm/data
MHA_SCRATCHDIR=/home/odelia/MediSwarm/data/MHA_1/tmp
MHA_DEPLOY_DIR=/home/odelia/Odelia
MHA_GPU="device=0"

# ── Site: RSH ──────────────────────────────────────────────────────
# NOTE(review): RSH_DATADIR/RSH_SCRATCHDIR carry a trailing slash while the
# MHA paths do not — confirm the consuming script tolerates both forms.
RSH_HOST=172.24.4.71
RSH_USER=asoro
RSH_PASS='CHANGEME'
RSH_SITE_NAME=RSH_1
RSH_DATADIR=/home/asoro/odelia/RSH/
RSH_SCRATCHDIR=/home/asoro/odelia/RSH/scratch/
RSH_DEPLOY_DIR=/home/asoro/Odelia
RSH_GPU="device=0"

# ── Site: DL0 (Duke Benchmark) ────────────────────────────────────
# Uncomment and configure for Duke dataset benchmarks on dl0/dl2/dl3.
# Add DL0 DL2 DL3 to SITES=() above when using these.
#
# DL0_HOST=dl0.tud.de
# DL0_USER=swarm
# DL0_PASS='CHANGEME'
# DL0_SITE_NAME=TUD_1
# DL0_DATADIR=/data/duke/TUD_1
# DL0_SCRATCHDIR=/scratch/duke/TUD_1
# DL0_DEPLOY_DIR=/home/swarm/MediSwarm
# DL0_GPU="device=0"

# ── Site: DL2 (Duke Benchmark) ────────────────────────────────────
# DL2_HOST=dl2.tud.de
# DL2_USER=swarm
# DL2_PASS='CHANGEME'
# DL2_SITE_NAME=TUD_2
# DL2_DATADIR=/data/duke/TUD_2
# DL2_SCRATCHDIR=/scratch/duke/TUD_2
# DL2_DEPLOY_DIR=/home/swarm/MediSwarm
# DL2_GPU="device=0"

# ── Site: DL3 (Duke Benchmark — also runs server) ─────────────────
# DL3 runs the NVFlare server as well as a client, so its GPU selector
# defaults to a different device than the other sites.
# DL3_HOST=dl3.tud.de
# DL3_USER=swarm
# DL3_PASS='CHANGEME'
# DL3_SITE_NAME=TUD_3
# DL3_DATADIR=/data/duke/TUD_3
# DL3_SCRATCHDIR=/scratch/duke/TUD_3
# DL3_DEPLOY_DIR=/home/swarm/MediSwarm
# DL3_GPU="device=1"
119 changes: 119 additions & 0 deletions docs/DUKE_BENCHMARK_RESULTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Duke Breast MRI Benchmark Results

## Overview

This document records benchmark results for MediSwarm federated learning on the
[Duke Breast MRI](https://doi.org/10.7937/TCIA.e3sv-re93) dataset across the
TUD compute cluster (dl0, dl2, dl3).

The Duke dataset is a public collection of dynamic contrast-enhanced (DCE) MRI
sequences used for ternary classification of breast lesions:

| Class | Label | Description |
|-------|-------|-------------|
| 0 | Benign | Benign lesion |
| 1 | Malignant (non-PCR) | Malignant, no pathological complete response |
| 2 | Malignant (PCR) | Malignant, pathological complete response |

## Infrastructure

| Machine | Role | GPU | Dataset Partition |
|---------|------|-----|-------------------|
| dl3.tud.de | Server + Client (TUD_3) | NVIDIA A100 | ~33% of Duke |
| dl0.tud.de | Client (TUD_1) | NVIDIA A100 | ~33% of Duke |
| dl2.tud.de | Client (TUD_2) | NVIDIA A100 | ~33% of Duke |

**Provision file:** `application/provision/project_DUKE_test.yml`

## How to Reproduce

```bash
# 1. Configure deploy_sites.conf with DL0/DL2/DL3 credentials
# (see deploy_sites.conf.example for template)

# 2. Run the full benchmark pipeline
./scripts/evaluation/run_duke_benchmark.sh \
--project application/provision/project_DUKE_test.yml \
--job ODELIA_ternary_classification \
--local-epochs 10

# 3. Or run individual phases:
./scripts/evaluation/run_duke_benchmark.sh --skip-swarm # local only
./scripts/evaluation/run_duke_benchmark.sh --skip-local # swarm only
./scripts/evaluation/run_duke_benchmark.sh --collect-only # just gather results
```

## Results

> **Status:** Pending first run. Update this section after completing the benchmark.

### Swarm Training (Federated)

| Metric | Value |
|--------|-------|
| Aggregation | FedAvg (InTimeAccumulateWeightedAggregator) |
| Rounds | TBD |
| Clients | 3 (dl0, dl2, dl3) |
| Model | TBD |
| Best AUC-ROC (macro) | TBD |
| Best Accuracy | TBD |
| Training time | TBD |

### Local Training (Single-Site Benchmark)

| Model | ACC | AUC-ROC | F1 (macro) | Params | Time/epoch |
|-------|-----|---------|------------|--------|------------|
| ResNet10 | - | - | - | - | - |
| ResNet18 | - | - | - | - | - |
| ResNet34 | - | - | - | - | - |
| ResNet50 | - | - | - | - | - |
| MST | - | - | - | - | - |
| Swin3D | - | - | - | - | - |

### Swarm vs Local Comparison

| Training Mode | Best Model | AUC-ROC | ACC | Notes |
|---------------|-----------|---------|-----|-------|
| Local (single site) | - | - | - | - |
| Swarm (3 sites) | - | - | - | - |

## Analysis

> To be completed after benchmark run.

### Key Questions

1. **Does federated training improve over local?** Compare the swarm AUC-ROC
with that of the best single-site local model.
2. **Which model architecture works best?** Review `benchmark_results.json` for
the local comparison table.
3. **How does data heterogeneity affect convergence?** Check per-round metrics
across sites.
4. **Is FedProx beneficial?** If tested, compare FedAvg vs FedProx convergence
and final metrics.

## Files

Results from benchmark runs are saved under `duke_results/<timestamp>/`:

```
duke_results/
20250405_143000/
benchmark_config.json # Run configuration
swarm/
predictions/ # predict.py output
<server>/<job_id>/
app_TUD_1/
best_FL_global_model.pt
FL_global_model.pt
app_TUD_2/
...
local/
benchmark_results.json # benchmark_models.py output
benchmark_output.log
ResNet18/ # Per-model checkpoints
MST/
...
```

Note: `duke_results/` is in `.gitignore` — results are not committed to the repo.
45 changes: 45 additions & 0 deletions scripts/evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Scripts for evaluating and comparing MediSwarm model performance.
|--------|---------|
| `predict.py` | Run prediction on external test datasets using trained swarm models |
| `benchmark_models.py` | Benchmark all MediSwarm models on a consistent train/val/test split |
| `run_duke_benchmark.sh` | End-to-end Duke dataset benchmark: build, deploy, train, collect, evaluate |
| `plot_aurocs_from_classprob_csvs.py` | Compute and plot AUROCs from class probability CSV files produced during training |
| `parse_logs_and_plot.py` | Parse training logs and plot convergence curves (legacy) |

Expand Down Expand Up @@ -236,6 +237,50 @@ The script automatically verifies:

---

## `run_duke_benchmark.sh`

End-to-end benchmark pipeline for the Duke Breast MRI dataset on the TUD compute cluster (dl0, dl2, dl3). Orchestrates the full workflow: Docker build, push, deploy, swarm training, result collection, and local model benchmarking.

### Prerequisites

- `deploy_sites.conf` configured with DL0/DL2/DL3 entries (see `deploy_sites.conf.example`)
- `sshpass` and `expect` installed
- Duke dataset available on each site
- GPU available on each site

### Usage

```bash
# Full pipeline (build, deploy, train swarm, benchmark local):
./run_duke_benchmark.sh

# Swarm only (skip local benchmark):
./run_duke_benchmark.sh --skip-local

# Local benchmark only (skip swarm):
./run_duke_benchmark.sh --skip-swarm

# Collect results from a previous swarm run:
./run_duke_benchmark.sh --collect-only

# Custom models and epochs:
./run_duke_benchmark.sh --models MST ResNet18 Swin3D --local-epochs 10

# Dry run (print configuration only):
./run_duke_benchmark.sh --dry-run
```

### Output

Results are saved to `duke_results/<timestamp>/`:
- `benchmark_config.json` -- Run configuration for reproducibility
- `swarm/` -- Collected checkpoints and prediction CSVs from swarm training
- `local/` -- `benchmark_results.json` from `benchmark_models.py`

See `docs/DUKE_BENCHMARK_RESULTS.md` for the results template and analysis.

---

## `parse_logs_and_plot.py` (Legacy)

Parses training console logs to extract AUC-ROC values and plots convergence curves for swarm vs. local training across sites.
Expand Down
Loading
Loading