From e0f34e3fd0f792d73688b10205e13d0b3155d6ba Mon Sep 17 00:00:00 2001 From: Jeff Date: Sun, 5 Apr 2026 16:42:49 +0200 Subject: [PATCH 1/5] Fix NVFlare job signature verification by resolving symlinks in Docker image Docker COPY preserves symlinks, but NVFlare's os.walk()-based job signing and zip utilities do not follow symlinks. This caused job submission to fail with "job signature verification failed" because the custom/ symlink directories were empty in the signed zip. Resolve all symlinks to actual file/directory copies after COPY so os.walk() can traverse them. Co-Authored-By: Claude Opus 4.6 --- docker_config/Dockerfile_ODELIA | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 26cef726..24d753d1 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -277,6 +277,14 @@ COPY ./torch_home_cache /torch_home # Copy the source code for local training and deploying to the swarm COPY ./MediSwarm /MediSwarm +# Replace symlinks with actual copies so NVFlare's os.walk()-based job signing +# and zip utilities can traverse all directories (os.walk does not follow symlinks) +RUN find /MediSwarm -type l | while read link; do \ + target=$(readlink -f "$link") && \ + rm "$link" && \ + if [ -d "$target" ]; then cp -r "$target" "$link"; \ + elif [ -f "$target" ]; then cp "$target" "$link"; fi; \ + done RUN mkdir -p /fl_admin/transfer && ln -s /MediSwarm /fl_admin/transfer/MediSwarm # allow creating home directory for local user inside container if needed From 2e72ceb56f8f8d30d8d0da4a44c6dcfbb343d342 Mon Sep 17 00:00:00 2001 From: Jeff Date: Sun, 5 Apr 2026 17:19:59 +0200 Subject: [PATCH 2/5] Forward loss_kwargs to challenge model factories to fix state_dict mismatch The server-side PTFileModelPersistor creates models without class weights, so _class_weight is a plain attribute (absent from state_dict). Client-side training computes class weights from data, registering _class_weight as a buffer (present in state_dict). When the aggregated model is sent back, load_state_dict fails with "Missing key(s): _class_weight". Fix: pass loss_kwargs through models_config.py to all challenge factory functions, so both server and client models have consistent buffers. Co-Authored-By: Claude Opus 4.6 --- .../custom/models/challenge/1DivideAndConquer/model.py | 7 +++++-- .../_shared/custom/models/challenge/2bcnaim/swinunetr.py | 7 +++++-- .../jobs/_shared/custom/models/challenge/4abmil/model.py | 9 ++++++--- .../jobs/_shared/custom/models/challenge/5pimed/model.py | 7 +++++-- application/jobs/_shared/custom/models/models_config.py | 8 ++++++++ 5 files changed, 29 insertions(+), 9 deletions(-) diff --git a/application/jobs/_shared/custom/models/challenge/1DivideAndConquer/model.py b/application/jobs/_shared/custom/models/challenge/1DivideAndConquer/model.py index 1d73c33a..b51cd084 100644 --- a/application/jobs/_shared/custom/models/challenge/1DivideAndConquer/model.py +++ b/application/jobs/_shared/custom/models/challenge/1DivideAndConquer/model.py @@ -405,8 +405,11 @@ def map_key(unet_key): print("No matching encoder weights found.") return self -def create_model(num_classes: int = 3, n_input_channels = 1, spatial_dims=3, pretrained_path=None) -> BasicClassifier: - model = ResidualEncoderClsLightning(in_ch=n_input_channels, out_ch=num_classes, spatial_dims=spatial_dims) +def create_model(num_classes: int = 3, n_input_channels = 1, spatial_dims=3, pretrained_path=None, loss_kwargs=None) -> BasicClassifier: + model = ResidualEncoderClsLightning( + in_ch=n_input_channels, out_ch=num_classes, spatial_dims=spatial_dims, + loss_kwargs=loss_kwargs if loss_kwargs is not None else {}, + ) if pretrained_path: model.load_pretrained_unet_encoder(pretrained_path, verbose=True) return model diff --git a/application/jobs/_shared/custom/models/challenge/2bcnaim/swinunetr.py b/application/jobs/_shared/custom/models/challenge/2bcnaim/swinunetr.py index 9032665c..f6475975 100644 --- a/application/jobs/_shared/custom/models/challenge/2bcnaim/swinunetr.py +++ b/application/jobs/_shared/custom/models/challenge/2bcnaim/swinunetr.py @@ -1150,7 +1150,10 @@ def forward(self, x): return cls_output # seg_output, cls_output, hidden_states -def create_model(img_size: int, num_classes: int = 3, n_input_channels = 1, spatial_dims=3) -> BasicClassifier: +def create_model(img_size: int, num_classes: int = 3, n_input_channels = 1, spatial_dims=3, loss_kwargs=None) -> BasicClassifier: model = SwinUNETRMultiTask(img_size=img_size, in_channels=n_input_channels, out_seg_channels=2, out_cls_classes=num_classes, spatial_dims=spatial_dims) - wrapped_model = ModelWrapper(backbone=model, in_ch=n_input_channels, num_classes=num_classes) + wrapped_model = ModelWrapper( + backbone=model, in_ch=n_input_channels, num_classes=num_classes, + loss_kwargs=loss_kwargs if loss_kwargs is not None else {}, + ) return wrapped_model diff --git a/application/jobs/_shared/custom/models/challenge/4abmil/model.py b/application/jobs/_shared/custom/models/challenge/4abmil/model.py index a430eba5..34441445 100644 --- a/application/jobs/_shared/custom/models/challenge/4abmil/model.py +++ b/application/jobs/_shared/custom/models/challenge/4abmil/model.py @@ -215,14 +215,17 @@ def forward(self, volume: torch.Tensor): return out #A.squeeze(-1) # logits, attention weights -def create_model(model_type = "swin", n_input_channels: int = 3, num_classes: int = 3) -> BasicClassifier: - +def create_model(model_type = "swin", n_input_channels: int = 3, num_classes: int = 3, loss_kwargs=None) -> BasicClassifier: + #config = pd.read_csv(config_path, skip_blank_lines=True, na_values=['NaN']).iloc[0] if model_type =="swin_cross": model = CrossModalAttentionABMIL_Swin(num_classes=num_classes) else: model = ABMIL_Swin(num_classes=num_classes) - wrapped_model = ModelWrapper(backbone=model, in_ch=n_input_channels, num_classes=num_classes) + wrapped_model = ModelWrapper( + backbone=model, in_ch=n_input_channels, num_classes=num_classes, + loss_kwargs=loss_kwargs if loss_kwargs is not None else {}, + ) return wrapped_model diff --git a/application/jobs/_shared/custom/models/challenge/5pimed/model.py b/application/jobs/_shared/custom/models/challenge/5pimed/model.py index f286200a..ddf859ea 100644 --- a/application/jobs/_shared/custom/models/challenge/5pimed/model.py +++ b/application/jobs/_shared/custom/models/challenge/5pimed/model.py @@ -55,7 +55,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return self.net(x) -def create_model(model_name: str, num_classes: int = 3, n_input_channels = 1, spatial_dims = 3, norm: str = "batch") -> nn.Module: +def create_model(model_name: str, num_classes: int = 3, n_input_channels = 1, spatial_dims = 3, norm: str = "batch", loss_kwargs=None) -> nn.Module: model = Resnet(model_name=model_name, num_classes=num_classes, norm=norm) - wrapped_model = ModelWrapper(backbone=model, in_ch=n_input_channels, num_classes=num_classes) + wrapped_model = ModelWrapper( + backbone=model, in_ch=n_input_channels, num_classes=num_classes, + loss_kwargs=loss_kwargs if loss_kwargs is not None else {}, + ) return wrapped_model diff --git a/application/jobs/_shared/custom/models/models_config.py b/application/jobs/_shared/custom/models/models_config.py index 40a08741..504fa374 100644 --- a/application/jobs/_shared/custom/models/models_config.py +++ b/application/jobs/_shared/custom/models/models_config.py @@ -204,6 +204,14 @@ def create_model(logger=None, model_name: str = None, num_classes: int = 3, ) logger.info(f'__________ model path is : {persistor_args["pretrained_path"]}') + # Forward loss_kwargs (e.g. class weights) so the challenge model + # registers the same buffers as the server-side persistor model. + # Without this, _class_weight may appear as a buffer on one side + # but not the other, causing state_dict mismatches during + # federated aggregation. + if loss_kwargs: + persistor_args["loss_kwargs"] = loss_kwargs + factory_fn = getattr(module, func_name) logger.info(f"Now access {persistor_args} from module {module}") return factory_fn(**persistor_args) From 2a5d8ff374ba0a27e5beaef025a633654b022ecf Mon Sep 17 00:00:00 2001 From: Jeff Date: Sun, 5 Apr 2026 17:59:17 +0200 Subject: [PATCH 3/5] Fix pretrained weight path resolution and pass MODEL_NAME to clients models_config.py: Add fallback to /MediSwarm/pretrained_weights/ for challenge model checkpoints. The build script stores weights there to avoid bloating NVFlare job transfers, but the path resolution only looked inside the job folder where the weights don't exist. deploy_and_test.sh: Pass --model_name flag to docker.sh when starting clients so the correct challenge model is used instead of defaulting to MST. Add job_to_model_name() mapping function. Co-Authored-By: Claude Opus 4.6 --- .../_shared/custom/models/models_config.py | 13 +++++++- deploy_and_test.sh | 31 ++++++++++++++++--- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/application/jobs/_shared/custom/models/models_config.py b/application/jobs/_shared/custom/models/models_config.py index 504fa374..e42b0461 100644 --- a/application/jobs/_shared/custom/models/models_config.py +++ b/application/jobs/_shared/custom/models/models_config.py @@ -199,9 +199,20 @@ def create_model(logger=None, model_name: str = None, num_classes: int = 3, if "pretrained_path" in persistor_args: rel_path = persistor_args["pretrained_path"].rstrip(".") - persistor_args["pretrained_path"] = os.path.join( + # Primary: look relative to the model's challenge directory + primary_path = os.path.join( base_dir, "challenge", team_name, rel_path ) + # Fallback: Docker image stores weights at /MediSwarm/pretrained_weights/ + # to avoid bloating NVFlare job transfers (see _cacheAndCopyPretrainedModelWeights.sh) + fallback_path = os.path.join("/MediSwarm/pretrained_weights", rel_path) + if os.path.isfile(primary_path): + persistor_args["pretrained_path"] = primary_path + elif os.path.isfile(fallback_path): + persistor_args["pretrained_path"] = fallback_path + logger.info(f'Using fallback pretrained weights from Docker image: {fallback_path}') + else: + persistor_args["pretrained_path"] = primary_path # let it fail with original path logger.info(f'__________ model path is : {persistor_args["pretrained_path"]}') # Forward loss_kwargs (e.g. class weights) so the challenge model diff --git a/deploy_and_test.sh b/deploy_and_test.sh index dfb73b04..0d4a14f8 100755 --- a/deploy_and_test.sh +++ b/deploy_and_test.sh @@ -244,9 +244,16 @@ cmd_start_server() { } cmd_start_clients() { + local model_name="${1:-}" step "Starting NVFlare clients on remote sites" check_dependencies + local model_flag="" + if [[ -n "$model_name" ]]; then + model_flag="--model_name '$model_name'" + info "Using MODEL_NAME=$model_name" + fi + for site in "${SITES[@]}"; do local site_name host deploy_dir datadir scratchdir gpu site_name=$(site_var "$site" SITE_NAME) @@ -264,7 +271,7 @@ cmd_start_clients() { export SITE_NAME='$site_name' && \ export DATADIR='$datadir' && \ export SCRATCHDIR='$scratchdir' && \ - ./docker.sh --data_dir '$datadir' --scratch_dir '$scratchdir' --GPU '$gpu' --start_client" + ./docker.sh --data_dir '$datadir' --scratch_dir '$scratchdir' --GPU '$gpu' $model_flag --start_client" ok " Client started on $site_name" done @@ -272,6 +279,20 @@ cmd_start_clients() { ok "All clients started" } +# Map job directory names to the MODEL_NAME env var expected by the model factory. +job_to_model_name() { + local job="$1" + case "$job" in + challenge_1DivideAndConquer) echo "1DivideAndConquer" ;; + challenge_2BCN_AIM) echo "2BCN_AIM" ;; + challenge_3agaldran) echo "3agaldran" ;; + challenge_4abmil) echo "4LME_ABMIL" ;; + challenge_5pimed) echo "5Pimed" ;; + ODELIA_ternary_classification) echo "MST" ;; + *) echo "MST" ;; + esac +} + cmd_submit() { local job_name="${1:-$DEFAULT_JOB}" step "Submitting job: $job_name" @@ -426,15 +447,17 @@ cmd_stop() { cmd_all() { local job_name="${1:-$DEFAULT_JOB}" + local model_name + model_name=$(job_to_model_name "$job_name") step "Full deployment pipeline" - info "Job: $job_name" + info "Job: $job_name (MODEL_NAME=$model_name)" echo "" cmd_build cmd_push cmd_deploy cmd_start_server - cmd_start_clients + cmd_start_clients "$model_name" info "Waiting 15s for clients to register with server..." sleep 15 @@ -484,7 +507,7 @@ case "$COMMAND" in push) cmd_push ;; deploy) cmd_deploy ;; start-server) cmd_start_server ;; - start-clients) cmd_start_clients ;; + start-clients) cmd_start_clients "${1:-}" ;; submit) cmd_submit "${1:-}" ;; status) cmd_status ;; logs) cmd_logs "${1:-}" ;; From 28697b9027f6f6da076e5931b21f5ea1e9bd4aa7 Mon Sep 17 00:00:00 2001 From: Jeff Date: Sun, 5 Apr 2026 18:43:06 +0200 Subject: [PATCH 4/5] Fix flaky CI swarm tests and self-hosted runner reliability Replace fixed sleep durations with polling loops that check for "Server runner finished." in the server log, preventing premature assertion failures when the runner is under load. Add concurrency control so only one CI run uses the self-hosted GPU runner at a time, and prune stale Docker resources before each run to avoid "no space left on device" errors. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/pr-test.yaml | 16 +++++++++++ scripts/ci/runIntegrationTests.sh | 44 ++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 6f192a76..6d094a95 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -11,6 +11,12 @@ on: permissions: contents: read +# Only one validation run at a time on the self-hosted runner to prevent +# resource contention (GPU, Docker, disk) that causes flaky test failures. +concurrency: + group: pr-validation-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: validate-swarm: runs-on: self-hosted @@ -36,6 +42,16 @@ jobs: docker run --rm -v "$GITHUB_WORKSPACE":/ws alpine sh -c \ 'find /ws -mindepth 1 -maxdepth 1 -not -name ".git" -exec chown -R '"$(id -u):$(id -g)"' {} + 2>/dev/null || true' + - name: Reclaim disk space (prune stale Docker resources) + run: | + echo "Disk usage before cleanup:" + df -h / | tail -1 + # Remove stopped containers, dangling images, and build cache older + # than 24 h to prevent "no space left on device" failures. + docker system prune -f --filter "until=24h" || true + echo "Disk usage after cleanup:" + df -h / | tail -1 + - name: Checkout repository (with submodules) uses: actions/checkout@v4 with: diff --git a/scripts/ci/runIntegrationTests.sh b/scripts/ci/runIntegrationTests.sh index 102cbd1c..8785164f 100755 --- a/scripts/ci/runIntegrationTests.sh +++ b/scripts/ci/runIntegrationTests.sh @@ -527,15 +527,33 @@ verify_wrong_certificates_are_rejected () { run_dummy_training_in_swarm () { - echo "[Run] Dummy training in swarm (result will be checked after 2 minutes) ..." + echo "[Run] Dummy training in swarm (polling for completion, up to 5 minutes) ..." cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup expect -f "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX - sleep 120 cd "$CWD" + # Poll for completion instead of a fixed sleep. The server log will + # contain "Server runner finished." once all rounds are done. We check + # every 10 seconds for up to 5 minutes (30 iterations). + local server_log="$PROJECT_DIR/prod_00/localhost/startup/nohup.out" + local max_attempts=30 + local attempt=0 + echo " Waiting for swarm training to finish (checking every 10s, max ${max_attempts}0s) ..." + while [ $attempt -lt $max_attempts ]; do + if [ -f "$server_log" ] && grep -q 'Server runner finished\.' "$server_log" 2>/dev/null; then + echo " ✅ Server runner finished detected after $((attempt * 10))s" + break + fi + attempt=$((attempt + 1)) + sleep 10 + done + if [ $attempt -eq $max_attempts ]; then + echo " ⚠️ Timed out after ${max_attempts}0s waiting for swarm completion — proceeding to assertions" + fi + # check for expected output in server log (clients joined, job ID assigned, 5 rounds, start of round logged, finished training logged) cd "$PROJECT_DIR"/prod_00/localhost/startup CONSOLE_OUTPUT=nohup.out @@ -634,15 +652,33 @@ run_3dcnn_local_training () { run_3dcnn_training_in_swarm () { - echo "[Run] 3DCNN training in swarm (result will be checked after 60 minutes) ..." + echo "[Run] 3DCNN training in swarm (polling for completion, up to 60 minutes) ..." cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup expect -f "$CWD"/tests/integration_tests/_submit3DCNNTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX - sleep 3600 cd "$CWD" + # Poll for completion instead of a fixed sleep. The server log will + # contain "Server runner finished." once all rounds are done. We check + # every 30 seconds for up to 60 minutes (120 iterations). + local server_log="$PROJECT_DIR/prod_00/localhost/startup/nohup.out" + local max_attempts=120 + local attempt=0 + echo " Waiting for 3DCNN swarm training to finish (checking every 30s, max 60min) ..." + while [ $attempt -lt $max_attempts ]; do + if [ -f "$server_log" ] && grep -q 'Server runner finished\.' "$server_log" 2>/dev/null; then + echo " ✅ Server runner finished detected after $((attempt * 30))s" + break + fi + attempt=$((attempt + 1)) + sleep 30 + done + if [ $attempt -eq $max_attempts ]; then + echo " ⚠️ Timed out after 60min waiting for 3DCNN swarm completion — proceeding to assertions" + fi + # check for expected output in server log (clients joined, job ID assigned, 20 rounds) cd "$PROJECT_DIR"/prod_00/localhost/startup CONSOLE_OUTPUT=nohup.out From bf3ef3cfc26f924138f87b52569f66873e055767 Mon Sep 17 00:00:00 2001 From: Jeff Date: Sun, 5 Apr 2026 19:25:47 +0200 Subject: [PATCH 5/5] Add zip and unzip to STAMP Dockerfile for startup kit generation The _generateStartupKitArchives.sh script requires zip to create startup kit archives. The ODELIA Dockerfile includes zip/unzip but the STAMP Dockerfile was missing them, causing CI to fail with "zip: command not found" during the STAMP Docker build step. Co-Authored-By: Claude Opus 4.6 --- docker_config/Dockerfile_STAMP | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker_config/Dockerfile_STAMP b/docker_config/Dockerfile_STAMP index ea676aff..9cbb8a3d 100644 --- a/docker_config/Dockerfile_STAMP +++ b/docker_config/Dockerfile_STAMP @@ -36,6 +36,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git \ libgl1-mesa-glx \ libglib2.0-0 \ + zip \ + unzip \ && rm -rf /var/lib/apt/lists/* # ---------------------------------------------------------------------------