diff --git a/application/provision/project_deploy_test_2site.yml b/application/provision/project_deploy_test_2site.yml index 138c4f53..cbdd84ec 100644 --- a/application/provision/project_deploy_test_2site.yml +++ b/application/provision/project_deploy_test_2site.yml @@ -1,6 +1,6 @@ api_version: 3 name: odelia_deploy_test___REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS___model_test -description: ODELIA 2-client deploy test across Tailscale VPN (dl2, dl3; server on Cosmos) — dl0 excluded +description: ODELIA 2-client deploy test across Tailscale VPN (dl0, dl2; server on Cosmos) participants: - name: dl3.tud.de @@ -8,10 +8,10 @@ participants: org: TUD fed_learn_port: 8002 admin_port: 8003 - - name: MHA_1 + - name: RUMC_1 type: client org: TUD - - name: CAM_1 + - name: MHA_1 type: client org: TUD - name: jiefu.zhu@tu-dresden.de diff --git a/deploy_sites_2node_test.conf b/deploy_sites_2node_test.conf index 002ce9ea..0c7552c3 100644 --- a/deploy_sites_2node_test.conf +++ b/deploy_sites_2node_test.conf @@ -1,14 +1,14 @@ # 2-client ODELIA deploy test configuration (Tailscale VPN) # ============================================================================ # Cosmos = server + admin ONLY (no training client — RTX 5070 not supported) +# dl0 = RUMC_1 client # dl2 = MHA_1 client -# dl3 = CAM_1 client # -# dl0 (RUMC_1) excluded due to persistent EXECUTION_EXCEPTION crashes. +# dl3 excluded due to GPU contention with other training jobs. # ============================================================================ # CLIENT_SITES lists only the training client entries. -CLIENT_SITES=(DL2 DL3A) +CLIENT_SITES=(DL0 DL2) # ── Cosmos (server + admin only) ────────────────────────────── COSMOS_HOST=localhost @@ -16,6 +16,16 @@ COSMOS_USER=jeff COSMOS_PASS='' COSMOS_DEPLOY_DIR=/home/jeff/deploy_test +# ── dl0 ──────────────────────────────────────────────── +DL0_HOST=100.127.161.36 +DL0_USER=swarm +DL0_PASS='Ekfz2ekfz' +DL0_SITE_NAME=RUMC_1 +DL0_DATADIR=/mnt/dlhd0/odelia_data +DL0_SCRATCHDIR=/mnt/dlhd0/deploy_test +DL0_DEPLOY_DIR=/home/swarm/deploy_test +DL0_GPU="device=0" + # ── dl2 ──────────────────────────────────────────────── DL2_HOST=100.64.251.72 DL2_USER=swarm @@ -26,16 +36,6 @@ DL2_SCRATCHDIR=/mnt/sda1/deploy_test DL2_DEPLOY_DIR=/home/swarm/deploy_test DL2_GPU="device=0" -# ── dl3: CAM_1 ──────────────────────────────────────── -DL3A_HOST=100.126.224.113 -DL3A_USER=swarm -DL3A_PASS='Ekfz2ekfz' -DL3A_SITE_NAME=CAM_1 -DL3A_DATADIR=/mnt/swarm_alpha/Odelia_challange/ODELIA_Challenge_unilateral -DL3A_SCRATCHDIR=/mnt/scratch/deploy_test -DL3A_DEPLOY_DIR=/home/swarm/deploy_test -DL3A_GPU="device=0" - # ── Defaults ─────────────────────────────────────────── PROJECT_FILE=application/provision/project_deploy_test_2site.yml DEFAULT_JOB=ODELIA_ternary_classification diff --git a/odelia_image.version b/odelia_image.version index d58dc2be..31b1aabd 100644 --- a/odelia_image.version +++ b/odelia_image.version @@ -1,2 +1,2 @@ # version of the ODELIA Docker image, read by different scripts -1.4.0 +1.4.1