Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions application/provision/project_deploy_test_2site.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
# Top-level project metadata: schema version, project name and description.
api_version: 3
# The long placeholder token in the name is, per its own wording, replaced
# by the current version number when the startup kits are built.
name: odelia_deploy_test___REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS___model_test
# NOTE(review): `description` appears twice. Duplicate mapping keys are
# invalid YAML and most parsers silently keep the last one. These may be the
# before/after lines of a diff; keep exactly one in the real file.
description: ODELIA 2-client deploy test across Tailscale VPN (dl2, dl3; server on Cosmos) — dl0 excluded
description: ODELIA 2-client deploy test across Tailscale VPN (dl0, dl2; server on Cosmos)

participants:
- name: dl3.tud.de
type: server
org: TUD
fed_learn_port: 8002
admin_port: 8003
- name: MHA_1
- name: RUMC_1
type: client
org: TUD
- name: CAM_1
- name: MHA_1
type: client
org: TUD
- name: jiefu.zhu@tu-dresden.de
Expand Down
26 changes: 13 additions & 13 deletions deploy_sites_2node_test.conf
Original file line number Diff line number Diff line change
@@ -1,21 +1,31 @@
# 2-client ODELIA deploy test configuration (Tailscale VPN)
# ============================================================================
# Cosmos = server + admin ONLY (no training client — RTX 5070 not supported)
# dl0 = RUMC_1 client
# dl2 = MHA_1 client
# dl3 = CAM_1 client
#
# dl0 (RUMC_1) excluded due to persistent EXECUTION_EXCEPTION crashes.
# dl3 (CAM_1) excluded due to GPU contention with other training jobs.
# ============================================================================

# CLIENT_SITES lists only the training client entries.
# Each entry (DL0, DL2, DL3A) matches the prefix of a per-site variable
# group defined further down (e.g. DL0_HOST, DL2_HOST, DL3A_HOST).
# NOTE(review): two assignments are visible here. If both are really present,
# bash keeps only the last one, (DL0 DL2). They may instead be the
# before/after lines of a diff; confirm which one belongs in the file.
CLIENT_SITES=(DL2 DL3A)
CLIENT_SITES=(DL0 DL2)

# ── Cosmos (server + admin only) ──────────────────────────────
# Settings for the Cosmos machine: host, login user, password and deploy
# directory. The host is localhost and the password is empty.
# NOTE(review): presumably the deploy tooling runs on Cosmos itself, so no
# remote login is needed; confirm against the script that sources this file.
COSMOS_HOST=localhost
COSMOS_USER=jeff
COSMOS_PASS=''
COSMOS_DEPLOY_DIR=/home/jeff/deploy_test

# ── dl0 ────────────────────────────────────────────────
# Settings for the DL0 client entry: host address, login, site name, data,
# scratch and deploy directories, and GPU selection.
# DL0_SITE_NAME (RUMC_1) is also listed as a client participant in the
# project YAML.
# NOTE(review): DL0_PASS is a plaintext credential committed to version
# control. Prefer SSH keys or an untracked secrets file / environment
# variable, and rotate this password.
# NOTE(review): DL0_GPU looks like a Docker `--gpus` device selector; confirm
# against the script that consumes it.
DL0_HOST=100.127.161.36
DL0_USER=swarm
DL0_PASS='Ekfz2ekfz'
DL0_SITE_NAME=RUMC_1
DL0_DATADIR=/mnt/dlhd0/odelia_data
DL0_SCRATCHDIR=/mnt/dlhd0/deploy_test
DL0_DEPLOY_DIR=/home/swarm/deploy_test
DL0_GPU="device=0"

# ── dl2 ────────────────────────────────────────────────
DL2_HOST=100.64.251.72
DL2_USER=swarm
Expand All @@ -26,16 +36,6 @@ DL2_SCRATCHDIR=/mnt/sda1/deploy_test
DL2_DEPLOY_DIR=/home/swarm/deploy_test
DL2_GPU="device=0"

# ── dl3: CAM_1 ────────────────────────────────────────
# Settings for the DL3A client entry (site CAM_1): host address, login,
# data, scratch and deploy directories, and GPU selection.
# NOTE(review): DL3A_PASS is a plaintext credential committed to version
# control. Prefer SSH keys or an untracked secrets file / environment
# variable, and rotate this password.
# NOTE(review): "Odelia_challange" in DL3A_DATADIR is misspelled, but it must
# match the directory name on disk; do not correct it here without renaming
# the directory as well.
DL3A_HOST=100.126.224.113
DL3A_USER=swarm
DL3A_PASS='Ekfz2ekfz'
DL3A_SITE_NAME=CAM_1
DL3A_DATADIR=/mnt/swarm_alpha/Odelia_challange/ODELIA_Challenge_unilateral
DL3A_SCRATCHDIR=/mnt/scratch/deploy_test
DL3A_DEPLOY_DIR=/home/swarm/deploy_test
DL3A_GPU="device=0"

# ── Defaults ───────────────────────────────────────────
# PROJECT_FILE is the repository-relative path of the 2-site provisioning
# project YAML; DEFAULT_JOB names a job.
# NOTE(review): presumably both are fallbacks used when the caller does not
# specify a project file or job; confirm against the consuming script.
PROJECT_FILE=application/provision/project_deploy_test_2site.yml
DEFAULT_JOB=ODELIA_ternary_classification
Expand Down
2 changes: 1 addition & 1 deletion odelia_image.version
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Version of the ODELIA Docker image; read by several scripts.
1.4.0
1.4.1
Loading