diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml
index 35708c00..1fd01e77 100644
--- a/docker_config/master_template.yml
+++ b/docker_config/master_template.yml
@@ -733,6 +733,7 @@ docker_cln_sh: |
           --model_name) CLI_MODEL_NAME="$2"; shift ;;
           --num_epochs) CLI_NUM_EPOCHS="$2"; shift ;;
           --config) CLI_CONFIG="$2"; shift ;;
+          --container_name) CONTAINER_NAME="$2"; shift ;;
           *) echo "Unknown parameter passed: $1"; exit 1 ;;
       esac
       shift
@@ -791,7 +792,10 @@ docker_cln_sh: |
       docker pull "$DOCKER_IMAGE"
   fi
 
-  CONTAINER_NAME=odelia_swarm_client_{~~client_name~~}___REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__
+  if [[ -z "$CONTAINER_NAME" ]]; then
+      CONTAINER_NAME=odelia_swarm_client_{~~client_name~~}___REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__
+  fi
+
   DOCKER_OPTIONS_A="--name=$CONTAINER_NAME --gpus=$GPU2USE -u $(id -u):$(id -g)"
   DOCKER_MOUNTS="-v /etc/passwd:/etc/passwd -v /etc/group:/etc/group -v $DIR/..:/startupkit/ -v $MY_SCRATCH_DIR:/scratch/"
   if [ -n "$MY_DATA_DIR" ]; then
@@ -929,6 +933,7 @@ docker_cln_sh: |
       echo " when running with sudo, as sudo resets environment variables"
       echo "--num_epochs set number of training epochs (default: 100)"
       echo "--config set config name (default: unilateral)"
+      echo "--container_name set name for the Docker container to override default using version number"
       exit 1
   fi
 
diff --git a/scripts/ci/runIntegrationTests.sh b/scripts/ci/runIntegrationTests.sh
index b445b39c..73710df5 100755
--- a/scripts/ci/runIntegrationTests.sh
+++ b/scripts/ci/runIntegrationTests.sh
@@ -297,6 +297,40 @@ run_docker_gpu_preflight_check () {
 }
 
 
+run_two_containers_in_parallel () {
+    # requires having built a startup kit
+    # Starts two dummy-training containers from the same startup kit side by side. The second one
+    # is given an explicit --container_name so that it does not reuse the default container name.
+    echo "[Run] Starting two containers in parallel (local dummy training via startup kit) ..."
+    cd "$PROJECT_DIR/prod_00/client_A/startup/" || exit 1
+    CONSOLE_OUTPUT=docker_gpu_preflight_check_console_output.txt
+    timeout --signal=kill 1m ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" &
+    local first_pipeline_pid=$!
+    sleep 1
+
+    CONSOLE_OUTPUT_A=docker_gpu_preflight_check_console_output_a.txt
+    timeout --signal=kill 1m ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --dummy_training --no_pull --container_name MediSwarmODELIATestSecondContainer 2>&1 | tee "$CONSOLE_OUTPUT_A" &
+    local second_pipeline_pid=$!
+
+    # Wait for both pipelines to end (each is bounded by its 1m timeout) rather than sleeping a
+    # fixed 60 s, which expired at the same moment as the second container's timeout. The exit
+    # status is ignored on purpose: success is judged from the console output below.
+    wait "$first_pipeline_pid" "$second_pipeline_pid" || true
+
+    # Both containers must have trained, otherwise nothing ran in parallel.
+    local console_output
+    for console_output in "$CONSOLE_OUTPUT" "$CONSOLE_OUTPUT_A"; do
+        if ! grep -q "Epoch 1: 100%" "$console_output" || ! grep -q "Training completed successfully" "$console_output"; then
+            echo "❌ Missing expected output of running two containers in parallel"
+            exit 1
+        fi
+    done
+    echo "✅ Expected output of running two containers in parallel found"
+
+    cd "$CWD"
+}
+
+
 run_data_access_preflight_check () {
     # requires having built a startup kit and synthetic dataset
     echo "[Run] Data access preflight check..."
@@ -802,6 +836,13 @@ case "$1" in
         cleanup_temporary_data
         ;;
 
+    run_two_containers_in_parallel)
+        create_startup_kits_and_check_contained_files
+        run_two_containers_in_parallel
+        cleanup_temporary_data
+        ;;
+
+
     run_data_access_preflight_check)
         create_startup_kits_and_check_contained_files
         create_synthetic_data