From 60dfe7eb94a77e9c9016af95e8be63bf7d204f6b Mon Sep 17 00:00:00 2001
From: Austin Raney
Date: Tue, 6 Dec 2022 15:10:50 -0500
Subject: [PATCH 1/3] add ngen-calibration dockerfile and entrypoint

---
 docker/main/ngen-calibration/Dockerfile    |  38 +++++++
 docker/main/ngen-calibration/entrypoint.sh | 119 +++++++++++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 docker/main/ngen-calibration/Dockerfile
 create mode 100755 docker/main/ngen-calibration/entrypoint.sh

diff --git a/docker/main/ngen-calibration/Dockerfile b/docker/main/ngen-calibration/Dockerfile
new file mode 100644
index 000000000..e84be87f6
--- /dev/null
+++ b/docker/main/ngen-calibration/Dockerfile
@@ -0,0 +1,38 @@
+ARG DOCKER_INTERNAL_REGISTRY
+
+FROM ${DOCKER_INTERNAL_REGISTRY}/ngen:latest
+
+ARG WORKDIR=/ngen
+ARG USER=mpi
+ARG NGEN_CAL_BRANCH=master
+ARG NGEN_CAL_COMMIT
+
+ENV USER=${USER} USER_HOME=/home/${USER}
+
+WORKDIR ${WORKDIR}
+USER ${USER}
+
+# try NGEN_CAL_COMMIT, if not set or empty, use NGEN_CAL_BRANCH
+RUN pip install "git+https://github.com/noaa-owp/ngen-cal@${NGEN_CAL_COMMIT:-${NGEN_CAL_BRANCH}}#egg=ngen_cal&subdirectory=python/ngen_cal"
+
+COPY --chown=${USER} entrypoint.sh ${WORKDIR}
+
+# Change permissions for entrypoint and make sure dataset volume mount parent directories exist
+RUN chmod +x ${WORKDIR}/entrypoint.sh \
+    && for d in ${DATASET_DIRECTORIES}; do mkdir -p /dmod/datasets/${d}; done \
+    && for d in noah-owp-modular topmodel cfe sloth 'evapotranspiration/evapotranspiration'; do \
+        if [ -d ${WORKDIR}/ngen/extern/${d}/cmake_build ]; then \
+            cp -a ${WORKDIR}/ngen/extern/${d}/cmake_build/*.so* /dmod/shared_libs/.; \
+        fi; \
+    done \
+    && ( cp -a ${WORKDIR}/ngen/cmake_build_parallel/ngen /dmod/bin/ngen-parallel || true ) \
+    && ( cp -a ${WORKDIR}/ngen/cmake_build_serial/ngen /dmod/bin/ngen-serial || true ) \
+    && ( cp -a ${WORKDIR}/ngen/cmake_build/partitionGenerator /dmod/bin/partitionGenerator || true ) \
+    && pushd /dmod/bin \
+    # NOTE use of `ln -sf`. \
+    && ( ( stat ngen-parallel && ln -sf ngen-parallel ngen ) || ( stat ngen-serial && ln -sf ngen-serial ngen ) ) \
+    && popd
+
+ENV PATH=${WORKDIR}:$PATH
+ENTRYPOINT ["entrypoint.sh"]
+CMD [""]
diff --git a/docker/main/ngen-calibration/entrypoint.sh b/docker/main/ngen-calibration/entrypoint.sh
new file mode 100755
index 000000000..c4d054610
--- /dev/null
+++ b/docker/main/ngen-calibration/entrypoint.sh
@@ -0,0 +1,119 @@
+#!/bin/sh
+# Managed by the _generate_docker_cmd_args function in scheduler.py of dmod.scheduler
+#
+# $1 will have the number of nodes associated with this run
+# $2 will have comma-delimited host strings in MPI form; e.g., hostname:N,hostname:M
+# $3 will have the unique job id
+# $4 is the worker index
+# $5 will be the name of the output dataset (which will imply a directory location)
+# $6 will be the name of the hydrofabric dataset (which will imply a directory location)
+# $7 will be the name of the realization configuration dataset (which will imply a directory location)
+# $8 will be the name of the BMI configuration dataset (which will imply a directory location)
+# $9 will be the name of the partition configuration dataset (which will imply a directory location)
+# TODO: wire up $10 (required below, but the calibration yaml is currently read from the $7 dataset)
+# $10 will be the name of the calibration configuration dataset (which will imply a directory location)
+
+# Not yet supported
+# no-op
+MPI_NODE_COUNT="${1:?No MPI node count given}"
+# no-op
+MPI_HOST_STRING="${2:?No MPI host string given}"
+# no-op
+PARTITION_DATASET_NAME="${9:?}"
+
+JOB_ID=${3:?No Job id given}
+WORKER_INDEX=${4:?No worker index given}
+
+OUTPUT_DATASET_NAME="${5:?}"
+HYDROFABRIC_DATASET_NAME="${6:?}"
+REALIZATION_CONFIG_DATASET_NAME="${7:?}"
+BMI_CONFIG_DATASET_NAME="${8:?}"
+CALIBRATION_CONFIG_DATASET_NAME="${10:?}"
+
+ACCESS_KEY_SECRET="object_store_exec_user_name"
+SECRET_KEY_SECRET="object_store_exec_user_passwd"
+DOCKER_SECRETS_DIR="/run/secrets"
+ACCESS_KEY_FILE="${DOCKER_SECRETS_DIR}/${ACCESS_KEY_SECRET}"
+SECRET_KEY_FILE="${DOCKER_SECRETS_DIR}/${SECRET_KEY_SECRET}"
+
+NGEN_EXECUTABLE="/ngen/ngen/cmake_build/ngen"
+
+ALL_DATASET_DIR="/dmod/datasets"
+OUTPUT_DATASET_DIR="${ALL_DATASET_DIR}/output/${OUTPUT_DATASET_NAME}"
+HYDROFABRIC_DATASET_DIR="${ALL_DATASET_DIR}/hydrofabric/${HYDROFABRIC_DATASET_NAME}"
+REALIZATION_CONFIG_DATASET_DIR="${ALL_DATASET_DIR}/config/${REALIZATION_CONFIG_DATASET_NAME}"
+BMI_CONFIG_DATASET_DIR="${ALL_DATASET_DIR}/config/${BMI_CONFIG_DATASET_NAME}"
+PARTITION_DATASET_DIR="${ALL_DATASET_DIR}/config/${PARTITION_DATASET_NAME}"
+CALIBRATION_CONFIG_DATASET_DIR="${ALL_DATASET_DIR}/config/${CALIBRATION_CONFIG_DATASET_NAME}"
+
+print_date() {
+    date "+%Y-%m-%d,%H:%M:%S"
+}
+
+check_for_dataset_dir() {
+    # Dataset dir is $1; exits 1 with a message on stderr when it is not an existing directory
+    _CATEG="$(echo "${1}" | sed "s|${ALL_DATASET_DIR}/\([^/]*\)/.*|\1|" | awk '{print toupper($0)}')"
+    if [ ! -d "${1}" ]; then
+        echo "Error: expected ${_CATEG} dataset directory ${1} not found." >&2
+        exit 1
+    fi
+}
+
+load_object_store_keys_from_docker_secrets() {
+    # Read Docker Secrets files for Object Store access, if they exist; succeeds only when both keys end up non-empty
+    if [ -z "${ACCESS_KEY_FILE:-}" ]; then
+        echo "WARN: Cannot load object store access key when Docker secret file name not set"
+    elif [ -e "${ACCESS_KEY_FILE}" ]; then
+        ACCESS_KEY="$(cat "${ACCESS_KEY_FILE}")"
+    else
+        echo "WARN: Cannot load object store access key when Docker secret file does not exist"
+    fi
+
+    if [ -z "${SECRET_KEY_FILE:-}" ]; then
+        echo "WARN: Cannot load object store secret key when Docker secret file name not set"
+    elif [ -e "${SECRET_KEY_FILE}" ]; then
+        SECRET_KEY="$(cat "${SECRET_KEY_FILE}")"
+    else
+        echo "WARN: Cannot load object store secret key when Docker secret file does not exist"
+    fi
+
+    test -n "${ACCESS_KEY:-}" && test -n "${SECRET_KEY:-}"
+}
+
+start_calibration() {
+    # Run ngen.cal on the calibration yaml and return its exit status; exits 1 if no yaml is found
+    echo "$(print_date) Starting serial ngen calibration"
+    # TODO: look in CALIBRATION_CONFIG_DATASET_DIR instead once that dataset is wired up
+    # NOTE: assumes that calibration dataset will be in realization config dataset AND that it is
+    #       the only yaml file at the top level of that dataset.
+    # find already prints the directory-prefixed path; -maxdepth goes before -name to avoid a GNU find warning
+    CALIBRATION_CONFIG_FILE="$(find "${REALIZATION_CONFIG_DATASET_DIR}" -maxdepth 1 -name "*.yaml" | head -1)"
+
+    # find prints nothing when no yaml matches, so an empty value means "not found"
+    if [ -z "${CALIBRATION_CONFIG_FILE}" ]; then
+        echo "Error: NGEN calibration yaml file not found" >&2
+        exit 1
+    fi
+    python3 -m ngen.cal "${CALIBRATION_CONFIG_FILE}"
+
+    # Capture the return value to use as service exit code
+    NGEN_RETURN=$?
+
+    echo "$(print_date) ngen calibration finished with return value: ${NGEN_RETURN}"
+
+    # Exit with the model's exit code
+    return "${NGEN_RETURN}"
+}
+
+# Sanity check that the output, hydrofabric, and config datasets are available (i.e., their directories are in place)
+check_for_dataset_dir "${REALIZATION_CONFIG_DATASET_DIR}"
+check_for_dataset_dir "${BMI_CONFIG_DATASET_DIR}"
+check_for_dataset_dir "${PARTITION_DATASET_DIR}"
+check_for_dataset_dir "${HYDROFABRIC_DATASET_DIR}"
+check_for_dataset_dir "${OUTPUT_DATASET_DIR}"
+# check_for_dataset_dir "${CALIBRATION_CONFIG_DATASET_DIR}"
+
+# Move to the output dataset mounted directory; abort rather than run calibration from the wrong directory
+cd "${OUTPUT_DATASET_DIR}" || exit 1
+
+start_calibration

From 229a37fdd9da6d6a51fb7baed398279340388a22 Mon Sep 17 00:00:00 2001
From: Robert Bartel
Date: Wed, 15 Mar 2023 11:36:34 -0500
Subject: [PATCH 2/3] Add Docker Swarm build config for calibration.

Adding "service" config to Docker build-time config file for building the
ngen-calibration worker image.
---
 docker/main/docker-build.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/docker/main/docker-build.yml b/docker/main/docker-build.yml
index 78f69c4b9..2cdc998ff 100644
--- a/docker/main/docker-build.yml
+++ b/docker/main/docker-build.yml
@@ -94,6 +94,23 @@ services:
     depends_on:
       - ngen-deps
 
+  ngen-calibration:
+    image: ${DOCKER_INTERNAL_REGISTRY:?}/ngen-calibration:${NGEN_CAL_IMAGE_TAG:-latest}
+    build:
+      context: ./ngen-calibration
+      args:
+        BASE_NGEN_IMAGE_TAG: ${NGEN_CAL_BASE_IMAGE_TAG:-latest}
+        #REPO_URL: ${NGEN_REPO_URL?No NGen repo url configured}
+        NGEN_CAL_BRANCH: ${NGEN_CAL_BRANCH:-master}
+        NGEN_CAL_COMMIT: ${NGEN_CAL_COMMIT}
+        #TROUTE_REPO_URL: ${TROUTE_REPO_URL?No t-route repo url configured}
+        #TROUTE_BRANCH: ${TROUTE_BRANCH?No t-route branch configured}
+        #TROUTE_COMMIT: ${TROUTE_COMMIT}
+        #BUILD_PARALLEL_JOBS: ${NGEN_BUILD_PARALLEL_JOBS:-2}
+        DOCKER_INTERNAL_REGISTRY: ${DOCKER_INTERNAL_REGISTRY:?}
+    depends_on:
+      - ngen
+
   request-service:
     image: ${DOCKER_INTERNAL_REGISTRY:?}/nwm-request-service
     build:

From 0bf86e6c5f8ab40a9a10d96d42b0254bd4f7bbd2 Mon Sep 17 00:00:00 2001
From: Robert Bartel
Date: Wed, 15 Mar 2023 11:38:34 -0500
Subject: [PATCH 3/3] Update calibration image for base tag configure.

Updating Dockerfile for ngen-calibration to support specifying a tag for the
base ngen image to be used, if something other than "latest" is required.
---
 docker/main/ngen-calibration/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/main/ngen-calibration/Dockerfile b/docker/main/ngen-calibration/Dockerfile
index e84be87f6..dc900af49 100644
--- a/docker/main/ngen-calibration/Dockerfile
+++ b/docker/main/ngen-calibration/Dockerfile
@@ -1,6 +1,7 @@
 ARG DOCKER_INTERNAL_REGISTRY
+ARG BASE_NGEN_IMAGE_TAG
 
-FROM ${DOCKER_INTERNAL_REGISTRY}/ngen:latest
+FROM ${DOCKER_INTERNAL_REGISTRY}/ngen:${BASE_NGEN_IMAGE_TAG:-latest}
 
 ARG WORKDIR=/ngen
 ARG USER=mpi