Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docker/main/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,23 @@ services:
depends_on:
- ngen-deps

ngen-calibration:
image: ${DOCKER_INTERNAL_REGISTRY:?}/ngen-calibration:${NGEN_CAL_IMAGE_TAG:-latest}
build:
context: ./ngen-calibration
args:
BASE_NGEN_IMAGE_TAG: ${NGEN_CAL_BASE_IMAGE_TAG:-latest}
#REPO_URL: ${NGEN_REPO_URL?No NGen repo url configured}
NGEN_CAL_BRANCH: ${NGEN_CAL_BRANCH:-master}
NGEN_CAL_COMMIT: ${NGEN_CAL_COMMIT}
#TROUTE_REPO_URL: ${TROUTE_REPO_URL?No t-route repo url configured}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since these are presumably coming from the ngen image, I think we can just remove them for now? In the future we will need to revisit this when we support parallel calibration / calibration with routing. We will almost certainly want some kind of build matrix for ngen images and calibration images to cover all the realistic ngen build configurations.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will almost certainly want some kind of build matrix for ngen images and calibration images to cover all the realistic ngen build configurations.

Indeed, we are going to need to expand our image variant support. I expect it is reasonably likely that we will change things in the future in a way that will need these to still be here.

#TROUTE_BRANCH: ${TROUTE_BRANCH?No t-route branch configured}
#TROUTE_COMMIT: ${TROUTE_COMMIT}
#BUILD_PARALLEL_JOBS: ${NGEN_BUILD_PARALLEL_JOBS:-2}
DOCKER_INTERNAL_REGISTRY: ${DOCKER_INTERNAL_REGISTRY:?}
depends_on:
- ngen

request-service:
image: ${DOCKER_INTERNAL_REGISTRY:?}/nwm-request-service
build:
Expand Down
39 changes: 39 additions & 0 deletions docker/main/ngen-calibration/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Build arguments declared ahead of FROM so they can be used to select the base image.
# DOCKER_INTERNAL_REGISTRY has no default here; BASE_NGEN_IMAGE_TAG falls back to "latest" in FROM.
ARG DOCKER_INTERNAL_REGISTRY
ARG BASE_NGEN_IMAGE_TAG

# The calibration image is layered on top of the ngen image from the internal registry.
FROM ${DOCKER_INTERNAL_REGISTRY}/ngen:${BASE_NGEN_IMAGE_TAG:-latest}

# Stage-local build arguments: working directory, the user to run as, and the ngen-cal
# branch/commit to install (NGEN_CAL_COMMIT has no default and may be empty).
ARG WORKDIR=/ngen
ARG USER=mpi
ARG NGEN_CAL_BRANCH=master
ARG NGEN_CAL_COMMIT

# Persist the user name and its home directory path into the image environment.
ENV USER=${USER} USER_HOME=/home/${USER}

# Everything below runs in WORKDIR as USER.
# NOTE(review): presumably this user already exists in the base ngen image - confirm.
WORKDIR ${WORKDIR}
USER ${USER}

# try NGEN_CAL_COMMIT, if not set or empty, use NGEN_CAL_BRANCH
# The package is installed from the python/ngen_cal subdirectory of the ngen-cal git repository.
RUN pip install "git+https://github.com/noaa-owp/ngen-cal@${NGEN_CAL_COMMIT:-${NGEN_CAL_BRANCH}}#egg=ngen_cal&subdirectory=python/ngen_cal"

# Copy the entrypoint script from the build context into WORKDIR, owned by USER.
COPY --chown=${USER} entrypoint.sh ${WORKDIR}

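# Make the entrypoint executable, make sure the dataset volume mount parent directories exist,
# and stage any built BMI shared libraries and ngen executables under /dmod.
# NOTE(review): DATASET_DIRECTORIES is not declared in this Dockerfile, and pushd/popd below are
# bash builtins - presumably both are provided by the base ngen image (ENV / SHELL); confirm.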
RUN chmod +x ${WORKDIR}/entrypoint.sh \
&& for d in ${DATASET_DIRECTORIES}; do mkdir -p /dmod/datasets/${d}; done \
&& for d in noah-owp-modular topmodel cfe sloth 'evapotranspiration/evapotranspiration'; do \
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may want to make this configurable by specifying these as a build time ARG. Thoughts?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not totally sure I follow, but I think it's something that can wait until the inevitable future revisions to this.

if [ -d ${WORKDIR}/ngen/extern/${d}/cmake_build ]; then \
cp -a ${WORKDIR}/ngen/extern/${d}/cmake_build/*.so* /dmod/shared_libs/.; \
fi; \
done \
&& ( cp -a ${WORKDIR}/ngen/cmake_build_parallel/ngen /dmod/bin/ngen-parallel || true ) \
&& ( cp -a ${WORKDIR}/ngen/cmake_build_serial/ngen /dmod/bin/ngen-serial || true ) \
&& ( cp -a ${WORKDIR}/ngen/cmake_build/partitionGenerator /dmod/bin/partitionGenerator || true ) \
&& pushd /dmod/bin \
# NOTE use of `ln -sf`. \
&& ( ( stat ngen-parallel && ln -sf ngen-parallel ngen ) || ( stat ngen-serial && ln -sf ngen-serial ngen ) ) \
&& popd

# Put WORKDIR first on PATH so the entrypoint.sh copied there can be invoked by bare name.
ENV PATH=${WORKDIR}:$PATH
ENTRYPOINT ["entrypoint.sh"]
# Default command is a single empty-string argument. entrypoint.sh rejects an empty $1
# ("No MPI node count given"), so real arguments must be supplied when the container is run.
CMD [""]
119 changes: 119 additions & 0 deletions docker/main/ngen-calibration/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/bin/sh
# Managed by the _generate_docker_cmd_args function in scheduler.py of dmod.scheduler
#
# $1 will have the number of nodes associated with this run
# $2 will have comma-delimited host strings in MPI form; e.g., hostname:N,hostname:M
# $3 will have the unique job id
# $4 is the worker index
# $5 will be the name of the output dataset (which will imply a directory location)
# $6 will be the name of the hydrofabric dataset (which will imply a directory location)
# $7 will be the name of the realization configuration dataset (which will imply a directory location)
# $8 will be the name of the BMI configuration dataset (which will imply a directory location)
# $9 will be the name of the partition configuration dataset (which will imply a directory location)
# TODO: wire up $10
# $10 will be the name of the calibration configuration dataset (which will imply a directory location)

# Positional-argument handling. Every "${N:?...}" expansion aborts the script with an error
# when the corresponding argument is unset or empty, so all ten arguments are mandatory.

# Not yet supported
# no-op (validated only; not referenced again in the visible part of this script)
MPI_NODE_COUNT="${1:?No MPI node count given}"
# no-op (validated only; not referenced again in the visible part of this script)
MPI_HOST_STRING="${2:?No MPI host string given}"
# no-op for calibration itself, but the derived PARTITION_DATASET_DIR is still checked for existence below
PARTITION_DATASET_NAME="${9:?}"

# Job and worker identifiers (required, though not otherwise used in the visible part of this script)
JOB_ID=${3:?No Job id given}
WORKER_INDEX=${4:?No worker index given}

# Dataset names; each one implies a directory beneath ALL_DATASET_DIR (see the *_DIR variables below)
OUTPUT_DATASET_NAME="${5:?}"
HYDROFABRIC_DATASET_NAME="${6:?}"
REALIZATION_CONFIG_DATASET_NAME="${7:?}"
BMI_CONFIG_DATASET_NAME="${8:?}"
# Braces are required for positional parameters above 9
CALIBRATION_CONFIG_DATASET_NAME="${10:?}"

# Names and locations of the Docker secret files holding the object store credentials
ACCESS_KEY_SECRET="object_store_exec_user_name"
SECRET_KEY_SECRET="object_store_exec_user_passwd"
DOCKER_SECRETS_DIR="/run/secrets"
ACCESS_KEY_FILE="${DOCKER_SECRETS_DIR}/${ACCESS_KEY_SECRET}"
SECRET_KEY_FILE="${DOCKER_SECRETS_DIR}/${SECRET_KEY_SECRET}"

# NOTE(review): not referenced in the visible part of this script - confirm whether ngen.cal
# is expected to locate the executable via its own configuration instead.
NGEN_EXECUTABLE="/ngen/ngen/cmake_build/ngen"

# Dataset directories, laid out as <ALL_DATASET_DIR>/<category>/<dataset name>.
# The realization, BMI, partition, and calibration config datasets all share the "config" category.
ALL_DATASET_DIR="/dmod/datasets"
OUTPUT_DATASET_DIR="${ALL_DATASET_DIR}/output/${OUTPUT_DATASET_NAME}"
HYDROFABRIC_DATASET_DIR="${ALL_DATASET_DIR}/hydrofabric/${HYDROFABRIC_DATASET_NAME}"
REALIZATION_CONFIG_DATASET_DIR="${ALL_DATASET_DIR}/config/${REALIZATION_CONFIG_DATASET_NAME}"
BMI_CONFIG_DATASET_DIR="${ALL_DATASET_DIR}/config/${BMI_CONFIG_DATASET_NAME}"
PARTITION_DATASET_DIR="${ALL_DATASET_DIR}/config/${PARTITION_DATASET_NAME}"
CALIBRATION_CONFIG_DATASET_DIR="${ALL_DATASET_DIR}/config/${CALIBRATION_CONFIG_DATASET_NAME}"

print_date() {
  # Write the current local time to stdout as "YYYY-MM-DD,HH:MM:SS"
  # (date and time separated by a comma), for prefixing log lines.
  date '+%Y-%m-%d,%H:%M:%S'
}

check_for_dataset_dir() {
  # Verify that a dataset directory exists; abort the script if it does not.
  #
  # Globals:   ALL_DATASET_DIR (read) - root directory under which datasets are mounted
  # Arguments: $1 - dataset directory, expected as <ALL_DATASET_DIR>/<category>/<name>
  # Outputs:   an error message on stderr when the directory is missing
  # Returns:   0 when the directory exists; otherwise exits the shell with status 1
  if [ ! -d "${1}" ]; then
    # Derive the upper-cased category (the path component right after ALL_DATASET_DIR) for the
    # message. This is only needed on the failure path, so it is computed here.
    _CATEG="$(echo "${1}" | sed "s|${ALL_DATASET_DIR}/\([^/]*\)/.*|\1|" | awk '{print toupper($0)}')"
    # Diagnostics belong on stderr (the previous "2>&1" left the message on stdout).
    echo "Error: expected ${_CATEG} dataset directory ${1} not found." >&2
    exit 1
  fi
}

load_object_store_keys_from_docker_secrets() {
  # Populate ACCESS_KEY and SECRET_KEY from the Docker secret files, when those files exist.
  #
  # Globals:   ACCESS_KEY_FILE, SECRET_KEY_FILE (read); ACCESS_KEY, SECRET_KEY (written)
  # Outputs:   a WARN line on stdout for each key that could not be loaded
  # Returns:   0 only when both ACCESS_KEY and SECRET_KEY end up non-empty

  # Access key
  if [ -n "${ACCESS_KEY_FILE:-}" ]; then
    if [ -e "${ACCESS_KEY_FILE}" ]; then
      ACCESS_KEY="$(cat "${ACCESS_KEY_FILE}")"
    else
      echo "WARN: Cannot load object store access key when Docker secret file does not exist"
    fi
  else
    echo "WARN: Cannot load object store access key when Docker secret file name not set"
  fi

  # Secret key
  if [ -n "${SECRET_KEY_FILE:-}" ]; then
    if [ -e "${SECRET_KEY_FILE}" ]; then
      SECRET_KEY="$(cat "${SECRET_KEY_FILE}")"
    else
      echo "WARN: Cannot load object store secret key when Docker secret file does not exist"
    fi
  else
    echo "WARN: Cannot load object store secret key when Docker secret file name not set"
  fi

  # Report success only if both credentials are available
  if [ -z "${ACCESS_KEY:-}" ]; then
    return 1
  fi
  [ -n "${SECRET_KEY:-}" ]
}

start_calibration() {
  # Locate the calibration YAML file and run ngen calibration on it.
  #
  # Globals:   REALIZATION_CONFIG_DATASET_DIR (read);
  #            CALIBRATION_CONFIG_FILE, NGEN_RETURN (written)
  # Outputs:   timestamped start/finish lines on stdout; an error on stderr if no YAML is found
  # Returns:   the exit status of "python3 -m ngen.cal"; exits the shell with status 1 when
  #            no calibration YAML file can be found
  echo "$(print_date) Starting serial ngen calibration"

  # TODO: move this to CALIBRATION_CONFIG_DATASET_DIR
  # NOTE: assumes that calibration dataset will be in realization config dataset AND that it is
  #       the only yaml file at the top level of that dataset.
  # The depth options come before -name (GNU find warns otherwise), and -mindepth 1 keeps a
  # dataset directory that is itself named "*.yaml" from matching. The result is left empty
  # when nothing matches, so the check below can actually fire; previously the directory
  # prefix was always prepended, which made the emptiness test dead code.
  CALIBRATION_CONFIG_FILE="$(find "${REALIZATION_CONFIG_DATASET_DIR}" -mindepth 1 -maxdepth 1 -name "*.yaml" | head -1)"

  if [ -z "${CALIBRATION_CONFIG_FILE}" ]; then
    echo "Error: NGEN calibration yaml file not found" >&2
    exit 1
  fi
  python3 -m ngen.cal "${CALIBRATION_CONFIG_FILE}"

  # Capture the return value to use as service exit code
  NGEN_RETURN=$?

  echo "$(print_date) ngen calibration finished with return value: ${NGEN_RETURN}"

  # Hand the model's exit code back to the caller
  return "${NGEN_RETURN}"
}

# Sanity check that the output, hydrofabric, and config datasets are available (i.e., their directories are in place).
# Each check exits the script with status 1 if its directory is missing.
check_for_dataset_dir "${REALIZATION_CONFIG_DATASET_DIR}"
check_for_dataset_dir "${BMI_CONFIG_DATASET_DIR}"
check_for_dataset_dir "${PARTITION_DATASET_DIR}"
check_for_dataset_dir "${HYDROFABRIC_DATASET_DIR}"
check_for_dataset_dir "${OUTPUT_DATASET_DIR}"
# check_for_dataset_dir "${CALIBRATION_CONFIG_DATASET_DIR}"

# Move to the output dataset mounted directory. The path is quoted so names containing
# whitespace or glob characters work, and a failed cd aborts instead of letting calibration
# run (and write its output) in the wrong directory.
cd "${OUTPUT_DATASET_DIR}" || {
  echo "Error: could not change to output dataset directory ${OUTPUT_DATASET_DIR}" >&2
  exit 1
}

# Last command: its return value becomes the exit status of the script
start_calibration