diff --git a/cron-scripts/README.md b/cron-scripts/README.md
new file mode 100644
index 0000000000..bad492a17e
--- /dev/null
+++ b/cron-scripts/README.md
@@ -0,0 +1,103 @@
+# OMEGA Cron Scripts
+
+Automated cron job scripts for continuous testing and CDash reporting of OMEGA ocean modeling projects across multiple HPC systems.
+
+## Overview
+
+This repository orchestrates the compilation, testing, and result submission to [CDash](https://my.cdash.org) for two types of OMEGA tests:
+
+- **Omega CTests**
+- **Polaris** - Omega tests on MPAS meshes
+
+## Supported Systems
+
+| Machine | Location | Compilers |
+|---------|----------|-----------|
+| Frontier | ORNL | craygnu, craycray, crayamd (with mphipcc variants) |
+| Chrysalis | ANL (LCRC) | gnu, intel |
+| pm-gpu | NERSC (Perlmutter GPU) | gnugpu |
+| pm-cpu | NERSC (Perlmutter CPU) | gnu |
+
+## Repository Structure
+
+```
+cron-scripts/
+├── launch_all.sh                  # Main entry point
+├── machines/                      # Machine-specific configurations
+│   ├── config_machine.sh          # Auto-detection dispatcher
+│   ├── config_frontier.sh
+│   ├── config_chrysalis.sh
+│   ├── config_pm-gpu.sh
+│   └── config_pm-cpu.sh
+└── tasks/                         # Scheduled job definitions
+    ├── omega_cdash/               # Omega model CDash testing
+    │   ├── launch_omega_cdash.sh
+    │   ├── run_omega_cdash.sh
+    │   └── job_*.sbatch
+    └── polaris_cdash/             # Polaris model CDash testing
+        ├── launch_polaris_ctest.sh
+        ├── polaris_cdash.py
+        └── CTestScript.txt
+```
+
+## Usage
+
+### Run on auto-detected machine
+
+```bash
+./launch_all.sh
+```
+
+### Run on a specific machine
+
+```bash
+./launch_all.sh -m frontier
+./launch_all.sh -m chrysalis
+./launch_all.sh -m pm-gpu
+./launch_all.sh -m pm-cpu
+```
+
+### Set up in crontab
+
+```bash
+# Run daily at 1 AM
+0 1 * * * /path/to/cron-scripts/launch_all.sh
+```
+
+## How It Works
+
+1. `launch_all.sh` auto-detects the machine via hostname or accepts a `-m` flag
+2. Sources the appropriate machine configuration (compilers, paths, modules)
+3. Uses file locking to prevent concurrent executions
+4. Discovers and executes all `launch*.sh` scripts in task subdirectories
+5. Each task clones/updates repos, submits SBATCH jobs, and reports to CDash
+
+## Environment Variables
+
+| Variable | Description |
+|----------|-------------|
+| `CRONJOB_BASEDIR` | Root directory for job outputs |
+| `CRONJOB_MACHINE` | Detected/specified machine name |
+| `CRONJOB_LOGDIR` | Log directory location |
+| `E3SM_COMPILERS` | Space-separated list of compilers to test |
+
+## Adding a New Machine
+
+1. Create `machines/config_<machine>.sh` with:
+   - `CRONJOB_BASEDIR` path
+   - `E3SM_COMPILERS` list
+   - Module loads and environment setup
+2. Add hostname pattern to `machines/config_machine.sh`
+3. Create machine-specific SBATCH scripts in task directories if needed
+
+## Adding a New Task
+
+1. Create a new directory under `tasks/`
+2. Add a `launch_<task>.sh` script
+3. The script will be auto-discovered and executed by `launch_all.sh`
+
+## CDash Integration
+
+Test results are submitted to:
+- E3SM project: https://my.cdash.org/submit.php?project=E3SM
+- Omega project: https://my.cdash.org/submit.php?project=omega
diff --git a/cron-scripts/launch_all.sh b/cron-scripts/launch_all.sh
new file mode 100755
index 0000000000..f4ee8f3c9c
--- /dev/null
+++ b/cron-scripts/launch_all.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# Entry point of the cron job: selects the machine configuration, takes a
+# per-user lock and runs every tasks/*/launch*.sh script in sorted order.
+set -eo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+
+# cron does not guarantee USER, which the lock file and machine configs need
+export USER="${USER:-$(id -un)}"
+
+# --- Parse command-line arguments ---
+CLI_MACHINE=""
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -m|--machine)
+            # without this check a missing value makes "shift 2" fail silently
+            if [[ $# -lt 2 || -z "$2" ]]; then
+                echo "ERROR: Option '$1' requires a machine name" >&2
+                echo "Usage: $SCRIPT_NAME [-m|--machine MACHINE_NAME]" >&2
+                exit 1
+            fi
+            CLI_MACHINE="$2"
+            shift 2
+            ;;
+        *)
+            echo "ERROR: Unknown option '$1'" >&2
+            echo "Usage: $SCRIPT_NAME [-m|--machine MACHINE_NAME]" >&2
+            exit 1
+            ;;
+    esac
+done
+
+echo "[$(date)] Starting $SCRIPT_NAME"
+
+# set CRONJOB_BASEDIR and machine-specific variables
+# pass -m through so config_machine.sh uses CLI override if provided
+if [[ -n "$CLI_MACHINE" ]]; then
+    source "${HERE}/machines/config_machine.sh" -m "$CLI_MACHINE"
+else
+    source "${HERE}/machines/config_machine.sh"
+fi
+
+export CRONJOB_LOGDIR="${CRONJOB_BASEDIR}/logs"
+mkdir -p "$CRONJOB_LOGDIR"
+
+export CRONJOB_DATE=$(date +"%d")
+export CRONJOB_TIME=$(date +"%T")
+
+LOCKFILE="/tmp/${USER}_cronjob.lock"
+exec 9>"$LOCKFILE"
+if ! flock -n 9; then
+    echo "[$(date)] launch_all.sh is already running, exiting."
+    exit 0
+fi
+
+# Run all launch*.sh scripts under immediate subdirectories of $HERE/tasks.
+# stdin is detached so a task cannot swallow the remaining script names, and a
+# failing task is reported without preventing the later tasks from running.
+FAILED=0
+while IFS= read -r script; do
+    if ! /bin/bash "$script" < /dev/null; then
+        echo "[$(date)] ERROR: $script failed" >&2
+        FAILED=1
+    fi
+done < <(
+    find "$HERE/tasks" -mindepth 2 -maxdepth 2 \
+        -type f -name 'launch*.sh' | sort
+)
+
+echo "[$(date)] Finished $SCRIPT_NAME"
+exit "$FAILED"
diff --git a/cron-scripts/machines/config_chrysalis.sh b/cron-scripts/machines/config_chrysalis.sh
new file mode 100755
index 0000000000..d342b5d18a
--- /dev/null
+++ b/cron-scripts/machines/config_chrysalis.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+source /etc/bashrc
+
+export CRONJOB_BASEDIR="/lcrc/globalscratch/${USER}/cronjobs"
+export E3SM_COMPILERS="gnu intel"
+
+mkdir -p "$CRONJOB_BASEDIR"
diff --git a/cron-scripts/machines/config_frontier.sh b/cron-scripts/machines/config_frontier.sh
new file mode 100755
index 0000000000..a7db934cdc
--- /dev/null
+++ b/cron-scripts/machines/config_frontier.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+set -eo pipefail
+
+module load cray-python cmake
+
+export all_proxy=socks://proxy.ccs.ornl.gov:3128/
+export ftp_proxy=ftp://proxy.ccs.ornl.gov:3128/
+export http_proxy=http://proxy.ccs.ornl.gov:3128/
+export https_proxy=http://proxy.ccs.ornl.gov:3128/
+export no_proxy='localhost,127.0.0.0/8,*.ccs.ornl.gov'
+
+export CRONJOB_BASEDIR=/lustre/orion/cli115/scratch/${USER}/cronjobs
+export E3SM_COMPILERS="craygnu-mphipcc craycray-mphipcc crayamd-mphipcc craygnu craycray crayamd"
+
+mkdir -p "$CRONJOB_BASEDIR"
diff --git a/cron-scripts/machines/config_machine.sh b/cron-scripts/machines/config_machine.sh
new file mode 100755
index 0000000000..a0dc05a592
--- /dev/null
+++ b/cron-scripts/machines/config_machine.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# Resolve CRONJOB_MACHINE (from -m/--machine or the host's FQDN) and source
+# the matching config_<machine>.sh. launch_all.sh sources this file, so the
+# shell options and any "exit" below take effect in the calling shell.
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# --- Parse command-line arguments ---
+usage() {
+    echo "Usage: $(basename "$0") [-m|--machine MACHINE_NAME] [-h|--help]"
+    echo "  -m, --machine    Override the auto-detected machine name"
+    echo "  -h, --help       Show this help message"
+    exit "${1:-0}"
+}
+
+CLI_MACHINE=""
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -m|--machine)
+            if [[ $# -lt 2 || -z "$2" ]]; then
+                echo "ERROR: Option '$1' requires a machine name" >&2
+                usage 1
+            fi
+            CLI_MACHINE="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage 0
+            ;;
+        *)
+            echo "ERROR: Unknown option '$1'" >&2
+            usage 1
+            ;;
+    esac
+done
+
+# --- Get a stable hostname / FQDN (try multiple methods) ---
+get_fqdn() {
+    local fqdn=""
+    fqdn="$(hostname -f 2>/dev/null || true)"
+    if [[ -z "$fqdn" || "$fqdn" == "(none)" ]]; then
+        fqdn="$(hostname --fqdn 2>/dev/null || true)"
+    fi
+    if [[ -z "$fqdn" || "$fqdn" == "(none)" ]]; then
+        fqdn="$(hostname 2>/dev/null || true)"
+    fi
+    echo "$fqdn"
+}
+
+FQDN="$(get_fqdn)"
+
+# --- Determine CRONJOB_MACHINE ---
+if [[ -n "$CLI_MACHINE" ]]; then
+    # Command-line argument takes highest priority
+    CRONJOB_MACHINE="$CLI_MACHINE"
+else
+    # Fall back to FQDN-based detection
+    CRONJOB_MACHINE="unknown"
+    case "$FQDN" in
+        *.frontier.olcf.ornl.gov)
+            CRONJOB_MACHINE="frontier"
+            ;;
+        *.polaris.alcf.anl.gov)
+            CRONJOB_MACHINE="polaris"
+            ;;
+        *.perlmutter.nersc.gov)
+            # auto-detection always picks pm-gpu; pass "-m pm-cpu" for the CPU config
+            CRONJOB_MACHINE="pm-gpu"
+            ;;
+        *.lcrc.anl.gov)
+            CRONJOB_MACHINE="chrysalis"
+            ;;
+    esac
+fi
+
+export CRONJOB_MACHINE
+echo "FQDN=$FQDN"
+echo "CRONJOB_MACHINE=$CRONJOB_MACHINE"
+
+# Fail with a clear message when no config exists (e.g. "unknown" or "polaris")
+MACHINE_CONFIG="${SCRIPT_DIR}/config_${CRONJOB_MACHINE}.sh"
+if [[ ! -f "$MACHINE_CONFIG" ]]; then
+    echo "ERROR: No machine configuration for '$CRONJOB_MACHINE' (expected $MACHINE_CONFIG)" >&2
+    exit 1
+fi
+source "$MACHINE_CONFIG"
diff --git a/cron-scripts/machines/config_pm-cpu.sh b/cron-scripts/machines/config_pm-cpu.sh
new file mode 100755
index 0000000000..35b4a958fa
--- /dev/null
+++ b/cron-scripts/machines/config_pm-cpu.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -eo pipefail
+
+module load cray-python cmake
+
+export CRONJOB_BASEDIR=/pscratch/sd/${USER:0:1}/${USER}/omega/cronjobs_pm-cpu
+export E3SM_COMPILERS="gnu"
+
+mkdir -p "$CRONJOB_BASEDIR"
diff --git a/cron-scripts/machines/config_pm-gpu.sh b/cron-scripts/machines/config_pm-gpu.sh
new file mode 100755
index 0000000000..e025b44dcf
--- /dev/null
+++ b/cron-scripts/machines/config_pm-gpu.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -eo pipefail
+
+module load cray-python cmake
+
+export CRONJOB_BASEDIR=/pscratch/sd/${USER:0:1}/${USER}/omega/cronjobs_pm-gpu
+export E3SM_COMPILERS="gnugpu"
+
+mkdir -p "$CRONJOB_BASEDIR"
diff --git a/cron-scripts/tasks/omega_cdash/job_chrysalis_omega_cdash.sbatch b/cron-scripts/tasks/omega_cdash/job_chrysalis_omega_cdash.sbatch
new file mode 100755
index 0000000000..0bfbac48be
--- /dev/null
+++ b/cron-scripts/tasks/omega_cdash/job_chrysalis_omega_cdash.sbatch
@@ -0,0 +1,10 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --qos=high
+#SBATCH --time 02:00:00
+
+source /etc/bashrc
+
+# Slurm normally executes a spooled copy of this file, so "$0" need not live
+# next to run_omega_cdash.sh; prefer the directory exported by the launcher.
+exec bash "${OMEGA_CDASH_SCRIPT_DIR:-$(dirname "$0")}/run_omega_cdash.sh"
diff --git a/cron-scripts/tasks/omega_cdash/job_frontier_omega_cdash.sbatch b/cron-scripts/tasks/omega_cdash/job_frontier_omega_cdash.sbatch
new file mode 100755
index 0000000000..28de743c4d
--- /dev/null
+++ b/cron-scripts/tasks/omega_cdash/job_frontier_omega_cdash.sbatch
@@ -0,0 +1,12 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH -q debug
+#SBATCH --account=cli115
+#SBATCH --time 02:00:00
+
+
+source /etc/bashrc
+
+# Slurm normally executes a spooled copy of this file, so "$0" need not live
+# next to run_omega_cdash.sh; prefer the directory exported by the launcher.
+exec bash "${OMEGA_CDASH_SCRIPT_DIR:-$(dirname "$0")}/run_omega_cdash.sh"
diff --git a/cron-scripts/tasks/omega_cdash/job_pm-cpu_omega_cdash.sbatch b/cron-scripts/tasks/omega_cdash/job_pm-cpu_omega_cdash.sbatch
new file mode 100755
index 0000000000..6c4314cc6d
--- /dev/null
+++ b/cron-scripts/tasks/omega_cdash/job_pm-cpu_omega_cdash.sbatch
@@ -0,0 +1,18 @@
+#!/bin/bash -l
+#SBATCH --job-name=OmegaSCron
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=64
+#SBATCH --output=/global/cfs/cdirs/e3sm/omega/cronjobs_pm-cpu/logs/OmegaSCronCPU_%j.out
+#SBATCH --error=/global/cfs/cdirs/e3sm/omega/cronjobs_pm-cpu/logs/OmegaSCronCPU_%j.err
+#SBATCH --constraint=cpu
+#SBATCH --account=e3sm
+#SBATCH --qos regular
+#SBATCH --exclusive
+#SBATCH --time 01:00:00
+
+
+source /etc/bashrc
+
+# Slurm normally executes a spooled copy of this file, so "$0" need not live
+# next to run_omega_cdash.sh; prefer the directory exported by the launcher.
+exec bash "${OMEGA_CDASH_SCRIPT_DIR:-$(dirname "$0")}/run_omega_cdash.sh"
diff --git a/cron-scripts/tasks/omega_cdash/job_pm-gpu_omega_cdash.sbatch b/cron-scripts/tasks/omega_cdash/job_pm-gpu_omega_cdash.sbatch
new file mode 100755
index 0000000000..66c2a83819
--- /dev/null
+++ b/cron-scripts/tasks/omega_cdash/job_pm-gpu_omega_cdash.sbatch
@@ -0,0 +1,19 @@
+#!/bin/bash -l
+#SBATCH --job-name=OmegaSCron
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=4
+#SBATCH --output=/global/cfs/cdirs/e3sm/omega/cronjobs_pm-gpu/logs/OmegaSCronGPU_%j.out
+#SBATCH --error=/global/cfs/cdirs/e3sm/omega/cronjobs_pm-gpu/logs/OmegaSCronGPU_%j.err
+#SBATCH --constraint=gpu
+#SBATCH --account=e3sm_g
+#SBATCH --qos regular
+#SBATCH --exclusive
+#SBATCH --time 01:00:00
+
+
+source /etc/bashrc
+
+# Slurm normally executes a spooled copy of this file, so "$0" need not live
+# next to run_omega_cdash.sh; prefer the directory exported by the launcher.
+exec bash "${OMEGA_CDASH_SCRIPT_DIR:-$(dirname "$0")}/run_omega_cdash.sh"
diff --git a/cron-scripts/tasks/omega_cdash/launch_omega_cdash.sh b/cron-scripts/tasks/omega_cdash/launch_omega_cdash.sh
new file mode 100755
index 0000000000..417deec99b
--- /dev/null
+++ b/cron-scripts/tasks/omega_cdash/launch_omega_cdash.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+echo "[$(date)] Starting $SCRIPT_NAME"
+
+export OMEGA_CDASH_BASEDIR=${CRONJOB_BASEDIR}/tasks/omega_cdash
+export TESTROOT="${OMEGA_CDASH_BASEDIR}/tests"
+mkdir -p "$OMEGA_CDASH_BASEDIR"
+mkdir -p "$TESTROOT"
+
+# Directory of this task's scripts. The sbatch job scripts read it because the
+# "$0" of a submitted job does not reliably point into this directory.
+export OMEGA_CDASH_SCRIPT_DIR="$HERE"
+
+# Configuration
+export REPO_PATH="${OMEGA_CDASH_BASEDIR}/Omega"
+REMOTE_URL="https://github.com/E3SM-Project/Omega.git"
+BRANCH="develop"
+
+# 1. & 2. Check existence and handle repository state
+if [ ! -d "$REPO_PATH/.git" ]; then
+    echo "Repository not found. Cloning..."
+    git clone -b "$BRANCH" "$REMOTE_URL" "$REPO_PATH"
+    cd "$REPO_PATH" || exit
+else
+    echo "Repository exists. Updating to latest remote state..."
+    cd "$REPO_PATH" || exit
+
+    # Ensure we are on the correct branch and sync with origin
+    git fetch origin
+    git checkout "$BRANCH"
+    git reset --hard "origin/$BRANCH"
+fi
+
+# 3. Update specific submodules recursively
+echo "Updating submodules..."
+git submodule update --init --recursive externals/ekat externals/scorpio cime components/omega/external
+
+# Download the test meshes once. -s (not -f) also re-fetches a mesh whose
+# earlier download failed, because "wget -O" leaves an empty file behind.
+if [[ ! -s "${TESTROOT}/OmegaMesh.nc" ]]; then
+    wget -O "${TESTROOT}/OmegaMesh.nc" https://web.lcrc.anl.gov/public/e3sm/inputdata/ocn/mpas-o/oQU240/ocean.QU.240km.151209.nc
+fi
+if [[ ! -s "${TESTROOT}/OmegaSphereMesh.nc" ]]; then
+    wget -O "${TESTROOT}/OmegaSphereMesh.nc" https://web.lcrc.anl.gov/public/e3sm/polaris/ocean/polaris_cache/global_convergence/icos/cosine_bell/Icos480/init/initial_state.230220.nc
+fi
+if [[ ! -s "${TESTROOT}/OmegaPlanarMesh.nc" ]]; then
+    wget -O "${TESTROOT}/OmegaPlanarMesh.nc" https://gist.github.com/mwarusz/f8caf260398dbe140d2102ec46a41268/raw/e3c29afbadc835797604369114321d93fd69886d/PlanarPeriodic48x48.nc
+fi
+
+sbatch \
+    --job-name=OmegaCdash \
+    --output="$CRONJOB_LOGDIR/omega_cdash_%j.out" \
+    --error="$CRONJOB_LOGDIR/omega_cdash_%j.err" \
+    "${HERE}/job_${CRONJOB_MACHINE}_omega_cdash.sbatch"
+
+echo "[$(date)] Finished $SCRIPT_NAME"
diff --git a/cron-scripts/tasks/omega_cdash/run_omega_cdash.sh b/cron-scripts/tasks/omega_cdash/run_omega_cdash.sh
new file mode 100644
index 0000000000..a90b033d28
--- /dev/null
+++ b/cron-scripts/tasks/omega_cdash/run_omega_cdash.sh
@@ -0,0 +1,84 @@
+#!/bin/bash -l
+# Batch-job payload: configures, builds and tests Omega once per compiler in
+# E3SM_COMPILERS and submits each result to CDash through "ctest -S".
+
+set -euo pipefail
+
+echo "Starting omega cdash job"
+
+if [[ "${CRONJOB_MACHINE:-unknown}" == "chrysalis" ]]; then
+    module load python cmake
+    PARMETIS_TPL="/lcrc/soft/climate/polaris/chrysalis/spack/dev_polaris_0_10_0_COMPILER_openmpi/var/spack/environments/dev_polaris_0_10_0_COMPILER_openmpi/.spack-env/view"
+
+elif [[ "${CRONJOB_MACHINE:-unknown}" == "frontier" ]]; then
+    module load cray-python cmake git-lfs
+    PARMETIS_TPL="/ccs/proj/cli115/software/polaris/frontier/spack/dev_polaris_0_10_0_COMPILER_mpich/var/spack/environments/dev_polaris_0_10_0_COMPILER_mpich/.spack-env/view"
+
+elif [[ "${CRONJOB_MACHINE:-unknown}" == "pm-gpu" ]]; then
+    module load cray-python cmake
+    PARMETIS_TPL="/global/cfs/cdirs/e3sm/software/polaris/pm-gpu/spack/dev_polaris_0_10_0_COMPILER_mpich/var/spack/environments/dev_polaris_0_10_0_COMPILER_mpich/.spack-env/view"
+
+elif [[ "${CRONJOB_MACHINE:-unknown}" == "pm-cpu" ]]; then
+    module load cray-python cmake
+    PARMETIS_TPL="/global/cfs/cdirs/e3sm/software/polaris/pm-cpu/spack/dev_polaris_0_10_0_COMPILER_mpich/var/spack/environments/dev_polaris_0_10_0_COMPILER_mpich/.spack-env/view"
+
+elif [[ "${CRONJOB_MACHINE:-unknown}" == "unknown" ]]; then
+    echo "CRONJOB_MACHINE is not set." >&2
+    exit 1
+
+else
+    echo "Unsupported CRONJOB_MACHINE: ${CRONJOB_MACHINE}" >&2
+    exit 1
+
+fi
+
+# launch_omega_cdash.sh exports the checkout as REPO_PATH and never sets
+# OMEGA_HOME, so without this default "set -u" aborts at the first use below.
+OMEGA_HOME="${OMEGA_HOME:-${REPO_PATH:?OMEGA_HOME or REPO_PATH must be set}}"
+
+echo "Compilers: ${E3SM_COMPILERS}"
+
+for COMPILER in ${E3SM_COMPILERS}; do
+
+    WORKDIR="${TESTROOT}/${COMPILER}/${CRONJOB_DATE}"
+    rm -rf "${WORKDIR}"
+    mkdir -p "${WORKDIR}"
+
+    PARMETIS_HOME="${PARMETIS_TPL//COMPILER/$COMPILER}"
+    if [ ! -d "$PARMETIS_HOME" ]; then
+        if [[ "${CRONJOB_MACHINE:-unknown}" == "frontier" ]]; then
+            PARMETIS_HOME="/ccs/proj/cli115/software/polaris/frontier/spack/dev_polaris_0_10_0_craygnu-mphipcc_mpich/var/spack/environments/dev_polaris_0_10_0_craygnu-mphipcc_mpich/.spack-env/view"
+        fi
+    fi
+
+    cmake \
+        -DOMEGA_CIME_MACHINE="${CRONJOB_MACHINE}" \
+        -DOMEGA_CIME_COMPILER="${COMPILER}" \
+        -DOMEGA_ARCH=SERIAL \
+        -DOMEGA_BUILD_TEST=ON \
+        -DOMEGA_PARMETIS_ROOT="${PARMETIS_HOME}" \
+        -S "${OMEGA_HOME}/components/omega" \
+        -B "${WORKDIR}";
+
+    mkdir -p "${WORKDIR}/test"
+
+    ln -sf "${TESTROOT}/OmegaMesh.nc" "${WORKDIR}/test/OmegaMesh.nc"
+    ln -sf "${TESTROOT}/OmegaSphereMesh.nc" "${WORKDIR}/test/OmegaSphereMesh.nc"
+    ln -sf "${TESTROOT}/OmegaPlanarMesh.nc" "${WORKDIR}/test/OmegaPlanarMesh.nc"
+
+    source "${WORKDIR}/omega_env.sh"
+
+    ctest \
+        -S "${OMEGA_HOME}/components/omega/CTestScript.cmake" \
+        -DCTEST_SOURCE_DIRECTORY="${OMEGA_HOME}/components/omega" \
+        -DCTEST_BINARY_DIRECTORY="${WORKDIR}" \
+        -DCTEST_SITE="${CRONJOB_MACHINE}" \
+        -DCTEST_BUILD_GROUP="Omega Unit-test" \
+        -DCTEST_BUILD_NAME="unitest-develop-${COMPILER}" \
+        -DCTEST_NIGHTLY_START_TIME="06:00:00 UTC" \
+        -DCTEST_BUILD_COMMAND="${WORKDIR}/omega_build.sh" \
+        -DCTEST_BUILD_CONFIGURATION="Release" \
+        -DCTEST_DROP_SITE_CDASH=TRUE \
+        -DCTEST_SUBMIT_URL="https://my.cdash.org/submit.php?project=E3SM";
+
+done
diff --git a/cron-scripts/tasks/polaris_cdash/CTestScript.txt b/cron-scripts/tasks/polaris_cdash/CTestScript.txt
new file mode 100644
index 0000000000..271a2c70c0
--- /dev/null
+++ b/cron-scripts/tasks/polaris_cdash/CTestScript.txt
@@ -0,0 +1,55 @@
+# CTestScript.txt to submit generated XMLs
+set(CTEST_PROJECT_NAME omega)
+set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC")
+set(CTEST_SITE "$ENV{CRONJOB_MACHINE}")
+set(CTEST_BUILD_NAME "ExternalTest_Run")
+
+# Set source and binary directory to current (required for ctest_submit)
+set(CTEST_SOURCE_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}")
+set(CTEST_BINARY_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}")
+
+# Create Testing/Temporary directory manually for ctest log files
+file(MAKE_DIRECTORY "${CTEST_BINARY_DIRECTORY}/Testing/Temporary")
+
+# Initialize CTest to ensure CTEST_BINARY_DIRECTORY is used for logging
+ctest_start(Nightly)
+
+# CDash configuration (CTEST_SUBMIT_URL is available from CMake 3.14 on)
+if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.14)
+    set(CTEST_SUBMIT_URL "https://my.cdash.org/submit.php?project=omega")
+else()
+    set(CTEST_DROP_METHOD "https")
+    set(CTEST_DROP_SITE "my.cdash.org")
+    set(CTEST_DROP_LOCATION "/submit.php?project=omega")
+endif()
+
+set(CTEST_DROP_SITE_CDASH TRUE)
+
+# Define files to submit
+set(FILES_TO_SUBMIT)
+
+if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/Build.xml")
+    list(APPEND FILES_TO_SUBMIT "${CMAKE_CURRENT_LIST_DIR}/Build.xml")
+else()
+    message(WARNING "Build.xml not found")
+endif()
+
+if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/Test.xml")
+    list(APPEND FILES_TO_SUBMIT "${CMAKE_CURRENT_LIST_DIR}/Test.xml")
+else()
+    message(WARNING "Test.xml not found")
+endif()
+
+if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/Done.xml")
+    list(APPEND FILES_TO_SUBMIT "${CMAKE_CURRENT_LIST_DIR}/Done.xml")
+else()
+    message(WARNING "Done.xml not found")
+endif()
+
+# Submit files
+if(FILES_TO_SUBMIT)
+    message(STATUS "Submitting files: ${FILES_TO_SUBMIT}")
+    ctest_submit(FILES ${FILES_TO_SUBMIT})
+else()
+    message(WARNING "No files to submit")
+endif()
diff --git a/cron-scripts/tasks/polaris_cdash/launch_polaris_ctest.sh b/cron-scripts/tasks/polaris_cdash/launch_polaris_ctest.sh
new file mode 100755
index 0000000000..06b9a98c59
--- /dev/null
+++ b/cron-scripts/tasks/polaris_cdash/launch_polaris_ctest.sh
@@ -0,0 +1,258 @@
+#!/bin/bash -l
+# Nightly Polaris/Omega baseline run: for each compiler in E3SM_COMPILERS build
+# Omega, run the Polaris "omega_nightly" suite against that build, convert the
+# results to CDash XML with polaris_cdash.py and submit them via CTestScript.txt.
+set -eo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+echo "[$(date)] Starting $SCRIPT_NAME"
+
+POLARIS_CDASH_BASEDIR=${CRONJOB_BASEDIR}/tasks/polaris_cdash
+POLARIS_CDASH_TESTDIR="${POLARIS_CDASH_BASEDIR}/tests"
+OMEGA_HOME="${POLARIS_CDASH_BASEDIR}/polaris/e3sm_submodules/Omega"
+MINIFORGE3_HOME="${POLARIS_CDASH_BASEDIR}/miniforge3"
+
+mkdir -p "$POLARIS_CDASH_BASEDIR"
+mkdir -p "$POLARIS_CDASH_TESTDIR"
+
+if [[ "$CRONJOB_MACHINE" == "chrysalis" ]]; then
+    module load python cmake
+    PARMETIS_TPL=/lcrc/soft/climate/polaris/chrysalis/spack/dev_polaris_0_10_0_COMPILER_openmpi/var/spack/environments/dev_polaris_0_10_0_COMPILER_openmpi/.spack-env/view
+
+elif [[ "$CRONJOB_MACHINE" == "frontier" ]]; then
+    module load cray-python cmake git-lfs
+    PARMETIS_TPL="/ccs/proj/cli115/software/polaris/frontier/spack/dev_polaris_0_10_0_COMPILER_mpich/var/spack/environments/dev_polaris_0_10_0_COMPILER_mpich/.spack-env/view"
+
+elif [[ "$CRONJOB_MACHINE" == "pm-gpu" ]]; then
+    module load cray-python cmake
+    PARMETIS_TPL="/global/cfs/cdirs/e3sm/software/polaris/pm-gpu/spack/dev_polaris_0_10_0_COMPILER_mpich/var/spack/environments/dev_polaris_0_10_0_COMPILER_mpich/.spack-env/view"
+
+elif [[ "$CRONJOB_MACHINE" == "pm-cpu" ]]; then
+    module load cray-python cmake
+    PARMETIS_TPL="/global/cfs/cdirs/e3sm/software/polaris/pm-cpu/spack/dev_polaris_0_10_0_COMPILER_mpich/var/spack/environments/dev_polaris_0_10_0_COMPILER_mpich/.spack-env/view"
+
+elif [[ "$CRONJOB_MACHINE" == "unknown" ]]; then
+    echo "CRONJOB_MACHINE is not set." >&2
+    exit 1
+
+else
+    echo "Unsupported CRONJOB_MACHINE: ${CRONJOB_MACHINE}" >&2
+    exit 1
+
+fi
+
+# ==============================================================================
+# Functions
+# ==============================================================================
+
+install_miniforge3() {
+    # Install Miniforge3 into MINIFORGE3_HOME unless that directory already exists.
+    if [ ! -d "$MINIFORGE3_HOME" ]; then
+        echo "Installing Miniforge3..."
+        pushd "$POLARIS_CDASH_BASEDIR" > /dev/null
+        wget -O Miniforge3-Linux-x86_64.sh https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh
+        bash Miniforge3-Linux-x86_64.sh -b -p "$MINIFORGE3_HOME"
+        popd > /dev/null
+    fi
+
+}
+
+# NOTE(review): disabled, so nothing in this script creates the polaris checkout
+# that configure_polaris enters - confirm it is provisioned some other way.
+#setup_polaris_repo() {
+#    echo "================================================================================"
+#    echo "STEP 1: Setting up Polaris Repo (Baseline)"
+#    echo "================================================================================"
+#    cd "${POLARIS_CDASH_BASEDIR}"
+#
+#    # Check if we are inside the 'polaris' folder or need to enter it
+#    if [ ! -d "polaris" ]; then
+#        echo "Cloning Polaris repository..."
+#        git clone git@github.com:E3SM-Project/polaris.git
+#        cd polaris
+#    else
+#        cd polaris
+#        echo "Repository exists. Resetting to main branch..."
+#        git fetch origin
+#        git checkout main
+#        git reset --hard origin/main
+#    fi
+#
+#    echo "Updating specific submodules (jigsaw-python, Omega)..."
+#    git submodule update --init --recursive jigsaw-python
+#    git submodule update --init --recursive e3sm_submodules/Omega
+#}
+
+configure_polaris() {
+    # Deploy Polaris for one compiler if needed, then source its load script.
+    local compiler=$1
+
+    echo "--------------------------------------------------------------------------------"
+    echo "Configuring Polaris for $compiler"
+    echo "--------------------------------------------------------------------------------"
+
+    cd "${POLARIS_CDASH_BASEDIR}/polaris"
+
+    # A quoted pattern is never expanded, so [ -f "...*.sh" ] could not see an
+    # existing load script and deploy.py ran on every invocation; compgen -G
+    # performs the glob match instead.
+    if ! compgen -G "load_polaris_${CRONJOB_MACHINE}_${compiler}_*.sh" > /dev/null; then
+        ./deploy.py --machine "${CRONJOB_MACHINE}" --compiler "${compiler}"
+    fi
+
+    source ./load_polaris_${CRONJOB_MACHINE}_${compiler}_*.sh
+}
+
+build_omega_dev() {
+    # Configure Omega in a fresh build directory and build it via "ctest -T Build".
+    local compiler=$1
+    local omega_build=$2
+    local parmetis_path=$3
+
+    echo "--------------------------------------------------------------------------------"
+    echo "Building Omega (dev) with $compiler in $omega_build"
+    echo "--------------------------------------------------------------------------------"
+
+    rm -rf "$omega_build"
+    mkdir -p "$omega_build"
+    pushd "$omega_build" > /dev/null
+
+    cmake \
+        -DOMEGA_CIME_MACHINE="${CRONJOB_MACHINE}" \
+        -DOMEGA_CIME_COMPILER="${compiler}" \
+        -DOMEGA_BUILD_TEST=ON \
+        -DOMEGA_PARMETIS_ROOT="${parmetis_path}" \
+        "${OMEGA_HOME}/components/omega"
+
+    source ./omega_env.sh
+
+    ctest -M Nightly -T Start
+    ctest -M Nightly -T Build
+    #./omega_build.sh
+    popd > /dev/null
+}
+
+run_baseline_suite() {
+    # Set up the Polaris omega_nightly suite in a fresh work directory for the
+    # given Omega build and run its job script through sbatch.
+    local compiler=$1
+    local omega_build=$2
+
+    local polaris_build="${POLARIS_CDASH_TESTDIR}/${compiler}/polaris_build"
+
+    # Clean up previous baseline directory to avoid stale logs
+    if [ -d "$polaris_build" ]; then
+        echo "Removing previous polaris build directory: $polaris_build"
+        rm -rf "$polaris_build"
+    fi
+
+    mkdir -p "$polaris_build"
+
+    pushd "$polaris_build" > /dev/null
+
+    echo "--------------------------------------------------------------------------------"
+    echo "Running Polaris Baseline Suite for $compiler"
+    echo "--------------------------------------------------------------------------------"
+
+    # NOTE(review): configure_polaris sources load_polaris_*, this looks for
+    # load_dev_polaris_* - confirm which name deploy.py actually produces.
+    local env_file=$(ls ${POLARIS_CDASH_BASEDIR}/polaris/load_dev_polaris_*_${CRONJOB_MACHINE}_${compiler}_*.sh | head -n 1)
+    if [ -f "$env_file" ]; then
+        echo "Sourcing $env_file"
+        source "$env_file"
+    else
+        echo "Warning: Environment file matching 'load_dev_polaris_*_${CRONJOB_MACHINE}_${compiler}_*.sh' not found."
+    fi
+
+    # Set up baseline suite
+    polaris suite -c ocean -t omega_nightly --model omega \
+        -w "$polaris_build" \
+        -p "$omega_build"
+
+# --clean_build
+
+
+    # Submit baseline job
+    if [ -d "$polaris_build" ]; then
+        cd "$polaris_build"
+        echo "Submitting baseline job in $(pwd)..."
+        # --wait blocks until the job ends; "|| true" continues if it failed
+        sbatch --wait job_script.omega_nightly.sh || true
+    else
+        echo "Error: Baseline directory $polaris_build was not created."
+    fi
+}
+
+# ==============================================================================
+# Main Execution
+# ==============================================================================
+install_miniforge3
+#setup_polaris_repo
+
+for COMPILER in ${E3SM_COMPILERS}; do
+    echo "################################################################################"
+    echo "Processing Baseline for COMPILER: $COMPILER"
+    echo "################################################################################"
+
+    # One log per compiler: with a shared name tee overwrote the earlier logs
+    MAIN_LOG="${CRONJOB_LOGDIR}/polaris_cdash_main_${COMPILER}_${CRONJOB_DATE}.log"
+
+    echo "Starting $COMPILER... logging to $MAIN_LOG"
+
+    DEVELOP_BUILD="${POLARIS_CDASH_TESTDIR}/${COMPILER}/omega_build"
+
+    # Capture Block
+    {
+        configure_polaris "$COMPILER"
+
+        PARMETIS_HOME="${PARMETIS_TPL//COMPILER/$COMPILER}"
+        # the view is a directory, so -f was always false and forced the fallback
+        if [ ! -d "$PARMETIS_HOME" ]; then
+            if [[ "$CRONJOB_MACHINE" == "frontier" ]]; then
+                PARMETIS_HOME=/ccs/proj/cli115/software/polaris/frontier/spack/dev_polaris_0_10_0_craygnu-mphipcc_mpich/var/spack/environments/dev_polaris_0_10_0_craygnu-mphipcc_mpich/.spack-env/view
+            fi
+        fi
+
+        build_omega_dev "$COMPILER" "$DEVELOP_BUILD" "$PARMETIS_HOME"
+
+        run_baseline_suite "$COMPILER" "$DEVELOP_BUILD"
+    } 2>&1 | tee "$MAIN_LOG"
+
+    # CDash Submission Logic
+    BUILD_ID=$(date +%s)
+
+    CASE_OUTPUTS_DIR="${POLARIS_CDASH_TESTDIR}/${COMPILER}/polaris_build/case_outputs"
+
+    CDASH_DIR="${POLARIS_CDASH_TESTDIR}/${COMPILER}/cdash"
+    echo "Creating CDash directory: $CDASH_DIR"
+    rm -rf "$CDASH_DIR"
+    mkdir -p "$CDASH_DIR"
+
+    echo "Submitting results to CDash..."
+    if [ -f "${HERE}/polaris_cdash.py" ]; then
+        python3 "${HERE}/polaris_cdash.py" \
+            --log-dir "$CASE_OUTPUTS_DIR" \
+            --output-dir "$CDASH_DIR" \
+            --results-dir "$DEVELOP_BUILD/Testing" \
+            --site-name "$CRONJOB_MACHINE" \
+            --build-name "Baseline_${COMPILER}" \
+            --build-id "$BUILD_ID"
+    else
+        echo "Error: polaris_cdash.py not found at ${HERE}/polaris_cdash.py"
+    fi
+
+    echo "Running CTest submission from $CDASH_DIR..."
+    if [ -f "${HERE}/CTestScript.txt" ]; then
+        cp "${HERE}/CTestScript.txt" "$CDASH_DIR/"
+        pushd "$CDASH_DIR" > /dev/null
+        module load cmake && ctest -S CTestScript.txt -V
+        popd > /dev/null
+    else
+        echo "Warning: CTestScript.txt not found in ${HERE}"
+    fi
+
+    echo "Finished Baseline processing for $COMPILER"
+done
+
+echo "[$(date)] Finished $SCRIPT_NAME"
diff --git a/cron-scripts/tasks/polaris_cdash/polaris_cdash.py b/cron-scripts/tasks/polaris_cdash/polaris_cdash.py
new file mode 100644
index 0000000000..999e653aee
--- /dev/null
+++ b/cron-scripts/tasks/polaris_cdash/polaris_cdash.py
@@ -0,0 +1,328 @@
+"""Generate CDash Build.xml, Test.xml and Done.xml from a Polaris test run.
+
+Build.xml is copied from a CTest results directory and relabelled, Test.xml is
+derived from the ``*.log`` files in a log directory, and Done.xml records the
+build id. All three files are written to ``--output-dir``.
+"""
+
+import argparse
+import glob
+import os
+import platform
+import re
+import sys
+import time
+import xml.etree.ElementTree as ET
+from xml.dom import minidom
+
+# Shared Utilities
+
+# ANSI terminal escape sequences (colours, cursor movement, ...)
+_ANSI_ESCAPE_RE = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+# Control characters that must not appear in an XML 1.0 document
+_XML_ILLEGAL_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
+
+
+def get_system_info():
+    """Return host OS, CPU and memory properties as a dict of strings.
+
+    CPU counts and memory size come from psutil when it can be imported;
+    otherwise fixed placeholder values are used.
+    """
+    info = {}
+    info['OSName'] = platform.system()
+    info['Hostname'] = platform.node()
+    info['OSRelease'] = platform.release()
+    info['OSVersion'] = platform.version()
+    info['OSPlatform'] = platform.machine()
+    info['Is64Bits'] = '1' if sys.maxsize > 2**32 else '0'
+
+    try:
+        import psutil
+
+        info['NumberOfLogicalCPU'] = str(psutil.cpu_count(logical=True))
+        info['NumberOfPhysicalCPU'] = str(psutil.cpu_count(logical=False))
+        info['TotalPhysicalMemory'] = str(
+            int(psutil.virtual_memory().total / (1024 * 1024))
+        )  # MB
+    except ImportError:
+        info['NumberOfLogicalCPU'] = '1'
+        info['NumberOfPhysicalCPU'] = '1'
+        info['TotalPhysicalMemory'] = '1024'
+
+    info['VendorString'] = 'Unknown'
+    info['VendorID'] = 'Unknown'
+    info['FamilyID'] = '0'
+    info['ModelID'] = '0'
+    info['ProcessorCacheSize'] = '0'
+    info['ProcessorClockFrequency'] = '0'
+
+    return info
+
+
+def strip_ansi_codes(text):
+    """Return *text* with ANSI terminal escape sequences removed."""
+    return _ANSI_ESCAPE_RE.sub('', text)
+
+
+def read_tag_file(results_dir):
+    """Read the CTest TAG file in *results_dir*.
+
+    Returns ``(folder_name, build_stamp)``: the first TAG line, and the first
+    two TAG lines joined with '-'.
+
+    Raises FileNotFoundError if TAG does not exist and ValueError if it has
+    fewer than two lines.
+    """
+    tag_path = os.path.join(results_dir, 'TAG')
+    if not os.path.exists(tag_path):
+        raise FileNotFoundError(f'TAG file not found at {tag_path}')
+
+    with open(tag_path, 'r', encoding='utf-8') as f:
+        lines = [line.strip() for line in f]
+
+    if len(lines) < 2:
+        raise ValueError(
+            f'TAG file at {tag_path} contains fewer than 2 lines.'
+        )
+
+    folder_name = lines[0]
+    group_name = lines[1]
+
+    # The build stamp is the two TAG lines joined with '-'
+    build_stamp = f'{folder_name}-{group_name}'
+
+    return folder_name, build_stamp
+
+
+def process_build_xml(args, folder_name, build_stamp, sys_info):
+    """Copy CTest's Build.xml into the output directory with new labels.
+
+    The source is ``<args.results_dir>/<folder_name>/Build.xml``. BuildName and
+    Name of its root <Site> element are replaced by args.build_name and
+    args.site_name when those are set, and BuildStamp by *build_stamp*; all
+    other content is kept. *sys_info* is accepted but not used.
+
+    Returns the attribute dict of the <Site> element for reuse in Test.xml.
+    Raises FileNotFoundError if the source file is missing and ValueError if
+    its root element is not <Site>.
+    """
+    source_build_xml = os.path.join(args.results_dir, folder_name, 'Build.xml')
+
+    if not os.path.exists(source_build_xml):
+        raise FileNotFoundError(
+            f'Source Build.xml not found at {source_build_xml}'
+        )
+
+    print(f'Reading Build.xml from {source_build_xml}')
+    tree = ET.parse(source_build_xml)
+    site = tree.getroot()
+    if site.tag != 'Site':
+        # the attributes set below only make sense on a CTest <Site> root
+        raise ValueError(
+            f'Unexpected root element <{site.tag}> in {source_build_xml}'
+        )
+
+    if args.build_name:
+        site.set('BuildName', args.build_name)
+
+    if args.site_name:
+        site.set('Name', args.site_name)
+
+    # Ensure BuildStamp is set to the one from TAG
+    site.set('BuildStamp', build_stamp)
+
+    xmlstr = minidom.parseString(ET.tostring(site)).toprettyxml(indent='\t')
+
+    output_path = os.path.join(args.output_dir, 'Build.xml')
+    # toprettyxml() declares no encoding here, so XML readers assume UTF-8
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(xmlstr)
+    print(f'Generated {output_path} (copied and modified from source)')
+
+    return site.attrib  # Return attributes for Test.xml usage
+
+
+def generate_test_xml(args, site_attribs, sys_info):
+    """Write Test.xml with one <Test> entry per ``*.log`` file in args.log_dir.
+
+    A log counts as passed only if it contains 'POLARIS TASK: PASS' and, when
+    it mentions 'POLARIS BASELINE:' at all, also 'POLARIS BASELINE: PASS';
+    every other log, including one that cannot be read, is reported as failed.
+    The <Site> element receives *site_attribs*. *sys_info* is accepted but not
+    used.
+    """
+    site = ET.Element('Site')
+
+    # Copy attributes from Build.xml's Site element
+    for k, v in site_attribs.items():
+        site.set(k, v)
+
+    testing = ET.SubElement(site, 'Testing')
+
+    start_time = int(time.time())
+    formatted_start_time = time.strftime(
+        '%b %d %H:%M %Z', time.localtime(start_time)
+    )
+    ET.SubElement(testing, 'StartDateTime').text = formatted_start_time
+    ET.SubElement(testing, 'StartTestTime').text = str(start_time)
+
+    test_list = ET.SubElement(testing, 'TestList')
+
+    log_files = sorted(glob.glob(os.path.join(args.log_dir, '*.log')))
+
+    tests = []
+
+    if not log_files:
+        print(f'Warning: No log files found in {args.log_dir}')
+
+    for log_file in log_files:
+        test_name = os.path.basename(log_file)
+
+        try:
+            with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
+                content = f.read()
+        except OSError as e:
+            # an unreadable log is reported as a failed test, not a crash
+            content = f'Error reading file: {e}'
+        # ElementTree writes control characters verbatim, which would make
+        # Test.xml unparsable, so drop them together with the ANSI codes
+        content = _XML_ILLEGAL_RE.sub('', strip_ansi_codes(content))
+
+        if 'POLARIS TASK: PASS' not in content:
+            status = 'failed'
+        elif (
+            'POLARIS BASELINE:' in content
+            and 'POLARIS BASELINE: PASS' not in content
+        ):
+            status = 'failed'
+        else:
+            status = 'passed'
+
+        tests.append(
+            {
+                'name': test_name,
+                'status': status,
+                'output': content,
+                'path': log_file,
+            }
+        )
+
+        ET.SubElement(test_list, 'Test').text = f'./{args.log_dir}/{test_name}'
+
+    for test_data in tests:
+        test_elem = ET.SubElement(testing, 'Test', Status=test_data['status'])
+        ET.SubElement(test_elem, 'Name').text = test_data['name']
+        ET.SubElement(test_elem, 'Path').text = f'./{args.log_dir}'
+        ET.SubElement(
+            test_elem, 'FullName'
+        ).text = f'./{args.log_dir}/{test_data["name"]}'
+        ET.SubElement(
+            test_elem, 'FullCommandLine'
+        ).text = f'cat {test_data["path"]}'
+
+        results = ET.SubElement(test_elem, 'Results')
+
+        named_meas_time = ET.SubElement(
+            results,
+            'NamedMeasurement',
+            type='numeric/double',
+            name='Execution Time',
+        )
+        # placeholder value; no per-task run time is extracted from the logs
+        ET.SubElement(named_meas_time, 'Value').text = '1.0'
+
+        named_meas_status = ET.SubElement(
+            results,
+            'NamedMeasurement',
+            type='text/string',
+            name='Completion Status',
+        )
+        ET.SubElement(named_meas_status, 'Value').text = 'Completed'
+
+        named_meas_cmd = ET.SubElement(
+            results,
+            'NamedMeasurement',
+            type='text/string',
+            name='Command Line',
+        )
+        ET.SubElement(
+            named_meas_cmd, 'Value'
+        ).text = f'cat {test_data["path"]}'
+
+        measurement = ET.SubElement(results, 'Measurement')
+        ET.SubElement(measurement, 'Value').text = test_data['output']
+
+    formatted_end_time = time.strftime(
+        '%b %d %H:%M %Z', time.localtime(int(time.time()))
+    )
+    ET.SubElement(testing, 'EndDateTime').text = formatted_end_time
+    ET.SubElement(testing, 'EndTestTime').text = str(int(time.time()))
+
+    output_path = os.path.join(args.output_dir, 'Test.xml')
+    tree = ET.ElementTree(site)
+    tree.write(output_path, encoding='UTF-8', xml_declaration=True)
+    print(f'Generated {output_path}')
+
+
+def generate_done_xml(args, build_id):
+    """Write Done.xml holding *build_id* and the current Unix time."""
+    root = ET.Element('Done')
+    ET.SubElement(root, 'buildId').text = build_id
+    ET.SubElement(root, 'time').text = str(int(time.time()))
+
+    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent='\t')
+    output_path = os.path.join(args.output_dir, 'Done.xml')
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(xmlstr)
+    print(f'Generated {output_path}')
+
+
+def main():
+    """Parse the command line and write Build.xml, Test.xml and Done.xml."""
+    parser = argparse.ArgumentParser(
+        description='Generate CDash XML files from log directory'
+    )
+
+    parser.add_argument(
+        '--log-dir', required=True, help='Directory containing log files'
+    )
+    parser.add_argument(
+        '--results-dir',
+        required=True,
+        help='Directory containing TAG file and Build.xml subdirectory',
+    )
+    parser.add_argument('--site-name', required=True, help='Name of the site')
+    # Build name defaults to log folder name, but can be overridden
+    parser.add_argument('--build-name', help='Name of the build')
+    parser.add_argument('--build-id', required=True, help='ID of the build')
+
+    parser.add_argument(
+        '--output-dir', default='.', help='Directory to output XML files'
+    )
+
+    args = parser.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    if not args.build_name:
+        args.build_name = os.path.basename(os.path.normpath(args.log_dir))
+
+    sys_info = get_system_info()
+
+    # 1. Read TAG
+    folder_name, build_stamp = read_tag_file(args.results_dir)
+    print(f'Detected BuildStamp: {build_stamp} (from {args.results_dir}/TAG)')
+
+    # 2. Process Build.xml
+    site_attribs = process_build_xml(args, folder_name, build_stamp, sys_info)
+
+    # 3. Generate Test.xml using same Site attribs
+    generate_test_xml(args, site_attribs, sys_info)
+
+    # 4. Generate Done.xml
+    generate_done_xml(args, args.build_id)
+
+
+if __name__ == '__main__':
+    main()