From 374026b4abb3baedee91b52201aa6e551027b3d7 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 31 Dec 2025 16:30:15 +0100 Subject: [PATCH 01/39] Update machines for mache.depoly --- polaris/machines/{conda-linux.cfg => default-linux-64.cfg} | 0 polaris/machines/{conda-osx.cfg => default-osx-64.cfg} | 0 polaris/machines/katara.cfg | 4 ++-- 3 files changed, 2 insertions(+), 2 deletions(-) rename polaris/machines/{conda-linux.cfg => default-linux-64.cfg} (100%) rename polaris/machines/{conda-osx.cfg => default-osx-64.cfg} (100%) diff --git a/polaris/machines/conda-linux.cfg b/polaris/machines/default-linux-64.cfg similarity index 100% rename from polaris/machines/conda-linux.cfg rename to polaris/machines/default-linux-64.cfg diff --git a/polaris/machines/conda-osx.cfg b/polaris/machines/default-osx-64.cfg similarity index 100% rename from polaris/machines/conda-osx.cfg rename to polaris/machines/default-osx-64.cfg diff --git a/polaris/machines/katara.cfg b/polaris/machines/katara.cfg index 60bc5ca20f..63f3e219b0 100644 --- a/polaris/machines/katara.cfg +++ b/polaris/machines/katara.cfg @@ -59,8 +59,8 @@ use_e3sm_hdf5_netcdf = False # Options related to machine discovery [discovery] -# a substring used to identify this machine from its hostname -hostname_contains = katara +# a regular expression used to identify this machine from its hostname +hostname_re = ^katara # Config options related to building components From 1430b71a05a7efa0458ea5a596f0d21638491ace Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 31 Dec 2025 16:30:55 +0100 Subject: [PATCH 02/39] Add missing dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 9b91388c4b..97ceeba5b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ dependencies = [ "requests", "scipy>=1.8.0", "shapely>=2.0,<3.0", + "termcolor", "tranche>=0.3.0", "xarray", ] From f79708bc7a9204514eb692e3fe23c6c91d3d65fe Mon Sep 17 00:00:00 2001 
From: Xylar Asay-Davis Date: Wed, 31 Dec 2025 16:32:35 +0100 Subject: [PATCH 03/39] Switch deployment to mache.deploy --- .gitignore | 8 + configure_polaris_envs.py | 162 ---- deploy.py | 460 ++++++++++ deploy/__init__.py | 0 deploy/albany_supported.txt | 9 - deploy/bootstrap.py | 1522 -------------------------------- deploy/cli_spec.json | 93 ++ deploy/conda-dev-spec.template | 77 -- deploy/config.yaml.j2 | 132 +++ deploy/default.cfg | 46 - deploy/hooks.py | 145 +++ deploy/load_polaris.template | 37 - deploy/petsc_supported.txt | 7 - deploy/pins.cfg | 26 + deploy/pixi.toml.j2 | 97 ++ deploy/shared.py | 396 --------- deploy/spack.yaml.j2 | 44 + deploy/spec-bootstrap.txt | 3 - deploy/unsupported.txt | 9 - 19 files changed, 1005 insertions(+), 2268 deletions(-) delete mode 100755 configure_polaris_envs.py create mode 100755 deploy.py delete mode 100644 deploy/__init__.py delete mode 100644 deploy/albany_supported.txt delete mode 100755 deploy/bootstrap.py create mode 100644 deploy/cli_spec.json delete mode 100644 deploy/conda-dev-spec.template create mode 100644 deploy/config.yaml.j2 delete mode 100644 deploy/default.cfg create mode 100644 deploy/hooks.py delete mode 100644 deploy/load_polaris.template delete mode 100644 deploy/petsc_supported.txt create mode 100644 deploy/pins.cfg create mode 100644 deploy/pixi.toml.j2 delete mode 100644 deploy/shared.py create mode 100644 deploy/spack.yaml.j2 delete mode 100644 deploy/spec-bootstrap.txt delete mode 100644 deploy/unsupported.txt diff --git a/.gitignore b/.gitignore index 12575f4f42..60e07b8452 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,11 @@ docs/_build/ # build directories /build_mpas_ocean/ /build_omega/ + +# pixi +.pixi/ +pixi.lock +pixi-env/ + +# vs code +.vscode/ diff --git a/configure_polaris_envs.py b/configure_polaris_envs.py deleted file mode 100755 index 8b6dcae190..0000000000 --- a/configure_polaris_envs.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -from 
configparser import ConfigParser, ExtendedInterpolation - -from deploy.shared import ( - check_call, - get_conda_base, - get_logger, - install_miniforge, - parse_args, -) - - -def main(): - """ - Entry point for the configure script - """ - - args = parse_args(bootstrap=False) - source_path = os.getcwd() - - if args.tmpdir is not None: - os.makedirs(name=args.tmpdir, exist_ok=True) - - config = _get_config(args.config_file) - - conda_base = get_conda_base(args.conda_base, config, warn=True) - conda_base = os.path.abspath(conda_base) - - env_name = 'polaris_bootstrap' - - source_activation_scripts = f'source {conda_base}/etc/profile.d/conda.sh' - - activate_base = f'{source_activation_scripts} && conda activate' - - activate_install_env = ( - f'{source_activation_scripts} && conda activate {env_name}' - ) - os.makedirs(name='deploy_tmp/logs', exist_ok=True) - - if args.verbose: - logger = None - else: - logger = get_logger( - log_filename='deploy_tmp/logs/prebootstrap.log', name=__name__ - ) - - # install miniforge if needed - install_miniforge(conda_base, activate_base, logger) - - local_mache = args.mache_fork is not None and args.mache_branch is not None - - packages = '--file deploy/spec-bootstrap.txt' - if not local_mache: - # we need to add the mache package, specifying a version, - # since we won't be installing mache from a local clone of a branch - mache_version = config.get('deploy', 'mache') - packages = f'{packages} "mache={mache_version}"' - - _setup_install_env( - env_name, - activate_base, - args.use_local, - logger, - args.recreate, - conda_base, - packages, - ) - - if local_mache: - print('Clone and install local mache\n') - commands = ( - f'{activate_install_env} && ' - f'rm -rf deploy_tmp/build_mache && ' - f'mkdir -p deploy_tmp/build_mache && ' - f'cd deploy_tmp/build_mache && ' - f'git clone -b {args.mache_branch} ' - f'git@github.com:{args.mache_fork}.git mache && ' - f'cd mache && ' - f'conda install -y --file spec-file.txt && ' - f'python -m 
pip install --no-deps --no-build-isolation .' - ) - - check_call(commands, logger=logger) - - # polaris only uses 'dev' environment type, but E3SM-Unified uses others - env_type = config.get('deploy', 'env_type') - if env_type not in ['dev', 'test_release', 'release']: - raise ValueError(f'Unexpected env_type: {env_type}') - - if env_type == 'test_release' and args.use_local: - local_conda_build = os.path.abspath(f'{conda_base}/conda-bld') - else: - local_conda_build = None - - _bootstrap(activate_install_env, source_path, local_conda_build) - - -def _get_config(config_file): - """ - Read in the options from the config file and return the config object - """ - - # we can't load polaris so we find the config files - here = os.path.abspath(os.path.dirname(__file__)) - default_config = os.path.join(here, 'deploy/default.cfg') - config = ConfigParser(interpolation=ExtendedInterpolation()) - config.read(default_config) - - if config_file is not None: - config.read(config_file) - - return config - - -def _setup_install_env( - env_name, activate_base, use_local, logger, recreate, conda_base, packages -): - """ - Setup a conda environment for installing polaris - """ - - env_path = os.path.join(conda_base, 'envs', env_name) - - if use_local: - channels = '--use-local' - else: - channels = '' - - if recreate or not os.path.exists(env_path): - print('Setting up a conda environment for installing polaris\n') - conda_command = 'create' - else: - print('Updating conda environment for installing polaris\n') - conda_command = 'install' - commands = ( - f'{activate_base} && ' - f'conda {conda_command} -y -n {env_name} {channels} {packages}' - ) - - check_call(commands, logger=logger) - - -def _bootstrap(activate_install_env, source_path, local_conda_build): - """ - Activate the environment for installing polaris and call bootstrap - """ - - print('Creating the polaris conda environment\n') - bootstrap_command = f'{source_path}/deploy/bootstrap.py' - command = ( - 
f'{activate_install_env} && ' - f'{bootstrap_command} {" ".join(sys.argv[1:])}' - ) - if local_conda_build is not None: - command = f'{command} --local_conda_build {local_conda_build}' - check_call(command) - - -if __name__ == '__main__': - main() diff --git a/deploy.py b/deploy.py new file mode 100755 index 0000000000..26ca0f7858 --- /dev/null +++ b/deploy.py @@ -0,0 +1,460 @@ +#!/usr/bin/env python3 +""" +Target software deployment entrypoint. + +- Reads pinned mache version from deploy/pins.cfg +- Reads CLI spec from deploy/cli_spec.json and builds argparse CLI +- Downloads mache/deploy/bootstrap.py for either: + * a given mache fork/branch, or + * the pinned mache version +- Calls bootstrap.py with routed args (bootstrap|both) and stops +""" + +import argparse +import configparser +import json +import os +import shlex +import shutil +import stat +import subprocess +import sys +import time +from urllib.error import HTTPError, URLError +from urllib.request import Request, urlopen + +PINS_CFG = os.path.join('deploy', 'pins.cfg') +CLI_SPEC_JSON = os.path.join('deploy', 'cli_spec.json') +DEPLOY_TMP_DIR = 'deploy_tmp' +BOOTSTRAP_PATH = os.path.join(DEPLOY_TMP_DIR, 'bootstrap.py') + +# Default upstream repo for release/tag downloads +DEFAULT_MACHE_REPO = 'E3SM-Project/mache' + +# Where bootstrap.py lives inside the mache repo +BOOTSTRAP_RELPATH = 'mache/deploy/bootstrap.py' + + +def main(): + _check_location() + + pinned_mache_version, pinned_python_version = _read_pins(PINS_CFG) + cli_spec = _read_cli_spec(CLI_SPEC_JSON) + + parser = _build_parser_from_cli_spec(cli_spec) + args = parser.parse_args(sys.argv[1:]) + + if args.python: + python_version = args.python + else: + python_version = pinned_python_version + + _validate_fork_branch_pair(args) + + using_fork = getattr(args, 'mache_fork', None) is not None + + if not using_fork: + _validate_cli_spec_matches_pins(cli_spec, pinned_mache_version) + + # remove tmp dir + if os.path.exists(DEPLOY_TMP_DIR): + 
shutil.rmtree(DEPLOY_TMP_DIR) + + os.makedirs(DEPLOY_TMP_DIR) + + bootstrap_url = _bootstrap_url( + mache_version=pinned_mache_version, + mache_fork=getattr(args, 'mache_fork', None), + mache_branch=getattr(args, 'mache_branch', None), + ) + + _download_file(bootstrap_url, BOOTSTRAP_PATH) + + # Make sure it's executable (nice-to-have). We'll still run with + # sys.executable. + _make_executable(BOOTSTRAP_PATH) + + bootstrap_argv = _build_routed_argv(cli_spec, args, route_key='bootstrap') + + software = str(cli_spec.get('meta', {}).get('software', '')).strip() + if not software: + raise SystemExit( + 'ERROR: deploy/cli_spec.json meta.software must be set to the ' + 'target software name.' + ) + # Always include target software name (not user-facing). + bootstrap_argv = [ + '--software', + software, + '--python', + python_version, + ] + bootstrap_argv + + # Only pass a mache version when using a tagged release. If a fork/branch + # is requested, bootstrap must take dependencies from the branch's + # pixi.toml (not from a pinned release). + if not using_fork: + if '--mache-version' not in bootstrap_argv: + bootstrap_argv += ['--mache-version', pinned_mache_version] + + cmd = [sys.executable, BOOTSTRAP_PATH] + bootstrap_argv + subprocess.check_call(cmd) + + if args.bootstrap_only: + pixi_exe = _get_pixi_executable(getattr(args, 'pixi', None)) + bootstrap_dir = os.path.join(DEPLOY_TMP_DIR, 'bootstrap_pixi') + print( + '\nBootstrap environment is ready. To use it interactively:\n' + f' pixi shell -m {bootstrap_dir}/pixi.toml\n\n' + 'Then, you can run:\n' + f' mache deploy update --software {software}\n' + f' exit\n' + ) + + # Now that the bootstrap env exists and has mache installed, run + # deployment. Forward args routed to "mache". 
+ mache_run_argv = _build_routed_argv(cli_spec, args, route_key='run') + + if not args.bootstrap_only: + pixi_exe = _get_pixi_executable(getattr(args, 'pixi', None)) + _run_mache_deploy_run( + pixi_exe=pixi_exe, + repo_root='.', + mache_run_argv=mache_run_argv, + ) + + +def _check_location(): + """Fail fast if not run from repo root.""" + expected = [ + 'deploy.py', + PINS_CFG, + CLI_SPEC_JSON, + ] + missing = [p for p in expected if not os.path.exists(p)] + if missing: + missing_str = '\n - ' + '\n - '.join(missing) + raise SystemExit( + f'ERROR: deploy.py must be run from the root of the target ' + f'software repository.\n' + f'Current location: {os.getcwd()}\n' + f'Missing expected files:{missing_str}' + ) + + +def _read_pins(pins_path): + if not os.path.exists(pins_path): + raise SystemExit(f'ERROR: Required pins file not found: {pins_path}') + + cfg = configparser.ConfigParser(interpolation=None) + try: + with open(pins_path, 'r', encoding='utf-8') as f: + cfg.read_file(f) + except OSError as e: + raise SystemExit(f'ERROR: Failed to read {pins_path}: {e!r}') from e + + section = None + if cfg.has_section('pixi') and cfg.has_option('pixi', 'mache'): + section = 'pixi' + + if section is None: + raise SystemExit(f'ERROR: {pins_path} must contain [pixi] mache') + + mache_version = cfg.get(section, 'mache').strip() + if not mache_version: + raise SystemExit( + f'ERROR: {pins_path} option [{section}] mache is empty' + ) + + python_version = cfg.get(section, 'python').strip() + if not python_version: + raise SystemExit( + f'ERROR: {pins_path} option [{section}] python is empty' + ) + + return mache_version, python_version + + +def _read_cli_spec(spec_path): + if not os.path.exists(spec_path): + raise SystemExit(f'ERROR: Required CLI spec not found: {spec_path}') + + try: + with open(spec_path, 'r', encoding='utf-8') as f: + spec = json.load(f) + except (OSError, json.JSONDecodeError) as e: + raise SystemExit(f'ERROR: Failed to parse {spec_path}: {e!r}') from e + + if 
'meta' not in spec or 'arguments' not in spec: + raise SystemExit( + f"ERROR: {spec_path} must contain top-level keys 'meta' and " + f"'arguments'" + ) + + if 'mache_version' not in spec['meta']: + raise SystemExit( + f"ERROR: {spec_path} meta must include 'mache_version'" + ) + + if not isinstance(spec['arguments'], list): + raise SystemExit(f"ERROR: {spec_path} 'arguments' must be a list") + + return spec + + +def _build_parser_from_cli_spec(cli_spec): + description = cli_spec.get('meta', {}).get( + 'description', 'Deploy E3SM software environment' + ) + parser = argparse.ArgumentParser(description=description) + + for entry, flags in _iter_routed_cli_spec_entries( + cli_spec, route_key='deploy' + ): + # Build kwargs for argparse. Only allow a small, safe subset. + kwargs = {} + for key in ( + 'dest', + 'help', + 'action', + 'default', + 'required', + 'choices', + 'nargs', + ): + if key in entry: + kwargs[key] = entry[key] + + # NOTE: intentionally not supporting arbitrary 'type' here to keep it + # simple/stdlib-only. If you need types later, you can support a + # limited string->callable mapping. + + try: + parser.add_argument(*flags, **kwargs) + except TypeError as e: + raise SystemExit( + f'ERROR: Bad argparse spec for flags {flags}: {e}' + ) from e + + return parser + + +def _iter_routed_cli_spec_entries(cli_spec, route_key): + """Yield (entry, flags) for entries whose route contains route_key. + + This function centralizes CLI-spec validation shared between parser + construction and argv forwarding. 
+ """ + for entry in cli_spec['arguments']: + flags = entry.get('flags') + route = entry.get('route') + + if not isinstance(route, list): + raise SystemExit( + f'ERROR: cli_spec.json argument {entry.get("flags")} has ' + f"invalid 'route'; must be a list" + ) + + if route_key not in route: + continue + + if not flags or not isinstance(flags, list): + raise SystemExit("ERROR: cli_spec.json entry missing 'flags' list") + + yield entry, flags + + +def _validate_fork_branch_pair(args): + fork = getattr(args, 'mache_fork', None) + branch = getattr(args, 'mache_branch', None) + if (fork is None) != (branch is None): + raise SystemExit( + 'ERROR: You must supply both --mache-fork and --mache-branch, or ' + 'neither.' + ) + + +def _validate_cli_spec_matches_pins(cli_spec, pinned_mache_version): + meta_version = str(cli_spec['meta'].get('mache_version', '')).strip() + if not meta_version: + raise SystemExit('ERROR: cli_spec.json meta.mache_version is empty') + + if meta_version != pinned_mache_version: + raise SystemExit( + f'ERROR: Mache version mismatch.\n' + f' deploy/pins.cfg pins mache = {pinned_mache_version}\n' + f' deploy/cli_spec.json meta.mache_version = {meta_version}\n\n' + f'Fix: copy deploy/cli_spec.json from the matching mache version ' + f'into this repo (or update both together).' + ) + + +def _bootstrap_url( + mache_version, + mache_fork=None, + mache_branch=None, +): + if mache_fork is not None and mache_branch is not None: + # Raw file from a fork/branch + return f'https://raw.githubusercontent.com/{mache_fork}/{mache_branch}/{BOOTSTRAP_RELPATH}' # noqa: E501 + + # Raw file from a version tag. Convention: tags are "X.Y.Z". + return f'https://raw.githubusercontent.com/{DEFAULT_MACHE_REPO}/{mache_version}/{BOOTSTRAP_RELPATH}' # noqa: E501 + + +def _download_file(url, dest_path): + # Avoid stale/cached responses from proxies/CDNs (common on HPC networks). 
+ # GitHub raw content supports query strings; adding a cache-buster forces a + # fresh fetch even if an intermediate cache is misbehaving. + effective_url = url + if 'raw.githubusercontent.com' in url: + sep = '&' if '?' in url else '?' + effective_url = f'{url}{sep}_cb={int(time.time())}' + + req = Request( + effective_url, + headers={ + 'User-Agent': 'Mozilla/5.0', + 'Cache-Control': 'no-cache, no-store, max-age=0', + 'Pragma': 'no-cache', + }, + ) + try: + with urlopen(req, timeout=60) as resp: + data = resp.read() + except HTTPError as e: + raise SystemExit( + f'ERROR: Failed to download bootstrap.py (HTTP {e.code}) from ' + f'{effective_url}' + ) from e + except URLError as e: + raise SystemExit( + f'ERROR: Failed to download bootstrap.py from {effective_url}: ' + f'{e.reason}' + ) from e + except Exception as e: + raise SystemExit( + f'ERROR: Unexpected error downloading bootstrap.py from ' + f'{effective_url}: ' + f'{e!r}' + ) from e + + # Basic sanity check: should look like a python script. + first_line = data.splitlines()[0].strip() if data else b'' + if b'python' not in first_line and b'#!/' not in first_line: + raise SystemExit( + f'ERROR: Downloaded bootstrap.py does not look like a python ' + f'script.\n' + f'URL: {effective_url}\n' + f'This may indicate a proxy/redirect issue.' + ) + + try: + with open(dest_path, 'wb') as f: + f.write(data) + except OSError as e: + raise SystemExit(f'ERROR: Failed to write {dest_path}: {e!r}') from e + + +def _make_executable(path): + try: + st = os.stat(path) + os.chmod(path, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + except OSError: + # Not fatal; we run via sys.executable anyway. 
+ pass + + +def _get_pixi_executable(pixi): + if pixi: + pixi = os.path.abspath(os.path.expanduser(pixi)) + if not os.path.exists(pixi): + raise SystemExit(f'ERROR: pixi executable not found: {pixi}') + return pixi + + which = shutil.which('pixi') + if which is not None: + return which + + default_pixi = os.path.join( + os.path.expanduser('~'), '.pixi', 'bin', 'pixi' + ) + if os.path.isfile(default_pixi) and os.access(default_pixi, os.X_OK): + return default_pixi + + raise SystemExit( + 'ERROR: pixi executable not found on PATH or default install ' + 'location (~/.pixi/bin). Install pixi or pass --pixi.' + ) + + +def _build_routed_argv(cli_spec, args, route_key): + """Build forwarded argv from args for entries routed to route_key.""" + argv = [] + for entry, flags in _iter_routed_cli_spec_entries( + cli_spec, route_key=route_key + ): + dest = entry.get('dest') + if not dest: + raise SystemExit( + f"ERROR: cli_spec.json argument {flags} missing 'dest'" + ) + + value = getattr(args, dest, None) + action = entry.get('action') + + # Use the first flag as the canonical one when forwarding. + flag0 = flags[0] + + if action == 'store_true': + if value: + argv.append(flag0) + else: + if value is None: + continue + + # If the argparse entry used `nargs` (or otherwise produced a + # list), expand into repeated tokens: `--flag a b c`. + if isinstance(value, (list, tuple)): + if len(value) == 0: + continue + argv.append(flag0) + argv.extend(str(v) for v in value) + else: + argv.extend([flag0, str(value)]) + + return argv + + +def _run_mache_deploy_run(pixi_exe, repo_root, mache_run_argv): + """ + Run `mache deploy run ...` inside the bootstrap pixi environment. + """ + repo_root = os.path.abspath(repo_root) + + bootstrap_dir = os.path.abspath( + os.path.join(DEPLOY_TMP_DIR, 'bootstrap_pixi') + ) + pixi_toml = os.path.join(bootstrap_dir, 'pixi.toml') + if not os.path.exists(pixi_toml): + raise SystemExit( + f'ERROR: bootstrap pixi project not found. 
Expected: {pixi_toml}' + ) + + # Build a bash command that runs mache inside pixi, then cd's to repo. + mache_cmd = 'mache deploy run' + if mache_run_argv: + mache_cmd = f'{mache_cmd} ' + ' '.join( + shlex.quote(a) for a in mache_run_argv + ) + + cmd = ( + f'env -u PIXI_PROJECT_MANIFEST -u PIXI_PROJECT_ROOT ' + f'{shlex.quote(pixi_exe)} run -m {shlex.quote(pixi_toml)} bash -lc ' + f'{shlex.quote("cd " + repo_root + " && " + mache_cmd)}' + ) + subprocess.check_call(['/bin/bash', '-lc', cmd]) + + +if __name__ == '__main__': + main() diff --git a/deploy/__init__.py b/deploy/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/deploy/albany_supported.txt b/deploy/albany_supported.txt deleted file mode 100644 index 3f1574609f..0000000000 --- a/deploy/albany_supported.txt +++ /dev/null @@ -1,9 +0,0 @@ -# a list of supported machine, compiler and mpi combinations for Albany - -chrysalis, gnu, openmpi -frontier, craygnu, mpich -frontier, craygnu-mphipcc, mpich -frontier, craycray, mpich -frontier, craycray-mphipcc, mpich -pm-cpu, gnu, mpich -morpheus, gnu, openmpi diff --git a/deploy/bootstrap.py b/deploy/bootstrap.py deleted file mode 100755 index b69c089f67..0000000000 --- a/deploy/bootstrap.py +++ /dev/null @@ -1,1522 +0,0 @@ -#!/usr/bin/env python3 - -import glob -import grp -import importlib.resources -import os -import platform -import shutil -import socket -import stat -import subprocess -import time -from configparser import ConfigParser, ExtendedInterpolation -from typing import Dict - -import progressbar -from jinja2 import Template -from mache import MachineInfo -from mache import discover_machine as mache_discover_machine -from mache.spack import get_spack_script, make_spack_env -from packaging import version -from shared import ( - check_call, - get_conda_base, - get_logger, - install_miniforge, - parse_args, -) - - -def main(): # noqa: C901 - """ - Entry point for bootstrap - """ - - args = parse_args(bootstrap=True) - options = 
vars(args) - - if options['verbose']: - options['logger'] = None - else: - options['logger'] = get_logger( - log_filename='deploy_tmp/logs/bootstrap.log', name=__name__ - ) - - source_path = os.getcwd() - options['source_path'] = source_path - options['conda_template_path'] = f'{source_path}/deploy' - options['spack_template_path'] = f'{source_path}/deploy/spack' - - polaris_version = _get_version() - options['polaris_version'] = polaris_version - - options['local_mache'] = ( - options['mache_fork'] is not None - and options['mache_branch'] is not None - ) - - machine = None - if not options['conda_env_only']: - if options['machine'] is None: - machine = _discover_machine() - else: - machine = options['machine'] - - options['known_machine'] = machine is not None - - if machine is None and not options['conda_env_only']: - if platform.system() == 'Linux': - machine = 'conda-linux' - elif platform.system() == 'Darwin': - machine = 'conda-osx' - - options['machine'] = machine - options['config'] = _get_config(options['config_file'], machine) - - env_type = options['config'].get('deploy', 'env_type') - if env_type not in ['dev', 'test_release', 'release']: - raise ValueError(f'Unexpected env_type: {env_type}') - shared = env_type != 'dev' - conda_base = get_conda_base( - options['conda_base'], options['config'], shared=shared, warn=False - ) - options['env_type'] = env_type - conda_base = os.path.abspath(conda_base) - options['conda_base'] = conda_base - - source_activation_scripts = f'source {conda_base}/etc/profile.d/conda.sh' - - activate_base = f'{source_activation_scripts} && conda activate' - - if machine is None: - compilers = [None] - mpis = ['nompi'] - else: - _get_compilers_mpis(options) - - compilers = options['compilers'] - mpis = options['mpis'] - - # write out a log file for use by matrix builds - with open('deploy_tmp/logs/matrix.log', 'w') as f: - f.write(f'{machine}\n') - for compiler, mpi in zip(compilers, mpis, strict=False): - f.write(f'{compiler}, 
{mpi}\n') - - print( - 'Configuring environment(s) for the following compilers and MPI ' - 'libraries:' - ) - for compiler, mpi in zip(compilers, mpis, strict=False): - print(f' {compiler}, {mpi}') - print('') - - previous_conda_env = None - - permissions_dirs = [] - activ_path = None - - soft_spack_view = _build_spack_soft_env(options) - - for compiler, mpi in zip(compilers, mpis, strict=False): - _get_env_setup(options, compiler, mpi) - - build_dir = f'deploy_tmp/build{options["activ_suffix"]}' - - _safe_rmtree(build_dir) - os.makedirs(name=build_dir, exist_ok=True) - - os.chdir(build_dir) - - if options['spack_base'] is not None: - spack_base = options['spack_base'] - elif options['known_machine'] and compiler is not None: - _get_spack_base(options) - else: - spack_base = None - - if spack_base is not None and options['update_spack']: - # even if this is not a release, we need to update permissions on - # shared system libraries - permissions_dirs.append(spack_base) - - conda_env_name = options['conda_env_name'] - if previous_conda_env != conda_env_name: - _build_conda_env(options, activate_base) - - if options['local_mache']: - print('Install local mache\n') - commands = ( - f'source {conda_base}/etc/profile.d/conda.sh && ' - f'conda activate {conda_env_name} && ' - f'cd ../build_mache/mache && ' - f'conda install -y --file spec-file.txt && ' - f'python -m pip install --no-deps --no-build-isolation .' 
- ) - check_call(commands, logger=options['logger']) - - previous_conda_env = conda_env_name - - if env_type != 'dev': - permissions_dirs.append(conda_base) - - spack_script = '' - if compiler is not None: - env_vars = _get_env_vars(options['machine'], compiler, mpi) - if spack_base is not None: - spack_script, env_vars = _build_spack_libs_env( - options, compiler, mpi, env_vars - ) - - spack_script = ( - f'echo Loading Spack environment...\n' - f'{spack_script}\n' - f'echo Done.\n' - f'echo\n' - ) - else: - conda_env_path = options['conda_env_path'] - env_vars = ( - f'{env_vars}' - f'export PIO={conda_env_path}\n' - f'export OPENMP_INCLUDE=-I"{conda_env_path}/include"\n' - ) - - if soft_spack_view is not None: - env_vars = ( - f'{env_vars}export PATH="{soft_spack_view}/bin:$PATH"\n' - ) - elif options['known_machine']: - raise ValueError( - 'A software compiler or a spack base was not ' - 'defined so required software was not ' - 'installed with spack.' - ) - - else: - env_vars = '' - - if env_type == 'dev': - if conda_env_name is not None: - prefix = f'load_{conda_env_name}' - else: - prefix = f'load_dev_polaris_{polaris_version}' - elif env_type == 'test_release': - prefix = f'test_polaris_{polaris_version}' - else: - prefix = f'load_polaris_{polaris_version}' - - script_filename = _write_load_polaris( - options, prefix, spack_script, env_vars - ) - - if options['check']: - _check_env(options, script_filename, conda_env_name) - - if env_type == 'release' and not ( - options['with_albany'] or options['with_petsc'] - ): - activ_path = options['activ_path'] - # make a symlink to the activation script - link = os.path.join( - activ_path, f'load_latest_polaris_{compiler}_{mpi}.sh' - ) - check_call(f'ln -sfn {script_filename} {link}') - - default_compiler = options['config'].get('deploy', 'compiler') - default_mpi = options['config'].get( - 'deploy', f'mpi_{default_compiler}' - ) - if compiler == default_compiler and mpi == default_mpi: - # make a default symlink to 
the activation script - link = os.path.join(activ_path, 'load_latest_polaris.sh') - check_call(f'ln -sfn {script_filename} {link}') - os.chdir(options['source_path']) - - commands = f'{activate_base} && conda clean -y -p -t' - check_call(commands, logger=options['logger']) - - if options['update_spack'] or env_type != 'dev': - # we need to update permissions on shared stuff - _update_permissions(options, permissions_dirs) - - -def _get_spack_base(options): - """ - Get the absolute path to the spack base files - """ - - config = options['config'] - spack_base = options['spack_base'] - if spack_base is None: - if config.has_option('deploy', 'spack'): - spack_base = config.get('deploy', 'spack') - else: - raise ValueError( - 'No spack base provided with --spack and none is ' - 'provided in a config file.' - ) - # handle "~" in the path - options['spack_base'] = os.path.abspath(os.path.expanduser(spack_base)) - - -def _get_config(config_file, machine): - """ - Read in the options from the config file and return the config object - """ - - # we can't load polaris so we find the config files - here = os.path.abspath(os.path.dirname(__file__)) - default_config = os.path.join(here, 'default.cfg') - config = ConfigParser(interpolation=ExtendedInterpolation()) - config.read(default_config) - - if machine is not None: - machine_config = str( - importlib.resources.files('mache.machines') / f'{machine}.cfg' - ) - # it's okay if a given machine isn't part of mache - if os.path.exists(machine_config): - config.read(machine_config) - - machine_config = os.path.join( - here, '..', 'polaris', 'machines', f'{machine}.cfg' - ) - if not os.path.exists(machine_config): - raise FileNotFoundError( - f'Could not find a config file for this machine at ' - f'polaris/machines/{machine}.cfg' - ) - - config.read(machine_config) - - if config_file is not None: - config.read(config_file) - - return config - - -def _get_version(): - """ - Get the Polaris version by parsing the version file - """ - 
- # we can't import polaris because we probably don't have the necessary - # dependencies, so we get the version by parsing (same approach used in - # the root setup.py) - here = os.path.abspath(os.path.dirname(__file__)) - version_path = os.path.join(here, '..', 'polaris', 'version.py') - with open(version_path) as f: - main_ns: Dict[str, str] = dict() - exec(f.read(), main_ns) - version = main_ns['__version__'] - - return version - - -def _get_compilers_mpis(options): # noqa: C901 - """ - Get the compilers and MPI variants from the config object - """ - - compilers = options['compilers'] - mpis = options['mpis'] - config = options['config'] - machine = options['machine'] - source_path = options['source_path'] - - unsupported = _parse_unsupported(machine, source_path) - if machine == 'conda-linux': - all_compilers = ['gfortran'] - all_mpis = ['mpich', 'openmpi'] - elif machine == 'conda-osx': - all_compilers = ['clang'] - all_mpis = ['mpich', 'openmpi'] - else: - machine_info = MachineInfo(machine) - all_compilers = machine_info.compilers - all_mpis = machine_info.mpilibs - - if not config.has_option('deploy', 'compiler'): - raise ValueError( - f'Machine config file for {machine} is missing a default compiler.' 
- ) - default_compiler = config.get('deploy', 'compiler') - - error_on_unsupported = True - - if compilers is not None and compilers[0] == 'all': - error_on_unsupported = False - if mpis is not None and mpis[0] == 'all': - # make a matrix of compilers and mpis - compilers = list() - mpis = list() - for compiler in all_compilers: - for mpi in all_mpis: - compilers.append(compiler) - mpis.append(mpi) - else: - compilers = all_compilers - if mpis is not None: - if len(mpis) > 1: - raise ValueError( - f'"--compiler all" can only be combined ' - f'with "--mpi all" or a single MPI ' - f'library, \n' - f'but got: {mpis}' - ) - mpi = mpis[0] - mpis = [mpi for _ in compilers] - - elif mpis is not None and mpis[0] == 'all': - error_on_unsupported = False - mpis = all_mpis - if compilers is None: - compiler = default_compiler - else: - if len(compilers) > 1: - raise ValueError( - f'"--mpis all" can only be combined with ' - f'"--compiler all" or a single compiler, \n' - f'but got: {compilers}' - ) - compiler = compilers[0] - # The compiler is all the same - compilers = [compiler for _ in mpis] - - if compilers is None: - compilers = [config.get('deploy', 'compiler')] - - if mpis is None: - mpis = list() - for compiler in compilers: - option = f'mpi_{compiler.replace("-", "_")}' - if not config.has_option('deploy', option): - raise ValueError( - f'Machine config file for {machine} is ' - f'missing [deploy]/{option}, the default MPI ' - f'library for the requested compiler.' 
- ) - mpi = config.get('deploy', option) - mpis.append(mpi) - - supported_compilers = list() - supported_mpis = list() - for compiler, mpi in zip(compilers, mpis, strict=False): - if (compiler, mpi) in unsupported: - if error_on_unsupported: - raise ValueError( - f'{compiler} with {mpi} is not supported on {machine}' - ) - else: - supported_compilers.append(compiler) - supported_mpis.append(mpi) - - options['compilers'] = supported_compilers - options['mpis'] = supported_mpis - - -def _get_env_setup(options, compiler, mpi): - """ - Setup the options for the environment for the given compiler and MPI - variant - """ - - conda_env_name = options['conda_env_name'] - env_type = options['env_type'] - source_path = options['source_path'] - config = options['config'] - logger = options['logger'] - machine = options['machine'] - polaris_version = options['polaris_version'] - conda_base = options['conda_base'] - - if options['python'] is not None: - python = options['python'] - else: - python = config.get('deploy', 'python') - - if options['recreate'] is not None: - recreate = options['recreate'] - else: - recreate = config.getboolean('deploy', 'recreate') - - if machine is None: - conda_mpi = 'nompi' - activ_suffix = '' - env_suffix = '' - elif not machine.startswith('conda'): - conda_mpi = 'nompi' - activ_suffix = f'_{machine}_{compiler}_{mpi}' - env_suffix = '' - else: - activ_suffix = f'_{mpi}' - env_suffix = activ_suffix - conda_mpi = mpi - - lib_suffix = '' - if options['with_albany']: - lib_suffix = f'{lib_suffix}_albany' - else: - config.set('deploy', 'albany', 'None') - - if options['with_petsc']: - lib_suffix = f'{lib_suffix}_petsc' - logger.info( - "Turning off OpenMP because it doesn't work well with PETSc" - ) - options['without_openmp'] = True - else: - config.set('deploy', 'petsc', 'None') - config.set('deploy', 'lapack', 'None') - - activ_suffix = f'{activ_suffix}{lib_suffix}' - - if env_type == 'dev': - activ_path = source_path - else: - activ_path = 
os.path.abspath(os.path.join(conda_base, '..')) - - if options['with_albany']: - _check_supported('albany', machine, compiler, mpi, source_path) - - if options['with_petsc']: - _check_supported('petsc', machine, compiler, mpi, source_path) - - if env_type == 'dev': - ver = version.parse(polaris_version) - release_version = '.'.join(str(vr) for vr in ver.release) - spack_env = f'dev_polaris_{release_version}{env_suffix}' - conda_env = f'dev_polaris_{polaris_version}{env_suffix}' - elif env_type == 'test_release': - spack_env = f'test_polaris_{polaris_version}{env_suffix}' - conda_env = spack_env - else: - spack_env = f'polaris_{polaris_version}{env_suffix}' - conda_env = spack_env - - if conda_env_name is None or env_type != 'dev': - conda_env_name = conda_env - - # add the compiler and MPI library to the spack env name - spack_env = f'{spack_env}_{compiler}_{mpi}{lib_suffix}' - # spack doesn't like dots - spack_env = spack_env.replace('.', '_') - - conda_env_path = os.path.join(conda_base, 'envs', conda_env_name) - - source_activation_scripts = f'source {conda_base}/etc/profile.d/conda.sh' - - activate_env = ( - f'{source_activation_scripts} && conda activate {conda_env_name}' - ) - - options['conda_env_name'] = conda_env_name - options['python'] = python - options['recreate'] = recreate - options['conda_mpi'] = conda_mpi - options['activ_suffix'] = activ_suffix - options['env_suffix'] = env_suffix - options['activ_path'] = activ_path - options['conda_env_path'] = conda_env_path - options['activate_env'] = activate_env - options['spack_env'] = spack_env - - -def _build_conda_env(options, activate_base): - """ - Build the conda environment - """ - - config = options['config'] - logger = options['logger'] - env_type = options['env_type'] - conda_env_name = options['conda_env_name'] - source_path = options['source_path'] - use_local = options['use_local'] - local_conda_build = options['local_conda_build'] - update_jigsaw = options['update_jigsaw'] - 
conda_template_path = options['conda_template_path'] - version = options['polaris_version'] - local_mache = options['local_mache'] - conda_base = options['conda_base'] - conda_mpi = options['conda_mpi'] - python = options['python'] - conda_env_path = options['conda_env_path'] - recreate = options['recreate'] - - if env_type != 'dev': - install_miniforge(conda_base, activate_base, logger) - - if conda_mpi == 'nompi': - mpi_prefix = 'nompi' - else: - mpi_prefix = f'mpi_{conda_mpi}' - - channel_list = ['-c conda-forge'] - if use_local: - channel_list = ['--use-local'] + channel_list - if local_conda_build is not None: - channel_list = ['-c', local_conda_build] + channel_list - if env_type == 'test_release': - # for a test release, we will be the polaris package from the dev label - channel_list = channel_list + ['-c e3sm/label/polaris_dev'] - channel_list = channel_list + ['-c e3sm/label/polaris'] - - channels = f'--override-channels {" ".join(channel_list)}' - packages = f'python={python}' - - base_activation_script = os.path.abspath( - f'{conda_base}/etc/profile.d/conda.sh' - ) - - activate_env = ( - f'source {base_activation_script} && conda activate {conda_env_name}' - ) - - with open(f'{conda_template_path}/conda-dev-spec.template', 'r') as f: - template = Template(f.read()) - - if env_type == 'dev': - supports_otps = platform.system() == 'Linux' - if platform.system() == 'Linux': - conda_openmp = 'libgomp' - elif platform.system() == 'Darwin': - conda_openmp = 'llvm-openmp' - else: - conda_openmp = '' - - replacements = dict( - supports_otps=supports_otps, - mpi=conda_mpi, - openmp=conda_openmp, - mpi_prefix=mpi_prefix, - include_mache=not local_mache, - ) - - for package in [ - 'esmf', - 'geometric_features', - 'mache', - 'metis', - 'mpas_tools', - 'netcdf_c', - 'netcdf_fortran', - 'otps', - 'parallelio', - 'pnetcdf', - ]: - replacements[package] = config.get('deploy', package) - - replacements['moab'] = config.get('deploy', 'conda_moab') - - spec_file = 
template.render(**replacements) - - spec_filename = f'spec-file-{conda_mpi}.txt' - with open(spec_filename, 'w') as handle: - handle.write(spec_file) - else: - spec_filename = None - - if not os.path.exists(conda_env_path): - recreate = True - - if recreate: - print(f'creating {conda_env_name}') - if env_type == 'dev': - # install dev dependencies and polaris itself - commands = ( - f'{activate_base} && ' - f'conda create -y -n {conda_env_name} {channels} ' - f'--file {spec_filename} {packages}' - ) - check_call(commands, logger=logger) - else: - # conda packages don't like dashes - version_conda = version.replace('-', '') - packages = f'{packages} "polaris={version_conda}={mpi_prefix}_*"' - commands = ( - f'{activate_base} && ' - f'conda create -y -n {conda_env_name}' - f'{channels} {packages}' - ) - check_call(commands, logger=logger) - else: - if env_type == 'dev': - print(f'Updating {conda_env_name}\n') - # install dev dependencies and polaris itself - commands = ( - f'{activate_base} && ' - f'conda install -y -n {conda_env_name} {channels} ' - f'--file {spec_filename} {packages}' - ) - check_call(commands, logger=logger) - else: - print(f'{conda_env_name} already exists') - - if env_type == 'dev': - if recreate or update_jigsaw: - _build_jigsaw(options, activate_env, source_path, conda_env_path) - - # install (or reinstall) polaris in edit mode - print('Installing polaris\n') - commands = ( - f'{activate_env} && ' - f'cd {source_path} && ' - f'rm -rf polaris.egg-info && ' - f'python -m pip install --no-deps --no-build-isolation -e .' 
- ) - check_call(commands, logger=logger) - - print('Installing pre-commit\n') - commands = f'{activate_env} && cd {source_path} && pre-commit install' - check_call(commands, logger=logger) - - -def _build_jigsaw(options, activate_env, source_path, conda_env_path): - """ - Build the JIGSAW and JIGSAW-Python tools using conda-forge compilers - """ - - logger = options['logger'] - conda_base = options['conda_base'] - - # remove conda jigsaw and jigsaw-python - t0 = time.time() - commands = ( - f'{activate_env} && conda remove -y --force-remove jigsaw jigsawpy' - ) - try: - check_call(commands, logger=logger) - except subprocess.CalledProcessError: - # this is fine, we just want to make sure these package are removed if - # present - pass - - commands = ( - f'{activate_env} && ' - f'cd {source_path} && ' - f'git submodule update --init jigsaw-python' - ) - check_call(commands, logger=logger) - - print('Building JIGSAW\n') - # add build tools to deployment env, not polaris env - jigsaw_build_deps = 'cxx-compiler cmake make' - if platform.system() == 'Linux': - jigsaw_build_deps = f'{jigsaw_build_deps} sysroot_linux-64=2.17' - netcdf_lib = f'{conda_env_path}/lib/libnetcdf.so' - elif platform.system() == 'Darwin': - jigsaw_build_deps = ( - f'{jigsaw_build_deps} macosx_deployment_target_osx-64=10.13' - ) - netcdf_lib = f'{conda_env_path}/lib/libnetcdf.dylib' - cmake_args = f'-DCMAKE_BUILD_TYPE=Release -DNETCDF_LIBRARY={netcdf_lib}' - - commands = ( - f'source {conda_base}/etc/profile.d/conda.sh && ' - f'conda activate polaris_bootstrap && ' - f'conda install -y {jigsaw_build_deps} && ' - f'cd {source_path}/jigsaw-python/external/jigsaw && ' - f'rm -rf tmp && ' - f'mkdir tmp && ' - f'cd tmp && ' - f'cmake .. {cmake_args} && ' - f'cmake --build . 
--config Release --target install --parallel 4 && ' - f'cd {source_path}/jigsaw-python && ' - f'rm -rf jigsawpy/_bin jigsawpy/_lib && ' - f'cp -r external/jigsaw/bin/ jigsawpy/_bin && ' - f'cp -r external/jigsaw/lib/ jigsawpy/_lib' - ) - - # need a clean environment on Aurora because of its gcc module and - # should do no harm on other machines - clean_env = { - 'HOME': os.environ['HOME'], - 'TERM': os.environ.get('TERM', 'xterm'), - } - - check_call(commands, env=clean_env, logger=logger) - - print('Installing JIGSAW and JIGSAW-Python\n') - commands = ( - f'{activate_env} && ' - f'cd {source_path}/jigsaw-python && ' - f'python -m pip install --no-deps --no-build-isolation -e . && ' - f'cp jigsawpy/_bin/* ${{CONDA_PREFIX}}/bin' - ) - check_call(commands, logger=logger) - - t1 = time.time() - total = int(t1 - t0 + 0.5) - message = f'JIGSAW install took {total:.1f} s.' - if logger is None: - print(message) - else: - logger.info(message) - - -def _get_env_vars(machine, compiler, mpi): - """ - Get the environment variables for the given machine, compiler, and MPI - variant - """ - - if machine is None: - machine = 'None' - - env_vars = ( - f'export POLARIS_COMPILER={compiler}\nexport POLARIS_MPI={mpi}\n' - ) - - env_vars = f'{env_vars}export MPAS_EXTERNAL_LIBS=""\n' - - if machine.startswith('conda'): - # we're using parallelio so we don't have ADIOS support - env_vars = f'{env_vars}export HAVE_ADIOS=false\n' - - if platform.system() == 'Linux' and machine.startswith('conda'): - env_vars = ( - f'{env_vars}' - f'export MPAS_EXTERNAL_LIBS="${{MPAS_EXTERNAL_LIBS}} -lgomp"\n' - ) - - if mpi == 'mvapich': - env_vars = ( - f'{env_vars}' - f'export MV2_ENABLE_AFFINITY=0\n' - f'export MV2_SHOW_CPU_BINDING=1\n' - ) - - if machine.startswith('chicoma') or machine.startswith('pm'): - env_vars = ( - f'{env_vars}' - f'export NETCDF=${{CRAY_NETCDF_HDF5PARALLEL_PREFIX}}\n' - f'export NETCDFF=${{CRAY_NETCDF_HDF5PARALLEL_PREFIX}}\n' - f'export 
PNETCDF=${{CRAY_PARALLEL_NETCDF_PREFIX}}\n' - ) - else: - env_vars = ( - f'{env_vars}' - f'export NETCDF=$(dirname $(dirname $(which nc-config)))\n' - f'export NETCDFF=$(dirname $(dirname $(which nf-config)))\n' - f'export PNETCDF=$(dirname $(dirname $(which pnetcdf-config)))\n' - ) - - return env_vars - - -def _build_spack_soft_env(options): # noqa: C901 - """ - Build the software spack environment - """ - - update_spack = options['update_spack'] - spack_template_path = options['spack_template_path'] - tmpdir = options['tmpdir'] - config = options['config'] - machine = options['machine'] - env_type = options['env_type'] - polaris_version = options['polaris_version'] - - if not config.has_option('deploy', 'software_compiler'): - return None - - compiler = config.get('deploy', 'software_compiler') - mpi_option = f'mpi_{compiler.replace("-", "_")}' - if not config.has_option('deploy', mpi_option): - raise ValueError( - f'Machine config file for {machine} is missing ' - f'{mpi_option}, the MPI library for the software ' - f'compiler.' 
- ) - mpi = config.get('deploy', mpi_option) - - if machine is not None: - _get_spack_base(options) - - spack_base = options['spack_base'] - - if spack_base is None: - return None - - if env_type == 'dev': - ver = version.parse(polaris_version) - release_version = '.'.join(str(vr) for vr in ver.release) - spack_env = f'dev_polaris_soft_{release_version}' - elif env_type == 'test_release': - spack_env = f'test_polaris_soft_{polaris_version}' - else: - spack_env = f'polaris_soft_{polaris_version}' - - spack_env = spack_env.replace('.', '_') - - build_dir = f'deploy_tmp/build_soft_{machine}' - - _safe_rmtree(build_dir) - os.makedirs(name=build_dir, exist_ok=True) - - os.chdir(build_dir) - - esmf = config.get('deploy', 'esmf') - moab = config.get('deploy', 'spack_moab') - - if config.has_option('deploy', 'spack_mirror'): - spack_mirror = config.get('deploy', 'spack_mirror') - else: - spack_mirror = None - - spack_branch_base = f'{spack_base}/{spack_env}' - - specs = list() - - e3sm_hdf5_netcdf = config.getboolean('deploy', 'use_e3sm_hdf5_netcdf') - if not e3sm_hdf5_netcdf: - hdf5 = config.get('deploy', 'hdf5') - netcdf_c = config.get('deploy', 'netcdf_c') - netcdf_fortran = config.get('deploy', 'netcdf_fortran') - specs.extend( - [ - f'hdf5@{hdf5}+cxx+fortran+hl+mpi+shared', - f'netcdf-c@{netcdf_c}+mpi~parallel-netcdf', - f'netcdf-fortran@{netcdf_fortran}', - ] - ) - - if esmf != 'None': - specs.append(f'esmf@{esmf}+mpi+netcdf~pnetcdf~external-parallelio') - - if moab != 'None': - specs.append( - f'moab@{moab}+eigen+fortran+hdf5+mpi+netcdf+pnetcdf+zoltan+tempest' - ) - - yaml_template: str | None = None - template_path = f'{spack_template_path}/{machine}_{compiler}_{mpi}.yaml' - if os.path.exists(template_path): - yaml_template = template_path - - if machine is not None: - here = os.path.abspath(os.path.dirname(__file__)) - machine_config = os.path.join( - here, '..', 'polaris', 'machines', f'{machine}.cfg' - ) - else: - machine_config = None - - if update_spack: - 
make_spack_env( - spack_path=spack_branch_base, - env_name=spack_env, - spack_specs=specs, - compiler=compiler, - mpi=mpi, - machine=machine, - config_file=machine_config, - include_e3sm_hdf5_netcdf=e3sm_hdf5_netcdf, - yaml_template=yaml_template, - tmpdir=tmpdir, - spack_mirror=spack_mirror, - ) - - spack_view = ( - f'{spack_branch_base}/var/spack/environments/' - f'{spack_env}/.spack-env/view' - ) - - os.chdir(options['source_path']) - - return spack_view - - -def _build_spack_libs_env(options, compiler, mpi, env_vars): # noqa: C901 - """ - Build the library spack environment - """ - - config = options['config'] - machine = options['machine'] - update_spack = options['update_spack'] - spack_base = options['spack_base'] - tmpdir = options['tmpdir'] - spack_template_path = options['spack_template_path'] - spack_env = options['spack_env'] - - albany = config.get('deploy', 'albany') - cmake = config.get('deploy', 'cmake') - lapack = config.get('deploy', 'lapack') - metis = config.get('deploy', 'metis') - parmetis = config.get('deploy', 'parmetis') - petsc = config.get('deploy', 'petsc') - scorpio = config.get('deploy', 'scorpio') - - spack_branch_base = f'{spack_base}/{spack_env}' - - specs = list() - - if cmake != 'None': - specs.append(f'cmake@{cmake}') - - e3sm_hdf5_netcdf = config.getboolean('deploy', 'use_e3sm_hdf5_netcdf') - if not e3sm_hdf5_netcdf: - hdf5 = config.get('deploy', 'hdf5') - netcdf_c = config.get('deploy', 'netcdf_c') - netcdf_fortran = config.get('deploy', 'netcdf_fortran') - pnetcdf = config.get('deploy', 'pnetcdf') - specs.extend( - [ - f'hdf5@{hdf5}+cxx+fortran+hl+mpi+shared', - f'netcdf-c@{netcdf_c}+mpi~parallel-netcdf', - f'netcdf-fortran@{netcdf_fortran}', - f'parallel-netcdf@{pnetcdf}+cxx+fortran', - ] - ) - - if lapack != 'None': - specs.append(f'netlib-lapack@{lapack}') - include_e3sm_lapack = False - else: - include_e3sm_lapack = True - if metis != 'None': - specs.append(f'metis@{metis}+int64+real64~shared') - if parmetis != 'None': - 
specs.append(f'parmetis@{parmetis}+int64~shared') - if petsc != 'None': - specs.append(f'petsc@{petsc}+mpi+batch') - - if scorpio != 'None': - specs.append( - f'e3sm-scorpio@{scorpio}+mpi~timing~internal-timing~tools+malloc' - ) - - if albany != 'None': - specs.append(f'albany@{albany}+mpas') - - yaml_template: str | None = None - template_path = f'{spack_template_path}/{machine}_{compiler}_{mpi}.yaml' - if os.path.exists(template_path): - yaml_template = template_path - - if machine is not None: - here = os.path.abspath(os.path.dirname(__file__)) - machine_config = os.path.join( - here, '..', 'polaris', 'machines', f'{machine}.cfg' - ) - else: - machine_config = None - - if update_spack: - make_spack_env( - spack_path=spack_branch_base, - env_name=spack_env, - spack_specs=specs, - compiler=compiler, - mpi=mpi, - machine=machine, - config_file=machine_config, - include_e3sm_lapack=include_e3sm_lapack, - include_e3sm_hdf5_netcdf=e3sm_hdf5_netcdf, - yaml_template=yaml_template, - tmpdir=tmpdir, - ) - - _set_ld_library_path(options, spack_branch_base, spack_env) - - spack_script = get_spack_script( - spack_path=spack_branch_base, - env_name=spack_env, - compiler=compiler, - mpi=mpi, - shell='sh', - machine=machine, - include_e3sm_lapack=include_e3sm_lapack, - include_e3sm_hdf5_netcdf=e3sm_hdf5_netcdf, - ) - - spack_view = ( - f'{spack_branch_base}/var/spack/environments/' - f'{spack_env}/.spack-env/view' - ) - env_vars = f'{env_vars}export PIO={spack_view}\n' - if albany != 'None': - albany_flag_filename = f'{spack_view}/export_albany.in' - if not os.path.exists(albany_flag_filename): - raise ValueError( - f'Missing Albany linking flags in ' - f'{albany_flag_filename}.\n Maybe your Spack ' - f'environment may need to be rebuilt with ' - f'Albany?' 
- ) - with open(albany_flag_filename, 'r') as f: - albany_flags = f.read() - if platform.system() == 'Darwin': - stdcxx = '-lc++' - else: - stdcxx = '-lstdc++' - if mpi == 'openmpi' and machine in ['chrysalis']: - mpicxx = '-lmpi_cxx' - else: - mpicxx = '' - env_vars = ( - f'{env_vars}' - f'export {albany_flags}\n' - f'export MPAS_EXTERNAL_LIBS="${{MPAS_EXTERNAL_LIBS}} ' - f'${{ALBANY_LINK_LIBS}} {stdcxx} {mpicxx}"\n' - ) - - if lapack != 'None': - env_vars = ( - f'{env_vars}export LAPACK={spack_view}\nexport USE_LAPACK=true\n' - ) - - if metis != 'None': - env_vars = f'{env_vars}export METIS_ROOT={spack_view}\n' - - if parmetis != 'None': - env_vars = f'{env_vars}export PARMETIS_ROOT={spack_view}\n' - - if petsc != 'None': - env_vars = ( - f'{env_vars}export PETSC={spack_view}\nexport USE_PETSC=true\n' - ) - - return spack_script, env_vars - - -def _set_ld_library_path(options, spack_branch_base, spack_env): - """ - Set the ``LD_LIBRARY_PATH environment variable for the given spack branch - and environment - """ - - commands = ( - f'source {spack_branch_base}/share/spack/setup-env.sh && ' - f'spack env activate {spack_env} && ' - f'spack config add modules:prefix_inspections:lib:[LD_LIBRARY_PATH] && ' # noqa: E501 - f'spack config add modules:prefix_inspections:lib64:[LD_LIBRARY_PATH]' - ) - check_call(commands, logger=options['logger']) - - -def _write_load_polaris(options, prefix, spack_script, env_vars): - """ - Write the Polaris load (activation) script - """ - - env_type = options['env_type'] - conda_env_name = options['conda_env_name'] - source_path = options['source_path'] - machine = options['machine'] - conda_env_only = options['conda_env_only'] - without_openmp = options['without_openmp'] - template_path = options['conda_template_path'] - polaris_version = options['polaris_version'] - conda_base = options['conda_base'] - activ_path = options['activ_path'] - activ_suffix = options['activ_suffix'] - - os.makedirs(name=activ_path, exist_ok=True) - - if 
prefix.endswith(activ_suffix): - # avoid a redundant activation script name if the suffix is already - # part of the environment name - script_filename = f'{activ_path}/{prefix}.sh' - else: - script_filename = f'{activ_path}/{prefix}{activ_suffix}.sh' - - if not conda_env_only: - env_vars = f'{env_vars}\nexport USE_PIO2=true' - if without_openmp: - env_vars = f'{env_vars}\nexport OPENMP=false' - else: - env_vars = f'{env_vars}\nexport OPENMP=true' - - env_vars = ( - f'{env_vars}\n' - f'export HDF5_USE_FILE_LOCKING=FALSE\n' - f'export LOAD_POLARIS_ENV={script_filename}' - ) - if machine is not None and not machine.startswith('conda'): - env_vars = f'{env_vars}\nexport POLARIS_MACHINE={machine}' - - filename = f'{template_path}/load_polaris.template' - with open(filename, 'r') as f: - template = Template(f.read()) - - if env_type == 'dev': - update_polaris = """ - if [[ -z "${NO_POLARIS_REINSTALL}" && -f "./pyproject.toml" && \\ - -d "polaris" ]]; then - # safe to assume we're in the polaris repo - # update the polaris installation to point here - mkdir -p deploy_tmp/logs - echo Reinstalling polaris package in edit mode... - python -m pip install --no-deps --no-build-isolation -e . \\ - &> deploy_tmp/logs/install_polaris.log - echo Done. 
- echo - fi - """ # noqa: E501 - else: - update_polaris = '' - - script = template.render( - conda_base=conda_base, - polaris_env=conda_env_name, - env_vars=env_vars, - spack=spack_script, - update_polaris=update_polaris, - env_type=env_type, - polaris_source_path=source_path, - polaris_version=polaris_version, - ) - - # strip out redundant blank lines - lines = list() - prev_line = '' - for line in script.split('\n'): - line = line.strip() - if line != '' or prev_line != '': - lines.append(line) - prev_line = line - - lines.append('') - - script = '\n'.join(lines) - - print(f'Writing:\n {script_filename}\n') - with open(script_filename, 'w') as handle: - handle.write(script) - - return script_filename - - -def _check_env(options, script_filename, conda_env_name): - """ - Check that polaris has been installed correctly - """ - - logger = options['logger'] - print(f'Checking the environment {conda_env_name}') - - activate = f'source {script_filename}' - - imports = ['geometric_features', 'mpas_tools', 'jigsawpy', 'polaris'] - commands = [ - ['gpmetis', '--help'], - ['ffmpeg', '--help'], - ['polaris', 'list'], - ['polaris', 'setup', '--help'], - ['polaris', 'suite', '--help'], - ] - - for import_name in imports: - command = f'{activate} && python -c "import {import_name}"' - _test_command(command, os.environ, import_name, logger) - - for command_list in commands: - package = command_list[0] - command = f'{activate} && {" ".join(command_list)}' - _test_command(command, os.environ, package, logger) - - -def _test_command(command, env, package, logger): - """ - Test package commands and print status of each command to logger - """ - - try: - check_call(command, env=env, logger=logger) - except subprocess.CalledProcessError as e: - print(f' {package} failed') - raise e - print(f' {package} passes') - - -def _update_permissions(options, directories): # noqa: C901 - """ - Update permissions in given directories - """ - - config = options['config'] - env_type = 
options['env_type'] - activ_path = options['activ_path'] - - if not config.has_option('e3sm_unified', 'group'): - return - - group = config.get('e3sm_unified', 'group') - - new_uid = os.getuid() - new_gid = grp.getgrnam(group).gr_gid - - print('changing permissions on activation scripts') - - read_perm = ( - stat.S_IRUSR - | stat.S_IWUSR - | stat.S_IRGRP - | stat.S_IWGRP - | stat.S_IROTH - ) - exec_perm = ( - stat.S_IRUSR - | stat.S_IWUSR - | stat.S_IXUSR - | stat.S_IRGRP - | stat.S_IWGRP - | stat.S_IXGRP - | stat.S_IROTH - | stat.S_IXOTH - ) - - mask = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO - - if env_type != 'dev': - activation_files = glob.glob(f'{activ_path}/*_polaris*.sh') - for file_name in activation_files: - os.chmod(file_name, read_perm) - os.chown(file_name, new_uid, new_gid) - - print('changing permissions on environments') - - # first the base directories that don't seem to be included in - # os.walk() - for directory in directories: - dir_stat = _safe_stat(directory) - if dir_stat is None: - continue - - perm = dir_stat.st_mode & mask - - if dir_stat.st_uid != new_uid: - # current user doesn't own this dir so let's move on - continue - - if perm == exec_perm and dir_stat.st_gid == new_gid: - continue - - try: - os.chown(directory, new_uid, new_gid) - os.chmod(directory, exec_perm) - except OSError: - continue - - files_and_dirs = [] - for base in directories: - for _, dirs, files in os.walk(base): - files_and_dirs.extend(dirs) - files_and_dirs.extend(files) - - widgets = [ - progressbar.Percentage(), - ' ', - progressbar.Bar(), - ' ', - progressbar.ETA(), - ] - bar = progressbar.ProgressBar( - widgets=widgets, maxval=len(files_and_dirs) - ).start() - progress = 0 - for base in directories: - for root, dirs, files in os.walk(base): - for directory in dirs: - progress += 1 - try: - bar.update(progress) - except ValueError: - pass - - directory = os.path.join(root, directory) - - dir_stat = _safe_stat(directory) - if dir_stat is None: - continue - - 
if dir_stat.st_uid != new_uid: - # current user doesn't own this dir so let's move on - continue - - perm = dir_stat.st_mode & mask - - if perm == exec_perm and dir_stat.st_gid == new_gid: - continue - - try: - os.chown(directory, new_uid, new_gid) - os.chmod(directory, exec_perm) - except OSError: - continue - - for file_name in files: - progress += 1 - try: - bar.update(progress) - except ValueError: - pass - file_name = os.path.join(root, file_name) - file_stat = _safe_stat(file_name) - if file_stat is None: - continue - - if file_stat.st_uid != new_uid: - # current user doesn't own this file so let's move on - continue - - perm = file_stat.st_mode & mask - - if perm & stat.S_IXUSR: - # executable, so make sure others can execute it - new_perm = exec_perm - else: - new_perm = read_perm - - if perm == new_perm and file_stat.st_gid == new_gid: - continue - - try: - os.chown(file_name, new_uid, new_gid) - os.chmod(file_name, new_perm) - except OSError: - continue - - bar.finish() - print(' done.') - - -def _parse_unsupported(machine, source_path): - """ - Get the unsupported compilers and MPI variants for the given machine - """ - - with open( - os.path.join(source_path, 'deploy', 'unsupported.txt'), 'r' - ) as f: - content = f.readlines() - content = [ - line.strip() for line in content if not line.strip().startswith('#') - ] - unsupported = list() - for line in content: - if line.strip() == '': - continue - parts = [part.strip() for part in line.split(',')] - if len(parts) != 3: - raise ValueError(f'Bad line in "unsupported.txt" {line}') - if parts[0] != machine: - continue - compiler = parts[1] - mpi = parts[2] - unsupported.append((compiler, mpi)) - - return unsupported - - -def _check_supported(library, machine, compiler, mpi, source_path): - """ - Check that the given library is supported for the given machine, compiler, - and MPI variant - """ - - filename = os.path.join(source_path, 'deploy', f'{library}_supported.txt') - with open(filename, 'r') as f: - 
content = f.readlines() - content = [ - line.strip() for line in content if not line.strip().startswith('#') - ] - for line in content: - if line.strip() == '': - continue - supported = [part.strip() for part in line.split(',')] - if len(supported) != 3: - raise ValueError(f'Bad line in "{library}_supported.txt" {line}') - if ( - machine == supported[0] - and compiler == supported[1] - and mpi == supported[2] - ): - return - - raise ValueError( - f'{compiler} with {mpi} is not supported with {library} on {machine}' - ) - - -def _ignore_file_errors(f): - """ - Ignore any permission and missing file errors, but pass others on - """ - - def _wrapper(*args, **kwargs): - try: - return f(*args, **kwargs) - except (PermissionError, FileNotFoundError): - pass - - return _wrapper - - -@_ignore_file_errors -def _safe_rmtree(path): - shutil.rmtree(path) - - -@_ignore_file_errors -def _safe_stat(path): - return os.stat(path) - - -def _discover_machine(quiet=False): - """ - Figure out the machine from the host name - - Parameters - ---------- - quiet : bool, optional - Whether to print warnings if the machine name is ambiguous - - Returns - ------- - machine : str - The name of the current machine - """ - - machine = mache_discover_machine(quiet=quiet) - if machine is None: - possible_hosts = _get_possible_hosts() - hostname = socket.gethostname() - for possible_machine, hostname_contains in possible_hosts.items(): - if hostname_contains in hostname: - machine = possible_machine - break - return machine - - -def _get_possible_hosts(): - """ - Get a list of possible hosts from the existing machine config files - """ - - here = os.path.abspath(os.path.dirname(__file__)) - files = sorted( - glob.glob(os.path.join(here, '..', 'polaris', 'machines', '*.cfg')) - ) - - possible_hosts = dict() - for filename in files: - machine = os.path.splitext(os.path.split(filename)[1])[0] - config = ConfigParser(interpolation=ExtendedInterpolation()) - config.read(filename) - if 
config.has_section('discovery') and config.has_option( - 'discovery', 'hostname_contains' - ): - hostname_contains = config.get('discovery', 'hostname_contains') - possible_hosts[machine] = hostname_contains - - return possible_hosts - - -if __name__ == '__main__': - main() diff --git a/deploy/cli_spec.json b/deploy/cli_spec.json new file mode 100644 index 0000000000..11fb4ef5ce --- /dev/null +++ b/deploy/cli_spec.json @@ -0,0 +1,93 @@ +{ + "meta": { + "software": "polaris", + "mache_version": "2.2.0", + "description": "Deploy polaris environment" + }, + "arguments": [ + { + "flags": ["--machine"], + "dest": "machine", + "help": "Name of the machine to deploy for (must be known to mache). If not provided, mache will attempt to detect the machine from the host.", + "route": ["deploy", "bootstrap", "run"] + }, + { + "flags": ["--pixi"], + "dest": "pixi", + "help": "Path to the pixi executable. If not provided, pixi is found on PATH.", + "route": ["deploy", "bootstrap", "run"] + }, + { + "flags": ["--prefix"], + "dest": "prefix", + "help": "Install the environment into this prefix (directory). Overrides deploy/config.yaml.j2.", + "route": ["deploy", "bootstrap", "run"] + }, + { + "flags": ["--compiler"], + "dest": "compiler", + "nargs": "+", + "help": "Name of the compiler toolchain (primarily for Spack). If not provided, defaults may come from merged machine config [deploy] compiler.", + "route": ["deploy", "run"] + }, + { + "flags": ["--mpi"], + "dest": "mpi", + "nargs": "+", + "help": "Name of the MPI library (primarily for Spack). 
If not provided, defaults may come from merged machine config [deploy] mpi_<compiler> (or mpi).", + "route": ["deploy", "run"] + }, + { + "flags": ["--deploy-spack"], + "dest": "deploy_spack", + "action": "store_true", + "help": "Deploy all supported Spack environments (overrides spack.deploy in deploy/config.yaml.j2).", + "route": ["deploy", "run"] + }, + { + "flags": ["--recreate"], + "dest": "recreate", + "action": "store_true", + "help": "Recreate the environment if it exists.", + "route": ["deploy", "bootstrap", "run"] + }, + { + "flags": ["--mache-version"], + "dest": "mache_version", + "help": "The mache version to use if not using an org/fork/branch.", + "route": ["bootstrap", "run"] + }, + { + "flags": ["--python"], + "dest": "python", + "help": "The python major and minor version to use. Overrides deploy/pins.cfg.j2.", + "route": ["deploy", "bootstrap", "run"] + }, + { + "flags": ["--mache-fork"], + "dest": "mache_fork", + "help": "Point to a mache org/fork (and branch) for testing. Example: E3SM-Project/mache", + "route": ["deploy", "bootstrap", "run"] + }, + { + "flags": ["--mache-branch"], + "dest": "mache_branch", + "help": "Point to a mache branch (and fork) for testing.", + "route": ["deploy", "bootstrap", "run"] + }, + { + "flags": ["--quiet"], + "dest": "quiet", + "action": "store_true", + "help": "Only print output to log files, not to the terminal.", + "route": ["deploy", "bootstrap", "run"] + }, + { + "flags": ["--bootstrap-only"], + "dest": "bootstrap_only", + "action": "store_true", + "help": "Only create or update the bootstrap pixi environment.", + "route": ["deploy"] + } + ] +} diff --git a/deploy/conda-dev-spec.template b/deploy/conda-dev-spec.template deleted file mode 100644 index 08b7f5196d..0000000000 --- a/deploy/conda-dev-spec.template +++ /dev/null @@ -1,77 +0,0 @@ -# Base -python>=3.10,<3.15 -cartopy -cartopy_offlinedata -cmocean -esmf={{ esmf }}={{ mpi_prefix }}_* -ffmpeg -geometric_features={{ geometric_features }} -gsw 
-importlib_resources -ipython -jupyter -lxml -{%- if include_mache %} -mache={{ mache }} -{%- endif %} -matplotlib-base>=3.9.0 -metis={{ metis }} -moab={{ moab }}=*_tempest_* -mosaic>=1.2.1,<2.0.0 -mpas_tools={{ mpas_tools }} -nco -netcdf4=*=nompi_* -numpy>=2.0,<3.0 -{%- if supports_otps %} -otps={{ otps }} -{%- endif %} -progressbar2 -pyamg>=4.2.2 -pyproj -pyremap>=2.0.0,<3.0.0 -ruamel.yaml -requests -ruamel.yaml -scipy>=1.8.0 -shapely>=2.0,<3.0 -tranche>=0.3.0 -xarray - -# Static typing -types-requests - -# Linting and testing -pip -pytest -flynt -mypy -pre-commit -ruff - -# Development -{%- if mpi != "nompi" %} -c-compiler -cmake -cxx-compiler -fortran-compiler -libnetcdf={{ netcdf_c }}={{ mpi_prefix }}_* -netcdf-fortran={{ netcdf_fortran }}={{ mpi_prefix }}_* -libpnetcdf={{ pnetcdf }}={{ mpi_prefix }}_* -parallelio={{ parallelio }}={{ mpi_prefix }}_* -m4 -make -{{ mpi }} -{{ openmp }} -{%- endif %} - -# CF-compliance -cfchecker -udunits2 - -# Documentation -sphinx >=7.0.0 -sphinx_rtd_theme -myst-parser - -# Visualization -ncview diff --git a/deploy/config.yaml.j2 b/deploy/config.yaml.j2 new file mode 100644 index 0000000000..b914da4ece --- /dev/null +++ b/deploy/config.yaml.j2 @@ -0,0 +1,132 @@ +project: + software: "polaris" + + # A specific version string or "dynamic" if provided by the "pre_pixi" hook. + version: "dynamic" + + # Machine name selection. + # Priority order in `mache deploy run`: + # 1. CLI --machine + # 2. this value (project.machine) + # 3. automatic detection (if this is "dynamic") + machine: "dynamic" + + # A command that prints the *runtime* version string for the deployed + # software. + runtime_version_cmd: "python -c 'from polaris.version import __version__; print(__version__)'" + +machines: + # Optional path containing machine config files in ini format. 
+ # + # This MUST be a filesystem path (not a Python package) because we need to + # read machine configs before the target software (and its dependencies) + # have been installed into the pixi environment. + # + # Should be a relative path, relative to the target software repo root. + # + # Files should be named like ".cfg" (e.g. "chrysalis.cfg"). + # + # Machine config is loaded in this order: + # 1. mache.machines/default.cfg + # 2. mache.machines/.cfg (if a known machine is selected) + # 3. /default.cfg (if machines.path is set) + # 4. /.cfg (if present) + path: polaris/machines + +pixi: + # Whether to deploy the pixi environment + deploy: true + + # Where to install the pixi project (and its .pixi directory). + # Absolute path is recommended for shared deployments. + # Environment variables will be expanded at runtime (e.g. $SCRATCH). + prefix: pixi-env + + # Channels used by pixi for this environment. + channels: + - conda-forge + - e3sm/label/polaris + + # MPI provider for conda packages. + # Supported values in `mache deploy run`: + # - nompi + # - mpich + # - openmpi + # - hpc (E3SM-Unified only) + # - dynamic (determine by the "pre_pixi" hook) + mpi: "dynamic" + + # Whether to install the target software in editable/development mode. + install_dev_software: false + +spack: + # Whether to deploy Spack environments at all. + # + # Behavior: + # - If true, deploy ALL supported Spack environments. + # - If false, deploy none. + # + # This can be forced on at runtime with the `mache deploy run` CLI flag: + # --deploy-spack + deploy: false + + # Whether this target repository supports a Spack *library* environment. + # + # If true, mache will deploy one library env per toolchain pair. + supported: true + + # Optional: deploy an additional "software" spack environment. 
+ # + # This environment is built once (not per toolchain) with a single compiler + # and MPI from the merged machine config: + # [deploy] software_compiler + # [deploy] mpi_ + # + # Load scripts do NOT activate this environment; they add its view's `bin` + # directory to PATH. + software: + # Whether this target repository supports a Spack *software* environment. + supported: true + + # Optional override for the environment name. + # Default: "_software" + env_name: null + + # Base path for the spack checkout used for deployment. + # If it does not exist, mache will clone the E3SM spack repo. + # + # In practice, most target repositories should set this dynamically in the + # `pre_spack()` hook (e.g., based on machine config) by writing: + # ctx.runtime['spack']['spack_path'] = + # This config value is a fallback. + # + # Required (either via hook/runtime or here) when spack.deploy is true. + spack_path: null + + # Prefix for spack environment names. + # Final env name is computed as: "__". + env_name_prefix: "spack_env" + + # Jinja2-templated YAML file in the target repo containing a YAML list of + # spack specs (strings). This list is inserted into the appropriate + # mache spack env template for the selected machine/compiler/mpi. + specs_template: "deploy/spack.yaml.j2" + + # Optional spack build settings + tmpdir: null + mirror: null + custom_spack: "" + +jigsaw: + # If true, build/install JIGSAW + JIGSAW-Python into the pixi env + enabled: true + + # Relative path in the target repo where JIGSAW-Python lives. 
+ jigsaw_python_path: jigsaw-python + +# Deployment hooks +hooks: + file: "deploy/hooks.py" + entrypoints: + pre_pixi: "pre_pixi" + pre_spack: "pre_spack" diff --git a/deploy/default.cfg b/deploy/default.cfg deleted file mode 100644 index 6c89fdb999..0000000000 --- a/deploy/default.cfg +++ /dev/null @@ -1,46 +0,0 @@ -# Options related to deploying a polaris conda environment on supported -# machines -[deploy] - -# The type of environment to deploy: dev, test_release or release -# This should nearly always be left as "dev". Only experienced developers -# should deploy a shared environment -env_type = dev - -# Recreate the environment if it already exists? -recreate = False - -# a suffix on the environment name -suffix = - -# the python version -python = 3.13 - -# the MPI version (nompi, mpich or openmpi) -mpi = nompi - -# versions of conda packages -geometric_features = 1.6.1 -mache = 2.1.0 -conda_moab = 5.6.0 -mpas_tools = 1.4.0 -otps = 2021.10 -parallelio = 2.6.6 - -# versions of conda or spack packages (depending on machine type) -esmf = 8.9.0 -metis = 5.1.0 -netcdf_c = 4.9.3 -netcdf_fortran = 4.6.2 -pnetcdf = 1.14.1 - -# versions of spack packages -albany = developcompass-2024-03-13 -# cmake newer than 3.23.0 needed for Trilinos -cmake = 3.23.0: -hdf5 = 1.14.6 -lapack = 3.9.1 -spack_moab = ${conda_moab} -parmetis = 4.0.3 -petsc = 3.19.1 -scorpio = 1.8.2 diff --git a/deploy/hooks.py b/deploy/hooks.py new file mode 100644 index 0000000000..64d45b27e4 --- /dev/null +++ b/deploy/hooks.py @@ -0,0 +1,145 @@ +"""Example deployment hooks for `mache deploy run`. + +This file is **example-only**: +- Hooks run arbitrary Python code from this repository. +- Hooks are disabled unless you opt-in via `deploy/config.yaml.j2`. 
+ +To enable, add a `hooks` section like: + + hooks: + file: "deploy/hooks.py" # default + entrypoints: + pre_pixi: "pre_pixi" # optional + post_pixi: "post_pixi" # optional + post_deploy: "post_deploy" # optional + +""" + +from __future__ import annotations + +import os +from typing import TYPE_CHECKING, Any, Dict + +from packaging.version import Version + +if TYPE_CHECKING: + # This import is only for static type checking; at runtime, `mache` is + # already installed in the bootstrap environment when hooks are executed. + from mache.deploy.hooks import DeployContext + + +def pre_pixi(ctx: DeployContext) -> dict[str, Any] | None: + """Run before the pixi environment is created/updated. + + Preferred pattern: + - Compute derived values and store them in `ctx.runtime` via a returned + dict (instead of mutating `ctx.config`). + + Returns + ------- + dict | None + Optional mapping merged into `ctx.runtime` by mache. + """ + + polaris_version = _get_version() + mpi = _get_pixi_mpi(ctx.machine, ctx.machine_config) + + updates: Dict[str, Any] = { + 'project': {'version': polaris_version}, + 'pixi': {'mpi': mpi}, + } + + return updates + + +def pre_spack(ctx: DeployContext) -> dict[str, Any] | None: + """Run before the spack environment is created/updated. + + Preferred pattern: + - Compute derived values and store them in `ctx.runtime` via a returned + dict (instead of mutating `ctx.config`). + + Returns + ------- + dict | None + Optional mapping merged into `ctx.runtime` by mache. 
+ """ + + updates: Dict[str, Any] = {} + spack_path = _get_spack_path(ctx.config, ctx.machine, ctx.machine_config) + + if spack_path is not None: + updates['spack'] = {'spack_path': spack_path} + + return updates + + +def _get_version(): + """ + Get the Polaris version by parsing the version file + """ + + # we can't import polaris because we probably don't have the necessary + # dependencies, so we get the version by parsing (same approach used in + # the root setup.py) + here = os.path.abspath(os.path.dirname(__file__)) + version_path = os.path.join(here, '..', 'polaris', 'version.py') + with open(version_path) as f: + main_ns: Dict[str, str] = dict() + exec(f.read(), main_ns) + polaris_version = main_ns['__version__'] + + return polaris_version + + +def _get_pixi_mpi(machine, machine_config): + """ + Get the MPI implementation for pixi from environment variable + """ + if machine is not None: + # we will use system compilers and mpi, not pixi mpi + mpi = 'nompi' + else: + # we will have the default-.cfg config options + if not machine_config.has_section('deploy'): + raise ValueError("Missing 'deploy' section in machine config") + section = machine_config['deploy'] + compiler = section.get('compiler') + if compiler is None: + raise ValueError("Missing 'compiler' option in 'deploy' section") + mpi_option = f'mpi_{compiler}' + mpi = section.get(mpi_option) + if mpi is None: + raise ValueError( + f"Missing '{mpi_option}' option in 'deploy' section" + ) + return mpi + + +def _get_spack_path(config, machine, machine_config): + """ + Get the Spack path from environment variable or machine config + """ + spack_path = config.get('spack', {}).get('spack_path') + if spack_path is not None: + # no need to update + return None + + if machine is None: + return None + + polaris_version = _get_version() + + # Use PEP 440 parsing to strip any pre/dev/post release tags and keep only + # the base release version. 
+    release_version = Version(polaris_version).base_version
+    spack_env = f'dev_polaris_{release_version}'
+
+    if not machine_config.has_section('deploy'):
+        raise ValueError("Missing 'deploy' section in machine config")
+    section = machine_config['deploy']
+    spack_base = section.get('spack')
+    if spack_base is None:
+        raise ValueError("Missing 'spack' option in 'deploy' section")
+    spack_path = os.path.join(spack_base, spack_env)
+    return spack_path
diff --git a/deploy/load_polaris.template b/deploy/load_polaris.template
deleted file mode 100644
index 9d32f41f0a..0000000000
--- a/deploy/load_polaris.template
+++ /dev/null
@@ -1,37 +0,0 @@
-{% if env_type == 'dev' -%}
-export POLARIS_BRANCH="{{ polaris_source_path }}"
-export POLARIS_VERSION="{{ polaris_version }}"
-
-version_file="${POLARIS_BRANCH}/polaris/version.py"
-code_version=$(cat $version_file)
-if [[ "$code_version" != *"$POLARIS_VERSION"* ]]; then
-
-echo "This load script is for a different version of polaris:"
-echo "__version__ = '$POLARIS_VERSION'"
-echo ""
-echo "Your code is version:"
-echo "$code_version"
-echo ""
-echo "You need to run ./configure_polaris_envs.py to update your conda "
-echo "environment and load script."
-
-else
-# the right polaris version
-{%- endif %}
-
-echo Loading conda environment
-source {{ conda_base }}/etc/profile.d/conda.sh
-conda activate {{ polaris_env }}
-echo Done.
-echo - -{{ update_polaris }} - -{{ spack }} - -{{ env_vars }} - -{% if env_type == 'dev' -%} -# the right polaris version -fi -{%- endif %} diff --git a/deploy/petsc_supported.txt b/deploy/petsc_supported.txt deleted file mode 100644 index e1c5710e4b..0000000000 --- a/deploy/petsc_supported.txt +++ /dev/null @@ -1,7 +0,0 @@ -# a list of supported machine, compiler and mpi combinations for Netlib LAPACK -# and PETSc - -chrysalis, intel, openmpi -chrysalis, gnu, openmpi -frontier, craygnu, mpich -pm-cpu, gnu, mpich diff --git a/deploy/pins.cfg b/deploy/pins.cfg new file mode 100644 index 0000000000..8499414de6 --- /dev/null +++ b/deploy/pins.cfg @@ -0,0 +1,26 @@ +# pins for the pixi environment +[pixi] +bootstrap_python = 3.13 +python = 3.13 +geometric_features = 1.6.1 +mache = 2.2.0 +mpas_tools = 1.4.0 +otps = 2021.10 +parallelio = 2.6.6 + +# pins for the spack environment +[spack] +# cmake newer than 3.23.0 needed for Trilinos +cmake = 3.23.0: +hdf5 = 1.14.6 +parmetis = 4.0.3 +scorpio = 1.8.2 + +# pins for both pixi and spack environments +[all] +esmf = 8.9.0 +metis = 5.1.0 +moab = 5.6.0 +netcdf_c = 4.9.3 +netcdf_fortran = 4.6.2 +pnetcdf = 1.14.1 diff --git a/deploy/pixi.toml.j2 b/deploy/pixi.toml.j2 new file mode 100644 index 0000000000..96c9c9da44 --- /dev/null +++ b/deploy/pixi.toml.j2 @@ -0,0 +1,97 @@ +[workspace] +name = "polaris-dev" +channels = [ +{%- for channel in pixi_channels %} + "{{ channel }}"{%- if not loop.last %},{% endif %} +{%- endfor %} +] +platforms = ["{{ platform }}"] +channel-priority = "strict" + +[dependencies] +python = "{{ python }}.*" + +{%- if include_mache %} +mache = "=={{ mache }}.*" +{%- endif %} + +{%- if include_jigsaw %} +jigsawpy = "*" +{%- endif %} + +cartopy = "*" +cartopy_offlinedata = "*" +cmocean = "*" +esmf = {version = "{{ esmf }}.*", build = "{{ mpi_prefix }}_*"} +ffmpeg = "*" +geometric_features = "{{ geometric_features }}.*" +gsw = "*" +importlib_resources = "*" +ipython = "*" +jupyter = "*" +lxml = "*" 
+matplotlib-base = ">=3.9.0" +metis = "{{ metis }}.*" +moab = {version = "{{ moab }}.*", build = "*_tempest_*"} +mosaic = ">=1.2.1,<2.0.0" +mpas_tools = "{{ mpas_tools }}.*" +nco = "*" +netcdf4 = {version = "*", build = "nompi_*"} +numpy = ">=2.0,<3.0" +{%- if platform == 'linux-64' %} +otps = "{{ otps }}.*" +{%- endif %} +progressbar2 = "*" +pyamg = ">=4.2.2" +pyproj = "*" +pyremap = ">=2.0.0,<3.0.0" +requests = "*" +"ruamel.yaml" = "*" +scipy = ">=1.8.0" +shapely = ">=2.0,<3.0" +termcolor = "*" +tranche = ">=0.3.0" +xarray = "*" + +# Static typing +types-requests = "*" + +# Linting and testing +pip = "*" +pytest = "*" +flynt = "*" +mypy = "*" +pre-commit = "*" +ruff = "*" + +# Development +{%- if mpi != "nompi" %} +c-compiler = "*" +cmake = "*" +cxx-compiler = "*" +fortran-compiler = "*" +libnetcdf = {version = "{{ netcdf_c }}.*", build = "{{ mpi_prefix }}_*"} +netcdf-fortran = {version = "{{ netcdf_fortran }}.*", build = "{{ mpi_prefix }}_*"} +libpnetcdf = {version = "{{ pnetcdf }}.*", build = "{{ mpi_prefix }}_*"} +parallelio = {version = "{{ parallelio }}.*", build = "{{ mpi_prefix }}_*"} +m4 = "*" +make = "*" +{{ mpi }} = "*" +{%- if system == 'linux' %} +libgomp = "*" +{%- elif system == 'osx' %} +llvm-openmp = "*" +{%- endif %} +{%- endif %} + +# CF-compliance +cfchecker = "*" +udunits2 = "*" + +# Documentation +sphinx = ">=7.0.0" +sphinx_rtd_theme = "*" +myst-parser = "*" + +# Visualization +ncview = "*" diff --git a/deploy/shared.py b/deploy/shared.py deleted file mode 100644 index e82fdde69b..0000000000 --- a/deploy/shared.py +++ /dev/null @@ -1,396 +0,0 @@ -import argparse -import logging -import os -import platform -import subprocess -import sys -from urllib.request import Request, urlopen - - -def parse_args(bootstrap): - """ - Parse arguments from the configure conda environment script call - - Parameters - ---------- - bootstrap : bool - Whether the environment being set up is a bootstrap environment - - Returns - ------- - args : list[str] - A list 
of each command-line argument provided in the call to the - configure conda environment script - """ - - parser = argparse.ArgumentParser( - description='Deploy a polaris conda environment' - ) - parser.add_argument( - '-m', - '--machine', - dest='machine', - help='The name of the machine for loading machine-' - 'related config options.', - ) - parser.add_argument( - '--conda', dest='conda_base', help='Path to the conda base.' - ) - parser.add_argument( - '--spack', dest='spack_base', help='Path to the spack base.' - ) - parser.add_argument( - '--env_name', - dest='conda_env_name', - help='The conda environment name and activation script prefix.', - ) - parser.add_argument( - '-p', - '--python', - dest='python', - type=str, - help='The python version to deploy.', - ) - parser.add_argument( - '-c', - '--compiler', - dest='compilers', - type=str, - nargs='*', - help='The name of the compiler(s).', - ) - parser.add_argument( - '-i', - '--mpi', - dest='mpis', - type=str, - nargs='*', - help='The MPI library (or libraries) to deploy, see ' - 'the docs for details.', - ) - parser.add_argument( - '--conda_env_only', - dest='conda_env_only', - action='store_true', - help='Create only the conda environment for running ' - 'polaris tasks (without compilers or libraries ' - 'for building E3SM components).', - ) - parser.add_argument( - '--recreate', - dest='recreate', - action='store_true', - help='Recreate the environment if it exists.', - ) - parser.add_argument( - '--update_jigsaw', - dest='update_jigsaw', - action='store_true', - help='Reinstall JIGSAW even if not recreating conda environment.', - ) - parser.add_argument( - '-f', - '--config_file', - dest='config_file', - help='Config file to override deployment config options.', - ) - parser.add_argument( - '--check', - dest='check', - action='store_true', - help='Check the resulting environment for expected packages.', - ) - parser.add_argument( - '--use_local', - dest='use_local', - action='store_true', - help='Use 
locally built conda packages (for testing).', - ) - parser.add_argument( - '--mache_fork', - dest='mache_fork', - help='Point to a mache fork (and branch) for testing.', - ) - parser.add_argument( - '--mache_branch', - dest='mache_branch', - help='Point to a mache branch (and fork) for testing.', - ) - parser.add_argument( - '--update_spack', - dest='update_spack', - action='store_true', - help='If the shared spack environment should be created or recreated.', - ) - parser.add_argument( - '--tmpdir', - dest='tmpdir', - help='A temporary directory for building spack packages.', - ) - parser.add_argument( - '--with_albany', - dest='with_albany', - action='store_true', - help='Whether to include albany in the spack environment.', - ) - parser.add_argument( - '--with_petsc', - dest='with_petsc', - action='store_true', - help='Whether to include PETSc and Netlib-LAPACK in ' - 'the spack environment.', - ) - parser.add_argument( - '--without_openmp', - dest='without_openmp', - action='store_true', - help='If this flag is included, OPENMP=false will ' - 'be added to the load script. 
By default, MPAS ' - 'builds will be with OpenMP (OPENMP=true).', - ) - parser.add_argument( - '--verbose', - dest='verbose', - action='store_true', - help='Print all output to the terminal, rather than ' - 'log files (usually for debugging).', - ) - if bootstrap: - parser.add_argument( - '--local_conda_build', - dest='local_conda_build', - type=str, - help='A path for conda packages (for testing).', - ) - - args = parser.parse_args(sys.argv[1:]) - - if (args.mache_fork is None) != (args.mache_branch is None): - raise ValueError( - 'You must supply both or neither of ' - '--mache_fork and --mache_branch' - ) - - return args - - -def get_conda_base(conda_base, config, shared=False, warn=False): - """ - Get the absolute path to the files for the conda base environment - - Parameters - ---------- - conda_base : str - The relative or absolute path to the conda base files - - config : ConfigParser - Config object - - shared : bool, optional - Whether we are working in a shared conda environment - - warn : bool, optional - Whether to print a warning that the conda path was not supplied - - Returns - ------- - conda_base : str - Path to the conda base environment - """ - - if shared: - conda_base = config.get('paths', 'polaris_envs') - elif conda_base is None: - try: - conda_base = subprocess.check_output( - ['conda', 'info', '--base'], text=True - ).strip() - if warn: - print( - f'\nWarning: --conda path not supplied. Using conda ' - f'installed at:\n' - f' {conda_base}\n' - ) - except subprocess.CalledProcessError as e: - raise ValueError( - 'No conda base provided with --conda and ' - 'none could be inferred.' 
- ) from e - # handle "~" in the path - conda_base = os.path.abspath(os.path.expanduser(conda_base)) - return conda_base - - -def check_call(commands, env=None, logger=None): - """ - Wrapper for making a shell call with logging and error management - - Parameters - ---------- - commands : list[str] - A list of each command given as a string - - env : Mapping[str, str], optional - Any environment variables required for the command - - logger : logging.Logger, optional - The logger for output - """ - - command_list = commands.replace(' && ', '; ').split('; ') - print_command = '\n '.join(command_list) - if logger is None: - print(f'\n Running:\n {print_command}\n') - else: - logger.info(f'\nrunning:\n {print_command}\n') - - if logger is None: - process = subprocess.Popen( - commands, env=env, executable='/bin/bash', shell=True - ) - process.wait() - else: - process = subprocess.Popen( - commands, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env, - executable='/bin/bash', - shell=True, - ) - stdout, stderr = process.communicate() - - if stdout: - stdout_decoded = stdout.decode('utf-8') - for line in stdout_decoded.split('\n'): - logger.info(line) - if stderr: - stderr_decoded = stderr.decode('utf-8') - for line in stderr_decoded.split('\n'): - logger.error(line) - - if process.returncode != 0: - raise subprocess.CalledProcessError(process.returncode, commands) - - -def install_miniforge(conda_base, activate_base, logger): - """ - Install Miniforge if it isn't installed already - - Parameters - ---------- - conda_base : str - Absolute path to the conda base environment files - - activate_base : str - Command to activate the conda base environment - - logger : logging.Logger - The logger for output - """ - - if not os.path.exists(conda_base): - print('Installing Miniforge3') - if platform.system() == 'Darwin': - system = 'MacOSX' - else: - system = 'Linux' - miniforge = f'Miniforge3-{system}-x86_64.sh' - url = 
f'https://github.com/conda-forge/miniforge/releases/latest/download/{miniforge}' # noqa: E501 - print(url) - req = Request(url, headers={'User-Agent': 'Mozilla/5.0'}) - f = urlopen(req) - html = f.read() - with open(miniforge, 'wb') as outfile: - outfile.write(html) - f.close() - - command = f'/bin/bash {miniforge} -b -p {conda_base}' - check_call(command, logger=logger) - os.remove(miniforge) - - print('Doing initial setup\n') - commands = ( - f'{activate_base} && ' - f'conda config --add channels conda-forge && ' - f'conda config --set channel_priority strict && ' - f'conda update -y --all && ' - f'conda init --no-user' - ) - - check_call(commands, logger=logger) - - -def get_logger(name, log_filename): - """ - Get the logger for logging output - - Parameters - ---------- - name : str - Name of the logger - - log_filename : str - Filepath for the logging file - - Returns - ------- - logger : logging.Logger - The logger for output - """ - - print(f'Logging to: {log_filename}\n') - try: - os.remove(log_filename) - except FileNotFoundError: - pass - logger = logging.getLogger(name) - handler = logging.FileHandler(log_filename) - formatter = PolarisFormatter() - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(logging.INFO) - logger.propagate = False - return logger - - -class PolarisFormatter(logging.Formatter): - """ - A custom formatter for logging - Modified from: - https://stackoverflow.com/a/8349076/7728169 - https://stackoverflow.com/a/14859558/7728169 - """ - - # printing error messages without a prefix because they are sometimes - # errors and sometimes only warnings sent to stderr - dbg_fmt = 'DEBUG: %(module)s: %(lineno)d: %(msg)s' - info_fmt = '%(msg)s' - err_fmt = info_fmt - - def __init__(self, fmt=info_fmt): - self._fmt = None - logging.Formatter.__init__(self, fmt) - - def format(self, record): - # Save the original format configured by the user - # when the logger formatter was instantiated - format_orig = self._fmt - - 
# Replace the original format with one customized by logging level - if record.levelno == logging.DEBUG: - self._fmt = PolarisFormatter.dbg_fmt - - elif record.levelno == logging.INFO: - self._fmt = PolarisFormatter.info_fmt - - elif record.levelno == logging.ERROR: - self._fmt = PolarisFormatter.err_fmt - - # Call the original formatter class to do the grunt work - result = logging.Formatter.format(self, record) - - # Restore the original format configured by the user - self._fmt = format_orig - - return result diff --git a/deploy/spack.yaml.j2 b/deploy/spack.yaml.j2 new file mode 100644 index 0000000000..433e8a5cc7 --- /dev/null +++ b/deploy/spack.yaml.j2 @@ -0,0 +1,44 @@ +# Spack specs for deployment. +# +# This file is rendered with Jinja2 during `mache deploy run`. +# Available template variables include: +# - software, machine, compiler, mpi +# - pins (dict of dicts from deploy/pins.cfg) +# - spack / pixi / all (shortcuts for pins sections) +# +# Expected format: a YAML mapping with one or both keys: +# - library: specs for the per-toolchain "library" environment(s) +# - software: specs for the single "software" environment +# +# Backward compatibility: if this renders to a YAML list[str], it will be +# interpreted as the "library" specs. 
+# +# Example: +# library: +# - "hdf5@{{ spack.hdf5 }} +fortran +hl" +# - "netcdf-c@{{ spack.netcdf_c }} +mpi" +# - "netcdf-fortran@{{ spack.netcdf_fortran }}" +# software: +# - "cmake@{{ spack.cmake }}" +# +library: + - "cmake@{{ spack.cmake }}" +{%- if not e3sm_hdf5_netcdf %} + - "hdf5@{{ spack.hdf5 }}+cxx+fortran+hl+mpi+shared" + - "netcdf-c@{{ all.netcdf_c }}+mpi~parallel-netcdf" + - "netcdf-fortran@{{ all.netcdf_fortran }}" + - "parallel-netcdf@{{ all.pnetcdf }}+cxx+fortran" +{%- endif %} + - "metis@{{ all.metis }}+int64+real64~shared" + - "parmetis@{{ spack.parmetis }}+int64~shared" + - "e3sm-scorpio@{{ spack.scorpio }}+mpi~timing~internal-timing~tools+malloc" + +software: + - "cmake@{{ spack.cmake }}" + - "esmf@{{ all.esmf }}+mpi+netcdf~pnetcdf~external-parallelio" + - "moab@{{ all.moab }}+eigen+fortran+hdf5+mpi+netcdf+pnetcdf+zoltan+tempest" +{%- if not e3sm_hdf5_netcdf %} + - "hdf5@{{ spack.hdf5 }}+cxx+fortran+hl+mpi+shared" + - "netcdf-c@{{ all.netcdf_c }}+mpi~parallel-netcdf" + - "netcdf-fortran@{{ all.netcdf_fortran }}" +{%- endif %} diff --git a/deploy/spec-bootstrap.txt b/deploy/spec-bootstrap.txt deleted file mode 100644 index e48defe242..0000000000 --- a/deploy/spec-bootstrap.txt +++ /dev/null @@ -1,3 +0,0 @@ -jinja2 -packaging -progressbar2 diff --git a/deploy/unsupported.txt b/deploy/unsupported.txt deleted file mode 100644 index 5eae58f0f9..0000000000 --- a/deploy/unsupported.txt +++ /dev/null @@ -1,9 +0,0 @@ -# a list of unsupported machine, compiler and mpi combinations - -# no spack available -chrysalis, intel, impi -chrysalis, gnu, impi -chrysalis, oneapi-ifx, openmpi -chrysalis, oneapi-ifx, impi -pm-cpu, aocc, mpich -pm-cpu, amdclang, mpich From 06c336c4a93a46e82d9283fe291b72dbcfe4af0d Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 26 Jan 2026 01:58:34 -0600 Subject: [PATCH 04/39] Install polaris in developer mode by default --- deploy/config.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/deploy/config.yaml.j2 b/deploy/config.yaml.j2 index b914da4ece..709bb51cb1 100644 --- a/deploy/config.yaml.j2 +++ b/deploy/config.yaml.j2 @@ -57,7 +57,7 @@ pixi: mpi: "dynamic" # Whether to install the target software in editable/development mode. - install_dev_software: false + install_dev_software: true spack: # Whether to deploy Spack environments at all. From fecae969e0ff1a1a83d5cd4ef512a8487c186880 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 26 Jan 2026 03:09:40 -0600 Subject: [PATCH 05/39] Switch to pixi list in provenance --- polaris/provenance.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/polaris/provenance.py b/polaris/provenance.py index 82b410dbef..c221bd7fcc 100644 --- a/polaris/provenance.py +++ b/polaris/provenance.py @@ -47,10 +47,10 @@ def write(work_dir, tasks, config=None, machine=None, baseline_dir=None): component_git_version = _get_component_git_version(config) try: - args = ['conda', 'list'] - conda_list = subprocess.check_output(args).decode('utf-8') + args = ['pixi', 'list'] + pixi_list = subprocess.check_output(args).decode('utf-8') except subprocess.CalledProcessError: - conda_list = None + pixi_list = None calling_command = ' '.join(sys.argv) @@ -109,9 +109,9 @@ def write(work_dir, tasks, config=None, machine=None, baseline_dir=None): provenance_file.write(f'{print_string}\n') - if conda_list is not None: - provenance_file.write('conda list:\n') - provenance_file.write(f'{conda_list}\n') + if pixi_list is not None: + provenance_file.write('pixi list:\n') + provenance_file.write(f'{pixi_list}\n') provenance_file.write( '**************************************************' From 06706fafa414b8868107090348e388c54340bfe2 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 26 Jan 2026 03:09:57 -0600 Subject: [PATCH 06/39] Remove or fix various conda references --- polaris/build/mpas_ocean.py | 1 - polaris/build/omega.py | 2 -- polaris/setup.py | 2 +- 
polaris/tasks/e3sm/init/topo/combine/viz.py | 2 +- pyproject.toml | 2 +- utils/bisect/bisect.py | 2 +- utils/bisect/bisect_step.py | 2 +- utils/omega/ctest/omega_ctest.py | 1 - 8 files changed, 5 insertions(+), 9 deletions(-) diff --git a/polaris/build/mpas_ocean.py b/polaris/build/mpas_ocean.py index 10504e02fc..2cb5d6b26e 100644 --- a/polaris/build/mpas_ocean.py +++ b/polaris/build/mpas_ocean.py @@ -79,7 +79,6 @@ def build_mpas_ocean( print('\n') # clear environment variables and start fresh with those from login - # so spack doesn't get confused by conda command = f'env -i HOME="$HOME" bash -l {script_filename}' if log_filename is not None: print(f'Logging build to: {log_filename}') diff --git a/polaris/build/omega.py b/polaris/build/omega.py index 73e4ecff91..66d768bbff 100644 --- a/polaris/build/omega.py +++ b/polaris/build/omega.py @@ -75,9 +75,7 @@ def build_omega( print(f' account: {account}') print('\n') - # remove and/or create build directory first so the log file can be created # clear environment variables and start fresh with those from login - # so spack doesn't get confused by conda command = f'env -i HOME="$HOME" bash -l {script_filename}' if log_filename is not None: print(f'Logging build to: {log_filename}') diff --git a/polaris/setup.py b/polaris/setup.py index 95dd6aaf78..f7d1431f49 100644 --- a/polaris/setup.py +++ b/polaris/setup.py @@ -933,7 +933,7 @@ def _setup_step(task, step, work_dir, baseline_dir, task_dir): def _symlink_load_script(work_dir): - """make a symlink to the script for loading the polaris conda env.""" + """make a symlink to the script for loading the polaris env.""" if 'LOAD_POLARIS_ENV' in os.environ: script_filename = os.environ['LOAD_POLARIS_ENV'] symlink(script_filename, os.path.join(work_dir, 'load_polaris_env.sh')) diff --git a/polaris/tasks/e3sm/init/topo/combine/viz.py b/polaris/tasks/e3sm/init/topo/combine/viz.py index bb4d381725..3243fcdd3d 100644 --- a/polaris/tasks/e3sm/init/topo/combine/viz.py +++ 
b/polaris/tasks/e3sm/init/topo/combine/viz.py @@ -134,7 +134,7 @@ def _plot_field(self, vertices, tris, field_data, field_name, cmap): except ImportError as err: raise ImportError( 'the datashader package is not installed. ' - 'Please install in your conda environment so you can run ' + 'Please install in your pixi environment so you can run ' 'the topography visualization step.' ) from err diff --git a/pyproject.toml b/pyproject.toml index 97ceeba5b9..7c2d9d2b65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,7 @@ warn_unused_configs = true [tool.ruff] # Exclude a variety of commonly ignored directories. -exclude = ["docs*", "conda*"] +exclude = ["docs*"] line-length = 79 [tool.ruff.lint] diff --git a/utils/bisect/bisect.py b/utils/bisect/bisect.py index df5d8b1965..541412ddda 100755 --- a/utils/bisect/bisect.py +++ b/utils/bisect/bisect.py @@ -28,7 +28,7 @@ def bisect(good, bad, e3sm_path, load_script, config_file, first_parent): initialized with ``git submodule update --init``. load_script : str The relative or absolute path to the load script used to activate - the polaris conda environment and set environment variables used to + the polaris environment and set environment variables used to build the E3SM component to test. config_file : str The relative or absolute path to a config file containing config diff --git a/utils/bisect/bisect_step.py b/utils/bisect/bisect_step.py index 31e168c9cc..527817d308 100755 --- a/utils/bisect/bisect_step.py +++ b/utils/bisect/bisect_step.py @@ -33,7 +33,7 @@ def run( E3SM commit hash that is tested. load_script : str The relative or absolute path to the load script used to activate - the polaris conda environment and set environment variables used to + the polaris environment and set environment variables used to build the MPAS component to test. 
setup_command : str The command to use to set up the polaris test case(s) diff --git a/utils/omega/ctest/omega_ctest.py b/utils/omega/ctest/omega_ctest.py index 7dcfa1f16a..95564bfe25 100755 --- a/utils/omega/ctest/omega_ctest.py +++ b/utils/omega/ctest/omega_ctest.py @@ -296,7 +296,6 @@ def main(): ) # clear environment variables and start fresh with those from login - # so spack doesn't get confused by conda subprocess.check_call( f'env -i HOME="$HOME" bash -l "{script_filename}"', shell=True ) From 224bf3fcee9959e6368d266854feb0c98fb87cf2 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 26 Jan 2026 04:26:05 -0600 Subject: [PATCH 07/39] Add caches to git ignore --- .gitignore | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.gitignore b/.gitignore index 60e07b8452..298423cf8f 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,12 @@ pixi-env/ # vs code .vscode/ + +# mypy +.mypy_cache/ + +# ruff +.ruff_cache/ + +# mache +.mache_cache/ From 903432b9db11db32ceac71eceb3cd0ac05adc568 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 26 Jan 2026 04:35:04 -0600 Subject: [PATCH 08/39] Fix LOAD_POLARIS_ENV --> POLARIS_LOAD_SCRIPT --- docs/developers_guide/framework/build.md | 2 +- docs/developers_guide/quick_start.md | 6 +++--- polaris/build/mpas_ocean.py | 2 +- polaris/setup.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/developers_guide/framework/build.md b/docs/developers_guide/framework/build.md index 7d694a9fe1..fbdfcb7a61 100644 --- a/docs/developers_guide/framework/build.md +++ b/docs/developers_guide/framework/build.md @@ -37,7 +37,7 @@ MPAS-Ocean: - `POLARIS_MACHINE` — machine ID used by the template (e.g., chrysalis, frontier) - `POLARIS_COMPILER` — compiler toolchain ID (e.g., intel, gnu) - `POLARIS_MPI` — MPI library ID (e.g., openmpi, mpich) -- `LOAD_POLARIS_ENV` — path to the shell snippet used to load the Polaris build +- `POLARIS_LOAD_SCRIPT` — path to the shell snippet used to load the Polaris 
build env on the target machine Omega: diff --git a/docs/developers_guide/quick_start.md b/docs/developers_guide/quick_start.md index fb366073bd..87cf9b348b 100644 --- a/docs/developers_guide/quick_start.md +++ b/docs/developers_guide/quick_start.md @@ -237,7 +237,7 @@ source ./load____.sh This will load the appropriate conda environment, load system modules for compilers, MPI and libraries needed to build and run E3SM components, and set environment variables needed for E3SM components or polaris. It will also -set an environment variable `LOAD_POLARIS_ENV` that points to the activation +set an environment variable `POLARIS_LOAD_SCRIPT` that points to the activation script. Polaris uses this to make an symlink to the activation script called `load_polaris_env.sh` in the work directory. When the load script is executed from the base of the polaris repository (i.e., as @@ -266,7 +266,7 @@ anywhere, and it always refers to that branch. To find out which branch you are actually running `polaris` from, you should run: ```bash -echo $LOAD_POLARIS_ENV +echo $POLARIS_LOAD_SCRIPT ``` This will give you the path to the load script, which will also tell you where @@ -426,7 +426,7 @@ source ./load_.sh ``` This will load the appropriate conda environment for polaris. It will also -set an environment variable `LOAD_POLARIS_ENV` that points to the activation +set an environment variable `POLARIS_LOAD_SCRIPT` that points to the activation script. Polaris uses this to make a symlink to the activation script called `load_polaris_env.sh` in the work directory. 
diff --git a/polaris/build/mpas_ocean.py b/polaris/build/mpas_ocean.py index 2cb5d6b26e..e70b1e0c66 100644 --- a/polaris/build/mpas_ocean.py +++ b/polaris/build/mpas_ocean.py @@ -164,7 +164,7 @@ def make_build_script( if debug: make_flags += ' debug=TRUE' - load_script = os.environ['LOAD_POLARIS_ENV'] + load_script = os.environ['POLARIS_LOAD_SCRIPT'] script = template.render( load_script=load_script, diff --git a/polaris/setup.py b/polaris/setup.py index f7d1431f49..6aed92dcee 100644 --- a/polaris/setup.py +++ b/polaris/setup.py @@ -934,8 +934,8 @@ def _setup_step(task, step, work_dir, baseline_dir, task_dir): def _symlink_load_script(work_dir): """make a symlink to the script for loading the polaris env.""" - if 'LOAD_POLARIS_ENV' in os.environ: - script_filename = os.environ['LOAD_POLARIS_ENV'] + if 'POLARIS_LOAD_SCRIPT' in os.environ: + script_filename = os.environ['POLARIS_LOAD_SCRIPT'] symlink(script_filename, os.path.join(work_dir, 'load_polaris_env.sh')) From 914453899b5d8ed8d71f50d81c39e3e62cc80ac7 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 26 Jan 2026 05:15:38 -0600 Subject: [PATCH 09/39] Add environment variables as a snippet shell script --- deploy/load.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 deploy/load.sh diff --git a/deploy/load.sh b/deploy/load.sh new file mode 100644 index 0000000000..b2264cd811 --- /dev/null +++ b/deploy/load.sh @@ -0,0 +1,21 @@ +# bash snippet for adding Polaris-specific environment variables + +# we need a special approach for cray machines ($POLARIS_MACHINE), notably +# pm-cpu and pm-gpu +if [ "$POLARIS_MACHINE" = "pm-cpu" ] || [ "$POLARIS_MACHINE" = "pm-gpu" ]; then + export NETCDF=${CRAY_NETCDF_HDF5PARALLEL_PREFIX} + export NETCDFF=${CRAY_NETCDF_HDF5PARALLEL_PREFIX} + export PNETCDF=${CRAY_PARALLEL_NETCDF_PREFIX} +else + export NETCDF=$(dirname $(dirname $(which nc-config))) + export NETCDFF=$(dirname $(dirname $(which nf-config))) + export PNETCDF=$(dirname $(dirname 
$(which pnetcdf-config))) +fi + +export PIO=${MACHE_DEPLOY_SPACK_LIBRARY_VIEW} +export METIS_ROOT=${MACHE_DEPLOY_SPACK_LIBRARY_VIEW} +export PARMETIS_ROOT=${MACHE_DEPLOY_SPACK_LIBRARY_VIEW} + +export USE_PIO2=true +export OPENMP=true +export HDF5_USE_FILE_LOCKING=FALSE From a6e5226b9d391d5c19a0e317e7d4d5ad22311c2b Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 6 Mar 2026 22:04:05 +0100 Subject: [PATCH 10/39] Look for compiler's default using compiler with underscore --- deploy/hooks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deploy/hooks.py b/deploy/hooks.py index 64d45b27e4..1be12d2767 100644 --- a/deploy/hooks.py +++ b/deploy/hooks.py @@ -107,7 +107,8 @@ def _get_pixi_mpi(machine, machine_config): compiler = section.get('compiler') if compiler is None: raise ValueError("Missing 'compiler' option in 'deploy' section") - mpi_option = f'mpi_{compiler}' + compiler_underscore = compiler.replace('-', '_') + mpi_option = f'mpi_{compiler_underscore}' mpi = section.get(mpi_option) if mpi is None: raise ValueError( From a38ff6e3dfffd6c418e87e37639d281549789e36 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Tue, 17 Mar 2026 03:54:11 -0700 Subject: [PATCH 11/39] Update deploy.py and cli_spec.json This brings in changes made on the mache side. --- deploy.py | 16 ++++++++++++++-- deploy/cli_spec.json | 12 +++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/deploy.py b/deploy.py index 26ca0f7858..eb07482ad2 100755 --- a/deploy.py +++ b/deploy.py @@ -98,7 +98,13 @@ def main(): bootstrap_argv += ['--mache-version', pinned_mache_version] cmd = [sys.executable, BOOTSTRAP_PATH] + bootstrap_argv - subprocess.check_call(cmd) + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as e: + raise SystemExit( + f'\nERROR: Bootstrap step failed (exit code {e.returncode}). ' + f'See the error output above.' 
+ ) from None if args.bootstrap_only: pixi_exe = _get_pixi_executable(getattr(args, 'pixi', None)) @@ -453,7 +459,13 @@ def _run_mache_deploy_run(pixi_exe, repo_root, mache_run_argv): f'{shlex.quote(pixi_exe)} run -m {shlex.quote(pixi_toml)} bash -lc ' f'{shlex.quote("cd " + repo_root + " && " + mache_cmd)}' ) - subprocess.check_call(['/bin/bash', '-lc', cmd]) + try: + subprocess.check_call(['/bin/bash', '-lc', cmd]) + except subprocess.CalledProcessError as e: + raise SystemExit( + f'\nERROR: Deployment step failed (exit code {e.returncode}). ' + f'See the error output above.' + ) from None if __name__ == '__main__': diff --git a/deploy/cli_spec.json b/deploy/cli_spec.json index 11fb4ef5ce..49cc6d2f18 100644 --- a/deploy/cli_spec.json +++ b/deploy/cli_spec.json @@ -1,7 +1,7 @@ { "meta": { "software": "polaris", - "mache_version": "2.2.0", + "mache_version": "3.0.0", "description": "Deploy polaris environment" }, "arguments": [ @@ -9,7 +9,7 @@ "flags": ["--machine"], "dest": "machine", "help": "Name of the machine to deploy for (must be known to mache). 
If not provided, mache will attempt to detect the machine from the host.", - "route": ["deploy", "bootstrap", "run"] + "route": ["deploy", "run"] }, { "flags": ["--pixi"], @@ -44,6 +44,12 @@ "help": "Deploy all supported Spack environments (overrides spack.deploy in deploy/config.yaml.j2).", "route": ["deploy", "run"] }, + { + "flags": ["--spack-path"], + "dest": "spack_path", + "help": "Path to the Spack checkout to use (overrides spack.spack_path in deploy/config.yaml.j2).", + "route": ["deploy", "run"] + }, { "flags": ["--recreate"], "dest": "recreate", @@ -55,7 +61,7 @@ "flags": ["--mache-version"], "dest": "mache_version", "help": "The mache version to use if not using an org/fork/branch.", - "route": ["bootstrap", "run"] + "route": ["deploy", "bootstrap", "run"] }, { "flags": ["--python"], From f4241fbbe91eba44285d79356d089d56b182b200 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 18 Mar 2026 10:59:13 +0100 Subject: [PATCH 12/39] Update to mache 3.0.0 --- deploy/pins.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/pins.cfg b/deploy/pins.cfg index 8499414de6..b3825ae0b8 100644 --- a/deploy/pins.cfg +++ b/deploy/pins.cfg @@ -3,7 +3,7 @@ bootstrap_python = 3.13 python = 3.13 geometric_features = 1.6.1 -mache = 2.2.0 +mache = 3.0.0 mpas_tools = 1.4.0 otps = 2021.10 parallelio = 2.6.6 From 08bffae283bb3e7ff8baeaf2b73cbb8453b257c5 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 18 Mar 2026 11:56:47 +0100 Subject: [PATCH 13/39] Update comments in config files for new approach --- polaris/machines/aurora.cfg | 5 ++--- polaris/machines/chrysalis.cfg | 5 ++--- polaris/machines/default-linux-64.cfg | 2 +- polaris/machines/default-osx-64.cfg | 2 +- polaris/machines/frontier.cfg | 5 ++--- polaris/machines/katara.cfg | 5 ++--- polaris/machines/pm-cpu.cfg | 5 ++--- polaris/machines/pm-gpu.cfg | 5 ++--- polaris/ocean/mpas_ocean.cfg | 4 ++-- polaris/ocean/omega.cfg | 4 ++-- polaris/seaice/seaice.cfg | 4 ++-- 11 files 
changed, 20 insertions(+), 26 deletions(-) diff --git a/polaris/machines/aurora.cfg b/polaris/machines/aurora.cfg index 19f7942f05..ccd3f36801 100644 --- a/polaris/machines/aurora.cfg +++ b/polaris/machines/aurora.cfg @@ -4,12 +4,11 @@ # A shared root directory where polaris data can be found database_root = /lus/flare/projects/E3SM_Dec/polaris -# the path to the base conda environment where polars environments have -# been created +# the path where deployed polaris environments are located polaris_envs = /lus/flare/projects/E3SM_Dec/soft/polaris/aurora/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying polaris environments on supported machines [deploy] # the compiler set to use for system libraries and MPAS builds diff --git a/polaris/machines/chrysalis.cfg b/polaris/machines/chrysalis.cfg index 760d94f780..912241701b 100644 --- a/polaris/machines/chrysalis.cfg +++ b/polaris/machines/chrysalis.cfg @@ -4,12 +4,11 @@ # A shared root directory where polaris data can be found database_root = /lcrc/group/e3sm/public_html/polaris -# the path to the base conda environment where polars environments have -# been created +# the path where deployed polaris environments are located polaris_envs = /lcrc/soft/climate/polaris/chrysalis/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying polaris environments on supported machines [deploy] # the compiler set to use for system libraries and MPAS builds diff --git a/polaris/machines/default-linux-64.cfg b/polaris/machines/default-linux-64.cfg index 7b1be09946..5c8fa3e490 100644 --- a/polaris/machines/default-linux-64.cfg +++ b/polaris/machines/default-linux-64.cfg @@ -1,4 +1,4 @@ -# Options related to deploying a polaris conda and spack environments +# Options related to deploying polaris environments on supported machines [deploy] # the compiler set to use for system libraries and MPAS builds diff --git 
a/polaris/machines/default-osx-64.cfg b/polaris/machines/default-osx-64.cfg index f2dc824d50..4a087d5075 100644 --- a/polaris/machines/default-osx-64.cfg +++ b/polaris/machines/default-osx-64.cfg @@ -1,4 +1,4 @@ -# Options related to deploying a polaris conda and spack environments +# Options related to deploying polaris environments on supported machines [deploy] # the compiler set to use for system libraries and MPAS builds diff --git a/polaris/machines/frontier.cfg b/polaris/machines/frontier.cfg index 5400437897..5d8b3c7325 100644 --- a/polaris/machines/frontier.cfg +++ b/polaris/machines/frontier.cfg @@ -4,12 +4,11 @@ # A shared root directory where polaris data can be found database_root = /lustre/orion/cli115/world-shared/polaris -# the path to the base conda environment where polaris environments have -# been created +# the path where deployed polaris environments are located polaris_envs = /ccs/proj/cli115/software/polaris/frontier/conda/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying polaris environments on supported machines [deploy] # the compiler set to use for system libraries and MPAS builds diff --git a/polaris/machines/katara.cfg b/polaris/machines/katara.cfg index 63f3e219b0..a2659ae1f3 100644 --- a/polaris/machines/katara.cfg +++ b/polaris/machines/katara.cfg @@ -30,12 +30,11 @@ modules_after = False # A shared root directory where MPAS standalone data can be found database_root = /home/xylar/data/polaris -# the path to the base conda environment where polaris environments have -# been created +# the path where deployed polaris environments are located polaris_envs = /home/xylar/data/polaris_envs -# Options related to deploying a polaris conda environment on supported +# Options related to deploying polaris environments on supported # machines [deploy] diff --git a/polaris/machines/pm-cpu.cfg b/polaris/machines/pm-cpu.cfg index 9afd5995d5..e7530b27da 100644 --- 
a/polaris/machines/pm-cpu.cfg +++ b/polaris/machines/pm-cpu.cfg @@ -4,12 +4,11 @@ # A shared root directory where polaris data can be found database_root = /global/cfs/cdirs/e3sm/polaris -# the path to the base conda environment where polaris environments have -# been created +# the path where deployed polaris environments are located polaris_envs = /global/common/software/e3sm/polaris/pm-cpu/conda/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying polaris environments on supported machines [deploy] # the compiler set to use for system libraries and MPAS builds diff --git a/polaris/machines/pm-gpu.cfg b/polaris/machines/pm-gpu.cfg index 5018b07ed5..659a5cbc00 100644 --- a/polaris/machines/pm-gpu.cfg +++ b/polaris/machines/pm-gpu.cfg @@ -4,12 +4,11 @@ # A shared root directory where polaris data can be found database_root = /global/cfs/cdirs/e3sm/polaris -# the path to the base conda environment where polaris environments have -# been created +# the path where deployed polaris environments are located polaris_envs = /global/common/software/e3sm/polaris/pm-gpu/conda/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying polaris environments on supported machines [deploy] # the compiler set to use for system libraries and MPAS builds diff --git a/polaris/ocean/mpas_ocean.cfg b/polaris/ocean/mpas_ocean.cfg index a49258494b..89b4fda20a 100644 --- a/polaris/ocean/mpas_ocean.cfg +++ b/polaris/ocean/mpas_ocean.cfg @@ -31,8 +31,8 @@ processed = ${paths:component_path}/src/Registry_processed.xml # The executables section defines paths to required executables. These # executables are provided for use by specific test cases. Most tools that -# polaris needs should be in the conda environment, so this is only the path -# to the MPAS-Ocean executable by default. 
+# polaris needs should be in the deployment environment, so this is only the +# path to the MPAS-Ocean executable by default. [executables] component = ${paths:component_path}/ocean_model diff --git a/polaris/ocean/omega.cfg b/polaris/ocean/omega.cfg index d34547cf16..94fe3e5d4e 100644 --- a/polaris/ocean/omega.cfg +++ b/polaris/ocean/omega.cfg @@ -15,8 +15,8 @@ defaults = ${paths:component_path}/configs/Default.yml # The executables section defines paths to required executables. These # executables are provided for use by specific test cases. Most tools that -# polaris needs should be in the conda environment, so this is only the path -# to the Omega executable by default. +# polaris needs should be in the deployment environment, so this is only the +# path to the Omega executable by default. [executables] component = ${paths:component_path}/src/omega.exe diff --git a/polaris/seaice/seaice.cfg b/polaris/seaice/seaice.cfg index bdf226c41a..42aa05caf1 100644 --- a/polaris/seaice/seaice.cfg +++ b/polaris/seaice/seaice.cfg @@ -31,7 +31,7 @@ processed = ${paths:component_path}/src/Registry_processed.xml # The executables section defines paths to required executables. These # executables are provided for use by specific test cases. Most tools that -# polaris needs should be in the conda environment, so this is only the path -# to the MPAS-Seaice executable by default. +# polaris needs should be in the deployment environment, so this is only the +# path to the MPAS-Seaice executable by default. 
[executables] component = ${paths:component_path}/seaice_model From c47c1b6fd0c4ffd855b3ab3da82a7abecd2ae582 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 18 Mar 2026 11:57:05 +0100 Subject: [PATCH 14/39] Update the docs to match new deploy approach --- docs/developers_guide/building_docs.md | 4 +- docs/developers_guide/command_line.md | 4 +- docs/developers_guide/machines/index.md | 40 +- docs/developers_guide/ocean/index.md | 2 +- .../organization/components.md | 2 +- docs/developers_guide/overview.md | 2 +- docs/developers_guide/quick_start.md | 350 +++++------------- docs/developers_guide/seaice/index.md | 2 +- docs/developers_guide/troubleshooting.md | 10 +- docs/developers_guide/updating_conda.md | 165 +++------ .../updating_spack/adding_new_machines.md | 15 +- .../updating_spack/deploying_shared_spack.md | 18 +- docs/developers_guide/updating_spack/index.md | 8 +- .../maintaining_past_versions.md | 8 +- .../updating_spack/testing/deploying_spack.md | 68 ++-- .../updating_spack/testing/overview.md | 4 +- .../updating_spack/testing/troubleshooting.md | 6 +- .../updating_spack/updating_packages.md | 286 +++----------- .../updating_spack/workflow.md | 20 +- .../getting_started.md | 16 +- .../dev_add_category_of_tasks/overview.md | 2 +- docs/users_guide/config_files.md | 2 +- docs/users_guide/invalid_quick_start.md | 50 +-- docs/users_guide/machines/aurora.md | 6 +- docs/users_guide/machines/chrysalis.md | 4 +- docs/users_guide/machines/frontier.md | 6 +- docs/users_guide/machines/index.md | 56 +-- docs/users_guide/machines/perlmutter.md | 8 +- 28 files changed, 351 insertions(+), 813 deletions(-) diff --git a/docs/developers_guide/building_docs.md b/docs/developers_guide/building_docs.md index c3e4a97d39..0ffda702b6 100644 --- a/docs/developers_guide/building_docs.md +++ b/docs/developers_guide/building_docs.md @@ -3,8 +3,8 @@ # Building the Documentation As long as you have followed the procedure in {ref}`dev-conda-env` for setting -up your conda 
environment, you will already have the packages available that -you need to build the documentation. +up your local Polaris deployment environment (pixi-based), you will already +have the packages available that you need to build the documentation. Then, run the following script to build the docs: diff --git a/docs/developers_guide/command_line.md b/docs/developers_guide/command_line.md index bb3a269f9e..78a1a92cf3 100644 --- a/docs/developers_guide/command_line.md +++ b/docs/developers_guide/command_line.md @@ -7,10 +7,10 @@ scripts: `polaris list`, `polaris setup`, `polaris suite`, and `polaris serial`. These are the primary user interface to the package, as described below. -When the `polaris` package is installed into your conda environment, you can +When the `polaris` package is installed into your deployment environment, you can run these commands as above. If you are developing polaris from a local branch off of , you will need to create a -conda environment appropriate for development (see {ref}`dev-conda-env`). +development environment appropriate for development (see {ref}`dev-conda-env`). If you do, polaris will be installed in the environment in "development" mode, meaning you can make changes to the branch and they will be reflected when you call the `polaris` command-line tool. diff --git a/docs/developers_guide/machines/index.md b/docs/developers_guide/machines/index.md index 291890c486..bf59de4b60 100644 --- a/docs/developers_guide/machines/index.md +++ b/docs/developers_guide/machines/index.md @@ -14,7 +14,7 @@ test cases are configured in a way that is appropriate for your machine. ## Supported Machines If you follow the procedure in {ref}`dev-conda-env`, you will have an -activation script for activating the development conda environment, setting +activation script for activating the development environment, setting loading system modules and setting environment variables so you can build Omega or an MPAS component and work with polaris. 
Just source the script that should appear in the base of your polaris branch, e.g.: @@ -27,7 +27,7 @@ After loading this environment, you can set up tasks or suites, and a link `load_polaris_env.sh` will be included in each suite or task work directory. This is a link to the activation script that you sourced when you were setting things up. You can can source this file on a compute node -(e.g. in a job script) to get the right polaris conda environment, compilers, +(e.g. in a job script) to get the right polaris deployment environment, compilers, MPI libraries and environment variables for running polaris tasks and the MPAS model. @@ -113,26 +113,24 @@ E3SM default for the given machine an compiler. ## Other Machines If you are working on an "unknown" machine, the procedure is pretty similar -to what was described in {ref}`dev-conda-env`. The main difference is that -we will use `mpich` or `openmpi` and the gnu compilers from conda-forge -rather than system compilers. To create a development conda environment and -an activation script for it, on Linux, run: +to what was described in {ref}`dev-conda-env`. In general, use `./deploy.py` +to create a local pixi environment and load scripts. For example, on Linux run: ```bash -./configure_polaris_envs.py --conda -c gnu -i mpich +./deploy.py --compiler gnu --mpi mpich ``` and on OSX run: ```bash -./configure_polaris_envs.py --conda -c clang -i mpich +./deploy.py --compiler clang --mpi mpich ``` You may use `openmpi` instead of `mpich` but we have had better experiences with the latter. The result should be an activation script `load_dev_polaris_0.1.0-alpha.1_.sh`. -Source this script to get the appropriate conda environment and environment +Source this script to get the appropriate deployment environment and environment variables. 
Under Linux, you can build the MPAS model with @@ -194,12 +192,11 @@ modules_after = False # A shared root directory where MPAS standalone data can be found database_root = /home/xylar/data/polaris -# the path to the base conda environment where polaris environments have +# the path where deployed Polaris environments are located -# been created polaris_envs = /home/xylar/data/polaris_envs -# Options related to deploying a polaris conda environment on supported +# Options related to deploying Polaris environments on supported # machines [deploy] @@ -255,11 +253,11 @@ libraries aren't being found when you try to build an MPAS component. In the `[paths]` section, you will first give a path where you would like to store shared data files used in polaris tasks in `database_root`. Polaris will create this directory if it doesn't exist. Then, you can specify -`polaris_envs` as a path where shared conda environments will be installed -for polaris releases. If developers always create their own conda +`polaris_envs` as a path where shared deployment environments will be installed +for polaris releases. If developers always create their own local environments, this path will never be used. -In `[deploy]`, you will specify config options used in setting up conda +In `[deploy]`, you will specify config options used in setting up deployment and Spack environments for developers. The `compiler` is the default compiler to use for your system. You must supply a corresponding `mpi_` for each supported compiler (not just the default compiler) @@ -355,7 +353,7 @@ You may need to load a system module to get the compilers and potentially other libraries such as MPI, HDF5, and NetCDF-C if you prefer to use system modules rather than having Spack build them.
If this is the case, the best way to do this is to add a file -`conda/spack/__.sh` along these lines: +`deploy/spack/__.sh` along these lines: ``` bash module purge @@ -382,14 +380,16 @@ The next step is to try setting up polaris and asking it to build the Spack environment with a command something like: ``` bash - ./configure_polaris_envs.py --verbose --update_spack --conda -c gnu -i openmpi ... + ./deploy.py --deploy-spack --compiler gnu --mpi openmpi ... ``` -The `--update_spack` flag tells polaris to create (or update) a Spack +The `--deploy-spack` flag tells polaris to create (or update) a Spack environment. You can specify a directory for testing Spack with the -`--spack` flag. You can specify a temporary directory for building spack -packages with `--tmpdir` (this directory must already exist). This is useful -if your `/tmp` space is small (Spack will use several GB of temporary space). +`--spack-path` flag. If needed, you can set a temporary directory for Spack +builds through `spack.tmpdir` in `deploy/config.yaml.j2`. + +For additional deployment details, see the +[mache deploy developer guide](https://docs.e3sm.org/mache/main/developers_guide/deploy.html). Creating the Spack environment may take anywhere from minutes to hours, diff --git a/docs/developers_guide/ocean/index.md b/docs/developers_guide/ocean/index.md index b5d5e19e4d..fc9ceccb76 100644 --- a/docs/developers_guide/ocean/index.md +++ b/docs/developers_guide/ocean/index.md @@ -56,7 +56,7 @@ init = ${paths:component_path}/default_inputs/streams.ocean.init # The executables section defines paths to required executables. These # executables are provided for use by specific tasks. Most tools that -# polaris needs should be in the conda environment, so this is only the path +# polaris needs should be in the deployment environment, so this is only the path # to the MPAS-Ocean or Omega executable by default. 
[executables] component = ${paths:component_path}/ocean_model diff --git a/docs/developers_guide/organization/components.md b/docs/developers_guide/organization/components.md index 64b07f08b1..4e158717f5 100644 --- a/docs/developers_guide/organization/components.md +++ b/docs/developers_guide/organization/components.md @@ -135,7 +135,7 @@ forward = ${paths:component_path}/default_inputs/streams.landice # The executables section defines paths to required executables. These # executables are provided for use by specific tasks. Most tools that -# polaris needs should be in the conda environment, so this is only the path +# polaris needs should be in the deployment environment, so this is only the path # to the MALI executable by default. [executables] component = ${paths:component_path}/landice_model diff --git a/docs/developers_guide/overview.md b/docs/developers_guide/overview.md index f32629423a..cc7b2e868f 100644 --- a/docs/developers_guide/overview.md +++ b/docs/developers_guide/overview.md @@ -35,7 +35,7 @@ reformatting your code (e.g. with [autopep8](https://github.com/hhatto/autopep8) because this can often produce undesirable and confusing results. The [flake8](https://flake8.pycqa.org/en/latest/) utility for linting python -files to the PEP8 standard is included in the POLARIS conda environment. To use +files to the PEP8 standard is included in the POLARIS deployment environment. To use flake8, just run `flake8` from any directory and it will return lint results for all files recursively through all subdirectories. You can also run it for a single file or using wildcards (e.g., `flake8 *.py`). There also is a diff --git a/docs/developers_guide/quick_start.md b/docs/developers_guide/quick_start.md index 87cf9b348b..e9b00cd6ac 100644 --- a/docs/developers_guide/quick_start.md +++ b/docs/developers_guide/quick_start.md @@ -32,254 +32,108 @@ directory of the polaris repository. 
(dev-conda-env)= -## polaris conda environment, spack environment, compilers and system modules - -As a developer, you will need your own -[conda](https://conda.io/projects/conda/en/latest/index.html) environment with -the latest dependencies for polaris and a development installation of polaris -from the branch you're working on. On supported machines, you will also need -to point to a shared [spack](https://spack.io/) environment with some tools -and libraries built for that system that polaris needs. - -In the root of the repository is a tool, `configure_polaris_envs.py` -that can get you started. - -You will need to run `./configure_polaris_envs.py` each time you check -out a new branch or create a new worktree with `git`. Typically, you will -*not* need to run this command when you make changes to files within the -`polaris` python package. These will automatically be recognized because -`polaris` is installed into the conda environment in "editable" mode. You -*will* need to run the command if you add new code files or data files to the -package because these don't get added automatically. - -Whether you are on one of the {ref}`dev-supported-machines` or an "unknown" -machine, you will need to specify a path where -[Miniforge3](https://github.com/conda-forge/miniforge#miniforge3) either has -already been installed or an empty directory where the script can install it. -You must have write permission in the base environment (if it exists). +## Polaris pixi and spack environments, compilers and system modules -:::{note} -We have found that an existing Miniconda3 installation **does not** always -work well for polaris, so please start with Miniforge3 instead. -::: +Polaris now uses `mache.deploy` for deployment. In this repository, the +deployment entry point is `./deploy.py`. -:::{note} -It is *very* important that you not use a shared installation of Miniforge3 -or Miniconda3 such as the base environment for E3SM-Unified for polaris -development. 
Most developers will not have write access to shared -environments, meaning that you will get write-permission errors when you -try to update the base environment or create the polaris development -environment. +For background on this workflow, see: -For anyone who does have write permission to a shared environment, you -would be creating your polaris development environment in a shared space, -which could cause confusion. +- [Mache docs index](https://docs.e3sm.org/mache/main/index.html) +- [Mache deploy user guide](https://docs.e3sm.org/mache/main/users_guide/deploy.html) +- [Mache deploy developer guide](https://docs.e3sm.org/mache/main/developers_guide/deploy.html) -Please use your own personal installation of Miniforge3 for development, -letting `configure_polaris_envs.py` download and install Miniforge3 for -you if you don't already have it installed. -::: +As a developer, rerun `./deploy.py` when you check out a new branch or use a +new worktree. In most cases you do not need to rerun deployment while editing +existing files in `polaris`, because the package is installed in editable mode. +:::{note} +Miniforge, Micromamba, and Miniconda are no longer required for Polaris +deployment. If pixi is not already installed, `./deploy.py` can install it. +::: ### Supported machines If you are on one of the {ref}`dev-supported-machines`, run: ```bash -./configure_polaris_envs.py --conda \ - [-c ] [--mpi ] [-m ] [--with_albany] \ - [--with_netlib_lapack] [--with_petsc] +./deploy.py [--machine ] [--compiler ...] \ + [--mpi ...] [--deploy-spack] [--prefix ] [--recreate] ``` -The `` is typically `~/miniforge3`. -This is the location where you would like to install Miniforge3 or where it is -already installed. If you have limited space in your home directory, you may -want to give another path. If you already have it installed, that path will -be used to add (or update) the polaris test environment. 
+If you are on a login node, machine detection typically works automatically.
+You can pass `--machine <machine>` explicitly if needed.

-See the machine under {ref}`dev-supported-machines` for a list of available
-compilers to pass to `-c`. If you don't supply a compiler, you will get
-the default one for that machine. Typically, you will want the default MPI
-flavor that polaris has defined for each compiler, so you should
-not need to specify which MPI version to use but you may do so with `--mpi`
-if you need to.
+Use `--deploy-spack` when you want to deploy machine-specific Spack
+environments in addition to the local pixi environment.

-If you are on a login node, the script should automatically recognize what
-machine you are on. You can supply the machine name with `-m <machine>` if
-you run into trouble with the automatic recognition (e.g. if you're setting
-up the environment on a compute node, which is not recommended).
+### Unknown machines

-### Environments with Albany
+If a machine is not known to mache, add machine support first
+(see {ref}`dev-add-supported-machine`).

-If you are working with MALI, you should specify `--with_albany`. This will
-ensure that the Albany and Trilinos libraries are included among those built
-with system compilers and MPI libraries, a requirement for many MAlI test
-cases. Currently, only Albany is only supported with `gnu` compilers.
+For workflows that need custom machine config files, see {ref}`config-files`.

-It is safe to add the `--with_albany` flag for MPAS-Ocean but it is not
-recommended unless a user wants to be able to run both models with the same
-conda/spack environment. The main downside is simply that unneeded libraries
-will be linked in to MPAS-Ocean.
+### What the script does -### Environments with PETSc and Netlib-LAPACK +`./deploy.py` can: -If you are working with MPAS-Ocean tasks that need PETSC and -Netlib-LAPACK, you should specify `--with_petsc --with_netlib_lapack` to -point to Spack environments where these libraries are included. Appropriate -environment variables for pointing to these libraries will be build into the -resulting load script (see below). +- install pixi if needed +- create/update a local pixi deployment prefix (default: `pixi-env`) +- install `polaris` from your local branch in editable/development mode +- optionally deploy Spack environments for selected compiler/MPI toolchains +- generate activation scripts (`load_*.sh`) -### Unknown machines +### Useful flags -If you are on an "unknown" machine, typically a Mac or Linux laptop or -workstation, you will need to specify which flavor of MPI you want to use -(`mpich` or `openmpi`): +`--machine` -```bash -./configure_polaris_envs.py --conda --mpi -``` +: set machine explicitly instead of automatic detection -Again, the `` is typically `~/miniforge3`, and is the location -where you would like to install Miniforge3 or where it is already installed. -If you already have it installed, that path will be used to add (or update) the -polaris test environment. +`--prefix` -We only support one set of compilers for Mac and Linux (`gnu` for Linux and -`clang` with `gfortran` for Mac), so there is no need to specify them. -See {ref}`dev-other-machines` for more details. +: choose deployment prefix for the pixi environment -In addition, unknown machines require a config file to be specified when setting -up the polaris test environment. A config file can be specified using -`-f `, where `` is an absolute or relative path to the -file. More information, including example config files, can be found -in {ref}`config-files`. 
+`--compiler`, `--mpi` -:::{note} -Currently, there is not a good way to build Albany for an unknown machine as -part of the polaris deployment process, meaning MALI will be limited to the -shallow-ice approximation (SIA) solver. +: compiler/MPI choices (primarily for Spack deployment) -To get started on HPC systems that aren't supported by Polaris, get in touch -with the developers. -::: +`--deploy-spack` -### What the script does +: deploy supported Spack environments -In addition to installing Miniforge3 and creating the conda environment for you, -this script will also: - -- install [Jigsaw](https://github.com/dengwirda/jigsaw) and - [Jigsaw-Python](https://github.com/dengwirda/jigsaw-python) from source - from the `jigsaw-python` submodule. These tools are used to create many of - the meshes used in Polaris. -- install the `polaris` package from the local branch in "development" mode - so changes you make to the repo are immediately reflected in the conda - environment. -- with the `--update_spack` flag on supported machines, installs or - reinstalls a spack environment with various system libraries. The - `--spack` flag can be used to point to a location for the spack repo to be - checked out. Without this flag, a default location is used. Spack is used to - build several libraries with system compilers and MPI library, including: - [SCORPIO](https://github.com/E3SM-Project/scorpio) (parallel i/o for E3SM - components) [ESMF](https://earthsystemmodeling.org/) (making mapping files - in parallel), [MOAB](https://sigma.mcs.anl.gov/moab-library/), - [Trilinos](https://trilinos.github.io/), - [Albany](https://github.com/sandialabs/Albany), - [Netlib-LAPACK](http://www.netlib.org/lapack/) and - [PETSc](https://petsc.org/). **Please uses these flags with caution, as - they can affect shared environments!** See {ref}`dev-updating-spack`. -- with the `--with_albany` flag, creates or uses an existing Spack - environment that includes Albany and Trilinos. 
-- with the `--with_petsc --with_netlib_lapack` flags, creates or uses an - existing Spack environment that includes PETSc and Netlib-LAPACK. -- make an activation script called `load_*.sh`, where the details of the - name encode the conda environment name, the machine, compilers, MPI - libraries, and optional libraries, e.g. - `load_dev_polaris____.sh` (`` - is the polaris version, `` is the name of the - machine, `` is the compiler name, and `mpi` is the MPI flavor). -- optionally (with the `--check` flag), run some tests to make sure some of - the expected packages are available. - -### Optional flags - -`--check` - -: Check to make sure expected commands are present - -`--python` - -: Select a particular python version (the default is currently 3.8) - -`--env_name` - -: Set the name of the environment (and the prefix for the activation script) - to something other than the default (`dev_polaris_` or - `dev_polaris__`). - -`--update_jigsaw` - -: Used to reinstall Jigsaw and Jigsaw-Python into the conda environment if - you have made changes to the Jigsaw (c++) code in the `jigsaw-python` - submodule. You should not need to reinstall Jigsaw-Python if you have made - changes only to the python code in `jigsaw-python`, as the python package - is installed in - [edit mode](https://setuptools.pypa.io/en/latest/userguide/development_mode.html). +`--spack-path` -### Activating the environment +: path to the Spack checkout used for deployment -Each time you want to work with polaris, you will need to run: +`--recreate` -```bash -source ./load____.sh -``` +: recreate deployment artifacts if they already exist + +`--bootstrap-only` + +: update only the bootstrap pixi environment used by deployment -This will load the appropriate conda environment, load system modules for -compilers, MPI and libraries needed to build and run E3SM components, and -set environment variables needed for E3SM components or polaris. 
It will also -set an environment variable `POLARIS_LOAD_SCRIPT` that points to the activation -script. Polaris uses this to make an symlink to the activation script called -`load_polaris_env.sh` in the work directory. When the load script is -executed from the base of the polaris repository (i.e., as -`source ./load____.sh`), -it will install the version of the `polaris` package from that location into -the associated conda environment. When the load script is executed from the -work directory through the symlink, it will activate the associated conda -environment, but does *not* install the `polaris` package into the conda -environment; it is assumed that is already up to date from when the conda -environment was created or last updated. - -It is generally recommended to activate the `polaris` environment (from -either the polaris repo or via the workdir symlink) from a -clean environment. Unexpected behavior may occur if activating a different -`polaris` environment after having one already activated. - -If you switch between different polaris branches, it is safest to rerun -`./configure_polaris_envs.py` with the same arguments as above to make -sure dependencies are up to date and the `polaris` package points to the -current directory. If you are certain that no polaris dependencies are -different between branches, you can also simply source the activation script -(`load_*.sh`) in the branch. - -Once you have sourced the activation script, you can run `polaris` commands -anywhere, and it always refers to that branch. To find out which branch you -are actually running `polaris` from, you should run: +`--mache-fork`, `--mache-branch`, `--mache-version` + +: test deployment against a specific mache fork/branch/version + +See `./deploy.py --help` for the full list. 
+ +### Activating the environment + +Each time you want to work with Polaris, source one of the generated scripts: ```bash -echo $POLARIS_LOAD_SCRIPT +source ./load_*.sh ``` -This will give you the path to the load script, which will also tell you where -the branch is. If you do not use the worktree approach, you will also need to -check what branch you are currently on with `git log`, `git branch` or -a similar command. +This activates the deployment environment, loads machine modules when +appropriate, and sets environment variables needed by Polaris and MPAS +components. -If you wish to work with another compiler, simply rerun the script with a new -compiler name and an activation script will be produced. You can then source -either activation script to get the same conda environment but with different -compilers and related modules. Make sure you are careful to set up polaris by -pointing to a version of the MPAS model that was compiled with the correct -compiler. +When working inside a task or suite work directory, source +`load_polaris_env.sh` (a symlink to the selected load script). ### Switching between different polaris environments @@ -287,50 +141,45 @@ Many developers are switching between different `polaris` branches. We have 2 main workflows for doing this: checking out different branches in the same directory (with `git checkout`) or creating new directories for each branch (with `git worktree`). Either way, you need to be careful that -the version of the `polaris` package that is installed in the conda +the version of the `polaris` package that is installed in the active environment you are using is the one you want. But how to handle it differs slightly between these workflows. If you are developing or using multiple `polaris` branches in the same directory (switching between them using `git checkout`), you will need to make sure you update your `polaris` environment after changing -branches. 
Often the branches you're developing will make use of the -same conda environment, because they are using the same -`polaris` version (so the dependencies aren't changing). The same -conda environment (e.g. `dev_polaris_`) can safely be used -with multiple branches if you explicitly reinstall the `polaris` package -you want to use into the conda environment *after* moving to a new branch. -You can do this by simply re-executing -`source ./load____.sh` +branches. If dependencies are unchanged, you can usually just re-source a +load script in the branch root. + +You can do this by re-executing +`source ./load_*.sh` from the *root of the repo* before proceeding. Similarly, if you are developing or using multiple `polaris` branches but you use a different directory for each (creating the directories with `git worktree`), you will need to make sure the version of the `polaris` package -in your conda environment is the one you want. +in your active environment is the one you want. If your branches use the same `polaris` version (so the dependencies -are the same), you can use the same conda environment -(e.g. `dev_polaris_`) for all of them. But you will only -be able to test one of them at a time. You will tell the conda environment -which branch to use by running -`source ./load____.sh` +are the same), you can use the same deployment prefix for all of them. +You will tell the environment which branch to use by running +`source ./load_*.sh` from the *root of the directory (worktree) you want to work with* before proceeding. -In both of these workflows, you can modify the `polaris` code and the conda +In both of these workflows, you can modify the `polaris` code and the environment will notice the changes as you make them. 
However, if you have added
or removed any files during your development, you need to source the load
script again:

-`source ./load____.sh`
+`source ./load_*.sh`

in the root of the repo or worktree so that the added or removed files will be
-accounted for in the conda environment.
+accounted for in the environment.

If you know that `polaris` has different dependencies in a branch or worktree
you are working on compared to a previous branch you have worked with (or if
you aren't sure), it is safest to not just reinstall the `polaris` package
but also to check the dependencies by re-running:
-`./configure_polaris_envs.py` with the same arguments as above.
+`./deploy.py` with the same arguments as above.
This will also reinstall the `polaris` package from the current directory.
The activation script includes a check to see if the version of polaris used
to produce the load script is the same as the version of polaris in the
@@ -345,24 +194,18 @@
__version__ = '0.2.0'

Your code is version:
__version__ = '0.3.0-alpha.1'

-You need to run ./configure_polaris_envs.py to update your conda
-environment and load script.
+You need to run ./deploy.py to update your environment and load script.
```

-If you need more than one conda environment (e.g. because you are testing
-multiple branches at the same time), you can choose your own name
-for the conda environment. Typically, this might be something related to the
-name of the branch you are developing. This can be done with the
-`--env_name` argument to `./configure_polaris_envs.py`. You
-can reuse the same custom-named environment across multiple branches
-if that is useful. Just remember to reinstall `polaris` each time you
-switch branches.
+If you need more than one environment (e.g. because you are testing
+multiple branches at the same time), use different deployment prefixes with
+`./deploy.py --prefix <prefix>`.
:::{note}
-If you switch branches and *do not* remember to recreate the conda
-environment (`./configure_polaris_envs.py`) or at least source the
+If you switch branches and *do not* remember to recreate the environment
+(`./deploy.py`) or at least source the
activation script (`load_*.sh`), you are likely to end up with
-an incorrect and possibly unusable `polaris` package in your conda
+an incorrect and possibly unusable `polaris` package in your
environment.

In general, if one wishes to switch between environments created for
@@ -375,7 +218,7 @@ all.
:::

:::{note}
-With the conda environment activated, you can switch branches and update
+With the environment activated, you can switch branches and update
just the `polaris` package with:

```bash
@@ -398,10 +241,11 @@ If you run into trouble with the environment or just want a clean start, you
can run:

```bash
-./configure_polaris_envs.py --conda <conda_path> -c <compiler> --recreate
+./deploy.py [--machine <machine>] [--compiler <compiler> ...] \
+    [--mpi <mpi> ...] [--deploy-spack] --recreate
```

-The `--recreate` flag will delete the conda environment and create it from
+The `--recreate` flag will delete the environment and create it from
scratch. This takes just a little extra time.

(dev-creating-only-env)=

@@ -410,13 +254,15 @@ scratch. This takes just a little extra time.

For some workflows (e.g. for MALI development with the Albany library when
the MALI build environment has been created outside of `polaris`, for example,
-on an unsupported machine), you may only want to create the conda environment
+on an unsupported machine), you may only want to create the pixi environment
and not build SCORPIO, ESMF or include any system modules or environment
-variables in your activation script. In such cases, run with the
-`--env_only` flag:
+variables in your activation script. In such cases, run `./deploy.py`
+without `--deploy-spack`.
+ +To update only the bootstrap environment used internally by deployment: ```bash -./configure_polaris_envs.py --conda --env_only ... +./deploy.py --bootstrap-only ``` Each time you want to work with polaris, you will need to run: @@ -425,7 +271,7 @@ Each time you want to work with polaris, you will need to run: source ./load_.sh ``` -This will load the appropriate conda environment for polaris. It will also +This will load the appropriate environment for polaris. It will also set an environment variable `POLARIS_LOAD_SCRIPT` that points to the activation script. Polaris uses this to make a symlink to the activation script called `load_polaris_env.sh` in the work directory. @@ -433,14 +279,14 @@ called `load_polaris_env.sh` in the work directory. If you switch to another branch, you will need to rerun: ```bash -./configure_polaris_envs.py --conda --env_only +./deploy.py ``` to make sure dependencies are up to date and the `polaris` package points to the current directory. :::{note} -With the conda environment activated, you can switch branches and update +With the environment activated, you can switch branches and update just the `polaris` package with: ```bash @@ -448,7 +294,7 @@ python -m pip install --no-deps --no-build-isolation -e . ``` This will be substantially faster than rerunning -`./configure_polaris_envs.py ...` but at the risk that dependencies are +`./deploy.py ...` but at the risk that dependencies are not up-to-date. Since dependencies change fairly rarely, this will usually be safe. ::: diff --git a/docs/developers_guide/seaice/index.md b/docs/developers_guide/seaice/index.md index e510f1163b..a7c39cf13a 100644 --- a/docs/developers_guide/seaice/index.md +++ b/docs/developers_guide/seaice/index.md @@ -39,7 +39,7 @@ processed = ${paths:component_path}/src/Registry_processed.xml # The executables section defines paths to required executables. These # executables are provided for use by specific tasks. 
Most tools that -# polaris needs should be in the conda environment, so this is only the path +# polaris needs should be in the deployment environment, so this is only the path # to the MPAS-Seaice executable by default. [executables] component = ${paths:component_path}/seaice_model diff --git a/docs/developers_guide/troubleshooting.md b/docs/developers_guide/troubleshooting.md index 1e596caf65..e6b28b855e 100644 --- a/docs/developers_guide/troubleshooting.md +++ b/docs/developers_guide/troubleshooting.md @@ -7,12 +7,12 @@ suggested solutions. (dev-troubleshooting-conda-solver)= -## Solver errors when configuring conda environment +## Solver errors when configuring deployment environment When setting up {ref}`dev-conda-env`, by calling: ```bash -./configure_polaris_env.sh ... +./deploy.py ... ``` you may run into an error like: @@ -40,11 +40,11 @@ The solution should be to recreate the environment rather than trying to update it: ```bash -./configure_polaris_env.sh --recreate ... +./deploy.py --recreate ... ``` -The `--recreate` flag will first delete the existing `dev_polaris_*` conda -environment before creating it again with the new set of packages required for +The `--recreate` flag will first delete existing deployment artifacts +before creating them again with the new set of packages required for developing with the requested compiler and MPI type. (dev-troubleshooting-proxy)= diff --git a/docs/developers_guide/updating_conda.md b/docs/developers_guide/updating_conda.md index e69670dc09..2ff2f42ac5 100644 --- a/docs/developers_guide/updating_conda.md +++ b/docs/developers_guide/updating_conda.md @@ -1,120 +1,49 @@ (dev-updating-conda)= -# Updating Conda Dependencies - -## 🏷️ Polaris Versioning Scheme - -Polaris is primarily a developer-focused project and is in a perpetual "alpha" -stage. Versions in the `main` branch always end with `-alpha.`. -Occasionally, releases are tagged without the `-alpha.` for documentation -and provenance. 
- -The `alpha` version is incremented each time Polaris' Conda dependencies -change. This signals to developers and deployment infrastructure that -environments created with an older alpha version may not be compatible with -the new code. - -## 🔄 Workflow for Dependency Changes - -When Conda dependencies are updated (added or version-bumped), the `alpha` -version must also be incremented. This ensures developers know when their -Conda environments are out of date and need to be recreated or updated. - -Each time a developer sources a Polaris load script (see {ref}`dev-conda-env`), -the script checks that the Polaris version (including `alpha`) matches the one -used to create the Conda environment. If not, an error message prompts the -developer to update or recreate the environment. - -Unless Spack dependencies are also changed (in which case you need to follow -the workflow in {ref}`dev-updating-spack`), there is no need to deploy shared -Spack environments—just update Conda dependencies, bump the `alpha` version, -make a pull request, and merge. - ---- - -## ⬆️ Bump the `alpha` version - -Unless you are updating share Spack environments, you need to start by -updating the Polaris version to the next `alpha` number in `polaris/version.py` -and committing the change. For example, change: -``` python -__version__ = '0.8.0-alpha.9' -``` -to -``` python -__version__ = '0.8.0-alpha.10' -``` -and commit this with a message like, "Update Polaris to v0.8.0-alpha.10". - ---- - -## ✍️ Updating dependencies - -There are two places you might need to make the changes to the dependencies, -depending on the package(s) involved. - -### `conda-dev-spec.template` - -The [Jinja template](https://jinja.palletsprojects.com/en/stable/) that defines -Polaris' Conda environments is -[`deploy/conda-dev-spec.template`](https://github.com/E3SM-Project/polaris/blob/main/deploy/conda-dev-spec.template). -This file is the easiest place to add new Conda dependencies. 
It is the right -place to add dependencies that you do not plan to pin to a specific version -and which will not also be installed in a shared Spack environment. - -Add your new dependency in the appropriate location: -- `# Base` - runtime Polaris dependencies not associated with development -- `# Static typing` - dependencies for Python type checking -- `# Linting and testing` - tools for linthing the code and running simple - unit tests -- `# Development` - other development requirements like MPI, CMake and - compilers, typically only used on unsupported machines such as laptops -- `# CF-compliance` - tools that are handy for checking - [CF compliance](https://cfconventions.org/). -- `# Documentation` - tools for building the documentation -- `# Visualization` - tools for quick visualization, not directly required - by any Polaris tasks or steps - -### `default.cfg` - -Several Conda packages have fixed versions defined in `deploy/default.cfg`. -Currently, these are: - -```cfg -python = 3.13 - -# versions of conda packages -geometric_features = 1.6.1 -mache = 1.31.0 -conda_moab = 5.5.1 -mpas_tools = 1.2.0 -otps = 2021.10 -parallelio = 2.6.6 -``` - -We choose to define these "pinned" versions in the config file so that -developers can override them with their own config file (which might be -convenient for testing) and so that we can easily reference these versions in -multiple places during the deployment workflow if needed. - -**Note:** We treat the MOAB package as a special case with different versions -from conda-forge (`conda_moab`) and Spack (`spack_moab`). This is because -MOAB has infrequent releases and, in the past, we have need features and bug -fixes that are only available from the `master` branch. Since the `master` -version is not available on conda-forge, when we need the latest MOAB features -on machines with spack support, we still must fall back on the latest release -for use on login nodes and unsupported machines. 
- -Some versions defined in `default.cfg` apply to both Conda and Spack package: - -```cfg -# versions of conda or spack packages (depending on machine type) -esmf = 8.8.1 -metis = 5.1.0 -netcdf_c = 4.9.2 -netcdf_fortran = 4.6.2 -pnetcdf = 1.14.0 -``` -If those get updated, you will probably need to deploy new shared Spack -environments, meaning you will need to follow the full -[Spack Deployment Workflow](updating_spack/workflow.md). +# Updating Deployment Dependencies + +Polaris now deploys environments through `./deploy.py` and `mache.deploy`. +This means dependency updates are handled through deployment templates and +configuration, not through the old standalone conda setup workflow. + +## Where to Update Dependencies + +Most dependency updates should be made in deployment templates under `deploy/`: + +- `deploy/pixi.toml.j2` for pixi-managed packages +- `deploy/spack.yaml.j2` for Spack package specs +- `deploy/config.yaml.j2` for deployment options and defaults +- `deploy/pins.cfg` for pinned mache/python versions used by `deploy.py` + +For background on how these files are rendered and used, see: + +- [mache deploy user guide](https://docs.e3sm.org/mache/main/users_guide/deploy.html) +- [mache deploy developer guide](https://docs.e3sm.org/mache/main/developers_guide/deploy.html) + +## Recommended Workflow + +1. Update the relevant template(s) in `deploy/`. +2. If dependency behavior changes, bump `polaris/version.py` as appropriate. +3. Re-run deployment locally to validate: + + ```bash + ./deploy.py --recreate + ``` + +4. If Spack dependencies changed, test with: + + ```bash + ./deploy.py --deploy-spack --recreate + ``` + +5. Run required suites/tests for your machine and component. + +If Spack dependencies changed in a way that affects shared machine deployments, +follow the full workflow in {ref}`dev-updating-spack`. + +## Notes + +- Miniforge/Micromamba/Miniconda are no longer required for deployment. +- `./deploy.py` can install pixi when needed. 
+- For machine-specific Spack updates, coordinate with maintainers via the
+  process described in {ref}`dev-updating-spack`.
diff --git a/docs/developers_guide/updating_spack/adding_new_machines.md b/docs/developers_guide/updating_spack/adding_new_machines.md
index b16b728071..015f886a06 100644
--- a/docs/developers_guide/updating_spack/adding_new_machines.md
+++ b/docs/developers_guide/updating_spack/adding_new_machines.md
@@ -37,7 +37,7 @@ After updating `mache`, you'll need to:

1. **Reference your `mache` branch in Polaris Deployment**

-   * Use the `--mache_fork` and `--mache_branch` flags to deploy using the
+   * Use the `--mache-fork` and `--mache-branch` flags to deploy using the
     updated branch
   * Confirm the new machine is recognized and templates are applied
     correctly
@@ -55,13 +55,12 @@ Use the standard test deployment approach from
[Deploying a new spack environment](testing/deploying_spack.md):

```bash
-./configure_polaris_envs.py --conda ~/miniforge3 \
-    --mache_fork <fork> \
-    --mache_branch <branch> \
-    --compiler <compiler> \
-    --mpi <mpi> \
-    --verbose \
-    --recreate
+./deploy.py --mache-fork <fork> \
+    --mache-branch <branch> \
+    --compiler <compiler> \
+    --mpi <mpi> \
+    --deploy-spack \
+    --recreate
```
You can also supply the `--machine` flag:
```
diff --git a/docs/developers_guide/updating_spack/deploying_shared_spack.md b/docs/developers_guide/updating_spack/deploying_shared_spack.md
index 7efe67bac5..92cda8fd99 100644
--- a/docs/developers_guide/updating_spack/deploying_shared_spack.md
+++ b/docs/developers_guide/updating_spack/deploying_shared_spack.md
@@ -40,23 +40,21 @@ remove bot PRs for updating pre-commit dependencies). Here is an

### 2. Deploy Shared Spack Environments on HPC Systems

Use the same process as during test deployment but you do *not* use the
-`--spack` flag to specify a test deployment location. For example:
+`--spack-path` flag to specify a test deployment location. 
For example: ```bash SCRATCH= -CONDA_BASE=~/miniforge3 -mdkir -p $SCRATCH/tmp_spack -./configure_polaris_envs.py \ - --conda $CONDA_BASE \ - --update_spack \ - --tmpdir $SCRATCH/tmp_spack \ +./deploy.py \ + --deploy-spack \ --compiler intel intel gnu \ --mpi openmpi impi openmpi \ - --recreate \ - --verbose + --recreate ``` -This creates a local activation scripts like: +If you need to control Spack temporary build location, set `spack.tmpdir` in +`deploy/config.yaml.j2`. + +This creates local activation scripts like: * `load_polaris_dev____.sh` diff --git a/docs/developers_guide/updating_spack/index.md b/docs/developers_guide/updating_spack/index.md index 475cf7a808..83987d50ae 100644 --- a/docs/developers_guide/updating_spack/index.md +++ b/docs/developers_guide/updating_spack/index.md @@ -5,16 +5,16 @@ This section documents the workflow for updating Polaris' shared Spack environments and for incrementing the Polaris version (typically the minor version, and rarely the major version). These updates are required when shared -Spack packages change, which is a more involved process than updating Conda +Spack packages change, which is a more involved process than updating pixi dependencies. -**Note:** If you are only updating Conda dependencies and bumping the `alpha` +**Note:** If you are only updating deployment dependencies and bumping the `alpha` version, follow the instructions in {ref}`dev-updating-conda` instead. Building Spack dependencies for each compiler and MPI library can take several hours, so we share Spack environments between developers on supported machines. As a result, Spack dependencies in Polaris are updated less frequently than -Conda dependencies, since the process is more involved and time-consuming. When +pixi/deployment dependencies, since the process is more involved and time-consuming. When Spack dependencies need to be updated, new versions of these shared environments must be built and deployed on all supported machines. 
@@ -26,7 +26,7 @@ situations where shared Spack packages have changed and a new release of the shared Spack environments is required. This process requires coordinated testing and deployment across supported HPC systems. -Athough Polaris shares a lot of its deployment infrastructure with +Although Polaris shares a lot of its deployment infrastructure with [E3SM-Unified](https://docs.e3sm.org/e3sm-unified/main/releasing/index.html) (such as the `mache` package and the E3SM Spack fork), the two packages have quite distinct workflows for updating version numbers as well as deploying diff --git a/docs/developers_guide/updating_spack/maintaining_past_versions.md b/docs/developers_guide/updating_spack/maintaining_past_versions.md index 96ef318d90..27897ec9f8 100644 --- a/docs/developers_guide/updating_spack/maintaining_past_versions.md +++ b/docs/developers_guide/updating_spack/maintaining_past_versions.md @@ -45,7 +45,7 @@ used only during internal testing and should be removed when they are no longer needed to free up disk space. They should be in the maintainers own scratch space in any case. -You can also delete your own Polaris conda environments whenever you need to +You can also delete your own Polaris deployment environments whenever you need to free up space for your own use. ### Intermediate Build Artifacts @@ -67,14 +67,14 @@ If a past version breaks due to: 1. Checkout the appropriate commit in the Polaris repo (perhaps the release tag, e.g. `0.7.0`) -2. Use `configure_polaris_envs.py` as usual, since Polaris will notice the +2. Use `deploy.py` as usual, since Polaris will notice the older version in `polaris/version.py`: ``` bash - ./configure_polaris_envs.py --conda ~/miniforge3 --recreate --update_spack ... + ./deploy.py --recreate --deploy-spack ... ``` -You may run into difficulty solving for older conda environments e.g. because +You may run into difficulty solving for older deployment environments e.g. because of missing system modules. 
At some point, it may simply not be possible to recreate older Polaris Spack environments because of this. diff --git a/docs/developers_guide/updating_spack/testing/deploying_spack.md b/docs/developers_guide/updating_spack/testing/deploying_spack.md index d5802aded6..b24a17b1d4 100644 --- a/docs/developers_guide/updating_spack/testing/deploying_spack.md +++ b/docs/developers_guide/updating_spack/testing/deploying_spack.md @@ -12,41 +12,34 @@ best practices for deploying and validating Spack environments in Polaris. ## Deployment Workflow -Deployment is managed via the `configure_polaris_envs.py` script and associated +Deployment is managed via `./deploy.py` (backed by `mache.deploy`) and associated infrastructure. The process is typically: 1. **Update configuration files**: - Set the target version in `polaris/version.py` - - Update Spack and Conda package versions in `deploy/default.cfg` + - Update package pins in `deploy/pins.cfg` + - Update Spack specs in `deploy/spack.yaml.j2` if package specs changed - Update machine configs in `polaris/machines/` as needed 2. **Test the build** on one or more HPC machines: ```bash SCRATCH= - CONDA_BASE=~/miniforge3 - mdkir -p $SCRATCH/tmp_spack - ./configure_polaris_envs.py \ - --conda $CONDA_BASE \ - --update_spack \ - --spack $SCRATCH/test_spack \ - --tmpdir $SCRATCH/tmp_spack \ - --compiler intel intel gnu \ - --mpi openmpi impi openmpi \ - --recreate \ - --verbose + mkdir -p $SCRATCH/tmp_spack + ./deploy.py --deploy-spack --spack-path $SCRATCH/test_spack \ + --compiler intel intel gnu --mpi openmpi impi openmpi --recreate ``` - *Adjust `--compiler` and `--mpi` as needed for your machine and test matrix.* + *Adjust `--compiler` and `--mpi` as needed for your machine and test matrix.* *You may want to use `screen` or `tmux` and pipe output to a log file:* ```bash - ./configure_polaris_envs.py ... 2>&1 | tee deploy.log + ./deploy.py ... 2>&1 | tee deploy.log ``` 3. 
**Check output** and validate: - Spack built the expected packages - - Conda environment was created and activated + - Pixi environment was created and activated - Activation scripts were generated and symlinked correctly - Permissions have been updated successfully @@ -58,43 +51,42 @@ infrastructure. The process is typically: ## Key Deployment Components -- **`configure_polaris_envs.py`**: Main entry point for deploying Polaris - environments. Handles both Conda and Spack setup. -- **`deploy/default.cfg`**: Specifies package versions and deployment options. -- **`deploy/shared.py`**: Shared logic for deployment scripts. -- **`deploy/bootstrap.py`**: Handles environment creation and Spack builds - after the bootstrap environment is set up. -- **Templates**: Jinja2 templates in `deploy/` and `deploy/spack/` for - environment specs and activation scripts. +- **`deploy.py`**: Main entry point for deploying Polaris environments. + Handles pixi deployment and optional Spack deployment through `mache.deploy`. +- **`deploy/pins.cfg`**: Pin versions for pixi and Spack packages. +- **`deploy/config.yaml.j2`**: Deployment behavior and machine/runtime + settings consumed by `mache.deploy`. +- **`deploy/spack.yaml.j2`**: Jinja2 template for Spack specs. +- **`deploy/hooks.py`**: Polaris-specific deployment hooks used by + `mache.deploy`. +- **Mache deploy docs**: authoritative behavior and option details: + --- ## Common Command-Line Flags -- `--conda `: Path to your Miniforge3 installation (required). -- `--update_spack`: Build or rebuild the Spack environment. -- `--spack `: Path to install Spack environments (for testing). -- `--tmpdir `: Temporary directory for Spack builds (recommended). +- `--deploy-spack`: Build or rebuild Spack environments. +- `--spack-path `: Path to Spack checkout used for deployment/testing. - `--compiler `: Specify compiler(s) to build for. - `--mpi `: Specify MPI library/libraries. 
-- `--with_albany`: Include Albany in the Spack environment - (see `albany_supported.txt` for supported combos). -- `--with_petsc`: Include PETSc and Netlib LAPACK (see `petsc_supported.txt`). - `--recreate`: Recreate environments even if they exist. -- `--mache_fork` and `--mache_branch`: Use a specific fork/branch of `mache` +- `--mache-fork` and `--mache-branch`: Use a specific fork/branch of `mache` (for co-development/testing). -- `--verbose`: Print all output to the terminal. -See `./configure_polaris_envs.py --help` for the full list. +See `./deploy.py --help` for the full list. + +If needed, set Spack temporary build location with `spack.tmpdir` in +`deploy/config.yaml.j2`. --- ## Notes and Best Practices -- Use your own Miniforge3 installation (not Miniconda or a shared install). -- Use a unique Spack install location for testing (`--spack`). -- Use a scratch or group directory for Spack's temporary build files - (`--tmpdir`). +- Use a unique Spack install location for testing (`--spack-path`). +- Use a scratch or group directory for Spack's temporary build files. +- Set `spack.tmpdir` in `deploy/config.yaml.j2` if you need to control + temporary build location. - Only deploy shared Spack environments after thorough testing. - Check `albany_supported.txt` and `petsc_supported.txt` for supported machine/compiler/MPI combos. 
diff --git a/docs/developers_guide/updating_spack/testing/overview.md b/docs/developers_guide/updating_spack/testing/overview.md index a83e37f515..4cf19ecedc 100644 --- a/docs/developers_guide/updating_spack/testing/overview.md +++ b/docs/developers_guide/updating_spack/testing/overview.md @@ -43,7 +43,7 @@ new release: ### 🚀 [Deploying Spack Environments on HPCs](deploying_spack.md) -* Use the `configure_polaris_envs.py` script and template infrastructure +* Use `./deploy.py` (`mache.deploy`) and template infrastructure * Build environments and activation scripts tailored to each system ### ✅ [Running Required Test Suites](running_test_suites.md) @@ -65,7 +65,7 @@ new release: ## Audience This section is primarily intended for Polaris maintainers and release -engineers. Familiarity with Spack, Conda, and HPC system environments is +engineers. Familiarity with Spack, pixi, and HPC system environments is assumed. ➡ Start with: [Updating the E3SM Spack Fork](updating_spack_fork.md) diff --git a/docs/developers_guide/updating_spack/testing/troubleshooting.md b/docs/developers_guide/updating_spack/testing/troubleshooting.md index 54f9dbfe0e..675f727eee 100644 --- a/docs/developers_guide/updating_spack/testing/troubleshooting.md +++ b/docs/developers_guide/updating_spack/testing/troubleshooting.md @@ -50,11 +50,11 @@ you have encountered and solutions you have found. --- -## 3. 🚫 Conda Environment Problems +## 3. 
🚫 Pixi Environment Problems ### Symptoms -* Conda fails to resolve dependencies +* Pixi fails to resolve dependencies * Environments install but are missing key packages ### Fixes @@ -126,7 +126,7 @@ When in doubt, remove and rebuild everything: ```bash rm -rf -./configure_polaris_envs.py --conda ~/miniforge3 --recreate +./deploy.py --recreate ``` This often resolves cases where previous state is interfering with a clean diff --git a/docs/developers_guide/updating_spack/updating_packages.md b/docs/developers_guide/updating_spack/updating_packages.md index c375aebf6b..74c106d1cf 100644 --- a/docs/developers_guide/updating_spack/updating_packages.md +++ b/docs/developers_guide/updating_spack/updating_packages.md @@ -1,245 +1,73 @@ # Updating Spack Dependencies -Updating Spack dependencies in Polaris is a key part of maintaining -compatibility with E3SM components and ensuring that all required system -libraries and tools are available for developers. Unlike Conda dependencies, -which are managed per-developer and updated more frequently, Spack environments -are shared among developers on supported machines and are updated less often -due to the time and coordination required. When Spack dependencies change—such -as when new versions of libraries like ESMF, MOAB, or SCORPIO are needed, or -when system modules are updated—a new version of the shared Spack environment -must be built and deployed across all supported platforms. This process also -typically involves incrementing the Polaris version (usually the minor version) -to track the change. The following workflow outlines the steps required to -update Spack dependencies in Polaris. - -## 🚩 Workflow for Updating Spack Dependencies - -1. **Create a Branch** - Start by creating a branch named `update-to-`, where `` - is the new Polaris version you are preparing (e.g., +Updating shared Spack dependencies in Polaris is part of the release workflow +for supported machines. 
Compared with pixi dependency updates, Spack updates +are heavier-weight because environments are shared and build times are often +long. + +Polaris now uses `./deploy.py` backed by `mache.deploy`. For deployment +behavior and templates, treat the mache docs as the source of truth: + +- [mache deploy user guide](https://docs.e3sm.org/mache/main/users_guide/deploy.html) +- [mache deploy developer guide](https://docs.e3sm.org/mache/main/developers_guide/deploy.html) + +## Workflow for Spack Dependency Changes + +1. **Create a version-update branch** + + Use a branch like `update-to-` (for example `update-to-0.9.0-alpha.1`). -2. **Bump the Polaris Version** - Update the version number in `polaris/version.py` to the new version. - Typically, when updating Spack dependencies, increment the *minor* version, - reset the *patch* version to `0`, and set the *alpha* version to `1`. - For example, if the current version is: - ```python - __version__ = '0.8.0-alpha.3' - ``` - change it to: - ```python - __version__ = '0.9.0-alpha.1' - ``` - Commit this change with a message like "Update Polaris to v0.9.0-alpha.1". - -3. **Update Spack Dependency Versions** - Edit `deploy/default.cfg` to update the versions of Spack dependencies as - needed. The relevant sections are: - ```cfg - # versions of conda or spack packages (depending on machine type) - esmf = 8.8.1 - metis = 5.1.0 - netcdf_c = 4.9.2 - netcdf_fortran = 4.6.2 - pnetcdf = 1.14.0 - - # versions of spack packages - albany = developcompass-2024-03-13 - # cmake newer than 3.23.0 needed for Trilinos - cmake = 3.23.0: - hdf5 = 1.14.6 - lapack = 3.9.1 - spack_moab = master - parmetis = 4.0.3 - petsc = 3.19.1 - scorpio = 1.7.0 - ``` +2. **Bump the Polaris version** -4. **Commit and Make a PR** - Commit your changes and make a pull request to the Polaris repo. This - will be used to keep track of the updated packages as well as the testing - and deployment process. 
You can use - [this example](https://github.com/E3SM-Project/polaris/pull/319) - as a template. Make sure to include: - * An **Updates:** section describing the packages (both Conda and Spack) - that are updated as well as a description of what the new version - provides or why it is needed. For example: - - ``` markdown - ## Updates: - - esmf v8.8.1 - - hdf5 v1.14.6 - - mache v1.31.0 -- brings in Aurora support and some related reorganization and clean-up - - moab master -- brings in bug fix related to remapping from cubed-sphere grids to MPAS meshes - - mpas_tools v1.1.0 -- brings in bug fix to barotropic streamfunciton - - parallelio v2.6.6 - - pnetcdf v1.14.0 - - scorpio v1.7.0 - ``` - - * A **Testing:** section with a checklist for each machine, compiler and - MPI variant that will be tested. This can also be a helpful place to - coordinate who will test what (if multiple maintainers are involved) and - to note any issues you are seeing (pointing to a new or existing issue - under [https://github.com/E3SM-Project/polaris/issues](https://github.com/E3SM-Project/polaris/issues)). For example: - - ``` markdown - ## Testing - - MPAS-Ocean with `pr`: - - [ ] chrysalis (@xylar) - - [ ] intel and openmpi - - [ ] gnu and openmpi - - [ ] frontier (@xylar) - - [ ] craygnu and mpich - - [ ] craycray and mpich - - [ ] pm-cpu (@xylar) - - [ ] gnu and mpich - - [ ] intel and mpich - still seeing https://github.com/E3SM-Project/polaris/issues/205 - - Omega CTests: - - [ ] chrysalis (@xylar) - - [ ] intel - - [ ] gnu - - [ ] frontier (@xylar) - - [ ] craygnu - - [ ] craygnu-mphipcc - - [ ] craycray - - [ ] craycray-mphipcc - - [ ] crayamd - - [ ] crayamd-mphipcc - - [ ] pm-cpu (@xylar) - - [ ] gnu - - [ ] intel - - [ ] pm-gpu (@xylar) - - [ ] gnugpu - ``` - - * A **Deployment:** section with another checklist for each machine, - compiler and MPI variant on all supported machines. 
Again, it will be - helpful to note who will do the deployemnt (and final testing) and any - issues that persist: - - ``` markdown - ## Deploying - - MPAS-Ocean with `pr`: - - [ ] chrysalis (@xylar) - - [ ] intel and openmpi - - [ ] gnu and openmpi - - [ ] frontier (@xylar) - - [ ] craygnu and mpich - - [ ] craycray and mpich - - [ ] pm-cpu (@xylar) - - [ ] gnu and mpich - - [ ] intel and mpich - still seeing https://github.com/E3SM-Project/polaris/issues/205 - - Omega CTests: - - [ ] chrysalis (@xylar) - - [ ] intel - - [ ] gnu - - [ ] frontier (@xylar) - - [ ] craygnu - - [ ] craygnu-mphipcc - - [ ] craycray - - [ ] craycray-mphipcc - - [ ] crayamd - - [ ] crayamd-mphipcc - - [ ] pm-cpu (@xylar) - - [ ] gnu - - [ ] intel - - [ ] pm-gpu (@xylar) - - [ ] gnugpu - ``` ---- - -## ➕ Adding a New Spack Package - -To add a new Spack package to the Polaris deployment, follow these steps: - -1. **Add the Package Version to `default.cfg`** - - In the `[deploy]` section of `deploy/default.cfg`, add a new entry for your - package with the desired version. For example: - ```ini - # versions of spack packages - mypackage = 1.2.3 - ``` + Update `polaris/version.py` to the target version and commit that change. -2. **Edit `bootstrap.py` to Add the Package to the Spack Specs** - - In `deploy/bootstrap.py`, you must: - - Read the version from the config, following the pattern used for other - packages: - ```python - mypackage = config.get('deploy', 'mypackage') - ``` - - Add the package to the list of Spack specs, following the approach used - for existing dependencies such as `esmf`, `metis`, `parmetis`, or - `scorpio`. For example: - ```python - if mypackage != 'None': - specs.append(f'mypackage@{mypackage}+mpi+shared') - ``` - Adjust the variant flags (`+mpi`, `+shared`, etc.) as appropriate for - your package. 
- - **Examples from existing packages:** - ```python - esmf = config.get('deploy', 'esmf') - metis = config.get('deploy', 'metis') - parmetis = config.get('deploy', 'parmetis') - scorpio = config.get('deploy', 'scorpio') - - ... - - if esmf != 'None': - specs.append(f'esmf@{esmf}+mpi+netcdf~pnetcdf~external-parallelio') - if metis != 'None': - specs.append(f'metis@{metis}+int64+real64~shared') - if parmetis != 'None': - specs.append(f'parmetis@{parmetis}+int64~shared') - if scorpio != 'None': - specs.append( - f'e3sm-scorpio@{scorpio}+mpi~timing~internal-timing~tools+malloc' - ) - ``` +3. **Update deployment inputs** + + - Update pinned versions in `deploy/pins.cfg`: + - `[spack]` for Spack-only pins + - `[all]` for pins shared by pixi and Spack + - Update Spack specs in `deploy/spack.yaml.j2` as needed + - Update deployment defaults/behavior in `deploy/config.yaml.j2` only if + deployment logic or paths need to change + +4. **Open a PR and track testing/deployment** -3. **Follow the Process for Existing Spack Dependencies** + Include: + - an **Updates** section listing changed versions and rationale + - a **Testing** checklist by machine/compiler/MPI + - a **Deployment** checklist for final shared deployment - Review how other Spack dependencies are handled in both `default.cfg` and - `bootstrap.py` to ensure consistency. This includes: - - Reading the version from the config. - - Adding the correct Spack spec string to the `specs` list. - - Handling any special environment variables or linking flags if needed. +5. **Run test deployments before shared deployment** + + Example: + + ```bash + ./deploy.py --deploy-spack --spack-path \ + --compiler --mpi --recreate + ``` -4. **Test and Document** +6. **Run required validation suites** - - Test the new package (as part of test deployments of Polaris) on all - supported machines and compilers. - - Document the addition in your PR, including the version. 
+ Follow the testing pages under `updating_spack/testing/` before final + deployment. -**Tips:** -- If your package requires special variants or dependencies, consult the Spack - documentation for the correct spec syntax. -- If the package is only needed on certain machines or for certain workflows, - consider making its inclusion conditional. For examples of this process, - see how the `--with_albany` and `--with_petsc` flags (defined in - `deploy/shared.py`) are used to include the `albany` and `petsc` packages, - respectively, in specialized Spack environments that include these libraries. +## Adding a New Spack Package ---- +To add a new package to Polaris Spack environments: -## 📝 Summary +1. Add or update the package pin in `deploy/pins.cfg` (`[spack]` or `[all]`). +2. Add the corresponding spec in `deploy/spack.yaml.j2` under `library` and/or + `software`, whichever is appropriate. +3. If inclusion should be conditional, express that condition in Jinja2 + template logic and document it in your PR. +4. Test on supported machine/compiler/MPI combinations. -- Create a branch for the update. -- Bump the Polaris version (minor version up, patch to 0, alpha to 1). -- Update Spack dependency versions in `deploy/default.cfg`. -- Commit, test, and deploy on all supported machines. -- Be aware of special handling for MOAB and SCORPIO. +## Summary -If you need to update Conda dependencies as well, see -[Updating Conda Dependencies](../updating_conda.md). +- Update version and deployment inputs (`deploy/pins.cfg`, `deploy/spack.yaml.j2`). +- Validate with `./deploy.py` test deployments. +- Coordinate final shared Spack deployment only after testing passes. +If deployment (pixi) dependencies also changed, follow +[Updating Deployment Dependencies](../updating_conda.md). 
diff --git a/docs/developers_guide/updating_spack/workflow.md b/docs/developers_guide/updating_spack/workflow.md index 76ae01b647..13dc0bf89f 100644 --- a/docs/developers_guide/updating_spack/workflow.md +++ b/docs/developers_guide/updating_spack/workflow.md @@ -3,11 +3,11 @@ The Polaris workflow for updating shared Spack environments typically follows this progression: -1. **[Updating Conda Dependencies](../updating_conda.md)** +1. **[Updating Deployment Dependencies](../updating_conda.md)** 2. **[Updating Spack Dependencies](updating_packages.md)** 3. **[Deployment and Testing](testing/overview.md)** 4. **[Adding a New Machine](adding_new_machines.md)** -5. **[Deploying the New Versione](deploying_shared_spack.md)** +5. **[Deploying the New Version](deploying_shared_spack.md)** 6. **[Maintaining Past Versions](maintaining_past_versions.md)** We begin with some background information, then each of these steps is detailed @@ -15,9 +15,9 @@ in its own page. See below for a high-level summary. --- -## Backgraound: How Conda and Spack Work Together in Polaris +## Background: How Pixi and Spack Work Together in Polaris -Why does Polaris use both Conda and Spack? What roles do they each serve? +Why does Polaris use both pixi and Spack? What roles do they each serve? Before you start, it's critical to understand how these two systems work together. @@ -25,11 +25,11 @@ together. --- -## 1. Updating Conda Dependencies +## 1. Updating Deployment Dependencies -Frequently, developers need to updating Conda dependencies at the same time +Frequently, developers need to update deployment dependencies at the same time that they are updating shared Spack environments. In such cases, follow the -same process you would if you were just updating Conda dependencies but there +same process you would if you were just updating deployment dependencies but there is no need to bump the `alpha` version or make a separate PR for those changes. 
🔗 [Read more](../updating_conda.md) @@ -38,7 +38,7 @@ is no need to bump the `alpha` version or make a separate PR for those changes. ## 2. Updating Spack Dependencies -Updates to shared Spack environments typcially occur when Polaris needs to +Updates to shared Spack environments typically occur when Polaris needs to support new Spack dependencies (e.g. a new MPI-base library or tool) or when new versions of existing Spack dependencies are required. Sometimes, new shared Spack environments are required because system modules have changed @@ -54,7 +54,7 @@ existing ones. Before full deployment, new versions of Polaris are installed on a subset of HPC platforms for iterative testing and validation. This stage often requires -updating updating `mache` to support new systems or changes in machine +updating `mache` to support new systems or changes in machine configurations, adding package versions to E3SM's Spack fork, and troubleshooting deployment scripts. @@ -75,7 +75,7 @@ provide notes on adding new HPCs that are specific to Polaris. ## 6. Deploying the Shared Spack Environments -Once test deployments have been made and the rquired test suites are passing: +Once test deployments have been made and the required test suites are passing: * Deploy across all supported HPC machines * Merge the version-update PR diff --git a/docs/tutorials/dev_add_category_of_tasks/getting_started.md b/docs/tutorials/dev_add_category_of_tasks/getting_started.md index 75262fc113..289f5d3871 100644 --- a/docs/tutorials/dev_add_category_of_tasks/getting_started.md +++ b/docs/tutorials/dev_add_category_of_tasks/getting_started.md @@ -14,25 +14,23 @@ cd add-my-overflow git checkout -b add-my-overflow ``` -Next, create a conda environment for developing Polaris, as described in +Next, create a local deployment environment for developing Polaris, as +described in {ref}`dev-conda-env`. 
We'll assume you're working on a supported machine and using the default compilers and MPI libraries, but consult the documentation if you need a custom environment. ```bash # This may take a while the first time -./configure_polaris_envs.py --conda $HOME/miniforge3 --verbose +./deploy.py --deploy-spack ``` -If you don't already have [miniforge3](https://github.com/conda-forge/miniforge) -installed at the location specified by `--conda`, it will be installed -automatically. +For deployment details and options, see the +[mache deploy user guide](https://docs.e3sm.org/mache/main/users_guide/deploy.html). ```{note} -If you already have [Miniconda](https://docs.conda.io/en/latest/miniconda.html) -installed, you can use that as well. However, we recommend Miniforge3, as it -comes with important tools and configuration options set up as needed for -Polaris. +Miniforge, Micromamba and Miniconda are no longer required for Polaris +deployment. If pixi is not already available, `./deploy.py` can install it. ``` After setup, you should have a file named `load_dev_polaris_*.sh`, where `*` diff --git a/docs/tutorials/dev_add_category_of_tasks/overview.md b/docs/tutorials/dev_add_category_of_tasks/overview.md index 5953eb24cf..0d219e55b5 100644 --- a/docs/tutorials/dev_add_category_of_tasks/overview.md +++ b/docs/tutorials/dev_add_category_of_tasks/overview.md @@ -7,7 +7,7 @@ outline of the tutorial, with each step linked to its detailed instructions. 1. [Getting Started](getting_started.md) Set up your development environment, clone the Polaris repository, create a - new branch, set up a conda environment, and obtain the necessary E3SM + new branch, set up a local deployment environment, and obtain the necessary E3SM submodules. 2. 
[Making a New Category of Tasks](creating_category_of_tasks.md) diff --git a/docs/users_guide/config_files.md b/docs/users_guide/config_files.md index a9b46fbe8e..44089f8da5 100644 --- a/docs/users_guide/config_files.md +++ b/docs/users_guide/config_files.md @@ -235,7 +235,7 @@ init = /home/xylar/code/polaris/customize_config_parser/E3SM-Project/components/ # The executables section defines paths to required executables. These # executables are provided for use by specific tasks. Most tools that -# polaris needs should be in the conda environment, so this is only the path +# polaris needs should be in the deployment environment, so this is only the path # to the MPAS-Ocean executable by default. [executables] diff --git a/docs/users_guide/invalid_quick_start.md b/docs/users_guide/invalid_quick_start.md index f03b75f09b..7d7abcb489 100644 --- a/docs/users_guide/invalid_quick_start.md +++ b/docs/users_guide/invalid_quick_start.md @@ -10,14 +10,13 @@ documentation as soon as there is one. Until then please refer to the (conda-env)= -## Loading polaris conda and spack environments +## Loading polaris deployment and spack environments ### E3SM supported machines -For each polaris release, we maintain a -[conda environment](https://docs.conda.io/en/latest/). 
that includes the -`polaris` package as well as all of its dependencies and some libraries -(currently [ESMF](https://earthsystemmodeling.org/), +For each polaris release, we maintain deployment environments that include the +`polaris` package and its dependencies, together with libraries +(including [ESMF](https://earthsystemmodeling.org/), [MOAB](https://sigma.mcs.anl.gov/moab-library/) and [SCORPIO](https://e3sm.org/scorpio-parallel-io-library/)) built with system MPI using [spack](https://spack.io/) on our standard machines (Aurora, @@ -48,46 +47,27 @@ source /lcrc/soft/climate/polaris/chrysalis/load_latest_polaris_gnu_openmpi.sh ### Other machines -Once it is released, you will be able to install polaris from a conda package. -To install your own polaris conda environment on non-E3SM-supported machines, -first, install [Miniforge3](https://github.com/conda-forge/miniforge#miniforge3) -if you don't already have it. Then, create a new conda environment (called -`polaris` in this example) as follows: +On non-E3SM-supported machines, deploy Polaris from a source checkout with +`./deploy.py` (rather than creating a separate conda environment manually): ```bash -conda create -n polaris -c conda-forge -c e3sm/label/polaris python=3.13 \ - "polaris=*=mpi_mpich*" +./deploy.py --compiler gnu --mpi mpich ``` -This will install the version of the package with MPI from conda-forge's MPICH -package. If you want OpenMPI, use `"polaris=*=mpi_openmpi*"` instead. If -you do not want MPI from conda-forge (e.g. 
because you are working with a -system with its own MPI), use `"polaris=*=nompi*"` +For additional options and deployment behavior, see: -To get a specific version of polaris, you can instead run: +- [mache deploy user guide](https://docs.e3sm.org/mache/main/users_guide/deploy.html) +- [mache deploy quick start](https://docs.e3sm.org/mache/main/users_guide/quick_start.html) -```bash -conda create -n polaris -c conda-forge -c e3sm/label/polaris python=3.13 \ - "polaris=1.0.0=mpi_mpich*" -``` - -That is, you will replace `polaris=*` with `polaris=1.0.0`. +If pixi is not already installed, `./deploy.py` can install it. -Then, you will need to create a load script to activate the conda environment -and set some environment variables. In a directory where you want to store the -script, run: - -```bash -conda activate polaris -create_polaris_load_script -``` +Once deployment completes, use the generated load script (for example +`load_*.sh`) to activate your environment. -From then on, each time you want to set up tasks or suites with polaris -or build MPAS components, you will need to source that load script, for -example: +To deploy against a specific mache branch for testing, use: ```bash -source load_polaris_1.0.0_mpich.sh +./deploy.py --mache-fork --mache-branch ``` When you set up tasks, a link called `load_polaris_env.sh` will be added to diff --git a/docs/users_guide/machines/aurora.md b/docs/users_guide/machines/aurora.md index ac17d332e9..8a203495c3 100644 --- a/docs/users_guide/machines/aurora.md +++ b/docs/users_guide/machines/aurora.md @@ -14,7 +14,7 @@ Here is a link to the ## config options Here are the default config options added when you have configured Polairs on -a Aurora login node (or specified `./configure_polaris_envs.py -m aurora`): +a Aurora login node (or specified `./deploy.py --machine aurora`): ```cfg # The paths section describes paths for data and environments @@ -23,12 +23,12 @@ a Aurora login node (or specified 
`./configure_polaris_envs.py -m aurora`): # A shared root directory where polaris data can be found database_root = /lus/flare/projects/E3SM_Dec/polaris -# the path to the base conda environment where polars environments have +# the path to deployed Polaris environments # been created polaris_envs = /lus/flare/projects/E3SM_Dec/soft/polaris/aurora/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying Polaris and Spack environments [deploy] # the compiler set to use for system libraries and MPAS builds diff --git a/docs/users_guide/machines/chrysalis.md b/docs/users_guide/machines/chrysalis.md index bf0235d6f9..8ec4c06fbd 100644 --- a/docs/users_guide/machines/chrysalis.md +++ b/docs/users_guide/machines/chrysalis.md @@ -15,12 +15,12 @@ suite: # A shared root directory where polaris data can be found database_root = /lcrc/group/e3sm/public_html/polaris -# the path to the base conda environment where polars environments have +# the path to deployed Polaris environments # been created polaris_envs = /lcrc/soft/climate/polaris/chrysalis/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying Polaris and Spack environments [deploy] # the compiler set to use for system libraries and MPAS builds diff --git a/docs/users_guide/machines/frontier.md b/docs/users_guide/machines/frontier.md index 3ed4b669b6..ec4a19cbb2 100644 --- a/docs/users_guide/machines/frontier.md +++ b/docs/users_guide/machines/frontier.md @@ -18,7 +18,7 @@ Here is a link to the ## config options Here are the default config options added when you have configured Polairs on -a Frontier login node (or specified `./configure_polaris_envs.py -m frontier`): +a Frontier login node (or specified `./deploy.py --machine frontier`): ```cfg # The paths section describes paths for data and environments @@ -27,12 +27,12 @@ a Frontier login node (or specified `./configure_polaris_envs.py -m frontier`): # A shared 
root directory where polaris data can be found database_root = /lustre/orion/cli115/world-shared/polaris -# the path to the base conda environment where polaris environments have +# the path where deployed Polaris environments have # been created polaris_envs = /ccs/proj/cli115/software/polaris/frontier/conda/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying Polaris and Spack environments [deploy] # the compiler set to use for system libraries and MPAS builds diff --git a/docs/users_guide/machines/index.md b/docs/users_guide/machines/index.md index 70ddb75647..7b09a17f7f 100644 --- a/docs/users_guide/machines/index.md +++ b/docs/users_guide/machines/index.md @@ -21,12 +21,12 @@ The config options typically defined for a machine are: # A shared root directory where MPAS standalone data can be found database_root = /lcrc/group/e3sm/public_html/mpas_standalonedata -# the path to the base conda environment where polaris environments have +# the path where deployed Polaris environments have # been created polaris_envs = /lcrc/soft/climate/polaris/chrysalis/base -# Options related to deploying a polaris conda environment on supported +# Options related to deploying Polaris environments on supported # machines [deploy] @@ -50,11 +50,11 @@ use_e3sm_hdf5_netcdf = True The `paths` section provides local paths to the root of the "databases" (local caches) of data files for each MPAS core. These are generally in a shared location for the project to save space. Similarly, `polaris_envs` -is a location where shared conda environments will be created for polaris +is a location where shared deployed environments will be created for polaris releases for users to share. The `deploy` section is used to help polaris create development and -release conda environments and activation scripts.
It says which compiler set is the default, which MPI library is the default for each supported compiler, and where libraries built with system MPI will be placed. @@ -184,46 +184,14 @@ If the path doesn't exist, polaris will create it. If you're not working on an HPC machine, you will probably not have multiple nodes or {ref}`slurm`. You will probably install [MPICH](https://www.mpich.org/) or [OpenMPI](https://www.open-mpi.org/), -probably via a -[conda environment](https://docs.conda.io/projects/conda/en/latest/index.html). -In this case, the `parallel_executable` is `mpirun`. +typically through your deployed pixi environment. In this case, the +`parallel_executable` is usually `mpirun`. -To install the `polaris` package into a conda environment, you will first -need to install [Miniforge3](https://github.com/conda-forge/miniforge#miniforge3) -(if it is not already installed). Then, you will run one of the following -three commands, depending on how you would like to handle MPI support in the -conda packages. +To deploy Polaris for your repo checkout, run `./deploy.py` from the repo +root. For deployment details and options, see: -## MPICH +- [mache deploy user guide](https://docs.e3sm.org/mache/main/users_guide/deploy.html) +- [mache deploy quick start](https://docs.e3sm.org/mache/main/users_guide/quick_start.html) -To create a conda environment called "polaris" with MPI from the `mpich` -package, run: - -```bash -conda create -n polaris -c conda-forge -c e3sm/label/polaris python=3.10 "polaris=*=mpi_mpich*" -``` - -This is the recommended default for single-node Linux and OSX machines. 
- -## OpenMPI - -To create a conda environment called "polaris" with MPI from the `openmpi` -package, run: - -```bash -conda create -n polaris -c conda-forge -c e3sm/label/polaris python=3.10 "polaris=*=mpi_openmpi*" -``` - -## No MPI from conda-forge - -To create a conda environment called "polaris" without any MPI package from -conda-forge, run: - -```bash -conda create -n polaris -c conda-forge -c e3sm/label/polaris python=3.10 "polaris=*=nompi*" -``` - -This would be the starting point for working with polaris on an unknown -HPC machine. From there, you would also need to load modules and set -environment variables so that MPAS components can be built with system NetCDF, -pNetCDF and SCORPIO. This will likely require working with an MPAS developer. +For unsupported machines, you may need to add or customize machine +configuration before deployment can fully configure compiler/MPI settings. diff --git a/docs/users_guide/machines/perlmutter.md b/docs/users_guide/machines/perlmutter.md index 0bd005cab2..164dacc8a1 100644 --- a/docs/users_guide/machines/perlmutter.md +++ b/docs/users_guide/machines/perlmutter.md @@ -46,12 +46,12 @@ setting up test cases or a suite: # A shared root directory where polaris data can be found database_root = /global/cfs/cdirs/e3sm/polaris -# the path to the base conda environment where polaris environments have +# the path where deployed Polaris environments have # been created polaris_envs = /global/common/software/e3sm/polaris/pm-cpu/conda/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying Polaris and Spack environments [deploy] # the compiler set to use for system libraries and MPAS builds @@ -138,12 +138,12 @@ setting up test cases or a suite: # A shared root directory where polaris data can be found database_root = /global/cfs/cdirs/e3sm/polaris -# the path to the base conda environment where polaris environments have +# the path where deployed Polaris environments have # been created
polaris_envs = /global/common/software/e3sm/polaris/pm-gpu/conda/base -# Options related to deploying a polaris conda and spack environments +# Options related to deploying Polaris and Spack environments [deploy] # the compiler set to use for system libraries and MPAS builds From c4f56e39087339b8e15c5aa7a3e9850a5aa9bd1e Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 25 Feb 2026 14:11:01 +0000 Subject: [PATCH 15/39] Move to mache.parallel in Polaris framework --- polaris/component.py | 93 +++++++++++++++++++++ polaris/job/__init__.py | 84 ++++++++++++++----- polaris/logging.py | 4 +- polaris/model_step.py | 38 ++++++++- polaris/parallel/__init__.py | 50 ------------ polaris/parallel/login.py | 33 -------- polaris/parallel/pbs.py | 138 -------------------------------- polaris/parallel/single_node.py | 35 -------- polaris/parallel/slurm.py | 72 ----------------- polaris/parallel/system.py | 36 --------- polaris/run/serial.py | 24 +++--- polaris/setup.py | 34 ++++++-- polaris/step.py | 53 ++++++++++++ 13 files changed, 289 insertions(+), 405 deletions(-) delete mode 100644 polaris/parallel/__init__.py delete mode 100644 polaris/parallel/login.py delete mode 100644 polaris/parallel/pbs.py delete mode 100644 polaris/parallel/single_node.py delete mode 100644 polaris/parallel/slurm.py delete mode 100644 polaris/parallel/system.py diff --git a/polaris/component.py b/polaris/component.py index d30458d85c..7a161ba4e3 100644 --- a/polaris/component.py +++ b/polaris/component.py @@ -1,8 +1,13 @@ import importlib.resources as imp_res import json +import os import xarray as xr +from mache.parallel import ParallelSystem, get_parallel_system from mpas_tools.io import write_netcdf +from mpas_tools.logging import check_call + +from polaris.config import PolarisConfigParser class Component: @@ -48,8 +53,96 @@ def __init__(self, name): self.configs = dict() self.cached_files = dict() + self.parallel_system: ParallelSystem | None = None self._read_cached_files() + def 
set_parallel_system(self, config: PolarisConfigParser) -> None: + """ + Construct and store the active parallel system for this component + + Parameters + ---------- + config : polaris.config.PolarisConfigParser + The config to use in constructing the parallel system + """ + if config.combined is None: + config.combine() + assert config.combined is not None + self.parallel_system = get_parallel_system(config.combined) + + def get_available_resources(self): + """ + Get available resources from the active parallel system + + Returns + ------- + available_resources : dict + Available CPU and GPU resources and machine capabilities + """ + if self.parallel_system is None: + raise ValueError( + f'Parallel system has not been set for component {self.name}' + ) + + return dict( + cores=self.parallel_system.cores, + nodes=self.parallel_system.nodes, + cores_per_node=self.parallel_system.cores_per_node, + gpus=self.parallel_system.gpus, + gpus_per_node=self.parallel_system.gpus_per_node, + mpi_allowed=self.parallel_system.mpi_allowed, + ) + + def run_parallel_command( + self, + args, + cpus_per_task, + ntasks, + openmp_threads, + logger, + gpus_per_task=0, + ): + """ + Run a command using the active parallel system + + Parameters + ---------- + args : list of str + Command line arguments for the executable + + cpus_per_task : int + Number of CPUs per task + + ntasks : int + Number of parallel tasks + + openmp_threads : int + Number of OpenMP threads + + logger : logging.Logger + Logger to output command-line execution info + + gpus_per_task : int, optional + Number of GPUs per task + """ + if self.parallel_system is None: + raise ValueError( + f'Parallel system has not been set for component {self.name}' + ) + + env = dict(os.environ) + env['OMP_NUM_THREADS'] = f'{openmp_threads}' + if openmp_threads > 1: + logger.info(f'Running with {openmp_threads} OpenMP threads') + + command_line_args = self.parallel_system.get_parallel_command( + args=args, + ntasks=ntasks, + 
cpus_per_task=cpus_per_task, + gpus_per_task=gpus_per_task, + ) + check_call(command_line_args, logger, env=env) + def add_task(self, task): """ Add a task to the component diff --git a/polaris/job/__init__.py b/polaris/job/__init__.py index 4f72754982..56f35b72e6 100644 --- a/polaris/job/__init__.py +++ b/polaris/job/__init__.py @@ -3,6 +3,7 @@ import numpy as np from jinja2 import Template as Template +from mache.parallel import get_parallel_system def write_job_script( @@ -12,6 +13,8 @@ def write_job_script( nodes=None, target_cores=None, min_cores=None, + target_gpus=None, + min_gpus=None, suite='', script_filename=None, run_command=None, @@ -42,6 +45,14 @@ def write_job_script( The minimum number of cores for the job to use if ``nodes`` not provided + target_gpus : int, optional + The target number of GPUs for the job to use if ``nodes`` not + provided + + min_gpus : int, optional + The minimum number of GPUs for the job to use if ``nodes`` not + provided + suite : str, optional The name of the suite @@ -53,21 +64,45 @@ def write_job_script( The command(s) to run in the job script. If not provided, defaults to 'polaris serial {{suite}}'. """ + if config.combined is None: + config.combine() + assert config.combined is not None + parallel_system = get_parallel_system(config.combined) if config.has_option('parallel', 'account'): account = config.get('parallel', 'account') else: account = '' + cores_per_node = parallel_system.get_config_int('cores_per_node') + gpus_per_node = parallel_system.get_config_int('gpus_per_node', default=0) + if nodes is None: if target_cores is None or min_cores is None: raise ValueError( 'If nodes is not provided, both target_cores and min_cores ' 'must be provided.' 
) - cores_per_node = config.getint('parallel', 'cores_per_node') - cores = np.sqrt(target_cores * min_cores) - nodes = int(np.ceil(cores / cores_per_node)) + + use_gpu_nodes = ( + gpus_per_node > 0 + and target_gpus is not None + and min_gpus is not None + and max(target_gpus, min_gpus) > 0 + ) + if use_gpu_nodes: + gpus = np.sqrt(target_gpus * min_gpus) + nodes = int(np.ceil(gpus / gpus_per_node)) + nodes = max(nodes, 1) + else: + if cores_per_node is None: + raise ValueError( + 'cores_per_node must be set when computing nodes from ' + 'CPU resources' + ) + cores = np.sqrt(target_cores * min_cores) + nodes = int(np.ceil(cores / cores_per_node)) + nodes = max(nodes, 1) # Determine parallel system type system = ( @@ -80,7 +115,7 @@ def write_job_script( if system == 'slurm': partition, qos, constraint, gpus_per_node, wall_time = ( - get_slurm_options(config, machine, nodes) + get_slurm_options(config, machine, nodes, parallel_system) ) template_name = 'job_script.slurm.template' render_kwargs.update( @@ -92,7 +127,7 @@ def write_job_script( ) elif system == 'pbs': queue, constraint, gpus_per_node, wall_time, filesystems = ( - get_pbs_options(config, machine, nodes) + get_pbs_options(config, machine, nodes, parallel_system) ) template_name = 'job_script.pbs.template' render_kwargs.update( @@ -134,7 +169,7 @@ def write_job_script( handle.write(text) -def get_slurm_options(config, machine, nodes): +def get_slurm_options(config, machine, nodes, parallel_system): """ Get Slurm options for job submission. @@ -171,13 +206,14 @@ def get_slurm_options(config, machine, nodes): config, machine, nodes, + parallel_system, partition_or_queue_option='partition', partitions_or_queues='partitions', ) return partition, qos, constraint, gpus_per_node, wall_time -def get_pbs_options(config, machine, nodes): +def get_pbs_options(config, machine, nodes, parallel_system): """ Get PBS options for job submission. 
@@ -212,6 +248,7 @@ def get_pbs_options(config, machine, nodes): config, machine, nodes, + parallel_system, partition_or_queue_option='queue', partitions_or_queues='queues', ) @@ -220,7 +257,12 @@ def get_pbs_options(config, machine, nodes): def _get_job_options( - config, machine, nodes, partition_or_queue_option, partitions_or_queues + config, + machine, + nodes, + parallel_system, + partition_or_queue_option, + partitions_or_queues, ): """ Helper to get job options for slurm or pbs @@ -244,32 +286,34 @@ def _get_job_options( wall_time : str filesystems : str """ - par_section = config['parallel'] job_section = config['job'] partition_or_queue = job_section.get(partition_or_queue_option) if partition_or_queue == '<<>>': - if par_section.has_option(partitions_or_queues): - # get the first, which is the default - partition_or_queue = par_section.getlist(partitions_or_queues)[0] + value = parallel_system.get_config(partitions_or_queues) + if value is not None and value != '': + partition_or_queue = _parse_list(value)[0] else: partition_or_queue = '' qos = job_section.get('qos') if qos == '<<>>': - if par_section.has_option('qos'): - qos = par_section.getlist('qos')[0] + value = parallel_system.get_config('qos') + if value is not None and value != '': + qos = _parse_list(value)[0] else: qos = '' constraint = job_section.get('constraint') if constraint == '<<>>': - if par_section.has_option('constraints'): - constraint = par_section.getlist('constraints')[0] + value = parallel_system.get_config('constraints') + if value is not None and value != '': + constraint = _parse_list(value)[0] else: constraint = '' - if par_section.has_option('gpus_per_node'): - gpus_per_node = par_section.get('gpus_per_node') + gpus_per_node_value = parallel_system.get_config('gpus_per_node') + if gpus_per_node_value is not None: + gpus_per_node = str(gpus_per_node_value) else: gpus_per_node = '' @@ -288,3 +332,7 @@ def _get_job_options( wall_time, filesystems, ) + + +def _parse_list(value): 
+ return [entry.strip() for entry in value.split(',') if entry.strip() != ''] diff --git a/polaris/logging.py b/polaris/logging.py index db1c073e72..dc4da04651 100644 --- a/polaris/logging.py +++ b/polaris/logging.py @@ -71,8 +71,8 @@ def log_function_call(function, logger): """ Log the module path and file path of a call to a function, e.g.:: - polaris calling: polaris.parallel.set_cores_per_node() - in /home/xylar/code/polaris/polaris/polaris/parallel.py + polaris calling: polaris.component.Component.run_parallel_command() + in /home/xylar/code/polaris/polaris/polaris/component.py Parameters ---------- diff --git a/polaris/model_step.py b/polaris/model_step.py index 09c4367818..ad522ff6d8 100644 --- a/polaris/model_step.py +++ b/polaris/model_step.py @@ -82,6 +82,8 @@ def __init__( ntasks=None, min_tasks=None, openmp_threads=None, + gpus_per_task=0, + min_gpus_per_task=0, max_memory=None, cached=False, namelist=None, @@ -127,6 +129,12 @@ def __init__( openmp_threads : int, optional the number of OpenMP threads to use + gpus_per_task : int, optional + the number of GPUs per task to use + + min_gpus_per_task : int, optional + the minimum number of GPUs per task required + max_memory : int, optional the amount of memory that the step is allowed to use in MB. This is currently just a placeholder for later use with task @@ -179,6 +187,8 @@ def __init__( ntasks=ntasks, min_tasks=min_tasks, openmp_threads=openmp_threads, + gpus_per_task=gpus_per_task, + min_gpus_per_task=min_gpus_per_task, max_memory=max_memory, cached=cached, ) @@ -237,7 +247,13 @@ def setup(self): ] def set_model_resources( - self, ntasks=None, min_tasks=None, openmp_threads=None, max_memory=None + self, + ntasks=None, + min_tasks=None, + openmp_threads=None, + gpus_per_task=None, + min_gpus_per_task=None, + max_memory=None, ): """ Update the resources for the step. 
This can be done within init, @@ -261,6 +277,12 @@ def set_model_resources( openmp_threads : int, optional the number of OpenMP threads to use + gpus_per_task : int, optional + the number of GPUs per task to use + + min_gpus_per_task : int, optional + the minimum number of GPUs per task required + max_memory : int, optional the amount of memory that the step is allowed to use in MB. This is currently just a placeholder for later use with task @@ -272,6 +294,8 @@ def set_model_resources( ntasks=ntasks, min_tasks=min_tasks, openmp_threads=openmp_threads, + gpus_per_task=gpus_per_task, + min_gpus_per_task=min_gpus_per_task, max_memory=max_memory, ) @@ -591,11 +615,18 @@ def update_io_tasks_config(self, config_model=None): If config options are available for multiple models, the model that the config options are from. """ - config = self.config - cores = self.ntasks * self.cpus_per_task - cores_per_node = config.getint('parallel', 'cores_per_node') + parallel_system = self.component.parallel_system + if parallel_system is None: + raise ValueError( + f'Parallel system has not been set for component ' + f'{self.component.name}' + ) + cores_per_node = parallel_system.get_config_int('cores_per_node') + if cores_per_node is None: + raise ValueError('cores_per_node must be set in parallel config') + cores = self.ntasks * self.cpus_per_task # update IO tasks based on machine settings and the available cores pio_num_iotasks = int(np.ceil(cores / cores_per_node)) diff --git a/polaris/parallel/__init__.py b/polaris/parallel/__init__.py deleted file mode 100644 index e682aecda9..0000000000 --- a/polaris/parallel/__init__.py +++ /dev/null @@ -1,50 +0,0 @@ -import os - -from mpas_tools.logging import check_call - -from polaris.parallel.login import LoginSystem -from polaris.parallel.pbs import PbsSystem -from polaris.parallel.single_node import SingleNodeSystem -from polaris.parallel.slurm import SlurmSystem - - -def _get_system(config): - system = config.get('parallel', 'system') - if system == 'slurm': - if 'SLURM_JOB_ID'
not in os.environ: - system = 'login' - if system == 'slurm': - return SlurmSystem(config) - elif system == 'pbs': - return PbsSystem(config) - elif system == 'single_node': - return SingleNodeSystem(config) - elif system == 'login': - return LoginSystem(config) - else: - raise ValueError(f'Unexpected parallel system: {system}') - - -def get_available_parallel_resources(config): - return _get_system(config).get_available_resources() - - -def set_cores_per_node(config, cores_per_node): - _get_system(config).set_cores_per_node(cores_per_node) - - -def run_command(args, cpus_per_task, ntasks, openmp_threads, config, logger): - env = dict(os.environ) - env['OMP_NUM_THREADS'] = f'{openmp_threads}' - if openmp_threads > 1: - logger.info(f'Running with {openmp_threads} OpenMP threads') - command_line_args = get_parallel_command( - args, cpus_per_task, ntasks, config - ) - check_call(command_line_args, logger, env=env) - - -def get_parallel_command(args, cpus_per_task, ntasks, config): - return _get_system(config).get_parallel_command( - args, cpus_per_task, ntasks - ) diff --git a/polaris/parallel/login.py b/polaris/parallel/login.py deleted file mode 100644 index fecc8710d7..0000000000 --- a/polaris/parallel/login.py +++ /dev/null @@ -1,33 +0,0 @@ -import multiprocessing - -from polaris.parallel.system import ParallelSystem - - -class LoginSystem(ParallelSystem): - """Resource manager for login nodes (no parallel execution).""" - - def get_available_resources(self): - config = self.config - cores = min( - multiprocessing.cpu_count(), - config.getint('parallel', 'login_cores'), - ) - available = dict( - cores=cores, - nodes=1, - cores_per_node=cores, - mpi_allowed=False, - ) - if config.has_option('parallel', 'gpus_per_node'): - available['gpus_per_node'] = config.getint( - 'parallel', 'gpus_per_node' - ) - return available - - def set_cores_per_node(self, cores_per_node): - # No-op for login system - pass - - def get_parallel_command(self, args, cpus_per_task, ntasks): - 
# Not supported for login system - raise ValueError('Parallel execution is not allowed on login nodes.') diff --git a/polaris/parallel/pbs.py b/polaris/parallel/pbs.py deleted file mode 100644 index 2e7fbe15b3..0000000000 --- a/polaris/parallel/pbs.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -import re -import subprocess -import warnings - -from polaris.parallel.login import LoginSystem -from polaris.parallel.system import ( - ParallelSystem, -) - - -class PbsSystem(ParallelSystem): - """PBS resource manager for parallel jobs.""" - - def get_available_resources(self): - config = self.config - if 'PBS_JOBID' not in os.environ: - # fallback to login - return LoginSystem(config).get_available_resources() - - # First, try to get nodes and cores_per_node from qstat - nodes, cores_per_node = self._get_resources_from_qstat() - - if nodes is None or cores_per_node is None: - # Final fallback: use config values - nodes = config.getint('parallel', 'nodes', fallback=1) - cores_per_node = config.getint( - 'parallel', 'cores_per_node', fallback=1 - ) - cores = nodes * cores_per_node - available = dict( - cores=cores, - nodes=nodes, - cores_per_node=cores_per_node, - mpi_allowed=True, - ) - if config.has_option('parallel', 'gpus_per_node'): - available['gpus_per_node'] = config.getint( - 'parallel', 'gpus_per_node' - ) - return available - - def set_cores_per_node(self, cores_per_node): - config = self.config - old_cores_per_node = config.getint('parallel', 'cores_per_node') - config.set('parallel', 'cores_per_node', f'{cores_per_node}') - if old_cores_per_node != cores_per_node: - warnings.warn( - f'PBS found {cores_per_node} cpus per node but ' - f'config from mache was {old_cores_per_node}', - stacklevel=2, - ) - - def get_parallel_command(self, args, cpus_per_task, ntasks): - config = self.config - section = config['parallel'] - command = section.get('parallel_executable').split(' ') - # PBS mpiexec/mpirun options are launcher's responsibility, so the - # flag used for 
CPUs per task is configurable per machine - if section.has_option('cpus_per_task_flag'): - cpus_per_task_flag = section.get('cpus_per_task_flag') - else: - cpus_per_task_flag = '-c' - command.extend( - ['-n', f'{ntasks}', cpus_per_task_flag, f'{cpus_per_task}'] - ) - command.extend(args) - return command - - def _get_resources_from_qstat(self): - """Try to determine nodes and cores_per_node from qstat output.""" - - jobid = os.environ.get('PBS_JOBID') - if not jobid: - return None, None - - try: - # text=True is available in Python 3.7+ - output = subprocess.check_output(['qstat', '-f', jobid], text=True) - except FileNotFoundError: # qstat executable not found - return None, None - except subprocess.CalledProcessError: # qstat returned non-zero - return None, None - - # Try to infer nodes and cores_per_node from various Resource_List - # fields. Different PBS installations format these differently. - - # Case 1: Aurora style (current ALCF Aurora machine): separate - # ncpus and nodect, and select - # Resource_List.ncpus = total_cores_for_job - # Resource_List.nodect = number_of_nodes - # Resource_List.select = number_of_nodes (or chunks) - ncpus_match = re.search(r'Resource_List\.ncpus\s*=\s*(\d+)', output) - nodect_match = re.search(r'Resource_List\.nodect\s*=\s*(\d+)', output) - simple_select_match = re.search( - r'Resource_List\.select\s*=\s*(\d+)', output - ) - - total_cores = int(ncpus_match.group(1)) if ncpus_match else None - nodect = int(nodect_match.group(1)) if nodect_match else None - simple_select = ( - int(simple_select_match.group(1)) if simple_select_match else None - ) - - if total_cores is not None and nodect is not None and nodect != 0: - nodes = nodect - cores_per_node = total_cores // nodect - return nodes, cores_per_node - - if ( - total_cores is not None - and simple_select is not None - and simple_select != 0 - ): - nodes = simple_select - cores_per_node = total_cores // simple_select - return nodes, cores_per_node - - # Case 2: PBS Pro 
style "select=N:ncpus=M" on a single line - select_match = re.search( - r'Resource_List\.select\s*=\s*(\d+)[^\n]*?:ncpus=(\d+)', - output, - ) - if select_match: - nodes = int(select_match.group(1)) - cores_per_node = int(select_match.group(2)) - return nodes, cores_per_node - - # Case 3: older PBS/Torque style: "nodes=N:ppn=M" - nodes_match = re.search( - r'Resource_List\.nodes\s*=\s*(\d+)[^\n]*?:ppn=(\d+)', - output, - ) - if nodes_match: - nodes = int(nodes_match.group(1)) - cores_per_node = int(nodes_match.group(2)) - return nodes, cores_per_node - return None, None diff --git a/polaris/parallel/single_node.py b/polaris/parallel/single_node.py deleted file mode 100644 index 1290e0962c..0000000000 --- a/polaris/parallel/single_node.py +++ /dev/null @@ -1,35 +0,0 @@ -import multiprocessing - -from polaris.parallel.system import ParallelSystem - - -class SingleNodeSystem(ParallelSystem): - """Resource manager for single-node parallel execution.""" - - def get_available_resources(self): - config = self.config - cores = multiprocessing.cpu_count() - if config.has_option('parallel', 'cores_per_node'): - cores = min(cores, config.getint('parallel', 'cores_per_node')) - available = dict( - cores=cores, - nodes=1, - cores_per_node=cores, - mpi_allowed=True, - ) - if config.has_option('parallel', 'gpus_per_node'): - available['gpus_per_node'] = config.getint( - 'parallel', 'gpus_per_node' - ) - return available - - def set_cores_per_node(self, cores_per_node): - config = self.config - if not config.has_option('parallel', 'cores_per_node'): - config.set('parallel', 'cores_per_node', f'{cores_per_node}') - - def get_parallel_command(self, args, cpus_per_task, ntasks): - command = self.config.get('parallel', 'parallel_executable').split(' ') - command.extend(['-n', f'{ntasks}']) - command.extend(args) - return command diff --git a/polaris/parallel/slurm.py b/polaris/parallel/slurm.py deleted file mode 100644 index 72bc6e0918..0000000000 --- a/polaris/parallel/slurm.py +++ 
/dev/null @@ -1,72 +0,0 @@ -import os -import warnings - -import numpy as np - -from polaris.parallel.login import LoginSystem -from polaris.parallel.system import ( - ParallelSystem, - _get_subprocess_int, - _get_subprocess_str, -) - - -class SlurmSystem(ParallelSystem): - """SLURM resource manager for parallel jobs.""" - - def get_available_resources(self): - config = self.config - if 'SLURM_JOB_ID' not in os.environ: - # fallback to login - return LoginSystem(config).get_available_resources() - job_id = os.environ['SLURM_JOB_ID'] - node = os.environ['SLURMD_NODENAME'] - args = ['sinfo', '--noheader', '--node', node, '-o', '%C'] - aiot = _get_subprocess_str(args).split('/') - cores_per_node = int(aiot[0]) - if cores_per_node == 0: - cores_per_node = int(aiot[3]) - args = ['sinfo', '--noheader', '--node', node, '-o', '%Z'] - slurm_threads_per_core = _get_subprocess_int(args) - if config.has_option('parallel', 'threads_per_core'): - threads_per_core = config.getint('parallel', 'threads_per_core') - cores_per_node = ( - cores_per_node * threads_per_core - ) // slurm_threads_per_core - args = ['squeue', '--noheader', '-j', job_id, '-o', '%D'] - nodes = _get_subprocess_int(args) - cores = cores_per_node * nodes - available = dict( - cores=cores, - nodes=nodes, - cores_per_node=cores_per_node, - mpi_allowed=True, - ) - if config.has_option('parallel', 'gpus_per_node'): - available['gpus_per_node'] = config.getint( - 'parallel', 'gpus_per_node' - ) - return available - - def set_cores_per_node(self, cores_per_node): - config = self.config - old_cores_per_node = config.getint('parallel', 'cores_per_node') - config.set('parallel', 'cores_per_node', f'{cores_per_node}') - if old_cores_per_node != cores_per_node: - warnings.warn( - f'Slurm found {cores_per_node} cpus per node but ' - f'config from mache was {old_cores_per_node}', - stacklevel=2, - ) - - def get_parallel_command(self, args, cpus_per_task, ntasks): - config = self.config - command = config.get('parallel', 
'parallel_executable').split(' ') - cores = ntasks * cpus_per_task - cores_per_node = config.getint('parallel', 'cores_per_node') - nodes = int(np.ceil(cores / cores_per_node)) - command.extend( - ['-c', f'{cpus_per_task}', '-N', f'{nodes}', '-n', f'{ntasks}'] - ) - command.extend(args) - return command diff --git a/polaris/parallel/system.py b/polaris/parallel/system.py deleted file mode 100644 index b090fa7f97..0000000000 --- a/polaris/parallel/system.py +++ /dev/null @@ -1,36 +0,0 @@ -import subprocess -from typing import Any, Dict, List - - -class ParallelSystem: - """Base class for parallel system resource management.""" - - def __init__(self, config: Any): - self.config = config - - def get_available_resources(self) -> Dict[str, Any]: - """Return available resources for the system.""" - raise NotImplementedError - - def set_cores_per_node(self, cores_per_node: int) -> None: - """Set the number of cores per node.""" - raise NotImplementedError - - def get_parallel_command( - self, args: List[str], cpus_per_task: int, ntasks: int - ) -> List[str]: - """Get the parallel execution command.""" - raise NotImplementedError - - -def _get_subprocess_str(args: List[str]) -> str: - """Run a subprocess and return its output as a string.""" - value = subprocess.check_output(args) - value_str = value.decode('utf-8').strip('\n') - return value_str - - -def _get_subprocess_int(args: List[str]) -> int: - """Run a subprocess and return its output as an integer.""" - value_int = int(_get_subprocess_str(args)) - return value_int diff --git a/polaris/run/serial.py b/polaris/run/serial.py index 27b97ece7b..8ee30dd72b 100644 --- a/polaris/run/serial.py +++ b/polaris/run/serial.py @@ -12,11 +12,6 @@ from polaris import Task from polaris.logging import log_function_call, log_method_call -from polaris.parallel import ( - get_available_parallel_resources, - run_command, - set_cores_per_node, -) from polaris.run import ( complete_step_run, load_dependencies, @@ -73,7 +68,8 @@ def 
run_tasks( task = next(iter(suite['tasks'].values())) component = task.component common_config = setup_config(task.base_work_dir, f'{component.name}.cfg') - available_resources = get_available_parallel_resources(common_config) + component.set_parallel_system(common_config) + available_resources = component.get_available_resources() # start logging to stdout/stderr with LoggingContext(suite_name) as stdout_logger: @@ -178,8 +174,8 @@ def run_single_step(step_is_subprocess=False, quiet=False): config = setup_config(step.base_work_dir, step.config.filepath) task.config = config - available_resources = get_available_parallel_resources(config) - set_cores_per_node(task.config, available_resources['cores_per_node']) + step.component.set_parallel_system(config) + available_resources = step.component.get_available_resources() mpas_tools.io.default_format = config.get('io', 'format') mpas_tools.io.default_engine = config.get('io', 'engine') @@ -377,7 +373,8 @@ def _log_and_run_task( config = setup_config(task.base_work_dir, task.config.filepath) task.config = config - set_cores_per_node(task.config, available_resources['cores_per_node']) + task.component.set_parallel_system(config) + available_resources = task.component.get_available_resources() mpas_tools.io.default_format = config.get('io', 'format') mpas_tools.io.default_engine = config.get('io', 'engine') @@ -672,15 +669,18 @@ def _run_step( 'with command line args\n' ) for args in step.args: - log_function_call(function=run_command, logger=step_logger) + log_method_call( + method=step.component.run_parallel_command, + logger=step_logger, + ) step_logger.info('') - run_command( + step.component.run_parallel_command( args, step.cpus_per_task, step.ntasks, step.openmp_threads, - step.config, step.logger, + gpus_per_task=step.gpus_per_task, ) else: step_logger.info('') diff --git a/polaris/setup.py b/polaris/setup.py index 95dd6aaf78..0d012e4c76 100644 --- a/polaris/setup.py +++ b/polaris/setup.py @@ -157,6 +157,7 @@ def 
setup_tasks( ) component.configure(basic_config, list(tasks.values())) + component.set_parallel_system(basic_config) provenance.write( work_dir, @@ -217,10 +218,15 @@ def setup_tasks( _symlink_load_script(work_dir) - max_cores, max_of_min_cores = _get_required_cores(tasks) + max_cores, max_of_min_cores, max_gpus, max_of_min_gpus = ( + _get_required_resources(tasks) + ) print(f'target cores: {max_cores}') print(f'minimum cores: {max_of_min_cores}') + if max_gpus > 0 or max_of_min_gpus > 0: + print(f'target gpus: {max_gpus}') + print(f'minimum gpus: {max_of_min_gpus}') if machine is not None: write_job_script( @@ -228,6 +234,8 @@ def setup_tasks( machine=machine, target_cores=max_cores, min_cores=max_of_min_cores, + target_gpus=max_gpus, + min_gpus=max_of_min_gpus, work_dir=work_dir, suite=suite_name, ) @@ -301,11 +309,15 @@ def setup_task(path, task, machine, work_dir, baseline_dir, cached_steps): if machine is not None: cores = step.cpus_per_task * step.ntasks min_cores = step.min_cpus_per_task * step.min_tasks + gpus = step.gpus_per_task * step.ntasks + min_gpus = step.min_gpus_per_task * step.min_tasks write_job_script( config=step.config, machine=machine, target_cores=cores, min_cores=min_cores, + target_gpus=gpus, + min_gpus=min_gpus, work_dir=step.work_dir, ) step.setup_complete = True @@ -323,12 +335,16 @@ def setup_task(path, task, machine, work_dir, baseline_dir, cached_steps): _symlink_load_script(task_dir) if machine is not None: - max_cores, max_of_min_cores = _get_required_cores({path: task}) + max_cores, max_of_min_cores, max_gpus, max_of_min_gpus = ( + _get_required_resources({path: task}) + ) write_job_script( config=task.config, machine=machine, target_cores=max_cores, min_cores=max_of_min_cores, + target_gpus=max_gpus, + min_gpus=max_of_min_gpus, work_dir=task_dir, ) @@ -704,11 +720,15 @@ def _clean_tasks_and_steps(tasks, base_work_dir): pass -def _get_required_cores(tasks): - """Get the maximum number of target cores and the max of min cores""" 
+def _get_required_resources(tasks): + """ + Get max target and minimum CPU and GPU resource counts across task steps + """ max_cores = 0 max_of_min_cores = 0 + max_gpus = 0 + max_of_min_gpus = 0 for task in tasks.values(): for step_name in task.steps_to_run: step = task.steps[step_name] @@ -726,10 +746,14 @@ def _get_required_cores(tasks): ) cores = step.cpus_per_task * step.ntasks min_cores = step.min_cpus_per_task * step.min_tasks + gpus = step.gpus_per_task * step.ntasks + min_gpus = step.min_gpus_per_task * step.min_tasks max_cores = max(max_cores, cores) max_of_min_cores = max(max_of_min_cores, min_cores) + max_gpus = max(max_gpus, gpus) + max_of_min_gpus = max(max_of_min_gpus, min_gpus) - return max_cores, max_of_min_cores + return max_cores, max_of_min_cores, max_gpus, max_of_min_gpus def __get_machine_and_check_params( diff --git a/polaris/step.py b/polaris/step.py index 6112de1c2e..994d65d92f 100644 --- a/polaris/step.py +++ b/polaris/step.py @@ -60,6 +60,12 @@ class Step: openmp_threads : int the number of OpenMP threads to use + gpus_per_task : int + the number of GPUs per task the step would ideally use + + min_gpus_per_task : int + the number of GPUs per task the step requires + max_memory : int the amount of memory that the step is allowed to use in MB. 
This is currently just a placeholder for later use with task @@ -165,6 +171,8 @@ def __init__( max_memory=None, cached=False, run_as_subprocess=False, + gpus_per_task=0, + min_gpus_per_task=0, ): """ Create a new task @@ -213,6 +221,12 @@ def __init__( This is currently just a placeholder for later use with task parallelism + gpus_per_task : int, optional + the number of GPUs per task the step would ideally use + + min_gpus_per_task : int, optional + the number of GPUs per task the step requires + cached : bool, optional Whether to get all of the outputs for the step from the database of cached outputs for this component @@ -238,6 +252,8 @@ def __init__( self.ntasks = ntasks self.min_tasks = min_tasks self.openmp_threads = openmp_threads + self.gpus_per_task = gpus_per_task + self.min_gpus_per_task = min_gpus_per_task self.max_memory = max_memory self.path = os.path.join(self.component.name, self.subdir) @@ -283,6 +299,8 @@ def set_resources( min_tasks=None, openmp_threads=None, max_memory=None, + gpus_per_task=None, + min_gpus_per_task=None, ): """ Update the resources for the subtask. This can be done within init, @@ -320,6 +338,12 @@ def set_resources( the amount of memory that the step is allowed to use in MB. 
This is currently just a placeholder for later use with task parallelism + + gpus_per_task : int, optional + the number of GPUs per task the step would ideally use + + min_gpus_per_task : int, optional + the number of GPUs per task the step requires """ if cpus_per_task is not None: self.cpus_per_task = cpus_per_task @@ -331,6 +355,10 @@ def set_resources( self.min_tasks = min_tasks if openmp_threads is not None: self.openmp_threads = openmp_threads + if gpus_per_task is not None: + self.gpus_per_task = gpus_per_task + if min_gpus_per_task is not None: + self.min_gpus_per_task = min_gpus_per_task if max_memory is not None: self.max_memory = max_memory @@ -372,6 +400,31 @@ def constrain_resources(self, available_resources): f'minimum of {self.min_tasks} for step {self.name}' ) + available_gpus = available_resources.get('gpus') + if self.gpus_per_task > 0: + if available_gpus is None or available_gpus == 0: + raise ValueError( + f'Step {self.name} requests {self.gpus_per_task} GPUs ' + 'per task but no GPUs are available on this machine.' 
+ ) + + available_gpu_tasks = available_gpus // self.gpus_per_task + self.ntasks = min(self.ntasks, available_gpu_tasks) + + if self.gpus_per_task < self.min_gpus_per_task: + raise ValueError( + f'Available gpus_per_task ({self.gpus_per_task}) is ' + f'below the minimum of {self.min_gpus_per_task} for ' + f'step {self.name}' + ) + + if self.ntasks < self.min_tasks: + raise ValueError( + f'Available number of MPI tasks ({self.ntasks}) is ' + f'below the minimum of {self.min_tasks} for step ' + f'{self.name} after GPU constraints are applied' + ) + def setup(self): """ Set up the task in the work directory, including downloading any From 745cc67a2c65e933f5f0e2eb07cce0bc548180b8 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 25 Feb 2026 14:11:27 +0000 Subject: [PATCH 16/39] Constrain resources with cells per GPU in ocean framework --- polaris/ocean/model/ocean_model_step.py | 33 +++++++++++++++++++++++-- polaris/ocean/ocean.cfg | 7 ++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/polaris/ocean/model/ocean_model_step.py b/polaris/ocean/model/ocean_model_step.py index 2b41d34d42..f7d5ee7330 100644 --- a/polaris/ocean/model/ocean_model_step.py +++ b/polaris/ocean/model/ocean_model_step.py @@ -522,16 +522,45 @@ def _update_ntasks(self) -> None: goal_cells_per_core = config.getfloat('ocean', 'goal_cells_per_core') max_cells_per_core = config.getfloat('ocean', 'max_cells_per_core') + model = config.get('ocean', 'model') + + goal_cells_per_gpu = config.getfloat('ocean', 'goal_cells_per_gpu') + max_cells_per_gpu = config.getfloat('ocean', 'max_cells_per_gpu') # machines (e.g. 
Perlmutter) seem to be happier with ntasks that # are multiples of 4 # ideally, about 200 cells per core - self.ntasks = max(1, 4 * round(cell_count / (4 * goal_cells_per_core))) + cpu_ntasks = max(1, 4 * round(cell_count / (4 * goal_cells_per_core))) # In a pinch, about 2000 cells per core - self.min_tasks = max( + cpu_min_tasks = max( 1, 4 * round(cell_count / (4 * max_cells_per_core)) ) + gpus_per_node = 0 + parallel_system = self.component.parallel_system + if parallel_system is not None: + gpus_per_node = parallel_system.get_config_int( + 'gpus_per_node', default=0 + ) + + use_gpu_resources = model == 'omega' and gpus_per_node > 0 + if use_gpu_resources: + self.gpus_per_task = 1 + self.min_gpus_per_task = 1 + # Ideally, about 8000 cells per GPU + self.ntasks = max( + 1, 4 * round(cell_count / (4 * goal_cells_per_gpu)) + ) + # In a pinch, about 80000 cells per GPU + self.min_tasks = max( + 1, 4 * round(cell_count / (4 * max_cells_per_gpu)) + ) + else: + self.gpus_per_task = 0 + self.min_gpus_per_task = 0 + self.ntasks = cpu_ntasks + self.min_tasks = cpu_min_tasks + def _read_config_map(self) -> None: """ Read the map from MPAS-Ocean to Omega config options diff --git a/polaris/ocean/ocean.cfg b/polaris/ocean/ocean.cfg index 6a3f26a9dd..b0c4771e64 100644 --- a/polaris/ocean/ocean.cfg +++ b/polaris/ocean/ocean.cfg @@ -26,6 +26,13 @@ goal_cells_per_core = 200 # few cores are available) max_cells_per_core = 2000 +# the number of cells per GPU to aim for in GPU-enabled Omega runs +goal_cells_per_gpu = 8000 + +# the approximate maximum number of cells per GPU (the test will fail if too +# few GPUs are available) +max_cells_per_gpu = 80000 + # Equation of state type, defaults to mpas-ocean default eos_type = linear From 134268eac13db9f279c5c9a635beabc221f8d53e Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 25 Feb 2026 14:12:54 +0000 Subject: [PATCH 17/39] Update e3sm-init tasks steps to use mache.parallel --- polaris/tasks/e3sm/init/topo/combine/step.py | 
5 ++--- polaris/tasks/e3sm/init/topo/remap/remap.py | 15 +++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/polaris/tasks/e3sm/init/topo/combine/step.py b/polaris/tasks/e3sm/init/topo/combine/step.py index 5615806cc0..d9f5ac095c 100644 --- a/polaris/tasks/e3sm/init/topo/combine/step.py +++ b/polaris/tasks/e3sm/init/topo/combine/step.py @@ -9,7 +9,6 @@ from mpas_tools.logging import check_call from pyremap import ProjectionGridDescriptor, get_lat_lon_descriptor -from polaris.parallel import run_command from polaris.step import Step @@ -593,13 +592,13 @@ def _create_weights(self, in_filename, out_filename): '--src_regional', '--ignore_unmapped', ] - run_command( + self.component.run_parallel_command( args=args, cpus_per_task=self.cpus_per_task, ntasks=self.ntasks, openmp_threads=self.openmp_threads, - config=config, logger=self.logger, + gpus_per_task=self.gpus_per_task, ) def _remap_to_target_grid( diff --git a/polaris/tasks/e3sm/init/topo/remap/remap.py b/polaris/tasks/e3sm/init/topo/remap/remap.py index edac76dc8b..a388ca53a7 100644 --- a/polaris/tasks/e3sm/init/topo/remap/remap.py +++ b/polaris/tasks/e3sm/init/topo/remap/remap.py @@ -8,7 +8,6 @@ from polaris import Step from polaris.io import symlink -from polaris.parallel import run_command class RemapTopoStep(Step): @@ -286,13 +285,13 @@ def _create_weights(self): '1e-9', ] - run_command( - args, - self.cpus_per_task, - self.ntasks, - self.openmp_threads, - self.config, - self.logger, + self.component.run_parallel_command( + args=args, + cpus_per_task=self.cpus_per_task, + ntasks=self.ntasks, + openmp_threads=self.openmp_threads, + logger=self.logger, + gpus_per_task=self.gpus_per_task, ) logger.info(' Done.') From a85b4b66a0d2bfbe0e4d6aca1ef41ff75dea8c4d Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 25 Feb 2026 14:08:34 +0000 Subject: [PATCH 18/39] Remove parallel sections from Polaris machine configs --- polaris/machines/aurora.cfg | 8 -------- 
polaris/machines/frontier.cfg | 13 ------------- polaris/machines/pm-cpu.cfg | 12 ------------ polaris/machines/pm-gpu.cfg | 12 ------------ 4 files changed, 45 deletions(-) diff --git a/polaris/machines/aurora.cfg b/polaris/machines/aurora.cfg index 19f7942f05..83cc2de294 100644 --- a/polaris/machines/aurora.cfg +++ b/polaris/machines/aurora.cfg @@ -28,14 +28,6 @@ spack = /lus/flare/projects/E3SM_Dec/soft/polaris/aurora/spack # pnetcdf as E3SM (spack modules are used otherwise) use_e3sm_hdf5_netcdf = True -# The parallel section describes options related to running jobs in parallel -[parallel] - -### NOTE: Adding this temporarily in Polaris but it should come from mache -### in future releases -# the flag to `parallel_executable` to specify the number of cpus per task -cpus_per_task_flag = --depth - # Config options related to creating a job script [job] diff --git a/polaris/machines/frontier.cfg b/polaris/machines/frontier.cfg index 5400437897..63096f3b6c 100644 --- a/polaris/machines/frontier.cfg +++ b/polaris/machines/frontier.cfg @@ -43,19 +43,6 @@ spack = /ccs/proj/cli115/software/polaris/frontier/spack # pnetcdf as E3SM (spack modules are used otherwise) use_e3sm_hdf5_netcdf = True -# The parallel section describes options related to running jobs in parallel. 
-# Most options in this section come from mache so here we just add or override -# some defaults -[parallel] - -# allocatable cores per node on the machine -cores_per_node = 56 - -# threads per core (set to 1 because hyperthreading requires extra sbatch -# flag --threads-per-core that polaris doesn't yet support) -threads_per_core = 1 - - # Config options related to building components [build] diff --git a/polaris/machines/pm-cpu.cfg b/polaris/machines/pm-cpu.cfg index 9afd5995d5..5126be0c99 100644 --- a/polaris/machines/pm-cpu.cfg +++ b/polaris/machines/pm-cpu.cfg @@ -31,18 +31,6 @@ spack = /global/cfs/cdirs/e3sm/software/polaris/pm-cpu/spack # pnetcdf as E3SM (spack modules are used otherwise) use_e3sm_hdf5_netcdf = True -# The parallel section describes options related to running jobs in parallel. -# Most options in this section come from mache so here we just add or override -# some defaults -[parallel] - -# cores per node on the machine -cores_per_node = 128 - -# threads per core (set to 1 because trying to hyperthread seems to be causing -# hanging on perlmutter) -threads_per_core = 1 - # Config options related to building components [build] diff --git a/polaris/machines/pm-gpu.cfg b/polaris/machines/pm-gpu.cfg index 5018b07ed5..32b74a3539 100644 --- a/polaris/machines/pm-gpu.cfg +++ b/polaris/machines/pm-gpu.cfg @@ -30,15 +30,3 @@ spack = /global/cfs/cdirs/e3sm/software/polaris/pm-gpu/spack # whether to use the same modules for hdf5, netcdf-c, netcdf-fortran and # pnetcdf as E3SM (spack modules are used otherwise) use_e3sm_hdf5_netcdf = True - -# The parallel section describes options related to running jobs in parallel. 
-# Most options in this section come from mache so here we just add or override -# some defaults -[parallel] - -# cores per node on the machine (without hyperthreading) -cores_per_node = 64 - -# threads per core (set to 1 because trying to hyperthread seems to be causing -# hanging on perlmutter) -threads_per_core = 1 From 7020e257b632f94758a56b6dc1df93371b12f973 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 25 Feb 2026 14:19:37 +0000 Subject: [PATCH 19/39] Update the docs --- docs/developers_guide/api.md | 22 +---- docs/developers_guide/framework/parallel.md | 94 ++++++++++----------- docs/developers_guide/ocean/framework.md | 21 ++++- docs/developers_guide/overview.md | 7 +- docs/users_guide/config_files.md | 14 ++- 5 files changed, 83 insertions(+), 75 deletions(-) diff --git a/docs/developers_guide/api.md b/docs/developers_guide/api.md index c3957463c7..6e4389cee7 100644 --- a/docs/developers_guide/api.md +++ b/docs/developers_guide/api.md @@ -126,6 +126,9 @@ seaice/api Component.remove_step Component.add_config Component.get_or_create_shared_step + Component.set_parallel_system + Component.get_available_resources + Component.run_parallel_command ``` #### Task @@ -362,25 +365,6 @@ seaice/api write ``` -### parallel - -```{eval-rst} -.. currentmodule:: polaris.parallel - -.. autosummary:: - :toctree: generated/ - - get_available_parallel_resources - set_cores_per_node - run_command - get_parallel_command - - system.ParallelSystem - single_node.SingleNodeSystem - login.LoginSystem - slurm.SlurmSystem - pbs.PbsSystem -``` ### provenance diff --git a/docs/developers_guide/framework/parallel.md b/docs/developers_guide/framework/parallel.md index dd41cdde9a..f854ca241a 100644 --- a/docs/developers_guide/framework/parallel.md +++ b/docs/developers_guide/framework/parallel.md @@ -2,69 +2,69 @@ # Parallel -The `polaris.parallel` module provides a unified interface for querying and managing parallel resources across different computing environments. 
It abstracts the details of various parallel systems, allowing tasks and steps to request resources and construct parallel execution commands in a system-agnostic way. +Polaris now uses `mache.parallel` for parallel-system selection, resource +discovery and launcher command construction. -## Public API - -The following functions are available in `polaris.parallel`: - -- **get_available_parallel_resources(config):** - Returns a dictionary describing the available parallel resources (cores, nodes, cores per node, etc.) for the current environment, as determined by the selected parallel system. +Within Polaris, a component stores a `mache.parallel.ParallelSystem` +instance with {py:meth}`polaris.Component.set_parallel_system`, then uses it +for: -- **set_cores_per_node(config, cores_per_node):** - Sets the number of cores per node in the configuration, updating any relevant settings for the current parallel system. +- resource queries through {py:meth}`polaris.Component.get_available_resources` +- command execution through + {py:meth}`polaris.Component.run_parallel_command` -- **get_parallel_command(args, cpus_per_task, ntasks, config):** - Returns the command (as a list of strings) to launch a parallel job with the specified arguments, CPUs per task, and number of tasks, using the appropriate parallel launcher for the current system. +This change adds stronger GPU support, supports compiler-specific parallel +sections (`[parallel.]`) and avoids modifying config options during +runtime. -- **run_command(args, cpus_per_task, ntasks, openmp_threads, config, logger):** - Runs a parallel command with the specified resources and OpenMP thread count, using the appropriate launcher and logging output. 
+## Public API -See also the API documentation for: -- {py:func}`polaris.parallel.get_available_parallel_resources` -- {py:func}`polaris.parallel.set_cores_per_node` -- {py:func}`polaris.parallel.get_parallel_command` -- {py:func}`polaris.parallel.run_command` +The key APIs are now: -## Supported Parallel Systems +- {py:func}`mache.parallel.get_parallel_system` +- {py:class}`mache.parallel.ParallelSystem` +- {py:meth}`polaris.Component.set_parallel_system` +- {py:meth}`polaris.Component.get_available_resources` +- {py:meth}`polaris.Component.run_parallel_command` -The module currently supports four parallel systems, each with its own resource manager class: +`ParallelSystem.get_parallel_command()` supports both CPU and GPU resources +through `cpus_per_task` and `gpus_per_task`. -- **single_node:** - For running on a single node, using all available local CPU cores. - Managed by {py:class}`polaris.parallel.SingleNodeSystem`. +## Compiler-specific parallel configs -- **login:** - For running on a login node (no parallel execution, typically for setup or analysis). - Managed by {py:class}`polaris.parallel.LoginSystem`. +`mache.parallel` combines options in `[parallel]` with +`[parallel.]` (if present), where `` comes from +`[build] compiler`. -- **slurm:** - For running under the SLURM workload manager, using environment variables and SLURM commands to determine resources. - Managed by {py:class}`polaris.parallel.SlurmSystem`. +This lets machine configs specify different launcher flags and resource options +for different compiler toolchains without requiring a single machine-wide +parallel configuration. -- **pbs:** - For running under the PBS workload manager, using environment variables and PBS node files to determine resources. - Managed by {py:class}`polaris.parallel.PbsSystem`. +## GPU resources -The appropriate system is selected automatically based on the configuration and environment variables. 
+Polaris step resources now include GPU requirements (`gpus_per_task` and +`min_gpus_per_task`) in addition to CPU requirements. Resource constraints use +both CPU and GPU availability when determining whether a step can run. -## Adding Support for a New Parallel System +For ocean model steps with dynamic sizing, Omega runs on GPU-capable compiler +configs use: -To add a new parallel system: +- `goal_cells_per_gpu` (target; default 8000) +- `max_cells_per_gpu` (minimum required resources; default 80000) -1. **Create a new system class:** - Subclass `ParallelSystem` in `polaris/parallel/`, implementing the following methods: - - `get_available_resources(self)` - - `set_cores_per_node(self, cores_per_node)` - - `get_parallel_command(self, args, cpus_per_task, ntasks)` +## Supported Parallel Systems -2. **Handle environment detection:** - In your new system class, use environment variables or other mechanisms to detect when your system should be active. +The active system is still selected from `[parallel] system` and environment +context (`slurm`, `pbs`, `single_node`, `login`) but implementation is in +`mache.parallel`: -3. **Register the new system:** - Update the logic in `polaris/parallel/__init__.py` (the `_get_system` function) to recognize your new system based on the config or environment, and return an instance of your new class. +- {py:class}`mache.parallel.SingleNodeSystem` +- {py:class}`mache.parallel.LoginSystem` +- {py:class}`mache.parallel.SlurmSystem` +- {py:class}`mache.parallel.PbsSystem` -4. **(Optional) Add documentation:** - Update this documentation page to describe your new system and its usage. +## Notes -By following this structure, you can extend Polaris to support additional parallel resource managers or custom environments as needed. +- Polaris no longer provides a `polaris.parallel` module. +- Runtime no longer rewrites `cores_per_node` in config files. +- Machine and compiler config should provide the desired parallel options. 
diff --git a/docs/developers_guide/ocean/framework.md b/docs/developers_guide/ocean/framework.md index f07ef035be..7fd6831638 100644 --- a/docs/developers_guide/ocean/framework.md +++ b/docs/developers_guide/ocean/framework.md @@ -124,16 +124,31 @@ The algorithm for determining the resources is: ```python # ideally, about 200 cells per core -self.ntasks = max(1, round(cell_count / goal_cells_per_core + 0.5)) +cpu_ntasks = max(1, 4 * round(cell_count / (4 * goal_cells_per_core))) # In a pinch, about 2000 cells per core -self.min_tasks = max(1, round(cell_count / max_cells_per_core + 0.5)) +cpu_min_tasks = max(1, 4 * round(cell_count / (4 * max_cells_per_core))) ``` The config options `goal_cells_per_core` and `max_cells_per_core` in the `[ocean]` seciton can be used to control how resources scale with the size of -the planar mesh. By default, the number of MPI tasks tries to apportion 200 +the planar mesh. By default, the number of MPI tasks tries to apportion 200 cells to each core, but it will allow as many as 2000. +For Omega on GPU-capable parallel configs (`gpus_per_node > 0`), dynamic +sizing switches to GPU-based targets and sets one GPU per MPI task: + +```python +self.gpus_per_task = 1 +self.min_gpus_per_task = 1 +# ideally, about 8000 cells per GPU +self.ntasks = max(1, 4 * round(cell_count / (4 * goal_cells_per_gpu))) +# In a pinch, about 80000 cells per GPU +self.min_tasks = max(1, 4 * round(cell_count / (4 * max_cells_per_gpu))) +``` + +The corresponding `[ocean]` config options are `goal_cells_per_gpu` and +`max_cells_per_gpu`. 
+ ### Setting time intervals in model config options It is often useful to be able to convert a `float` time interval in days or diff --git a/docs/developers_guide/overview.md b/docs/developers_guide/overview.md index f32629423a..8ae4c7c04e 100644 --- a/docs/developers_guide/overview.md +++ b/docs/developers_guide/overview.md @@ -218,9 +218,10 @@ downloading files from the and creating symlinks; `polaris.validation` can be used to ensure that variables are bit-for-bit identical between steps or when compared with a baseline, and to compare timers with a baseline; and the -`polaris.parallel` module contains a function -{py:func}`polaris.parallel.get_available_cores_and_nodes()` that can find out -the number of total cores and nodes available for running steps. +parallel framework uses `mache.parallel` through component methods such as +{py:meth}`polaris.Component.get_available_resources` and +{py:meth}`polaris.Component.run_parallel_command` to determine available +resources and launch parallel commands. ### ...within a component diff --git a/docs/users_guide/config_files.md b/docs/users_guide/config_files.md index a9b46fbe8e..fb9d122cc4 100644 --- a/docs/users_guide/config_files.md +++ b/docs/users_guide/config_files.md @@ -41,10 +41,16 @@ system = single_node # whether to use mpirun or srun to run the model parallel_executable = mpirun -host localhost -# total cores on the machine (or cores on one node if it is a multinode -# machine), detected automatically by default +# total cores on one node cores_per_node = 8 +# GPUs per node (optional) +gpus_per_node = 0 + +# optional compiler-specific overrides +[parallel.gnu] +parallel_executable = mpirun + ``` The comments in this example are hopefully pretty self-explanatory. 
@@ -74,7 +80,9 @@ sources: - the [machine config file](https://github.com/E3SM-Project/polaris/blob/main/polaris/machines) (using [machines/default.cfg](https://github.com/E3SM-Project/polaris/blob/main/polaris/machines/default.cfg) if no machine was specified) with information on the parallel system and - the paths to cached data files + the paths to cached data files. Parallel options can also come from + compiler-specific sections such as `[parallel.gnu]` or `[parallel.intel]` + (from mache machine configs) - the component's config file. For the {ref}`ocean` core, this sets default paths to the MPAS-Ocean model build (including the namelist templates). It uses From 9b1fce6f4ab8b9287c6de9fc080c6b23e5f2b5df Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Sat, 14 Mar 2026 22:53:51 +0100 Subject: [PATCH 20/39] Add resolution for node counts that aren't allowed This resolution comes mostly from mache.parallel. On Aurora, node counts 17-255 aren't allowed. We add a way to drop to 16 nodes if possible and jump to 256 if required. 
--- polaris/job/__init__.py | 240 ++++++++++++---------------------------- 1 file changed, 70 insertions(+), 170 deletions(-) diff --git a/polaris/job/__init__.py b/polaris/job/__init__.py index 56f35b72e6..640eb89fa4 100644 --- a/polaris/job/__init__.py +++ b/polaris/job/__init__.py @@ -4,6 +4,8 @@ import numpy as np from jinja2 import Template as Template from mache.parallel import get_parallel_system +from mache.parallel.pbs import PbsSystem +from mache.parallel.slurm import SlurmSystem def write_job_script( @@ -69,6 +71,8 @@ def write_job_script( assert config.combined is not None parallel_system = get_parallel_system(config.combined) + requested_nodes = nodes + if config.has_option('parallel', 'account'): account = config.get('parallel', 'account') else: @@ -77,6 +81,7 @@ def write_job_script( cores_per_node = parallel_system.get_config_int('cores_per_node') gpus_per_node = parallel_system.get_config_int('gpus_per_node', default=0) + use_gpu_nodes = False if nodes is None: if target_cores is None or min_cores is None: raise ValueError( @@ -91,6 +96,8 @@ def write_job_script( and max(target_gpus, min_gpus) > 0 ) if use_gpu_nodes: + assert target_gpus is not None + assert min_gpus is not None gpus = np.sqrt(target_gpus * min_gpus) nodes = int(np.ceil(gpus / gpus_per_node)) nodes = max(nodes, 1) @@ -104,6 +111,16 @@ def write_job_script( nodes = int(np.ceil(cores / cores_per_node)) nodes = max(nodes, 1) + if requested_nodes is None: + requested_nodes = nodes + + min_nodes_allowed = _get_min_nodes_allowed( + cores_per_node=cores_per_node, + gpus_per_node=gpus_per_node, + min_cores=min_cores, + min_gpus=min_gpus, + ) + # Determine parallel system type system = ( config.get('parallel', 'system') @@ -114,8 +131,17 @@ def write_job_script( render_kwargs: dict[str, str] = {} if system == 'slurm': - partition, qos, constraint, gpus_per_node, wall_time = ( - get_slurm_options(config, machine, nodes, parallel_system) + ( + partition, + qos, + constraint, + gpus_per_node, 
+ wall_time, + nodes, + ) = SlurmSystem.get_slurm_options( + config=config.combined, + nodes=nodes, + min_nodes_allowed=min_nodes_allowed, ) template_name = 'job_script.slurm.template' render_kwargs.update( @@ -126,8 +152,17 @@ def write_job_script( wall_time=wall_time, ) elif system == 'pbs': - queue, constraint, gpus_per_node, wall_time, filesystems = ( - get_pbs_options(config, machine, nodes, parallel_system) + ( + queue, + constraint, + gpus_per_node, + wall_time, + filesystems, + nodes, + ) = PbsSystem.get_pbs_options( + config=config.combined, + nodes=nodes, + min_nodes_allowed=min_nodes_allowed, ) template_name = 'job_script.pbs.template' render_kwargs.update( @@ -145,6 +180,12 @@ def write_job_script( if job_name == '<<>>': job_name = f'polaris{f"_{suite}" if suite else ""}' + if requested_nodes is not None and requested_nodes != nodes: + print( + f'Adjusted node count from {requested_nodes} to {nodes} for ' + f'machine {machine} based on scheduler node limits.' + ) + template = Template( imp_res.files('polaris.job').joinpath(template_name).read_text() ) @@ -169,170 +210,29 @@ def write_job_script( handle.write(text) -def get_slurm_options(config, machine, nodes, parallel_system): - """ - Get Slurm options for job submission. - - Parameters - ---------- - config : polaris.config.PolarisConfigParser - Configuration options for this test case, a combination of user configs - and the defaults for the machine and component. - - machine : str - The name of the machine. - - nodes : int - The number of nodes required for the job. - - Returns - ------- - partition : str - The partition to use for the job. - - qos : str - The quality of service to use for the job. - - constraint : str - Any constraints to use for the job. - - gpus_per_node : str - The number of GPUs per node to request. - - wall_time : str - The wall time to request for the job. 
- """ - partition, qos, constraint, gpus_per_node, wall_time, _ = _get_job_options( - config, - machine, - nodes, - parallel_system, - partition_or_queue_option='partition', - partitions_or_queues='partitions', - ) - return partition, qos, constraint, gpus_per_node, wall_time - - -def get_pbs_options(config, machine, nodes, parallel_system): - """ - Get PBS options for job submission. - - Parameters - ---------- - config : polaris.config.PolarisConfigParser - Configuration options for this test case, a combination of user configs - and the defaults for the machine and component. - - machine : str - The name of the machine. - - nodes : int - The number of nodes required for the job. - - Returns - ------- - queue : str - The queue to use for the job. - - constraint : str - Any constraints to use for the job. - - gpus_per_node : str - The number of GPUs per node to request. - - wall_time : str - The wall time to request for the job. - """ - queue, _, constraint, gpus_per_node, wall_time, filesystems = ( - _get_job_options( - config, - machine, - nodes, - parallel_system, - partition_or_queue_option='queue', - partitions_or_queues='queues', - ) - ) - return queue, constraint, gpus_per_node, wall_time, filesystems - - -def _get_job_options( - config, - machine, - nodes, - parallel_system, - partition_or_queue_option, - partitions_or_queues, +def _get_min_nodes_allowed( + cores_per_node, + gpus_per_node, + min_cores, + min_gpus, ): - """ - Helper to get job options for slurm or pbs - - Parameters - ---------- - config : polaris.config.PolarisConfigParser - machine : str - nodes : int - partition_or_queue_option : str - 'partition' for slurm, 'queue' for pbs - partitions_or_queues : str - 'partitions' for slurm, 'queues' for pbs - - Returns - ------- - partition_or_queue : str - qos : str - constraint : str - gpus_per_node : str - wall_time : str - filesystems : str - """ - job_section = config['job'] - partition_or_queue = job_section.get(partition_or_queue_option) - if 
partition_or_queue == '<<>>': - value = parallel_system.get_config(partitions_or_queues) - if value is not None and value != '': - partition_or_queue = _parse_list(value)[0] - else: - partition_or_queue = '' - - qos = job_section.get('qos') - if qos == '<<>>': - value = parallel_system.get_config('qos') - if value is not None and value != '': - qos = _parse_list(value)[0] - else: - qos = '' - - constraint = job_section.get('constraint') - if constraint == '<<>>': - value = parallel_system.get_config('constraints') - if value is not None and value != '': - constraint = _parse_list(value)[0] - else: - constraint = '' - - gpus_per_node_value = parallel_system.get_config('gpus_per_node') - if gpus_per_node_value is not None: - gpus_per_node = str(gpus_per_node_value) - else: - gpus_per_node = '' - - wall_time = job_section.get('wall_time') - - if job_section.has_option('filesystems'): - filesystems = job_section.get('filesystems') - else: - filesystems = '' - - return ( - partition_or_queue, - qos, - constraint, - gpus_per_node, - wall_time, - filesystems, - ) - - -def _parse_list(value): - return [entry.strip() for entry in value.split(',') if entry.strip() != ''] + """Compute the minimum feasible nodes from minimum requested resources.""" + minima = [] + + if ( + min_cores is not None + and cores_per_node is not None + and cores_per_node > 0 + ): + minima.append(max(int(np.ceil(min_cores / cores_per_node)), 1)) + + if ( + min_gpus is not None + and gpus_per_node is not None + and gpus_per_node > 0 + ): + minima.append(max(int(np.ceil(min_gpus / gpus_per_node)), 1)) + + if len(minima) == 0: + return None + return max(minima) From 5a535cf43f485854d5f3ae8000f7309bf2fe7d18 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Sat, 14 Mar 2026 23:11:31 +0100 Subject: [PATCH 21/39] Remove filesystems config on Aurora This is now in mache instead. 
--- polaris/machines/aurora.cfg | 6 ------ 1 file changed, 6 deletions(-) diff --git a/polaris/machines/aurora.cfg b/polaris/machines/aurora.cfg index 83cc2de294..7c2be261b5 100644 --- a/polaris/machines/aurora.cfg +++ b/polaris/machines/aurora.cfg @@ -27,9 +27,3 @@ spack = /lus/flare/projects/E3SM_Dec/soft/polaris/aurora/spack # whether to use the same modules for hdf5, netcdf-c, netcdf-fortran and # pnetcdf as E3SM (spack modules are used otherwise) use_e3sm_hdf5_netcdf = True - -# Config options related to creating a job script -[job] - -# the filesystems used for the job -filesystems = home:flare From 72972110459e01aa940fc0f24e88cf92673607d9 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 16 Mar 2026 16:37:56 +0100 Subject: [PATCH 22/39] Fix how wall_time is handled in job scripts --- polaris/job/__init__.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/polaris/job/__init__.py b/polaris/job/__init__.py index 640eb89fa4..171ac0f2a5 100644 --- a/polaris/job/__init__.py +++ b/polaris/job/__init__.py @@ -130,19 +130,22 @@ def write_job_script( render_kwargs: dict[str, str] = {} + desired_wall_time = config.get('job', 'wall_time') + if system == 'slurm': ( partition, qos, constraint, gpus_per_node, - wall_time, + max_wallclock, nodes, ) = SlurmSystem.get_slurm_options( config=config.combined, nodes=nodes, min_nodes_allowed=min_nodes_allowed, ) + wall_time = _cap_wall_time(desired_wall_time, max_wallclock) template_name = 'job_script.slurm.template' render_kwargs.update( partition=partition, @@ -156,7 +159,7 @@ def write_job_script( queue, constraint, gpus_per_node, - wall_time, + max_wallclock, filesystems, nodes, ) = PbsSystem.get_pbs_options( @@ -164,6 +167,7 @@ def write_job_script( nodes=nodes, min_nodes_allowed=min_nodes_allowed, ) + wall_time = _cap_wall_time(desired_wall_time, max_wallclock) template_name = 'job_script.pbs.template' render_kwargs.update( queue=queue, @@ -236,3 +240,30 @@ def 
_get_min_nodes_allowed( if len(minima) == 0: return None return max(minima) + + +def _wallclock_to_seconds(wallclock): + """Convert HH:MM:SS wall-clock string to total seconds, or None.""" + parts = wallclock.split(':') + if len(parts) != 3: + return None + try: + return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]) + except ValueError: + return None + + +def _cap_wall_time(desired, max_wallclock): + """Return desired wall time, capped at max_wallclock if it is smaller. + + Defaults to desired if max_wallclock is empty or cannot be parsed. + """ + if not max_wallclock: + return desired + desired_secs = _wallclock_to_seconds(desired) + max_secs = _wallclock_to_seconds(max_wallclock) + if desired_secs is None or max_secs is None: + return desired + if desired_secs <= max_secs: + return desired + return max_wallclock From 2e0a6d7158f4e52bd49ff8c33181871e78d89333 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 18 Mar 2026 12:50:56 +0100 Subject: [PATCH 23/39] Update CI --- .github/workflows/build_workflow.yml | 76 ++++++++++++---------------- .github/workflows/docs_workflow.yml | 60 +++++++++++----------- 2 files changed, 63 insertions(+), 73 deletions(-) diff --git a/.github/workflows/build_workflow.yml b/.github/workflows/build_workflow.yml index 2cccbdcaec..ed322c6978 100644 --- a/.github/workflows/build_workflow.yml +++ b/.github/workflows/build_workflow.yml @@ -51,7 +51,7 @@ jobs: extra_args: --files ${{ steps.file_changes.outputs.files}} build: - name: test polaris - py ${{ matrix.python-version }}${{ matrix.env_only && ' (env_only)' || '' }} + name: test polaris - py ${{ matrix.python-version }}${{ matrix.mpi == 'nompi' && ' (nompi)' || '' }} runs-on: ubuntu-latest timeout-minutes: 20 defaults: @@ -60,10 +60,10 @@ jobs: strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] - env_only: [false] + mpi: ["mpich"] include: - python-version: "3.14" - env_only: true + mpi: "nompi" fail-fast: false steps: - id: skip_check @@ 
-76,47 +76,39 @@ jobs: uses: actions/checkout@v6 - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} - name: Cache Conda - uses: actions/cache@v5 - env: - # Increase this value to reset cache if conda-dev-spec.template has not changed in the workflow - CACHE_NUMBER: 0 + name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 with: - path: ~/conda_pkgs_dir_py${{ matrix.python-version }}${{ matrix.env_only && '_envonly' || '' }} - key: ${{ runner.os }}-${{ matrix.python-version }}${{ matrix.env_only && '-envonly' || '' }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('configure_polaris_envs.py,deploy/*') }} + python-version: ${{ matrix.python-version }} - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} - name: Set up Conda Environment - uses: mamba-org/setup-micromamba@v2 + name: Cache pixi packages + uses: actions/cache@v5 + env: + # Increase this value to reset cache if deploy inputs have not changed in the workflow + CACHE_NUMBER: 0 with: - environment-name: polaris_test - init-shell: bash - condarc: | - channel_priority: strict - channels: - - conda-forge - - e3sm/label/polaris - create-args: >- - python=${{ matrix.python-version }} + path: | + ~/.cache/rattler/cache + ~/.pixi/bin + key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.mpi }}-pixi-${{ env.CACHE_NUMBER }}-${{ hashFiles('deploy.py', 'deploy/**', 'pyproject.toml') }} - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} name: Install polaris run: | git config --global url."https://github.com/".insteadOf "git@github.com:" - if [ "${{ matrix.env_only }}" = "true" ]; then - ./configure_polaris_envs.py \ - --conda_env_only \ - --env_name polaris_test \ - --verbose \ - --python=${{ matrix.python-version }} - source load_polaris_test.sh - else - ./configure_polaris_envs.py \ - --env_name polaris_test \ - --verbose \ - --python=${{ matrix.python-version }} - source load_polaris_test_mpich.sh + ./deploy.py \ + --compiler gnu \ + --mpi ${{ matrix.mpi }} 
\ + --python ${{ matrix.python-version }} \ + --recreate + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ + ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + if [ -z "$load_script" ]; then + echo "ERROR: no generated Polaris load script found" >&2 + exit 1 fi + source "$load_script" python -c "import polaris; import polaris.version; print(polaris.version.__version__)" polaris --help polaris list --help @@ -125,21 +117,17 @@ jobs: - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} name: Run unit tests run: | - if [ "${{ matrix.env_only }}" = "true" ]; then - source load_polaris_test.sh - else - source load_polaris_test_mpich.sh - fi + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ + ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + source "$load_script" pytest tests/ - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} name: Build Sphinx Docs run: | - if [ "${{ matrix.env_only }}" = "true" ]; then - source load_polaris_test.sh - else - source load_polaris_test_mpich.sh - fi + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ + ! 
-name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + source "$load_script" DOCS_VERSION=test cd docs DOCS_VERSION=$DOCS_VERSION make versioned-html-strict diff --git a/.github/workflows/docs_workflow.yml b/.github/workflows/docs_workflow.yml index fe2bb7e9ee..cf8cd0206d 100644 --- a/.github/workflows/docs_workflow.yml +++ b/.github/workflows/docs_workflow.yml @@ -11,6 +11,7 @@ on: env: PYTHON_VERSION: "3.14" + MPI: "nompi" jobs: publish-docs: @@ -25,49 +26,50 @@ jobs: persist-credentials: false fetch-depth: 0 - - name: Cache Conda + - name: Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v6 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Cache pixi packages uses: actions/cache@v5 env: - # Increase this value to reset cache if deploy/conda-dev-spec.template has not changed in the workflow + # Increase this value to reset cache if deploy inputs have not changed in the workflow CACHE_NUMBER: 0 with: - path: ~/conda_pkgs_dir - key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ - hashFiles('configure_polaris_envs.py,deploy/*') }} - - - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} - name: Set up Conda Environment - uses: mamba-org/setup-micromamba@v2 - with: - environment-name: polaris_test - init-shell: bash - condarc: | - channel_priority: strict - channels: - - conda-forge - - e3sm/label/polaris - create-args: >- - python=${{ env.PYTHON_VERSION }} + path: | + ~/.cache/rattler/cache + ~/.pixi/bin + key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.MPI }}-pixi-${{ env.CACHE_NUMBER }}-${{ hashFiles('deploy.py', 'deploy/**', 'pyproject.toml') }} - - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} - name: Install polaris + - name: Install polaris run: | git config --global url."https://github.com/".insteadOf "git@github.com:" - ./configure_polaris_envs.py \ - --conda_env_only \ - --env_name polaris_test \ - --verbose \ - --python=${{ env.PYTHON_VERSION }} - source load_polaris_test.sh + ./deploy.py 
\ + --compiler gnu \ + --mpi ${{ env.MPI }} \ + --python ${{ env.PYTHON_VERSION }} \ + --recreate + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ + ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + if [ -z "$load_script" ]; then + echo "ERROR: no generated Polaris load script found" >&2 + exit 1 + fi + source "$load_script" - name: Build Sphinx Docs run: | - source load_polaris_test.sh + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ + ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + source "$load_script" cd docs DOCS_VERSION=${{ github.ref_name }} make versioned-html - name: Copy Docs and Commit run: | - source load_polaris_test.sh + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ + ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + source "$load_script" cd docs # gh-pages branch must already exist git clone https://github.com/E3SM-Project/polaris.git --branch gh-pages --single-branch gh-pages From deef35877851d650e60a03ad274df9a07084f17f Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 9 Feb 2026 21:11:40 +0000 Subject: [PATCH 24/39] Add GPU compiler to Aurora config --- polaris/machines/aurora.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/polaris/machines/aurora.cfg b/polaris/machines/aurora.cfg index 7c2be261b5..e64a647107 100644 --- a/polaris/machines/aurora.cfg +++ b/polaris/machines/aurora.cfg @@ -21,6 +21,9 @@ software_compiler = oneapi-ifx # the system MPI library to use for oneapi-ifx compiler mpi_oneapi_ifx = mpich +# the system MPI library to use for oneapi-ifxgpu compiler +mpi_oneapi_ifxgpu = mpich + # the base path for spack environments used by polaris spack = /lus/flare/projects/E3SM_Dec/soft/polaris/aurora/spack From 8f28ab667bc672bec85af3d8cdfdc9bfbe18039b Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Thu, 19 Mar 2026 10:50:07 +0100 Subject: [PATCH 25/39] Update the docs with correct load script names --- 
docs/developers_guide/machines/aurora.md | 2 +- docs/developers_guide/machines/chrysalis.md | 4 ++-- docs/developers_guide/machines/frontier.md | 4 ++-- docs/developers_guide/machines/index.md | 2 +- docs/developers_guide/machines/perlmutter.md | 4 ++-- docs/developers_guide/quick_start.md | 4 ++-- .../updating_spack/testing/running_test_suites.md | 4 ++-- .../dev_add_category_of_tasks/creating_category_of_tasks.md | 2 +- docs/tutorials/dev_add_category_of_tasks/getting_started.md | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/developers_guide/machines/aurora.md b/docs/developers_guide/machines/aurora.md index 5cdbf3d28c..4448915d4f 100644 --- a/docs/developers_guide/machines/aurora.md +++ b/docs/developers_guide/machines/aurora.md @@ -8,7 +8,7 @@ This is the default polaris compiler on Aurora. If the environment has been set up properly (see {ref}`dev-conda-env`), you should be able to source: ```bash -source load_dev_polaris_0.10.0-alpha.1_aurora_oneapi-ifx_mpich.sh +source load_polaris_aurora_oneapi-ifx_mpich.sh ``` MPAS components do not yet support Aurora, but Omega does. diff --git a/docs/developers_guide/machines/chrysalis.md b/docs/developers_guide/machines/chrysalis.md index f5f0c57dda..813bdb90db 100644 --- a/docs/developers_guide/machines/chrysalis.md +++ b/docs/developers_guide/machines/chrysalis.md @@ -8,7 +8,7 @@ This is the default polaris compiler on Chrysalis. 
If the environment has been set up properly (see {ref}`dev-conda-env`), you should be able to source: ```bash -source load_dev_polaris_0.1.0-alpha.1_chrysalis_intel_openmpi.sh +source load_polaris_chrysalis_intel_openmpi.sh ``` Then, you can build the MPAS model with @@ -22,7 +22,7 @@ make [DEBUG=true] [OPENMP=true] ifort If you've set things up for this compiler, you should be able to: ```bash -source load_dev_polaris_0.1.0-alpha.1_chrysalis_gnu_openmpi.sh +source load_polaris_chrysalis_gnu_openmpi.sh ``` Then, you can build the MPAS model with diff --git a/docs/developers_guide/machines/frontier.md b/docs/developers_guide/machines/frontier.md index 2af0374669..d2e41cff93 100644 --- a/docs/developers_guide/machines/frontier.md +++ b/docs/developers_guide/machines/frontier.md @@ -6,7 +6,7 @@ If you've set things up for this compiler, you should be able to source a load script similar to: ```bash -source load_dev_polaris_0.6.0-alpha.1_frontier_craygnu_mpich.sh +source load_polaris_frontier_craygnu_mpich.sh ``` Then, you can build the MPAS model with @@ -21,7 +21,7 @@ Similarly to `craygnu`, for `craycray`, if you've set things up right, sourcing the load scrip will look something like: ```bash -source load_dev_polaris_0.6.0-alpha.1_frontier_craycray_mpich.sh +source load_polaris_frontier_craycray_mpich.sh ``` To build MPAS components, use: diff --git a/docs/developers_guide/machines/index.md b/docs/developers_guide/machines/index.md index bf59de4b60..a30f27f6cd 100644 --- a/docs/developers_guide/machines/index.md +++ b/docs/developers_guide/machines/index.md @@ -20,7 +20,7 @@ Omega or an MPAS component and work with polaris. 
Just source the script that should appear in the base of your polaris branch, e.g.: ```bash -source load_dev_polaris_0.10.0-alpha.1_chrysalis_intel_openmpi.sh +source load_polaris_chrysalis_intel_openmpi.sh ``` After loading this environment, you can set up tasks or suites, and diff --git a/docs/developers_guide/machines/perlmutter.md b/docs/developers_guide/machines/perlmutter.md index 56a524efd0..918e3b16fd 100644 --- a/docs/developers_guide/machines/perlmutter.md +++ b/docs/developers_guide/machines/perlmutter.md @@ -6,7 +6,7 @@ If you've set things up for this compiler, you should be able to source a load script similar to: ```bash -source load_dev_polaris_0.1.0-alpha.1_pm-cpu_gnu_mpich.sh +source load_polaris_pm-cpu_gnu_mpich.sh ``` Then, you can build the MPAS model with @@ -21,7 +21,7 @@ Similarly to `gnu`, for `intel`, if you've set things up right, sourcing the load scrip will look something like: ```bash -source load_dev_polaris_0.1.0-alpha.1_pm-cpu_intel_mpich.sh +source load_polaris_pm-cpu_intel_mpich.sh ``` To build MPAS components, use: diff --git a/docs/developers_guide/quick_start.md b/docs/developers_guide/quick_start.md index e9b00cd6ac..df2e7d6235 100644 --- a/docs/developers_guide/quick_start.md +++ b/docs/developers_guide/quick_start.md @@ -187,7 +187,7 @@ current branch. If the two don't match, an error like the following results and the environment is not activated: ``` -$ source load_polaris_test_morpheus_gnu_openmpi.sh +$ source load_polaris_morpheus_gnu_openmpi.sh This load script is for a different version of polaris: __version__ = '0.2.0' @@ -268,7 +268,7 @@ To update only the bootstrap environment used internally by deployment: Each time you want to work with polaris, you will need to run: ```bash -source ./load_.sh +source load_polaris___.sh ``` This will load the appropriate environment for polaris. 
It will also diff --git a/docs/developers_guide/updating_spack/testing/running_test_suites.md b/docs/developers_guide/updating_spack/testing/running_test_suites.md index e780aceb51..3590350d5a 100644 --- a/docs/developers_guide/updating_spack/testing/running_test_suites.md +++ b/docs/developers_guide/updating_spack/testing/running_test_suites.md @@ -19,7 +19,7 @@ with Intel and OpenMPI): 2. **Source the Load Script** ```bash - source load_polaris_dev_0.3.0-alpha.1_chrysalis_intel_openmpi.sh + source load_polaris_chrysalis_intel_openmpi.sh ``` *(Replace `chrysalis`, `intel`, and `openmpi` with your machine, compiler, @@ -73,7 +73,7 @@ with Intel and OpenMPI): 2. **Source the Load Script** ```bash - source load_polaris_dev_0.3.0-alpha.1_chrysalis_intel_openmpi.sh + source load_polaris_chrysalis_intel_openmpi.sh ``` *(Replace as appropriate for your configuration.)* diff --git a/docs/tutorials/dev_add_category_of_tasks/creating_category_of_tasks.md b/docs/tutorials/dev_add_category_of_tasks/creating_category_of_tasks.md index a8087db6bb..f8cf23ad22 100644 --- a/docs/tutorials/dev_add_category_of_tasks/creating_category_of_tasks.md +++ b/docs/tutorials/dev_add_category_of_tasks/creating_category_of_tasks.md @@ -105,7 +105,7 @@ polaris list If you don't have access to the `polaris` command, you probably need to source the load script, something like: ``` bash -source load_dev_polaris_0.1.0-alpha.3_chrysalis_intel_openmpi.sh +source load_polaris_chrysalis_intel_openmpi.sh ``` If `polaris list` gives you import errors, something isn't quite hooked up diff --git a/docs/tutorials/dev_add_category_of_tasks/getting_started.md b/docs/tutorials/dev_add_category_of_tasks/getting_started.md index 289f5d3871..4995eea699 100644 --- a/docs/tutorials/dev_add_category_of_tasks/getting_started.md +++ b/docs/tutorials/dev_add_category_of_tasks/getting_started.md @@ -38,7 +38,7 @@ depends on your Polaris version, machine, and compilers. 
For example, on Chrysalis, you might have `load_dev_polaris_0.1.0-alpha.3_chrysalis_intel_openmpi.sh`: ```bash -source load_dev_polaris_0.1.0-alpha.3_chrysalis_intel_openmpi.sh +source load_polaris_chrysalis_intel_openmpi.sh ``` Now, get the E3SM source code (used by Polaris to build MPAS-Ocean) via the From ca529220414757d1506e6ac4267d55b5392250c5 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Thu, 19 Mar 2026 21:01:36 +0100 Subject: [PATCH 26/39] Update to mache 3.0.1 --- deploy.py | 5 +++++ deploy/cli_spec.json | 2 +- deploy/pins.cfg | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/deploy.py b/deploy.py index eb07482ad2..8de22a98a2 100755 --- a/deploy.py +++ b/deploy.py @@ -300,6 +300,10 @@ def _bootstrap_url( mache_fork=None, mache_branch=None, ): + override_url = str(os.environ.get('MACHE_BOOTSTRAP_URL', '')).strip() + if override_url: + return override_url + if mache_fork is not None and mache_branch is not None: # Raw file from a fork/branch return f'https://raw.githubusercontent.com/{mache_fork}/{mache_branch}/{BOOTSTRAP_RELPATH}' # noqa: E501 @@ -456,6 +460,7 @@ def _run_mache_deploy_run(pixi_exe, repo_root, mache_run_argv): cmd = ( f'env -u PIXI_PROJECT_MANIFEST -u PIXI_PROJECT_ROOT ' + f'-u PIXI_ENVIRONMENT_NAME -u PIXI_IN_SHELL ' f'{shlex.quote(pixi_exe)} run -m {shlex.quote(pixi_toml)} bash -lc ' f'{shlex.quote("cd " + repo_root + " && " + mache_cmd)}' ) diff --git a/deploy/cli_spec.json b/deploy/cli_spec.json index 49cc6d2f18..b8499522b8 100644 --- a/deploy/cli_spec.json +++ b/deploy/cli_spec.json @@ -1,7 +1,7 @@ { "meta": { "software": "polaris", - "mache_version": "3.0.0", + "mache_version": "3.0.1", "description": "Deploy polaris environment" }, "arguments": [ diff --git a/deploy/pins.cfg b/deploy/pins.cfg index b3825ae0b8..92aa1bbd51 100644 --- a/deploy/pins.cfg +++ b/deploy/pins.cfg @@ -3,7 +3,7 @@ bootstrap_python = 3.13 python = 3.13 geometric_features = 1.6.1 -mache = 3.0.0 +mache = 3.0.1 mpas_tools = 1.4.0 otps = 
2021.10 parallelio = 2.6.6 From f71268408182670ba2cc2a3d9b39c3a3377b2bad Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 20 Mar 2026 10:41:44 +0100 Subject: [PATCH 27/39] Update to mache 3.0.2 --- deploy.py | 2 ++ deploy/cli_spec.json | 2 +- deploy/pins.cfg | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/deploy.py b/deploy.py index 8de22a98a2..88a1cb8ac6 100755 --- a/deploy.py +++ b/deploy.py @@ -123,6 +123,8 @@ def main(): if not args.bootstrap_only: pixi_exe = _get_pixi_executable(getattr(args, 'pixi', None)) + if '--pixi' not in mache_run_argv: + mache_run_argv = ['--pixi', pixi_exe] + mache_run_argv _run_mache_deploy_run( pixi_exe=pixi_exe, repo_root='.', diff --git a/deploy/cli_spec.json b/deploy/cli_spec.json index b8499522b8..0d632b8958 100644 --- a/deploy/cli_spec.json +++ b/deploy/cli_spec.json @@ -1,7 +1,7 @@ { "meta": { "software": "polaris", - "mache_version": "3.0.1", + "mache_version": "3.0.2", "description": "Deploy polaris environment" }, "arguments": [ diff --git a/deploy/pins.cfg b/deploy/pins.cfg index 92aa1bbd51..3249bc2d87 100644 --- a/deploy/pins.cfg +++ b/deploy/pins.cfg @@ -3,7 +3,7 @@ bootstrap_python = 3.13 python = 3.13 geometric_features = 1.6.1 -mache = 3.0.1 +mache = 3.0.2 mpas_tools = 1.4.0 otps = 2021.10 parallelio = 2.6.6 From e12c046bac27923cfc767b0bcec98bb9cfb32dac Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 20 Mar 2026 15:16:48 +0100 Subject: [PATCH 28/39] Remove functions from API docs that were removed --- docs/developers_guide/api.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/developers_guide/api.md b/docs/developers_guide/api.md index 6e4389cee7..0f0ae74cb8 100644 --- a/docs/developers_guide/api.md +++ b/docs/developers_guide/api.md @@ -227,8 +227,6 @@ seaice/api :toctree: generated/ write_job_script - get_slurm_options - get_pbs_options ``` ### logging From 650136355feee8681d88153cfa418a40cefbedfa Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 20 Mar 
2026 14:06:20 +0100 Subject: [PATCH 29/39] Fix support for deployment without spack --- .github/workflows/build_workflow.yml | 20 ++++--------- .github/workflows/docs_workflow.yml | 15 ++++------ deploy/cli_spec.json | 7 +++++ deploy/hooks.py | 43 +++++++++++++++++++--------- docs/developers_guide/quick_start.md | 32 +++++++++++++++------ 5 files changed, 72 insertions(+), 45 deletions(-) diff --git a/.github/workflows/build_workflow.yml b/.github/workflows/build_workflow.yml index ed322c6978..2d8570f2dd 100644 --- a/.github/workflows/build_workflow.yml +++ b/.github/workflows/build_workflow.yml @@ -51,7 +51,7 @@ jobs: extra_args: --files ${{ steps.file_changes.outputs.files}} build: - name: test polaris - py ${{ matrix.python-version }}${{ matrix.mpi == 'nompi' && ' (nompi)' || '' }} + name: test polaris - py ${{ matrix.python-version }} runs-on: ubuntu-latest timeout-minutes: 20 defaults: @@ -60,10 +60,6 @@ jobs: strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] - mpi: ["mpich"] - include: - - python-version: "3.14" - mpi: "nompi" fail-fast: false steps: - id: skip_check @@ -91,19 +87,17 @@ jobs: path: | ~/.cache/rattler/cache ~/.pixi/bin - key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.mpi }}-pixi-${{ env.CACHE_NUMBER }}-${{ hashFiles('deploy.py', 'deploy/**', 'pyproject.toml') }} + key: ${{ runner.os }}-${{ matrix.python-version }}-pixi-${{ env.CACHE_NUMBER }}-${{ hashFiles('deploy.py', 'deploy/**', 'pyproject.toml') }} - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} name: Install polaris run: | git config --global url."https://github.com/".insteadOf "git@github.com:" ./deploy.py \ - --compiler gnu \ - --mpi ${{ matrix.mpi }} \ + --no-spack \ --python ${{ matrix.python-version }} \ --recreate - load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ - ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + load_script=$(find . 
-maxdepth 1 -type f -name 'load_polaris*.sh' | sort | tail -n 1) if [ -z "$load_script" ]; then echo "ERROR: no generated Polaris load script found" >&2 exit 1 @@ -117,16 +111,14 @@ jobs: - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} name: Run unit tests run: | - load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ - ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' | sort | tail -n 1) source "$load_script" pytest tests/ - if: ${{ steps.skip_check.outputs.should_skip != 'true' }} name: Build Sphinx Docs run: | - load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ - ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' | sort | tail -n 1) source "$load_script" DOCS_VERSION=test cd docs diff --git a/.github/workflows/docs_workflow.yml b/.github/workflows/docs_workflow.yml index cf8cd0206d..1d96676daa 100644 --- a/.github/workflows/docs_workflow.yml +++ b/.github/workflows/docs_workflow.yml @@ -11,7 +11,6 @@ on: env: PYTHON_VERSION: "3.14" - MPI: "nompi" jobs: publish-docs: @@ -40,18 +39,16 @@ jobs: path: | ~/.cache/rattler/cache ~/.pixi/bin - key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.MPI }}-pixi-${{ env.CACHE_NUMBER }}-${{ hashFiles('deploy.py', 'deploy/**', 'pyproject.toml') }} + key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-pixi-${{ env.CACHE_NUMBER }}-${{ hashFiles('deploy.py', 'deploy/**', 'pyproject.toml') }} - name: Install polaris run: | git config --global url."https://github.com/".insteadOf "git@github.com:" ./deploy.py \ - --compiler gnu \ - --mpi ${{ env.MPI }} \ + --no-spack \ --python ${{ env.PYTHON_VERSION }} \ --recreate - load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ - ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + load_script=$(find . 
-maxdepth 1 -type f -name 'load_polaris*.sh' | sort | tail -n 1) if [ -z "$load_script" ]; then echo "ERROR: no generated Polaris load script found" >&2 exit 1 @@ -60,15 +57,13 @@ jobs: - name: Build Sphinx Docs run: | - load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ - ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' | sort | tail -n 1) source "$load_script" cd docs DOCS_VERSION=${{ github.ref_name }} make versioned-html - name: Copy Docs and Commit run: | - load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' \ - ! -name 'load_polaris_gnu_openmpi.sh' | sort | tail -n 1) + load_script=$(find . -maxdepth 1 -type f -name 'load_polaris*.sh' | sort | tail -n 1) source "$load_script" cd docs # gh-pages branch must already exist diff --git a/deploy/cli_spec.json b/deploy/cli_spec.json index 0d632b8958..5aae5cdf51 100644 --- a/deploy/cli_spec.json +++ b/deploy/cli_spec.json @@ -44,6 +44,13 @@ "help": "Deploy all supported Spack environments (overrides spack.deploy in deploy/config.yaml.j2).", "route": ["deploy", "run"] }, + { + "flags": ["--no-spack"], + "dest": "no_spack", + "action": "store_true", + "help": "Disable all Spack use for this run, including reuse of existing Spack environments.", + "route": ["deploy", "run"] + }, { "flags": ["--spack-path"], "dest": "spack_path", diff --git a/deploy/hooks.py b/deploy/hooks.py index 1be12d2767..ae0a942154 100644 --- a/deploy/hooks.py +++ b/deploy/hooks.py @@ -65,13 +65,23 @@ def pre_spack(ctx: DeployContext) -> dict[str, Any] | None: Optional mapping merged into `ctx.runtime` by mache. 
""" - updates: Dict[str, Any] = {} - spack_path = _get_spack_path(ctx.config, ctx.machine, ctx.machine_config) + spack_path = _get_spack_path( + ctx.config, ctx.machine, ctx.machine_config, ctx.args + ) - if spack_path is not None: - updates['spack'] = {'spack_path': spack_path} + if spack_path is None: + ctx.logger.info( + 'No supported shared Spack environment was detected for this ' + 'run; disabling Spack and relying on Pixi dependencies instead.' + ) + return { + 'spack': { + 'supported': False, + 'software': {'supported': False}, + } + } - return updates + return {'spack': {'spack_path': spack_path}} def _get_version(): @@ -117,14 +127,21 @@ def _get_pixi_mpi(machine, machine_config): return mpi -def _get_spack_path(config, machine, machine_config): +def _get_spack_path(config, machine, machine_config, args): """ - Get the Spack path from environment variable or machine config + Get the Spack path from CLI, config or machine config """ + spack_path = getattr(args, 'spack_path', None) + if spack_path is not None and str(spack_path).strip(): + return spack_path + spack_path = config.get('spack', {}).get('spack_path') - if spack_path is not None: - # no need to update - return None + if spack_path is not None and str(spack_path).strip().lower() not in ( + '', + 'none', + 'null', + ): + return spack_path if machine is None: return None @@ -137,10 +154,10 @@ def _get_spack_path(config, machine, machine_config): spack_env = f'dev_polaris_{release_version}' if not machine_config.has_section('deploy'): - raise ValueError("Missing 'deploy' section in machine config") + return None section = machine_config['deploy'] spack_base = section.get('spack') + if spack_base is None or not spack_base.strip(): + return None spack_path = os.path.join(spack_base, spack_env) - if spack_path is None: - raise ValueError("Missing 'spack' option in 'deploy' section") return spack_path diff --git a/docs/developers_guide/quick_start.md b/docs/developers_guide/quick_start.md index 
df2e7d6235..28055a9961 100644 --- a/docs/developers_guide/quick_start.md +++ b/docs/developers_guide/quick_start.md @@ -58,14 +58,17 @@ If you are on one of the {ref}`dev-supported-machines`, run: ```bash ./deploy.py [--machine ] [--compiler ...] \ - [--mpi ...] [--deploy-spack] [--prefix ] [--recreate] + [--mpi ...] [--deploy-spack] [--no-spack] \ + [--prefix ] [--recreate] ``` If you are on a login node, machine detection typically works automatically. You can pass `--machine ` explicitly if needed. -Use `--deploy-spack` when you want to deploy machine-specific Spack -environments in addition to the local pixi environment. +By default, Polaris will reuse existing machine-specific Spack environments +when the current deployment needs them. Use `--deploy-spack` when you want to +build or update those Spack environments. Use `--no-spack` for a Pixi-only +deployment, such as CI or unsupported machines. ### Unknown machines @@ -100,7 +103,11 @@ For workflows that need custom machine config files, see {ref}`config-files`. `--deploy-spack` -: deploy supported Spack environments +: deploy supported Spack environments instead of only reusing existing ones + +`--no-spack` + +: disable all Spack use for this run and rely on Pixi dependencies instead `--spack-path` @@ -242,7 +249,7 @@ can run: ```bash ./deploy.py [--machine ] [--compiler ...] \ - [--mpi ...] [--deploy-spack] --recreate + [--mpi ...] [--deploy-spack] [--no-spack] --recreate ``` The `--recreate` flag will delete the environment and create it from @@ -256,8 +263,14 @@ For some workflows (e.g. for MALI development with the Albany library when the MALI build environment has been created outside of `polaris`, for example, on an unsupported machine), you may only want to create the pixi environment and not build SCORPIO, ESMF or include any system modules or environment -variables in your activation script. In such cases, run `./deploy.py` -without `--deploy-spack`. +variables in your activation script. 
In such cases, run: + +```bash +./deploy.py --no-spack +``` + +When `--no-spack` is not used, omitting `--deploy-spack` still means Polaris +will try to reuse any required pre-existing Spack environments. To update only the bootstrap environment used internally by deployment: @@ -268,9 +281,12 @@ To update only the bootstrap environment used internally by deployment: Each time you want to work with polaris, you will need to run: ```bash -source load_polaris___.sh +source load_polaris.sh ``` +For machine-specific deployments that use Spack, the generated script is +typically `load_polaris___.sh`. + This will load the appropriate environment for polaris. It will also set an environment variable `POLARIS_LOAD_SCRIPT` that points to the activation script. Polaris uses this to make a symlink to the activation script From feae0650190e2cad855fa5dd4f5a7b5085767edb Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 20 Mar 2026 16:17:57 +0100 Subject: [PATCH 30/39] Update to mache 3.0.3 --- deploy/cli_spec.json | 2 +- deploy/pins.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy/cli_spec.json b/deploy/cli_spec.json index 5aae5cdf51..87033f0625 100644 --- a/deploy/cli_spec.json +++ b/deploy/cli_spec.json @@ -1,7 +1,7 @@ { "meta": { "software": "polaris", - "mache_version": "3.0.2", + "mache_version": "3.0.3", "description": "Deploy polaris environment" }, "arguments": [ diff --git a/deploy/pins.cfg b/deploy/pins.cfg index 3249bc2d87..833a318cda 100644 --- a/deploy/pins.cfg +++ b/deploy/pins.cfg @@ -3,7 +3,7 @@ bootstrap_python = 3.13 python = 3.13 geometric_features = 1.6.1 -mache = 3.0.2 +mache = 3.0.3 mpas_tools = 1.4.0 otps = 2021.10 parallelio = 2.6.6 From 2defb901a905b2acb5694521416bb8434f3ba788 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 20 Mar 2026 18:23:11 +0100 Subject: [PATCH 31/39] Fix mache spec --- deploy/pixi.toml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/pixi.toml.j2 
b/deploy/pixi.toml.j2 index 96c9c9da44..6e092079b2 100644 --- a/deploy/pixi.toml.j2 +++ b/deploy/pixi.toml.j2 @@ -12,7 +12,7 @@ channel-priority = "strict" python = "{{ python }}.*" {%- if include_mache %} -mache = "=={{ mache }}.*" +mache = "{{ mache }}.*" {%- endif %} {%- if include_jigsaw %} From 9c92dbd556529078d457689904de24203b84f0f6 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Sat, 21 Mar 2026 09:01:08 +0100 Subject: [PATCH 32/39] Update to mache v3.0.4 --- deploy.py | 19 +++++++++++++++++-- deploy/cli_spec.json | 2 +- deploy/pins.cfg | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/deploy.py b/deploy.py index 88a1cb8ac6..b16c7503b7 100755 --- a/deploy.py +++ b/deploy.py @@ -52,10 +52,17 @@ def main(): _validate_fork_branch_pair(args) using_fork = getattr(args, 'mache_fork', None) is not None + requested_mache_version = str( + getattr(args, 'mache_version', '') or '' + ).strip() if not using_fork: _validate_cli_spec_matches_pins(cli_spec, pinned_mache_version) + bootstrap_mache_version = pinned_mache_version + if not using_fork and requested_mache_version: + bootstrap_mache_version = requested_mache_version + # remove tmp dir if os.path.exists(DEPLOY_TMP_DIR): shutil.rmtree(DEPLOY_TMP_DIR) @@ -63,7 +70,7 @@ def main(): os.makedirs(DEPLOY_TMP_DIR) bootstrap_url = _bootstrap_url( - mache_version=pinned_mache_version, + mache_version=bootstrap_mache_version, mache_fork=getattr(args, 'mache_fork', None), mache_branch=getattr(args, 'mache_branch', None), ) @@ -109,11 +116,19 @@ def main(): if args.bootstrap_only: pixi_exe = _get_pixi_executable(getattr(args, 'pixi', None)) bootstrap_dir = os.path.join(DEPLOY_TMP_DIR, 'bootstrap_pixi') + update_cmd = f'mache deploy update --software {software}' + if requested_mache_version: + update_cmd = ( + f'{update_cmd} --mache-version ' + f'{shlex.quote(requested_mache_version)}' + ) print( '\nBootstrap environment is ready. 
To use it interactively:\n' f' pixi shell -m {bootstrap_dir}/pixi.toml\n\n' 'Then, you can run:\n' - f' mache deploy update --software {software}\n' + f' {update_cmd}\n' + 'After update, edit deploy/pins.cfg to set [pixi] mache to the ' + 'new version.\n' f' exit\n' ) diff --git a/deploy/cli_spec.json b/deploy/cli_spec.json index 87033f0625..969bf573ea 100644 --- a/deploy/cli_spec.json +++ b/deploy/cli_spec.json @@ -1,7 +1,7 @@ { "meta": { "software": "polaris", - "mache_version": "3.0.3", + "mache_version": "3.0.4", "description": "Deploy polaris environment" }, "arguments": [ diff --git a/deploy/pins.cfg b/deploy/pins.cfg index 833a318cda..ab59908e60 100644 --- a/deploy/pins.cfg +++ b/deploy/pins.cfg @@ -3,7 +3,7 @@ bootstrap_python = 3.13 python = 3.13 geometric_features = 1.6.1 -mache = 3.0.3 +mache = 3.0.4 mpas_tools = 1.4.0 otps = 2021.10 parallelio = 2.6.6 From 0f90e4837274bcaa6d0e5328e5d1fb9790192779 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 6 Mar 2026 14:58:51 +0100 Subject: [PATCH 33/39] Update to 1.0.0-alpha.1 --- polaris/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/version.py b/polaris/version.py index ade57dd87c..e43a737498 100644 --- a/polaris/version.py +++ b/polaris/version.py @@ -1 +1 @@ -__version__ = '0.10.0-alpha.1' +__version__ = '1.0.0-alpha.1' From dd8a7b2a72aadee59f2a1b150d3042c06c43dbe6 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 6 Mar 2026 13:22:24 -0600 Subject: [PATCH 34/39] Update pins for dependencies --- deploy/pins.cfg | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deploy/pins.cfg b/deploy/pins.cfg index ab59908e60..6aafe3b283 100644 --- a/deploy/pins.cfg +++ b/deploy/pins.cfg @@ -1,7 +1,7 @@ # pins for the pixi environment [pixi] -bootstrap_python = 3.13 -python = 3.13 +bootstrap_python = 3.14 +python = 3.14 geometric_features = 1.6.1 mache = 3.0.4 mpas_tools = 1.4.0 @@ -18,9 +18,9 @@ scorpio = 1.8.2 # pins for both pixi and spack 
environments [all] -esmf = 8.9.0 +esmf = 8.9.1 metis = 5.1.0 moab = 5.6.0 -netcdf_c = 4.9.3 +netcdf_c = 4.10.0 netcdf_fortran = 4.6.2 pnetcdf = 1.14.1 From d56afddbc91349a819b90e25d6efa43f976946e7 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 6 Mar 2026 15:06:21 -0600 Subject: [PATCH 35/39] Add oneapi-ifx support for chrysalis --- polaris/machines/chrysalis.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/polaris/machines/chrysalis.cfg b/polaris/machines/chrysalis.cfg index 912241701b..39ab78e34d 100644 --- a/polaris/machines/chrysalis.cfg +++ b/polaris/machines/chrysalis.cfg @@ -23,6 +23,9 @@ mpi_intel = openmpi # the system MPI library to use for gnu compiler mpi_gnu = openmpi +# the system MPI library to use for oneapi-ifx compiler +mpi_oneapi_ifx = openmpi + # the base path for spack environments used by polaris spack = /lcrc/soft/climate/polaris/chrysalis/spack From 5736bcb1ad46aeda397ebd689e959a72c024bd61 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 16 Mar 2026 13:47:36 +0000 Subject: [PATCH 36/39] Make sure auto-build raises an error on build failures Previously, the error code from Omega or MPAS-Ocean builds was not getting trapped because of the use of the `tee` command. 
--- polaris/build/mpas_ocean.py | 3 ++- polaris/build/omega.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/polaris/build/mpas_ocean.py b/polaris/build/mpas_ocean.py index e70b1e0c66..320257f442 100644 --- a/polaris/build/mpas_ocean.py +++ b/polaris/build/mpas_ocean.py @@ -85,7 +85,8 @@ def build_mpas_ocean( if quiet: command += f' > {log_filename} 2>&1 ' else: - command += f' 2>&1 | tee {log_filename}' + # use pipefail so a build failure is not masked by tee's exit code + command = f'set -o pipefail; {command} 2>&1 | tee {log_filename}' subprocess.check_call(command, shell=True) print(f'MPAS-Ocean builds script written to:\n {script_filename}\n') diff --git a/polaris/build/omega.py b/polaris/build/omega.py index 66d768bbff..47fe1765f6 100644 --- a/polaris/build/omega.py +++ b/polaris/build/omega.py @@ -82,7 +82,8 @@ def build_omega( if quiet: command += f' > {log_filename} 2>&1 ' else: - command += f' 2>&1 | tee {log_filename}' + # use pipefail so a build failure is not masked by tee's exit code + command = f'set -o pipefail; {command} 2>&1 | tee {log_filename}' subprocess.check_call(command, shell=True) print(f'Omega builds script written to:\n {script_filename}\n') From c69d7e3b093a243dae33228cd6aebdba9139c802 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 18 Mar 2026 13:12:59 +0100 Subject: [PATCH 37/39] Update Omega submodule to bring in E3SM/master These changes are needed to build Omega with latest modules on Perlmutter, Aurora and Chrysalis. 
--- e3sm_submodules/Omega | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e3sm_submodules/Omega b/e3sm_submodules/Omega index 09b2f44b55..d0b3482cc5 160000 --- a/e3sm_submodules/Omega +++ b/e3sm_submodules/Omega @@ -1 +1 @@ -Subproject commit 09b2f44b5518697b969658b7bcff04ed6b45c025 +Subproject commit d0b3482cc5383108898f14f5566a17d20450d7d3 From 7284d86e7613858d14c73f70b9510ab3dbeb4e0c Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 18 Mar 2026 10:31:31 -0500 Subject: [PATCH 38/39] Lower rotation_2d convergence_thresh_tracer3_order2 to 0.25 --- polaris/tasks/ocean/sphere_transport/rotation_2d.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/tasks/ocean/sphere_transport/rotation_2d.cfg b/polaris/tasks/ocean/sphere_transport/rotation_2d.cfg index 282f8d58d8..474a77b45b 100644 --- a/polaris/tasks/ocean/sphere_transport/rotation_2d.cfg +++ b/polaris/tasks/ocean/sphere_transport/rotation_2d.cfg @@ -12,4 +12,4 @@ convergence_thresh_tracer3_order3 = 0.4 # convergence threshold below which the test fails for order 2 convergence_thresh_tracer1_order2 = 0.6 convergence_thresh_tracer2_order2 = 1.4 -convergence_thresh_tracer3_order2 = 0.27 +convergence_thresh_tracer3_order2 = 0.25 From 87fea22395da526138dc7c57f8d13ba9e1d46e2f Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 20 Mar 2026 12:36:30 +0100 Subject: [PATCH 39/39] Update to parallelio 2.6.9 --- deploy/pins.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/pins.cfg b/deploy/pins.cfg index 6aafe3b283..dce5bb66fe 100644 --- a/deploy/pins.cfg +++ b/deploy/pins.cfg @@ -6,7 +6,7 @@ geometric_features = 1.6.1 mache = 3.0.4 mpas_tools = 1.4.0 otps = 2021.10 -parallelio = 2.6.6 +parallelio = 2.6.9 # pins for the spack environment [spack]