From cf8753cb526d15db05ef19263da5ca1fa3b7ba37 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 25 Mar 2026 09:37:49 +0100 Subject: [PATCH 1/7] Update to v1.0.0-alpha.2 --- polaris/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/version.py b/polaris/version.py index e43a737498..e48e451469 100644 --- a/polaris/version.py +++ b/polaris/version.py @@ -1 +1 @@ -__version__ = '1.0.0-alpha.1' +__version__ = '1.0.0-alpha.2' From 8f93638910d052028d9802c483b11ea691e6054e Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 25 Mar 2026 09:57:58 +0100 Subject: [PATCH 2/7] Update to mache 3.2.0 --- deploy.py | 109 +++++++++++++++++++++++++++++++++++++------ deploy/cli_spec.json | 2 +- deploy/pins.cfg | 2 +- 3 files changed, 98 insertions(+), 15 deletions(-) diff --git a/deploy.py b/deploy.py index b16c7503b7..cd899183ed 100755 --- a/deploy.py +++ b/deploy.py @@ -3,7 +3,8 @@ Target software deployment entrypoint. - Reads pinned mache version from deploy/pins.cfg -- Reads CLI spec from deploy/cli_spec.json and builds argparse CLI +- Reads CLI spec from deploy/cli_spec.json plus optional + deploy/custom_cli_spec.json and builds argparse CLI - Downloads mache/deploy/bootstrap.py for either: * a given mache fork/branch, or * the pinned mache version @@ -25,6 +26,7 @@ PINS_CFG = os.path.join('deploy', 'pins.cfg') CLI_SPEC_JSON = os.path.join('deploy', 'cli_spec.json') +CUSTOM_CLI_SPEC_JSON = os.path.join('deploy', 'custom_cli_spec.json') DEPLOY_TMP_DIR = 'deploy_tmp' BOOTSTRAP_PATH = os.path.join(DEPLOY_TMP_DIR, 'bootstrap.py') @@ -40,6 +42,10 @@ def main(): pinned_mache_version, pinned_python_version = _read_pins(PINS_CFG) cli_spec = _read_cli_spec(CLI_SPEC_JSON) + cli_spec = _merge_optional_cli_spec( + cli_spec, + _read_optional_cli_spec(CUSTOM_CLI_SPEC_JSON), + ) parser = _build_parser_from_cli_spec(cli_spec) args = parser.parse_args(sys.argv[1:]) @@ -225,6 +231,74 @@ def _read_cli_spec(spec_path): return spec +def _read_optional_cli_spec(spec_path): + if not os.path.exists(spec_path): + return None + + try: + with open(spec_path, 'r', encoding='utf-8') as f: + spec = json.load(f) + except (OSError, json.JSONDecodeError) as e: + raise SystemExit(f'ERROR: Failed to parse {spec_path}: {e!r}') from e + + if not isinstance(spec, dict): + raise SystemExit(f'ERROR: {spec_path} must contain a JSON object') + if 'arguments' not in spec: + raise SystemExit( + f"ERROR: {spec_path} must contain top-level key 'arguments'" + ) + if not isinstance(spec['arguments'], list): + raise SystemExit(f"ERROR: {spec_path} 'arguments' must be a list") + meta = spec.get('meta') + if meta is not None and not isinstance(meta, dict): + raise SystemExit(f"ERROR: {spec_path} 'meta' must be an object") + + return spec + + +def _merge_optional_cli_spec(cli_spec, custom_cli_spec): + if custom_cli_spec is None: + return cli_spec + + merged_meta = dict(cli_spec.get('meta', {})) # type: dict + merged_arguments = list(cli_spec.get('arguments', [])) # type: list + merged = { + 'meta': merged_meta, + 'arguments': merged_arguments, + } + + seen_dests = set() + seen_flags = set() + for entry in merged_arguments: + dest = entry.get('dest') + if dest: + seen_dests.add(dest) + for flag in entry.get('flags', []): + seen_flags.add(flag) + + for entry in custom_cli_spec['arguments']: + dest = entry.get('dest') + if dest in seen_dests: + raise SystemExit( + 'ERROR: deploy/custom_cli_spec.json duplicates generated ' + f"dest '{dest}'" + ) + flags = entry.get('flags', []) + duplicate_flags = [flag for flag in flags if flag in seen_flags] + if duplicate_flags: + dup_str = ', '.join(duplicate_flags) + raise SystemExit( + 'ERROR: deploy/custom_cli_spec.json duplicates generated ' + f'flags: {dup_str}' + ) + merged_arguments.append(entry) + if dest: + seen_dests.add(dest) + seen_flags.update(flags) + + return merged + + def _build_parser_from_cli_spec(cli_spec): description = cli_spec.get('meta', {}).get( 'description', 'Deploy E3SM software environment' @@ -468,21 +542,30 @@ def _run_mache_deploy_run(pixi_exe, repo_root, mache_run_argv): f'ERROR: bootstrap pixi project not found. Expected: {pixi_toml}' ) - # Build a bash command that runs mache inside pixi, then cd's to repo. - mache_cmd = 'mache deploy run' + env = os.environ.copy() + for var in ( + 'PIXI_PROJECT_MANIFEST', + 'PIXI_PROJECT_ROOT', + 'PIXI_ENVIRONMENT_NAME', + 'PIXI_IN_SHELL', + ): + env.pop(var, None) + + cmd = [ + pixi_exe, + 'run', + '-m', + pixi_toml, + '--', + 'mache', + 'deploy', + 'run', + ] if mache_run_argv: - mache_cmd = f'{mache_cmd} ' + ' '.join( - shlex.quote(a) for a in mache_run_argv - ) + cmd.extend(mache_run_argv) - cmd = ( - f'env -u PIXI_PROJECT_MANIFEST -u PIXI_PROJECT_ROOT ' - f'-u PIXI_ENVIRONMENT_NAME -u PIXI_IN_SHELL ' - f'{shlex.quote(pixi_exe)} run -m {shlex.quote(pixi_toml)} bash -lc ' - f'{shlex.quote("cd " + repo_root + " && " + mache_cmd)}' - ) try: - subprocess.check_call(['/bin/bash', '-lc', cmd]) + subprocess.run(cmd, cwd=repo_root, env=env, check=True) except subprocess.CalledProcessError as e: raise SystemExit( f'\nERROR: Deployment step failed (exit code {e.returncode}). ' diff --git a/deploy/cli_spec.json b/deploy/cli_spec.json index 969bf573ea..8bff7a8d52 100644 --- a/deploy/cli_spec.json +++ b/deploy/cli_spec.json @@ -1,7 +1,7 @@ { "meta": { "software": "polaris", - "mache_version": "3.0.4", + "mache_version": "3.2.0", "description": "Deploy polaris environment" }, "arguments": [ diff --git a/deploy/pins.cfg b/deploy/pins.cfg index dce5bb66fe..a023be6966 100644 --- a/deploy/pins.cfg +++ b/deploy/pins.cfg @@ -3,7 +3,7 @@ bootstrap_python = 3.14 python = 3.14 geometric_features = 1.6.1 -mache = 3.0.4 +mache = 3.2.0 mpas_tools = 1.4.0 otps = 2021.10 parallelio = 2.6.9 From 513a18117cd1fd7de69e08a3d27d0a2707539c30 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 25 Mar 2026 10:53:53 +0100 Subject: [PATCH 3/7] Add group and world-read config options These will be used to update permissions after the spack environment is deployed. --- deploy/config.yaml.j2 | 11 +++++++++++ polaris/machines/aurora.cfg | 6 ++++++ polaris/machines/chrysalis.cfg | 6 ++++++ polaris/machines/frontier.cfg | 6 ++++++ polaris/machines/pm-cpu.cfg | 6 ++++++ polaris/machines/pm-gpu.cfg | 6 ++++++ 6 files changed, 41 insertions(+) diff --git a/deploy/config.yaml.j2 b/deploy/config.yaml.j2 index 709bb51cb1..a710213dda 100644 --- a/deploy/config.yaml.j2 +++ b/deploy/config.yaml.j2 @@ -124,6 +124,17 @@ jigsaw: # Relative path in the target repo where JIGSAW-Python lives. jigsaw_python_path: jigsaw-python +permissions: + # Optional overrides for post-deploy permission updates. + # If unset here, `mache deploy run` falls back to: + # 1. hook-provided runtime permissions + # 2. machine config [deploy] group/world_readable + # 3. legacy machine config [e3sm_unified] group + group: null + + # Whether deployed files should be readable outside the shared group. + world_readable: true + # Deployment hooks hooks: file: "deploy/hooks.py" diff --git a/polaris/machines/aurora.cfg b/polaris/machines/aurora.cfg index 4db2de35cf..d22867a220 100644 --- a/polaris/machines/aurora.cfg +++ b/polaris/machines/aurora.cfg @@ -11,6 +11,12 @@ polaris_envs = /lus/flare/projects/E3SM_Dec/soft/polaris/aurora/base # Options related to deploying polaris environments on supported machines [deploy] +# the unix group for permissions for deployed polaris environments +group = E3SMinput + +# whether deployed environments should be readable outside the shared group +world_readable = true + # the compiler set to use for system libraries and MPAS builds compiler = oneapi-ifx diff --git a/polaris/machines/chrysalis.cfg b/polaris/machines/chrysalis.cfg index 39ab78e34d..1a00e2c861 100644 --- a/polaris/machines/chrysalis.cfg +++ b/polaris/machines/chrysalis.cfg @@ -11,6 +11,12 @@ polaris_envs = /lcrc/soft/climate/polaris/chrysalis/base # Options related to deploying polaris environments on supported machines [deploy] +# the unix group for permissions for deployed polaris environments +group = cels + +# whether deployed environments should be readable outside the shared group +world_readable = true + # the compiler set to use for system libraries and MPAS builds compiler = intel diff --git a/polaris/machines/frontier.cfg b/polaris/machines/frontier.cfg index ce40359bc0..d66d5c1eb9 100644 --- a/polaris/machines/frontier.cfg +++ b/polaris/machines/frontier.cfg @@ -11,6 +11,12 @@ polaris_envs = /ccs/proj/cli115/software/polaris/frontier/conda/base # Options related to deploying polaris environments on supported machines [deploy] +# the unix group for permissions for deployed polaris environments +group = cli115 + +# whether deployed environments should be readable outside the shared group +world_readable = true + # the compiler set to use for system libraries and MPAS builds compiler = craygnu diff --git a/polaris/machines/pm-cpu.cfg b/polaris/machines/pm-cpu.cfg index 4f19cc86d4..71c2292e1e 100644 --- a/polaris/machines/pm-cpu.cfg +++ b/polaris/machines/pm-cpu.cfg @@ -11,6 +11,12 @@ polaris_envs = /global/common/software/e3sm/polaris/pm-cpu/conda/base # Options related to deploying polaris environments on supported machines [deploy] +# the unix group for permissions for deployed polaris environments +group = e3sm + +# whether deployed environments should be readable outside the shared group +world_readable = true + # the compiler set to use for system libraries and MPAS builds compiler = gnu diff --git a/polaris/machines/pm-gpu.cfg b/polaris/machines/pm-gpu.cfg index 5297573da5..319a3e2071 100644 --- a/polaris/machines/pm-gpu.cfg +++ b/polaris/machines/pm-gpu.cfg @@ -11,6 +11,12 @@ polaris_envs = /global/common/software/e3sm/polaris/pm-gpu/conda/base # Options related to deploying polaris environments on supported machines [deploy] +# the unix group for permissions for deployed polaris environments +group = e3sm + +# whether deployed environments should be readable outside the shared group +world_readable = true + # the compiler set to use for system libraries and MPAS builds compiler = gnugpu From 6c79862a9b35758dfc4dd4fa887777c7052fc17b Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Wed, 25 Mar 2026 20:48:51 +0100 Subject: [PATCH 4/7] Drop gpus_per_node from PBS job script This didn't seem to work as expected on Aurora and isn't needed. --- polaris/job/job_script.pbs.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/job/job_script.pbs.template b/polaris/job/job_script.pbs.template index 4832a8c475..6c62bc53df 100644 --- a/polaris/job/job_script.pbs.template +++ b/polaris/job/job_script.pbs.template @@ -3,7 +3,7 @@ {%- if account != '' %} #PBS -A {{ account }} {%- endif %} -#PBS -l select={{ nodes }}{% if gpus_per_node != '' %}:ngpus={{ gpus_per_node }}{% endif %} +#PBS -l select={{ nodes }} #PBS -l walltime={{ wall_time }} #PBS -V {%- if queue != '' %} From 89389133954510ac3644698512264cd7b75268fa Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Thu, 26 Mar 2026 12:30:32 +0100 Subject: [PATCH 5/7] Fix libnetcdf, libpnetcdf, etc. with --no-spack --- deploy/hooks.py | 12 +++++++----- deploy/load.sh | 41 +++++++++++++++++++++++++++++++++++------ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/deploy/hooks.py b/deploy/hooks.py index ae0a942154..24aaaa4087 100644 --- a/deploy/hooks.py +++ b/deploy/hooks.py @@ -42,7 +42,7 @@ def pre_pixi(ctx: DeployContext) -> dict[str, Any] | None: """ polaris_version = _get_version() - mpi = _get_pixi_mpi(ctx.machine, ctx.machine_config) + mpi = _get_pixi_mpi(ctx.machine, ctx.machine_config, ctx.args) updates: Dict[str, Any] = { 'project': {'version': polaris_version}, @@ -102,15 +102,17 @@ def _get_version(): return polaris_version -def _get_pixi_mpi(machine, machine_config): +def _get_pixi_mpi(machine, machine_config, args): """ Get the MPI implementation for pixi from environment variable """ - if machine is not None: - # we will use system compilers and mpi, not pixi mpi + if machine is not None and not getattr(args, 'no_spack', False): + # On supported machines with spack enabled, we use the system MPI + # through spack rather than installing an MPI stack in pixi. mpi = 'nompi' else: - # we will have the default-.cfg config options + # For unknown machines, and for explicit --no-spack deployments on + # known machines, pixi must provide the MPI-aware dependency stack. if not machine_config.has_section('deploy'): raise ValueError("Missing 'deploy' section in machine config") section = machine_config['deploy'] diff --git a/deploy/load.sh b/deploy/load.sh index b2264cd811..7092882690 100644 --- a/deploy/load.sh +++ b/deploy/load.sh @@ -1,5 +1,26 @@ # bash snippet for adding Polaris-specific environment variables +_polaris_detect_prefix() { + local helper="$1" + + if command -v "${helper}" >/dev/null 2>&1; then + dirname "$(dirname "$(command -v "${helper}")")" + return 0 + fi + + if [ -n "${CONDA_PREFIX:-}" ] && [ -d "${CONDA_PREFIX}" ]; then + printf '%s\n' "${CONDA_PREFIX}" + return 0 + fi + + return 1 +} + +_polaris_stack_root="${MACHE_DEPLOY_SPACK_LIBRARY_VIEW}" +if [ -z "${_polaris_stack_root}" ] && [ -n "${CONDA_PREFIX:-}" ]; then + _polaris_stack_root="${CONDA_PREFIX}" +fi + # we need a special approach for cray machines ($POLARIS_MACHINE), notably # pm-cpu and pm-gpu if [ "$POLARIS_MACHINE" = "pm-cpu" ] || [ "$POLARIS_MACHINE" = "pm-gpu" ]; then @@ -7,14 +28,22 @@ if [ "$POLARIS_MACHINE" = "pm-cpu" ] || [ "$POLARIS_MACHINE" = "pm-gpu" ]; then export NETCDFF=${CRAY_NETCDF_HDF5PARALLEL_PREFIX} export PNETCDF=${CRAY_PARALLEL_NETCDF_PREFIX} else - export NETCDF=$(dirname $(dirname $(which nc-config))) - export NETCDFF=$(dirname $(dirname $(which nf-config))) - export PNETCDF=$(dirname $(dirname $(which pnetcdf-config))) + if _polaris_detect_prefix nc-config >/dev/null 2>&1; then + export NETCDF=$(_polaris_detect_prefix nc-config) + fi + if _polaris_detect_prefix nf-config >/dev/null 2>&1; then + export NETCDFF=$(_polaris_detect_prefix nf-config) + fi + if command -v pnetcdf-config >/dev/null 2>&1; then + export PNETCDF=$(dirname "$(dirname "$(command -v pnetcdf-config)")") + elif [ -n "${CONDA_PREFIX:-}" ] && ls "${CONDA_PREFIX}"/lib/libpnetcdf* >/dev/null 2>&1; then + export PNETCDF="${CONDA_PREFIX}" + fi fi -export PIO=${MACHE_DEPLOY_SPACK_LIBRARY_VIEW} -export METIS_ROOT=${MACHE_DEPLOY_SPACK_LIBRARY_VIEW} -export PARMETIS_ROOT=${MACHE_DEPLOY_SPACK_LIBRARY_VIEW} +export PIO=${_polaris_stack_root} +export METIS_ROOT=${_polaris_stack_root} +export PARMETIS_ROOT=${_polaris_stack_root} export USE_PIO2=true export OPENMP=true From 986a6714dfeb6020900b4d7bfdd357ca234e2085 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 27 Mar 2026 14:50:23 +0000 Subject: [PATCH 6/7] Fix setting GPUs per task for fixed-resource steps For tests like decomposition and restart, we still need to set `gpus_per_task` and `min_gpus_per_task` depending on whether GPUs are available or not. --- polaris/ocean/model/ocean_model_step.py | 66 ++++++++++++++----------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/polaris/ocean/model/ocean_model_step.py b/polaris/ocean/model/ocean_model_step.py index f7d5ee7330..fac215dbe9 100644 --- a/polaris/ocean/model/ocean_model_step.py +++ b/polaris/ocean/model/ocean_model_step.py @@ -200,6 +200,7 @@ def setup(self) -> None: if self.dynamic_ntasks: self._update_ntasks() + self._set_gpus_per_task() super().setup() @@ -227,6 +228,7 @@ def constrain_resources(self, available_cores: Dict[str, Any]) -> None: """ if self.dynamic_ntasks: self._update_ntasks() + self._set_gpus_per_task() super().constrain_resources(available_cores) def compute_cell_count(self) -> Optional[int]: @@ -520,22 +522,46 @@ def _update_ntasks(self) -> None: 'been overridden.' ) - goal_cells_per_core = config.getfloat('ocean', 'goal_cells_per_core') - max_cells_per_core = config.getfloat('ocean', 'max_cells_per_core') - model = config.get('ocean', 'model') - - goal_cells_per_gpu = config.getfloat('ocean', 'goal_cells_per_gpu') - max_cells_per_gpu = config.getfloat('ocean', 'max_cells_per_gpu') - + if self._use_gpu_resources(): + goal_cells_per_core = config.getfloat( + 'ocean', 'goal_cells_per_gpu' + ) + max_cells_per_core = config.getfloat('ocean', 'max_cells_per_gpu') + else: + goal_cells_per_core = config.getfloat( + 'ocean', 'goal_cells_per_core' + ) + max_cells_per_core = config.getfloat('ocean', 'max_cells_per_core') # machines (e.g. Perlmutter) seem to be happier with ntasks that # are multiples of 4 - # ideally, about 200 cells per core - cpu_ntasks = max(1, 4 * round(cell_count / (4 * goal_cells_per_core))) - # In a pinch, about 2000 cells per core - cpu_min_tasks = max( + # ideally, about 200 cells per cpu or 8000 cells per gpu + self.ntasks = max(1, 4 * round(cell_count / (4 * goal_cells_per_core))) + # In a pinch, about 2000 cells per cpu or 80000 cells per gpu + self.min_tasks = max( 1, 4 * round(cell_count / (4 * max_cells_per_core)) ) + def _set_gpus_per_task(self) -> None: + """ + Set ``gpus_per_task`` and ``min_gpus_per_task`` for the step based + on whether gpus are available and the model is Omega + """ + if self._use_gpu_resources(): + self.gpus_per_task = 1 + self.min_gpus_per_task = 1 + else: + self.gpus_per_task = 0 + self.min_gpus_per_task = 0 + + def _use_gpu_resources(self) -> bool: + """ + Whether to use GPU resources based on whether gpus are available and + the model is Omega + """ + config = self.config + + model = config.get('ocean', 'model') + gpus_per_node = 0 parallel_system = self.component.parallel_system if parallel_system is not None: @@ -543,23 +569,7 @@ def _update_ntasks(self) -> None: 'gpus_per_node', default=0 ) - use_gpu_resources = model == 'omega' and gpus_per_node > 0 - if use_gpu_resources: - self.gpus_per_task = 1 - self.min_gpus_per_task = 1 - # Ideally, about 8000 cells per GPU - self.ntasks = max( - 1, 4 * round(cell_count / (4 * goal_cells_per_gpu)) - ) - # In a pinch, about 80000 cells per GPU - self.min_tasks = max( - 1, 4 * round(cell_count / (4 * max_cells_per_gpu)) - ) - else: - self.gpus_per_task = 0 - self.min_gpus_per_task = 0 - self.ntasks = cpu_ntasks - self.min_tasks = cpu_min_tasks + return model == 'omega' and gpus_per_node > 0 def _read_config_map(self) -> None: """ From 4a0c220a68affdc783de8531f42c0b75a05fad96 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Fri, 27 Mar 2026 22:18:27 +0100 Subject: [PATCH 7/7] Update Omega submodule. This brings in https://github.com/E3SM-Project/Omega/pull/363. --- e3sm_submodules/Omega | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e3sm_submodules/Omega b/e3sm_submodules/Omega index 56de8ec3d6..c409e9d53d 160000 --- a/e3sm_submodules/Omega +++ b/e3sm_submodules/Omega @@ -1 +1 @@ -Subproject commit 56de8ec3d68b837f87a18c15f1417518131e72e0 +Subproject commit c409e9d53dc116c7baefcf1bfcb740a5149ae786