From 7d3ed578f01bfab900595bfda25aa07d6a2947ee Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 17 Sep 2025 19:38:23 -0400 Subject: [PATCH 01/37] Bump version to 3.15.0 (#3025) Co-authored-by: hgreebe <141743196+hgreebe@users.noreply.github.com> --- .../aws-parallelcluster-awsbatch/metadata.rb | 4 ++-- .../aws-parallelcluster-computefleet/metadata.rb | 4 ++-- .../aws-parallelcluster-entrypoints/metadata.rb | 14 +++++++------- .../aws-parallelcluster-environment/metadata.rb | 4 ++-- .../aws-parallelcluster-platform/metadata.rb | 4 ++-- .../attributes/versions.rb | 8 ++++---- cookbooks/aws-parallelcluster-shared/metadata.rb | 2 +- cookbooks/aws-parallelcluster-slurm/metadata.rb | 10 +++++----- cookbooks/aws-parallelcluster-tests/metadata.rb | 12 ++++++------ kitchen.ec2.yml | 2 +- metadata.rb | 16 ++++++++-------- 11 files changed, 40 insertions(+), 40 deletions(-) diff --git a/cookbooks/aws-parallelcluster-awsbatch/metadata.rb b/cookbooks/aws-parallelcluster-awsbatch/metadata.rb index cee6eaa547..068e22f189 100644 --- a/cookbooks/aws-parallelcluster-awsbatch/metadata.rb +++ b/cookbooks/aws-parallelcluster-awsbatch/metadata.rb @@ -7,7 +7,7 @@ issues_url 'https://github.com/aws/aws-parallelcluster/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '>= 18' -version '3.14.0' +version '3.15.0' depends 'iptables', '~> 8.0.0' depends 'nfs', '~> 5.1.5' @@ -15,4 +15,4 @@ depends 'openssh', '~> 2.11.14' depends 'yum', '~> 7.4.20' depends 'yum-epel', '~> 5.0.8' -depends 'aws-parallelcluster-shared', '~> 3.14.0' +depends 'aws-parallelcluster-shared', '~> 3.15.0' diff --git a/cookbooks/aws-parallelcluster-computefleet/metadata.rb b/cookbooks/aws-parallelcluster-computefleet/metadata.rb index 1ffed36715..b8a4abedab 100644 --- a/cookbooks/aws-parallelcluster-computefleet/metadata.rb +++ b/cookbooks/aws-parallelcluster-computefleet/metadata.rb @@ -7,6 +7,6 @@ issues_url 
'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '>= 18' -version '3.14.0' +version '3.15.0' -depends 'aws-parallelcluster-shared', '~> 3.14.0' +depends 'aws-parallelcluster-shared', '~> 3.15.0' diff --git a/cookbooks/aws-parallelcluster-entrypoints/metadata.rb b/cookbooks/aws-parallelcluster-entrypoints/metadata.rb index 6c5ae62689..3bb7ed80dc 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/metadata.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/metadata.rb @@ -7,11 +7,11 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '>= 18' -version '3.14.0' +version '3.15.0' -depends 'aws-parallelcluster-shared', '~> 3.14.0' -depends 'aws-parallelcluster-platform', '~> 3.14.0' -depends 'aws-parallelcluster-environment', '~> 3.14.0' -depends 'aws-parallelcluster-computefleet', '~> 3.14.0' -depends 'aws-parallelcluster-slurm', '~> 3.14.0' -depends 'aws-parallelcluster-awsbatch', '~> 3.14.0' +depends 'aws-parallelcluster-shared', '~> 3.15.0' +depends 'aws-parallelcluster-platform', '~> 3.15.0' +depends 'aws-parallelcluster-environment', '~> 3.15.0' +depends 'aws-parallelcluster-computefleet', '~> 3.15.0' +depends 'aws-parallelcluster-slurm', '~> 3.15.0' +depends 'aws-parallelcluster-awsbatch', '~> 3.15.0' diff --git a/cookbooks/aws-parallelcluster-environment/metadata.rb b/cookbooks/aws-parallelcluster-environment/metadata.rb index f6ebe4e5ca..c25c784125 100644 --- a/cookbooks/aws-parallelcluster-environment/metadata.rb +++ b/cookbooks/aws-parallelcluster-environment/metadata.rb @@ -7,9 +7,9 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '>= 18' -version '3.14.0' +version '3.15.0' depends 'line', '~> 4.5.21' depends 'nfs', '~> 5.1.5' -depends 'aws-parallelcluster-shared', 
'~> 3.14.0' +depends 'aws-parallelcluster-shared', '~> 3.15.0' diff --git a/cookbooks/aws-parallelcluster-platform/metadata.rb b/cookbooks/aws-parallelcluster-platform/metadata.rb index 5aa243b87d..e43929a1a5 100644 --- a/cookbooks/aws-parallelcluster-platform/metadata.rb +++ b/cookbooks/aws-parallelcluster-platform/metadata.rb @@ -7,8 +7,8 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '>= 18' -version '3.14.0' +version '3.15.0' depends 'line', '~> 4.5.21' -depends 'aws-parallelcluster-shared', '~> 3.14.0' +depends 'aws-parallelcluster-shared', '~> 3.15.0' diff --git a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb index 12e97db6fe..540cb4bf3d 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb @@ -7,7 +7,7 @@ end # ParallelCluster versions -default['cluster']['parallelcluster-version'] = '3.14.0' -default['cluster']['parallelcluster-cookbook-version'] = '3.14.0' -default['cluster']['parallelcluster-node-version'] = '3.14.0' -default['cluster']['parallelcluster-awsbatch-cli-version'] = '1.4.0' +default['cluster']['parallelcluster-version'] = '3.15.0' +default['cluster']['parallelcluster-cookbook-version'] = '3.15.0' +default['cluster']['parallelcluster-node-version'] = '3.15.0' +default['cluster']['parallelcluster-awsbatch-cli-version'] = '1.5.0' diff --git a/cookbooks/aws-parallelcluster-shared/metadata.rb b/cookbooks/aws-parallelcluster-shared/metadata.rb index c691f82ef7..7a39a58847 100644 --- a/cookbooks/aws-parallelcluster-shared/metadata.rb +++ b/cookbooks/aws-parallelcluster-shared/metadata.rb @@ -7,7 +7,7 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '>= 18' -version '3.14.0' +version 
'3.15.0' depends 'yum', '~> 7.4.20' depends 'yum-epel', '~> 5.0.8' diff --git a/cookbooks/aws-parallelcluster-slurm/metadata.rb b/cookbooks/aws-parallelcluster-slurm/metadata.rb index 0bfa27ec73..8e4003650f 100644 --- a/cookbooks/aws-parallelcluster-slurm/metadata.rb +++ b/cookbooks/aws-parallelcluster-slurm/metadata.rb @@ -7,7 +7,7 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '>= 18' -version '3.14.0' +version '3.15.0' depends 'iptables', '~> 8.0.0' depends 'line', '~> 4.5.21' @@ -15,7 +15,7 @@ depends 'openssh', '~> 2.11.14' depends 'yum', '~> 7.4.20' depends 'yum-epel', '~> 5.0.8' -depends 'aws-parallelcluster-computefleet', '~> 3.14.0' -depends 'aws-parallelcluster-environment', '~> 3.14.0' -depends 'aws-parallelcluster-shared', '~> 3.14.0' -depends 'aws-parallelcluster-platform', '~> 3.14.0' +depends 'aws-parallelcluster-computefleet', '~> 3.15.0' +depends 'aws-parallelcluster-environment', '~> 3.15.0' +depends 'aws-parallelcluster-shared', '~> 3.15.0' +depends 'aws-parallelcluster-platform', '~> 3.15.0' diff --git a/cookbooks/aws-parallelcluster-tests/metadata.rb b/cookbooks/aws-parallelcluster-tests/metadata.rb index 39ca0b298f..614e9cd12e 100644 --- a/cookbooks/aws-parallelcluster-tests/metadata.rb +++ b/cookbooks/aws-parallelcluster-tests/metadata.rb @@ -7,10 +7,10 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '>= 18' -version '3.14.0' +version '3.15.0' -depends 'aws-parallelcluster-shared', '~> 3.14.0' -depends 'aws-parallelcluster-platform', '~> 3.14.0' -depends 'aws-parallelcluster-environment', '~> 3.14.0' -depends 'aws-parallelcluster-computefleet', '~> 3.14.0' -depends 'aws-parallelcluster-slurm', '~> 3.14.0' +depends 'aws-parallelcluster-shared', '~> 3.15.0' +depends 'aws-parallelcluster-platform', '~> 3.15.0' +depends 
'aws-parallelcluster-environment', '~> 3.15.0' +depends 'aws-parallelcluster-computefleet', '~> 3.15.0' +depends 'aws-parallelcluster-slurm', '~> 3.15.0' diff --git a/kitchen.ec2.yml b/kitchen.ec2.yml index 8cf39fb0dc..df36791684 100644 --- a/kitchen.ec2.yml +++ b/kitchen.ec2.yml @@ -1,5 +1,5 @@ <% - pcluster_version = ENV['KITCHEN_PCLUSTER_VERSION'] || '3.14.0' + pcluster_version = ENV['KITCHEN_PCLUSTER_VERSION'] || '3.15.0' pcluster_prefix = "aws-parallelcluster-#{pcluster_version}" %> --- diff --git a/metadata.rb b/metadata.rb index f9ce620fb5..72e48d4094 100644 --- a/metadata.rb +++ b/metadata.rb @@ -7,7 +7,7 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '>= 18' -version '3.14.0' +version '3.15.0' depends 'iptables', '~> 8.0.0' depends 'line', '~> 4.5.21' @@ -15,10 +15,10 @@ depends 'openssh', '~> 2.11.14' depends 'yum', '~> 7.4.20' depends 'yum-epel', '~> 5.0.8' -depends 'aws-parallelcluster-slurm', '~> 3.14.0' -depends 'aws-parallelcluster-awsbatch', '~> 3.14.0' -depends 'aws-parallelcluster-platform', '~> 3.14.0' -depends 'aws-parallelcluster-environment', '~> 3.14.0' -depends 'aws-parallelcluster-computefleet', '~> 3.14.0' -depends 'aws-parallelcluster-shared', '~> 3.14.0' -depends 'aws-parallelcluster-entrypoints', '~> 3.14.0' +depends 'aws-parallelcluster-slurm', '~> 3.15.0' +depends 'aws-parallelcluster-awsbatch', '~> 3.15.0' +depends 'aws-parallelcluster-platform', '~> 3.15.0' +depends 'aws-parallelcluster-environment', '~> 3.15.0' +depends 'aws-parallelcluster-computefleet', '~> 3.15.0' +depends 'aws-parallelcluster-shared', '~> 3.15.0' +depends 'aws-parallelcluster-entrypoints', '~> 3.15.0' From 84fe9691fa0313320a99cb755dec2574c47f839b Mon Sep 17 00:00:00 2001 From: hgreebe <141743196+hgreebe@users.noreply.github.com> Date: Fri, 19 Sep 2025 14:48:07 -0400 Subject: [PATCH 02/37] Update changelog to be inline with release notes (#3028) --- 
CHANGELOG.md | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d844b9d0f..12231c2bdf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,18 +7,23 @@ This file is used to list changes made in each version of the AWS ParallelCluste ------ **ENHANCEMENTS** -- Add support for P6e-GB200 instances. ParallelCluster sets up Slurm topology plugin to handle P6e-GB200 UltraServers. See limitations section for important additional setup requirements. -- Add support for P6-B200 instances for all OSs except AL2. +- Include drivers for P6e-GB200 and P6-B200 instances. ParallelCluster sets up Slurm topology plugin to handle P6e-GB200 UltraServers. See limitations section for important additional setup requirements. +- Support `prioritized` and `capacity-optimized-prioritized` Allocation Strategy. This allows users to prioritize subnets for instance placement to optimize costs and performance. - Add `build-image` support for Amazon Linux 2023 AMIs based on kernel 6.12 (in addition to 6.1). +- Support DCV on Amazon Linux 2023. +- Echo chef-client logs in the instance console when a node fails to bootstrap. This helps with investigating bootstrap failures in cases CloudWatch logs are not available. **LIMITATIONS** - P6e-GB200 instances are only tested on Amazon Linux 2023, Ubuntu 22.04 and Ubuntu 24.04. -- Using IMEX on P6e-GB200 requires additional setup. Please refer to . +- Using IMEX on P6e-GB200 requires additional setup. Please refer to the dedicated tutorial in our public documentation. +- P6-B200 instances are only tested on Amazon Linux 2023, RHEL9, Ubuntu 22.04 and Ubuntu 24.04. **CHANGES** -- Install nvidia-imex for all OSs except AL2. -- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management. +- Install nvidia-imex for all OSs except Amazon Linux 2. - Remove `UnkillableStepTimeout` from slurm.conf and let slurm set this value. 
+- Upgrade Python runtime used by Lambda functions to Python 3.12 (from 3.9). See Lambda Documentation for important information about Python 3.9 EOL: https://docs.aws.amazon.com/lambda/latest/dg/lambda-runtimes.html +- Support encryption of EFS file system used for the head node internal shared storage via a new configuration parameter `HeadNode/SharedStorageEfsSettings/Encrypted` +- Add validator that warns against using non GPU instances with DCV. - Upgrade Slurm to version 24.11.6 (from 24.05.8). - Upgrade EFA installer to 1.43.2 (from 1.41.0). - Efa-driver: efa-2.17.2-1 @@ -28,20 +33,26 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Rdma-core: rdma-core-58.0-1 - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6-11 - Upgrade Cinc Client to version 18.4.12 (from 18.2.7). -- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2. -- Upgrade CUDA Toolkit to version 12.8.1 (from 12.8.0) for all OSs except AL2. -- Upgrade DCGM to version 4.4.1 (from 3.3.6) for all OSs except AL2. -- Upgrade Python to 3.12.11 (from 3.12.8) for all OSs except AL2. -- Upgrade Python to 3.9.23 (from 3.9.20) for AL2. +- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except Amazon Linux 2. +- Upgrade CUDA Toolkit to version 12.8.1 (from 12.8.0) for all OSs except Amazon Linux 2. +- Upgrade DCGM to version 4.4.1 (from 3.3.6) for all OSs except Amazon Linux 2. +- Upgrade Python to 3.12.11 (from 3.12.8) for all OSs except Amazon Linux 2. +- Upgrade Python to 3.9.23 (from 3.9.20) for Amazon Linux 2. - Upgrade Intel MPI Library to 2021.16.0 (from 2021.13.1). - Upgrade DCV to version 2024.0-19030. - Upgrade the official ParallelCluster Amazon Linux 2023 AMIs to kernel 6.12 (from 6.1). **BUG FIXES** -- Fix a race condition in CloudWatch Agent startup that could cause nodes bootstrap failures. 
-- Fix cluster id mismatch issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting. +- Prevent `build-image` stack deletion failures by deploying a global role that automatically deletes the `build-image` stack after images either succeed or fail the build. + The role is meant to exist even after the stack has been deleted. See https://github.com/aws/aws-parallelcluster/issues/5914. +- Fix an issue where Security Group validation failed when a rule contained both IPv4 ranges (IpRanges) and security group references (UserIdGroupPairs). +- Fix `build-image` failure on Rocky 9, occurring when the parent image does not ship the latest kernel version on the latest Rocky minor version. +- Fix cluster id mismatch issue which causes cluster update failures when slurm accounting is used. +- Fix a race condition in CloudWatch Agent startup that could cause node bootstrap failures. **DEPRECATIONS** +- The configuration parameter `LoginNodes/Pools/Ssh/KeyName` has been deprecated, and it will be removed in future releases. The CLI now returns a warning message when it is used in the cluster configuration. + See https://github.com/aws/aws-parallelcluster/issues/6811. - Ubuntu 20.04 is no longer supported. 3.13.2 From 3850535bdbf42a1fd257bd2b5e0b29aa47b5f081 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Fri, 19 Sep 2025 08:36:36 -0700 Subject: [PATCH 03/37] Instruct NetPlan to use systemd-networkd systemd-networkd is used by default with Ubuntu Server. Installing ubuntu-desktop (as part of DCV installation) installs NetworkManager. NetworkManager is more complex (with WiFi capabilities) and causes confusion to systemd-networkd. When systemd-networkd is confused, it delays the boot by 2 minutes. This commit instructs NetPlan to use systemd-networkd to manage network interfaces. The code is added at the end of DCV installation because the mitigation is strictly related to the installation of ubuntu-desktop. 
Always using systemd-networkd also improves consistency between how ParallelCluster handles single-nic instances vs multi-nic instances. With multi-nic instances ParallelCluster has been instructing netplan to use systemd-networkd ([code](https://github.com/aws/aws-parallelcluster-cookbook/blob/develop/cookbooks/aws-parallelcluster-environment/files/ubuntu/network_interfaces/configure_nw_interface.sh#L62)) # Technical details: ## Output of `networkctl list` ### Prior to this commit Base Ubuntu: ``` IDX LINK TYPE OPERATIONAL SETUP 1 lo loopback carrier unmanaged 2 ens5 ether routable configured 2 links listed. ``` Ubuntu with ubuntu-desktop ``` IDX LINK TYPE OPERATIONAL SETUP 1 lo loopback carrier unmanaged 2 ens5 ether routable unmanaged 2 links listed. ``` systemd-networkd got confused because it saw no network interface was setup (because NetworkManager took over control of all network interfaces) and waited until 2 minutes timeout at the beginning of system boot: ``` $ journalctl -b | grep -i "ipv6\|timeout\|waiting" ... Sep 18 14:51:23 systemd-networkd-wait-online[1602]: Timeout occurred while waiting for network connectivity. ... Sep 18 14:53:31 systemd-networkd-wait-online[1891]: Timeout occurred while waiting for network connectivity. ... 
``` ### After this commit Ubuntu with ubuntu-desktop has the same output as Base Ubuntu and the delay is gone Signed-off-by: Hanwen --- .../resources/dcv/partial/_ubuntu_common.rb | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb index 818b83c16c..2eb8a40cce 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb @@ -81,4 +81,20 @@ def optionally_disable_rnd command "sed --in-place '/RANDFILE/d' /etc/ssl/openssl.cnf" end end + + def post_install + # ubuntu-desktop comes with NetworkManager. On a cloud instance NetworkManager is unnecessary and causes delay. + # Instruct Netplan to use networkd for better performance + bash 'Instruct Netplan to use networkd' do + code <<-NETPLAN + set -e + cat > /etc/netplan/95-parallelcluster-force-networkd.yaml << 'EOF' +network: + version: 2 + renderer: networkd +EOF + netplan apply + NETPLAN + end unless on_docker? + end end From 8788fff9e350d29008f56721f068b49147adf18a Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:10:07 -0400 Subject: [PATCH 04/37] [Gb200] Support IMEX configuration to be local to a node (#3029) * we remove /opt/parallelcluster/shared/nvidia-imex directory creation * We keep default path of `/etc/nvidia-imex/nodes_config.cfg` and `/etc/nvidia-imex/config.cfg` for IMEX configuration * We override `/etc/nvidia-imex/nodes_config.cfg` only if it is missing to avoid Imex start failures. 
* Update unit test Co-authored-by: Himani Anil Deshpande --- .../attributes/platform.rb | 1 - .../partial/_nvidia_imex_common.rb | 60 ++++++++------ .../spec/unit/resources/nvidia_imex_spec.rb | 81 +++++++++++-------- .../test/controls/nvidia_imex_spec.rb | 2 +- 4 files changed, 83 insertions(+), 61 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 6566160ec4..973f74fb74 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -24,7 +24,6 @@ end # nvidia-imex -default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex" default['cluster']['nvidia']['imex']['force_configuration'] = false # NVIDIA NVLSM diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index fc126e43e9..f791eb5f1d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -19,21 +19,51 @@ return unless nvidia_enabled_or_installed? return if on_docker? || imex_installed? 
|| aws_region.start_with?("us-iso") - directory node['cluster']['nvidia']['imex']['shared_dir'] - action_install_imex + + # Create Imex configuration files + action_create_configuration_files # Save Imex version in Node Attributes for InSpec Tests node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package node_attributes 'dump node attributes' end +action :create_configuration_files do + # We create or update IMEX configuration files if ParallelCluster is installing IMEX + template nvidia_imex_nodes_conf_file do + source 'nvidia-imex/nvidia-imex-nodes.erb' + owner 'root' + group 'root' + mode '0755' + action :create + end + + template nvidia_imex_main_conf_file do + source 'nvidia-imex/nvidia-imex-config.erb' + owner 'root' + group 'root' + mode '0755' + action :create + variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) + end + + # We keep nvidia-imex.service file in this location to give precedence to pcluster configured service file. + template "/etc/systemd/system/#{nvidia_imex_service}.service" do + source 'nvidia-imex/nvidia-imex.service.erb' + owner 'root' + group 'root' + mode '0644' + action :create + variables(imex_main_config_file_path: nvidia_imex_main_conf_file) + end +end + action :configure do return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet" # Start nvidia-imex on p6e-gb200 and only on ComputeFleet if is_gb200_node? || enable_force_configuration? - # For each Compute Resource, we generate a unique NVIDIA IMEX configuration file, - # if one doesn't already exist in a common, shared location. 
+ # Create the file if this is missing otherwise Imex service will not start template nvidia_imex_nodes_conf_file do source 'nvidia-imex/nvidia-imex-nodes.erb' owner 'root' @@ -42,24 +72,6 @@ action :create_if_missing end - template nvidia_imex_main_conf_file do - source 'nvidia-imex/nvidia-imex-config.erb' - owner 'root' - group 'root' - mode '0755' - action :create_if_missing - variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) - end - - template "/etc/systemd/system/#{nvidia_imex_service}.service" do - source 'nvidia-imex/nvidia-imex.service.erb' - owner 'root' - group 'root' - mode '0644' - action :create - variables(imex_main_config_file_path: nvidia_imex_main_conf_file) - end - service nvidia_imex_service do action %i(enable start) supports status: true @@ -92,11 +104,11 @@ def nvidia_enabled_or_installed? end def nvidia_imex_main_conf_file - "#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg" + "/etc/nvidia-imex/config.cfg" end def nvidia_imex_nodes_conf_file - "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg" + "/etc/nvidia-imex/nodes_config.cfg" end def enable_force_configuration? 
diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 0985bffdbf..44da506152 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -2,11 +2,12 @@ nvidia_version = "1.2.3" SOURCE_DIR = 'SOURCE_DIR'.freeze -nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex" +nvidia_imex_dir = "/etc/nvidia-imex" +imex_main_conf_file = "#{nvidia_imex_dir}/config.cfg" +imex_nodes_conf_file = "#{nvidia_imex_dir}/nodes_config.cfg" +imex_service_file = "/etc/systemd/system/nvidia-imex.service" imex_binary = '/usr/bin/nvidia-imex' imex_ctl_binary = '/usr/bin/nvidia-imex-ctl' -queue_name = 'queue-name' -compute_resource_name = 'compute-resource-name' cluster_artifacts_s3_url = 'https://aws_region-aws-parallelcluster.s3.aws_region.AWS_DOMAIN' class ConvergeNvidiaImex @@ -18,6 +19,14 @@ def self.install(chef_run) end end + def self.create_configuration_files(chef_run) + chef_run.converge_dsl('aws-parallelcluster-platform') do + nvidia_imex 'create_configuration_files' do + action :create_configuration_files + end + end + end + def self.configure(chef_run) chef_run.converge_dsl('aws-parallelcluster-platform') do nvidia_imex 'configure' do @@ -231,7 +240,6 @@ def self.configure(chef_run) cached(:node) { chef_run.node } before do - chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir chef_run.node.override['cluster']['artifacts_s3_url'] = cluster_artifacts_s3_url chef_run.node.override['cluster']['region'] = 'aws_region' chef_run.node.override['cluster']['sources_dir'] = SOURCE_DIR @@ -241,7 +249,6 @@ def self.configure(chef_run) end if platform == 'amazon' && version == '2' it 'does not install nvidia-imex' do - is_expected.not_to create_directory(nvidia_imex_shared_dir) is_expected.not_to 
install_install_packages('Install nvidia-imex') .with(packages: "#{nvidia_imex_name}") .with(action: %i(install)) @@ -254,7 +261,6 @@ def self.configure(chef_run) else it 'installs nvidia-imex' do - is_expected.to create_directory(nvidia_imex_shared_dir) if platform == 'ubuntu' is_expected.to create_if_missing_remote_file("#{SOURCE_DIR}/#{nvidia_imex_package}-#{nvidia_imex_version}.deb").with( source: "#{cluster_artifacts_s3_url}/dependencies/nvidia_imex/#{url_suffix}.deb", @@ -294,6 +300,38 @@ def self.configure(chef_run) end end +describe 'nvidia_imex:create_configuration_files' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:chef_run) do + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.create_configuration_files(runner) + end + cached(:node) { chef_run.node } + + it 'does create Imex configuration files' do + is_expected.to create_template("#{imex_nodes_conf_file}") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.to create_template("#{imex_main_conf_file}") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + .with(variables: { imex_nodes_config_file_path: "#{imex_nodes_conf_file}" }) + is_expected.to create_template(imex_service_file) + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + .with(variables: { imex_main_config_file_path: "#{imex_main_conf_file}" }) + end + end + end +end + describe 'nvidia_imex:configure' do [%w(false), [false], %w(no), %w(true), [true], %w(yes)].each do |force_indicator| for_all_oses do |platform, version| @@ -329,54 +367,27 @@ def self.configure(chef_run) before do chef_run.node.override['cluster']['region'] = 'aws_region' chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = 
force_indicator - chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir chef_run.node.override['cluster']['node_type'] = node_type - chef_run.node.override['cluster']['scheduler_queue_name'] = queue_name - chef_run.node.override['cluster']['scheduler_compute_resource_name'] = compute_resource_name ConvergeNvidiaImex.configure(chef_run) end if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type) it 'does not configure nvidia-imex' do - is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg") + is_expected.not_to create_if_missing_template("#{imex_nodes_conf_file}") .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') .with(user: 'root') .with(group: 'root') .with(mode: '0755') - is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg") - .with(source: 'nvidia-imex/nvidia-imex-config.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" }) - is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0644') - .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" }) is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) end else it 'it starts nvidia-imex service' do - is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg") + is_expected.to create_if_missing_template("#{imex_nodes_conf_file}") .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') .with(user: 'root') .with(group: 'root') 
.with(mode: '0755') - is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg") - .with(source: 'nvidia-imex/nvidia-imex-config.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" }) - is_expected.to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0644') - .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" }) is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) end end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb index b3524db81d..36a1c714a3 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb @@ -36,7 +36,7 @@ its('owner') { should eq 'root' } its('group') { should eq 'root' } its('mode') { should cmp '0644' } - its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c #{node['cluster']['nvidia']['imex']['shared_dir']}} } + its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c /etc/nvidia-imex/config.cfg} } end describe service('nvidia-imex') do From d52a3fd66bbfd407bf9572e534f769b4c512b5b9 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 17 Apr 2025 11:59:52 -0400 Subject: [PATCH 05/37] [Isolated] Install Pypi dependencies for boto3 and cfn-bootstrap scripts --- .../install/custom_parallelcluster_node.rb | 21 ++++++++++++++++++ .../recipes/install/cfn_bootstrap.rb | 22 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git 
a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb index c2bdf0930f..247823c173 100644 --- a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb @@ -19,6 +19,27 @@ # TODO: once the pyenv Chef resource supports installing packages from a path (e.g. `pip install .`), convert the # bash block to a recipe that uses the pyenv resource. +if aws_region.start_with?("us-iso") && platform?('amazon') && node['platform_version'] == "2" + remote_file "#{node['cluster']['base_dir']}/node-dependencies.tgz" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end + + bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ + set -e + tar xzf node-dependencies.tgz + cd node + #{node_virtualenv_path}/bin/pip install * -f ./ --no-index + REQ + end +end bash "install custom aws-parallelcluster-node" do cwd Chef::Config[:file_cache_path] diff --git a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb index 3985418b74..cfe95c4ed5 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb @@ -33,6 +33,28 @@ not_if { ::File.exist?("#{virtualenv_path}/bin/activate") } end +if aws_region.start_with?("us-iso") + remote_file "#{node['cluster']['base_dir']}/cfn-dependencies.tgz" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/cfn-dependencies.tgz" + mode '0644' + retries 3 + retry_delay 5 + 
action :create_if_missing + end + + bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ + set -e + tar xzf cfn-dependencies.tgz + cd cfn + #{virtualenv_path}/bin/pip install * -f ./ --no-index + REQ + end +end + cfnbootstrap_version = '2.0-33' cfnbootstrap_package = "aws-cfn-bootstrap-py3-#{cfnbootstrap_version}.tar.gz" From c5515b0d38fffbd6befe82620a3f080fa07bf800 Mon Sep 17 00:00:00 2001 From: hgreebe <141743196+hgreebe@users.noreply.github.com> Date: Thu, 21 Aug 2025 08:14:45 -0400 Subject: [PATCH 06/37] [Isolated] Update Pypi dependencies and install efs-proxy dependency (#3011) * [Isolated] Install cfn-dependencies only for AL2 * Revert "[Isolated] Install cfn-dependencies only for AL2" This reverts commit 976d4799313634b3274be1da55ff768f1bd3503f. * [Isolated] USe latest cfn-dependencies * [Isolated] Using Git REF for Uploading cookbook * [Isolated] Rename the cfn-dependecies files * [Isolated] Chnage the name of Cookbook Dependencies and the folder name inside the Tar * [Isolated] Chnage the name of CFN Dependencies and the folder name inside the Tar * [Isolated] Installing Cfn-bootstrap using `--no-build-isolation` as 3.12.8 uses setup.py based installation where it uses a isolated build instead of looking at existing site-packages * [Isolated] Install efs-proxy cargo dependecies for isolated environment * [Isolated] Install new node pypi dependencies and move efs-proxy installation * [Isolated] Only install efs-proxy deps when in adc regions * [Isolated] Only install efs-proxy-deps in adc * [Isolated] Fix unit tests * [Isolated] Test python pacakges are installed when in an ADC region --------- Co-authored-by: Himani Anil Deshpande --- .../install/custom_parallelcluster_node.rb | 21 ++++++-- .../recipes/install/cfn_bootstrap.rb | 17 ++++-- .../resources/efs/efs_redhat8.rb | 52 +++++++++++++++++++ .../efs/partial/_install_from_tar.rb | 26 +++++++--- .../recipes/install/cookbook_virtualenv.rb | 32 
+++++++----- .../unit/recipes/cookbook_virtualenv_spec.rb | 18 ++++--- util/upload-cookbook.sh | 13 ++--- 7 files changed, 138 insertions(+), 41 deletions(-) diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb index 247823c173..788a715f33 100644 --- a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb @@ -19,9 +19,22 @@ # TODO: once the pyenv Chef resource supports installing packages from a path (e.g. `pip install .`), convert the # bash block to a recipe that uses the pyenv resource. -if aws_region.start_with?("us-iso") && platform?('amazon') && node['platform_version'] == "2" +command = if aws_region.start_with?("us-iso") + "pip install . --no-build-isolation" + else + "pip install ." + end + +if aws_region.start_with?("us-iso") + dependency_package_name = "pypi-node-dependencies-#{node['cluster']['python-major-minor-version']}-#{node['kernel']['machine']}" + dependency_folder_name = dependency_package_name + if platform?('amazon') && node['platform_version'] == "2" + dependency_package_name = "node-dependencies" + dependency_folder_name = "node" + end + remote_file "#{node['cluster']['base_dir']}/node-dependencies.tgz" do - source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz" + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{dependency_package_name}.tgz" mode '0644' retries 3 retry_delay 5 @@ -35,7 +48,7 @@ code <<-REQ set -e tar xzf node-dependencies.tgz - cd node + cd #{dependency_folder_name} #{node_virtualenv_path}/bin/pip install * -f ./ --no-index REQ end @@ -59,7 +72,7 @@ mkdir aws-parallelcluster-custom-node tar -xzf aws-parallelcluster-node.tgz --directory 
aws-parallelcluster-custom-node cd aws-parallelcluster-custom-node/*aws-parallelcluster-node* - pip install . + #{command} deactivate NODE end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb index cfe95c4ed5..e795e70943 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb @@ -34,8 +34,14 @@ end if aws_region.start_with?("us-iso") + dependency_package_name = "pypi-cfn-dependencies-#{node['cluster']['python-major-minor-version']}-#{node['kernel']['machine']}" + dependency_folder_name = dependency_package_name + if platform?('amazon') && node['platform_version'] == "2" + dependency_package_name = "cfn-dependencies" + dependency_folder_name = "cfn" + end remote_file "#{node['cluster']['base_dir']}/cfn-dependencies.tgz" do - source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/cfn-dependencies.tgz" + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{dependency_package_name}.tgz" mode '0644' retries 3 retry_delay 5 @@ -49,7 +55,7 @@ code <<-REQ set -e tar xzf cfn-dependencies.tgz - cd cfn + cd #{dependency_folder_name} #{virtualenv_path}/bin/pip install * -f ./ --no-index REQ end @@ -73,11 +79,16 @@ retry_delay 5 end +command = if aws_region.start_with?("us-iso") + "#{virtualenv_path}/bin/pip install #{cfnbootstrap_package} --no-build-isolation" + else + "#{virtualenv_path}/bin/pip install #{cfnbootstrap_package}" + end bash "Install CloudFormation helpers from #{cfnbootstrap_package}" do user 'root' group 'root' cwd '/tmp' - code "#{virtualenv_path}/bin/pip install #{cfnbootstrap_package}" + code command creates "#{virtualenv_path}/bin/cfn-hup" end diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb 
b/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb index b2ab45bc1b..abe333a933 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb @@ -21,3 +21,55 @@ use 'partial/_redhat_based' use 'partial/_install_from_tar' use 'partial/_mount_umount' + +def adc_install_script_code(efs_utils_tarball, efs_utils_package, efs_utils_version) + <<-EFSUTILSINSTALL + set -e + tar xf #{efs_utils_tarball} + mv efs-proxy-dependencies-#{efs_utils_version}.tar.gz efs-utils-#{efs_utils_version}/src/proxy/ + cd efs-utils-#{efs_utils_version}/src/proxy/ + tar -xf efs-proxy-dependencies-#{efs_utils_version}.tar.gz + cargo build --offline + cd ../.. + make rpm + yum -y install ./build/#{efs_utils_package}*rpm + EFSUTILSINSTALL +end + +def prerequisites + %w(rpm-build make rust cargo openssl-devel) +end + +action :install_efs_utils do + package_name = "amazon-efs-utils" + package_version = new_resource.efs_utils_version + efs_utils_tarball = "#{node['cluster']['sources_dir']}/efs-utils-#{package_version}.tar.gz" + + if aws_region.start_with?("us-iso") + + efs_proxy_deps = "efs-proxy-dependencies-#{package_version}.tar.gz" + efs_proxy_deps_tarball = "#{node['cluster']['sources_dir']}/#{efs_proxy_deps}" + efs_proxy_deps_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/efs/#{efs_proxy_deps}" + remote_file efs_proxy_deps_tarball do + source efs_proxy_deps_url + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end + + bash "install efs utils" do + cwd node['cluster']['sources_dir'] + code adc_install_script_code(efs_utils_tarball, package_name, package_version) + end + + else + # Install EFS Utils following https://docs.aws.amazon.com/efs/latest/ug/installing-amazon-efs-utils.html + bash "install efs utils" do + cwd node['cluster']['sources_dir'] + code install_script_code(efs_utils_tarball, package_name, package_version) + end + end + + 
action_increase_poll_interval +end diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_install_from_tar.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_install_from_tar.rb index dea1449ac1..c3e320c906 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_install_from_tar.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_install_from_tar.rb @@ -12,8 +12,13 @@ # or in the "LICENSE.txt" file accompanying this file. # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. +package_name = "amazon-efs-utils" action :install_utils do + package_version = new_resource.efs_utils_version + efs_utils_tarball = "#{node['cluster']['sources_dir']}/efs-utils-#{package_version}.tar.gz" + efs_utils_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/efs/v#{package_version}.tar.gz" + package_repos 'update package repositories' do action :update end @@ -29,11 +34,6 @@ return if redhat_on_docker? - package_name = "amazon-efs-utils" - package_version = _efs_utils_version - efs_utils_tarball = "#{node['cluster']['sources_dir']}/efs-utils-#{package_version}.tar.gz" - efs_utils_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/efs/v#{package_version}.tar.gz" - # Do not install efs-utils if a same or newer version is already installed. 
return if already_installed?(package_name, package_version) @@ -46,14 +46,26 @@ mode '0644' retries 3 retry_delay 5 - checksum _efs_utils_checksum + checksum new_resource.efs_utils_checksum action :create_if_missing end + action_install_efs_utils + # Install EFS Utils following https://docs.aws.amazon.com/efs/latest/ug/installing-amazon-efs-utils.html + # bash "install efs utils" do + # cwd node['cluster']['sources_dir'] + # code install_script_code(efs_utils_tarball, package_name, package_version) + # end + action_increase_poll_interval +end + +action :install_efs_utils do + package_version = new_resource.efs_utils_version + efs_utils_tarball = "#{node['cluster']['sources_dir']}/efs-utils-#{package_version}.tar.gz" + bash "install efs utils" do cwd node['cluster']['sources_dir'] code install_script_code(efs_utils_tarball, package_name, package_version) end - action_increase_poll_interval end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb index cdc557df5f..8717b06a69 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb @@ -12,8 +12,10 @@ # limitations under the License. 
virtualenv_path = cookbook_virtualenv_path -pypi_s3_uri = "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/pypi-dependencies-#{node['cluster']['python-major-minor-version']}-#{node['kernel']['machine']}.tgz" +dependency_package_name = "pypi-cookbook-dependencies-#{node['cluster']['python-major-minor-version']}-#{node['kernel']['machine']}" +pypi_s3_uri = "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{dependency_package_name}.tgz" if platform?('amazon') && node['platform_version'] == "2" + dependency_package_name = "dependencies" pypi_s3_uri = "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/cookbook-dependencies.tgz" end @@ -31,22 +33,24 @@ not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") } end -remote_file "#{node['cluster']['base_dir']}/cookbook-dependencies.tgz" do - source pypi_s3_uri - mode '0644' - retries 3 - retry_delay 5 - action :create_if_missing -end +if aws_region.start_with?("us-iso") + remote_file "#{node['cluster']['base_dir']}/cookbook-dependencies.tgz" do + source pypi_s3_uri + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end -bash 'pip install' do - user 'root' - group 'root' - cwd "#{node['cluster']['base_dir']}" - code <<-REQ + bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ set -e tar xzf cookbook-dependencies.tgz - cd dependencies + cd #{dependency_package_name} #{virtualenv_path}/bin/pip install * -f ./ --no-index REQ + end end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb index 65b83d509e..0ce79250cc 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb @@ -6,12 +6,15 @@ cached(:python_version) { 'python_version' } 
cached(:system_pyenv_root) { 'system_pyenv_root' } cached(:virtualenv_path) { 'system_pyenv_root/versions/python_version/envs/cookbook_virtualenv' } + cached(:aws_region) { 'us-iso-test' } context "when cookbook virtualenv not installed yet" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| + allow_any_instance_of(Object).to receive(:aws_region).and_return(aws_region) node.override['cluster']['system_pyenv_root'] = system_pyenv_root node.override['cluster']['python-version'] = python_version + node.override['cluster']['region'] = aws_region end runner.converge(described_recipe) end @@ -32,13 +35,14 @@ expect(node.default['cluster']['cookbook_virtualenv_path']).to eq(virtualenv_path) is_expected.to write_node_attributes('dump node attributes') end - - it 'installs python packages' do - is_expected.to run_bash("pip install").with( - user: 'root', - group: 'root', - cwd: "#{node['cluster']['base_dir']}" - ).with_code(/tar xzf cookbook-dependencies.tgz/) + context "when in isolated region" do + it 'installs python packages' do + is_expected.to run_bash("pip install").with( + user: 'root', + group: 'root', + cwd: "#{node['cluster']['base_dir']}" + ).with_code(/tar xzf cookbook-dependencies.tgz/) + end end end end diff --git a/util/upload-cookbook.sh b/util/upload-cookbook.sh index c2169a4c3e..864ce56959 100755 --- a/util/upload-cookbook.sh +++ b/util/upload-cookbook.sh @@ -93,21 +93,22 @@ main() { # Create archive and md5 _cwd=$(pwd) pushd "${_srcdir}" > /dev/null || exit + GIT_REF=$(git rev-parse HEAD) _stashName=$(git stash create) - git archive --format tar --prefix="aws-parallelcluster-cookbook-${_version}/" "${_stashName:-HEAD}" | gzip > "${_cwd}/aws-parallelcluster-cookbook-${_version}.tgz" + git archive --format tar --prefix="aws-parallelcluster-cookbook-${_version}/" "${_stashName:-HEAD}" | gzip > "${_cwd}/aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz" #tar zcvf 
"${_cwd}/aws-parallelcluster-cookbook-${_version}.tgz" --transform "s,^aws-parallelcluster-cookbook/,aws-parallelcluster-cookbook-${_version}/," ../aws-parallelcluster-cookbook popd > /dev/null || exit - md5sum aws-parallelcluster-cookbook-${_version}.tgz > aws-parallelcluster-cookbook-${_version}.md5 + md5sum aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz > aws-parallelcluster-cookbook-${_version}-${GIT_REF}.md5 # upload packages _key_path="parallelcluster/${_version}/cookbooks" if [ -n "${_scope}" ]; then _key_path="${_key_path}/${_scope}" fi - aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}.tgz s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}.tgz || _error_exit 'Failed to push cookbook to S3' - aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}.md5 s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}.md5 || _error_exit 'Failed to push cookbook md5 to S3' - aws ${_profile} --region "${_region}" s3api head-object --bucket ${_bucket} --key ${_key_path}/aws-parallelcluster-cookbook-${_version}.tgz --output text --query LastModified > aws-parallelcluster-cookbook-${_version}.tgz.date || _error_exit 'Failed to fetch LastModified date' - aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}.tgz.date s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}.tgz.date || _error_exit 'Failed to push cookbook date' + aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz || _error_exit 'Failed to push cookbook to S3' + aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}-${GIT_REF}.md5 s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}-${GIT_REF}.md5 || _error_exit 'Failed to push cookbook md5 to S3' + aws ${_profile} --region 
"${_region}" s3api head-object --bucket ${_bucket} --key ${_key_path}/aws-parallelcluster-cookbook-${_version}.tgz --output text --query LastModified > aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz.date || _error_exit 'Failed to fetch LastModified date' + aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz.date s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz.date || _error_exit 'Failed to push cookbook date' _bucket_region=$(aws ${_profile} s3api get-bucket-location --bucket ${_bucket} --output text) if [ ${_bucket_region} = "None" ]; then From d4bd35a1c1a552d5146fe50b543b33e66a2552e8 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 7 Oct 2025 17:01:14 -0400 Subject: [PATCH 07/37] [Isolated] Remove use of new_resource as this will be empty without a default value set --- .../resources/efs/efs_redhat8.rb | 2 +- .../resources/efs/partial/_common.rb | 7 ++----- .../resources/efs/partial/_install_from_tar.rb | 8 ++++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb index abe333a933..6af950560c 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb @@ -42,7 +42,7 @@ def prerequisites action :install_efs_utils do package_name = "amazon-efs-utils" - package_version = new_resource.efs_utils_version + package_version = _efs_utils_version efs_utils_tarball = "#{node['cluster']['sources_dir']}/efs-utils-#{package_version}.tar.gz" if aws_region.start_with?("us-iso") diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_common.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_common.rb index 618e91d3fb..612edabe20 100644 --- 
a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_common.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_common.rb @@ -2,15 +2,12 @@ default_action :install_utils -property :efs_utils_version, String -property :efs_utils_checksum, String - def _efs_utils_version - efs_utils_version || node['cluster']['efs']['version'] + node['cluster']['efs']['version'] end def _efs_utils_checksum - efs_utils_checksum || node['cluster']['efs']['sha256'] + node['cluster']['efs']['sha256'] end def already_installed?(package_name, expected_version) diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_install_from_tar.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_install_from_tar.rb index c3e320c906..2337b33917 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_install_from_tar.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_install_from_tar.rb @@ -15,7 +15,7 @@ package_name = "amazon-efs-utils" action :install_utils do - package_version = new_resource.efs_utils_version + package_version = _efs_utils_version efs_utils_tarball = "#{node['cluster']['sources_dir']}/efs-utils-#{package_version}.tar.gz" efs_utils_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/efs/v#{package_version}.tar.gz" @@ -46,7 +46,7 @@ mode '0644' retries 3 retry_delay 5 - checksum new_resource.efs_utils_checksum + checksum _efs_utils_checksum action :create_if_missing end @@ -61,9 +61,9 @@ end action :install_efs_utils do - package_version = new_resource.efs_utils_version + package_version = _efs_utils_version efs_utils_tarball = "#{node['cluster']['sources_dir']}/efs-utils-#{package_version}.tar.gz" - + # Install EFS Utils following https://docs.aws.amazon.com/efs/latest/ug/installing-amazon-efs-utils.html bash "install efs utils" do cwd node['cluster']['sources_dir'] code install_script_code(efs_utils_tarball, package_name, package_version) From 
b62c816a5364b80eeb12df8baf3efe826f491b93 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 7 Oct 2025 19:14:18 -0400 Subject: [PATCH 08/37] [Isolated] Update unit tests --- .../spec/unit/resources/efs_spec.rb | 53 ++++++++++++++----- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb index 738999a3eb..8ad07f0644 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb @@ -1,11 +1,9 @@ require 'spec_helper' class ConvergeEfs - def self.install_utils(chef_run, efs_utils_version:, tarball_checksum:) + def self.install_utils(chef_run) chef_run.converge_dsl('aws-parallelcluster-environment') do efs 'install_utils' do - efs_utils_checksum tarball_checksum - efs_utils_version efs_utils_version action :install_utils end end @@ -28,14 +26,17 @@ def mock_already_installed(package, expected_version, installed) context "on amazon2" do cached(:efs_utils_version) { '1.2.3' } cached(:tarball_checksum) { 'tarball_checksum' } - let(:chef_run) do - runner(platform: 'amazon', version: '2', step_into: ['efs']) - end context "when same version of amazon-efs-utils already installed" do + cached(:chef_run) do + runner(platform: 'amazon', version: '2', step_into: ['efs']) + end + cached(:node) { chef_run.node } before do + node.override['cluster']['efs']['version'] = efs_utils_version + node.override['cluster']['efs']['sha256'] = tarball_checksum mock_get_package_version('amazon-efs-utils', efs_utils_version) - ConvergeEfs.install_utils(chef_run, efs_utils_version: efs_utils_version, tarball_checksum: tarball_checksum) + ConvergeEfs.install_utils(chef_run) end it 'does not install amazon-efs-utils' do @@ -44,9 +45,13 @@ def mock_already_installed(package, expected_version, installed) end context "when newer 
version of amazon-efs-utils already installed" do + cached(:chef_run) do + runner(platform: 'amazon', version: '2', step_into: ['efs']) + end + cached(:node) { chef_run.node } before do mock_get_package_version('amazon-efs-utils', '1.3.2') - ConvergeEfs.install_utils(chef_run, efs_utils_version: efs_utils_version, tarball_checksum: tarball_checksum) + ConvergeEfs.install_utils(chef_run) end it 'does not install amazon-efs-utils' do @@ -55,9 +60,15 @@ def mock_already_installed(package, expected_version, installed) end context "when amazon-efs-utils not installed" do + cached(:chef_run) do + runner(platform: 'amazon', version: '2', step_into: ['efs']) + end + cached(:node) { chef_run.node } before do + node.override['cluster']['efs']['version'] = efs_utils_version + node.override['cluster']['efs']['sha256'] = tarball_checksum mock_get_package_version('amazon-efs-utils', '') - ConvergeEfs.install_utils(chef_run, efs_utils_version: efs_utils_version, tarball_checksum: tarball_checksum) + ConvergeEfs.install_utils(chef_run) end it 'installs amazon-efs-utils' do @@ -67,9 +78,15 @@ def mock_already_installed(package, expected_version, installed) end context "when older version of amazon-efs-utils installed" do + cached(:chef_run) do + runner(platform: 'amazon', version: '2', step_into: ['efs']) + end + cached(:node) { chef_run.node } before do + node.override['cluster']['efs']['version'] = efs_utils_version + node.override['cluster']['efs']['sha256'] = tarball_checksum mock_get_package_version('amazon-efs-utils', '1.1.4') - ConvergeEfs.install_utils(chef_run, efs_utils_version: efs_utils_version, tarball_checksum: tarball_checksum) + ConvergeEfs.install_utils(chef_run) end it 'installs amazon-efs-utils' do @@ -105,8 +122,10 @@ def mock_already_installed(package, expected_version, installed) node.override['cluster']['efs_utils']['tarball_path'] = tarball_path node.override['cluster']['sources_dir'] = source_dir node.override['cluster']['region'] = aws_region + 
node.override['cluster']['efs']['version'] = utils_version + node.override['cluster']['efs']['sha256'] = tarball_checksum end - ConvergeEfs.install_utils(runner, efs_utils_version: utils_version, tarball_checksum: tarball_checksum) + ConvergeEfs.install_utils(runner) end cached(:node) { chef_run.node } @@ -140,8 +159,10 @@ def mock_already_installed(package, expected_version, installed) runner = runner(platform: platform, version: version, step_into: ['efs']) do |node| node.override['cluster']['efs_utils']['tarball_path'] = tarball_path node.override['cluster']['sources_dir'] = source_dir + node.override['cluster']['efs']['version'] = utils_version + node.override['cluster']['efs']['sha256'] = tarball_checksum end - ConvergeEfs.install_utils(runner, efs_utils_version: utils_version, tarball_checksum: tarball_checksum) + ConvergeEfs.install_utils(runner) end cached(:node) { chef_run.node } @@ -190,9 +211,11 @@ def mock_already_installed(package, expected_version, installed) runner = runner(platform: platform, version: version, step_into: ['efs']) do |node| node.override['cluster']['efs_utils']['tarball_path'] = tarball_path node.override['cluster']['sources_dir'] = source_dir + node.override['cluster']['efs']['version'] = utils_version + node.override['cluster']['efs']['sha256'] = tarball_checksum node.override['cluster']['region'] = aws_region end - ConvergeEfs.install_utils(runner, efs_utils_version: utils_version, tarball_checksum: tarball_checksum) + ConvergeEfs.install_utils(runner) end it 'creates sources dir' do @@ -230,9 +253,11 @@ def mock_already_installed(package, expected_version, installed) mock_already_installed('amazon-efs-utils', utils_version, true) runner = runner(platform: platform, version: version, step_into: ['efs']) do |node| node.override['cluster']['efs_utils']['tarball_path'] = tarball_path + node.override['cluster']['efs']['version'] = utils_version + node.override['cluster']['efs']['sha256'] = tarball_checksum 
node.override['cluster']['sources_dir'] = source_dir end - ConvergeEfs.install_utils(runner, efs_utils_version: utils_version, tarball_checksum: tarball_checksum) + ConvergeEfs.install_utils(runner) end it 'does not download tarball' do From 76f9a00f4101383e1527c5a6fd8d797df5f07bad Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 16 Oct 2025 11:54:12 -0400 Subject: [PATCH 09/37] [Bug] Install cfn dependencies in all regions --- .../recipes/install/cfn_bootstrap.rb | 58 +++++++++---------- .../spec/unit/recipes/cfn_bootstrap_spec.rb | 48 ++++++++++++++- 2 files changed, 73 insertions(+), 33 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb index e795e70943..4d504a8621 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/install/cfn_bootstrap.rb @@ -33,32 +33,31 @@ not_if { ::File.exist?("#{virtualenv_path}/bin/activate") } end -if aws_region.start_with?("us-iso") - dependency_package_name = "pypi-cfn-dependencies-#{node['cluster']['python-major-minor-version']}-#{node['kernel']['machine']}" - dependency_folder_name = dependency_package_name - if platform?('amazon') && node['platform_version'] == "2" - dependency_package_name = "cfn-dependencies" - dependency_folder_name = "cfn" - end - remote_file "#{node['cluster']['base_dir']}/cfn-dependencies.tgz" do - source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{dependency_package_name}.tgz" - mode '0644' - retries 3 - retry_delay 5 - action :create_if_missing - end - - bash 'pip install' do - user 'root' - group 'root' - cwd "#{node['cluster']['base_dir']}" - code <<-REQ - set -e - tar xzf cfn-dependencies.tgz - cd #{dependency_folder_name} - #{virtualenv_path}/bin/pip install * -f ./ --no-index - REQ - end +dependency_package_name = 
"pypi-cfn-dependencies-#{node['cluster']['python-major-minor-version']}-#{node['kernel']['machine']}" +dependency_folder_name = dependency_package_name +if platform?('amazon') && node['platform_version'] == "2" + dependency_package_name = "cfn-dependencies" + dependency_folder_name = "cfn" +end + +remote_file "#{node['cluster']['base_dir']}/cfn-dependencies.tgz" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{dependency_package_name}.tgz" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing +end + +bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ + set -e + tar xzf cfn-dependencies.tgz + cd #{dependency_folder_name} + #{virtualenv_path}/bin/pip install * -f ./ --no-index + REQ end cfnbootstrap_version = '2.0-33' @@ -79,11 +78,8 @@ retry_delay 5 end -command = if aws_region.start_with?("us-iso") - "#{virtualenv_path}/bin/pip install #{cfnbootstrap_package} --no-build-isolation" - else - "#{virtualenv_path}/bin/pip install #{cfnbootstrap_package}" - end +command = "#{virtualenv_path}/bin/pip install #{cfnbootstrap_package} --no-build-isolation" + bash "Install CloudFormation helpers from #{cfnbootstrap_package}" do user 'root' group 'root' diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb index 1e47a13200..274f3d41cf 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/cfn_bootstrap_spec.rb @@ -4,18 +4,46 @@ for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:cfnbootstrap_version) { '2.0-33' } - cached(:cfnbootstrap_package) { "aws-cfn-bootstrap-py3-#{cfnbootstrap_version}.tar.gz" } + cached(:arch) { 'x86_64' } + cached(:s3_url) { 's3://url' } + cached(:base_dir) { 'base_dir' } 
cached(:python_version) { "#{node['cluster']['python-version']}" } + cached(:dependecy_package_name_suffix) do + if platform == 'amazon' && version == '2' + "cfn-dependencies" + else + "pypi-cfn-dependencies-#{node['cluster']['python-major-minor-version']}-#{arch}" + end + end + cached(:dependecy_folder_name) do + if platform == 'amazon' && version == '2' + "cfn" + else + dependecy_package_name_suffix + end + end + cached(:cfnbootstrap_package) { "aws-cfn-bootstrap-py3-#{cfnbootstrap_version}.tar.gz" } cached(:system_pyenv_root) { 'system_pyenv_root' } cached(:virtualenv_path) { "system_pyenv_root/versions/#{python_version}/envs/cfn_bootstrap_virtualenv" } cached(:timeout) { 1800 } + cached(:dependency_bash_code) do + <<-REQ + set -e + tar xzf cfn-dependencies.tgz + cd #{dependecy_folder_name} + #{virtualenv_path}/bin/pip install * -f ./ --no-index + REQ + end context "when cfn_bootstrap virtualenv not installed yet" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| node.override['cluster']['system_pyenv_root'] = system_pyenv_root node.override['cluster']['region'] = 'non_china' + node.override['cluster']['base_dir'] = base_dir node.override['cluster']['compute_node_bootstrap_timeout'] = timeout + node.override['cluster']['artifacts_s3_url'] = s3_url + node.override['kernel']['machine'] = arch end runner.converge(described_recipe) end @@ -37,6 +65,22 @@ is_expected.to write_node_attributes('dump node attributes') end + it 'downloads cfn_dependecies package from s3' do + is_expected.to create_if_missing_remote_file("#{base_dir}/cfn-dependencies.tgz") + .with(source: "#{s3_url}/dependencies/PyPi/#{arch}/#{dependecy_package_name_suffix}.tgz") + .with(mode: '0644') + .with(retries: 3) + .with(retry_delay: 5) + end + + it 'pip installs dependencies' do + is_expected.to run_bash('pip install') + .with(user: 'root') + .with(group: 'root') + .with(cwd: base_dir) + .with(code: dependency_bash_code) + end + it 'downloads cfn_bootstrap 
package from s3' do is_expected.to create_remote_file("/tmp/#{cfnbootstrap_package}").with( source: "https://s3.amazonaws.com/cloudformation-examples/#{cfnbootstrap_package}" @@ -48,7 +92,7 @@ user: 'root', group: 'root', cwd: '/tmp', - code: "#{virtualenv_path}/bin/pip install #{cfnbootstrap_package}", + code: "#{virtualenv_path}/bin/pip install #{cfnbootstrap_package} --no-build-isolation", creates: "#{virtualenv_path}/bin/cfn-hup" ) end From b99018d64b8b37bb944eb300bb6df48f376d3425 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 16 Oct 2025 11:54:27 -0400 Subject: [PATCH 10/37] [Bug] Install node dependencies in all regions --- .../install/custom_parallelcluster_node.rb | 54 +++++------ .../custom_parallelcluster_node_spec.rb | 91 +++++++++++++++++++ 2 files changed, 115 insertions(+), 30 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/custom_parallelcluster_node_spec.rb diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb index 788a715f33..df96cb2f65 100644 --- a/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/install/custom_parallelcluster_node.rb @@ -19,39 +19,33 @@ # TODO: once the pyenv Chef resource supports installing packages from a path (e.g. `pip install .`), convert the # bash block to a recipe that uses the pyenv resource. -command = if aws_region.start_with?("us-iso") - "pip install . --no-build-isolation" - else - "pip install ." - end +command = "pip install . 
--no-build-isolation" -if aws_region.start_with?("us-iso") - dependency_package_name = "pypi-node-dependencies-#{node['cluster']['python-major-minor-version']}-#{node['kernel']['machine']}" - dependency_folder_name = dependency_package_name - if platform?('amazon') && node['platform_version'] == "2" - dependency_package_name = "node-dependencies" - dependency_folder_name = "node" - end +dependency_package_name = "pypi-node-dependencies-#{node['cluster']['python-major-minor-version']}-#{node['kernel']['machine']}" +dependency_folder_name = dependency_package_name +if platform?('amazon') && node['platform_version'] == "2" + dependency_package_name = "node-dependencies" + dependency_folder_name = "node" +end - remote_file "#{node['cluster']['base_dir']}/node-dependencies.tgz" do - source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{dependency_package_name}.tgz" - mode '0644' - retries 3 - retry_delay 5 - action :create_if_missing - end +remote_file "#{node['cluster']['base_dir']}/node-dependencies.tgz" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{dependency_package_name}.tgz" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing +end - bash 'pip install' do - user 'root' - group 'root' - cwd "#{node['cluster']['base_dir']}" - code <<-REQ - set -e - tar xzf node-dependencies.tgz - cd #{dependency_folder_name} - #{node_virtualenv_path}/bin/pip install * -f ./ --no-index - REQ - end +bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ + set -e + tar xzf node-dependencies.tgz + cd #{dependency_folder_name} + #{node_virtualenv_path}/bin/pip install * -f ./ --no-index + REQ end bash "install custom aws-parallelcluster-node" do diff --git a/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/custom_parallelcluster_node_spec.rb 
b/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/custom_parallelcluster_node_spec.rb new file mode 100644 index 0000000000..41bbccfbcf --- /dev/null +++ b/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/custom_parallelcluster_node_spec.rb @@ -0,0 +1,91 @@ +require 'spec_helper' + +describe 'aws-parallelcluster-computefleet::custom_parallelcluster_node' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:s3_url) { 's3://url' } + cached(:base_dir) { 'base_dir' } + cached(:arch) { 'x86_64' } + cached(:region) { 'any-region' } + cached(:python_version) { 'python_version' } + cached(:dependency_pkg_name_suffix) do + if platform == 'amazon' && version == '2' + 'node-dependencies' + else + "pypi-node-dependencies-#{python_version}-#{arch}" + end + end + cached(:dependency_folder_name_suffix) do + if platform == 'amazon' && version == '2' + "node" + else + dependency_pkg_name_suffix + end + end + cached(:virtualenv_path) { "#{base_dir}/pyenv/versions/#{python_version}/envs/node_virtualenv" } + cached(:cookbook_virtualenv_path) { "#{base_dir}/pyenv/versions/#{python_version}/envs/cookbook_virtualenv" } + cached(:custom_node_s3_url) { "#{s3_url}/pyenv/versions/#{python_version}/envs/node_virtualenv" } + cached(:pip_install_bash_code) do + <<-REQ + set -e + tar xzf node-dependencies.tgz + cd #{dependency_folder_name_suffix} + #{virtualenv_path}/bin/pip install * -f ./ --no-index + REQ + end + cached(:node_bash_code) do + <<-NODE + set -e + [[ ":$PATH:" != *":/usr/local/bin:"* ]] && PATH="/usr/local/bin:${PATH}" + echo "PATH is $PATH" + source #{virtualenv_path}/bin/activate + pip uninstall --yes aws-parallelcluster-node + if [[ "#{custom_node_s3_url}" =~ ^s3:// ]]; then + custom_package_url=$(#{cookbook_virtualenv_path}/bin/aws s3 presign #{custom_node_s3_url} --region #{region}) + else + custom_package_url=#{custom_node_s3_url} + fi + curl --retry 3 -L -o aws-parallelcluster-node.tgz ${custom_package_url} + rm 
-fr aws-parallelcluster-custom-node + mkdir aws-parallelcluster-custom-node + tar -xzf aws-parallelcluster-node.tgz --directory aws-parallelcluster-custom-node + cd aws-parallelcluster-custom-node/*aws-parallelcluster-node* + pip install . --no-build-isolation + deactivate + NODE + end + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['kernel']['machine'] = arch + node.override['cluster']['python-major-minor-version'] = python_version + node.override['cluster']['python-version'] = python_version + node.override['cluster']['base_dir'] = base_dir + node.override['cluster']['region'] = region + node.override['cluster']['artifacts_s3_url'] = s3_url + node.override['cluster']['custom_node_package'] = custom_node_s3_url + end + allow(File).to receive(:exist?).with("#{virtualenv_path}/bin/activate").and_return(true) + runner.converge(described_recipe) + end + + it 'downloads tarball' do + is_expected.to create_if_missing_remote_file("base_dir/node-dependencies.tgz") + .with(source: "#{s3_url}/dependencies/PyPi/#{arch}/#{dependency_pkg_name_suffix}.tgz") + .with(mode: '0644') + .with(retries: 3) + .with(retry_delay: 5) + end + + it 'pip installs' do + is_expected.to run_bash('pip install') + .with(cwd: base_dir) + .with(code: pip_install_bash_code.gsub(/^ /, ' ')) + end + + it 'install custom aws-parallelcluster-node' do + is_expected.to run_bash('install custom aws-parallelcluster-node') + .with(code: node_bash_code.gsub(/^ /, ' ')) + end + end + end +end From 468b742c7f2da81c504f8b0c91996453afd2b291 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 16 Oct 2025 11:54:35 -0400 Subject: [PATCH 11/37] [Bug] Install cookbook dependencies in all regions --- .../recipes/install/cookbook_virtualenv.rb | 38 +++++++++---------- .../unit/recipes/cookbook_virtualenv_spec.rb | 16 ++++---- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git 
a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb index 8717b06a69..e2063eb56f 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb @@ -13,7 +13,7 @@ virtualenv_path = cookbook_virtualenv_path dependency_package_name = "pypi-cookbook-dependencies-#{node['cluster']['python-major-minor-version']}-#{node['kernel']['machine']}" -pypi_s3_uri = "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{dependency_package_name}.tgz" +pypi_s3_uri = "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{dependency_package_name}.tgz" if platform?('amazon') && node['platform_version'] == "2" dependency_package_name = "dependencies" pypi_s3_uri = "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/cookbook-dependencies.tgz" @@ -33,24 +33,22 @@ not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") } end -if aws_region.start_with?("us-iso") - remote_file "#{node['cluster']['base_dir']}/cookbook-dependencies.tgz" do - source pypi_s3_uri - mode '0644' - retries 3 - retry_delay 5 - action :create_if_missing - end +remote_file "#{node['cluster']['base_dir']}/cookbook-dependencies.tgz" do + source pypi_s3_uri + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing +end - bash 'pip install' do - user 'root' - group 'root' - cwd "#{node['cluster']['base_dir']}" - code <<-REQ - set -e - tar xzf cookbook-dependencies.tgz - cd #{dependency_package_name} - #{virtualenv_path}/bin/pip install * -f ./ --no-index - REQ - end +bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ + set -e + tar xzf cookbook-dependencies.tgz + cd #{dependency_package_name} + #{virtualenv_path}/bin/pip install * -f ./ --no-index + REQ end 
diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb index 0ce79250cc..4ed242ed53 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cookbook_virtualenv_spec.rb @@ -6,7 +6,7 @@ cached(:python_version) { 'python_version' } cached(:system_pyenv_root) { 'system_pyenv_root' } cached(:virtualenv_path) { 'system_pyenv_root/versions/python_version/envs/cookbook_virtualenv' } - cached(:aws_region) { 'us-iso-test' } + cached(:aws_region) { 'any-region' } context "when cookbook virtualenv not installed yet" do cached(:chef_run) do @@ -35,14 +35,12 @@ expect(node.default['cluster']['cookbook_virtualenv_path']).to eq(virtualenv_path) is_expected.to write_node_attributes('dump node attributes') end - context "when in isolated region" do - it 'installs python packages' do - is_expected.to run_bash("pip install").with( - user: 'root', - group: 'root', - cwd: "#{node['cluster']['base_dir']}" - ).with_code(/tar xzf cookbook-dependencies.tgz/) - end + it 'installs python packages' do + is_expected.to run_bash("pip install").with( + user: 'root', + group: 'root', + cwd: "#{node['cluster']['base_dir']}" + ).with_code(/tar xzf cookbook-dependencies.tgz/) end end end From 2eace6f79fb414337129cba9f2ef2350b737ddc3 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 16 Oct 2025 15:49:42 -0400 Subject: [PATCH 12/37] [IMEX] Install Nvidia-imex in all regions --- .../resources/nvidia_imex/partial/_nvidia_imex_common.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index f791eb5f1d..454ea6e99b 100644 --- 
a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -17,7 +17,7 @@ action :install do return unless nvidia_enabled_or_installed? - return if on_docker? || imex_installed? || aws_region.start_with?("us-iso") + return if on_docker? || imex_installed? action_install_imex From 0113335576c056824b9f3ff7fa8c7872d48ac6a9 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Tue, 28 Oct 2025 18:13:09 -0400 Subject: [PATCH 13/37] [CodeLinters] Disable Flake8 rule B042 as it is a minor, and it is also affected by false positive. Rule Description: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`. False Positive: https://github.com/PyCQA/flake8-bugbear/issues/525 --- .flake8 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.flake8 b/.flake8 index 70ad5311ae..fd21ff8d7e 100644 --- a/.flake8 +++ b/.flake8 @@ -15,6 +15,9 @@ ignore = # B028: Consider replacing f"'{foo}'" with f"{foo!r}". # Currently being disabled by flake8-bugbear. See https://github.com/PyCQA/flake8-bugbear/pull/333 B028 + # B042: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`. + # Affected by false positive, https://github.com/PyCQA/flake8-bugbear/issues/525 + B042 exclude = .tox, .git, From c89f4a2458b954dcf192f5f88f6728c22d89938b Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Tue, 28 Oct 2025 18:13:46 -0400 Subject: [PATCH 14/37] [CodeLinters] Addressed linter error about extra whitespaces. 
--- .../files/cloudwatch/cloudwatch_agent_config_util.py | 4 ++-- .../files/default/head_node_slurm/slurm/config_renderer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config_util.py b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config_util.py index 4598e64a0c..f0c51e652f 100644 --- a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config_util.py +++ b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config_util.py @@ -97,8 +97,8 @@ def _validate_timestamp_keys(input_json): if log_config.get("timestamp_format_key") not in valid_keys: _fail( f"Log config with log_stream_name {log_config.get('log_stream_name')} and " - f"file_path {log_config.get('file_path'),} contains an invalid timestamp_format_key: " - f"{log_config.get('timestamp_format_key')}. Valid values are {', '.join(valid_keys),}" + f"file_path {log_config.get('file_path'), } contains an invalid timestamp_format_key: " + f"{log_config.get('timestamp_format_key')}. 
Valid values are {', '.join(valid_keys), }" ) diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/config_renderer.py b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/config_renderer.py index f580a74be7..3f4110289f 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/config_renderer.py +++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/config_renderer.py @@ -80,7 +80,7 @@ def _definitions(self, dynamic=False): definitions += f" Weight={self.dynamic_node_priority if dynamic else self.static_node_priority}" if self.has_gpu and self.gpu_count > 0: - definitions += f" Gres=gpu:{ self.gpu_type }:{self.gpu_count}" + definitions += f" Gres=gpu:{self.gpu_type}:{self.gpu_count}" return definitions From 500d9727363bfcdb5ab53299dba631384ffb9bfb Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 24 Oct 2025 17:07:38 -0400 Subject: [PATCH 15/37] [Tools] In the utility to upload cookbook: include GitRef as artifact suffix where it was missing. 
--- util/upload-cookbook.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/upload-cookbook.sh b/util/upload-cookbook.sh index 864ce56959..f40445c091 100755 --- a/util/upload-cookbook.sh +++ b/util/upload-cookbook.sh @@ -107,7 +107,7 @@ main() { fi aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz || _error_exit 'Failed to push cookbook to S3' aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}-${GIT_REF}.md5 s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}-${GIT_REF}.md5 || _error_exit 'Failed to push cookbook md5 to S3' - aws ${_profile} --region "${_region}" s3api head-object --bucket ${_bucket} --key ${_key_path}/aws-parallelcluster-cookbook-${_version}.tgz --output text --query LastModified > aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz.date || _error_exit 'Failed to fetch LastModified date' + aws ${_profile} --region "${_region}" s3api head-object --bucket ${_bucket} --key ${_key_path}/aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz --output text --query LastModified > aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz.date || _error_exit 'Failed to fetch LastModified date' aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz.date s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz.date || _error_exit 'Failed to push cookbook date' _bucket_region=$(aws ${_profile} s3api get-bucket-location --bucket ${_bucket} --output text) @@ -122,7 +122,7 @@ main() { echo "" echo "DevSettings:" echo " Cookbook:" - echo " ChefCookbook: s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}.tgz" + echo " ChefCookbook: s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}-${GIT_REF}.tgz" } main "$@" From 
6eda378e1bcc060acf40df0f03cfcb89c60d3212 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Tue, 28 Oct 2025 09:07:11 -0400 Subject: [PATCH 16/37] [Performance] Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes by disabling cfn-hup on those nodes. As a consequence, it also disables the cluster readiness checks executed by the head node on cluster update. Disabling cfn-hup mitigates a relevant performance degradation that may occur with tightly coupled workload st scale. --- CHANGELOG.md | 8 +++ .../recipes/config/supervisord_config.rb | 3 +- .../unit/recipes/supervisord_config_spec.rb | 61 +++++++++++++++++++ .../parallelcluster_supervisord.conf.erb | 3 +- .../attributes/cluster.rb | 3 + .../libraries/helpers.rb | 11 ++++ .../spec/unit/libraries/helpers_spec.rb | 42 +++++++++++++ .../recipes/update/update_head_node.rb | 2 +- .../unit/recipes/update_head_node_spec.rb | 22 +++++++ 9 files changed, 151 insertions(+), 4 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 12231c2bdf..0c4b161880 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ aws-parallelcluster-cookbook CHANGELOG This file is used to list changes made in each version of the AWS ParallelCluster cookbook. +3.15.0 +------ + +**CHANGES** +1. Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes + and achieve better performance at scale. 
+ + 3.14.0 ------ diff --git a/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb b/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb index 0f0ad87850..8ef48ad12d 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb @@ -32,6 +32,7 @@ dcv_port: node['cluster']['dcv_port'], dcv_auth_certificate: node['cluster']['dcv']['authenticator']['certificate'], dcv_auth_private_key: node['cluster']['dcv']['authenticator']['private_key'], - dcv_auth_user: node['cluster']['dcv']['authenticator']['user'] + dcv_auth_user: node['cluster']['dcv']['authenticator']['user'], + cfnhup_enabled: cfnhup_enabled? ) end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb index 0434761402..7d49664e10 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb @@ -57,6 +57,28 @@ end end + context "when head node and cfn-hup disabled on fleet" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['node_type'] = 'HeadNode' + node.override['cluster']['dcv_enabled'] = 'head_node' + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + allow_any_instance_of(Object).to receive(:dcv_installed?).and_return(true) + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'has the correct content' do + is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:cfn-hup]") + .with_content("[program:clustermgtd]") + .with_content("[program:clusterstatusmgtd]") + .with_content("[program:pcluster_dcv_authenticator]") + 
.with_content("--port 8444") + end + end + context "when compute fleet" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| @@ -77,6 +99,25 @@ .with_content("[program:pcluster_dcv_authenticator]") end end + + context "when compute fleet with cfn-hup disabled on fleet" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['node_type'] = 'ComputeFleet' + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'has the correct content' do + is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:computemgtd]") + + is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:cfn-hup]") + end + end context "when login node and dcv configured" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| @@ -109,12 +150,32 @@ it 'has the correct content' do is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:cfn-hup]") .with_content("[program:loginmgtd]") is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') .with_content("[program:pcluster_dcv_authenticator]") end end + + context "when login node with cfn-hup disabled on fleet" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['node_type'] = 'LoginNode' + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'has the correct content' do + is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:loginmgtd]") + + is_expected.not_to 
render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:cfn-hup]") + end + end end end end diff --git a/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb b/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb index 61fc1aaf73..e98a755235 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb @@ -1,8 +1,7 @@ # Generated by Chef for AWS ParallelCluster <%= node['cluster']['node_type'] -%> # Local modifications could be overwritten. <%# HeadNode, ComputeFleet, LoginNode -%> -<% case node['cluster']['node_type'] -%> -<% when 'HeadNode', 'ComputeFleet', 'LoginNode' -%> +<% if @cfnhup_enabled -%> [program:cfn-hup] command = <%= node['cluster']['scripts_dir']%>/cfn-hup-runner.sh autorestart = true diff --git a/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb b/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb index def13a134a..4bbfffd820 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb @@ -34,3 +34,6 @@ # Default NFS mount options default['cluster']['nfs']['hard_mount_options'] = 'hard,_netdev,noatime' + +# Cluster Updates +default['cluster']['in_place_update_on_fleet_enabled'] = 'true' diff --git a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb index ce3a27532f..37e0114051 100644 --- a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb @@ -106,3 +106,14 @@ def wait_sync_file(path) timeout 5 end end + +def cfnhup_enabled? + # cfn-hup is always enabled on the head node, as it is required to perform cluster updates. 
+ # cfn-hup can be disabled on compute nodes and login nodes, limiting the cluster update in the sense that + # live updates on compute and login nodes are not possible. + node['cluster']['node_type'] == 'HeadNode' || node['cluster']['in_place_update_on_fleet_enabled'] == 'true' +end + +def cluster_readiness_check_on_update_enabled? + node['cluster']['in_place_update_on_fleet_enabled'] == 'true' +end diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb new file mode 100644 index 0000000000..e9d180d5e0 --- /dev/null +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb @@ -0,0 +1,42 @@ +require_relative '../../../libraries/helpers' +require 'spec_helper' + +describe 'cfnhup_enabled?' do + let(:node) { Chef::Node.new } + + context 'when node type is HeadNode' do + before { node.override['cluster']['node_type'] = 'HeadNode' } + + it 'returns true regardless of in_place_update_on_fleet_enabled setting' do + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + expect(cfnhup_enabled?).to be true + end + end + + %w(ComputeFleet LoginNode).each do |node_type| + context "when node type is #{node_type}" do + before { node.override['cluster']['node_type'] = node_type } + + it 'returns true when in_place_update_on_fleet_enabled is true' do + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'true' + expect(cfnhup_enabled?).to be true + end + + it 'returns false when in_place_update_on_fleet_enabled is false' do + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + expect(cfnhup_enabled?).to be false + end + end + end +end + +describe 'cluster_readiness_check_on_update_enabled?' 
do + let(:node) { Chef::Node.new } + + [true, false].each do |in_place_update_on_fleet_enabled| + it "returns #{in_place_update_on_fleet_enabled} when in_place_update_on_fleet_enabled is #{in_place_update_on_fleet_enabled}" do + node.override['cluster']['in_place_update_on_fleet_enabled'] = in_place_update_on_fleet_enabled.to_s + expect(cluster_readiness_check_on_update_enabled?).to be in_place_update_on_fleet_enabled + end + end +end diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb index 9b63a4a4b6..76ba95362a 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb @@ -272,7 +272,7 @@ def update_nodes_in_queue(strategy, queues) chef_sleep '15' -wait_cluster_ready +wait_cluster_ready if cluster_readiness_check_on_update_enabled? execute 'start clustermgtd' do command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd" diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb index 38b894c013..f2f53d13d4 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb @@ -15,6 +15,7 @@ allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(are_mount_or_unmount_required) allow_any_instance_of(Object).to receive(:dig).and_return(true) allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path) + allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(true) RSpec::Mocks.configuration.allow_message_expectations_on_nil = true node.override['cluster']['stack_name'] = cluster_name @@ -58,6 +59,27 @@ end end end + + 
context 'when cluster readiness check is disabled' do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false) + allow_any_instance_of(Object).to receive(:dig).and_return(true) + allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path) + allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(false) + RSpec::Mocks.configuration.allow_message_expectations_on_nil = true + + node.override['cluster']['stack_name'] = cluster_name + node.override['cluster']['region'] = region + node.override['cluster']['cluster_config_version'] = cluster_config_version + node.override['cluster']['scripts_dir'] = scripts_dir + end + runner.converge(described_recipe) + end + it 'does not check cluster readiness' do + is_expected.not_to run_execute("Check cluster readiness") + end + end end end end From c3c60a64e1eb1f11e18690a6f017e4c63cf4305c Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 31 Oct 2025 13:31:07 -0400 Subject: [PATCH 17/37] [SlurmDbd] Adding a message to make sure that we do not use # in Database Password --- .../slurm/head_node/update_slurm_database_password.sh.erb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_slurm_database_password.sh.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_slurm_database_password.sh.erb index b0f2148dd9..30af40918d 100644 --- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_slurm_database_password.sh.erb +++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_slurm_database_password.sh.erb @@ -30,6 +30,11 @@ fi echo "Reading password from AWS Secrets Manager: ${SECRET_ARN}" password_from_secrets_manager=$(aws secretsmanager get-secret-value 
--secret-id ${SECRET_ARN} --region ${REGION} --query 'SecretString' --output text) +if [[ "${password_from_secrets_manager}" =~ '#' ]]; then + echo "You cannot use the # character in the database password as Slurm does not support it with $SLURMDBD_PROPERTY configuration paramter in $SLURMDBD_CONFIG_FILE. Please refer to the official SchedMD documentation for more details." + exit 1 +fi + [ "${password_from_dbd_config}" == "${password_from_secrets_manager}" ] && echo "Password match, skipping update" && exit 0 echo "Writing AWS Secrets Manager password to ${SLURMDBD_CONFIG_FILE}" From b64604518f373fe4add2bc1b880a662da92eb2b3 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Thu, 13 Nov 2025 13:01:32 -0500 Subject: [PATCH 18/37] [BuildImage] Load kernel module `drm_client_lib` before the installation of NVIDIA driver, if the module is available on the kernel. Starting kernel `5.14.0-611`, some DRM symbols required by the NVIDIA driver are exported by new client modules. --- CHANGELOG.md | 5 ++-- .../partial/_nvidia_driver_common.rb | 11 ++++++++ .../spec/unit/resources/nvidia_driver_spec.rb | 25 +++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c4b161880..282c0f45cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste ------ **CHANGES** -1. Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes - and achieve better performance at scale. +- Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes + and achieve better performance at scale. +- Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. 
3.14.0 diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb index 5cfa7ea906..791e7a8311 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb @@ -72,6 +72,13 @@ end end + # Load kernel modules in best effort + kernel_modules_to_load.each do |km| + execute "Load kernel module if exposed by the kernel: #{km}" do + command "if modinfo #{km}; then modprobe #{km}; fi" + end + end + # Install driver bash 'nvidia.run advanced' do user 'root' @@ -126,3 +133,7 @@ def nvidia_kernel_module "kernel-open" end end + +def kernel_modules_to_load + %w(drm_client_lib) +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index 4d0b8b57ca..ad71e58a4b 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -140,11 +140,27 @@ def self.setup(chef_run, nvidia_driver_version: nil) end end +describe 'nvidia_driver:kernel_modules_to_load' do + cached(:chef_run) do + ChefSpec::SoloRunner.new(step_into: ['nvidia_driver']) + end + + cached(:resource) do + ConvergeNvidiaDriver.setup(chef_run) + chef_run.find_resource('nvidia_driver', 'setup') + end + + it 'returns expected kernel modules' do + expect(resource.kernel_modules_to_load).to eq(%w(drm_client_lib)) + end +end + describe 'nvidia_driver:setup' do for_all_oses do |platform, version| cached(:nvidia_arch) { 'nvidia_arch' } cached(:nvidia_kernel_module) { 'nvidia_kernel_module' } cached(:nvidia_driver_version) { 'nvidia_driver_version' } + cached(:kernel_modules_to_load) { %w(module1 
module2) } cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" } context "on #{platform}#{version} when nvidia_driver not enabled" do @@ -176,6 +192,7 @@ def self.setup(chef_run, nvidia_driver_version: nil) allow(res).to receive(:nvidia_arch).and_return(nvidia_arch) allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module) allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version) + allow(res).to receive(:kernel_modules_to_load).and_return(kernel_modules_to_load) end stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true) @@ -220,6 +237,14 @@ def self.setup(chef_run, nvidia_driver_version: nil) ) end + it 'loads kernel modules if they are exposed by the kernel' do + kernel_modules_to_load.each do |km| + is_expected.to run_execute("Load kernel module if exposed by the kernel: #{km}").with( + command: "if modinfo #{km}; then modprobe #{km}; fi" + ) + end + end + if platform == 'amazon' compiler_version = version == '2023' ? 'gcc' : 'gcc10' compiler_path = version == '2023' ? 'CC=/usr/bin/gcc' : 'CC=/usr/bin/gcc10-gcc' From 0bebf8af752f5fbc02ad9c6d129891ec99c4873e Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 14 Nov 2025 16:06:23 -0500 Subject: [PATCH 19/37] [Docs] Created changelog entry for 3.14.1. --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 282c0f45cd..967fee4011 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,12 +6,14 @@ This file is used to list changes made in each version of the AWS ParallelCluste 3.15.0 ------ +3.14.1 +------ + **CHANGES** - Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes and achieve better performance at scale. 
- Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. - 3.14.0 ------ From 97e161274fca6827aaa62e20e378f389b1a092d1 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Tue, 4 Nov 2025 12:57:00 -0500 Subject: [PATCH 20/37] [Dependencies] Reduce dependency footprint by installing only sssd-common rather than sssd. --- CHANGELOG.md | 4 +++- .../partial/_system_authentication_debian.rb | 2 +- .../system_authentication/system_authentication_alinux2.rb | 2 +- .../system_authentication_alinux2023.rb | 2 +- .../system_authentication/system_authentication_redhat8.rb | 2 +- .../system_authentication/system_authentication_rocky8.rb | 2 +- .../spec/unit/resources/system_authentication_spec.rb | 6 +++--- .../test/controls/system_authentication_spec.rb | 2 +- 8 files changed, 12 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 967fee4011..5af3918058 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste **CHANGES** - Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes and achieve better performance at scale. -- Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. +- Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. +- Reduce dependency footprint by installing the package `sssd-common` rather than `sssd`. 
+ 3.14.0 ------ diff --git a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/partial/_system_authentication_debian.rb b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/partial/_system_authentication_debian.rb index 72e7455757..b7c8655704 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/partial/_system_authentication_debian.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/partial/_system_authentication_debian.rb @@ -25,6 +25,6 @@ action_class do def required_packages - %w(sssd sssd-tools sssd-ldap) + %w(sssd-common sssd-tools sssd-ldap) end end diff --git a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_alinux2.rb b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_alinux2.rb index 36abc56760..5acaffc727 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_alinux2.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_alinux2.rb @@ -19,6 +19,6 @@ action_class do def required_packages - %w(sssd sssd-tools sssd-ldap authconfig) + %w(sssd-common sssd-tools sssd-ldap authconfig) end end diff --git a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_alinux2023.rb b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_alinux2023.rb index 9fe9acca5b..27cd7b3515 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_alinux2023.rb @@ -21,6 +21,6 @@ action_class do def required_packages - %w(sssd sssd-tools sssd-ldap authconfig) + %w(sssd-common sssd-tools sssd-ldap authconfig) end end diff --git 
a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_redhat8.rb b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_redhat8.rb index 35396e2f91..2d23337e3d 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_redhat8.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_redhat8.rb @@ -36,6 +36,6 @@ action_class do def required_packages - %w(sssd sssd-tools sssd-ldap authselect oddjob-mkhomedir) + %w(sssd-common sssd-tools sssd-ldap authselect oddjob-mkhomedir) end end diff --git a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_rocky8.rb b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_rocky8.rb index e60ece1f6c..9c60229ce1 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_rocky8.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/system_authentication/system_authentication_rocky8.rb @@ -36,6 +36,6 @@ action_class do def required_packages - %w(sssd sssd-tools sssd-ldap authselect oddjob-mkhomedir) + %w(sssd-common sssd-tools sssd-ldap authselect oddjob-mkhomedir) end end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/system_authentication_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/system_authentication_spec.rb index a4907d2171..47194358a0 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/system_authentication_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/system_authentication_spec.rb @@ -24,11 +24,11 @@ def self.configure(chef_run) cached(:required_packages) do case platform when 'amazon', 'centos' - %w(sssd sssd-tools sssd-ldap authconfig) + %w(sssd-common sssd-tools sssd-ldap authconfig) when 
'redhat', 'rocky' - %w(sssd sssd-tools sssd-ldap authselect oddjob-mkhomedir) + %w(sssd-common sssd-tools sssd-ldap authselect oddjob-mkhomedir) else - %w(sssd sssd-tools sssd-ldap) + %w(sssd-common sssd-tools sssd-ldap) end end cached(:chef_run) do diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/system_authentication_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/system_authentication_spec.rb index 4d2e8f4c91..6a28e72d86 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/system_authentication_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/system_authentication_spec.rb @@ -12,7 +12,7 @@ control 'tag:install_system_authentication_packages_installed' do title 'Check that system authentication packages are installed correctly' - packages = %w(sssd sssd-tools sssd-ldap) + packages = %w(sssd-common sssd-tools sssd-ldap) if os_properties.redhat8? packages.append("authselect") From 4d1d0b474f4a36f7508bf8110f1429abaa35ccd2 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Mon, 17 Nov 2025 09:59:32 -0800 Subject: [PATCH 21/37] Fix github system test on Ubuntu22 and 24 --- .../cloudwatch/partial/_cloudwatch_install_package_debian.rb | 2 +- .../test/controls/cloudwatch_spec.rb | 2 +- .../test/libraries/os_properties.rb | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_install_package_debian.rb b/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_install_package_debian.rb index 7e00600a2b..20f0fcdfd7 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_install_package_debian.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_install_package_debian.rb @@ -1,7 +1,7 @@ action :cloudwatch_install_package do dpkg_package package_path do source package_path - end + end unless on_docker? 
end action_class do diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb index 9e8f90a06e..1ee10f72c7 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb @@ -36,7 +36,7 @@ title "Check if cloudwatch package is installed" describe package('amazon-cloudwatch-agent') do it { should be_installed } - end + end unless os_properties.ubuntu_on_docker? end control 'tag:config_cloudwatch_configured' do diff --git a/cookbooks/aws-parallelcluster-shared/test/libraries/os_properties.rb b/cookbooks/aws-parallelcluster-shared/test/libraries/os_properties.rb index 768d5c9a98..e0745f0933 100644 --- a/cookbooks/aws-parallelcluster-shared/test/libraries/os_properties.rb +++ b/cookbooks/aws-parallelcluster-shared/test/libraries/os_properties.rb @@ -36,6 +36,10 @@ def ubuntu? inspec.os.name == 'ubuntu' end + def ubuntu_on_docker? + on_docker? && ubuntu? + end + def redhat8? redhat? && inspec.os.release.to_i == 8 end From decf009017f195d151fc2160d28800fc07744638 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 18 Nov 2025 09:37:16 -0500 Subject: [PATCH 22/37] [GH] Update the version Bump workflow to mention how to run GH actions --- .github/workflows/bump_version.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/bump_version.yml b/.github/workflows/bump_version.yml index 820f0f40ed..7d944ac72d 100644 --- a/.github/workflows/bump_version.yml +++ b/.github/workflows/bump_version.yml @@ -39,6 +39,7 @@ jobs: title: 'Bump version to ${{ inputs.pcluster-version }}' body: | This PR contains version bump. + Please close and re-open the PR for Github Actions to run. 
Auto-generated by Github Action branch: versionbump${{ inputs.branch }}${{ inputs.pcluster-version }} delete-branch: true From bfbff9f470be948406d97b8c839305fc2bce0dab Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 11 Nov 2025 12:26:40 -0500 Subject: [PATCH 23/37] [EFS] Upgrade EFS utils from 2.3.1 to 2.4.0 --- .../aws-parallelcluster-environment/attributes/environment.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index 02041927b3..2e1f78b5d1 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -73,8 +73,8 @@ default['cluster']['efa']['version'] = '1.43.2' default['cluster']['efa']['sha256'] = 'de15c5bdbc83b952afbde876110830c604ad0796680e5157c05f7c1979a41069' -default['cluster']['efs']['version'] = '2.3.1' -default['cluster']['efs']['sha256'] = 'ced12f82e76f9740476b63f30c49bd76cc00b6375e12a9f5f7ba852635c49e15' +default['cluster']['efs']['version'] = '2.4.0' +default['cluster']['efs']['sha256'] = '9b60c039c162388091d6fab6e9c6cfc5832f34b26b6d05b0a68b333147d78a25' if platform?('amazon') default['cluster']['efs']['version'] = '2.1.0' end From cceeada0bac8784347b784ca80efbab77b968748 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 11 Nov 2025 13:35:33 -0500 Subject: [PATCH 24/37] [EFS] Upgrade EFS and unit tests --- .../aws-parallelcluster-environment/test/controls/efs_spec.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/efs_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/efs_spec.rb index 7d9e812496..f54a81911f 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/efs_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/efs_spec.rb @@ -4,9 
+4,9 @@ only_if { !os_properties.redhat_on_docker? } - describe file("#{node['cluster']['sources_dir']}/efs-utils-2.3.1.tar.gz") do + describe file("#{node['cluster']['sources_dir']}/efs-utils-2.4.0.tar.gz") do it { should exist } - its('sha256sum') { should eq 'ced12f82e76f9740476b63f30c49bd76cc00b6375e12a9f5f7ba852635c49e15' } + its('sha256sum') { should eq '9b60c039c162388091d6fab6e9c6cfc5832f34b26b6d05b0a68b333147d78a25' } its('owner') { should eq 'root' } its('group') { should eq 'root' } its('mode') { should cmp '0644' } From 7f39ed2a24673e12bad0f7c0888be515d6d62b30 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 11 Nov 2025 13:41:32 -0500 Subject: [PATCH 25/37] [PMIX] Upgrade PMIx from 5.0.6 to 5.0.9 --- .../aws-parallelcluster-slurm/attributes/slurm_attributes.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb index b6f0670a5c..a7c40d8cf4 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb @@ -13,8 +13,8 @@ default['cluster']['enable_nss_slurm'] = node['cluster']['directory_service']['enabled'] # PMIX Version and Checksum -default['cluster']['pmix']['version'] = '5.0.6' -default['cluster']['pmix']['sha256'] = '5a5e0cd36067144e2171d59164d59ea478a2e540ccf4eee4530f55fc6e8cf78b' +default['cluster']['pmix']['version'] = '5.0.9' +default['cluster']['pmix']['sha256'] = '11b9911aadaac590e5b02749caa618837de9fb644183c1bdc04378b54bf396bb' # Slurmdbd default['cluster']['slurmdbd_service_enabled'] = "true" From 5f281e636f69c355666fb1a7b9d59de4fbdbface Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 11 Nov 2025 13:44:08 -0500 Subject: [PATCH 26/37] [Libjwt] Upgrade libjwt from 1.17.0 to 1.18.4 --- .../aws-parallelcluster-slurm/recipes/install/install_jwt.rb | 4 ++-- 
.../spec/unit/recipes/install_jwt_spec.rb | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb index 427cde1363..2edbdaaa65 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb @@ -15,10 +15,10 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. -jwt_version = '1.17.0' +jwt_version = '1.18.4' jwt_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/jwt/v#{jwt_version}.tar.gz" jwt_tarball = "#{node['cluster']['sources_dir']}/libjwt-#{jwt_version}.tar.gz" -jwt_sha256 = '617778f9687682220abf9b7daacbe72bab7c2985479f8bee4db9648bd2440687' +jwt_sha256 = '8496257cb39ee7dddfdfc919e7b80a997399b0319f9fdcbefd374b0e4f147159' remote_file jwt_tarball do source jwt_url diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/install_jwt_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/install_jwt_spec.rb index 19901247e0..7b80316358 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/install_jwt_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/install_jwt_spec.rb @@ -18,8 +18,8 @@ context "on #{platform}#{version}" do cached(:cluster_artifacts_s3_url) { 'https://REGION-aws-parallelcluster.s3.REGION.AWS_DOMAIN' } cached(:cluster_sources_dir) { '/path/to/cluster/sources/dir' } - cached(:jwt_version) { '1.17.0' } - cached(:jwt_checksum) { '617778f9687682220abf9b7daacbe72bab7c2985479f8bee4db9648bd2440687' } + cached(:jwt_version) { '1.18.4' } + cached(:jwt_checksum) { '8496257cb39ee7dddfdfc919e7b80a997399b0319f9fdcbefd374b0e4f147159' } cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| From 1b7b21ca5e4279204c4f37e6f4a8c33094e139f6 Mon Sep 
17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 11 Nov 2025 15:12:46 -0500 Subject: [PATCH 27/37] [Slurm] Upgrade Slurm from 24.11.6-1 to 24.11.7-1 --- cookbooks/aws-parallelcluster-slurm/attributes/versions.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index e14f0ae6f3..252fa1b16b 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -1,8 +1,8 @@ # Slurm -default['cluster']['slurm']['version'] = '24-11-6-1' +default['cluster']['slurm']['version'] = '24-11-7-1' default['cluster']['slurm']['commit'] = '' default['cluster']['slurm']['branch'] = '' -default['cluster']['slurm']['sha256'] = '282708483326f381eb001a14852a1a82e65e18f37b62b7a5f4936c0ed443b600' +default['cluster']['slurm']['sha256'] = 'f0912d85a9a9b417fd23ca4997c8d3dfed89b3b70b15aad4da54f2812d30d48c' default['cluster']['slurm']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/slurm" # Munge default['cluster']['munge']['munge_version'] = '0.5.16' From 6f59e39cffd7bcf82b30b63575e7ce89d02d1630 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 12 Nov 2025 16:18:39 -0500 Subject: [PATCH 28/37] [EFS-Utils] Add Go/GoLang which is efs-utils pre-requisite --- .../resources/efs/efs_redhat8.rb | 2 +- .../resources/efs/partial/_debian.rb | 2 +- .../resources/efs/partial/_redhat_based.rb | 2 +- .../spec/unit/resources/efs_spec.rb | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb index 6af950560c..5ab6eb3b0e 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb @@ -37,7 +37,7 @@ def
adc_install_script_code(efs_utils_tarball, efs_utils_package, efs_utils_vers end def prerequisites - %w(rpm-build make rust cargo openssl-devel) + %w(rpm-build make rust go cargo openssl-devel) end action :install_efs_utils do diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_debian.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_debian.rb index 4af4e52ddc..91abe44198 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_debian.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_debian.rb @@ -24,5 +24,5 @@ def install_script_code(efs_utils_tarball, efs_utils_package, efs_utils_version) end def prerequisites - %w(dkms pkg-config libssl-dev rustc cargo ) + %w(dkms pkg-config libssl-dev rustc cargo golang) end diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_redhat_based.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_redhat_based.rb index bc7887a5a7..a89557967f 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_redhat_based.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_redhat_based.rb @@ -24,5 +24,5 @@ def install_script_code(efs_utils_tarball, efs_utils_package, efs_utils_version) end def prerequisites - %w(rpm-build make rust cargo openssl-devel) + %w(rpm-build make rust go cargo openssl-devel) end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb index 8ad07f0644..1d78bf5f7d 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb @@ -200,8 +200,8 @@ def mock_already_installed(package, expected_version, installed) end cached(:required_packages) do { - "redhat" => %w(rpm-build make rust cargo openssl-devel), - "rocky" => %w(rpm-build make rust 
cargo openssl-devel), + "redhat" => %w(rpm-build make rust go cargo openssl-devel), + "rocky" => %w(rpm-build make rust go cargo openssl-devel), } end From a62e365a22cf7941209f6cdc65fff3bc851d72db Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 14 Nov 2025 12:25:54 -0500 Subject: [PATCH 29/37] [LibJwt] Update libJWt version to v1.18.4 for all OS except for AL2 --- .../aws-parallelcluster-slurm/attributes/versions.rb | 8 ++++++++ .../recipes/install/install_jwt.rb | 7 +++---- .../spec/unit/recipes/install_jwt_spec.rb | 6 ++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index 252fa1b16b..401acf3396 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -8,3 +8,11 @@ default['cluster']['munge']['munge_version'] = '0.5.16' default['cluster']['munge']['sha256'] = 'fa27205d6d29ce015b0d967df8f3421067d7058878e75d0d5ec3d91f4d32bb57' default['cluster']['munge']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/munge" +# LibJwt +default['cluster']['jwt']['version'] = '1.18.4' +default['cluster']['jwt']['sha256'] = '8496257cb39ee7dddfdfc919e7b80a997399b0319f9fdcbefd374b0e4f147159' +if platform?('amazon') && node['platform_version'] == "2" + default['cluster']['jwt']['version'] = '1.17.0' + default['cluster']['jwt']['sha256'] = '617778f9687682220abf9b7daacbe72bab7c2985479f8bee4db9648bd2440687' +end +default['cluster']['jwt']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/jwt" \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb index 2edbdaaa65..1d292f83c3 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb +++ 
b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb @@ -15,17 +15,16 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. -jwt_version = '1.18.4' -jwt_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/jwt/v#{jwt_version}.tar.gz" +jwt_version = node['cluster']['jwt']['version'] +jwt_url = "#{node['cluster']['jwt']['base_url']}/v#{jwt_version}.tar.gz" jwt_tarball = "#{node['cluster']['sources_dir']}/libjwt-#{jwt_version}.tar.gz" -jwt_sha256 = '8496257cb39ee7dddfdfc919e7b80a997399b0319f9fdcbefd374b0e4f147159' remote_file jwt_tarball do source jwt_url mode '0644' retries 3 retry_delay 5 - checksum jwt_sha256 + checksum node['cluster']['jwt']['sha256'] action :create_if_missing end diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/install_jwt_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/install_jwt_spec.rb index 7b80316358..1197ef8d7d 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/install_jwt_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/install_jwt_spec.rb @@ -18,8 +18,8 @@ context "on #{platform}#{version}" do cached(:cluster_artifacts_s3_url) { 'https://REGION-aws-parallelcluster.s3.REGION.AWS_DOMAIN' } cached(:cluster_sources_dir) { '/path/to/cluster/sources/dir' } - cached(:jwt_version) { '1.18.4' } - cached(:jwt_checksum) { '8496257cb39ee7dddfdfc919e7b80a997399b0319f9fdcbefd374b0e4f147159' } + cached(:jwt_version) { '1.2.3' } + cached(:jwt_checksum) { 'somechecksum' } cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| @@ -27,6 +27,8 @@ node.override['cluster']['artifacts_s3_url'] = cluster_artifacts_s3_url node.override['cluster']['sources_dir'] = cluster_sources_dir + node.override['cluster']['jwt']['version'] = jwt_version + node.override['cluster']['jwt']['sha256'] = jwt_checksum end allow_any_instance_of(Object).to 
receive(:nvidia_enabled?).and_return(true) runner.converge(described_recipe) From 82f644e9b6b2ad0735d04110d3d56689e2a9f8de Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 17 Nov 2025 13:39:44 -0500 Subject: [PATCH 30/37] [EFA] Upgrade EFA utils from 1.43.2 to 1.44.0 --- .../aws-parallelcluster-environment/attributes/environment.rb | 4 ++-- .../spec/unit/resources/efa_spec.rb | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index 2e1f78b5d1..bad665800c 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -70,8 +70,8 @@ default['cluster']['head_node_private_ip'] = nil -default['cluster']['efa']['version'] = '1.43.2' -default['cluster']['efa']['sha256'] = 'de15c5bdbc83b952afbde876110830c604ad0796680e5157c05f7c1979a41069' +default['cluster']['efa']['version'] = '1.44.0' +default['cluster']['efa']['sha256'] = 'f129a5b44a49d593d247e55a59eb9bcb57121566e1c2e42b832a4e794fa83d8a' default['cluster']['efs']['version'] = '2.4.0' default['cluster']['efs']['sha256'] = '9b60c039c162388091d6fab6e9c6cfc5832f34b26b6d05b0a68b333147d78a25' diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb index c72466258c..26fc4020ab 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb @@ -2,8 +2,8 @@ # parallelcluster default source dir defined in attributes source_dir = '/opt/parallelcluster/sources' -efa_version = '1.43.2' -efa_checksum = 'de15c5bdbc83b952afbde876110830c604ad0796680e5157c05f7c1979a41069' +efa_version = '1.44.0' +efa_checksum = 
'f129a5b44a49d593d247e55a59eb9bcb57121566e1c2e42b832a4e794fa83d8a' class ConvergeEfa def self.setup(chef_run, efa_version: nil, efa_checksum: nil) From 81408182aff349a1aedefbcc8ba23d944e0f1b49 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 17 Nov 2025 13:59:29 -0500 Subject: [PATCH 31/37] [Changelog] Update Changelog for 3.14.1 --- CHANGELOG.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5af3918058..b377bd8979 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,9 +12,19 @@ This file is used to list changes made in each version of the AWS ParallelCluste **CHANGES** - Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes and achieve better performance at scale. -- Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. +- Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. - Reduce dependency footprint by installing the package `sssd-common` rather than `sssd`. - +- Upgrade Slurm to version 24.11.7 (from 24.11.6). +- Upgrade Pmix to 5.0.9 (from 5.0.6). +- Upgrade libjwt to version 1.18.4 (from 1.17.0) for all OSs except Amazon Linux 2. +- Upgrade amazon-efs-utils to version 2.4.0 (from v2.3.1). +- Upgrade EFA installer to 1.44.0 (from 1.43.2). 
+ - Efa-driver: efa-2.17.3-1 + - Efa-config: efa-config-1.18-1 + - Efa-profile: efa-profile-1.7-1 + - Libfabric-aws: libfabric-aws-2.3.1-1 + - Rdma-core: rdma-core-59.0-1 + - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11 3.14.0 ------ From 85a0bc7b17a08b8887d5a76dbb729ae13caf2ecc Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 17 Nov 2025 14:58:44 -0500 Subject: [PATCH 32/37] [EFS] Add cmake and perl which are pre-requisite for efs-utils --- .../resources/efs/efs_redhat8.rb | 2 +- .../resources/efs/partial/_debian.rb | 2 +- .../resources/efs/partial/_redhat_based.rb | 2 +- .../spec/unit/resources/efs_spec.rb | 4 ++-- cookbooks/aws-parallelcluster-slurm/attributes/versions.rb | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb index 5ab6eb3b0e..783386bc5f 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/efs_redhat8.rb @@ -37,7 +37,7 @@ def adc_install_script_code(efs_utils_tarball, efs_utils_package, efs_utils_vers end def prerequisites - %w(rpm-build make rust go cargo openssl-devel) + %w(rpm-build make rust go cargo openssl-devel cmake perl) end action :install_efs_utils do diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_debian.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_debian.rb index 91abe44198..5e7c727f6d 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_debian.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_debian.rb @@ -24,5 +24,5 @@ def install_script_code(efs_utils_tarball, efs_utils_package, efs_utils_version) end def prerequisites - %w(dkms pkg-config libssl-dev rustc cargo golang) + %w(dkms pkg-config libssl-dev rustc cargo golang cmake perl) end diff --git 
a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_redhat_based.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_redhat_based.rb index a89557967f..1f0f58193a 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_redhat_based.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_redhat_based.rb @@ -24,5 +24,5 @@ def install_script_code(efs_utils_tarball, efs_utils_package, efs_utils_version) end def prerequisites - %w(rpm-build make rust go cargo openssl-devel) + %w(rpm-build make rust go cargo openssl-devel cmake perl) end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb index 1d78bf5f7d..051d7c0d66 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb @@ -200,8 +200,8 @@ def mock_already_installed(package, expected_version, installed) end cached(:required_packages) do { - "redhat" => %w(rpm-build make rust go cargo openssl-devel), - "rocky" => %w(rpm-build make rust go cargo openssl-devel), + "redhat" => %w(rpm-build make rust go cargo openssl-devel cmake perl), + "rocky" => %w(rpm-build make rust go cargo openssl-devel cmake perl), } end diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index 401acf3396..b1a9360a7a 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -15,4 +15,4 @@ default['cluster']['jwt']['version'] = '1.17.0' default['cluster']['jwt']['sha256'] = '617778f9687682220abf9b7daacbe72bab7c2985479f8bee4db9648bd2440687' end -default['cluster']['jwt']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/jwt" \ No newline at end of file 
+default['cluster']['jwt']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/jwt" From 48044c2ee0f9271b3c28add7d1ca7f2873490d6c Mon Sep 17 00:00:00 2001 From: Xuanqi He <93849823+hehe7318@users.noreply.github.com> Date: Fri, 5 Dec 2025 15:45:05 -0500 Subject: [PATCH 33/37] [develop] Fix DCV on Ubuntu 22.04+ by disabling Wayland (#3057) * Fix DCV on Ubuntu 22.04+ on DLAMI by disabling Wayland Disable Wayland protocol in GDM3 for Ubuntu 22.04+ to force the use of Xorg on GPU instances running without a display. Ubuntu 22.04+ defaults to Wayland which causes GDM startup issues with NVIDIA drivers and NICE DCV. Force Xorg by setting `WaylandEnable=false` in `/etc/gdm3/custom.conf`. * Add kitchen test to check if GDM is using X11 session type --- CHANGELOG.md | 1 + .../resources/dcv/partial/_ubuntu_common.rb | 56 +++++++++++++++++++ .../test/controls/dcv_spec.rb | 24 ++++++++ 3 files changed, 81 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b377bd8979..9029793616 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste and achieve better performance at scale. - Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. - Reduce dependency footprint by installing the package `sssd-common` rather than `sssd`. +- Disable Wayland protocol in GDM3 for Ubuntu 22.04+ to force the use of Xorg on GPU instances running without a display. - Upgrade Slurm to version 24.11.7 (from 24.11.6). - Upgrade Pmix to 5.0.9 (from 5.0.6). - Upgrade libjwt to version 1.18.4 (from 1.17.0) for all OSs except Amazon Linux 2. 
diff --git a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb index 2eb8a40cce..4c490468f8 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb @@ -82,6 +82,62 @@ def optionally_disable_rnd end end + # Disable Wayland in GDM to ensure Xorg is used + # This is required for Ubuntu 22.04+ where Wayland is the default + # Without this, GDM won't start Xorg on headless GPU instances + def disable_wayland + bash 'Disable Wayland in GDM' do + user 'root' + code <<-DISABLEWAYLAND + set -e + if [ -f /etc/gdm3/custom.conf ]; then + sed -i 's/#WaylandEnable=false/WaylandEnable=false/' /etc/gdm3/custom.conf + # If the line doesn't exist at all, add it under [daemon] section + if ! grep -q "^WaylandEnable=false" /etc/gdm3/custom.conf; then + sed -i '/\\[daemon\\]/a WaylandEnable=false' /etc/gdm3/custom.conf + fi + fi + DISABLEWAYLAND + end + end + + # Override allow_gpu_acceleration to disable Wayland before starting X + def allow_gpu_acceleration + # Update the xorg.conf to set up NVIDIA drivers. + # NOTE: --enable-all-gpus parameter is needed to support servers with more than one NVIDIA GPU. 
+ nvidia_xconfig_command = "nvidia-xconfig --preserve-busid --enable-all-gpus" + nvidia_xconfig_command += " --use-display-device=none" if node['ec2']['instance_type'].start_with?("g2.") + execute "Set up Nvidia drivers for X configuration" do + user 'root' + command nvidia_xconfig_command + end + + # dcvgl package must be installed after NVIDIA and before starting up X + # DO NOT install dcv-gl on non-GPU instances, or will run into a black screen issue + install_dcv_gl + + # Disable Wayland to ensure GDM starts Xorg + disable_wayland + + # Configure the X server to start automatically when the Linux server boots and start the X server in background + bash 'Launch X' do + user 'root' + code <<-SETUPX + set -e + systemctl set-default graphical.target + systemctl isolate graphical.target & + SETUPX + end + + # Verify that the X server is running + execute 'Wait for X to start' do + user 'root' + command "pidof X || pidof Xorg" + retries 10 + retry_delay 5 + end + end + def post_install # ubuntu-desktop comes with NetworkManager. On a cloud instance NetworkManager is unnecessary and causes delay. # Instruct Netplan to use networkd for better performance diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/dcv_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/dcv_spec.rb index 9d901205fa..96baa91bd2 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/dcv_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/dcv_spec.rb @@ -318,3 +318,27 @@ end end end + +control 'tag:config_dcv_xorg_running_with_x11_session_type' do + title 'Check that Xorg is running and GDM is using X11 session type (not Wayland)' + only_if do + !os_properties.on_docker? && + instance.head_node? && + instance.dcv_installed? && + node['cluster']['dcv_enabled'] == "head_node" && + instance.graphic? && + instance.nvidia_installed? && + instance.dcv_gpu_accel_supported? 
+ end + + describe 'Xorg process should be running' do + subject { command('pidof Xorg || pidof X') } + its('exit_status') { should eq 0 } + its('stdout') { should_not be_empty } + end + + describe 'GDM should be using X11 session type, not Wayland' do + subject { command("loginctl show-session $(loginctl | grep gdm | awk '{print $1}') -p Type 2>/dev/null | grep -i x11") } + its('exit_status') { should eq 0 } + end +end From 8167f39b2736ee113ddd797f2460cb7e29fb20b7 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Thu, 11 Dec 2025 14:23:47 -0500 Subject: [PATCH 34/37] [UpdateWorkflow] Ensure clustermgtd runs after cluster update and fix race condition making compute node deploy wrong cluster config version on update failure. Ensure clustermgtd is running after an update completes, regardless of whether the update succeeded or failed. On success, restart clustermgtd unconditionally at the end of the update recipe, regardless of whether the update includes queue changes On failure on the head node, execute recovery actions: - Clean up DNA files shared with compute nodes to prevent them from deploying a config version that is about to be rolled back - Restart clustermgtd if scontrol reconfigure succeeded, ensuring cluster management resumes after update/rollback failures --- CHANGELOG.md | 6 + .../libraries/command_runner.rb | 56 +++++ .../libraries/update_failure_handler.rb | 114 +++++++++ .../recipes/update.rb | 5 + .../unit/libraries/command_runner_spec.rb | 128 ++++++++++ .../libraries/update_failure_handler_spec.rb | 222 ++++++++++++++++++ .../spec/unit/recipes/update_spec.rb | 4 + .../libraries/update.rb | 2 + .../recipes/update/update_head_node.rb | 3 +- .../unit/recipes/update_head_node_spec.rb | 6 + 10 files changed, 544 insertions(+), 2 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb create mode 100644 cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb create mode 100644 
cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/command_runner_spec.rb create mode 100644 cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/update_failure_handler_spec.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 9029793616..bd7d9793c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste 3.14.1 ------ +**ENHANCEMENTS** +- Ensure clustermgtd runs after cluster update. On success, start it unconditionally. On failure, start it if the queue reconfiguration succeeded. + **CHANGES** - Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes and achieve better performance at scale. @@ -27,6 +30,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Rdma-core: rdma-core-59.0-1 - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11 +**BUG FIXES** +- Fix race condition where compute nodes could deploy the wrong cluster config version after an update failure. + 3.14.0 ------ diff --git a/cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb b/cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb new file mode 100644 index 0000000000..b5f7c88fb9 --- /dev/null +++ b/cookbooks/aws-parallelcluster-entrypoints/libraries/command_runner.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and +# limitations under the License. + +module ErrorHandlers + # Executes shell commands with retry logic and logging. + class CommandRunner + include Chef::Mixin::ShellOut + + DEFAULT_RETRIES = 10 + DEFAULT_RETRY_DELAY = 90 + DEFAULT_TIMEOUT = 30 + + def initialize(log_prefix:) + @log_prefix = log_prefix + end + + def run_with_retries(command, description:, retries: DEFAULT_RETRIES, retry_delay: DEFAULT_RETRY_DELAY, timeout: DEFAULT_TIMEOUT) + Chef::Log.info("#{@log_prefix} Executing: #{description}") + max_attempts = retries + 1 + + max_attempts.times do |attempt| + attempt_num = attempt + 1 + Chef::Log.info("#{@log_prefix} Running command (attempt #{attempt_num}/#{max_attempts}): #{command}") + result = shell_out(command, timeout: timeout) + Chef::Log.info("#{@log_prefix} Command stdout: #{result.stdout}") + Chef::Log.info("#{@log_prefix} Command stderr: #{result.stderr}") + + if result.exitstatus == 0 + Chef::Log.info("#{@log_prefix} Successfully executed: #{description}") + return true + end + + Chef::Log.warn("#{@log_prefix} Failed to #{description} (attempt #{attempt_num}/#{max_attempts})") + + if attempt_num < max_attempts + Chef::Log.info("#{@log_prefix} Retrying in #{retry_delay} seconds...") + sleep(retry_delay) + end + end + + Chef::Log.error("#{@log_prefix} Failed to #{description} after #{max_attempts} attempts") + false + end + end +end diff --git a/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb b/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb new file mode 100644 index 0000000000..18dcad4382 --- /dev/null +++ b/cookbooks/aws-parallelcluster-entrypoints/libraries/update_failure_handler.rb @@ -0,0 +1,114 @@ +# frozen_string_literal: true + +# +# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). 
You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +require 'chef/handler' +require_relative 'command_runner' + +module ErrorHandlers + # Chef exception handler for cluster update failures. + # + # This handler is triggered when the update recipe fails. It performs recovery actions + # to restore the cluster to a consistent state: + # 1. Logs information about the update failure including which resources succeeded before failure + # 2. Cleans up DNA files shared with compute nodes + # 3. Starts clustermgtd if scontrol reconfigure succeeded + # + # Only runs on HeadNode - compute and login nodes skip this handler. + class UpdateFailureHandler < Chef::Handler + def report + Chef::Log.info("#{log_prefix} Started") + + unless node_type == 'HeadNode' + Chef::Log.info("#{log_prefix} Node type is #{node_type}, recovery from update failure only executes on the HeadNode") + return + end + + begin + write_error_report + run_recovery + Chef::Log.info("#{log_prefix} Completed successfully") + rescue => e + Chef::Log.error("#{log_prefix} Failed with error: #{e.message}") + Chef::Log.error("#{log_prefix} Backtrace: #{e.backtrace.join("\n")}") + end + end + + def write_error_report + Chef::Log.info("#{log_prefix} Update failed on #{node_type} due to: #{run_status.exception}") + Chef::Log.info("#{log_prefix} Resources that have been successfully executed before the failure:") + run_status.updated_resources.each do |resource| + Chef::Log.info("#{log_prefix} - #{resource}") + end + end + + def run_recovery + Chef::Log.info("#{log_prefix} Running recovery commands") + + # Cleanup DNA files + cleanup_dna_files + + # Start 
clustermgtd if scontrol reconfigure succeeded + # Must match SCONTROL_RECONFIGURE_RESOURCE_NAME in aws-parallelcluster-slurm/libraries/update.rb + scontrol_reconfigure_resource_name = 'reload config for running nodes' + Chef::Log.info("#{log_prefix} Resource '#{scontrol_reconfigure_resource_name}' has execution status: #{resource_status(scontrol_reconfigure_resource_name)}") + if resource_succeeded?(scontrol_reconfigure_resource_name) + Chef::Log.info("#{log_prefix} scontrol reconfigure succeeded, starting clustermgtd") + start_clustermgtd + else + Chef::Log.info("#{log_prefix} scontrol reconfigure did not succeed, skipping clustermgtd start") + end + end + + def cleanup_dna_files + command = "#{cookbook_virtualenv_path}/bin/python #{cluster_attributes['scripts_dir']}/share_compute_fleet_dna.py --region #{cluster_attributes['region']} --cleanup" + command_runner.run_with_retries(command, description: "cleanup DNA files") + end + + def start_clustermgtd + command = "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd" + command_runner.run_with_retries(command, description: "start clustermgtd") + end + + def cluster_attributes + run_status.node['cluster'] + end + + def node_type + cluster_attributes['node_type'] + end + + def cookbook_virtualenv_path + "#{cluster_attributes['system_pyenv_root']}/versions/#{cluster_attributes['python-version']}/envs/cookbook_virtualenv" + end + + def resource_succeeded?(resource_name) + %i(updated up_to_date).include?(resource_status(resource_name)) + end + + def resource_status(resource_name) + # Use action_collection directly (inherited from Chef::Handler) + action_records = action_collection.filtered_collection + record = action_records.find { |r| r.new_resource.resource_name == :execute && r.new_resource.name == resource_name } + record ? 
record.status : :not_executed + end + + def command_runner + @command_runner ||= CommandRunner.new(log_prefix: log_prefix) + end + + def log_prefix + @log_prefix ||= "#{self.class.name}:" + end + end +end diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb index f69aa24530..a2bf3cc6b3 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb @@ -11,6 +11,11 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. + +chef_handler 'ErrorHandlers::UpdateFailureHandler' do + type exception: true +end + include_recipe "aws-parallelcluster-shared::setup_envars" # Fetch and load cluster configs diff --git a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/command_runner_spec.rb b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/command_runner_spec.rb new file mode 100644 index 0000000000..f5d8937017 --- /dev/null +++ b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/command_runner_spec.rb @@ -0,0 +1,128 @@ +# frozen_string_literal: true + +# Copyright:: 2025 Amazon.com, Inc. and its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +require_relative '../../spec_helper' +require_relative '../../../libraries/command_runner' + +describe ErrorHandlers::CommandRunner do + let(:log_prefix) { 'TestPrefix:' } + let(:runner) { described_class.new(log_prefix: log_prefix) } + let(:command) { 'test command' } + let(:description) { 'test operation' } + let(:shell_out_result) { double('shell_out_result', exitstatus: 0, stdout: 'success', stderr: '') } + + before do + allow(runner).to receive(:shell_out).and_return(shell_out_result) + allow(runner).to receive(:sleep) + end + + describe '#run_with_retries' do + context 'when command succeeds on first attempt' do + it 'returns true and does not retry' do + expect(runner).to receive(:shell_out).once.and_return(shell_out_result) + expect(runner).not_to receive(:sleep) + expect(runner.run_with_retries(command, description: description)).to be true + end + + it 'logs stdout and stderr' do + allow(Chef::Log).to receive(:info) + expect(Chef::Log).to receive(:info).with(/Command stdout: success/) + expect(Chef::Log).to receive(:info).with(/Command stderr:/) + runner.run_with_retries(command, description: description) + end + + it 'logs success message' do + allow(Chef::Log).to receive(:info) + expect(Chef::Log).to receive(:info).with(/Successfully executed: test operation/) + runner.run_with_retries(command, description: description) + end + end + + context 'when command fails then succeeds' do + let(:failed_result) { double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') } + + it 'retries and returns true on success' do + expect(runner).to receive(:shell_out).and_return(failed_result, shell_out_result) + expect(runner).to receive(:sleep).with(90).once + expect(runner.run_with_retries(command, description: description, retries: 1)).to be true + end + + it 'logs retry message' do + allow(runner).to receive(:shell_out).and_return(failed_result, shell_out_result) + allow(Chef::Log).to receive(:info) + allow(Chef::Log).to receive(:warn) + 
expect(Chef::Log).to receive(:info).with(/Retrying in 90 seconds/) + runner.run_with_retries(command, description: description, retries: 1) + end + end + + context 'when command fails all attempts' do + let(:failed_result) { double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') } + + it 'returns false after exhausting retries' do + allow(runner).to receive(:shell_out).and_return(failed_result) + expect(runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0)).to be false + end + + it 'logs error after all attempts fail' do + allow(runner).to receive(:shell_out).and_return(failed_result) + expect(Chef::Log).to receive(:error).with(/Failed to test operation after 2 attempts/) + runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0) + end + + it 'logs warning for each failed attempt' do + allow(runner).to receive(:shell_out).and_return(failed_result) + allow(Chef::Log).to receive(:info) + allow(Chef::Log).to receive(:error) + expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 1/2\)}) + expect(Chef::Log).to receive(:warn).with(%r{Failed to test operation \(attempt 2/2\)}) + runner.run_with_retries(command, description: description, retries: 1, retry_delay: 0) + end + end + + context 'with custom retry parameters' do + it 'respects custom retries count' do + failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') + allow(runner).to receive(:shell_out).and_return(failed_result) + expect(runner).to receive(:shell_out).exactly(3).times + runner.run_with_retries(command, description: description, retries: 2, retry_delay: 0) + end + + it 'respects custom retry delay' do + failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') + allow(runner).to receive(:shell_out).and_return(failed_result, shell_out_result) + expect(runner).to receive(:sleep).with(30).once + runner.run_with_retries(command, description: description, 
retries: 1, retry_delay: 30) + end + + it 'respects custom timeout' do + expect(runner).to receive(:shell_out).with(command, timeout: 60).and_return(shell_out_result) + runner.run_with_retries(command, description: description, timeout: 60) + end + end + + context 'with default parameters' do + it 'uses DEFAULT_RETRIES' do + failed_result = double('failed_result', exitstatus: 1, stdout: '', stderr: 'error') + allow(runner).to receive(:shell_out).and_return(failed_result) + expect(runner).to receive(:shell_out).exactly(11).times # 10 retries + 1 initial = 11 attempts + runner.run_with_retries(command, description: description, retry_delay: 0) + end + + it 'uses DEFAULT_TIMEOUT' do + expect(runner).to receive(:shell_out).with(command, timeout: 30).and_return(shell_out_result) + runner.run_with_retries(command, description: description) + end + end + end +end diff --git a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/update_failure_handler_spec.rb b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/update_failure_handler_spec.rb new file mode 100644 index 0000000000..ba8d7db92d --- /dev/null +++ b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/libraries/update_failure_handler_spec.rb @@ -0,0 +1,222 @@ +# frozen_string_literal: true + +# Copyright:: 2025 Amazon.com, Inc. and its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +require_relative '../../spec_helper' +require_relative '../../../libraries/update_failure_handler' + +describe ErrorHandlers::UpdateFailureHandler do + let(:handler) { described_class.new } + let(:exception) { StandardError.new('Test error') } + let(:resource1) { double('resource1', to_s: 'file[/tmp/test]') } + let(:updated_resources) { [resource1] } + let(:action_collection) { double('action_collection') } + let(:pyenv_root) { '/opt/parallelcluster/pyenv' } + let(:python_version) { '3.9.0' } + let(:scripts_dir) { '/opt/parallelcluster/scripts' } + let(:region) { 'us-east-1' } + let(:virtualenv_path) { "#{pyenv_root}/versions/#{python_version}/envs/cookbook_virtualenv" } + let(:node) do + { + 'cluster' => { + 'node_type' => node_type, + 'system_pyenv_root' => pyenv_root, + 'python-version' => python_version, + 'scripts_dir' => scripts_dir, + 'region' => region, + }, + } + end + let(:node_type) { 'HeadNode' } + let(:run_status) { double('run_status', exception: exception, updated_resources: updated_resources, node: node) } + let(:scontrol_resource_name) { 'reload config for running nodes' } + let(:command_runner) { instance_double(ErrorHandlers::CommandRunner) } + + before do + allow(handler).to receive(:run_status).and_return(run_status) + allow(handler).to receive(:action_collection).and_return(action_collection) + allow(action_collection).to receive(:filtered_collection).and_return([]) + allow(handler).to receive(:command_runner).and_return(command_runner) + allow(command_runner).to receive(:run_with_retries).and_return(true) + end + + describe '#node_type' do + it 'returns the node type from cluster attributes' do + expect(handler.node_type).to eq('HeadNode') + end + end + + describe '#cookbook_virtualenv_path' do + it 'constructs the correct virtualenv path' do + expect(handler.cookbook_virtualenv_path).to eq(virtualenv_path) + end + end + + describe '#report' do + context 'when node type is HeadNode' do + it 'writes error report and runs recovery commands' 
do + expect(handler).to receive(:write_error_report) + expect(handler).to receive(:run_recovery) + handler.report + end + + it 'catches and logs exceptions during recovery' do + allow(handler).to receive(:write_error_report).and_raise(StandardError.new('Recovery failed')) + expect(Chef::Log).to receive(:error).with(/Failed with error: Recovery failed/) + expect(Chef::Log).to receive(:error).with(/Backtrace:/) + handler.report + end + end + + context 'when node type is not HeadNode' do + let(:node_type) { 'ComputeFleet' } + + it 'skips recovery and returns early' do + expect(handler).not_to receive(:write_error_report) + expect(handler).not_to receive(:run_recovery) + allow(Chef::Log).to receive(:info) + expect(Chef::Log).to receive(:info).with(/Node type is ComputeFleet/) + handler.report + end + end + end + + describe '#write_error_report' do + it 'logs the exception and updated resources' do + expect(Chef::Log).to receive(:info).with(/Update failed on HeadNode due to: Test error/) + expect(Chef::Log).to receive(:info).with(/Resources that have been successfully executed/) + expect(Chef::Log).to receive(:info).with(%r{file\[/tmp/test\]}) + handler.write_error_report + end + end + + describe '#run_recovery' do + context 'when scontrol reconfigure succeeded' do + let(:reload_resource) { double('reload_resource', resource_name: :execute, name: scontrol_resource_name) } + let(:action_record) { double('action_record', new_resource: reload_resource, status: :updated) } + + before do + allow(action_collection).to receive(:filtered_collection).and_return([action_record]) + end + + it 'cleans up DNA files and starts clustermgtd' do + expect(handler).to receive(:cleanup_dna_files) + expect(handler).to receive(:start_clustermgtd) + handler.run_recovery + end + end + + context 'when scontrol reconfigure did not succeed' do + it 'cleans up DNA files but does not start clustermgtd' do + expect(handler).to receive(:cleanup_dna_files) + expect(handler).not_to 
receive(:start_clustermgtd) + handler.run_recovery + end + end + end + + describe '#cleanup_dna_files' do + it 'runs the cleanup command with correct arguments' do + expected_command = "#{virtualenv_path}/bin/python #{scripts_dir}/share_compute_fleet_dna.py --region #{region} --cleanup" + expect(command_runner).to receive(:run_with_retries).with(expected_command, description: "cleanup DNA files") + handler.cleanup_dna_files + end + end + + describe '#start_clustermgtd' do + it 'runs the supervisorctl command' do + expected_command = "#{virtualenv_path}/bin/supervisorctl start clustermgtd" + expect(command_runner).to receive(:run_with_retries).with(expected_command, description: "start clustermgtd") + handler.start_clustermgtd + end + end + + describe '#command_runner' do + before do + allow(handler).to receive(:command_runner).and_call_original + end + + it 'returns a CommandRunner instance' do + expect(handler.command_runner).to be_a(ErrorHandlers::CommandRunner) + end + + it 'memoizes the command runner' do + expect(handler.command_runner).to be(handler.command_runner) + end + end + + describe '#resource_succeeded?' 
do + let(:resource_name) { 'test resource' } + let(:test_resource) { double('test_resource', resource_name: :execute, name: resource_name) } + + context 'when resource was updated' do + let(:action_record) { double('action_record', new_resource: test_resource, status: :updated) } + + before { allow(action_collection).to receive(:filtered_collection).and_return([action_record]) } + + it 'returns true' do + expect(handler.resource_succeeded?(resource_name)).to be true + end + end + + context 'when resource was up_to_date' do + let(:action_record) { double('action_record', new_resource: test_resource, status: :up_to_date) } + + before { allow(action_collection).to receive(:filtered_collection).and_return([action_record]) } + + it 'returns true' do + expect(handler.resource_succeeded?(resource_name)).to be true + end + end + + context 'when resource was not executed' do + before { allow(action_collection).to receive(:filtered_collection).and_return([]) } + + it 'returns false' do + expect(handler.resource_succeeded?(resource_name)).to be false + end + end + + context 'when resource failed' do + let(:action_record) { double('action_record', new_resource: test_resource, status: :failed) } + + before { allow(action_collection).to receive(:filtered_collection).and_return([action_record]) } + + it 'returns false' do + expect(handler.resource_succeeded?(resource_name)).to be false + end + end + end + + describe '#resource_status' do + let(:resource_name) { 'test resource' } + let(:test_resource) { double('test_resource', resource_name: :execute, name: resource_name) } + + context 'when resource was not executed' do + before { allow(action_collection).to receive(:filtered_collection).and_return([]) } + + it 'returns :not_executed' do + expect(handler.resource_status(resource_name)).to eq(:not_executed) + end + end + + context 'when resource was executed' do + let(:action_record) { double('action_record', new_resource: test_resource, status: :updated) } + + before { 
allow(action_collection).to receive(:filtered_collection).and_return([action_record]) } + + it 'returns the resource status' do + expect(handler.resource_status(resource_name)).to eq(:updated) + end + end + end +end diff --git a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb index cb05d90530..d8cb9d4c91 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb @@ -63,6 +63,10 @@ chef_run expect(@included_recipes).to eq(expected_recipes) end + + it "enables the update failure handler" do + expect(chef_run).to enable_chef_handler('ErrorHandlers::UpdateFailureHandler').with(type: { exception: true }) + end end end end diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb index d4747d502c..2f7fd1feef 100644 --- a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb +++ b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb @@ -18,6 +18,8 @@ require 'net/http' require 'timeout' +SCONTROL_RECONFIGURE_RESOURCE_NAME = 'reload config for running nodes' + # Verify if Scheduling section of cluster configuration and compute node bootstrap_timeout have been updated def are_queues_updated? 
require 'yaml' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb index 76ba95362a..4aabeaa21c 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb @@ -262,7 +262,7 @@ def update_nodes_in_queue(strategy, queues) retry_delay 2 end -execute 'reload config for running nodes' do +execute SCONTROL_RECONFIGURE_RESOURCE_NAME do command "#{node['cluster']['slurm']['install_dir']}/bin/scontrol reconfigure" retries 3 retry_delay 5 @@ -276,7 +276,6 @@ def update_nodes_in_queue(strategy, queues) execute 'start clustermgtd' do command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd" - not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? } end # The updated cfnconfig will be used by post update custom scripts diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb index f2f53d13d4..c609db63cd 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb @@ -57,6 +57,12 @@ retry_delay: 90 ) end + + it 'starts clustermgtd unconditionally' do + is_expected.to run_execute('start clustermgtd').with( + command: "#{cookbook_venv_path}/bin/supervisorctl start clustermgtd" + ) + end end end From 0854f393a6b96ae48ffc1e0e4a0fdebc2c4e16c0 Mon Sep 17 00:00:00 2001 From: hgreebe <141743196+hgreebe@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:15:35 -0500 Subject: [PATCH 35/37] Do not count missing records as a failure of the cluster readiness check (#3067) * Do not consider missing records as a cluster readiness check failure (cherry picked from 
commit 75c586738d5c2c410f5f58595edd48ac9c1a89f0) * Update CHANGELOG (cherry picked from commit 94943dd29e5aba5b59262e62ff63dcdd06e60051) * Add note that missing records don't cause failure (cherry picked from commit 16ad89f1a85fa72bf1c005258bc71d25c391e62f) --- CHANGELOG.md | 1 + .../default/head_node_checks/check_cluster_ready.py | 12 ++++++++++-- .../head_node_checks/test_check_cluster_ready.py | 11 ++++------- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd7d9793c2..1dce16b7f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11 **BUG FIXES** +- Prevent cluster readiness check failures due to instances launched while the check is in progress. - Fix race condition where compute nodes could deploy the wrong cluster config version after an update failure. 3.14.0 diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_checks/check_cluster_ready.py b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_checks/check_cluster_ready.py index a828cf2acd..b2807f9583 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_checks/check_cluster_ready.py +++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_checks/check_cluster_ready.py @@ -112,13 +112,21 @@ def check_deployed_config_version(cluster_name: str, table_name: str, expected_c missing, incomplete, wrong = _check_cluster_config_items(instance_ids, items, expected_config_version) - if missing or incomplete or wrong: + if incomplete or wrong: raise CheckFailedError( - f"Check failed due to the following erroneous records:\n" + f"Check failed due to the following erroneous records " + f"(missing records are not counted for the failure):\n" f" * missing records ({len(missing)}): {missing}\n" f" * incomplete records ({len(incomplete)}): {incomplete}\n" f" * wrong 
records ({len(wrong)}): {wrong}" ) + if missing: + logger.warning( + "Ignoring the following missing records due to them being recently bootstrapped:\n" + " * missing records (%s): %s", + len(missing), + missing, + ) logger.info("Verified cluster configuration for cluster node(s) %s", instance_ids) diff --git a/test/unit/head_node_checks/test_check_cluster_ready.py b/test/unit/head_node_checks/test_check_cluster_ready.py index ed89b36156..3e71b4887c 100644 --- a/test/unit/head_node_checks/test_check_cluster_ready.py +++ b/test/unit/head_node_checks/test_check_cluster_ready.py @@ -83,10 +83,7 @@ def _mocked_request_batch_get_items(table_name: str, compute_nodes: [str], ddb_r ["i-cmp123456789"], ["i-lgn123456789"], {}, - "Check failed due to the following erroneous records:\n" - " * missing records (2): ['i-cmp123456789', 'i-lgn123456789']\n" - " * incomplete records (0): []\n" - " * wrong records (0): []", + None, id="Check with missing DDB records", ), pytest.param( @@ -96,7 +93,7 @@ def _mocked_request_batch_get_items(table_name: str, compute_nodes: [str], ddb_r "i-cmp123456789": {"UNEXPECTED_KEY_A": {"S": "UNEXPECTED_KEY_VALUE_A"}}, "i-lgn123456789": {"UNEXPECTED_KEY_B": {"S": "UNEXPECTED_KEY_VALUE_B"}}, }, - "Check failed due to the following erroneous records:\n" + "Check failed due to the following erroneous records (missing records are not counted for the failure):\n" " * missing records (0): []\n" " * incomplete records (2): ['i-cmp123456789', 'i-lgn123456789']\n" " * wrong records (0): []", @@ -109,7 +106,7 @@ def _mocked_request_batch_get_items(table_name: str, compute_nodes: [str], ddb_r "i-cmp123456789": {"cluster_config_version": {"S": "WRONG_CLUSTER_CONFIG_VERSION_A"}}, "i-lgn123456789": {"cluster_config_version": {"S": "WRONG_CLUSTER_CONFIG_VERSION_B"}}, }, - "Check failed due to the following erroneous records:\n" + "Check failed due to the following erroneous records (missing records are not counted for the failure):\n" " * missing records (0): []\n" " 
* incomplete records (0): []\n" " * wrong records (2): [('i-cmp123456789', 'WRONG_CLUSTER_CONFIG_VERSION_A'), " @@ -127,7 +124,7 @@ def _mocked_request_batch_get_items(table_name: str, compute_nodes: [str], ddb_r "i-cmp1234567893": {"cluster_config_version": {"S": "WRONG_CLUSTER_CONFIG_VERSION_A"}}, "i-lgn1234567893": {"cluster_config_version": {"S": "WRONG_CLUSTER_CONFIG_VERSION_B"}}, }, - "Check failed due to the following erroneous records:\n" + "Check failed due to the following erroneous records (missing records are not counted for the failure):\n" " * missing records (2): ['i-cmp1234567894', 'i-lgn1234567894']\n" " * incomplete records (2): ['i-cmp1234567892', 'i-lgn1234567892']\n" " * wrong records (2): [('i-cmp1234567893', 'WRONG_CLUSTER_CONFIG_VERSION_A'), " From 695ff9a6f244f66ced7078378fbc80dc5f314324 Mon Sep 17 00:00:00 2001 From: Xuanqi He <93849823+hehe7318@users.noreply.github.com> Date: Tue, 16 Dec 2025 12:57:32 -0500 Subject: [PATCH 36/37] [Develop][Bug] Fix incorrect timestamp parsing for chef-client.log in CloudWatch Agent (#3068) The CloudWatch Agent configuration was using the `default` timestamp format (%Y-%m-%d %H:%M:%S,%f) for chef-client.log, but Chef/Cinc outputs ISO 8601 timestamps in format: `[YYYY-MM-DDTHH:MM:SS+TZ]`. This mismatch caused CloudWatch to fail parsing timestamps, resulting in log lines being associated with incorrect timestamps. - Add new 'chef' timestamp format: `[%Y-%m-%dT%H:%M:%S` (Note: CloudWatch Agent's %z only supports timezone without colon like -0700, but Chef outputs +02:00 format. We only match up to seconds and let CloudWatch handle the rest.) 
- Update chef-client.log configuration to use the new 'chef' format --- CHANGELOG.md | 1 + .../files/cloudwatch/cloudwatch_agent_config.json | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dce16b7f2..c1e3217991 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11 **BUG FIXES** +- Fix incorrect timestamp parsing for chef-client.log in CloudWatch Agent configuration. - Prevent cluster readiness check failures due to instances launched while the check is in progress. - Fix race condition where compute nodes could deploy the wrong cluster config version after an update failure. diff --git a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json index 95ceb287a0..08e610745f 100644 --- a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json +++ b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json @@ -4,6 +4,7 @@ "default": "%Y-%m-%d %H:%M:%S,%f", "bracket_default": "[%Y-%m-%d %H:%M:%S]", "slurm": "%Y-%m-%dT%H:%M:%S.%f", + "chef": "[%Y-%m-%dT%H:%M:%S", "json": "" }, "log_configs": [ @@ -82,7 +83,7 @@ "feature_conditions": [] }, { - "timestamp_format_key": "default", + "timestamp_format_key": "chef", "file_path": "/var/log/chef-client.log", "log_stream_name": "chef-client", "schedulers": [ From a50c468f072d2d0fa6076b2bdb0c065bca3843ff Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Tue, 16 Dec 2025 16:43:01 -0500 Subject: [PATCH 37/37] [Logging] Standardize timestamp formats across log configurations - Rename timestamp format keys to use consistent naming convention (iso8610, default_seconds) - Update CloudWatch agent config to use iso8610 format for JSON event logs 
(clustermgtd, slurm_resume) - Consolidate Slurm log timestamp formats (slurmd, slurmctld, slurmdbd) to use iso8610 - Update SSSD log timestamp format from default to default_seconds for consistency - Change DCV authenticator log format from bracket_default to default - Add millisecond precision to PS4 prompt in generate_ssh_key.sh for better debug logging - Add millisecond precision to pcluster_dcv_connect.sh log timestamps for improved log accuracy - Improves log parsing consistency and debugging capabilities across all services --- .../cloudwatch/cloudwatch_agent_config.json | 31 +++++++++---------- .../directory_service/generate_ssh_key.sh.erb | 1 + .../files/dcv/pcluster_dcv_connect.sh | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json index 08e610745f..5ebf8770dd 100644 --- a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json +++ b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json @@ -2,10 +2,9 @@ "timestamp_formats": { "month_first": "%b %-d %H:%M:%S", "default": "%Y-%m-%d %H:%M:%S,%f", - "bracket_default": "[%Y-%m-%d %H:%M:%S]", - "slurm": "%Y-%m-%dT%H:%M:%S.%f", - "chef": "[%Y-%m-%dT%H:%M:%S", - "json": "" + "default_seconds": "%Y-%m-%d %H:%M:%S", + "iso8610": "%Y-%m-%dT%H:%M:%S.%f", + "chef": "[%Y-%m-%dT%H:%M:%S" }, "log_configs": [ { @@ -100,7 +99,7 @@ "feature_conditions": [] }, { - "timestamp_format_key": "json", + "timestamp_format_key": "iso8610", "file_path": "/var/log/parallelcluster/bootstrap_error_msg", "log_stream_name": "bootstrap_error_msg", "schedulers": [ @@ -113,7 +112,7 @@ "feature_conditions": [] }, { - "timestamp_format_key": "default", + "timestamp_format_key": "default_seconds", "file_path": "/var/log/cloud-init.log", "log_stream_name": "cloud-init", "schedulers": [ @@ 
-130,7 +129,7 @@ "feature_conditions": [] }, { - "timestamp_format_key": "default", + "timestamp_format_key": "default_seconds", "file_path": "/var/log/cloud-init-output.log", "log_stream_name": "cloud-init-output", "schedulers": [ @@ -174,7 +173,7 @@ "feature_conditions": [] }, { - "timestamp_format_key": "json", + "timestamp_format_key": "iso8610", "file_path": "/var/log/parallelcluster/clustermgtd.events", "log_stream_name": "clustermgtd_events", "schedulers": [ @@ -187,7 +186,7 @@ "feature_conditions": [] }, { - "timestamp_format_key": "json", + "timestamp_format_key": "iso8610", "file_path": "/var/log/parallelcluster/slurm_resume.events", "log_stream_name": "slurm_resume_events", "schedulers": [ @@ -265,7 +264,7 @@ "feature_conditions": [] }, { - "timestamp_format_key": "slurm", + "timestamp_format_key": "iso8610", "file_path": "/var/log/slurmd.log", "log_stream_name": "slurmd", "schedulers": [ @@ -278,7 +277,7 @@ "feature_conditions": [] }, { - "timestamp_format_key": "slurm", + "timestamp_format_key": "iso8610", "file_path": "/var/log/slurmctld.log", "log_stream_name": "slurmctld", "schedulers": [ @@ -291,7 +290,7 @@ "feature_conditions": [] }, { - "timestamp_format_key": "slurm", + "timestamp_format_key": "iso8610", "file_path": "/var/log/slurmdbd.log", "log_stream_name": "slurmdbd", "schedulers": [ @@ -325,7 +324,7 @@ ] }, { - "timestamp_format_key": "default", + "timestamp_format_key": "default_seconds", "file_path": "/var/log/sssd/sssd.log", "log_stream_name": "sssd", "schedulers": [ @@ -346,7 +345,7 @@ ] }, { - "timestamp_format_key": "default", + "timestamp_format_key": "default_seconds", "file_path": "/var/log/sssd/sssd_default.log", "log_stream_name": "sssd_domain_default", "schedulers": [ @@ -389,7 +388,7 @@ ] }, { - "timestamp_format_key": "bracket_default", + "timestamp_format_key": "default", "file_path": "/var/log/parallelcluster/pcluster_dcv_connect.log", "log_stream_name": "dcv-ext-authenticator", "schedulers": [ @@ -522,7 +521,7 @@ 
"feature_conditions": [] }, { - "timestamp_format_key": "json", + "timestamp_format_key": "iso8610", "file_path": "/var/log/parallelcluster/slurm_health_check.events", "log_stream_name": "slurm_health_check_events", "schedulers": [ diff --git a/cookbooks/aws-parallelcluster-environment/templates/directory_service/generate_ssh_key.sh.erb b/cookbooks/aws-parallelcluster-environment/templates/directory_service/generate_ssh_key.sh.erb index f6721ad7a8..e1725df374 100644 --- a/cookbooks/aws-parallelcluster-environment/templates/directory_service/generate_ssh_key.sh.erb +++ b/cookbooks/aws-parallelcluster-environment/templates/directory_service/generate_ssh_key.sh.erb @@ -1,4 +1,5 @@ #!/bin/bash +PS4="$(date '+%Y-%m-%d %H:%M:%S.%3N') " set -ex env diff --git a/cookbooks/aws-parallelcluster-platform/files/dcv/pcluster_dcv_connect.sh b/cookbooks/aws-parallelcluster-platform/files/dcv/pcluster_dcv_connect.sh index 73f55176a9..3222b05268 100644 --- a/cookbooks/aws-parallelcluster-platform/files/dcv/pcluster_dcv_connect.sh +++ b/cookbooks/aws-parallelcluster-platform/files/dcv/pcluster_dcv_connect.sh @@ -82,7 +82,7 @@ _log() { fi # append log - log_time=$(date "+%Y-%m-%d %H:%M:%S") + log_time=$(date "+%Y-%m-%d %H:%M:%S,%3N") echo "[${log_time}]: ${text}" >> "${LOG_FILE_PATH}" }