Skip to content

Commit 11f5b88

Browse files
himani2411Himani Deshpande
andauthored
Upgrading NVIDIA driver, Fabric Manager and CUDA to v535 and v12.2 respectively (#2388)
Co-authored-by: Himani Deshpande <himanidp@amazon.com>
1 parent 3b23fb2 commit 11f5b88

File tree

9 files changed

+44
-24
lines changed

9 files changed

+44
-24
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
2626
**CHANGES**
2727
- Assign Slurm dynamic nodes a priority (weight) of 1000 by default. This allows Slurm to prioritize idle static nodes over idle dynamic ones.
2828
- Create a Slurm partition-nodelist mapping JSON file to be used by the node package daemons to recognize PC-managed Slurm partitions and nodelists.
29-
- Upgrade NVIDIA driver to version 470.199.02.
29+
- Upgrade NVIDIA driver to version 535.54.03.
30+
- Upgrade CUDA library to version 12.2.0.
31+
- Upgrade NVIDIA Fabric Manager to `nvidia-fabricmanager-535`.
3032
- Increase EFS-utils watchdog poll interval to 10 seconds. Note: This change is meaningful only if [EncryptionInTransit](https://docs.aws.amazon.com/parallelcluster/latest/ug/SharedStorage-v3.html#yaml-SharedStorage-EfsSettings-EncryptionInTransit) is set to `true`, because watchdog does not run otherwise.
3133
- Upgrade EFA installer to `1.25.0`
3234
- Efa-driver: `efa-2.5.0-1`

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
# NVidia
1313
default['cluster']['nvidia']['enabled'] = 'no'
14-
default['cluster']['nvidia']['driver_version'] = '470.199.02'
14+
default['cluster']['nvidia']['driver_version'] = '535.54.03'
1515

1616
# DCV
1717
default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"

cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@
1919

2020
# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
2121
# Cuda installer naming: cuda_12.2.0_535.54.03_linux
22-
cuda_version = '11.8'
22+
cuda_version = '12.2'
2323
cuda_patch = '0'
2424
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
25-
cuda_version_suffix = '520.61.05'
25+
cuda_version_suffix = '535.54.03'
2626
cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
2727
cuda_url = "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
28-
cuda_samples_version = '11.8'
28+
cuda_samples_version = '12.2'
2929
cuda_samples_url = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz"
3030
tmp_cuda_run = '/tmp/cuda.run'
3131
tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
use 'partial/_fabric_manager_install_debian.rb'
2121

2222
def fabric_manager_package
23-
'nvidia-fabricmanager-470'
23+
'nvidia-fabricmanager-535'
2424
end
2525

2626
def fabric_manager_version

cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_amazon2.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,7 @@ def set_compiler?
2020
# Amazon Linux 2 with kernel 5 needs to set CC to /usr/bin/gcc10-gcc using a dkms override
2121
node['kernel']['release'].split('.')[0].to_i == 5
2222
end
23+
24+
def compiler_version
25+
'CC=/usr/bin/gcc10-gcc'
26+
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
cwd '/tmp'
6969
code <<-NVIDIA
7070
set -e
71-
./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check
71+
#{compiler_version} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check
7272
rm -f /tmp/nvidia.run
7373
NVIDIA
7474
creates '/usr/bin/nvidia-smi'
@@ -103,3 +103,7 @@ def rebuild_initramfs?
103103
def set_compiler?
104104
false
105105
end
106+
107+
def compiler_version
108+
""
109+
end

cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
require 'spec_helper'
22

33
describe 'aws-parallelcluster-platform::cuda' do
4-
cached(:cuda_version) { '11.8' }
4+
cached(:cuda_version) { '12.2' }
55
cached(:cuda_patch) { '0' }
66
cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" }
7-
cached(:cuda_version_suffix) { '520.61.05' }
7+
cached(:cuda_version_suffix) { '535.54.03' }
88

99
context 'when nvidia not enabled' do
1010
cached(:chef_run) do
@@ -20,7 +20,7 @@
2020
context 'when on arm' do
2121
cached(:cuda_arch) { 'linux_sbsa' }
2222
cached(:cuda_url) { "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" }
23-
cached(:cuda_samples_version) { '11.8' }
23+
cached(:cuda_samples_version) { '12.2' }
2424
cached(:cuda_samples_url) { "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz" }
2525

2626
cached(:chef_run) do

cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def self.configure(chef_run)
167167

168168
for_all_oses do |platform, version|
169169
context "on #{platform}#{version}" do
170-
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' }
170+
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
171171
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version }
172172

173173
context 'when fabric manager is to install' do
@@ -218,7 +218,7 @@ def self.configure(chef_run)
218218

219219
for_all_oses do |platform, version|
220220
context "on #{platform}#{version}" do
221-
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-470' : 'nvidia-fabric-manager' }
221+
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
222222
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version }
223223

224224
context('when nvswithes are > 1') do

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -203,22 +203,32 @@ def self.setup(chef_run, nvidia_driver_version: nil)
203203
mode: '0644'
204204
)
205205
end
206+
it 'installs nvidia driver' do
207+
is_expected.to run_bash('nvidia.run advanced')
208+
.with(
209+
user: 'root',
210+
group: 'root',
211+
cwd: '/tmp',
212+
creates: '/usr/bin/nvidia-smi'
213+
)
214+
.with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check})
215+
.with_code(%r{rm -f /tmp/nvidia.run})
216+
end
206217
else
207218
it "doesn't install gcc10" do
208219
is_expected.not_to install_package('gcc10')
209220
end
210-
end
211-
212-
it 'installs nvidia driver' do
213-
is_expected.to run_bash('nvidia.run advanced')
214-
.with(
215-
user: 'root',
216-
group: 'root',
217-
cwd: '/tmp',
218-
creates: '/usr/bin/nvidia-smi'
219-
)
220-
.with_code(%r{./nvidia.run --silent --dkms --disable-nouveau})
221-
.with_code(%r{rm -f /tmp/nvidia.run})
221+
it 'installs nvidia driver' do
222+
is_expected.to run_bash('nvidia.run advanced')
223+
.with(
224+
user: 'root',
225+
group: 'root',
226+
cwd: '/tmp',
227+
creates: '/usr/bin/nvidia-smi'
228+
)
229+
.with_code(%r{./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check})
230+
.with_code(%r{rm -f /tmp/nvidia.run})
231+
end
222232
end
223233

224234
if platform == 'ubuntu'

0 commit comments

Comments
 (0)