diff --git a/gpu-validation/defaults/main.yaml b/gpu-validation/defaults/main.yaml
index f651821..854bc79 100644
--- a/gpu-validation/defaults/main.yaml
+++ b/gpu-validation/defaults/main.yaml
@@ -41,9 +41,6 @@ gpu_validation_dns_server: 192.168.122.1
 gpu_validation_pci_devices:
   10de:27b8: 1
 
-# [string] libnvidia-container-toolkit version
-gpu_validation_libnvidia_container_toolkit_version_release: 1.17.8-1
-
 gpu_validation_model_tests_enabled: true  # Can be disabled for lighter CUDA-only sanity testing
 gpu_validation_model_name: TinyLlama/TinyLlama-1.1B-Chat-v1.0  # RedHatAI/Llama-3.2-1B-Instruct-FP8 for RHAIIS
 # [string] (optional) HuggingFace token if required for download
@@ -53,8 +50,6 @@ gpu_validation_model_download_hf_token:
 gpu_validation_model_perf_max_avg_time_per_tok: !!float "0.03"
 gpu_validation_model_perf_max_avg_time_to_first_tok: !!float "0.3"
 
-gpu_validation_nvidia_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/"
-
 # [string] Container image to use for model serving
 gpu_validation_workload_container_image: "docker.io/vllm/vllm-openai:latest"  # "registry.redhat.io/rhaiis/vllm-cuda-rhel9:3.0.0" for RHAIIS
 # [string] (optional) registry username if required for container image download
diff --git a/gpu-validation/tasks/main.yaml b/gpu-validation/tasks/main.yaml
index 10a949c..040a63a 100644
--- a/gpu-validation/tasks/main.yaml
+++ b/gpu-validation/tasks/main.yaml
@@ -4,8 +4,14 @@
   ansible.builtin.import_tasks: setup.yaml
 
 - name: Nvidia Setup
-  become: true
-  ansible.builtin.import_tasks: nvidia_setup.yaml
+  ansible.builtin.include_role:
+    name: nvidia_driver
+    apply:
+      become: true
+  vars:
+    nvidia_driver_module_version: "latest-dkms"
+    nvidia_driver_install_management_library: false
+
 
 - name: Check GPUs
   ansible.builtin.import_tasks: gpus.yaml
diff --git a/gpu-validation/tasks/nvidia_container_tools.yaml b/gpu-validation/tasks/nvidia_container_tools.yaml
deleted file mode 100644
index 4011ec5..0000000
--- a/gpu-validation/tasks/nvidia_container_tools.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
----
-- name: Install NVIDIA Container Tools RPM
-  ansible.builtin.dnf:
-    name: "nvidia-container-toolkit-{{ gpu_validation_libnvidia_container_toolkit_version_release }}"
-    state: present
-
-- name: Check if CDI configfile exists
-  ansible.builtin.stat:
-    path: /etc/cdi/nvidia.yaml
-  register: _cdi_config_file
-
-- name: Configure NVIDIA container runtime
-  when: not _cdi_config_file.stat.exists
-  block:
-    - name: Configure NVIDIA container runtime
-      ansible.builtin.command: nvidia-ctk runtime configure --runtime=containerd
-      changed_when: true
-
-    - name: Generate NVIDIA CDI configuration
-      ansible.builtin.command: nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
-      changed_when: true
diff --git a/gpu-validation/tasks/nvidia_driver.yaml b/gpu-validation/tasks/nvidia_driver.yaml
deleted file mode 100644
index 1330577..0000000
--- a/gpu-validation/tasks/nvidia_driver.yaml
+++ /dev/null
@@ -1,55 +0,0 @@
----
-- name: Check for installed nvidia-driver RPM
-  ansible.builtin.package_facts:
-    manager: rpm
-
-- name: Check if nvidia-driver installation is needed
-  ansible.builtin.set_fact:
-    _install_nvidia_driver: "{{ 'nvidia-driver' not in ansible_facts.packages }}"
-
-- name: Install nvidia-driver RPM if needed
-  when: _install_nvidia_driver
-  block:
-    - name: Blacklist nouveau kernel module
-      ansible.builtin.blockinfile:
-        block: |
-          blacklist nouveau
-        path: /etc/modprobe.d/blacklist.conf
-        mode: "0644"
-        create: true
-
-    - name: Remove nouveau kernel module if loaded
-      ansible.builtin.command: modprobe -r nouveau
-      failed_when: false
-      changed_when: false
-
-    - name: Add nvidia CUDA repo
-      ansible.builtin.yum_repository:
-        name: nvidia-cuda-rhel9
-        description: NVIDIA CUDA repo for RHEL 9
-        baseurl: "{{ gpu_validation_nvidia_repo_url }}/$basearch/"
-        gpgcheck: true
-        gpgkey: "{{ gpu_validation_nvidia_repo_url }}/$basearch/D42D0685.pub"
-
-    - name: Add EPEL repository for DKMS support
-      ansible.builtin.yum_repository:
-        name: epel
-        description: EPEL YUM repo
-        baseurl: https://download.fedoraproject.org/pub/epel/9/Everything/$basearch/
-        enabled: 1
-        gpgcheck: 1
-        gpgkey: https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-9
-
-    - name: Enable nvidia-driver RPM module
-      ansible.builtin.dnf:
-        name: "@nvidia-driver:latest-dkms"
-        state: present
-
-    - name: Install the nvidia CUDA driver
-      ansible.builtin.dnf:
-        name: cuda-drivers
-        state: present
-
-    - name: Refresh package facts after driver installation
-      ansible.builtin.package_facts:
-        manager: rpm
diff --git a/gpu-validation/tasks/nvidia_setup.yaml b/gpu-validation/tasks/nvidia_setup.yaml
deleted file mode 100644
index 7f8b1da..0000000
--- a/gpu-validation/tasks/nvidia_setup.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
----
-- name: Nvidia Driver
-  ansible.builtin.import_tasks: nvidia_driver.yaml
-
-- name: Nvidia Container Tools
-  ansible.builtin.import_tasks: nvidia_container_tools.yaml
diff --git a/requirements.yaml b/requirements.yaml
index f3163f8..c78dc97 100644
--- a/requirements.yaml
+++ b/requirements.yaml
@@ -4,3 +4,7 @@ collections:
     version: ">=2.4.1"
   - name: community.general
     version: ">=10.0.0"
+roles:
+  - name: nvidia_driver
+    src: https://github.com/rhos-vaf/gpu-drivers-ansible.git
+    version: main