Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/workflows/ansible-lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
name: ansible-lint
on:
pull_request:
branches: ["main"]
jobs:
build:
name: Ansible Lint
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: Run ansible-lint
uses: ansible/ansible-lint@main
with:
setup_python: "true"
requirements_file: "requirements.yaml"
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ repos:
hooks:
- id: shellcheck
- repo: https://github.com/ansible/ansible-lint.git
rev: v6.22.2
rev: v25.9.1
hooks:
- id: ansible-lint
files: \.(yaml|yml)$
Expand Down
8 changes: 8 additions & 0 deletions .yamllint
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,11 @@ extends: default
rules:
line-length:
max: 160
comments:
min-spaces-from-content: 1
comments-indentation: false
braces:
max-spaces-inside: 1
octal-values:
forbid-implicit-octal: true
forbid-explicit-octal: true
4 changes: 2 additions & 2 deletions gpu-validation/tasks/cuda_assertions.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

---
- name: TEST[CUDA] Run the CUDA Sanity Check
ansible.builtin.command: python /tmp/scripts/cuda_sanity_check.py
register: cuda_sanity_result
failed_when: cuda_sanity_result.rc != 0
changed_when: false
changed_when: false
2 changes: 1 addition & 1 deletion gpu-validation/tasks/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

- name: Check CUDA libs
ansible.builtin.import_tasks: cuda.yaml
- name: CUDA Assertions
- name: CUDA Assertions # noqa: ignore-errors
ansible.builtin.import_tasks: cuda_assertions.yaml
ignore_errors: true

Expand Down
1 change: 0 additions & 1 deletion gpu-validation/tasks/model_performance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,3 @@
- name: Parse the JSON output from performance check script
ansible.builtin.set_fact:
performance_json: "{{ performance_script_output.stdout | from_json }}"

3 changes: 2 additions & 1 deletion gpu-validation/tasks/model_performance_assertions.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
---
- name: TEST[model_performance]Check the performance thresholds
ansible.builtin.debug:
var: performance_json
failed_when: |-
(performance_json.avg_time_per_tok | float) == 0
or (performance_json.avg_time_per_tok | float) > gpu_validation_model_perf_max_avg_time_per_tok
or (performance_json.avg_time_to_first_tok | float) == 0
or (performance_json.avg_time_to_first_tok | float) > gpu_validation_model_perf_max_avg_time_to_first_tok
or (performance_json.avg_time_to_first_tok | float) > gpu_validation_model_perf_max_avg_time_to_first_tok
84 changes: 43 additions & 41 deletions gpu-validation/tasks/nvidia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,53 +10,55 @@
- name: Install nvidia-driver RPM if needed
when: _install_nvidia_driver
block:
- name: Blacklist nouveau kernel module
become: true
ansible.builtin.blockinfile:
block: |
blacklist nouveau
path: /etc/modprobe.d/blacklist.conf
create: true
- name: Blacklist nouveau kernel module
become: true
ansible.builtin.blockinfile:
block: |
blacklist nouveau
path: /etc/modprobe.d/blacklist.conf
create: true
mode: '0644'

- name: Remove nouveau kernel module if loaded
become: true
ansible.builtin.command: modprobe -r nouveau
ignore_errors: true
- name: Remove nouveau kernel module if loaded
become: true
ansible.builtin.command: modprobe -r nouveau
failed_when: false
changed_when: false

- name: Add nvidia CUDA repo
become: true
yum_repository:
name: nvidia-cuda-rhel9
description: NVIDIA CUDA repo for RHEL 9
baseurl: "{{gpu_validation_nvidia_repo_url}}/$basearch/"
gpgcheck: yes
gpgkey: "{{gpu_validation_nvidia_repo_url}}/$basearch/D42D0685.pub"
- name: Add nvidia CUDA repo
become: true
ansible.builtin.yum_repository:
name: nvidia-cuda-rhel9
description: NVIDIA CUDA repo for RHEL 9
baseurl: "{{ gpu_validation_nvidia_repo_url }}/$basearch/"
gpgcheck: true
gpgkey: "{{ gpu_validation_nvidia_repo_url }}/$basearch/D42D0685.pub"

- name: Add EPEL repository for DKMS support
become: true
ansible.builtin.yum_repository:
name: epel
description: EPEL YUM repo
baseurl: https://download.fedoraproject.org/pub/epel/9/Everything/$basearch/
enabled: 1
gpgcheck: 1
gpgkey: https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-9
- name: Add EPEL repository for DKMS support
become: true
ansible.builtin.yum_repository:
name: epel
description: EPEL YUM repo
baseurl: https://download.fedoraproject.org/pub/epel/9/Everything/$basearch/
enabled: 1
gpgcheck: 1
gpgkey: https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-9

- name: Enable nvidia-driver RPM module
become: true
ansible.builtin.dnf:
name: "@nvidia-driver:latest-dkms"
state: present
- name: Enable nvidia-driver RPM module
become: true
ansible.builtin.dnf:
name: "@nvidia-driver:latest-dkms"
state: present

- name: Install the nvidia driver
become: true
ansible.builtin.dnf:
name: cuda-drivers
state: present
- name: Install the nvidia driver
become: true
ansible.builtin.dnf:
name: cuda-drivers
state: present

- name: Refresh package facts after driver installation
ansible.builtin.package_facts:
manager: rpm
- name: Refresh package facts after driver installation
ansible.builtin.package_facts:
manager: rpm

- name: Run nvidia-smi to list NVIDIA GPUs and count them
ansible.builtin.shell: set -o pipefail && nvidia-smi --list-gpus | wc -l
Expand Down
4 changes: 2 additions & 2 deletions gpu-validation/tasks/setup.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
creates: /etc/alternatives/python3

- name: Set python3 alternative to /usr/bin/python3.12
ansible.builtin.alternatives:
community.general.alternatives:
name: python3
path: /usr/bin/python3.12

- name: Set ansible to use python3.12
set_fact:
ansible.builtin.set_fact:
ansible_python_interpreter: /usr/bin/python3.12

- name: Ensure python3.12-setuptools is installed
Expand Down
7 changes: 4 additions & 3 deletions gpu-validation/tasks/vllm.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
- name: PIP; install vllm
---
- name: PIP; install vllm
ansible.builtin.pip:
name: vllm
state: present

- ansible.builtin.import_tasks: vllm_config.yaml # noqa: name[missing]
- ansible.builtin.import_tasks: model_download_and_serve.yaml # noqa: name[missing]
- ansible.builtin.import_tasks: model_performance.yaml # noqa: name[missing]
- ansible.builtin.import_tasks: model_performance_assertions.yaml # noqa: name[missing]
ignore_errors: true
- ansible.builtin.import_tasks: model_performance_assertions.yaml # noqa: name[missing] ignore-errors
ignore_errors: true
13 changes: 7 additions & 6 deletions gpu-validation/tasks/vm_image.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
---
- name: Download VM image
ansible.builtin.get_url:
url: "{{ gpu_validation_image_url }}"
Expand All @@ -6,7 +7,7 @@

- name: Create glance image from GPU validation download source
openstack.cloud.image:
name: "{{gpu_validation_image_name}}"
name: "{{ gpu_validation_image_name }}"
container_format: bare
disk_format: qcow2
filename: "/tmp/{{ gpu_validation_image_url | basename }}"
Expand All @@ -15,12 +16,12 @@

- name: Create flavor for GPU validation
openstack.cloud.compute_flavor:
name: "{{gpu_validation_flavor_name}}"
ram: "{{gpu_validation_flavor_ram}}"
vcpus: "{{gpu_validation_flavor_vcpus}}"
disk: "{{gpu_validation_flavor_disk}}"
name: "{{ gpu_validation_flavor_name }}"
ram: "{{ gpu_validation_flavor_ram }}"
vcpus: "{{ gpu_validation_flavor_vcpus }}"
disk: "{{ gpu_validation_flavor_disk }}"
extra_specs:
"pci_passthrough:alias": "gpu-l4:{{gpu_validation_flavor_gpus}}"
"pci_passthrough:alias": "gpu-l4:{{ gpu_validation_flavor_gpus }}"
"hw:pci_numa_affinity_policy": "preferred"
"hw:hide_hypervisor_id": "true"
ca_cert: "{{ gpu_validation_ca_cert_path }}"
2 changes: 1 addition & 1 deletion gpu-validation/tasks/vm_net.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
register: existing_fip_info

- name: Set floating IP creation flag
set_fact:
ansible.builtin.set_fact:
create_floating_ip: "{{ (existing_fip_info.floating_ips | length) == 0 }}"

- name: Create floating ip address on public network
Expand Down
2 changes: 2 additions & 0 deletions requirements.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
collections:
- name: openstack.cloud
version: ">=2.4.1"
- name: community.general
version: ">=10.0.0"