Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
.venv
*.pyc
.ansible
.vscode
tls-ca-bundle.pem
creds.yaml
inventory-local
main-local.yaml
my-AnsibleTestCR.yaml
42 changes: 31 additions & 11 deletions gpu-validation/defaults/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ gpu_validation_image_name: gpu-validation
# [string] Flavor to use when creating the VM
gpu_validation_flavor_name: nvidia
# [int] RAM value for the flavor
gpu_validation_flavor_ram: 40960
gpu_validation_flavor_ram: 16384
# [int] CPU value for the flavor
gpu_validation_flavor_vcpus: 8
gpu_validation_flavor_vcpus: 4
# [int] Disk value for the flavor
gpu_validation_flavor_disk: 20
gpu_validation_flavor_disk: 80
# [int] Number of GPUs for the flavor
gpu_validation_flavor_gpus: 1

Expand All @@ -41,19 +41,39 @@ gpu_validation_dns_server: 192.168.122.1
gpu_validation_pci_devices:
10de:27b8: 1

# [string] libnvidia-container-toolkit version
gpu_validation_libnvidia_container_toolkit_version_release: 1.17.8-1

gpu_validation_model_tests_enabled: true # Can be disabled for lighter CUDA-only sanity testing
gpu_validation_model_name: TinyLlama/TinyLlama-1.1B-Chat-v1.0
gpu_validation_model_name: TinyLlama/TinyLlama-1.1B-Chat-v1.0 # RedHatAI/Llama-3.2-1B-Instruct-FP8 for RHAIIS
# [string] (optional) HuggingFace token if required for download
gpu_validation_model_download_hf_token:
# [string] (optional) Model registry username if required for download
gpu_validation_model_download_registry_username:
# [string] (optional) Model registry password if required for download
gpu_validation_model_download_registry_password:
# [string] (optional) Model container tag to download
gpu_validation_model_download_release: '' # Set to '' for TinyLlama
gpu_validation_model_download_repository_base: '' # Set to '' if downloading from HuggingFace

# [float] Performance thresholds
gpu_validation_model_perf_max_avg_time_per_tok: !!float "0.03"
gpu_validation_model_perf_max_avg_time_to_first_tok: !!float "0.3"

gpu_validation_nvidia_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/"

# [string] Container image to use for model serving
gpu_validation_workload_container_image: "docker.io/vllm/vllm-openai:latest" # "registry.redhat.io/rhaiis/vllm-cuda-rhel9:3.0.0" for RHAIIS
# [string] (optional) registry username if required for container image download
gpu_validation_workload_registry_username:
# [string] (optional) registry password if required for container image download
gpu_validation_workload_registry_password:
# [string] Cache directory path for model downloads
gpu_validation_workload_cache_dir: "/home/cloud-user/.cache/workload"
# [string] Cache mount path inside container
gpu_validation_workload_cache_mount_path: "/root/.cache/huggingface" # "/opt/app-root/src/.cache" for RHAIIS
# [string] Container Device options
gpu_validation_workload_device_opts: "--device nvidia.com/gpu=all"
# [string] Container security options
gpu_validation_workload_security_opts: "--security-opt=label=disable"
# [string] Container options
gpu_validation_workload_additional_opts: "--rm -it"
# [string] Container user namespace option
gpu_validation_workload_userns: # "--userns=keep-id:uid=1001" for RHAIIS
# [string] Shared memory size
gpu_validation_workload_shm_size: "--shm-size=4g"
# [string] Additional environment variables
gpu_validation_workload_additional_env: "--env=HF_HUB_OFFLINE=0 --env=VLLM_NO_USAGE_STATS=1"
4 changes: 4 additions & 0 deletions gpu-validation/tasks/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
become: true
ansible.builtin.import_tasks: setup.yaml

- name: Nvidia Setup
become: true
ansible.builtin.import_tasks: nvidia_setup.yaml

- name: Check GPUs
ansible.builtin.import_tasks: gpus.yaml
- name: GPUs Assertions # noqa: ignore-errors
Expand Down
20 changes: 11 additions & 9 deletions gpu-validation/tasks/model_download_and_serve.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,24 @@
---
# Derive the registry host that `podman login` should target from the workload
# image reference. An optional `docker://` transport prefix is removed first,
# then everything before the first `/` is kept, so both
# `docker://registry.example/ns/img:tag` and `registry.example/ns/img:tag`
# resolve to `registry.example`.
- name: Extract the repository host name
  ansible.builtin.set_fact:
    _gpu_validation_workload_registry_host: "{{ (gpu_validation_workload_container_image | regex_replace('^docker://', '')).split('/') | first }}"

- name: Log into registry if username is specified
ansible.builtin.command:
cmd: >
podman login --username {{ gpu_validation_model_download_registry_username | quote }} --password
{{ gpu_validation_model_download_registry_password | quote }} {{ gpu_validation_model_download_registry_host | quote }}
podman login --username {{ gpu_validation_workload_registry_username | quote }} --password
{{ gpu_validation_workload_registry_password | quote }} {{ _gpu_validation_workload_registry_host | quote }}
no_log: true
when: gpu_validation_model_download_registry_username
when: gpu_validation_workload_registry_username
changed_when: false

- name: Append --hf-token to model download args if specified
ansible.builtin.set_fact:
gpu_validation_model_download_token_args: --hf-token {{ gpu_validation_model_download_hf_token | quote }}
no_log: true
when: gpu_validation_model_download_hf_token
- name: Create model cache directory
ansible.builtin.file:
path: "{{ gpu_validation_workload_cache_dir }}"
state: directory
mode: "0755"
owner: cloud-user
group: cloud-user

- name: Create user systemd directory
ansible.builtin.file:
Expand Down
61 changes: 0 additions & 61 deletions gpu-validation/tasks/nvidia.yaml
Original file line number Diff line number Diff line change
@@ -1,65 +1,4 @@
---
- name: Check for installed nvidia-driver RPM
ansible.builtin.package_facts:
manager: rpm

- name: Check if nvidia-driver installation is needed
ansible.builtin.set_fact:
_install_nvidia_driver: "{{ 'nvidia-driver' not in ansible_facts.packages }}"

- name: Install nvidia-driver RPM if needed
when: _install_nvidia_driver
block:
- name: Blacklist nouveau kernel module
become: true
ansible.builtin.blockinfile:
block: |
blacklist nouveau
path: /etc/modprobe.d/blacklist.conf
create: true
mode: '0644'

- name: Remove nouveau kernel module if loaded
become: true
ansible.builtin.command: modprobe -r nouveau
failed_when: false
changed_when: false

- name: Add nvidia CUDA repo
become: true
ansible.builtin.yum_repository:
name: nvidia-cuda-rhel9
description: NVIDIA CUDA repo for RHEL 9
baseurl: "{{ gpu_validation_nvidia_repo_url }}/$basearch/"
gpgcheck: true
gpgkey: "{{ gpu_validation_nvidia_repo_url }}/$basearch/D42D0685.pub"

- name: Add EPEL repository for DKMS support
become: true
ansible.builtin.yum_repository:
name: epel
description: EPEL YUM repo
baseurl: https://download.fedoraproject.org/pub/epel/9/Everything/$basearch/
enabled: 1
gpgcheck: 1
gpgkey: https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-9

- name: Enable nvidia-driver RPM module
become: true
ansible.builtin.dnf:
name: "@nvidia-driver:latest-dkms"
state: present

- name: Install the nvidia driver
become: true
ansible.builtin.dnf:
name: cuda-drivers
state: present

- name: Refresh package facts after driver installation
ansible.builtin.package_facts:
manager: rpm

- name: Run nvidia-smi to list NVIDIA GPUs and count them
ansible.builtin.shell: set -o pipefail && nvidia-smi --list-gpus | wc -l
register: nvidia_gpu_count
Expand Down
21 changes: 21 additions & 0 deletions gpu-validation/tasks/nvidia_container_tools.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Installs the pinned NVIDIA Container Toolkit RPM and, when no CDI spec file
# is present yet, runs the nvidia-ctk configuration and CDI generation steps.

- name: Install NVIDIA Container Tools RPM
  ansible.builtin.dnf:
    state: present
    name: "nvidia-container-toolkit-{{ gpu_validation_libnvidia_container_toolkit_version_release }}"

# The generated spec file is used as the marker for "already configured":
# both commands in the block below are skipped once it exists.
- name: Check if CDI configfile exists
  ansible.builtin.stat:
    path: /etc/cdi/nvidia.yaml
  register: _cdi_config_file

- name: Configure NVIDIA container runtime
  when: not _cdi_config_file.stat.exists
  block:
    # NOTE(review): this configures containerd; confirm that is the runtime
    # the workload is actually launched with.
    - name: Configure NVIDIA container runtime
      ansible.builtin.command:
        cmd: nvidia-ctk runtime configure --runtime=containerd
      changed_when: true

    - name: Generate NVIDIA CDI configuration
      ansible.builtin.command:
        cmd: nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
      changed_when: true
55 changes: 55 additions & 0 deletions gpu-validation/tasks/nvidia_driver.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
---
# Installs the NVIDIA driver from the CUDA repository when the `nvidia-driver`
# RPM is not already present. All install steps live in one block gated on the
# package facts gathered at the top, so reruns on a prepared host skip them.

- name: Check for installed nvidia-driver RPM
  ansible.builtin.package_facts:
    manager: rpm

- name: Check if nvidia-driver installation is needed
  ansible.builtin.set_fact:
    _install_nvidia_driver: "{{ 'nvidia-driver' not in ansible_facts.packages }}"

- name: Install nvidia-driver RPM if needed
  when: _install_nvidia_driver
  block:
    - name: Blacklist nouveau kernel module
      ansible.builtin.blockinfile:
        path: /etc/modprobe.d/blacklist.conf
        create: true
        mode: "0644"
        block: |
          blacklist nouveau

    # Best effort: the result is ignored, so a module that is not loaded
    # (or cannot be removed) does not fail the play.
    - name: Remove nouveau kernel module if loaded
      ansible.builtin.command: modprobe -r nouveau
      failed_when: false
      changed_when: false

    # Trailing slashes are stripped from the configured repo URL before the
    # architecture path is appended, so a value with or without a final `/`
    # yields the same single-slash URL.
    - name: Add nvidia CUDA repo
      ansible.builtin.yum_repository:
        name: nvidia-cuda-rhel9
        description: NVIDIA CUDA repo for RHEL 9
        baseurl: "{{ gpu_validation_nvidia_repo_url | regex_replace('/+$', '') }}/$basearch/"
        gpgcheck: true
        gpgkey: "{{ gpu_validation_nvidia_repo_url | regex_replace('/+$', '') }}/$basearch/D42D0685.pub"

    - name: Add EPEL repository for DKMS support
      ansible.builtin.yum_repository:
        name: epel
        description: EPEL YUM repo
        baseurl: https://download.fedoraproject.org/pub/epel/9/Everything/$basearch/
        enabled: true
        gpgcheck: true
        gpgkey: https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-9

    - name: Enable nvidia-driver RPM module
      ansible.builtin.dnf:
        name: "@nvidia-driver:latest-dkms"
        state: present

    - name: Install the nvidia CUDA driver
      ansible.builtin.dnf:
        name: cuda-drivers
        state: present

    # Re-gather so later tasks see the packages installed above.
    - name: Refresh package facts after driver installation
      ansible.builtin.package_facts:
        manager: rpm
6 changes: 6 additions & 0 deletions gpu-validation/tasks/nvidia_setup.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# Entry point for NVIDIA host preparation: the driver tasks are imported
# first, followed by the container tooling tasks.

- name: Nvidia Driver
  ansible.builtin.import_tasks:
    file: nvidia_driver.yaml

- name: Nvidia Container Tools
  ansible.builtin.import_tasks:
    file: nvidia_container_tools.yaml
34 changes: 5 additions & 29 deletions gpu-validation/tasks/setup.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,40 +13,16 @@
name: pciutils
state: present

- name: Ensure python3.12 is installed
ansible.builtin.dnf:
name: python3.12
state: present

- name: Set up alternatives for python3
ansible.builtin.command:
cmd: "alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 120"
args:
creates: /etc/alternatives/python3

- name: Set python3 alternative to /usr/bin/python3.12
community.general.alternatives:
name: python3
path: /usr/bin/python3.12

- name: Set ansible to use python3.12
ansible.builtin.set_fact:
ansible_python_interpreter: /usr/bin/python3.12

- name: Ensure python3.12-setuptools is installed
ansible.builtin.dnf:
name: python3.12-setuptools
state: present

- name: Ensure python3.12-devel is installed
- name: Install pip package
ansible.builtin.dnf:
name: python3.12-devel
use_backend: dnf4
name: python-pip
state: present

- name: Install pip package
- name: Install podman package
ansible.builtin.dnf:
use_backend: dnf4
name: python3.12-pip
name: podman
state: present

- name: Copy scripts from local to RHEL AI machine
Expand Down
5 changes: 0 additions & 5 deletions gpu-validation/tasks/vllm.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
---
- name: PIP; install vllm
ansible.builtin.pip:
name: vllm
state: present

- ansible.builtin.import_tasks: vllm_config.yaml # noqa: name[missing]
- ansible.builtin.import_tasks: model_download_and_serve.yaml # noqa: name[missing]
- ansible.builtin.import_tasks: model_performance.yaml # noqa: name[missing]
Expand Down
12 changes: 12 additions & 0 deletions gpu-validation/tasks/vm_image.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
---
# Look the image up by name first; the result decides whether the download
# below runs at all.
- name: Check if image with the correct name already exists in Glance
  openstack.cloud.image_info:
    ca_cert: "{{ gpu_validation_ca_cert_path }}"
    name: "{{ gpu_validation_image_name }}"
  register: _gpu_validation_image_info

# True when the lookup above returned at least one image.
- name: Set fact if image exists
  ansible.builtin.set_fact:
    _gpu_validation_image_exists: "{{ _gpu_validation_image_info.images | length > 0 }}"

# The file lands in /tmp under the basename of the source URL.
- name: Download VM image
  when: not _gpu_validation_image_exists
  ansible.builtin.get_url:
    url: "{{ gpu_validation_image_url }}"
    dest: "/tmp/{{ gpu_validation_image_url | basename }}"
    mode: "0644"

- name: Create glance image from GPU validation download source
openstack.cloud.image:
Expand All @@ -13,6 +24,7 @@
filename: "/tmp/{{ gpu_validation_image_url | basename }}"
visibility: public
ca_cert: "{{ gpu_validation_ca_cert_path }}"
when: not _gpu_validation_image_exists

- name: Create flavor for GPU validation
openstack.cloud.compute_flavor:
Expand Down
1 change: 1 addition & 0 deletions gpu-validation/tasks/vm_net.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,4 @@
floating_ip_address: "{{ gpu_validation_floating_ip }}"
floating_network_id: "{{ public_network_info.networks[0].id }}"
ca_cert: "{{ gpu_validation_ca_cert_path }}"
when: create_floating_ip | bool
17 changes: 15 additions & 2 deletions gpu-validation/templates/vllm-serve.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,18 @@ Description=vllm model serve service
WantedBy=multi-user.target default.target

[Service]
ExecStart=/home/cloud-user/.local/bin/vllm serve {{ gpu_validation_model_name | quote }} --tensor-parallel-size {{ gpu_validation_num_gpus }} --port 8000
Restart=always
ExecStart=podman run \
{{ gpu_validation_workload_device_opts }} \
{{ gpu_validation_workload_security_opts }} \
{{ gpu_validation_workload_shm_size }} \
{{ gpu_validation_workload_userns }} \
{{ gpu_validation_workload_additional_opts }} \
-v {{ gpu_validation_workload_cache_dir }}:{{ gpu_validation_workload_cache_mount_path }}:Z \
{% if gpu_validation_model_download_hf_token %}
--env "HUGGING_FACE_HUB_TOKEN={{ gpu_validation_model_download_hf_token | quote }}" \
{% endif %}
{{ gpu_validation_workload_additional_env }} \
-p 8000:8000 \
{{ gpu_validation_workload_container_image }} \
--model {{ gpu_validation_model_name | quote }} \
--tensor-parallel-size {{ gpu_validation_num_gpus }}