# Review notes (text ahead of the first "diff --git" header is skipped by git apply).
#
# Purpose of the patch: serve the validation model from a podman container
# (vLLM image) instead of a pip-installed vllm, split the NVIDIA driver setup
# into its own task files, add NVIDIA container toolkit / CDI setup, and skip
# the Glance image download/upload when the image already exists.
#
# Reconstruction: line structure and YAML indentation were restored from a
# whitespace-collapsed copy. Every hunk's old/new line counts match its @@
# header, but the exact leading whitespace of context lines (and spacing before
# inline comments) should be confirmed against the tree before applying.
#
# Added lines changed during review (blob hashes on the index lines are
# therefore stale; hunk line counts are unchanged):
#   * defaults/main.yaml: gpu_validation_workload_additional_opts defaults to
#     "--rm" because a systemd unit has no terminal for -it to attach to, and
#     gpu_validation_workload_userns is "" instead of a bare null because it is
#     interpolated straight into the ExecStart line.
#   * tasks/model_download_and_serve.yaml: the registry-host regex now also
#     accepts image references without a docker:// prefix (such as the default
#     docker.io/... image); previously such a reference passed through
#     unchanged and was handed to podman login in full.
#   * tasks/vm_net.yaml: create_floating_ip falls back to true when undefined,
#     which keeps the pre-patch (unconditional) behaviour. NOTE(review): the
#     variable has no default in the visible hunks - confirm where it is set.

diff --git a/.gitignore b/.gitignore
index 5fd36b1..05d7ca4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 .venv
 *.pyc
+.ansible
+.vscode
 tls-ca-bundle.pem
 creds.yaml
 inventory-local
-main-local.yaml
+my-AnsibleTestCR.yaml
diff --git a/gpu-validation/defaults/main.yaml b/gpu-validation/defaults/main.yaml
index 2a3d86f..f651821 100644
--- a/gpu-validation/defaults/main.yaml
+++ b/gpu-validation/defaults/main.yaml
@@ -10,11 +10,11 @@ gpu_validation_image_name: gpu-validation
 # [string] Flavor to use when creating the VM
 gpu_validation_flavor_name: nvidia
 # [string] RAM value for the flavor
-gpu_validation_flavor_ram: 40960
+gpu_validation_flavor_ram: 16384
 # [string] CPU value for the flavor
-gpu_validation_flavor_vcpus: 8
+gpu_validation_flavor_vcpus: 4
 # [string] Disk value for the flavor
-gpu_validation_flavor_disk: 20
+gpu_validation_flavor_disk: 80
 # [string] Number of GPUs for the flavor
 gpu_validation_flavor_gpus: 1
 
@@ -41,19 +41,39 @@ gpu_validation_dns_server: 192.168.122.1
 gpu_validation_pci_devices:
   10de:27b8: 1
 
+# [string] libnvidia-container-toolkit version
+gpu_validation_libnvidia_container_toolkit_version_release: 1.17.8-1
+
 gpu_validation_model_tests_enabled: true # Can be disabled for lighter CUDA-only sanity testing
-gpu_validation_model_name: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+gpu_validation_model_name: TinyLlama/TinyLlama-1.1B-Chat-v1.0 # RedHatAI/Llama-3.2-1B-Instruct-FP8 for RHAIIS
 # [string] (optional) HuggingFace token if required for download
 gpu_validation_model_download_hf_token:
-# [string] (optional) Model registry username if required for download
-gpu_validation_model_download_registry_username:
-# [string] (optional) Model registry password if required for download
-gpu_validation_model_download_registry_password:
-# [string] (optional) Model container tag to download
-gpu_validation_model_download_release: '' # Set to '' for TinyLlama
-gpu_validation_model_download_repository_base: '' # Set to '' if downloading from HuggingFace
+
 # [float] Performance threshholds
 gpu_validation_model_perf_max_avg_time_per_tok: !!float "0.03"
 gpu_validation_model_perf_max_avg_time_to_first_tok: !!float "0.3"
 
 gpu_validation_nvidia_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/"
+
+# [string] Container image to use for model serving
+gpu_validation_workload_container_image: "docker.io/vllm/vllm-openai:latest" # "registry.redhat.io/rhaiis/vllm-cuda-rhel9:3.0.0" for RHAIIS
+# [string] (optional) registry username if required for container image download
+gpu_validation_workload_registry_username:
+# [string] (optional) registry password if required for container image download
+gpu_validation_workload_registry_password:
+# [string] Cache directory path for model downloads
+gpu_validation_workload_cache_dir: "/home/cloud-user/.cache/workload"
+# [string] Cache mount path inside container
+gpu_validation_workload_cache_mount_path: "/root/.cache/huggingface" # "/opt/app-root/src/.cache" for RHAIIS
+# [string] Container Device options
+gpu_validation_workload_device_opts: "--device nvidia.com/gpu=all"
+# [string] Container security options
+gpu_validation_workload_security_opts: "--security-opt=label=disable"
+# [string] Container options (no -it: the systemd unit runs without a terminal)
+gpu_validation_workload_additional_opts: "--rm"
+# [string] Container user namespace option (empty string = option omitted)
+gpu_validation_workload_userns: "" # "--userns=keep-id:uid=1001" for RHAIIS
+# [string] Shared memory size
+gpu_validation_workload_shm_size: "--shm-size=4g"
+# [string] Additional environment variables
+gpu_validation_workload_additional_env: "--env=HF_HUB_OFFLINE=0 --env=VLLM_NO_USAGE_STATS=1"
diff --git a/gpu-validation/tasks/main.yaml b/gpu-validation/tasks/main.yaml
index 328c385..10a949c 100644
--- a/gpu-validation/tasks/main.yaml
+++ b/gpu-validation/tasks/main.yaml
@@ -3,6 +3,10 @@
   become: true
   ansible.builtin.import_tasks: setup.yaml
 
+- name: Nvidia Setup
+  become: true
+  ansible.builtin.import_tasks: nvidia_setup.yaml
+
 - name: Check GPUs
   ansible.builtin.import_tasks: gpus.yaml
 - name: GPUs Assertions # noqa: ignore-errors
diff --git a/gpu-validation/tasks/model_download_and_serve.yaml b/gpu-validation/tasks/model_download_and_serve.yaml
index d3e0c0e..fa802df 100644
--- a/gpu-validation/tasks/model_download_and_serve.yaml
+++ b/gpu-validation/tasks/model_download_and_serve.yaml
@@ -1,22 +1,24 @@
 ---
 - name: Extract the repository host name
   ansible.builtin.set_fact:
-    gpu_validation_model_download_registry_host: "{{ gpu_validation_model_download_repository_base | regex_replace('docker://(.*?)/.*', '\\1') }}"
+    _gpu_validation_workload_registry_host: "{{ gpu_validation_workload_container_image | regex_replace('^(?:docker://)?([^/]+)/.*$', '\\1') }}"
 
 - name: Log into registry if username is specified
   ansible.builtin.command:
     cmd: >
-      podman login --username {{ gpu_validation_model_download_registry_username | quote }} --password
-      {{ gpu_validation_model_download_registry_password | quote }} {{ gpu_validation_model_download_registry_host | quote }}
+      podman login --username {{ gpu_validation_workload_registry_username | quote }} --password
+      {{ gpu_validation_workload_registry_password | quote }} {{ _gpu_validation_workload_registry_host | quote }}
   no_log: true
-  when: gpu_validation_model_download_registry_username
+  when: gpu_validation_workload_registry_username
   changed_when: false
 
-- name: Append --hf-token to model download args if specified
-  ansible.builtin.set_fact:
-    gpu_validation_model_download_token_args: --hf-token {{ gpu_validation_model_download_hf_token | quote }}
-  no_log: true
-  when: gpu_validation_model_download_hf_token
+- name: Create model cache directory
+  ansible.builtin.file:
+    path: "{{ gpu_validation_workload_cache_dir }}"
+    state: directory
+    mode: "0755"
+    owner: cloud-user
+    group: cloud-user
 
 - name: Create user systemd directory
   ansible.builtin.file:
diff --git a/gpu-validation/tasks/nvidia.yaml b/gpu-validation/tasks/nvidia.yaml
index 0936baf..9b3bf05 100644
--- a/gpu-validation/tasks/nvidia.yaml
+++ b/gpu-validation/tasks/nvidia.yaml
@@ -1,65 +1,4 @@
 ---
-- name: Check for installed nvidia-driver RPM
-  ansible.builtin.package_facts:
-    manager: rpm
-
-- name: Check if nvidia-driver installation is needed
-  ansible.builtin.set_fact:
-    _install_nvidia_driver: "{{ 'nvidia-driver' not in ansible_facts.packages }}"
-
-- name: Install nvidia-driver RPM if needed
-  when: _install_nvidia_driver
-  block:
-    - name: Blacklist nouveau kernel module
-      become: true
-      ansible.builtin.blockinfile:
-        block: |
-          blacklist nouveau
-        path: /etc/modprobe.d/blacklist.conf
-        create: true
-        mode: '0644'
-
-    - name: Remove nouveau kernel module if loaded
-      become: true
-      ansible.builtin.command: modprobe -r nouveau
-      failed_when: false
-      changed_when: false
-
-    - name: Add nvidia CUDA repo
-      become: true
-      ansible.builtin.yum_repository:
-        name: nvidia-cuda-rhel9
-        description: NVIDIA CUDA repo for RHEL 9
-        baseurl: "{{ gpu_validation_nvidia_repo_url }}/$basearch/"
-        gpgcheck: true
-        gpgkey: "{{ gpu_validation_nvidia_repo_url }}/$basearch/D42D0685.pub"
-
-    - name: Add EPEL repository for DKMS support
-      become: true
-      ansible.builtin.yum_repository:
-        name: epel
-        description: EPEL YUM repo
-        baseurl: https://download.fedoraproject.org/pub/epel/9/Everything/$basearch/
-        enabled: 1
-        gpgcheck: 1
-        gpgkey: https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-9
-
-    - name: Enable nvidia-driver RPM module
-      become: true
-      ansible.builtin.dnf:
-        name: "@nvidia-driver:latest-dkms"
-        state: present
-
-    - name: Install the nvidia driver
-      become: true
-      ansible.builtin.dnf:
-        name: cuda-drivers
-        state: present
-
-    - name: Refresh package facts after driver installation
-      ansible.builtin.package_facts:
-        manager: rpm
-
 - name: Run nvidia-smi to list NVIDIA GPUs and count them
   ansible.builtin.shell: set -o pipefail && nvidia-smi --list-gpus | wc -l
   register: nvidia_gpu_count
diff --git a/gpu-validation/tasks/nvidia_container_tools.yaml b/gpu-validation/tasks/nvidia_container_tools.yaml
new file mode 100644
index 0000000..4011ec5
--- /dev/null
+++ b/gpu-validation/tasks/nvidia_container_tools.yaml
@@ -0,0 +1,21 @@
+---
+- name: Install NVIDIA Container Tools RPM
+  ansible.builtin.dnf:
+    name: "nvidia-container-toolkit-{{ gpu_validation_libnvidia_container_toolkit_version_release }}"
+    state: present
+
+- name: Check if CDI configfile exists
+  ansible.builtin.stat:
+    path: /etc/cdi/nvidia.yaml
+  register: _cdi_config_file
+
+- name: Configure NVIDIA container runtime
+  when: not _cdi_config_file.stat.exists
+  block:
+    - name: Configure NVIDIA container runtime
+      ansible.builtin.command: nvidia-ctk runtime configure --runtime=containerd
+      changed_when: true
+
+    - name: Generate NVIDIA CDI configuration
+      ansible.builtin.command: nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
+      changed_when: true
diff --git a/gpu-validation/tasks/nvidia_driver.yaml b/gpu-validation/tasks/nvidia_driver.yaml
new file mode 100644
index 0000000..1330577
--- /dev/null
+++ b/gpu-validation/tasks/nvidia_driver.yaml
@@ -0,0 +1,55 @@
+---
+- name: Check for installed nvidia-driver RPM
+  ansible.builtin.package_facts:
+    manager: rpm
+
+- name: Check if nvidia-driver installation is needed
+  ansible.builtin.set_fact:
+    _install_nvidia_driver: "{{ 'nvidia-driver' not in ansible_facts.packages }}"
+
+- name: Install nvidia-driver RPM if needed
+  when: _install_nvidia_driver
+  block:
+    - name: Blacklist nouveau kernel module
+      ansible.builtin.blockinfile:
+        block: |
+          blacklist nouveau
+        path: /etc/modprobe.d/blacklist.conf
+        mode: "0644"
+        create: true
+
+    - name: Remove nouveau kernel module if loaded
+      ansible.builtin.command: modprobe -r nouveau
+      failed_when: false
+      changed_when: false
+
+    - name: Add nvidia CUDA repo
+      ansible.builtin.yum_repository:
+        name: nvidia-cuda-rhel9
+        description: NVIDIA CUDA repo for RHEL 9
+        baseurl: "{{ gpu_validation_nvidia_repo_url }}/$basearch/"
+        gpgcheck: true
+        gpgkey: "{{ gpu_validation_nvidia_repo_url }}/$basearch/D42D0685.pub"
+
+    - name: Add EPEL repository for DKMS support
+      ansible.builtin.yum_repository:
+        name: epel
+        description: EPEL YUM repo
+        baseurl: https://download.fedoraproject.org/pub/epel/9/Everything/$basearch/
+        enabled: 1
+        gpgcheck: 1
+        gpgkey: https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-9
+
+    - name: Enable nvidia-driver RPM module
+      ansible.builtin.dnf:
+        name: "@nvidia-driver:latest-dkms"
+        state: present
+
+    - name: Install the nvidia CUDA driver
+      ansible.builtin.dnf:
+        name: cuda-drivers
+        state: present
+
+    - name: Refresh package facts after driver installation
+      ansible.builtin.package_facts:
+        manager: rpm
diff --git a/gpu-validation/tasks/nvidia_setup.yaml b/gpu-validation/tasks/nvidia_setup.yaml
new file mode 100644
index 0000000..7f8b1da
--- /dev/null
+++ b/gpu-validation/tasks/nvidia_setup.yaml
@@ -0,0 +1,6 @@
+---
+- name: Nvidia Driver
+  ansible.builtin.import_tasks: nvidia_driver.yaml
+
+- name: Nvidia Container Tools
+  ansible.builtin.import_tasks: nvidia_container_tools.yaml
diff --git a/gpu-validation/tasks/setup.yaml b/gpu-validation/tasks/setup.yaml
index 178838c..18e6bd5 100644
--- a/gpu-validation/tasks/setup.yaml
+++ b/gpu-validation/tasks/setup.yaml
@@ -13,40 +13,16 @@
     name: pciutils
     state: present
 
-- name: Ensure python3.12 is installed
-  ansible.builtin.dnf:
-    name: python3.12
-    state: present
-
-- name: Set up alternatives for python3
-  ansible.builtin.command:
-    cmd: "alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 120"
-  args:
-    creates: /etc/alternatives/python3
-
-- name: Set python3 alternative to /usr/bin/python3.12
-  community.general.alternatives:
-    name: python3
-    path: /usr/bin/python3.12
-
-- name: Set ansible to use python3.12
-  ansible.builtin.set_fact:
-    ansible_python_interpreter: /usr/bin/python3.12
-
-- name: Ensure python3.12-setuptools is installed
-  ansible.builtin.dnf:
-    name: python3.12-setuptools
-    state: present
-
-- name: Ensure python3.12-devel is installed
+- name: Install pip package
   ansible.builtin.dnf:
-    name: python3.12-devel
+    use_backend: dnf4
+    name: python-pip
     state: present
 
-- name: Install pip package
+- name: Install podman package
   ansible.builtin.dnf:
     use_backend: dnf4
-    name: python3.12-pip
+    name: podman
     state: present
 
 - name: Copy scripts from local to RHEL AI machine
diff --git a/gpu-validation/tasks/vllm.yaml b/gpu-validation/tasks/vllm.yaml
index 1eb9de7..efbc3bf 100644
--- a/gpu-validation/tasks/vllm.yaml
+++ b/gpu-validation/tasks/vllm.yaml
@@ -1,9 +1,4 @@
 ---
-- name: PIP; install vllm
-  ansible.builtin.pip:
-    name: vllm
-    state: present
-
 - ansible.builtin.import_tasks: vllm_config.yaml # noqa: name[missing]
 - ansible.builtin.import_tasks: model_download_and_serve.yaml # noqa: name[missing]
 - ansible.builtin.import_tasks: model_performance.yaml # noqa: name[missing]
diff --git a/gpu-validation/tasks/vm_image.yaml b/gpu-validation/tasks/vm_image.yaml
index 14172f8..878331e 100644
--- a/gpu-validation/tasks/vm_image.yaml
+++ b/gpu-validation/tasks/vm_image.yaml
@@ -1,9 +1,20 @@
 ---
+- name: Check if image with the correct name already exists in Glance
+  openstack.cloud.image_info:
+    name: "{{ gpu_validation_image_name }}"
+    ca_cert: "{{ gpu_validation_ca_cert_path }}"
+  register: _gpu_validation_image_info
+
+- name: Set fact if image exists
+  ansible.builtin.set_fact:
+    _gpu_validation_image_exists: "{{ (_gpu_validation_image_info.images | length) > 0 }}"
+
 - name: Download VM image
   ansible.builtin.get_url:
     url: "{{ gpu_validation_image_url }}"
     dest: "/tmp/{{ gpu_validation_image_url | basename }}"
     mode: '0644'
+  when: not _gpu_validation_image_exists
 
 - name: Create glance image from GPU validation download source
   openstack.cloud.image:
@@ -13,6 +24,7 @@
     filename: "/tmp/{{ gpu_validation_image_url | basename }}"
     visibility: public
     ca_cert: "{{ gpu_validation_ca_cert_path }}"
+  when: not _gpu_validation_image_exists
 
 - name: Create flavor for GPU validation
   openstack.cloud.compute_flavor:
diff --git a/gpu-validation/tasks/vm_net.yaml b/gpu-validation/tasks/vm_net.yaml
index c1b6898..ac086ef 100644
--- a/gpu-validation/tasks/vm_net.yaml
+++ b/gpu-validation/tasks/vm_net.yaml
@@ -84,3 +84,4 @@
     floating_ip_address: "{{ gpu_validation_floating_ip }}"
     floating_network_id: "{{ public_network_info.networks[0].id }}"
     ca_cert: "{{ gpu_validation_ca_cert_path }}"
+  when: create_floating_ip | default(true) | bool
diff --git a/gpu-validation/templates/vllm-serve.service.j2 b/gpu-validation/templates/vllm-serve.service.j2
index fb7ae7a..6def836 100644
--- a/gpu-validation/templates/vllm-serve.service.j2
+++ b/gpu-validation/templates/vllm-serve.service.j2
@@ -5,5 +5,18 @@ Description=vllm model serve service
 WantedBy=multi-user.target default.target
 
 [Service]
-ExecStart=/home/cloud-user/.local/bin/vllm serve {{ gpu_validation_model_name | quote }} --tensor-parallel-size {{ gpu_validation_num_gpus }} --port 8000
-Restart=always
+ExecStart=podman run \
+  {{ gpu_validation_workload_device_opts }} \
+  {{ gpu_validation_workload_security_opts }} \
+  {{ gpu_validation_workload_shm_size }} \
+  {{ gpu_validation_workload_userns }} \
+  {{ gpu_validation_workload_additional_opts }} \
+  -v {{ gpu_validation_workload_cache_dir }}:{{ gpu_validation_workload_cache_mount_path }}:Z \
+{% if gpu_validation_model_download_hf_token %}
+  --env "HUGGING_FACE_HUB_TOKEN={{ gpu_validation_model_download_hf_token | quote }}" \
+{% endif %}
+  {{ gpu_validation_workload_additional_env }} \
+  -p 8000:8000 \
+  {{ gpu_validation_workload_container_image }} \
+  --model {{ gpu_validation_model_name | quote }} \
+  --tensor-parallel-size {{ gpu_validation_num_gpus }}