From 76552b67b7cf6faf66f6376a38efec8d28234afa Mon Sep 17 00:00:00 2001 From: Mesut Oezdil Date: Tue, 31 Mar 2026 22:27:15 +0200 Subject: [PATCH 1/7] docs: add quick start guide to verify HAMi in Kubernetes Signed-off-by: Mesut Oezdil --- docs/get-started/verify-hami.md | 123 ++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 docs/get-started/verify-hami.md diff --git a/docs/get-started/verify-hami.md b/docs/get-started/verify-hami.md new file mode 100644 index 00000000..76e111a6 --- /dev/null +++ b/docs/get-started/verify-hami.md @@ -0,0 +1,123 @@ +--- +id: verify-hami +title: Verify HAMi (Quick Start) +sidebar_label: Verify HAMi +--- + +# Verify HAMi (Quick Start) + +This guide provides a rapid, end-to-end setup to verify that GPU workloads run correctly in a Kubernetes cluster with HAMi. + +What "working" actually means: A successful HAMi setup goes beyond just running pods or a successful Helm installation. It means the GPU is accessible inside a container, Kubernetes correctly advertises the resources, and vGPU isolation (like memory limits) behaves predictably. + +## Step 0: Configure Node Container Runtime (If not already done) +HAMi requires the `nvidia-container-toolkit` to be installed and set as the default low-level runtime on all your GPU nodes. + +### 1. Install nvidia-container-toolkit (Debian/Ubuntu example) +``` +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list \ + | sudo tee /etc/apt/sources.list.d/libnvidia-container.list +curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add - +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit +``` + +### 2. Configure your runtime +* For containerd: Edit `/etc/containerd/config.toml` to set the default runtime name to `"nvidia"` and the binary name to `"/usr/bin/nvidia-container-runtime"`. 
+ * Restart: `sudo systemctl daemon-reload && sudo systemctl restart containerd` +* For Docker: Edit `/etc/docker/daemon.json` to set `"default-runtime": "nvidia"`. + * Restart: `sudo systemctl daemon-reload && sudo systemctl restart docker` + +## Step 1: Validate the Native GPU Stack (Crucial Pre-flight Check) +Before installing HAMi, you must prove that Kubernetes can natively access the GPU. + +This step validates your GPU stack independently of HAMi. + +### 1. Deploy a native test pod +``` +cat < Date: Thu, 2 Apr 2026 09:13:49 +0200 Subject: [PATCH 2/7] docs: add verify-hami to v2.8.0 docs --- .../version-v2.8.0}/get-started/verify-hami.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {docs => versioned_docs/version-v2.8.0}/get-started/verify-hami.md (100%) diff --git a/docs/get-started/verify-hami.md b/versioned_docs/version-v2.8.0/get-started/verify-hami.md similarity index 100% rename from docs/get-started/verify-hami.md rename to versioned_docs/version-v2.8.0/get-started/verify-hami.md From 5fc9f2368fdafb9ff3d918e149bb1ce2b9d9b45d Mon Sep 17 00:00:00 2001 From: Mesut Oezdil Date: Thu, 2 Apr 2026 09:15:29 +0200 Subject: [PATCH 3/7] docs: add verify-hami to versioned sidebar --- versioned_sidebars/version-v2.8.0-sidebars.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/versioned_sidebars/version-v2.8.0-sidebars.json b/versioned_sidebars/version-v2.8.0-sidebars.json index f71d1f0e..40946815 100644 --- a/versioned_sidebars/version-v2.8.0-sidebars.json +++ b/versioned_sidebars/version-v2.8.0-sidebars.json @@ -21,7 +21,8 @@ "type": "category", "label": "Get Started", "items": [ - "get-started/deploy-with-helm" + "get-started/deploy-with-helm", + "get-started/verify-hami" ] }, { From 56564392201bac0d7a990a0ef8fbda507f6279b5 Mon Sep 17 00:00:00 2001 From: Mesut Oezdil Date: Thu, 2 Apr 2026 09:38:42 +0200 Subject: [PATCH 4/7] fix: remove id to match sidebar path --- versioned_docs/version-v2.8.0/get-started/verify-hami.md | 1 - 1 file 
changed, 1 deletion(-) diff --git a/versioned_docs/version-v2.8.0/get-started/verify-hami.md b/versioned_docs/version-v2.8.0/get-started/verify-hami.md index 76e111a6..d44ec953 100644 --- a/versioned_docs/version-v2.8.0/get-started/verify-hami.md +++ b/versioned_docs/version-v2.8.0/get-started/verify-hami.md @@ -1,5 +1,4 @@ --- -id: verify-hami title: Verify HAMi (Quick Start) sidebar_label: Verify HAMi --- From d439e2306053aae514863f5652bcb1a7107133f9 Mon Sep 17 00:00:00 2001 From: Mesut Oezdil Date: Thu, 2 Apr 2026 10:27:59 +0200 Subject: [PATCH 5/7] docs: improve title to better reflect validation scope --- versioned_docs/version-v2.8.0/get-started/verify-hami.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/versioned_docs/version-v2.8.0/get-started/verify-hami.md b/versioned_docs/version-v2.8.0/get-started/verify-hami.md index d44ec953..5147f6ef 100644 --- a/versioned_docs/version-v2.8.0/get-started/verify-hami.md +++ b/versioned_docs/version-v2.8.0/get-started/verify-hami.md @@ -1,11 +1,11 @@ --- -title: Verify HAMi (Quick Start) -sidebar_label: Verify HAMi +title: Validate HAMi Setup and vGPU Behavior +sidebar_label: Validate HAMi --- -# Verify HAMi (Quick Start) +# Validate HAMi Setup and vGPU Behavior -This guide provides a rapid, end-to-end setup to verify that GPU workloads run correctly in a Kubernetes cluster with HAMi. +This guide provides a rapid, end-to-end setup to validate that GPU workloads run correctly in a Kubernetes cluster with HAMi. What "working" actually means: A successful HAMi setup goes beyond just running pods or a successful Helm installation. It means the GPU is accessible inside a container, Kubernetes correctly advertises the resources, and vGPU isolation (like memory limits) behaves predictably. 
From 2cb6c02952dac7193bb2b4064309977cb99fbab8 Mon Sep 17 00:00:00 2001 From: Mesut Oezdil Date: Thu, 2 Apr 2026 11:12:07 +0200 Subject: [PATCH 6/7] docs: clarify scope and fix wording --- .../version-v2.8.0/get-started/verify-hami.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/versioned_docs/version-v2.8.0/get-started/verify-hami.md b/versioned_docs/version-v2.8.0/get-started/verify-hami.md index 5147f6ef..4c8f8c9f 100644 --- a/versioned_docs/version-v2.8.0/get-started/verify-hami.md +++ b/versioned_docs/version-v2.8.0/get-started/verify-hami.md @@ -2,12 +2,15 @@ title: Validate HAMi Setup and vGPU Behavior sidebar_label: Validate HAMi --- - # Validate HAMi Setup and vGPU Behavior -This guide provides a rapid, end-to-end setup to validate that GPU workloads run correctly in a Kubernetes cluster with HAMi. +## Scope and Assumptions + +This guide assumes that HAMi is already installed (for example, via the "Deploy HAMi using Helm" guide in the Get Started section). -What "working" actually means: A successful HAMi setup goes beyond just running pods or a successful Helm installation. It means the GPU is accessible inside a container, Kubernetes correctly advertises the resources, and vGPU isolation (like memory limits) behaves predictably. +The goal of this document is not to repeat installation steps, but to validate that HAMi is working correctly in a real Kubernetes environment, including GPU access and vGPU behavior. + +If HAMi is not yet installed, please follow the deployment guide first. ## Step 0: Configure Node Container Runtime (If not already done) HAMi requires the `nvidia-container-toolkit` to be installed and set as the default low-level runtime on all your GPU nodes. @@ -59,8 +62,10 @@ kubectl logs cuda-test ``` Note: You must see the standard `nvidia-smi` output. Do not proceed if this fails. 
-## Step 2: Install HAMi -Once the baseline is verified, label your node so the HAMi scheduler can manage it, and deploy via Helm. +## Step 2: Verify HAMi Installation +Once the baseline is verified, ensure that HAMi is installed and its components are running correctly. + +If you have already deployed HAMi, you can skip the installation command and only verify that the components are running. ### 1. Label the node ``` From 3872bf10ce5016a6dc64621a7d5519b46d887995 Mon Sep 17 00:00:00 2001 From: Mesut Oezdil Date: Thu, 2 Apr 2026 11:13:10 +0200 Subject: [PATCH 7/7] trigger rebuild