Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
369 changes: 369 additions & 0 deletions .github/workflows/gcp-gpu-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,369 @@
---
# End-to-end GPU test on GCP: provision a GKE cluster with Terraform, install
# GPU tooling (GPU Operator or the standalone NVIDIA device plugin), install a
# DCGM exporter, deploy a sample GPU workload, verify that zxporter's
# Prometheus scrapes DCGM metrics, then tear the cluster down.
name: GCP GPU Test

on:
  push:
    branches:
      - garvit/gcp-gpu-test
  workflow_dispatch:
    inputs:
      gpu_install_type:
        description: 'GPU installation type'
        required: false
        default: 'nvidia-device-plugin'
        type: choice
        options:
          - gpu-operator
          - nvidia-device-plugin
      dcgm_install_type:
        description: 'DCGM install type'
        required: false
        default: 'devzero-dcgm'
        type: choice
        options:
          - nvidia-dcgm
          - devzero-dcgm
      cluster_version:
        description: 'Kubernetes cluster version'
        required: false
        default: '1.30'
        type: choice
        options:
          - '1.26'
          - '1.27'
          - '1.28'
          - '1.29'
          - '1.30'
          - '1.31'
          - '1.32'
          - '1.33'

# id-token: write is required for keyless OIDC auth to GCP
# (google-github-actions/auth with Workload Identity Federation).
permissions:
  id-token: write
  contents: read

jobs:
  apply-terraform:
    name: Apply Terraform
    runs-on: ubuntu-latest
    # Defaults mirror the workflow_dispatch input defaults so the push trigger
    # (which has no inputs) behaves like a default manual run.
    env:
      GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}

    outputs:
      job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}

    steps:
      - name: Validate Inputs
        run: |
          echo "GPU_INSTALL_TYPE=${GPU_INSTALL_TYPE}"
          echo "DCGM_INSTALL_TYPE=${DCGM_INSTALL_TYPE}"

          # The standalone device plugin ships no NVIDIA DCGM exporter, so it
          # can only be paired with the DevZero-managed DCGM install.
          if [[ "$GPU_INSTALL_TYPE" == "nvidia-device-plugin" && "$DCGM_INSTALL_TYPE" != "devzero-dcgm" ]]; then
            echo "Error: When GPU_INSTALL_TYPE is 'nvidia-device-plugin', DCGM_INSTALL_TYPE must be 'devzero-dcgm'."
            exit 1
          fi

      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: 'Authenticate to Google Cloud'
        id: 'auth'
        uses: 'google-github-actions/auth@v2'
        with:
          workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool'
          service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com'
          create_credentials_file: true
          export_environment_variables: true

      - name: Export Terraform-friendly environment variables
        run: |
          echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
          echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV

      - name: Generate Unique Job Identifier
        id: job-identifier
        shell: bash
        run: |
          # Cluster name / state prefix: short SHA plus a suffix encoding the
          # DCGM flavor (dd = devzero-dcgm, nd = nvidia-dcgm) so concurrent
          # variants of the same commit don't collide.
          SHORT_SHA=$(git rev-parse --short HEAD)
          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
            SUFFIX="dd"
          else
            SUFFIX="nd"
          fi
          JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}"
          echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV
          echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT

      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.11.3"

      - name: Apply Terraform
        working-directory: terraform/gcp
        run: |
          # Per-run GCS backend prefix so each test run keeps isolated state;
          # the destroy job recreates the same override to find this state.
          cat <<EOF > backend_override.tf
          terraform {
            backend "gcs" {
              bucket = "zxporter-tf-state"
              prefix = "${JOB_IDENTIFIER}/terraform.tfstate"
            }
          }
          EOF
          terraform init
          terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" -var='cluster_version=${{ env.CLUSTER_VERSION }}'

  install-and-validate:
    name: Install and Validate GPU Resources and ZXPorter
    runs-on: ubuntu-latest
    needs: apply-terraform
    env:
      GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}

    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: 'Authenticate to Google Cloud'
        id: 'auth'
        uses: 'google-github-actions/auth@v2'
        with:
          workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool'
          service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com'
          create_credentials_file: true
          export_environment_variables: true

      - name: 'Set up Cloud SDK'
        uses: 'google-github-actions/setup-gcloud@v2'
        with:
          version: '>= 363.0.0'

      - name: Install gke-gcloud-auth-plugin
        run: |
          echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
          curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
          sudo apt-get update
          sudo apt-get install -y google-cloud-sdk-gke-gcloud-auth-plugin

      - name: Configure Kubernetes Access
        run: |
          gcloud container clusters get-credentials ${{ needs.apply-terraform.outputs.job_identifier }} --zone us-central1 --project devzero-self-hosted

      - name: Check GPU Availability
        id: gpu_check
        run: |
          # Poll for up to 60s; GKE may advertise nvidia.com/gpu on its own.
          # A "false" result is not fatal here — it gates the install steps.
          echo "Checking GPU resources on nodes..."
          for i in {1..6}; do
            if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
              echo "GPU resources are available on the nodes."
              echo "GPU_CHECK=true" >> $GITHUB_ENV
              exit 0
            else
              echo "[$i/6] GPUs not available yet, retrying in 10s..."
              sleep 10
            fi
          done
          echo "❌ GPU check failed after retries."
          echo "GPU_CHECK=false" >> $GITHUB_ENV

      - name: Install GPU Operator (if needed)
        if: env.GPU_CHECK == 'false' && env.GPU_INSTALL_TYPE == 'gpu-operator'
        run: |
          echo "GPU resources not found, installing GPU Operator..."
          kubectl create ns gpu-operator
          kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite
          kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true
          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \
            helm repo update
          INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0"
          # When DevZero's DCGM exporter is used, disable the operator's own
          # exporter so the two don't double-export metrics.
          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
            INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false"
          fi
          echo "Running: $INSTALL_CMD"
          $INSTALL_CMD

      - name: Install Nvidia Device Plugin
        if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false'
        run: |
          echo "Installing Nvidia Device Plugin..."
          kubectl get nodes -o jsonpath='{.items[*].metadata.name}' | xargs -I {} kubectl label node {} nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite
          kubectl create ns nvidia-device-plugin
          kubectl apply -f nvidia-device-plugin-prereq
          helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
          helm repo update
          helm upgrade -i nvdp nvdp/nvidia-device-plugin \
            --namespace nvidia-device-plugin \
            --version 0.17.1

      - name: Check GPU Availability After Installing GPU Operator
        if: env.GPU_CHECK == 'false'
        run: |
          echo "Re-checking GPU resources on nodes after GPU Operator installation..."
          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
            echo "GPU resources are available on the nodes."
          else
            echo "GPU check failed after GPU Operator installation"
            exit 1
          fi

      - name: Check Nvidia DCGM DaemonSet
        id: dcgm_check
        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
        run: |
          echo "Checking if DCGM DaemonSet is installed..."
          if kubectl get daemonset -A | grep -q dcgm; then
            echo "Nvidia DCGM found, proceeding with validation."
          else
            echo "Nvidia DCGM not found."
            exit 1
          fi

      - name: Install DevZero DCGM
        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
        run: |
          echo "Installing DCGM Exporter..."
          kubectl create ns devzero-zxporter
          curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/gke.yml | kubectl apply -f -

      - name: Check DCGM DaemonSet After Installing DCGM Exporter
        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
        run: |
          echo "Re-checking DCGM pods after DCGM Exporter installation..."
          if kubectl get daemonset -A | grep -q dcgm; then
            echo "DCGM DaemonSet is running."
          else
            echo "DCGM DaemonSet not running after installation"
            exit 1
          fi

      - name: Verify DCGM Pods and Prometheus Annotations
        run: |
          NAMESPACE="devzero-zxporter"
          if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then
            NAMESPACE="gpu-operator"
          fi
          kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s
          echo "Verifying DCGM pods and Prometheus annotations..."
          kubectl get pods -A | grep dcgm-exporter | awk '
            BEGIN { all_running = 1; pod_count = 0 }
            {
              pod_count++
              status = $4
              printf "Pod: %s/%s - Status: %s\n", $1, $2, status
              if (status != "Running") all_running = 0
            }
            END {
              printf "\nTotal Pods: %d\n", pod_count
              printf "All Running: %s\n", (all_running ? "true" : "false")
            }'
          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done

      - name: Install and Verify DeepSeek Workload
        run: |
          kubectl create ns deepseek
          kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml

          kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s
          pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}')

          if [[ -n "$pod_status" ]]; then
            echo "Pods are not in Running state. Failing the pipeline."
            exit 1
          else
            echo "All pods are running successfully."
          fi

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.22'
          cache: true

      - name: Install ZXPorter
        run: |
          # Ephemeral image on ttl.sh (anonymous registry, 2h TTL) — no
          # registry credentials needed for a throwaway CI image.
          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
          make docker-build docker-push IMG=${ZXPORTER_IMG}
          make deploy IMG=${ZXPORTER_IMG}

          echo "Waiting for ZXPorter pods to be ready..."
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s

      - name: Test ZXPorter with Prometheus
        run: |
          kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter > pf.log 2>&1 &
          PF_PID=$!
          sleep 20
          MAX_RETRIES=6
          for i in $(seq 1 $MAX_RETRIES); do
            if curl -s "http://localhost:9090/-/ready" >/dev/null; then
              echo "Prometheus port-forward is ready."
              break
            fi
            echo "[$i/$MAX_RETRIES] Waiting for Prometheus to become ready..."
            sleep 5
          done

          # Give Prometheus one more scrape interval to collect DCGM metrics.
          sleep 20

          result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
          kill $PF_PID || true

          echo "Metric found: $result"
          # Quote "[]" — unquoted it is a (fragile) glob pattern in [[ == ]].
          # jq -r prints "null" when .data.result is missing, so treat that
          # as a failure too.
          if [[ -z "$result" || "$result" == "[]" || "$result" == "null" ]]; then
            echo "❌ DCGM_FI_DEV_SM_CLOCK metric not found!"
            echo "Port-forward log:"
            cat pf.log
            exit 1
          fi

  destroy-terraform:
    name: Destroy Terraform
    runs-on: ubuntu-latest
    env:
      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}

    # Always tear down the cluster, even when provisioning or validation
    # failed, to avoid leaking paid GPU infrastructure.
    if: always()
    needs:
      - apply-terraform
      - install-and-validate

    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: 'Authenticate to Google Cloud'
        id: 'auth'
        uses: 'google-github-actions/auth@v2'
        with:
          workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool'
          service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com'
          create_credentials_file: true
          export_environment_variables: true

      - name: Export Terraform-friendly environment variables
        run: |
          echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
          echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV

      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.11.3"

      - name: Destroy Infrastructure
        working-directory: terraform/gcp
        run: |
          # Must match the backend prefix written by apply-terraform so the
          # destroy operates on the same remote state.
          cat <<EOF > backend_override.tf
          terraform {
            backend "gcs" {
              bucket = "zxporter-tf-state"
              prefix = "${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate"
            }
          }
          EOF
          terraform init
          terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" -var='cluster_version=${{ env.CLUSTER_VERSION }}'
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,6 @@ config/**/charts
*.swp
*.swo
*~

# Terraform working directories, lock/plugin caches, and local state files
# (created when running terraform/gcp locally) must never be committed.
.terraform*
*.tfstate*
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,11 @@ help: ## Display this help.

.PHONY: manifests
manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases -w

.PHONY: generate
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
	$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." -w

.PHONY: fmt
fmt: ## Run go fmt against code.
Expand Down
Loading
Loading