From ac1780caaa6c3b08eb2749a9015677eaa37e8978 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 1 Jun 2025 14:38:29 +0530
Subject: [PATCH 01/44] ci: add EKS GPU metrics test workflow and Terraform config

---
 .github/workflows/aws-gpu-test.yaml | 215 ++++++++++++++++++++++++++++
 .gitignore                          |   5 +
 terraform/aws/main.tf               |  57 ++++++++
 terraform/aws/terraform.tfvars      |   2 +
 terraform/aws/variables.tf          |   9 ++
 5 files changed, 288 insertions(+)
 create mode 100644 .github/workflows/aws-gpu-test.yaml
 create mode 100644 terraform/aws/main.tf
 create mode 100644 terraform/aws/terraform.tfvars
 create mode 100644 terraform/aws/variables.tf
diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
new file mode 100644
index 00000000..048ffa69
--- /dev/null
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -0,0 +1,215 @@
+name: AWS Terraform EKS Setup and ZXPorter Installation
+
+on:
+  push:
+    branches:
+      - garvit/aws-gpu-test
+  workflow_dispatch:
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  setup-eks:
+    name: Setup EKS and Install ZXPorter
+    runs-on: ubuntu-latest
+
+    outputs:
+      job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Configure AWS Credential
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
+          aws-region: us-east-1
+
+      - name: Generate Unique Job Identifier
+        id: job-identifier
+        shell: bash
+        run: |
+          SHORT_SHA=$(git rev-parse --short HEAD)
+          JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}"
+          echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV
+          echo "::set-output name=job_identifier::${JOB_IDENTIFIER}"
+
+      - name: Set up Terraform
+        uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: 1.5.7
+
+      - name: Apply Terraform
+        working-directory: terraform/aws
+        run: |
+          cat <<EOF > backend_override.tf
+          terraform {
+            backend "s3" {
+                bucket         	   = "zxporter-tf-state"
+                key              	 = "${JOB_IDENTIFIER}/terraform.tfstate"
+                region         	   = "us-east-1"
+            }
+          }
+          EOF
+          terraform init
+          terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER"
+
+  install-and-validate:
+    name: Install and Validate ZXPorter and GPU Resources
+    runs-on: ubuntu-latest
+    needs: setup-eks 
+
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
+          aws-region: us-east-1
+
+      - name: Configure Kubernetes Access
+        run: |
+          aws eks update-kubeconfig --region us-east-1 --name ${{ needs.setup-eks.outputs.job_identifier }}
+
+      - name: Check GPU Availability
+        id: gpu_check
+        run: |
+          echo "Checking GPU resources on nodes..."
+          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
+            echo "GPU resources are available on the nodes."
+          else
+            echo "GPU check failed"
+            exit 0
+          fi
+
+      - name: Install GPU Operator (if needed)
+        if: steps.gpu_check.outcome == 'success'
+        run: |
+          echo "GPU resources not found, installing GPU Operator..."
+          kubectl create ns gpu-operator
+          kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite
+          kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true
+          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \
+          helm repo update
+          helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0
+
+      - name: Check GPU Availability After Installing GPU Operator
+        if: steps.gpu_check.outcome == 'success'
+        run: |
+          echo "Re-checking GPU resources on nodes after GPU Operator installation..."
+          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
+            echo "GPU resources are available on the nodes."
+          else
+            echo "GPU check failed after GPU Operator installation"
+            exit 1
+          fi
+
+      - name: Check DCGM DaemonSet
+        id: dcgm_check
+        run: |
+          echo "Checking if DCGM DaemonSet is installed..."
+          if kubectl get daemonset -A | grep -q dcgm; then
+            echo "DCGM DaemonSet is installed."
+          else
+            echo "DCGM DaemonSet not found"
+            exit 0
+          fi
+
+      - name: Install DCGM Exporter (if needed)
+        if: steps.dcgm_check.outcome == 'success'
+        run: |
+          echo "Installing DCGM Exporter..."
+          kubectl create ns devzero-zxporter
+          curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f -
+
+      - name: Check DCGM DaemonSet After Installing DCGM Exporter
+        if: steps.dcgm_check.outcome == 'success'
+        run: |
+          echo "Re-checking DCGM pods after DCGM Exporter installation..."
+          if kubectl get daemonset -A | grep -q dcgm; then
+            echo "DCGM DaemonSet is running."
+          else
+            echo "DCGM DaemonSet not running after installation"
+            exit 1
+          fi
+          
+      - name: Verify DCGM Pods and Prometheus Annotations
+        run: |
+          echo "Verifying DCGM pods and Prometheus annotations..."
+          kubectl get pods -A | grep dcgm-exporter | awk '
+          BEGIN { all_running = 1; pod_count = 0 }
+          {
+              pod_count++
+              status = $4
+              printf "Pod: %s/%s - Status: %s\n", $1, $2, status
+              if (status != "Running") all_running = 0
+          }
+          END {
+              printf "\nTotal Pods: %d\n", pod_count
+              printf "All Running: %s\n", (all_running ? "true" : "false")
+          }'
+          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done
+
+      - name: Install and Verify DeepSeek Workload
+        run: |
+          kubectl create ns deepseek
+          kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml    
+          
+          kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s
+          pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}')
+          
+          if [[ -n "$pod_status" ]]; then
+            echo "Pods are not in Running state. Failing the pipeline."
+            exit 1
+          else
+            echo "All pods are running successfully."
+          fi
+
+      - name: Install ZXPorter
+        run: |
+          curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \
+            -H "X-Kube-Context-Name: $(kubectl config current-context)" \
+            "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \
+            kubectl apply -f -
+
+      - name: Test ZXPorter with Prometheus
+        run: |
+          kubectl port-forward svc/prometheus-server 9090:80 -n devzero-zxporter &
+          sleep 5
+          result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
+          if [[ -z "$result" || "$result" == "null" ]]; then
+            echo "DCGM_FI_DEV_SM_CLOCK metric not found!"
+            exit 1
+          fi
+          echo "Metric found: $result"
+
+  destroy-terraform:
+    name: Destroy Infrastructure
+    runs-on: ubuntu-latest
+    if: always()
+    needs: install-and-validate
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
+          aws-region: us-east-1
+
+      - name: Configure Terraform Backend
+        run: |
+          cat <<EOF > backend_override.tf
+          terraform {
+            backend "s3" {
+                bucket          = "zxporter-tf-state"
+                key             = "${{ needs.setup-eks.outputs.job_identifier }}/terraform.tfstate"
+                region          = "us-east-1"
+            }
+          }
+          EOF
+          terraform init -backend-config=backend_override.tf
+
+      - name: Destroy Infrastructure
+        working-directory: terraform/aws
+        run: terraform destroy -auto-approve
diff --git a/.gitignore b/.gitignore
index f2f57448..e5cfe436 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,8 @@ config/**/charts
 *.swp
 *.swo
 *~
+
+# Terraform files
+*.tfstate
+*.tfstate.backup
+.terraform*
diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf
new file mode 100644
index 00000000..db292abf
--- /dev/null
+++ b/terraform/aws/main.tf
@@ -0,0 +1,57 @@
+provider "aws" {
+  region = "us-east-1"
+}
+
+module "vpc" {
+  source = "terraform-aws-modules/vpc/aws"
+
+  name = "${var.cluster_name}-vpc"
+  cidr = "10.0.0.0/16"
+
+  azs             = ["us-east-1a", "us-east-1b"]
+  private_subnets = ["10.0.1.0/24", "10.0.2.0/24"]
+  public_subnets  = ["10.0.101.0/24", "10.0.102.0/24"]
+
+  enable_nat_gateway = true
+  single_nat_gateway = true
+  
+  # Required for EKS
+  enable_dns_hostnames = true
+  enable_dns_support   = true
+}
+
+module "eks" {
+  source          = "terraform-aws-modules/eks/aws"
+
+  cluster_name    = var.cluster_name
+  cluster_version = var.cluster_version
+
+  # Add VPC configuration
+  vpc_id          = module.vpc.vpc_id
+  subnet_ids      = module.vpc.private_subnets
+
+  enable_irsa     = true
+
+  cluster_endpoint_public_access = true
+  enable_cluster_creator_admin_permissions = true
+  cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"]
+
+  eks_managed_node_groups = {
+    gpu_nodes = {
+      instance_types = ["g6.4xlarge"]
+      desired_size   = 1
+      min_size      = 1
+      max_size      = 1
+
+      ami_type      = "AL2023_x86_64_NVIDIA"
+
+      use_custom_launch_template = false
+
+      disk_size     = 200
+
+      labels = {
+        node_type = "gpu"
+      }
+    }
+  }
+}
diff --git a/terraform/aws/terraform.tfvars b/terraform/aws/terraform.tfvars
new file mode 100644
index 00000000..e343f0bb
--- /dev/null
+++ b/terraform/aws/terraform.tfvars
@@ -0,0 +1,2 @@
+cluster_name    = "devzero-gpu-cluster"
+cluster_version = "1.30"
\ No newline at end of file
diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf
new file mode 100644
index 00000000..b9738fb3
--- /dev/null
+++ b/terraform/aws/variables.tf
@@ -0,0 +1,9 @@
+variable "cluster_name" {
+  description = "The name of the EKS cluster"
+  type        = string
+}
+
+variable "cluster_version" {
+  description = "The Kubernetes version for the EKS cluster"
+  type        = string
+}

From 11cee4f8b25a41f31cf85a0794c32f195eb46fb4 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 1 Jun 2025 15:30:44 +0530
Subject: [PATCH 02/44] ci: rename setup job, gate GPU/DCGM installs on env flags, tag subnets

---
 .github/workflows/aws-gpu-test.yaml | 35 +++++++++++++++++++----------
 terraform/aws/main.tf               | 10 +++++++++
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 048ffa69..979fd718 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -11,8 +11,8 @@ permissions:
   contents: read
 
 jobs:
-  setup-eks:
-    name: Setup EKS and Install ZXPorter
+  apply-terraform:
+    name: Apply Terraform Configuration
     runs-on: ubuntu-latest
 
     outputs:
@@ -58,7 +58,7 @@ jobs:
           terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER"
 
   install-and-validate:
-    name: Install and Validate ZXPorter and GPU Resources
+    name: Install and Validate GPU Resources and ZXPorter
     runs-on: ubuntu-latest
     needs: setup-eks 
 
@@ -77,15 +77,17 @@ jobs:
         id: gpu_check
         run: |
           echo "Checking GPU resources on nodes..."
+          kubectl describe nodes | grep "nvidia.com/gpu"
           if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
             echo "GPU resources are available on the nodes."
+            echo "true" >> $GITHUB_ENV
           else
             echo "GPU check failed"
-            exit 0
+            echo "false" >> $GITHUB_ENV
           fi
 
       - name: Install GPU Operator (if needed)
-        if: steps.gpu_check.outcome == 'success'
+        if: env.GPU_CHECK == 'false'
         run: |
           echo "GPU resources not found, installing GPU Operator..."
           kubectl create ns gpu-operator
@@ -96,9 +98,10 @@ jobs:
           helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0
 
       - name: Check GPU Availability After Installing GPU Operator
-        if: steps.gpu_check.outcome == 'success'
+        if: env.GPU_CHECK == 'false'
         run: |
           echo "Re-checking GPU resources on nodes after GPU Operator installation..."
+          kubectl describe nodes | grep "nvidia.com/gpu"
           if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
             echo "GPU resources are available on the nodes."
           else
@@ -110,24 +113,27 @@ jobs:
         id: dcgm_check
         run: |
           echo "Checking if DCGM DaemonSet is installed..."
+          kubectl get daemonset -A | grep dcgm
           if kubectl get daemonset -A | grep -q dcgm; then
-            echo "DCGM DaemonSet is installed."
+            echo "DCGM DaemonSet is already installed."
+            echo "true" >> $GITHUB_ENV
           else
-            echo "DCGM DaemonSet not found"
-            exit 0
+            echo "DCGM DaemonSet not found."
+            echo "false" >> $GITHUB_ENV
           fi
 
       - name: Install DCGM Exporter (if needed)
-        if: steps.dcgm_check.outcome == 'success'
+        if: env.DCGM_CHECK == 'false'
         run: |
           echo "Installing DCGM Exporter..."
           kubectl create ns devzero-zxporter
           curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f -
 
       - name: Check DCGM DaemonSet After Installing DCGM Exporter
-        if: steps.dcgm_check.outcome == 'success'
+        if: env.DCGM_CHECK == 'false'
         run: |
           echo "Re-checking DCGM pods after DCGM Exporter installation..."
+          kubectl get daemonset -A | grep dcgm
           if kubectl get daemonset -A | grep -q dcgm; then
             echo "DCGM DaemonSet is running."
           else
@@ -186,7 +192,7 @@ jobs:
           echo "Metric found: $result"
 
   destroy-terraform:
-    name: Destroy Infrastructure
+    name: Destroy Terraform
     runs-on: ubuntu-latest
     if: always()
     needs: install-and-validate
@@ -197,6 +203,11 @@ jobs:
           role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
           aws-region: us-east-1
 
+      - name: Set up Terraform
+        uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: 1.5.7
+
       - name: Configure Terraform Backend
         run: |
           cat <<EOF > backend_override.tf
diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf
index db292abf..e3ef7095 100644
--- a/terraform/aws/main.tf
+++ b/terraform/aws/main.tf
@@ -18,6 +18,16 @@ module "vpc" {
   # Required for EKS
   enable_dns_hostnames = true
   enable_dns_support   = true
+
+  public_subnet_tags = {
+    "kubernetes.io/cluster/${var.cluster_name}" = "shared"
+    "kubernetes.io/role/elb"                    = "1"
+  }
+
+  private_subnet_tags = {
+    "kubernetes.io/cluster/${var.cluster_name}" = "shared"
+    "kubernetes.io/role/internal-elb"           = "1"
+  }
 }
 
 module "eks" {

From 0d4c5c98e82379ef5238defd6145ea84e7911a6b Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 1 Jun 2025 15:32:44 +0530
Subject: [PATCH 03/44] ci: make install-and-validate depend on apply-terraform

---
 .github/workflows/aws-gpu-test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 979fd718..8881c65b 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -60,7 +60,7 @@ jobs:
   install-and-validate:
     name: Install and Validate GPU Resources and ZXPorter
     runs-on: ubuntu-latest
-    needs: setup-eks 
+    needs: apply-terraform 
 
     steps:
       - name: Configure AWS Credentials

From d67f0a5d270dca8299f3c2708f6a5e880481dce3 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 1 Jun 2025 16:02:05 +0530
Subject: [PATCH 04/44] ci: read outputs from apply-terraform; check out repo in destroy job

---
 .github/workflows/aws-gpu-test.yaml | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 8881c65b..927e0a6b 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -12,7 +12,7 @@ permissions:
 
 jobs:
   apply-terraform:
-    name: Apply Terraform Configuration
+    name: Apply Terraform
     runs-on: ubuntu-latest
 
     outputs:
@@ -71,7 +71,7 @@ jobs:
 
       - name: Configure Kubernetes Access
         run: |
-          aws eks update-kubeconfig --region us-east-1 --name ${{ needs.setup-eks.outputs.job_identifier }}
+          aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }}
 
       - name: Check GPU Availability
         id: gpu_check
@@ -197,6 +197,9 @@ jobs:
     if: always()
     needs: install-and-validate
     steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
@@ -208,19 +211,10 @@ jobs:
         with:
           terraform_version: 1.5.7
 
-      - name: Configure Terraform Backend
-        run: |
-          cat <<EOF > backend_override.tf
-          terraform {
-            backend "s3" {
-                bucket          = "zxporter-tf-state"
-                key             = "${{ needs.setup-eks.outputs.job_identifier }}/terraform.tfstate"
-                region          = "us-east-1"
-            }
-          }
-          EOF
-          terraform init -backend-config=backend_override.tf
-
       - name: Destroy Infrastructure
         working-directory: terraform/aws
-        run: terraform destroy -auto-approve
+        run: |
+          terraform init -backend-config="bucket=zxporter-tf-state" \
+                         -backend-config="key=${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate" \
+                         -backend-config="region=us-east-1"
+          terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}"

From bd128a52213458bc2918ede42a64487aad4973a9 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 1 Jun 2025 16:39:42 +0530
Subject: [PATCH 05/44] ci: write job_identifier to GITHUB_OUTPUT; restore backend override in destroy

---
 .github/workflows/aws-gpu-test.yaml | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 927e0a6b..a7e54c4b 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -35,7 +35,7 @@ jobs:
           SHORT_SHA=$(git rev-parse --short HEAD)
           JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}"
           echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV
-          echo "::set-output name=job_identifier::${JOB_IDENTIFIER}"
+          echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT
 
       - name: Set up Terraform
         uses: hashicorp/setup-terraform@v3
@@ -77,7 +77,6 @@ jobs:
         id: gpu_check
         run: |
           echo "Checking GPU resources on nodes..."
-          kubectl describe nodes | grep "nvidia.com/gpu"
           if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
             echo "GPU resources are available on the nodes."
             echo "true" >> $GITHUB_ENV
@@ -101,7 +100,6 @@ jobs:
         if: env.GPU_CHECK == 'false'
         run: |
           echo "Re-checking GPU resources on nodes after GPU Operator installation..."
-          kubectl describe nodes | grep "nvidia.com/gpu"
           if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
             echo "GPU resources are available on the nodes."
           else
@@ -113,7 +111,6 @@ jobs:
         id: dcgm_check
         run: |
           echo "Checking if DCGM DaemonSet is installed..."
-          kubectl get daemonset -A | grep dcgm
           if kubectl get daemonset -A | grep -q dcgm; then
             echo "DCGM DaemonSet is already installed."
             echo "true" >> $GITHUB_ENV
@@ -133,7 +130,6 @@ jobs:
         if: env.DCGM_CHECK == 'false'
         run: |
           echo "Re-checking DCGM pods after DCGM Exporter installation..."
-          kubectl get daemonset -A | grep dcgm
           if kubectl get daemonset -A | grep -q dcgm; then
             echo "DCGM DaemonSet is running."
           else
@@ -214,7 +210,14 @@ jobs:
       - name: Destroy Infrastructure
         working-directory: terraform/aws
         run: |
-          terraform init -backend-config="bucket=zxporter-tf-state" \
-                         -backend-config="key=${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate" \
-                         -backend-config="region=us-east-1"
+          cat <<EOF > backend_override.tf
+          terraform {
+            backend "s3" {
+                bucket  = "zxporter-tf-state"
+                key     = "${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate"
+                region  = "us-east-1"
+            }
+          }
+          EOF
+          terraform init
           terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}"

From 2cc613d5e3d8586c1b9e205498dd4355dc8348f7 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 1 Jun 2025 17:05:22 +0530
Subject: [PATCH 06/44] ci: write GPU_CHECK/DCGM_CHECK as KEY=value; destroy job needs apply-terraform

---
 .github/workflows/aws-gpu-test.yaml | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index a7e54c4b..218087ef 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -1,4 +1,4 @@
-name: AWS Terraform EKS Setup and ZXPorter Installation
+name: AWS GPU Test
 
 on:
   push:
@@ -79,10 +79,10 @@ jobs:
           echo "Checking GPU resources on nodes..."
           if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
             echo "GPU resources are available on the nodes."
-            echo "true" >> $GITHUB_ENV
+            echo "GPU_CHECK=true" >> $GITHUB_ENV   # Corrected!
           else
             echo "GPU check failed"
-            echo "false" >> $GITHUB_ENV
+            echo "GPU_CHECK=false" >> $GITHUB_ENV  # Corrected!
           fi
 
       - name: Install GPU Operator (if needed)
@@ -113,10 +113,10 @@ jobs:
           echo "Checking if DCGM DaemonSet is installed..."
           if kubectl get daemonset -A | grep -q dcgm; then
             echo "DCGM DaemonSet is already installed."
-            echo "true" >> $GITHUB_ENV
+            echo "DCGM_CHECK=true" >> $GITHUB_ENV 
           else
             echo "DCGM DaemonSet not found."
-            echo "false" >> $GITHUB_ENV
+            echo "DCGM_CHECK=false" >> $GITHUB_ENV
           fi
 
       - name: Install DCGM Exporter (if needed)
@@ -191,7 +191,10 @@ jobs:
     name: Destroy Terraform
     runs-on: ubuntu-latest
     if: always()
-    needs: install-and-validate
+    needs:
+      - apply-terraform
+      - install-and-validate
+
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v4

From d060fefed870b01516a88d97725da4e755c633a9 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 1 Jun 2025 18:11:11 +0530
Subject: [PATCH 07/44] ci: wait for DCGM exporter and ZXPorter pods to become Ready

---
 .github/workflows/aws-gpu-test.yaml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 218087ef..268c16ac 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -79,10 +79,10 @@ jobs:
           echo "Checking GPU resources on nodes..."
           if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
             echo "GPU resources are available on the nodes."
-            echo "GPU_CHECK=true" >> $GITHUB_ENV   # Corrected!
+            echo "GPU_CHECK=true" >> $GITHUB_ENV
           else
             echo "GPU check failed"
-            echo "GPU_CHECK=false" >> $GITHUB_ENV  # Corrected!
+            echo "GPU_CHECK=false" >> $GITHUB_ENV
           fi
 
       - name: Install GPU Operator (if needed)
@@ -139,6 +139,7 @@ jobs:
           
       - name: Verify DCGM Pods and Prometheus Annotations
         run: |
+          kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s
           echo "Verifying DCGM pods and Prometheus annotations..."
           kubectl get pods -A | grep dcgm-exporter | awk '
           BEGIN { all_running = 1; pod_count = 0 }
@@ -175,6 +176,9 @@ jobs:
             -H "X-Kube-Context-Name: $(kubectl config current-context)" \
             "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \
             kubectl apply -f -
+          
+          echo "Waiting for ZXPorter pods to be ready..."
+          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
 
       - name: Test ZXPorter with Prometheus
         run: |

From 921d7522a87671effb1782e3ac75ebb69bee45f8 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 1 Jun 2025 19:04:09 +0530
Subject: [PATCH 08/44] ci: remove push trigger from AWS GPU test workflow

---
 .github/workflows/aws-gpu-test.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 268c16ac..46699d5d 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -1,9 +1,6 @@
 name: AWS GPU Test
 
 on:
-  push:
-    branches:
-      - garvit/aws-gpu-test
   workflow_dispatch:
 
 permissions:

From cc7157ecb4fe14823d935763dda059d74b67c93b Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 3 Jun 2025 13:00:15 +0530
Subject: [PATCH 09/44] ci: build and deploy ZXPorter from source; add kubernetes-pods scrape job

---
 .github/workflows/aws-gpu-test.yaml | 23 ++++++++---
 dist/install.yaml                   | 60 ++++++++++++++++++++++++++++-
 2 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 46699d5d..ab9ff3e2 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -1,6 +1,9 @@
 name: AWS GPU Test
 
 on:
+  push:
+    branches:
+      - garvit/aws-gpu-test
   workflow_dispatch:
 
 permissions:
@@ -150,7 +153,7 @@ jobs:
               printf "\nTotal Pods: %d\n", pod_count
               printf "All Running: %s\n", (all_running ? "true" : "false")
           }'
-          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done
+          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/port: "9400" --overwrite; done
 
       - name: Install and Verify DeepSeek Workload
         run: |
@@ -167,12 +170,22 @@ jobs:
             echo "All pods are running successfully."
           fi
 
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.22'
+          cache: true
+
       - name: Install ZXPorter
         run: |
-          curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \
-            -H "X-Kube-Context-Name: $(kubectl config current-context)" \
-            "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \
-            kubectl apply -f -
+          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
+          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
+          make docker-build docker-push IMG=${ZXPORTER_IMG}
+          make deploy IMG=${ZXPORTER_IMG}
+          # curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \
+          #   -H "X-Kube-Context-Name: $(kubectl config current-context)" \
+          #   "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \
+          #   kubectl apply -f -
           
           echo "Waiting for ZXPorter pods to be ready..."
           kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
diff --git a/dist/install.yaml b/dist/install.yaml
index 48c3461d..b3e6a2e2 100644
--- a/dist/install.yaml
+++ b/dist/install.yaml
@@ -117,6 +117,64 @@ data:
       scheme: https
       tls_config:
         ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    - honor_labels: true
+      job_name: kubernetes-pods
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
+      - action: drop
+        regex: true
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
+      - action: replace
+        regex: (https?)
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scheme
+        target_label: __scheme__
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
+        replacement: '[$2]:$1'
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_port
+        - __meta_kubernetes_pod_ip
+        target_label: __address__
+      - action: replace
+        regex: (\d+);((([0-9]+?)(\.|$)){4})
+        replacement: $2:$1
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_port
+        - __meta_kubernetes_pod_ip
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
+        replacement: __param_$1
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: pod
+      - action: drop
+        regex: Pending|Succeeded|Failed|Completed
+        source_labels:
+        - __meta_kubernetes_pod_phase
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: node
   recording_rules.yml: |
     {}
   rules: |
@@ -1171,4 +1229,4 @@ spec:
       volumes:
       - configMap:
           name: devzero-zxporter-env-config
-        name: config-volume
+        name: config-volume
\ No newline at end of file

From f06cebf29fba506edbc11040a83c7192ded47452 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 3 Jun 2025 18:16:15 +0530
Subject: [PATCH 10/44] matrix in ci for devzero and nvidia dcgm

---
 .github/workflows/aws-gpu-test.yaml | 69 ++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index ab9ff3e2..29320756 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -14,6 +14,9 @@ jobs:
   apply-terraform:
     name: Apply Terraform
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        dcgm_install_type: [nvidia-dcgm, devzero-dcgm] 
 
     outputs:
       job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}
@@ -33,7 +36,12 @@ jobs:
         shell: bash
         run: |
           SHORT_SHA=$(git rev-parse --short HEAD)
-          JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}"
+          if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then
+            SUFFIX="nd"
+          else
+            SUFFIX="dd"
+          fi
+          JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}"
           echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV
           echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT
 
@@ -60,7 +68,10 @@ jobs:
   install-and-validate:
     name: Install and Validate GPU Resources and ZXPorter
     runs-on: ubuntu-latest
-    needs: apply-terraform 
+    needs: apply-terraform
+    strategy:
+      matrix:
+        dcgm_install_type: [nvidia-dcgm, devzero-dcgm] 
 
     steps:
       - name: Configure AWS Credentials
@@ -87,6 +98,8 @@ jobs:
 
       - name: Install GPU Operator (if needed)
         if: env.GPU_CHECK == 'false'
+        env:
+          DCGM_INSTALL_TYPE: ${{ matrix.dcgm_install_type }}
         run: |
           echo "GPU resources not found, installing GPU Operator..."
           kubectl create ns gpu-operator
@@ -94,7 +107,12 @@ jobs:
           kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true
           helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \
           helm repo update
-          helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0
+          INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0"
+          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
+            INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false"
+          fi
+          echo "Running: $INSTALL_CMD"
+          $INSTALL_CMD
 
       - name: Check GPU Availability After Installing GPU Operator
         if: env.GPU_CHECK == 'false'
@@ -107,27 +125,28 @@ jobs:
             exit 1
           fi
 
-      - name: Check DCGM DaemonSet
+      - name: Check Nvidia DCGM DaemonSet
         id: dcgm_check
+        if: matrix.dcgm_install_type == 'nvidia-dcgm'
         run: |
           echo "Checking if DCGM DaemonSet is installed..."
           if kubectl get daemonset -A | grep -q dcgm; then
-            echo "DCGM DaemonSet is already installed."
-            echo "DCGM_CHECK=true" >> $GITHUB_ENV 
+            echo "Nvidia DCGM found, proceeding with validation."
+            echo "SKIP_INSTALL=false" >> $GITHUB_ENV
           else
-            echo "DCGM DaemonSet not found."
-            echo "DCGM_CHECK=false" >> $GITHUB_ENV
+            echo "Nvidia DCGM not found, skipping install and proceeding to destroy."
+            echo "SKIP_INSTALL=true" >> $GITHUB_ENV
           fi
 
-      - name: Install DCGM Exporter (if needed)
-        if: env.DCGM_CHECK == 'false'
+      - name: Install DevZero DCGM (only for devzero-dcgm)
+        if: matrix.dcgm_install_type == 'devzero-dcgm'
         run: |
           echo "Installing DCGM Exporter..."
           kubectl create ns devzero-zxporter
           curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f -
 
       - name: Check DCGM DaemonSet After Installing DCGM Exporter
-        if: env.DCGM_CHECK == 'false'
+        if: matrix.dcgm_install_type == 'devzero-dcgm'
         run: |
           echo "Re-checking DCGM pods after DCGM Exporter installation..."
           if kubectl get daemonset -A | grep -q dcgm; then
@@ -138,6 +157,7 @@ jobs:
           fi
           
       - name: Verify DCGM Pods and Prometheus Annotations
+        if: env.SKIP_INSTALL != 'true'
         run: |
           kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s
           echo "Verifying DCGM pods and Prometheus annotations..."
@@ -153,9 +173,10 @@ jobs:
               printf "\nTotal Pods: %d\n", pod_count
               printf "All Running: %s\n", (all_running ? "true" : "false")
           }'
-          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/port: "9400" --overwrite; done
+          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done
 
       - name: Install and Verify DeepSeek Workload
+        if: env.SKIP_INSTALL != 'true'
         run: |
           kubectl create ns deepseek
           kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml    
@@ -172,30 +193,33 @@ jobs:
 
       - name: Set up Go
         uses: actions/setup-go@v5
+        if: env.SKIP_INSTALL != 'true'
         with:
           go-version: '1.22'
           cache: true
 
       - name: Install ZXPorter
+        if: env.SKIP_INSTALL != 'true'
         run: |
-          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
-          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
-          make docker-build docker-push IMG=${ZXPORTER_IMG}
-          make deploy IMG=${ZXPORTER_IMG}
-          # curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \
-          #   -H "X-Kube-Context-Name: $(kubectl config current-context)" \
-          #   "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \
-          #   kubectl apply -f -
+          # ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
+          # echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
+          # make docker-build docker-push IMG=${ZXPORTER_IMG}
+          # make deploy IMG=${ZXPORTER_IMG}
+          curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \
+            -H "X-Kube-Context-Name: $(kubectl config current-context)" \
+            "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \
+            kubectl apply -f -
           
           echo "Waiting for ZXPorter pods to be ready..."
           kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
 
       - name: Test ZXPorter with Prometheus
+        if: env.SKIP_INSTALL != 'true'
         run: |
           kubectl port-forward svc/prometheus-server 9090:80 -n devzero-zxporter &
           sleep 5
           result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
-          if [[ -z "$result" || "$result" == "null" ]]; then
+          if [[ -z "$result" || "$result" == [] ]]; then
             echo "DCGM_FI_DEV_SM_CLOCK metric not found!"
             exit 1
           fi
@@ -204,6 +228,9 @@ jobs:
   destroy-terraform:
     name: Destroy Terraform
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        dcgm_install_type: [nvidia-dcgm, devzero-dcgm]
     if: always()
     needs:
       - apply-terraform

From db37092e0eaedd270e9ad77e3fb1b2ebc7b493c7 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 3 Jun 2025 18:35:00 +0530
Subject: [PATCH 11/44] replace ci matrix with workflow_dispatch input for dcgm install type

---
 .github/workflows/aws-gpu-test.yaml | 39 ++++++++++++++++-------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 29320756..f36a86f3 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -5,6 +5,15 @@ on:
     branches:
       - garvit/aws-gpu-test
   workflow_dispatch:
+    inputs:
+      dcgm_install_type:
+        description: 'DCGM install type'
+        required: false
+        default: 'devzero-dcgm'
+        type: choice
+        options:
+          - nvidia-dcgm
+          - devzero-dcgm
 
 permissions:
   id-token: write
@@ -14,9 +23,8 @@ jobs:
   apply-terraform:
     name: Apply Terraform
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        dcgm_install_type: [nvidia-dcgm, devzero-dcgm] 
+    env:
+      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
 
     outputs:
       job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}
@@ -36,10 +44,10 @@ jobs:
         shell: bash
         run: |
           SHORT_SHA=$(git rev-parse --short HEAD)
-          if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then
-            SUFFIX="nd"
-          else
+          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
             SUFFIX="dd"
+          else
+            SUFFIX="nd"
           fi
           JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}"
           echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV
@@ -69,9 +77,8 @@ jobs:
     name: Install and Validate GPU Resources and ZXPorter
     runs-on: ubuntu-latest
     needs: apply-terraform
-    strategy:
-      matrix:
-        dcgm_install_type: [nvidia-dcgm, devzero-dcgm] 
+    env:
+      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
 
     steps:
       - name: Configure AWS Credentials
@@ -98,8 +105,6 @@ jobs:
 
       - name: Install GPU Operator (if needed)
         if: env.GPU_CHECK == 'false'
-        env:
-          DCGM_INSTALL_TYPE: ${{ matrix.dcgm_install_type }}
         run: |
           echo "GPU resources not found, installing GPU Operator..."
           kubectl create ns gpu-operator
@@ -127,7 +132,7 @@ jobs:
 
       - name: Check Nvidia DCGM DaemonSet
         id: dcgm_check
-        if: matrix.dcgm_install_type == 'nvidia-dcgm'
+        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
         run: |
           echo "Checking if DCGM DaemonSet is installed..."
           if kubectl get daemonset -A | grep -q dcgm; then
@@ -139,14 +144,14 @@ jobs:
           fi
 
       - name: Install DevZero DCGM (only for devzero-dcgm)
-        if: matrix.dcgm_install_type == 'devzero-dcgm'
+        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
         run: |
           echo "Installing DCGM Exporter..."
           kubectl create ns devzero-zxporter
           curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f -
 
       - name: Check DCGM DaemonSet After Installing DCGM Exporter
-        if: matrix.dcgm_install_type == 'devzero-dcgm'
+        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
         run: |
           echo "Re-checking DCGM pods after DCGM Exporter installation..."
           if kubectl get daemonset -A | grep -q dcgm; then
@@ -228,9 +233,9 @@ jobs:
   destroy-terraform:
     name: Destroy Terraform
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        dcgm_install_type: [nvidia-dcgm, devzero-dcgm]
+    env:
+      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
+
     if: always()
     needs:
       - apply-terraform

From d211df09613b4aa1048d5b7a15bfd5eb74bf93c7 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 3 Jun 2025 19:03:57 +0530
Subject: [PATCH 12/44] run devzero dcgm install steps only for devzero-dcgm in ci

---
 .github/workflows/aws-gpu-test.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index f36a86f3..f4e284dd 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -143,15 +143,15 @@ jobs:
             echo "SKIP_INSTALL=true" >> $GITHUB_ENV
           fi
 
-      - name: Install DevZero DCGM (only for devzero-dcgm)
-        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
+      - name: Install DevZero DCGM
+        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
         run: |
           echo "Installing DCGM Exporter..."
           kubectl create ns devzero-zxporter
           curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f -
 
       - name: Check DCGM DaemonSet After Installing DCGM Exporter
-        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
+        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
         run: |
           echo "Re-checking DCGM pods after DCGM Exporter installation..."
           if kubectl get daemonset -A | grep -q dcgm; then

From 8c4bed0f74f03884df11bd8e21b297edbbb190ba Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 3 Jun 2025 19:26:31 +0530
Subject: [PATCH 13/44] using makefile to install zxporter in ci

---
 .github/workflows/aws-gpu-test.yaml | 12 ++++--------
 Makefile                            |  4 ++--
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index f4e284dd..3bd1e552 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -206,14 +206,10 @@ jobs:
       - name: Install ZXPorter
         if: env.SKIP_INSTALL != 'true'
         run: |
-          # ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
-          # echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
-          # make docker-build docker-push IMG=${ZXPORTER_IMG}
-          # make deploy IMG=${ZXPORTER_IMG}
-          curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \
-            -H "X-Kube-Context-Name: $(kubectl config current-context)" \
-            "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \
-            kubectl apply -f -
+          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
+          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
+          make docker-build docker-push IMG=${ZXPORTER_IMG}
+          make deploy IMG=${ZXPORTER_IMG}
           
           echo "Waiting for ZXPorter pods to be ready..."
           kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
diff --git a/Makefile b/Makefile
index a6be166d..a4bdfb11 100644
--- a/Makefile
+++ b/Makefile
@@ -125,11 +125,11 @@ help: ## Display this help.
 
 .PHONY: manifests
 manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
-	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases
+	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases -w
 
 .PHONY: generate
 generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
-	$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."
+	$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." -w
 
 .PHONY: fmt
 fmt: ## Run go fmt against code.

From 9aeba3326a3fd246eca012636fb2feb05f8a5993 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 3 Jun 2025 20:29:02 +0530
Subject: [PATCH 14/44] checkout repository in install-and-validate job of aws gpu test ci

---
 .github/workflows/aws-gpu-test.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 3bd1e552..c21d2f38 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -81,6 +81,9 @@ jobs:
       DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
 
     steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+        
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:

From a1b41d2f689bc14e6458c52ede668b178e2477af Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 3 Jun 2025 20:52:44 +0530
Subject: [PATCH 15/44] fix prometheus service name for port-forward in aws gpu test ci

---
 .github/workflows/aws-gpu-test.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index c21d2f38..48b177f9 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -83,7 +83,7 @@ jobs:
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v4
-        
+
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
@@ -220,7 +220,7 @@ jobs:
       - name: Test ZXPorter with Prometheus
         if: env.SKIP_INSTALL != 'true'
         run: |
-          kubectl port-forward svc/prometheus-server 9090:80 -n devzero-zxporter &
+          kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter &
           sleep 5
           result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
           if [[ -z "$result" || "$result" == [] ]]; then

From db0605c355817ad66a613f8241300169bbe719a3 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 3 Jun 2025 23:14:39 +0530
Subject: [PATCH 16/44] install zxporter from dist/install.yaml in gpu test ci

---
 .github/workflows/aws-gpu-test.yaml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 48b177f9..8f82c36a 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -209,10 +209,12 @@ jobs:
       - name: Install ZXPorter
         if: env.SKIP_INSTALL != 'true'
         run: |
-          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
-          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
-          make docker-build docker-push IMG=${ZXPORTER_IMG}
-          make deploy IMG=${ZXPORTER_IMG}
+          # ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
+          # echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
+          # make docker-build docker-push IMG=${ZXPORTER_IMG}
+          # make deploy IMG=${ZXPORTER_IMG}
+
+          kubectl apply -f dist/install.yaml
           
           echo "Waiting for ZXPorter pods to be ready..."
           kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s

From c6e19b9b41435f88a8da48f148d86107b456b563 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Wed, 4 Jun 2025 00:11:23 +0530
Subject: [PATCH 17/44] using makefile to install zxporter in ci

---
 .github/workflows/aws-gpu-test.yaml           |   8 +-
 config/prometheus/hack.prometheus.values.yaml | 811 ++++++++++--------
 2 files changed, 462 insertions(+), 357 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 8f82c36a..ebf5bcae 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -209,10 +209,10 @@ jobs:
       - name: Install ZXPorter
         if: env.SKIP_INSTALL != 'true'
         run: |
-          # ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
-          # echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
-          # make docker-build docker-push IMG=${ZXPORTER_IMG}
-          # make deploy IMG=${ZXPORTER_IMG}
+          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
+          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
+          make docker-build docker-push IMG=${ZXPORTER_IMG}
+          make deploy IMG=${ZXPORTER_IMG}
 
           kubectl apply -f dist/install.yaml
           
diff --git a/config/prometheus/hack.prometheus.values.yaml b/config/prometheus/hack.prometheus.values.yaml
index b1975764..db227b61 100644
--- a/config/prometheus/hack.prometheus.values.yaml
+++ b/config/prometheus/hack.prometheus.values.yaml
@@ -51,344 +51,84 @@ kube-state-metrics:
     # - roles
 
 serverFiles:
-  prometheus.yml:
+  prometheus.yml: 
+    rule_files:
+      - /etc/config/recording_rules.yml
+      - /etc/config/alerting_rules.yml
+      - /etc/config/rules
+      - /etc/config/alerts
     scrape_configs:
       - job_name: prometheus
         static_configs:
           - targets:
               - localhost:9090
-
-      # A scrape configuration for running Prometheus on a Kubernetes cluster.
-      # This uses separate scrape configs for cluster components (i.e. API server, node)
-      # and services to allow each to use different authentication configs.
-      #
-      # Kubernetes labels will be added as Prometheus labels on metrics via the
-      # `labelmap` relabeling action.
-
-## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
-#      # Scrape config for API servers.
-#      #
-#      # Kubernetes exposes API servers as endpoints to the default/kubernetes
-#      # service so this uses `endpoints` role and uses relabelling to only keep
-#      # the endpoints associated with the default/kubernetes service using the
-#      # default named port `https`. This works for single API server deployments as
-#      # well as HA API server deployments.
-#      - job_name: 'kubernetes-apiservers'
-#
-#        kubernetes_sd_configs:
-#          - role: endpoints
-#
-#        # Default to scraping over https. If required, just disable this or change to
-#        # `http`.
-#        scheme: https
-#
-#        # This TLS & bearer token file config is used to connect to the actual scrape
-#        # endpoints for cluster components. This is separate to discovery auth
-#        # configuration because discovery & scraping are two separate concerns in
-#        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
-#        # the cluster. Otherwise, more config options have to be provided within the
-#        # <kubernetes_sd_config>.
-#        tls_config:
-#          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-#          # If your node certificates are self-signed or use a different CA to the
-#          # master CA, then disable certificate verification below. Note that
-#          # certificate verification is an integral part of a secure infrastructure
-#          # so this should only be disabled in a controlled environment. You can
-#          # disable certificate verification by uncommenting the line below.
-#          #
-#          # insecure_skip_verify: true
-#        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-#
-#        # Keep only the default/kubernetes service endpoints for the https port. This
-#        # will add targets for each API server which Kubernetes adds an endpoint to
-#        # the default/kubernetes service.
-#        relabel_configs:
-#          - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ]
-#            action: keep
-#            regex: default;kubernetes;https
-
-      - job_name: 'kubernetes-nodes'
-
-        # Default to scraping over https. If required, just disable this or change to
-        # `http`.
+      - job_name: kubernetes-nodes
         scheme: https
-
-        # This TLS & bearer token file config is used to connect to the actual scrape
-        # endpoints for cluster components. This is separate to discovery auth
-        # configuration because discovery & scraping are two separate concerns in
-        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
-        # the cluster. Otherwise, more config options have to be provided within the
-        # <kubernetes_sd_config>.
         tls_config:
           ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          # If your node certificates are self-signed or use a different CA to the
-          # master CA, then disable certificate verification below. Note that
-          # certificate verification is an integral part of a secure infrastructure
-          # so this should only be disabled in a controlled environment. You can
-          # disable certificate verification by uncommenting the line below.
-          #
-          # insecure_skip_verify: true
         bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
         kubernetes_sd_configs:
           - role: node
-
         relabel_configs:
           - action: labelmap
             regex: __meta_kubernetes_node_label_(.+)
           - target_label: __address__
             replacement: kubernetes.default.svc:443
-          - source_labels: [ __meta_kubernetes_node_name ]
+          - source_labels:
+              - __meta_kubernetes_node_name
             regex: (.+)
             target_label: __metrics_path__
             replacement: /api/v1/nodes/$1/proxy/metrics
-
-
-      - job_name: 'kubernetes-nodes-cadvisor'
-
-        # Default to scraping over https. If required, just disable this or change to
-        # `http`.
+      - job_name: kubernetes-nodes-cadvisor
         scheme: https
-
-        # This TLS & bearer token file config is used to connect to the actual scrape
-        # endpoints for cluster components. This is separate to discovery auth
-        # configuration because discovery & scraping are two separate concerns in
-        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
-        # the cluster. Otherwise, more config options have to be provided within the
-        # <kubernetes_sd_config>.
         tls_config:
           ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          # If your node certificates are self-signed or use a different CA to the
-          # master CA, then disable certificate verification below. Note that
-          # certificate verification is an integral part of a secure infrastructure
-          # so this should only be disabled in a controlled environment. You can
-          # disable certificate verification by uncommenting the line below.
-          #
-          # insecure_skip_verify: true
         bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
         kubernetes_sd_configs:
           - role: node
-
-        # This configuration will work only on kubelet 1.7.3+
-        # As the scrape endpoints for cAdvisor have changed
-        # if you are using older version you need to change the replacement to
-        # replacement: /api/v1/nodes/$1:4194/proxy/metrics
-        # more info here https://github.com/coreos/prometheus-operator/issues/633
         relabel_configs:
           - action: labelmap
             regex: __meta_kubernetes_node_label_(.+)
           - target_label: __address__
             replacement: kubernetes.default.svc:443
-          - source_labels: [ __meta_kubernetes_node_name ]
+          - source_labels:
+              - __meta_kubernetes_node_name
             regex: (.+)
             target_label: __metrics_path__
             replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
-
-        # Metric relabel configs to apply to samples before ingestion.
-        # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs)
-        # metric_relabel_configs:
-        # - action: labeldrop
-        #   regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone)
-
-## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
-#      # Scrape config for service endpoints.
-#      #
-#      # The relabeling allows the actual service scrape endpoint to be configured
-#      # via the following annotations:
-#      #
-#      # * `prometheus.io/scrape`: Only scrape services that have a value of
-#      # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well.
-#      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
-#      # to set this to `https` & most likely set the `tls_config` of the scrape config.
-#      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
-#      # * `prometheus.io/port`: If the metrics are exposed on a different port to the
-#      # service then set this appropriately.
-#      # * `prometheus.io/param_<parameter>`: If the metrics endpoint uses parameters
-#      # then you can set any parameter
-#      - job_name: 'kubernetes-service-endpoints'
-#        honor_labels: true
-#
-#        kubernetes_sd_configs:
-#          - role: endpoints
-#
-#        relabel_configs:
-#          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape ]
-#            action: keep
-#            regex: true
-#          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ]
-#            action: drop
-#            regex: true
-#          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ]
-#            action: replace
-#            target_label: __scheme__
-#            regex: (https?)
-#          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ]
-#            action: replace
-#            target_label: __metrics_path__
-#            regex: (.+)
-#          - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ]
-#            action: replace
-#            target_label: __address__
-#            regex: (.+?)(?::\d+)?;(\d+)
-#            replacement: $1:$2
-#          - action: labelmap
-#            regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
-#            replacement: __param_$1
-#          - action: labelmap
-#            regex: __meta_kubernetes_service_label_(.+)
-#          - source_labels: [ __meta_kubernetes_namespace ]
-#            action: replace
-#            target_label: namespace
-#          - source_labels: [ __meta_kubernetes_service_name ]
-#            action: replace
-#            target_label: service
-#          - source_labels: [ __meta_kubernetes_pod_node_name ]
-#            action: replace
-#            target_label: node
-
-
-## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
-#      # Scrape config for slow service endpoints; same as above, but with a larger
-#      # timeout and a larger interval
-#      #
-#      # The relabeling allows the actual service scrape endpoint to be configured
-#      # via the following annotations:
-#      #
-#      # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true`
-#      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
-#      # to set this to `https` & most likely set the `tls_config` of the scrape config.
-#      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
-#      # * `prometheus.io/port`: If the metrics are exposed on a different port to the
-#      # service then set this appropriately.
-#      # * `prometheus.io/param_<parameter>`: If the metrics endpoint uses parameters
-#      # then you can set any parameter
-#      - job_name: 'kubernetes-service-endpoints-slow'
-#        honor_labels: true
-#
-#        scrape_interval: 5m
-#        scrape_timeout: 30s
-#
-#        kubernetes_sd_configs:
-#          - role: endpoints
-#
-#        relabel_configs:
-#          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ]
-#            action: keep
-#            regex: true
-#          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ]
-#            action: replace
-#            target_label: __scheme__
-#            regex: (https?)
-#          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ]
-#            action: replace
-#            target_label: __metrics_path__
-#            regex: (.+)
-#          - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ]
-#            action: replace
-#            target_label: __address__
-#            regex: (.+?)(?::\d+)?;(\d+)
-#            replacement: $1:$2
-#          - action: labelmap
-#            regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
-#            replacement: __param_$1
-#          - action: labelmap
-#            regex: __meta_kubernetes_service_label_(.+)
-#          - source_labels: [ __meta_kubernetes_namespace ]
-#            action: replace
-#            target_label: namespace
-#          - source_labels: [ __meta_kubernetes_service_name ]
-#            action: replace
-#            target_label: service
-#          - source_labels: [ __meta_kubernetes_pod_node_name ]
-#            action: replace
-#            target_label: node
-#
-#      - job_name: 'prometheus-pushgateway'
-#        honor_labels: true
-#
-#        kubernetes_sd_configs:
-#          - role: service
-#
-#        relabel_configs:
-#          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ]
-#            action: keep
-#            regex: pushgateway
-
-
-## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
-#      # Example scrape config for probing services via the Blackbox Exporter.
-#      #
-#      # The relabeling allows the actual service scrape endpoint to be configured
-#      # via the following annotations:
-#      #
-#      # * `prometheus.io/probe`: Only probe services that have a value of `true`
-#      - job_name: 'kubernetes-services'
-#        honor_labels: true
-#
-#        metrics_path: /probe
-#        params:
-#          module: [ http_2xx ]
-#
-#        kubernetes_sd_configs:
-#          - role: service
-#
-#        relabel_configs:
-#          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ]
-#            action: keep
-#            regex: true
-#          - source_labels: [ __address__ ]
-#            target_label: __param_target
-#          - target_label: __address__
-#            replacement: blackbox
-#          - source_labels: [ __param_target ]
-#            target_label: instance
-#          - action: labelmap
-#            regex: __meta_kubernetes_service_label_(.+)
-#          - source_labels: [ __meta_kubernetes_namespace ]
-#            target_label: namespace
-#          - source_labels: [ __meta_kubernetes_service_name ]
-#            target_label: service
-
-
-      # Example scrape config for pods
-      #
-      # The relabeling allows the actual pod scrape endpoint to be configured via the
-      # following annotations:
-      #
-      # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`,
-      # except if `prometheus.io/scrape-slow` is set to `true` as well.
-      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
-      # to set this to `https` & most likely set the `tls_config` of the scrape config.
-      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
-      # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
-      - job_name: 'kubernetes-pods'
+      - job_name: kubernetes-pods
         honor_labels: true
-
         kubernetes_sd_configs:
           - role: pod
-
         relabel_configs:
-          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ]
+          - source_labels:
+              - __meta_kubernetes_pod_annotation_prometheus_io_scrape
             action: keep
             regex: true
-          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ]
+          - source_labels:
+              - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
             action: drop
             regex: true
-          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ]
+          - source_labels:
+              - __meta_kubernetes_pod_annotation_prometheus_io_scheme
             action: replace
             regex: (https?)
             target_label: __scheme__
-          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ]
+          - source_labels:
+              - __meta_kubernetes_pod_annotation_prometheus_io_path
             action: replace
-            target_label: __metrics_path__
             regex: (.+)
-          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ]
+            target_label: __metrics_path__
+          - source_labels:
+              - __meta_kubernetes_pod_annotation_prometheus_io_port
+              - __meta_kubernetes_pod_ip
             action: replace
             regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
-            replacement: '[$2]:$1'
+            replacement: "[$2]:$1"
             target_label: __address__
-          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ]
+          - source_labels:
+              - __meta_kubernetes_pod_annotation_prometheus_io_port
+              - __meta_kubernetes_pod_ip
             action: replace
             regex: (\d+);((([0-9]+?)(\.|$)){4})
             replacement: $2:$1
@@ -398,77 +138,442 @@ serverFiles:
             replacement: __param_$1
           - action: labelmap
             regex: __meta_kubernetes_pod_label_(.+)
-          - source_labels: [ __meta_kubernetes_namespace ]
+          - source_labels:
+              - __meta_kubernetes_namespace
             action: replace
             target_label: namespace
-          - source_labels: [ __meta_kubernetes_pod_name ]
+          - source_labels:
+              - __meta_kubernetes_pod_name
             action: replace
             target_label: pod
-          - source_labels: [ __meta_kubernetes_pod_phase ]
+          - source_labels:
+              - __meta_kubernetes_pod_phase
             regex: Pending|Succeeded|Failed|Completed
             action: drop
-          - source_labels: [ __meta_kubernetes_pod_node_name ]
+          - source_labels:
+              - __meta_kubernetes_pod_node_name
             action: replace
             target_label: node
 
+# serverFiles:
+#   prometheus.yml:
+#     scrape_configs:
+#       - job_name: prometheus
+#         static_configs:
+#           - targets:
+#               - localhost:9090
+
+#       # A scrape configuration for running Prometheus on a Kubernetes cluster.
+#       # This uses separate scrape configs for cluster components (i.e. API server, node)
+#       # and services to allow each to use different authentication configs.
+#       #
+#       # Kubernetes labels will be added as Prometheus labels on metrics via the
+#       # `labelmap` relabeling action.
+
+# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
+# #      # Scrape config for API servers.
+# #      #
+# #      # Kubernetes exposes API servers as endpoints to the default/kubernetes
+# #      # service so this uses `endpoints` role and uses relabelling to only keep
+# #      # the endpoints associated with the default/kubernetes service using the
+# #      # default named port `https`. This works for single API server deployments as
+# #      # well as HA API server deployments.
+# #      - job_name: 'kubernetes-apiservers'
+# #
+# #        kubernetes_sd_configs:
+# #          - role: endpoints
+# #
+# #        # Default to scraping over https. If required, just disable this or change to
+# #        # `http`.
+# #        scheme: https
+# #
+# #        # This TLS & bearer token file config is used to connect to the actual scrape
+# #        # endpoints for cluster components. This is separate to discovery auth
+# #        # configuration because discovery & scraping are two separate concerns in
+# #        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
+# #        # the cluster. Otherwise, more config options have to be provided within the
+# #        # <kubernetes_sd_config>.
+# #        tls_config:
+# #          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+# #          # If your node certificates are self-signed or use a different CA to the
+# #          # master CA, then disable certificate verification below. Note that
+# #          # certificate verification is an integral part of a secure infrastructure
+# #          # so this should only be disabled in a controlled environment. You can
+# #          # disable certificate verification by uncommenting the line below.
+# #          #
+# #          # insecure_skip_verify: true
+# #        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+# #
+# #        # Keep only the default/kubernetes service endpoints for the https port. This
+# #        # will add targets for each API server which Kubernetes adds an endpoint to
+# #        # the default/kubernetes service.
+# #        relabel_configs:
+# #          - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ]
+# #            action: keep
+# #            regex: default;kubernetes;https
+
+#       - job_name: 'kubernetes-nodes'
+
+#         # Default to scraping over https. If required, just disable this or change to
+#         # `http`.
+#         scheme: https
+
+#         # This TLS & bearer token file config is used to connect to the actual scrape
+#         # endpoints for cluster components. This is separate to discovery auth
+#         # configuration because discovery & scraping are two separate concerns in
+#         # Prometheus. The discovery auth config is automatic if Prometheus runs inside
+#         # the cluster. Otherwise, more config options have to be provided within the
+#         # <kubernetes_sd_config>.
+#         tls_config:
+#           ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+#           # If your node certificates are self-signed or use a different CA to the
+#           # master CA, then disable certificate verification below. Note that
+#           # certificate verification is an integral part of a secure infrastructure
+#           # so this should only be disabled in a controlled environment. You can
+#           # disable certificate verification by uncommenting the line below.
+#           #
+#           # insecure_skip_verify: true
+#         bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+#         kubernetes_sd_configs:
+#           - role: node
+
+#         relabel_configs:
+#           - action: labelmap
+#             regex: __meta_kubernetes_node_label_(.+)
+#           - target_label: __address__
+#             replacement: kubernetes.default.svc:443
+#           - source_labels: [ __meta_kubernetes_node_name ]
+#             regex: (.+)
+#             target_label: __metrics_path__
+#             replacement: /api/v1/nodes/$1/proxy/metrics
+
+
+#       - job_name: 'kubernetes-nodes-cadvisor'
+
+#         # Default to scraping over https. If required, just disable this or change to
+#         # `http`.
+#         scheme: https
+
+#         # This TLS & bearer token file config is used to connect to the actual scrape
+#         # endpoints for cluster components. This is separate to discovery auth
+#         # configuration because discovery & scraping are two separate concerns in
+#         # Prometheus. The discovery auth config is automatic if Prometheus runs inside
+#         # the cluster. Otherwise, more config options have to be provided within the
+#         # <kubernetes_sd_config>.
+#         tls_config:
+#           ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+#           # If your node certificates are self-signed or use a different CA to the
+#           # master CA, then disable certificate verification below. Note that
+#           # certificate verification is an integral part of a secure infrastructure
+#           # so this should only be disabled in a controlled environment. You can
+#           # disable certificate verification by uncommenting the line below.
+#           #
+#           # insecure_skip_verify: true
+#         bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+#         kubernetes_sd_configs:
+#           - role: node
+
+#         # This configuration works only on kubelet 1.7.3+, because the scrape
+#         # endpoints for cAdvisor have changed. If you are using an older
+#         # version, change the replacement to:
+#         # replacement: /api/v1/nodes/$1:4194/proxy/metrics
+#         # More info: https://github.com/coreos/prometheus-operator/issues/633
+#         relabel_configs:
+#           - action: labelmap
+#             regex: __meta_kubernetes_node_label_(.+)
+#           - target_label: __address__
+#             replacement: kubernetes.default.svc:443
+#           - source_labels: [ __meta_kubernetes_node_name ]
+#             regex: (.+)
+#             target_label: __metrics_path__
+#             replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
+
+#         # Metric relabel configs to apply to samples before ingestion.
+#         # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs)
+#         # metric_relabel_configs:
+#         # - action: labeldrop
+#         #   regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone)
+
+# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
+# #      # Scrape config for service endpoints.
+# #      #
+# #      # The relabeling allows the actual service scrape endpoint to be configured
+# #      # via the following annotations:
+# #      #
+# #      # * `prometheus.io/scrape`: Only scrape services that have a value of
+# #      # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well.
+# #      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
+# #      # to set this to `https` & most likely set the `tls_config` of the scrape config.
+# #      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+# #      # * `prometheus.io/port`: If the metrics are exposed on a different port to the
+# #      # service then set this appropriately.
+# #      # * `prometheus.io/param_<parameter>`: If the metrics endpoint uses parameters
+# #      # then you can set any parameter
+# #      - job_name: 'kubernetes-service-endpoints'
+# #        honor_labels: true
+# #
+# #        kubernetes_sd_configs:
+# #          - role: endpoints
+# #
+# #        relabel_configs:
+# #          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape ]
+# #            action: keep
+# #            regex: true
+# #          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ]
+# #            action: drop
+# #            regex: true
+# #          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ]
+# #            action: replace
+# #            target_label: __scheme__
+# #            regex: (https?)
+# #          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ]
+# #            action: replace
+# #            target_label: __metrics_path__
+# #            regex: (.+)
+# #          - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ]
+# #            action: replace
+# #            target_label: __address__
+# #            regex: (.+?)(?::\d+)?;(\d+)
+# #            replacement: $1:$2
+# #          - action: labelmap
+# #            regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
+# #            replacement: __param_$1
+# #          - action: labelmap
+# #            regex: __meta_kubernetes_service_label_(.+)
+# #          - source_labels: [ __meta_kubernetes_namespace ]
+# #            action: replace
+# #            target_label: namespace
+# #          - source_labels: [ __meta_kubernetes_service_name ]
+# #            action: replace
+# #            target_label: service
+# #          - source_labels: [ __meta_kubernetes_pod_node_name ]
+# #            action: replace
+# #            target_label: node
+
+
+# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
+# #      # Scrape config for slow service endpoints; same as above, but with a larger
+# #      # timeout and a larger interval
+# #      #
+# #      # The relabeling allows the actual service scrape endpoint to be configured
+# #      # via the following annotations:
+# #      #
+# #      # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true`
+# #      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
+# #      # to set this to `https` & most likely set the `tls_config` of the scrape config.
+# #      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+# #      # * `prometheus.io/port`: If the metrics are exposed on a different port to the
+# #      # service then set this appropriately.
+# #      # * `prometheus.io/param_<parameter>`: If the metrics endpoint uses parameters
+# #      # then you can set any parameter
+# #      - job_name: 'kubernetes-service-endpoints-slow'
+# #        honor_labels: true
+# #
+# #        scrape_interval: 5m
+# #        scrape_timeout: 30s
+# #
+# #        kubernetes_sd_configs:
+# #          - role: endpoints
+# #
+# #        relabel_configs:
+# #          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ]
+# #            action: keep
+# #            regex: true
+# #          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ]
+# #            action: replace
+# #            target_label: __scheme__
+# #            regex: (https?)
+# #          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ]
+# #            action: replace
+# #            target_label: __metrics_path__
+# #            regex: (.+)
+# #          - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ]
+# #            action: replace
+# #            target_label: __address__
+# #            regex: (.+?)(?::\d+)?;(\d+)
+# #            replacement: $1:$2
+# #          - action: labelmap
+# #            regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
+# #            replacement: __param_$1
+# #          - action: labelmap
+# #            regex: __meta_kubernetes_service_label_(.+)
+# #          - source_labels: [ __meta_kubernetes_namespace ]
+# #            action: replace
+# #            target_label: namespace
+# #          - source_labels: [ __meta_kubernetes_service_name ]
+# #            action: replace
+# #            target_label: service
+# #          - source_labels: [ __meta_kubernetes_pod_node_name ]
+# #            action: replace
+# #            target_label: node
+# #
+# #      - job_name: 'prometheus-pushgateway'
+# #        honor_labels: true
+# #
+# #        kubernetes_sd_configs:
+# #          - role: service
+# #
+# #        relabel_configs:
+# #          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ]
+# #            action: keep
+# #            regex: pushgateway
+
+
+# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
+# #      # Example scrape config for probing services via the Blackbox Exporter.
+# #      #
+# #      # The relabeling allows the actual service scrape endpoint to be configured
+# #      # via the following annotations:
+# #      #
+# #      # * `prometheus.io/probe`: Only probe services that have a value of `true`
+# #      - job_name: 'kubernetes-services'
+# #        honor_labels: true
+# #
+# #        metrics_path: /probe
+# #        params:
+# #          module: [ http_2xx ]
+# #
+# #        kubernetes_sd_configs:
+# #          - role: service
+# #
+# #        relabel_configs:
+# #          - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ]
+# #            action: keep
+# #            regex: true
+# #          - source_labels: [ __address__ ]
+# #            target_label: __param_target
+# #          - target_label: __address__
+# #            replacement: blackbox
+# #          - source_labels: [ __param_target ]
+# #            target_label: instance
+# #          - action: labelmap
+# #            regex: __meta_kubernetes_service_label_(.+)
+# #          - source_labels: [ __meta_kubernetes_namespace ]
+# #            target_label: namespace
+# #          - source_labels: [ __meta_kubernetes_service_name ]
+# #            target_label: service
+
+
+#       # Example scrape config for pods
+#       #
+#       # The relabeling allows the actual pod scrape endpoint to be configured via the
+#       # following annotations:
+#       #
+#       # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`,
+#       # except if `prometheus.io/scrape-slow` is set to `true` as well.
+#       # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
+#       # to set this to `https` & most likely set the `tls_config` of the scrape config.
+#       # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+#       # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
+#       - job_name: 'kubernetes-pods'
+#         honor_labels: true
+
+#         kubernetes_sd_configs:
+#           - role: pod
+
+#         relabel_configs:
+#           - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ]
+#             action: keep
+#             regex: true
+#           - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ]
+#             action: drop
+#             regex: true
+#           - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ]
+#             action: replace
+#             regex: (https?)
+#             target_label: __scheme__
+#           - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ]
+#             action: replace
+#             target_label: __metrics_path__
+#             regex: (.+)
+#           - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ]
+#             action: replace
+#             regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
+#             replacement: '[$2]:$1'
+#             target_label: __address__
+#           - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ]
+#             action: replace
+#             regex: (\d+);((([0-9]+?)(\.|$)){4})
+#             replacement: $2:$1
+#             target_label: __address__
+#           - action: labelmap
+#             regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
+#             replacement: __param_$1
+#           - action: labelmap
+#             regex: __meta_kubernetes_pod_label_(.+)
+#           - source_labels: [ __meta_kubernetes_namespace ]
+#             action: replace
+#             target_label: namespace
+#           - source_labels: [ __meta_kubernetes_pod_name ]
+#             action: replace
+#             target_label: pod
+#           - source_labels: [ __meta_kubernetes_pod_phase ]
+#             regex: Pending|Succeeded|Failed|Completed
+#             action: drop
+#           - source_labels: [ __meta_kubernetes_pod_node_name ]
+#             action: replace
+#             target_label: node
+
 
-## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
-#      # Example Scrape config for pods which should be scraped slower. An useful example
-#      # would be stackriver-exporter which queries an API on every scrape of the pod
-#      #
-#      # The relabeling allows the actual pod scrape endpoint to be configured via the
-#      # following annotations:
-#      #
-#      # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true`
-#      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
-#      # to set this to `https` & most likely set the `tls_config` of the scrape config.
-#      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
-#      # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
-#      - job_name: 'kubernetes-pods-slow'
-#        honor_labels: true
-#
-#        scrape_interval: 5m
-#        scrape_timeout: 30s
-#
-#        kubernetes_sd_configs:
-#          - role: pod
-#
-#        relabel_configs:
-#          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ]
-#            action: keep
-#            regex: true
-#          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ]
-#            action: replace
-#            regex: (https?)
-#            target_label: __scheme__
-#          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ]
-#            action: replace
-#            target_label: __metrics_path__
-#            regex: (.+)
-#          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ]
-#            action: replace
-#            regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
-#            replacement: '[$2]:$1'
-#            target_label: __address__
-#          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ]
-#            action: replace
-#            regex: (\d+);((([0-9]+?)(\.|$)){4})
-#            replacement: $2:$1
-#            target_label: __address__
-#          - action: labelmap
-#            regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
-#            replacement: __param_$1
-#          - action: labelmap
-#            regex: __meta_kubernetes_pod_label_(.+)
-#          - source_labels: [ __meta_kubernetes_namespace ]
-#            action: replace
-#            target_label: namespace
-#          - source_labels: [ __meta_kubernetes_pod_name ]
-#            action: replace
-#            target_label: pod
-#          - source_labels: [ __meta_kubernetes_pod_phase ]
-#            regex: Pending|Succeeded|Failed|Completed
-#            action: drop
-#          - source_labels: [ __meta_kubernetes_pod_node_name ]
-#            action: replace
-#            target_label: node
+# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING
+# #      # Example Scrape config for pods which should be scraped slower. A useful example
+# #      # would be stackdriver-exporter which queries an API on every scrape of the pod
+# #      #
+# #      # The relabeling allows the actual pod scrape endpoint to be configured via the
+# #      # following annotations:
+# #      #
+# #      # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true`
+# #      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
+# #      # to set this to `https` & most likely set the `tls_config` of the scrape config.
+# #      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+# #      # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
+# #      - job_name: 'kubernetes-pods-slow'
+# #        honor_labels: true
+# #
+# #        scrape_interval: 5m
+# #        scrape_timeout: 30s
+# #
+# #        kubernetes_sd_configs:
+# #          - role: pod
+# #
+# #        relabel_configs:
+# #          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ]
+# #            action: keep
+# #            regex: true
+# #          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ]
+# #            action: replace
+# #            regex: (https?)
+# #            target_label: __scheme__
+# #          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ]
+# #            action: replace
+# #            target_label: __metrics_path__
+# #            regex: (.+)
+# #          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ]
+# #            action: replace
+# #            regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
+# #            replacement: '[$2]:$1'
+# #            target_label: __address__
+# #          - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ]
+# #            action: replace
+# #            regex: (\d+);((([0-9]+?)(\.|$)){4})
+# #            replacement: $2:$1
+# #            target_label: __address__
+# #          - action: labelmap
+# #            regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
+# #            replacement: __param_$1
+# #          - action: labelmap
+# #            regex: __meta_kubernetes_pod_label_(.+)
+# #          - source_labels: [ __meta_kubernetes_namespace ]
+# #            action: replace
+# #            target_label: namespace
+# #          - source_labels: [ __meta_kubernetes_pod_name ]
+# #            action: replace
+# #            target_label: pod
+# #          - source_labels: [ __meta_kubernetes_pod_phase ]
+# #            regex: Pending|Succeeded|Failed|Completed
+# #            action: drop
+# #          - source_labels: [ __meta_kubernetes_pod_node_name ]
+# #            action: replace
+# #            target_label: node

From 44113e8e4de42d26e0f9e22b190444270b52e113 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Fri, 6 Jun 2025 15:20:35 +0530
Subject: [PATCH 18/44] fixes in aws-gpu-test ci

---
 .github/workflows/aws-gpu-test.yaml |  23 ++-
 .github/workflows/gcp-gpu-test.yaml | 302 ++++++++++++++++++++++++++++
 2 files changed, 320 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/gcp-gpu-test.yaml

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index ebf5bcae..9305e9bd 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -14,6 +14,20 @@ on:
         options:
           - nvidia-dcgm
           - devzero-dcgm
+      cluster_version:
+        description: 'Kubernetes cluster version'
+        required: false
+        default: '1.30'
+        type: choice
+        options:
+          - '1.26'
+          - '1.27'
+          - '1.28'
+          - '1.29'
+          - '1.30'
+          - '1.31'
+          - '1.32'
+          - '1.33'
 
 permissions:
   id-token: write
@@ -25,6 +39,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
+      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}
 
     outputs:
       job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}
@@ -71,7 +86,7 @@ jobs:
           }
           EOF
           terraform init
-          terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER"
+          terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" -var='cluster_version=${{ env.CLUSTER_VERSION }}'
 
   install-and-validate:
     name: Install and Validate GPU Resources and ZXPorter
@@ -213,8 +228,6 @@ jobs:
           echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
           make docker-build docker-push IMG=${ZXPORTER_IMG}
           make deploy IMG=${ZXPORTER_IMG}
-
-          kubectl apply -f dist/install.yaml
           
           echo "Waiting for ZXPorter pods to be ready..."
           kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
@@ -235,7 +248,7 @@ jobs:
     name: Destroy Terraform
     runs-on: ubuntu-latest
     env:
-      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
+      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}
 
     if: always()
     needs:
@@ -270,4 +283,4 @@ jobs:
           }
           EOF
           terraform init
-          terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}"
+          terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" -var='cluster_version=${{ env.CLUSTER_VERSION }}'
diff --git a/.github/workflows/gcp-gpu-test.yaml b/.github/workflows/gcp-gpu-test.yaml
new file mode 100644
index 00000000..6c089b92
--- /dev/null
+++ b/.github/workflows/gcp-gpu-test.yaml
@@ -0,0 +1,302 @@
+name: GCP GPU Test
+
+on:
+  push:
+    branches:
+      - garvit/gcp-gpu-test
+  workflow_dispatch:
+    inputs:
+      dcgm_install_type:
+        description: 'DCGM install type'
+        required: false
+        default: 'devzero-dcgm'
+        type: choice
+        options:
+          - nvidia-dcgm
+          - devzero-dcgm
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  apply-terraform:
+    name: Apply Terraform
+    runs-on: ubuntu-latest
+    env:
+      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
+
+    outputs:
+      job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: 'Authenticate to Google Cloud'
+        id: 'auth'
+        uses: 'google-github-actions/auth@v2'
+        with:
+          workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool'
+          service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com'
+          create_credentials_file: true
+          export_environment_variables: true
+
+      - name: Export Terraform-friendly environment variables
+        run: |
+          echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
+          echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
+
+      - name: Generate Unique Job Identifier
+        id: job-identifier
+        shell: bash
+        run: |
+          SHORT_SHA=$(git rev-parse --short HEAD)
+          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
+            SUFFIX="dd"
+          else
+            SUFFIX="nd"
+          fi
+          JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}"
+          echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV
+          echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT
+
+      - name: Set up Terraform
+        uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: "1.11.3"
+
+      - name: Apply Terraform
+        working-directory: terraform/gcp
+        run: |
+          cat <<EOF > backend_override.tf
+          terraform {
+            backend "gcs" {
+              bucket  = "zxporter-tf-state"
+              prefix  = "${JOB_IDENTIFIER}/terraform.tfstate"
+            }
+          }
+          EOF
+          terraform init
+          terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER"
+
+  install-and-validate:
+    name: Install and Validate GPU Resources and ZXPorter
+    runs-on: ubuntu-latest
+    needs: apply-terraform
+    env:
+      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: 'Authenticate to Google Cloud'
+        id: 'auth'
+        uses: 'google-github-actions/auth@v2'
+        with:
+          workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool'
+          service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com'
+          create_credentials_file: true
+          export_environment_variables: true
+
+      - name: 'Set up Cloud SDK'
+        uses: 'google-github-actions/setup-gcloud@v2'
+        with:
+          version: '>= 363.0.0'
+
+      - name: Install gke-gcloud-auth-plugin
+        run: |
+          echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
+          curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+          sudo apt-get update
+          sudo apt-get install -y google-cloud-sdk-gke-gcloud-auth-plugin
+
+      - name: Configure Kubernetes Access
+        run: |
+          gcloud container clusters get-credentials ${{ needs.apply-terraform.outputs.job_identifier }} --zone us-central1 --project devzero-self-hosted
+
+      - name: Check GPU Availability
+        id: gpu_check
+        run: |
+          echo "Checking GPU resources on nodes..."
+          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
+            echo "GPU resources are available on the nodes."
+            echo "GPU_CHECK=true" >> $GITHUB_ENV
+          else
+            echo "GPU check failed"
+            echo "GPU_CHECK=false" >> $GITHUB_ENV
+          fi
+
+      - name: Install GPU Operator (if needed)
+        if: env.GPU_CHECK == 'false'
+        run: |
+          echo "GPU resources not found, installing GPU Operator..."
+          kubectl create ns gpu-operator
+          kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite
+          kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true
+          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \
+          helm repo update
+          INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0"
+          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
+            INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false"
+          fi
+          echo "Running: $INSTALL_CMD"
+          $INSTALL_CMD
+
+      - name: Check GPU Availability After Installing GPU Operator
+        if: env.GPU_CHECK == 'false'
+        run: |
+          echo "Re-checking GPU resources on nodes after GPU Operator installation..."
+          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
+            echo "GPU resources are available on the nodes."
+          else
+            echo "GPU check failed after GPU Operator installation"
+            exit 1
+          fi
+
+      - name: Check Nvidia DCGM DaemonSet
+        id: dcgm_check
+        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
+        run: |
+          echo "Checking if DCGM DaemonSet is installed..."
+          if kubectl get daemonset -A | grep -q dcgm; then
+            echo "Nvidia DCGM found, proceeding with validation."
+            echo "SKIP_INSTALL=false" >> $GITHUB_ENV
+          else
+            echo "Nvidia DCGM not found, skipping install and proceeding to destroy."
+            echo "SKIP_INSTALL=true" >> $GITHUB_ENV
+          fi
+
+      - name: Install DevZero DCGM
+        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
+        run: |
+          echo "Installing DCGM Exporter..."
+          kubectl create ns devzero-zxporter
+          curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/gke.yml | kubectl apply -f -
+
+      - name: Check DCGM DaemonSet After Installing DCGM Exporter
+        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
+        run: |
+          echo "Re-checking DCGM pods after DCGM Exporter installation..."
+          if kubectl get daemonset -A | grep -q dcgm; then
+            echo "DCGM DaemonSet is running."
+          else
+            echo "DCGM DaemonSet not running after installation"
+            exit 1
+          fi
+          
+      - name: Verify DCGM Pods and Prometheus Annotations
+        if: env.SKIP_INSTALL != 'true'
+        run: |
+          kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s
+          echo "Verifying DCGM pods and Prometheus annotations..."
+          kubectl get pods -A | grep dcgm-exporter | awk '
+          BEGIN { all_running = 1; pod_count = 0 }
+          {
+              pod_count++
+              status = $4
+              printf "Pod: %s/%s - Status: %s\n", $1, $2, status
+              if (status != "Running") all_running = 0
+          }
+          END {
+              printf "\nTotal Pods: %d\n", pod_count
+              printf "All Running: %s\n", (all_running ? "true" : "false")
+          }'
+          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done
+
+      - name: Install and Verify DeepSeek Workload
+        if: env.SKIP_INSTALL != 'true'
+        run: |
+          kubectl create ns deepseek
+          kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml    
+          
+          kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s
+          pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}')
+          
+          if [[ -n "$pod_status" ]]; then
+            echo "Pods are not in Running state. Failing the pipeline."
+            exit 1
+          else
+            echo "All pods are running successfully."
+          fi
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        if: env.SKIP_INSTALL != 'true'
+        with:
+          go-version: '1.22'
+          cache: true
+
+      - name: Install ZXPorter
+        if: env.SKIP_INSTALL != 'true'
+        run: |
+          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
+          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
+          make docker-build docker-push IMG=${ZXPORTER_IMG}
+          make deploy IMG=${ZXPORTER_IMG}
+
+          kubectl apply -f dist/install.yaml
+          
+          echo "Waiting for ZXPorter pods to be ready..."
+          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
+
+      - name: Test ZXPorter with Prometheus
+        if: env.SKIP_INSTALL != 'true'
+        run: |
+          kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter &
+          sleep 5
+          result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
+          if [[ -z "$result" || "$result" == [] ]]; then
+            echo "DCGM_FI_DEV_SM_CLOCK metric not found!"
+            exit 1
+          fi
+          echo "Metric found: $result"
+
+  destroy-terraform:
+    name: Destroy Terraform
+    runs-on: ubuntu-latest
+    env:
+      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
+
+    if: always()
+    needs:
+      - apply-terraform
+      - install-and-validate
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: 'Authenticate to Google Cloud'
+        id: 'auth'
+        uses: 'google-github-actions/auth@v2'
+        with:
+          workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool'
+          service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com'
+          create_credentials_file: true
+          export_environment_variables: true
+
+      - name: Export Terraform-friendly environment variables
+        run: |
+          echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
+          echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
+          
+      - name: Set up Terraform
+        uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: "1.11.3"
+
+      - name: Destroy Infrastructure
+        working-directory: terraform/gcp
+        run: |
+          cat <<EOF > backend_override.tf
+          terraform {
+            backend "gcs" {
+              bucket  = "zxporter-tf-state"
+              prefix  = "${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate"
+            }
+          }
+          EOF
+          terraform init
+          terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}"
\ No newline at end of file

From bcdc404f13f53c2b244679623acd4f0ce99e6fe2 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Fri, 6 Jun 2025 16:25:31 +0530
Subject: [PATCH 19/44] fix in aws gpu test ci

---
 .github/workflows/aws-gpu-test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 9305e9bd..64c6903a 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -236,7 +236,7 @@ jobs:
         if: env.SKIP_INSTALL != 'true'
         run: |
           kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter &
-          sleep 5
+          sleep 10
           result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
           if [[ -z "$result" || "$result" == [] ]]; then
             echo "DCGM_FI_DEV_SM_CLOCK metric not found!"

From 432da6764b6c377c8be3774bd50ea3eb908dbdbb Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Fri, 6 Jun 2025 21:33:50 +0530
Subject: [PATCH 20/44] fix aws-gpu-test ci

---
 .github/workflows/aws-gpu-test.yaml |   4 +-
 .github/workflows/gcp-gpu-test.yaml | 302 ----------------------------
 2 files changed, 2 insertions(+), 304 deletions(-)
 delete mode 100644 .github/workflows/gcp-gpu-test.yaml

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 64c6903a..a98b8864 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -238,11 +238,11 @@ jobs:
           kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter &
           sleep 10
           result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
+          echo "Metric found: $result"
           if [[ -z "$result" || "$result" == [] ]]; then
             echo "DCGM_FI_DEV_SM_CLOCK metric not found!"
-            exit 1
           fi
-          echo "Metric found: $result"
+          
 
   destroy-terraform:
     name: Destroy Terraform
diff --git a/.github/workflows/gcp-gpu-test.yaml b/.github/workflows/gcp-gpu-test.yaml
deleted file mode 100644
index 6c089b92..00000000
--- a/.github/workflows/gcp-gpu-test.yaml
+++ /dev/null
@@ -1,302 +0,0 @@
-name: GCP GPU Test
-
-on:
-  push:
-    branches:
-      - garvit/gcp-gpu-test
-  workflow_dispatch:
-    inputs:
-      dcgm_install_type:
-        description: 'DCGM install type'
-        required: false
-        default: 'devzero-dcgm'
-        type: choice
-        options:
-          - nvidia-dcgm
-          - devzero-dcgm
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  apply-terraform:
-    name: Apply Terraform
-    runs-on: ubuntu-latest
-    env:
-      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
-
-    outputs:
-      job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}
-
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@v4
-
-      - name: 'Authenticate to Google Cloud'
-        id: 'auth'
-        uses: 'google-github-actions/auth@v2'
-        with:
-          workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool'
-          service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com'
-          create_credentials_file: true
-          export_environment_variables: true
-
-      - name: Export Terraform-friendly environment variables
-        run: |
-          echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
-          echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
-
-      - name: Generate Unique Job Identifier
-        id: job-identifier
-        shell: bash
-        run: |
-          SHORT_SHA=$(git rev-parse --short HEAD)
-          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
-            SUFFIX="dd"
-          else
-            SUFFIX="nd"
-          fi
-          JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}"
-          echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV
-          echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT
-
-      - name: Set up Terraform
-        uses: hashicorp/setup-terraform@v3
-        with:
-          terraform_version: "1.11.3"
-
-      - name: Apply Terraform
-        working-directory: terraform/gcp
-        run: |
-          cat <<EOF > backend_override.tf
-          terraform {
-            backend "gcs" {
-              bucket  = "zxporter-tf-state"
-              prefix  = "${JOB_IDENTIFIER}/terraform.tfstate"
-            }
-          }
-          EOF
-          terraform init
-          terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER"
-
-  install-and-validate:
-    name: Install and Validate GPU Resources and ZXPorter
-    runs-on: ubuntu-latest
-    needs: apply-terraform
-    env:
-      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
-
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@v4
-
-      - name: 'Authenticate to Google Cloud'
-        id: 'auth'
-        uses: 'google-github-actions/auth@v2'
-        with:
-          workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool'
-          service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com'
-          create_credentials_file: true
-          export_environment_variables: true
-
-      - name: 'Set up Cloud SDK'
-        uses: 'google-github-actions/setup-gcloud@v2'
-        with:
-          version: '>= 363.0.0'
-
-      - name: Install gke-gcloud-auth-plugin
-        run: |
-          echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
-          curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
-          sudo apt-get update
-          sudo apt-get install -y google-cloud-sdk-gke-gcloud-auth-plugin
-
-      - name: Configure Kubernetes Access
-        run: |
-          gcloud container clusters get-credentials ${{ needs.apply-terraform.outputs.job_identifier }} --zone us-central1 --project devzero-self-hosted
-
-      - name: Check GPU Availability
-        id: gpu_check
-        run: |
-          echo "Checking GPU resources on nodes..."
-          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
-            echo "GPU resources are available on the nodes."
-            echo "GPU_CHECK=true" >> $GITHUB_ENV
-          else
-            echo "GPU check failed"
-            echo "GPU_CHECK=false" >> $GITHUB_ENV
-          fi
-
-      - name: Install GPU Operator (if needed)
-        if: env.GPU_CHECK == 'false'
-        run: |
-          echo "GPU resources not found, installing GPU Operator..."
-          kubectl create ns gpu-operator
-          kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite
-          kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true
-          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \
-          helm repo update
-          INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0"
-          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
-            INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false"
-          fi
-          echo "Running: $INSTALL_CMD"
-          $INSTALL_CMD
-
-      - name: Check GPU Availability After Installing GPU Operator
-        if: env.GPU_CHECK == 'false'
-        run: |
-          echo "Re-checking GPU resources on nodes after GPU Operator installation..."
-          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
-            echo "GPU resources are available on the nodes."
-          else
-            echo "GPU check failed after GPU Operator installation"
-            exit 1
-          fi
-
-      - name: Check Nvidia DCGM DaemonSet
-        id: dcgm_check
-        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
-        run: |
-          echo "Checking if DCGM DaemonSet is installed..."
-          if kubectl get daemonset -A | grep -q dcgm; then
-            echo "Nvidia DCGM found, proceeding with validation."
-            echo "SKIP_INSTALL=false" >> $GITHUB_ENV
-          else
-            echo "Nvidia DCGM not found, skipping install and proceeding to destroy."
-            echo "SKIP_INSTALL=true" >> $GITHUB_ENV
-          fi
-
-      - name: Install DevZero DCGM
-        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
-        run: |
-          echo "Installing DCGM Exporter..."
-          kubectl create ns devzero-zxporter
-          curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/gke.yml | kubectl apply -f -
-
-      - name: Check DCGM DaemonSet After Installing DCGM Exporter
-        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
-        run: |
-          echo "Re-checking DCGM pods after DCGM Exporter installation..."
-          if kubectl get daemonset -A | grep -q dcgm; then
-            echo "DCGM DaemonSet is running."
-          else
-            echo "DCGM DaemonSet not running after installation"
-            exit 1
-          fi
-          
-      - name: Verify DCGM Pods and Prometheus Annotations
-        if: env.SKIP_INSTALL != 'true'
-        run: |
-          kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s
-          echo "Verifying DCGM pods and Prometheus annotations..."
-          kubectl get pods -A | grep dcgm-exporter | awk '
-          BEGIN { all_running = 1; pod_count = 0 }
-          {
-              pod_count++
-              status = $4
-              printf "Pod: %s/%s - Status: %s\n", $1, $2, status
-              if (status != "Running") all_running = 0
-          }
-          END {
-              printf "\nTotal Pods: %d\n", pod_count
-              printf "All Running: %s\n", (all_running ? "true" : "false")
-          }'
-          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done
-
-      - name: Install and Verify DeepSeek Workload
-        if: env.SKIP_INSTALL != 'true'
-        run: |
-          kubectl create ns deepseek
-          kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml    
-          
-          kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s
-          pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}')
-          
-          if [[ -n "$pod_status" ]]; then
-            echo "Pods are not in Running state. Failing the pipeline."
-            exit 1
-          else
-            echo "All pods are running successfully."
-          fi
-
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        if: env.SKIP_INSTALL != 'true'
-        with:
-          go-version: '1.22'
-          cache: true
-
-      - name: Install ZXPorter
-        if: env.SKIP_INSTALL != 'true'
-        run: |
-          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
-          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
-          make docker-build docker-push IMG=${ZXPORTER_IMG}
-          make deploy IMG=${ZXPORTER_IMG}
-
-          kubectl apply -f dist/install.yaml
-          
-          echo "Waiting for ZXPorter pods to be ready..."
-          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
-
-      - name: Test ZXPorter with Prometheus
-        if: env.SKIP_INSTALL != 'true'
-        run: |
-          kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter &
-          sleep 5
-          result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
-          if [[ -z "$result" || "$result" == [] ]]; then
-            echo "DCGM_FI_DEV_SM_CLOCK metric not found!"
-            exit 1
-          fi
-          echo "Metric found: $result"
-
-  destroy-terraform:
-    name: Destroy Terraform
-    runs-on: ubuntu-latest
-    env:
-      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
-
-    if: always()
-    needs:
-      - apply-terraform
-      - install-and-validate
-
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@v4
-
-      - name: 'Authenticate to Google Cloud'
-        id: 'auth'
-        uses: 'google-github-actions/auth@v2'
-        with:
-          workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool'
-          service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com'
-          create_credentials_file: true
-          export_environment_variables: true
-
-      - name: Export Terraform-friendly environment variables
-        run: |
-          echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
-          echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV
-          
-      - name: Set up Terraform
-        uses: hashicorp/setup-terraform@v3
-        with:
-          terraform_version: "1.11.3"
-
-      - name: Destroy Infrastructure
-        working-directory: terraform/gcp
-        run: |
-          cat <<EOF > backend_override.tf
-          terraform {
-            backend "gcs" {
-              bucket  = "zxporter-tf-state"
-              prefix  = "${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate"
-            }
-          }
-          EOF
-          terraform init
-          terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}"
\ No newline at end of file

From 0ed7300d1ef0b60fba1e583c8659ab5c687512e9 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sat, 7 Jun 2025 08:53:09 +0530
Subject: [PATCH 21/44] fix aws-gpu-test ci

---
 .github/workflows/aws-gpu-test.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index a98b8864..536c97a1 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -241,6 +241,7 @@ jobs:
           echo "Metric found: $result"
           if [[ -z "$result" || "$result" == [] ]]; then
             echo "DCGM_FI_DEV_SM_CLOCK metric not found!"
+            exit 1
           fi
           
 

From e87559f9a97ebf25b62b3ead7f243eb8442b3540 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sat, 7 Jun 2025 22:57:00 +0530
Subject: [PATCH 22/44] Added nvidia-device-plugin in AWS GPU test CI

---
 .github/workflows/aws-gpu-test.yaml           | 52 +++++++++---
 .../container-toolkit.yaml                    | 84 +++++++++++++++++++
 .../driver-installer.yaml                     | 81 ++++++++++++++++++
 3 files changed, 207 insertions(+), 10 deletions(-)
 create mode 100644 nvidia-device-plugin-prereq/container-toolkit.yaml
 create mode 100644 nvidia-device-plugin-prereq/driver-installer.yaml

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 536c97a1..af83e409 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -6,6 +6,14 @@ on:
       - garvit/aws-gpu-test
   workflow_dispatch:
     inputs:
+      gpu_install_type:
+        description: 'GPU installation type'
+        required: false
+        default: 'nvidia-device-plugin'
+        type: choice
+        options:
+          - gpu-operator
+          - nvidia-device-plugin
       dcgm_install_type:
         description: 'DCGM install type'
         required: false
@@ -38,6 +46,7 @@ jobs:
     name: Apply Terraform
     runs-on: ubuntu-latest
     env:
+      GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
       DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
       CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}
 
@@ -45,6 +54,16 @@ jobs:
       job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}
 
     steps:
+      - name: Validate Inputs
+        run: |
+          echo "GPU_INSTALL_TYPE=${GPU_INSTALL_TYPE}"
+          echo "DCGM_INSTALL_TYPE=${DCGM_INSTALL_TYPE}"
+
+          if [[ "$GPU_INSTALL_TYPE" == "nvidia-device-plugin" && "$DCGM_INSTALL_TYPE" != "devzero-dcgm" ]]; then
+            echo "Error: When GPU_INSTALL_TYPE is 'nvidia-device-plugin', DCGM_INSTALL_TYPE must be 'devzero-dcgm'."
+            exit 1
+          fi
+
       - name: Checkout Repository
         uses: actions/checkout@v4
 
@@ -93,6 +112,7 @@ jobs:
     runs-on: ubuntu-latest
     needs: apply-terraform
     env:
+      GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
       DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
 
     steps:
@@ -122,7 +142,7 @@ jobs:
           fi
 
       - name: Install GPU Operator (if needed)
-        if: env.GPU_CHECK == 'false'
+        if: env.GPU_CHECK == 'false' && env.GPU_INSTALL_TYPE == 'gpu-operator'
         run: |
           echo "GPU resources not found, installing GPU Operator..."
           kubectl create ns gpu-operator
@@ -137,6 +157,20 @@ jobs:
           echo "Running: $INSTALL_CMD"
           $INSTALL_CMD
 
+      - name: Install Nvidia Device Plugin
+        if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false'
+        run: |
+          echo "Installing Nvidia Device Plugin..."
+          kubectl label node "$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')" nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite
+
+          kubectl apply -f nvidia-device-plugin-prereq
+          helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
+          helm repo update
+          helm upgrade -i nvdp nvdp/nvidia-device-plugin \
+            --namespace nvidia-device-plugin \
+            --create-namespace \
+            --version 0.17.1
+
       - name: Check GPU Availability After Installing GPU Operator
         if: env.GPU_CHECK == 'false'
         run: |
@@ -155,10 +189,9 @@ jobs:
           echo "Checking if DCGM DaemonSet is installed..."
           if kubectl get daemonset -A | grep -q dcgm; then
             echo "Nvidia DCGM found, proceeding with validation."
-            echo "SKIP_INSTALL=false" >> $GITHUB_ENV
           else
-            echo "Nvidia DCGM not found, skipping install and proceeding to destroy."
-            echo "SKIP_INSTALL=true" >> $GITHUB_ENV
+            echo "Nvidia DCGM not found."
+            exit 1
           fi
 
       - name: Install DevZero DCGM
@@ -180,9 +213,12 @@ jobs:
           fi
           
       - name: Verify DCGM Pods and Prometheus Annotations
-        if: env.SKIP_INSTALL != 'true'
         run: |
-          kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s
+          NAMESPACE="devzero-zxporter"
+          if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then
+            NAMESPACE="gpu-operator"
+          fi
+          kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s
           echo "Verifying DCGM pods and Prometheus annotations..."
           kubectl get pods -A | grep dcgm-exporter | awk '
           BEGIN { all_running = 1; pod_count = 0 }
@@ -199,7 +235,6 @@ jobs:
           kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done
 
       - name: Install and Verify DeepSeek Workload
-        if: env.SKIP_INSTALL != 'true'
         run: |
           kubectl create ns deepseek
           kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml    
@@ -216,13 +251,11 @@ jobs:
 
       - name: Set up Go
         uses: actions/setup-go@v5
-        if: env.SKIP_INSTALL != 'true'
         with:
           go-version: '1.22'
           cache: true
 
       - name: Install ZXPorter
-        if: env.SKIP_INSTALL != 'true'
         run: |
           ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
           echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
@@ -233,7 +266,6 @@ jobs:
           kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
 
       - name: Test ZXPorter with Prometheus
-        if: env.SKIP_INSTALL != 'true'
         run: |
           kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter &
           sleep 10
diff --git a/nvidia-device-plugin-prereq/container-toolkit.yaml b/nvidia-device-plugin-prereq/container-toolkit.yaml
new file mode 100644
index 00000000..17ada11b
--- /dev/null
+++ b/nvidia-device-plugin-prereq/container-toolkit.yaml
@@ -0,0 +1,92 @@
+# DaemonSet that starts a privileged Amazon Linux container on every node
+# labelled nvidia.com/gpu.present=true, installs nvidia-container-toolkit with
+# yum and runs `nvidia-ctk runtime configure` for containerd.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-toolkit-installer
+  namespace: nvidia-device-plugin
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-toolkit-installer
+  template:
+    metadata:
+      labels:
+        name: nvidia-toolkit-installer
+    spec:
+      nodeSelector:
+        nvidia.com/gpu.present: "true"
+      hostPID: true
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+        - key: "CriticalAddonsOnly"
+          operator: "Exists"
+        - effect: NoSchedule
+          key: node-role.kubernetes.io/control-plane
+        - effect: NoSchedule
+          key: node-role.kubernetes.io/master
+      containers:
+      - name: install-nvidia-toolkit
+        image: amazonlinux:2023
+        securityContext:
+          privileged: true
+        command:
+          - /bin/bash
+          - -c
+          - |
+            set -ex
+
+            # NOTE(review): yum and nvidia-ctk run inside this container's own
+            # filesystem and the script never references the /host mount;
+            # confirm the toolkit ends up where the node's containerd can use it.
+
+            # Add NVIDIA repo (-f makes curl fail on an HTTP error instead of
+            # saving the error page as the repo file)
+            curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
+              -o /etc/yum.repos.d/nvidia-container-toolkit.repo
+
+            # Install toolkit
+            yum install -y nvidia-container-toolkit
+
+            # Configure containerd (/etc/containerd is mounted from the host)
+            nvidia-ctk runtime configure --runtime=containerd
+
+            # Restart containerd; best effort, a failure here is ignored
+            systemctl restart containerd || true
+
+            # Keep the container running after the one-time setup
+            echo "NVIDIA container toolkit installed and configured."
+            sleep infinity
+        volumeMounts:
+        - name: root
+          mountPath: /host
+          mountPropagation: Bidirectional
+        - name: containerd-config
+          mountPath: /etc/containerd
+        - name: systemd
+          mountPath: /run/systemd
+        - name: modules
+          mountPath: /lib/modules
+          readOnly: true
+        - name: dev
+          mountPath: /dev
+      volumes:
+        - name: root
+          hostPath:
+            path: /
+        - name: containerd-config
+          hostPath:
+            path: /etc/containerd
+        - name: systemd
+          hostPath:
+            path: /run/systemd
+        - name: modules
+          hostPath:
+            path: /lib/modules
+        - name: dev
+          hostPath:
+            path: /dev
+      restartPolicy: Always
diff --git a/nvidia-device-plugin-prereq/driver-installer.yaml b/nvidia-device-plugin-prereq/driver-installer.yaml
new file mode 100644
index 00000000..7f04e106
--- /dev/null
+++ b/nvidia-device-plugin-prereq/driver-installer.yaml
@@ -0,0 +1,89 @@
+# DaemonSet for nodes labelled nvidia.com/gpu.present=true. Each pod runs two
+# privileged containers: the k8s-driver-manager image and a helper that
+# prepares /usr/local/nvidia on the host.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-driver-installer
+  namespace: nvidia-device-plugin
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-driver-installer
+  template:
+    metadata:
+      labels:
+        name: nvidia-driver-installer
+    spec:
+      hostPID: true
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+        - key: CriticalAddonsOnly
+          operator: Exists
+        - key: node-role.kubernetes.io/control-plane
+          effect: NoSchedule
+        - key: node-role.kubernetes.io/master
+          effect: NoSchedule
+      nodeSelector:
+        nvidia.com/gpu.present: "true"
+      containers:
+      - name: driver-installer
+        image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0
+        securityContext:
+          privileged: true
+        env:
+          - name: NVIDIA_DRIVER_VERSION
+            value: "535.129.03"  # or the version you require
+          - name: NODE_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: spec.nodeName
+        volumeMounts:
+          - name: root
+            mountPath: /host
+            mountPropagation: Bidirectional
+          - name: modules
+            mountPath: /lib/modules
+            readOnly: true
+          - name: nvidia-local
+            mountPath: /host/usr/local/nvidia
+      - name: fix-dcgm-dir
+        image: amazonlinux:2023
+        securityContext:
+          privileged: true
+        command: ["/bin/bash", "-c"]
+        args:
+          - |
+            set -ex
+            TARGET_DIR="/host/usr/local/nvidia"
+            # If it doesn't exist, symlink something useful
+            # NOTE(review): the nvidia-local volume (DirectoryOrCreate) is
+            # mounted at this path, so the directory presumably always exists;
+            # confirm this branch is reachable.
+            if [ ! -d "$TARGET_DIR" ]; then
+              mkdir -p /host/usr/local
+              ln -s /usr/lib64 "$TARGET_DIR"
+            fi
+            echo "/usr/local/nvidia set up for DCGM."
+            # Stay alive: DaemonSet pods restart containers that exit, so
+            # returning after a short sleep would leave this one in a restart loop.
+            sleep infinity
+        volumeMounts:
+          - name: nvidia-local
+            mountPath: /host/usr/local/nvidia
+          - name: root
+            mountPath: /host
+            mountPropagation: Bidirectional
+      volumes:
+        - name: root
+          hostPath:
+            path: /
+        - name: modules
+          hostPath:
+            path: /lib/modules
+        - name: nvidia-local
+          hostPath:
+            path: /usr/local/nvidia
+            type: DirectoryOrCreate

From eceac966ed3ae5a0f99d14708c69df04cb5dddb8 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sat, 7 Jun 2025 23:13:33 +0530
Subject: [PATCH 23/44] Added nvidia-device-plugin in AWS GPU test CI

---
 .github/workflows/aws-gpu-test.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index af83e409..c774a985 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -162,13 +162,12 @@ jobs:
         run: |
           echo "Installing Nvidia Device Plugin..."
           kubectl label node "$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')" nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite
-
+          kubectl create ns nvidia-device-plugin
           kubectl apply -f nvidia-device-plugin-prereq
           helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
           helm repo update
           helm upgrade -i nvdp nvdp/nvidia-device-plugin \
             --namespace nvidia-device-plugin \
-            --create-namespace \
             --version 0.17.1
 
       - name: Check GPU Availability After Installing GPU Operator

From a921e2fe00322516955b08037de4c7376393e0bf Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 8 Jun 2025 00:59:38 +0530
Subject: [PATCH 24/44] Added nvidia-device-plugin in AWS GPU test CI

---
 .github/workflows/aws-gpu-test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index c774a985..9a874773 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -217,7 +217,7 @@ jobs:
           if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then
             NAMESPACE="gpu-operator"
           fi
-          kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s
+          kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s
           echo "Verifying DCGM pods and Prometheus annotations..."
           kubectl get pods -A | grep dcgm-exporter | awk '
           BEGIN { all_running = 1; pod_count = 0 }

From ca3ed8767c826fd86aed90320f717aa6413ddf70 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Sun, 8 Jun 2025 01:30:36 +0530
Subject: [PATCH 25/44] Added nvidia-device-plugin in AWS GPU test CI

---
 .github/workflows/aws-gpu-test.yaml | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 9a874773..7c29b76f 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -266,15 +266,29 @@ jobs:
 
       - name: Test ZXPorter with Prometheus
         run: |
-          kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter &
-          sleep 10
+          kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter > pf.log 2>&1 &
+          PF_PID=$!
+          sleep 5
+          MAX_RETRIES=6
+          for i in $(seq 1 $MAX_RETRIES); do
+            if curl -s "http://localhost:9090/-/ready" >/dev/null; then
+              echo "Prometheus port-forward is ready."
+              break
+            fi
+            echo "[$i/$MAX_RETRIES] Waiting for Prometheus to become ready..."
+            sleep 5
+          done
+
           result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
+          kill $PF_PID || true
+
           echo "Metric found: $result"
           if [[ -z "$result" || "$result" == [] ]]; then
-            echo "DCGM_FI_DEV_SM_CLOCK metric not found!"
+            echo "❌ DCGM_FI_DEV_SM_CLOCK metric not found!"
+            echo "Port-forward log:"
+            cat pf.log
             exit 1
           fi
-          
 
   destroy-terraform:
     name: Destroy Terraform

From 9696cd7f68c818ef0a349f437dd0c92ef4e21f74 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 03:18:11 +0530
Subject: [PATCH 26/44] test karpenter in aws gpu test CI

---
 .github/workflows/aws-gpu-test.yaml |  68 +++++-
 terraform/aws/main.tf               | 311 +++++++++++++++++++++++++++-
 terraform/aws/outputs.tf            |  13 ++
 terraform/aws/terraform.tfvars      |   3 +-
 terraform/aws/variables.tf          |   5 +
 5 files changed, 391 insertions(+), 9 deletions(-)
 create mode 100644 terraform/aws/outputs.tf

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 7c29b76f..577b5edb 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -36,6 +36,14 @@ on:
           - '1.31'
           - '1.32'
           - '1.33'
+      karpenter_version:
+        description: 'Karpenter Version'
+        required: false
+        default: '0.37.7'
+        type: choice
+        options:
+          - 'no_karpenter'
+          - '0.37.7'
 
 permissions:
   id-token: write
@@ -114,6 +122,7 @@ jobs:
     env:
       GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
       DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
+      Karpenter_VERSION: ${{ github.event.inputs.karpenter_version || '0.37.7' }}
 
     steps:
       - name: Checkout Repository
@@ -129,6 +138,29 @@ jobs:
         run: |
           aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }}
 
+      - name: Install Karpenter (if needed)
+        if: env.Karpenter_VERSION != 'no_karpenter'
+        run: |
+          echo "Installing Karpenter..."
+          AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
+          CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${{ needs.apply-terraform.outputs.job_identifier }}" --query "cluster.endpoint" --output text)"
+          KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}"
+          # Karpenter charts from v0.33 on read settings.clusterEndpoint and
+          # settings.interruptionQueue; the older settings.aws.* keys are no longer read.
+          helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
+            --version "${{ env.KARPENTER_VERSION}}" \
+            --namespace kube-system \
+            --create-namespace \
+            --set settings.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \
+            --set settings.clusterEndpoint="${CLUSTER_ENDPOINT}" \
+            --set settings.interruptionQueue="${{ needs.apply-terraform.outputs.job_identifier }}-karpenter-interruption" \
+            --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \
+            --set controller.resources.requests.cpu="1" \
+            --set controller.resources.requests.memory="1Gi" \
+            --set controller.resources.limits.cpu="1" \
+            --set controller.resources.limits.memory="1Gi" \
+            --wait
+
       - name: Check GPU Availability
         id: gpu_check
         run: |
@@ -161,7 +193,7 @@ jobs:
         if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false'
         run: |
           echo "Installing Nvidia Device Plugin..."
-          kubectl label node "$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')" nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite
+          kubectl get nodes -l node_type=gpu -o jsonpath='{.items[*].metadata.name}' | xargs -I {} kubectl label node {} nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite
           kubectl create ns nvidia-device-plugin
           kubectl apply -f nvidia-device-plugin-prereq
           helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
@@ -290,6 +322,40 @@ jobs:
             exit 1
           fi
 
+      - name: Test Karpenter
+        if: inputs.karpenter_version != 'no_karpenter'
+        run: |
+          echo "Verifying Karpenter installation..."
+          kubectl port-forward -n kube-system service/karpenter 8000:8000 > /dev/null 2>&1 &
+          PF_PID=$!
+          
+          # Allow port-forward to establish
+          sleep 5
+          
+          MAX_RETRIES=6
+          HEALTH=""
+          
+          for i in $(seq 1 $MAX_RETRIES); do
+            if curl -s http://localhost:8000/metrics | grep -q "controller_runtime_max_concurrent_reconciles"; then
+              HEALTH="OK"
+              break
+            fi
+            echo "[$i/$MAX_RETRIES] Waiting for Karpenter to become ready..."
+            sleep 10
+          done
+          
+          # Cleanup port-forward
+          kill $PF_PID || true
+          
+          if [ "$HEALTH" == "OK" ]; then
+            echo "Karpenter is healthy ✅"
+          else
+            echo "Karpenter health check failed ❌"
+            kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter
+            kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=50
+            exit 1
+          fi
+
   destroy-terraform:
     name: Destroy Terraform
     runs-on: ubuntu-latest
diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf
index e3ef7095..248e45b3 100644
--- a/terraform/aws/main.tf
+++ b/terraform/aws/main.tf
@@ -2,6 +2,9 @@ provider "aws" {
   region = "us-east-1"
 }
 
+data "aws_caller_identity" "current" {}
+
+# VPC Configuration
 module "vpc" {
   source = "terraform-aws-modules/vpc/aws"
 
@@ -27,9 +30,213 @@ module "vpc" {
   private_subnet_tags = {
     "kubernetes.io/cluster/${var.cluster_name}" = "shared"
     "kubernetes.io/role/internal-elb"           = "1"
+    "karpenter.sh/discovery"                    = "${var.cluster_name}"  # Added Karpenter discovery tag
   }
 }
 
+# IAM Roles and Policies for Karpenter
+resource "aws_iam_role" "karpenter_node_role" {
+  name = "KarpenterNodeRole-${var.cluster_name}"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect    = "Allow"
+        Principal = {
+          Service = "ec2.amazonaws.com"
+        }
+        Action   = "sts:AssumeRole"
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_node_role_policy_attachment" {
+  role       = aws_iam_role.karpenter_node_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_node_ssm_policy_attachment" {
+  role       = aws_iam_role.karpenter_node_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_node_registry_policy_attachment" {
+  role       = aws_iam_role.karpenter_node_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryPullOnly"
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_node_admin_policy_attachment" {
+  role       = aws_iam_role.karpenter_node_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
+}
+
+# IAM Role for Karpenter Controller (with OIDC Trust Relationship)
+resource "aws_iam_role" "karpenter_controller_role" {
+  name = "KarpenterControllerRole-${var.cluster_name}"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect    = "Allow"
+        Principal = {
+          Federated = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/oidc.eks.${var.region}.amazonaws.com/id/${split("/id/", module.eks.cluster_oidc_issuer_url)[1]}"
+        }
+        Action   = "sts:AssumeRoleWithWebIdentity"
+        Condition = {
+          StringEquals = {
+            "oidc.eks.${var.region}.amazonaws.com/id/${split("/id/", module.eks.cluster_oidc_issuer_url)[1]}:sub" = "system:serviceaccount:kube-system:karpenter"
+          }
+        }
+      }
+    ]
+  })
+}
+
+resource "aws_iam_policy" "karpenter_controller_policy" {
+  name        = "KarpenterControllerPolicy-${var.cluster_name}"
+  description = "Custom Karpenter controller policy for managing EC2 instances, IAM roles, and EKS."
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = [
+          "ssm:GetParameter",
+          "ec2:DescribeImages",
+          "ec2:RunInstances",
+          "ec2:DescribeSubnets",
+          "ec2:DescribeSecurityGroups",
+          "ec2:DescribeLaunchTemplates",
+          "ec2:DescribeInstances",
+          "ec2:DescribeInstanceTypes",
+          "ec2:DescribeInstanceTypeOfferings",
+          "ec2:DeleteLaunchTemplate",
+          "ec2:CreateTags",
+          "ec2:CreateLaunchTemplate",
+          "ec2:CreateFleet",
+          "ec2:DescribeSpotPriceHistory",
+          "pricing:GetProducts"
+        ]
+        Effect = "Allow"
+        Resource = "*"
+        Sid = "Karpenter"
+      },
+      {
+        Action = "ec2:TerminateInstances"
+        Condition = {
+          StringLike = {
+            "ec2:ResourceTag/karpenter.sh/nodepool" = "*"
+          }
+        }
+        Effect = "Allow"
+        Resource = "*"
+        Sid = "ConditionalEC2Termination"
+      },
+      {
+        Effect = "Allow"
+        Action = "iam:PassRole"
+        Resource = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/KarpenterNodeRole-${var.cluster_name}"
+        Sid = "PassNodeIAMRole"
+      },
+      {
+        Effect = "Allow"
+        Action = "eks:DescribeCluster"
+        Resource = "arn:aws:eks:${var.region}:${data.aws_caller_identity.current.account_id}:cluster/${var.cluster_name}"
+        Sid = "EKSClusterEndpointLookup"
+      },
+      {
+        Sid = "AllowScopedInstanceProfileCreationActions"
+        Effect = "Allow"
+        Resource = "*"
+        Action = ["iam:CreateInstanceProfile"]
+        Condition = {
+          StringEquals = {
+            "aws:RequestTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:RequestTag/topology.kubernetes.io/region"           = "${var.region}"
+          }
+          StringLike = {
+            "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+          }
+        }
+      },
+      {
+        Sid = "AllowScopedInstanceProfileTagActions"
+        Effect = "Allow"
+        Resource = "*"
+        Action = ["iam:TagInstanceProfile"]
+        Condition = {
+          StringEquals = {
+            "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:ResourceTag/topology.kubernetes.io/region"           = "${var.region}"
+            "aws:RequestTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:RequestTag/topology.kubernetes.io/region"           = "${var.region}"
+          }
+          StringLike = {
+            "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+            "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+          }
+        }
+      },
+      {
+        Sid = "AllowScopedInstanceProfileActions"
+        Effect = "Allow"
+        Resource = "*"
+        Action = [
+          "iam:AddRoleToInstanceProfile",
+          "iam:RemoveRoleFromInstanceProfile",
+          "iam:DeleteInstanceProfile"
+        ]
+        Condition = {
+          StringEquals = {
+            "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:ResourceTag/topology.kubernetes.io/region"           = "${var.region}"
+          }
+          StringLike = {
+            "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+          }
+        }
+      },
+      {
+        Sid = "AllowInstanceProfileReadActions"
+        Effect = "Allow"
+        Resource = "*"
+        Action = "iam:GetInstanceProfile"
+      },
+      {
+        Effect = "Allow"
+        Action = [
+          "sqs:DeleteMessage",
+          "sqs:GetQueueUrl",
+          "sqs:GetQueueAttributes",
+          "sqs:ReceiveMessage"
+        ]
+        Resource = "*"
+        Sid      = "KarpenterInterruptionQueue"
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_controller_custom_policy_attachment" {
+  role       = aws_iam_role.karpenter_controller_role.name
+  policy_arn = aws_iam_policy.karpenter_controller_policy.arn
+}
+
+
+resource "aws_iam_role_policy_attachment" "karpenter_controller_policy_attachment" {
+  role       = aws_iam_role.karpenter_controller_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_controller_admin_policy_attachment" {
+  role       = aws_iam_role.karpenter_controller_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
+}
+
+# EKS Cluster Configuration
 module "eks" {
   source          = "terraform-aws-modules/eks/aws"
 
@@ -41,27 +248,117 @@ module "eks" {
   subnet_ids      = module.vpc.private_subnets
 
   enable_irsa     = true
-
-  cluster_endpoint_public_access = true
   enable_cluster_creator_admin_permissions = true
+  cluster_endpoint_public_access = true
   cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"]
 
+  # Disable automatic node IAM role creation
+  create_node_iam_role = false
+
   eks_managed_node_groups = {
     gpu_nodes = {
       instance_types = ["g6.4xlarge"]
       desired_size   = 1
-      min_size      = 1
-      max_size      = 1
-
-      ami_type      = "AL2023_x86_64_NVIDIA"
+      min_size       = 1
+      max_size       = 1
 
+      ami_type       = "AL2023_x86_64_NVIDIA"
       use_custom_launch_template = false
 
-      disk_size     = 200
+      metadata_options = {
+        http_endpoint               = "enabled"
+        http_tokens                 = "optional" 
+        http_put_response_hop_limit = 2           
+        instance_metadata_tags      = "enabled"
+      }
 
+      disk_size      = 200
       labels = {
         node_type = "gpu"
       }
+
+      # Attach the IAM role for Karpenter to the managed node group
+      iam_instance_profile = aws_iam_role.karpenter_node_role.name
     }
   }
 }
+
+# Security group that carries the karpenter.sh/discovery tag for this cluster.
+resource "aws_security_group" "karpenter_sg" {
+  name        = "karpenter-sg-${var.cluster_name}"
+  description = "Karpenter security group"
+  vpc_id      = module.vpc.vpc_id # without vpc_id the group lands in the default VPC
+
+  tags = {
+    "karpenter.sh/discovery" = "${var.cluster_name}"
+  }
+}
+
+# NOTE(review): opens every TCP port to 0.0.0.0/0 — confirm this is intended.
+resource "aws_security_group_rule" "karpenter_inbound" {
+  security_group_id = aws_security_group.karpenter_sg.id
+  type              = "ingress"
+  from_port         = 0
+  to_port           = 65535
+  protocol          = "tcp"
+  cidr_blocks       = ["0.0.0.0/0"]
+}
+
+// Replace the existing aws_sqs_queue resource
+resource "aws_sqs_queue" "karpenter_interruption_queue" {
+  name = "${var.cluster_name}-karpenter-interruption"  // Changed name to be more specific
+  sqs_managed_sse_enabled = true
+
+  tags = {
+    "karpenter.sh/discovery" = var.cluster_name
+  }
+}
+
+// Update the SQS queue policy
+resource "aws_sqs_queue_policy" "karpenter_interruption_queue_policy" {
+  queue_url = aws_sqs_queue.karpenter_interruption_queue.url
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Sid    = "AllowKarpenterController"
+        Effect = "Allow"
+        Principal = {
+          AWS = aws_iam_role.karpenter_controller_role.arn
+        }
+        Action = [
+          "sqs:DeleteMessage",
+          "sqs:GetQueueUrl",
+          "sqs:GetQueueAttributes",
+          "sqs:ReceiveMessage"
+        ]
+        Resource = aws_sqs_queue.karpenter_interruption_queue.arn
+      },
+      {
+        Sid    = "EC2SpotInterruption"
+        Effect = "Allow"
+        Principal = {
+          Service = ["events.amazonaws.com", "sqs.amazonaws.com"]
+        }
+        Action   = ["sqs:SendMessage"]
+        Resource = aws_sqs_queue.karpenter_interruption_queue.arn
+      }
+    ]
+  })
+}
+
+resource "aws_cloudwatch_event_rule" "spot_interruption" {
+  name        = "${var.cluster_name}-spot-interruption"
+  description = "Capture EC2 Spot Instance interruption notices"
+
+  event_pattern = jsonencode({
+    source      = ["aws.ec2"]
+    detail-type = ["EC2 Spot Instance Interruption Warning"]
+  })
+}
+
+resource "aws_cloudwatch_event_target" "spot_interruption" {
+  target_id = "KarpenterInterruptionQueueTarget"
+  rule      = aws_cloudwatch_event_rule.spot_interruption.name
+  arn       = aws_sqs_queue.karpenter_interruption_queue.arn
+}
\ No newline at end of file
diff --git a/terraform/aws/outputs.tf b/terraform/aws/outputs.tf
new file mode 100644
index 00000000..87ec9d95
--- /dev/null
+++ b/terraform/aws/outputs.tf
@@ -0,0 +1,18 @@
+# Looks up the EKS cluster by name. depends_on defers the read until the eks
+# module has been applied; without it the lookup runs at plan time, before the
+# cluster exists on a first apply.
+data "aws_eks_cluster" "eks_cluster" {
+  name       = var.cluster_name
+  depends_on = [module.eks]
+}
+
+# OIDC issuer URL as reported by the eks module
+output "oidc_provider_url" {
+  value = module.eks.cluster_oidc_issuer_url
+}
+
+# Output the cluster endpoint
+output "cluster_endpoint" {
+  value       = data.aws_eks_cluster.eks_cluster.endpoint
+  description = "The endpoint of the EKS cluster"
+}
\ No newline at end of file
diff --git a/terraform/aws/terraform.tfvars b/terraform/aws/terraform.tfvars
index e343f0bb..6e098115 100644
--- a/terraform/aws/terraform.tfvars
+++ b/terraform/aws/terraform.tfvars
@@ -1,2 +1,3 @@
 cluster_name    = "devzero-gpu-cluster"
-cluster_version = "1.30"
\ No newline at end of file
+cluster_version = "1.30"
+region          = "us-east-1"
\ No newline at end of file
diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf
index b9738fb3..741aed7d 100644
--- a/terraform/aws/variables.tf
+++ b/terraform/aws/variables.tf
@@ -7,3 +7,8 @@ variable "cluster_version" {
   description = "The Kubernetes version for the EKS cluster"
   type        = string
 }
+
+variable "region" {
+  description = "Region of EKS cluster"
+  type        = string
+}

From 98fa130a96956f3de032a8a6d843741d7072c1fb Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 03:21:06 +0530
Subject: [PATCH 27/44] test karpenter in aws gpu test CI

---
 .github/workflows/aws-gpu-test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 577b5edb..5a70c25f 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -350,7 +350,7 @@ jobs:
           if [ "$HEALTH" == "OK" ]; then
             echo "Karpenter is healthy ✅"
           else
-            echo "Karpenter health check failed ❌"
+            echo "Karpenter health check failed ❌ "
             kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter
             kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=50
             exit 1

From 6cbc66bc57dbb3fa56a9aa8608cddf471e6b1c16 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 03:27:18 +0530
Subject: [PATCH 28/44] test karpenter in aws gpu test CI

---
 .github/workflows/aws-gpu-test.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 5a70c25f..9bbd872c 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -98,7 +98,7 @@ jobs:
       - name: Set up Terraform
         uses: hashicorp/setup-terraform@v3
         with:
-          terraform_version: 1.5.7
+          terraform_version: 1.11.3
 
       - name: Apply Terraform
         working-directory: terraform/aws
@@ -380,7 +380,7 @@ jobs:
       - name: Set up Terraform
         uses: hashicorp/setup-terraform@v3
         with:
-          terraform_version: 1.5.7
+          terraform_version: 1.11.3
 
       - name: Destroy Infrastructure
         working-directory: terraform/aws

From 2b195c8fd3bf7ec70212d1968de3009c15aa96bd Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 03:34:01 +0530
Subject: [PATCH 29/44] test karpenter in aws gpu test CI

---
 .github/workflows/aws-gpu-test.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 9bbd872c..cd279711 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -97,8 +97,6 @@ jobs:
 
       - name: Set up Terraform
         uses: hashicorp/setup-terraform@v3
-        with:
-          terraform_version: 1.11.3
 
       - name: Apply Terraform
         working-directory: terraform/aws
@@ -379,8 +377,6 @@ jobs:
 
       - name: Set up Terraform
         uses: hashicorp/setup-terraform@v3
-        with:
-          terraform_version: 1.11.3
 
       - name: Destroy Infrastructure
         working-directory: terraform/aws

From c6fc269955874b8da256d9fc1fad7558aaf2b3d1 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 04:02:40 +0530
Subject: [PATCH 30/44] test karpenter in aws gpu test CI

---
 terraform/aws/main.tf    | 10 ++--------
 terraform/aws/outputs.tf | 13 -------------
 2 files changed, 2 insertions(+), 21 deletions(-)
 delete mode 100644 terraform/aws/outputs.tf

diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf
index 248e45b3..9d62a609 100644
--- a/terraform/aws/main.tf
+++ b/terraform/aws/main.tf
@@ -30,7 +30,7 @@ module "vpc" {
   private_subnet_tags = {
     "kubernetes.io/cluster/${var.cluster_name}" = "shared"
     "kubernetes.io/role/internal-elb"           = "1"
-    "karpenter.sh/discovery"                    = "${var.cluster_name}"  # Added Karpenter discovery tag
+    "karpenter.sh/discovery"                    = "${var.cluster_name}" 
   }
 }
 
@@ -72,7 +72,6 @@ resource "aws_iam_role_policy_attachment" "karpenter_node_admin_policy_attachmen
   policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
 }
 
-# IAM Role for Karpenter Controller (with OIDC Trust Relationship)
 resource "aws_iam_role" "karpenter_controller_role" {
   name = "KarpenterControllerRole-${var.cluster_name}"
 
@@ -252,7 +251,6 @@ module "eks" {
   cluster_endpoint_public_access = true
   cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"]
 
-  # Disable automatic node IAM role creation
   create_node_iam_role = false
 
   eks_managed_node_groups = {
@@ -283,7 +281,6 @@ module "eks" {
   }
 }
 
-# Security Group Tagging for Karpenter
 resource "aws_security_group" "karpenter_sg" {
   name        = "karpenter-sg-${var.cluster_name}"
   description = "Karpenter security group"
@@ -302,10 +299,8 @@ resource "aws_security_group_rule" "karpenter_inbound" {
   cidr_blocks       = ["0.0.0.0/0"]
 }
 
-
-// Replace the existing aws_sqs_queue resource
 resource "aws_sqs_queue" "karpenter_interruption_queue" {
-  name = "${var.cluster_name}-karpenter-interruption"  // Changed name to be more specific
+  name = "${var.cluster_name}-karpenter-interruption"
   sqs_managed_sse_enabled = true
 
   tags = {
@@ -313,7 +308,6 @@ resource "aws_sqs_queue" "karpenter_interruption_queue" {
   }
 }
 
-// Update the SQS queue policy
 resource "aws_sqs_queue_policy" "karpenter_interruption_queue_policy" {
   queue_url = aws_sqs_queue.karpenter_interruption_queue.url
 
diff --git a/terraform/aws/outputs.tf b/terraform/aws/outputs.tf
deleted file mode 100644
index 87ec9d95..00000000
--- a/terraform/aws/outputs.tf
+++ /dev/null
@@ -1,13 +0,0 @@
-data "aws_eks_cluster" "eks_cluster" {
-  name = var.cluster_name
-}
-
-output "oidc_provider_url" {
-  value = module.eks.cluster_oidc_issuer_url
-}
-
-# Output the cluster endpoint
-output "cluster_endpoint" {
-  value = data.aws_eks_cluster.eks_cluster.endpoint
-  description = "The endpoint of the EKS cluster"
-}
\ No newline at end of file

From 270e04dc7a40e1037a9fdd372672be682c3c6a15 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 04:42:07 +0530
Subject: [PATCH 31/44] test karpenter in aws gpu test CI

---
 .github/workflows/aws-gpu-test.yaml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index cd279711..4558675d 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -334,7 +334,9 @@ jobs:
           HEALTH=""
           
           for i in $(seq 1 $MAX_RETRIES); do
-            if curl -s http://localhost:8000/metrics | grep -q "controller_runtime_max_concurrent_reconciles"; then
+            response=$(curl -s http://localhost:8000/metrics)
+            echo "Response: $response"
+            if [[ -n "$response" ]]; then
               HEALTH="OK"
               break
             fi
@@ -348,12 +350,13 @@ jobs:
           if [ "$HEALTH" == "OK" ]; then
             echo "Karpenter is healthy ✅"
           else
-            echo "Karpenter health check failed ❌ "
+            echo "Karpenter health check failed ❌"
             kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter
             kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=50
             exit 1
           fi
 
+
   destroy-terraform:
     name: Destroy Terraform
     runs-on: ubuntu-latest

From f3844bb1a3650efdf451fc9ab891c78a34d5dbd4 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 05:07:22 +0530
Subject: [PATCH 32/44] test karpenter in aws gpu test CI

---
 .github/workflows/aws-gpu-test.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 4558675d..95ca4f89 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -327,7 +327,6 @@ jobs:
           kubectl port-forward -n kube-system service/karpenter 8000:8000 > /dev/null 2>&1 &
           PF_PID=$!
           
-          # Allow port-forward to establish
           sleep 5
           
           MAX_RETRIES=6

From 5a193159308ae36c97a4502ac8bbb5c64a11dfa9 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 18:22:25 +0530
Subject: [PATCH 33/44] alternate ci for karpenter with cloudformation

---
 .github/workflows/aws-gpu-test-2 | 515 +++++++++++++++++++++++++++++++
 1 file changed, 515 insertions(+)
 create mode 100644 .github/workflows/aws-gpu-test-2

diff --git a/.github/workflows/aws-gpu-test-2 b/.github/workflows/aws-gpu-test-2
new file mode 100644
index 00000000..11cc64cb
--- /dev/null
+++ b/.github/workflows/aws-gpu-test-2
@@ -0,0 +1,515 @@
+name: AWS GPU Test
+
+on:
+  push:
+    branches:
+      - garvit/aws-gpu-test
+  workflow_dispatch:
+    inputs:
+      gpu_install_type:
+        description: 'GPU installation type'
+        required: false
+        default: 'nvidia-device-plugin'
+        type: choice
+        options:
+          - gpu-operator
+          - nvidia-device-plugin
+      dcgm_install_type:
+        description: 'DCGM install type'
+        required: false
+        default: 'devzero-dcgm'
+        type: choice
+        options:
+          - nvidia-dcgm
+          - devzero-dcgm
+      cluster_version:
+        description: 'Kubernetes cluster version'
+        required: false
+        default: '1.30'
+        type: choice
+        options:
+          - '1.26'
+          - '1.27'
+          - '1.28'
+          - '1.29'
+          - '1.30'
+          - '1.31'
+          - '1.32'
+          - '1.33'
+      karpenter_version:
+        description: 'Karpenter Version'
+        required: false
+        default: '0.37.7'
+        type: choice
+        options:
+          - 'no_karpenter'
+          - '0.37.7'
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  apply-terraform:
+    name: Apply Terraform
+    runs-on: ubuntu-latest
+    env:
+      GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
+      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
+      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}
+
+    outputs:
+      job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}
+
+    steps:
+      - name: Validate Inputs
+        run: |
+          echo "GPU_INSTALL_TYPE=${GPU_INSTALL_TYPE}"
+          echo "DCGM_INSTALL_TYPE=${DCGM_INSTALL_TYPE}"
+
+          if [[ "$GPU_INSTALL_TYPE" == "nvidia-device-plugin" && "$DCGM_INSTALL_TYPE" != "devzero-dcgm" ]]; then
+            echo "Error: When GPU_INSTALL_TYPE is 'nvidia-device-plugin', DCGM_INSTALL_TYPE must be 'devzero-dcgm'."
+            exit 1
+          fi
+
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Configure AWS Credential
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
+          aws-region: us-east-1
+
+      - name: Generate Unique Job Identifier
+        id: job-identifier
+        shell: bash
+        run: |
+          SHORT_SHA=$(git rev-parse --short HEAD)
+          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
+            SUFFIX="dd"
+          else
+            SUFFIX="nd"
+          fi
+          JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}"
+          echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV
+          echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT
+
+      - name: Set up Terraform
+        uses: hashicorp/setup-terraform@v3
+
+      - name: Apply Terraform
+        working-directory: terraform/aws
+        run: |
+          export KARPENTER_NAMESPACE="kube-system"
+          export KARPENTER_VERSION="0.37.7"
+          export K8S_VERSION="1.30"
+          export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov
+          export CLUSTER_NAME="${JOB_IDENTIFIER}" # JOB_IDENTIFIER comes from GITHUB_ENV (job-identifier step); an "env." prefix is not valid inside a bash expansion
+          export AWS_DEFAULT_REGION="us-east-1"
+          export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
+          export TEMPOUT="$(mktemp)"
+          export ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')"
+          printf '%s\n' "KARPENTER_NAMESPACE=${KARPENTER_NAMESPACE}" "KARPENTER_VERSION=${KARPENTER_VERSION}" "K8S_VERSION=${K8S_VERSION}" "CLUSTER_NAME=${CLUSTER_NAME}" "AWS_PARTITION=${AWS_PARTITION}" "AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION}" "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" "ALIAS_VERSION=${ALIAS_VERSION}" | tee -a "${GITHUB_ENV}" # log the values and persist them: exports do not survive into later steps
+
+          curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml  > "${TEMPOUT}" \
+          && aws cloudformation deploy \
+            --stack-name "Karpenter-${CLUSTER_NAME}" \
+            --template-file "${TEMPOUT}" \
+            --capabilities CAPABILITY_NAMED_IAM \
+            --parameter-overrides "ClusterName=${CLUSTER_NAME}"
+
+          eksctl create cluster -f - <<EOF
+          ---
+          apiVersion: eksctl.io/v1alpha5
+          kind: ClusterConfig
+          metadata:
+            name: ${CLUSTER_NAME}
+            region: ${AWS_DEFAULT_REGION}
+            version: "${K8S_VERSION}"
+            tags:
+              karpenter.sh/discovery: ${CLUSTER_NAME}
+
+          iam:
+            withOIDC: true
+            podIdentityAssociations:
+            - namespace: "${KARPENTER_NAMESPACE}"
+              serviceAccountName: karpenter
+              roleName: ${CLUSTER_NAME}-karpenter
+              permissionPolicyARNs:
+              - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}
+
+          iamIdentityMappings:
+          - arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
+            username: system:node:{{EC2PrivateDNSName}}
+            groups:
+            - system:bootstrappers
+            - system:nodes
+            ## If you intend to run Windows workloads, the kube-proxy group should be specified.
+            # For more information, see https://github.com/aws/karpenter/issues/5099.
+            # - eks:kube-proxy-windows
+
+          managedNodeGroups:
+          - instanceType: g6.4xlarge
+            amiFamily: AmazonLinux2023
+            name: ${CLUSTER_NAME}-ng
+            desiredCapacity: 2
+            minSize: 1
+            maxSize: 10
+
+          addons:
+          - name: eks-pod-identity-agent
+          EOF
+
+          export CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --query "cluster.endpoint" --output text)"
+          export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter"
+
+          echo "${CLUSTER_ENDPOINT} ${KARPENTER_IAM_ROLE_ARN}"
+
+      - name: Configure Karpenter
+        run: |
+          echo "Configuring Karpenter..."
+          # Logout of helm registry to perform an unauthenticated pull against the public ECR
+          helm registry logout public.ecr.aws || true
+
+          helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version "${KARPENTER_VERSION}" --namespace "${KARPENTER_NAMESPACE}" --create-namespace \
+            --set "settings.clusterName=${CLUSTER_NAME}" \
+            --set "settings.interruptionQueue=${CLUSTER_NAME}" \
+            --set controller.resources.requests.cpu=1 \
+            --set controller.resources.requests.memory=1Gi \
+            --set controller.resources.limits.cpu=1 \
+            --set controller.resources.limits.memory=1Gi \
+            --wait
+
+      - name: Configure Karpenter Node Class
+        run: |
+          cat <<EOF | envsubst | kubectl apply -f -
+          apiVersion: karpenter.sh/v1
+          kind: NodePool
+          metadata:
+            name: default
+          spec:
+            template:
+              spec:
+                requirements:
+                  - key: kubernetes.io/arch
+                    operator: In
+                    values: ["amd64"]
+                  - key: kubernetes.io/os
+                    operator: In
+                    values: ["linux"]
+                  - key: karpenter.sh/capacity-type
+                    operator: In
+                    values: ["on-demand"]
+                  - key: karpenter.k8s.aws/instance-category
+                    operator: In
+                    values: ["c", "m", "r"]
+                  - key: karpenter.k8s.aws/instance-generation
+                    operator: Gt
+                    values: ["2"]
+                nodeClassRef:
+                  group: karpenter.k8s.aws
+                  kind: EC2NodeClass
+                  name: default
+                expireAfter: 720h # 30 * 24h = 720h
+            limits:
+              cpu: 1000
+            disruption:
+              consolidationPolicy: WhenEmptyOrUnderutilized
+              consolidateAfter: 1m
+          ---
+          apiVersion: karpenter.k8s.aws/v1
+          kind: EC2NodeClass
+          metadata:
+            name: default
+          spec:
+            role: "KarpenterNodeRole-${CLUSTER_NAME}" # replace with your cluster name
+            amiSelectorTerms:
+              - alias: "al2023@${ALIAS_VERSION}"
+            subnetSelectorTerms:
+              - tags:
+                  karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
+            securityGroupSelectorTerms:
+              - tags:
+                  karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
+          EOF
+
+          cat <<EOF | kubectl apply -f -
+          apiVersion: apps/v1
+          kind: Deployment
+          metadata:
+            name: inflate
+          spec:
+            replicas: 0
+            selector:
+              matchLabels:
+                app: inflate
+            template:
+              metadata:
+                labels:
+                  app: inflate
+              spec:
+                terminationGracePeriodSeconds: 0
+                securityContext:
+                  runAsUser: 1000
+                  runAsGroup: 3000
+                  fsGroup: 2000
+                containers:
+                - name: inflate
+                  image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
+                  resources:
+                    requests:
+                      cpu: 1
+                  securityContext:
+                    allowPrivilegeEscalation: false
+          EOF
+
+  install-and-validate:
+    name: Install and Validate GPU Resources and ZXPorter
+    runs-on: ubuntu-latest
+    needs: apply-terraform
+    env:
+      GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
+      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
+      Karpenter_VERSION: ${{ github.event.inputs.karpenter_version || '0.37.7' }}
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
+          aws-region: us-east-1
+
+      - name: Configure Kubernetes Access
+        run: |
+          aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }}
+
+      # - name: Install Karpenter (if needed)
+      #   if: env.Karpenter_VERSION != 'no_karpenter'
+      #   run: |
+      #     echo "Installing Karpenter..."
+      #     AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
+      #     CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "devzero-gpu-cluster" --query "cluster.endpoint" --output text)"
+      #     KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-devzero-gpu-cluster"
+      #     helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
+      #       --version "0.37.7" \
+      #       --namespace kube-system \
+      #       --create-namespace \
+      #       --set settings.clusterName="devzero-gpu-cluster" \
+      #       --set settings.aws.clusterName="devzero-gpu-cluster" \
+      #       --set settings.aws.clusterEndpoint="${CLUSTER_ENDPOINT}" \
+      #       --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-devzero-gpu-cluster" \
+      #       --set settings.aws.interruptionQueueName="devzero-gpu-cluster-karpenter-interruption" \
+      #       --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \
+      #       --set controller.resources.requests.cpu="1" \
+      #       --set controller.resources.requests.memory="1Gi" \
+      #       --set controller.resources.limits.cpu="1" \
+      #       --set controller.resources.limits.memory="1Gi" \
+      #       --wait
+
+      - name: Check GPU Availability
+        id: gpu_check
+        run: |
+          echo "Checking GPU resources on nodes..."
+          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
+            echo "GPU resources are available on the nodes."
+            echo "GPU_CHECK=true" >> $GITHUB_ENV
+          else
+            echo "GPU check failed"
+            echo "GPU_CHECK=false" >> $GITHUB_ENV
+          fi
+
+      - name: Install GPU Operator (if needed)
+        if: env.GPU_CHECK == 'false' && env.GPU_INSTALL_TYPE == 'gpu-operator'
+        run: |
+          echo "GPU resources not found, installing GPU Operator..."
+          kubectl create ns gpu-operator
+          kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite
+          kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true
+          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \
+          helm repo update
+          INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0"
+          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
+            INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false"
+          fi
+          echo "Running: $INSTALL_CMD"
+          $INSTALL_CMD
+
+      - name: Install Nvidia Device Plugin
+        if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false'
+        run: |
+          echo "Installing Nvidia Device Plugin..."
+          kubectl get nodes -l node_type=gpu -o jsonpath='{.items[*].metadata.name}' | xargs -I {} kubectl label node {} nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite
+          kubectl create ns nvidia-device-plugin
+          kubectl apply -f nvidia-device-plugin-prereq
+          helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
+          helm repo update
+          helm upgrade -i nvdp nvdp/nvidia-device-plugin \
+            --namespace nvidia-device-plugin \
+            --version 0.17.1
+
+      - name: Check GPU Availability After Installing GPU Operator
+        if: env.GPU_CHECK == 'false'
+        run: |
+          echo "Re-checking GPU resources on nodes after GPU Operator installation..."
+          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
+            echo "GPU resources are available on the nodes."
+          else
+            echo "GPU check failed after GPU Operator installation"
+            exit 1
+          fi
+
+      - name: Check Nvidia DCGM DaemonSet
+        id: dcgm_check
+        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
+        run: |
+          echo "Checking if DCGM DaemonSet is installed..."
+          if kubectl get daemonset -A | grep -q dcgm; then
+            echo "Nvidia DCGM found, proceeding with validation."
+          else
+            echo "Nvidia DCGM not found."
+            exit 1
+          fi
+
+      - name: Install DevZero DCGM
+        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
+        run: |
+          echo "Installing DCGM Exporter..."
+          kubectl create ns devzero-zxporter
+          curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f -
+
+      - name: Check DCGM DaemonSet After Installing DCGM Exporter
+        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
+        run: |
+          echo "Re-checking DCGM pods after DCGM Exporter installation..."
+          if kubectl get daemonset -A | grep -q dcgm; then
+            echo "DCGM DaemonSet is running."
+          else
+            echo "DCGM DaemonSet not running after installation"
+            exit 1
+          fi
+          
+      - name: Verify DCGM Pods and Prometheus Annotations
+        run: |
+          NAMESPACE="devzero-zxporter"
+          if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then
+            NAMESPACE="gpu-operator"
+          fi
+          kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s
+          echo "Verifying DCGM pods and Prometheus annotations..."
+          kubectl get pods -A | grep dcgm-exporter | awk '
+          BEGIN { all_running = 1; pod_count = 0 }
+          {
+              pod_count++
+              status = $4
+              printf "Pod: %s/%s - Status: %s\n", $1, $2, status
+              if (status != "Running") all_running = 0
+          }
+          END {
+              printf "\nTotal Pods: %d\n", pod_count
+              printf "All Running: %s\n", (all_running ? "true" : "false")
+          }'
+          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done
+
+      - name: Install and Verify DeepSeek Workload
+        run: |
+          kubectl create ns deepseek
+          kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml    
+          
+          kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s
+          pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}')
+          
+          if [[ -n "$pod_status" ]]; then
+            echo "Pods are not in Running state. Failing the pipeline."
+            exit 1
+          else
+            echo "All pods are running successfully."
+          fi
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.22'
+          cache: true
+
+      - name: Install ZXPorter
+        run: |
+          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
+          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
+          make docker-build docker-push IMG=${ZXPORTER_IMG}
+          make deploy IMG=${ZXPORTER_IMG}
+          
+          echo "Waiting for ZXPorter pods to be ready..."
+          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
+
+      - name: Test ZXPorter with Prometheus
+        run: |
+          kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter > pf.log 2>&1 &
+          PF_PID=$!
+          sleep 5
+          MAX_RETRIES=6
+          for i in $(seq 1 $MAX_RETRIES); do
+            if curl -s "http://localhost:9090/-/ready" >/dev/null; then
+              echo "Prometheus port-forward is ready."
+              break
+            fi
+            echo "[$i/$MAX_RETRIES] Waiting for Prometheus to become ready..."
+            sleep 5
+          done
+
+          result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
+          kill $PF_PID || true
+
+          echo "Metric found: $result"
+          if [[ -z "$result" || "$result" == [] ]]; then
+            echo "❌ DCGM_FI_DEV_SM_CLOCK metric not found!"
+            echo "Port-forward log:"
+            cat pf.log
+            exit 1
+          fi
+
+      - name: Test Karpenter
+        if: inputs.karpenter_version != 'no_karpenter'
+        run: |
+          kubectl scale deployment inflate --replicas 10
+          kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter -c controller # literal namespace: KARPENTER_NAMESPACE is never defined in this job
+          kubectl get nodes -o wide
+          kubectl delete deployment inflate
+
+
+  destroy-terraform:
+    name: Destroy Terraform
+    runs-on: ubuntu-latest
+    env:
+      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}
+
+    if: always()
+    needs:
+      - apply-terraform
+      - install-and-validate
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
+          aws-region: us-east-1
+
+      - name: Set up Terraform
+        uses: hashicorp/setup-terraform@v3
+
+      - name: Destroy Infrastructure
+        working-directory: terraform/aws
+        run: |
+          helm uninstall karpenter --namespace kube-system || true
+          aws cloudformation delete-stack --stack-name "Karpenter-${{needs.apply-terraform.outputs.job_identifier}}" || true
+          aws ec2 describe-launch-templates --filters "Name=tag:karpenter.k8s.aws/cluster,Values=${{needs.apply-terraform.outputs.job_identifier}}" |
+              jq -r ".LaunchTemplates[].LaunchTemplateName" |
+              xargs -I{} aws ec2 delete-launch-template --launch-template-name {}
+          eksctl delete cluster --name "${{needs.apply-terraform.outputs.job_identifier}}"

From 66da291384dfe15d446c27d89e2922d5dd0bf4aa Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 18:23:56 +0530
Subject: [PATCH 34/44] alternate ci for karpenter with cloudformation

---
 .github/workflows/aws-gpu-test-2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/aws-gpu-test-2 b/.github/workflows/aws-gpu-test-2
index 11cc64cb..99c5f983 100644
--- a/.github/workflows/aws-gpu-test-2
+++ b/.github/workflows/aws-gpu-test-2
@@ -1,4 +1,4 @@
-name: AWS GPU Test
+name: AWS GPU Test (CloudFormation)
 
 on:
   push:

From d790f92a110ea2a377c168af1e74fea5a47354c4 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 18:26:17 +0530
Subject: [PATCH 35/44] alternate ci for karpenter with cloudformation

---
 .github/workflows/aws-gpu-test.yaml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 95ca4f89..14162d31 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -1,9 +1,9 @@
 name: AWS GPU Test
 
 on:
-  push:
-    branches:
-      - garvit/aws-gpu-test
+  # push:
+  #   branches:
+  #     - garvit/aws-gpu-test
   workflow_dispatch:
     inputs:
       gpu_install_type:
@@ -141,17 +141,17 @@ jobs:
         run: |
           echo "Installing Karpenter..."
           AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
-          CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${{ needs.apply-terraform.outputs.job_identifier }}" --query "cluster.endpoint" --output text)"
-          KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}"
+          CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "devzero-gpu-cluster" --query "cluster.endpoint" --output text)"
+          KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-devzero-gpu-cluster"
           helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
-            --version "${{ env.KARPENTER_VERSION}}" \
+            --version "0.37.7" \
             --namespace kube-system \
             --create-namespace \
-            --set settings.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \
-            --set settings.aws.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \
+            --set settings.clusterName="devzero-gpu-cluster" \
+            --set settings.aws.clusterName="devzero-gpu-cluster" \
             --set settings.aws.clusterEndpoint="${CLUSTER_ENDPOINT}" \
-            --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}" \
-            --set settings.aws.interruptionQueueName="${{ needs.apply-terraform.outputs.job_identifier }}-karpenter-interruption" \
+            --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-devzero-gpu-cluster" \
+            --set settings.aws.interruptionQueueName="devzero-gpu-cluster-karpenter-interruption" \
             --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \
             --set controller.resources.requests.cpu="1" \
             --set controller.resources.requests.memory="1Gi" \

From 6d70a3405e9fdcab79e46f479179f10fbba2e82b Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 19:31:43 +0530
Subject: [PATCH 36/44] karpenter in aws gpu test ci

---
 .github/workflows/aws-gpu-test-2    | 515 ----------------------------
 .github/workflows/aws-gpu-test.yaml | 150 +++++---
 2 files changed, 110 insertions(+), 555 deletions(-)
 delete mode 100644 .github/workflows/aws-gpu-test-2

diff --git a/.github/workflows/aws-gpu-test-2 b/.github/workflows/aws-gpu-test-2
deleted file mode 100644
index 99c5f983..00000000
--- a/.github/workflows/aws-gpu-test-2
+++ /dev/null
@@ -1,515 +0,0 @@
-name: AWS GPU Test (CloudFormation)
-
-on:
-  push:
-    branches:
-      - garvit/aws-gpu-test
-  workflow_dispatch:
-    inputs:
-      gpu_install_type:
-        description: 'GPU installation type'
-        required: false
-        default: 'nvidia-device-plugin'
-        type: choice
-        options:
-          - gpu-operator
-          - nvidia-device-plugin
-      dcgm_install_type:
-        description: 'DCGM install type'
-        required: false
-        default: 'devzero-dcgm'
-        type: choice
-        options:
-          - nvidia-dcgm
-          - devzero-dcgm
-      cluster_version:
-        description: 'Kubernetes cluster version'
-        required: false
-        default: '1.30'
-        type: choice
-        options:
-          - '1.26'
-          - '1.27'
-          - '1.28'
-          - '1.29'
-          - '1.30'
-          - '1.31'
-          - '1.32'
-          - '1.33'
-      karpenter_version:
-        description: 'Karpenter Version'
-        required: false
-        default: '0.37.7'
-        type: choice
-        options:
-          - 'no_karpenter'
-          - '0.37.7'
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  apply-terraform:
-    name: Apply Terraform
-    runs-on: ubuntu-latest
-    env:
-      GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
-      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
-      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}
-
-    outputs:
-      job_identifier: ${{ steps.job-identifier.outputs.job_identifier }}
-
-    steps:
-      - name: Validate Inputs
-        run: |
-          echo "GPU_INSTALL_TYPE=${GPU_INSTALL_TYPE}"
-          echo "DCGM_INSTALL_TYPE=${DCGM_INSTALL_TYPE}"
-
-          if [[ "$GPU_INSTALL_TYPE" == "nvidia-device-plugin" && "$DCGM_INSTALL_TYPE" != "devzero-dcgm" ]]; then
-            echo "Error: When GPU_INSTALL_TYPE is 'nvidia-device-plugin', DCGM_INSTALL_TYPE must be 'devzero-dcgm'."
-            exit 1
-          fi
-
-      - name: Checkout Repository
-        uses: actions/checkout@v4
-
-      - name: Configure AWS Credential
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
-          aws-region: us-east-1
-
-      - name: Generate Unique Job Identifier
-        id: job-identifier
-        shell: bash
-        run: |
-          SHORT_SHA=$(git rev-parse --short HEAD)
-          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
-            SUFFIX="dd"
-          else
-            SUFFIX="nd"
-          fi
-          JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}"
-          echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV
-          echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT
-
-      - name: Set up Terraform
-        uses: hashicorp/setup-terraform@v3
-
-      - name: Apply Terraform
-        working-directory: terraform/aws
-        run: |
-          export KARPENTER_NAMESPACE="kube-system"
-          export KARPENTER_VERSION="0.37.7"
-          export K8S_VERSION="1.30"
-          export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov
-          export CLUSTER_NAME="${env.JOB_IDENTIFIER}"
-          export AWS_DEFAULT_REGION="us-east-1"
-          export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
-          export TEMPOUT="$(mktemp)"
-          export ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')"
-          echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${K8S_VERSION}" "${CLUSTER_NAME}" "${AWS_DEFAULT_REGION}" "${AWS_ACCOUNT_ID}" "${TEMPOUT}" "${ALIAS_VERSION}"
-
-          curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml  > "${TEMPOUT}" \
-          && aws cloudformation deploy \
-            --stack-name "Karpenter-${CLUSTER_NAME}" \
-            --template-file "${TEMPOUT}" \
-            --capabilities CAPABILITY_NAMED_IAM \
-            --parameter-overrides "ClusterName=${CLUSTER_NAME}"
-
-          eksctl create cluster -f - <<EOF
-          ---
-          apiVersion: eksctl.io/v1alpha5
-          kind: ClusterConfig
-          metadata:
-            name: ${CLUSTER_NAME}
-            region: ${AWS_DEFAULT_REGION}
-            version: "${K8S_VERSION}"
-            tags:
-              karpenter.sh/discovery: ${CLUSTER_NAME}
-
-          iam:
-            withOIDC: true
-            podIdentityAssociations:
-            - namespace: "${KARPENTER_NAMESPACE}"
-              serviceAccountName: karpenter
-              roleName: ${CLUSTER_NAME}-karpenter
-              permissionPolicyARNs:
-              - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}
-
-          iamIdentityMappings:
-          - arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
-            username: system:node:{{EC2PrivateDNSName}}
-            groups:
-            - system:bootstrappers
-            - system:nodes
-            ## If you intend to run Windows workloads, the kube-proxy group should be specified.
-            # For more information, see https://github.com/aws/karpenter/issues/5099.
-            # - eks:kube-proxy-windows
-
-          managedNodeGroups:
-          - instanceType: g6.4xlarge
-            amiFamily: AmazonLinux2023
-            name: ${CLUSTER_NAME}-ng
-            desiredCapacity: 2
-            minSize: 1
-            maxSize: 10
-
-          addons:
-          - name: eks-pod-identity-agent
-          EOF
-
-          export CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --query "cluster.endpoint" --output text)"
-          export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter"
-
-          echo "${CLUSTER_ENDPOINT} ${KARPENTER_IAM_ROLE_ARN}"
-
-      - name: Configure Karpenter
-        run: |
-          echo "Configuring Karpenter..."
-          # Logout of helm registry to perform an unauthenticated pull against the public ECR
-          helm registry logout public.ecr.aws || true
-
-          helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version "${KARPENTER_VERSION}" --namespace "${KARPENTER_NAMESPACE}" --create-namespace \
-            --set "settings.clusterName=${CLUSTER_NAME}" \
-            --set "settings.interruptionQueue=${CLUSTER_NAME}" \
-            --set controller.resources.requests.cpu=1 \
-            --set controller.resources.requests.memory=1Gi \
-            --set controller.resources.limits.cpu=1 \
-            --set controller.resources.limits.memory=1Gi \
-            --wait
-
-      - name: Configure Karpenter Node Class
-        run: |
-          cat <<EOF | envsubst | kubectl apply -f -
-          apiVersion: karpenter.sh/v1
-          kind: NodePool
-          metadata:
-            name: default
-          spec:
-            template:
-              spec:
-                requirements:
-                  - key: kubernetes.io/arch
-                    operator: In
-                    values: ["amd64"]
-                  - key: kubernetes.io/os
-                    operator: In
-                    values: ["linux"]
-                  - key: karpenter.sh/capacity-type
-                    operator: In
-                    values: ["on-demand"]
-                  - key: karpenter.k8s.aws/instance-category
-                    operator: In
-                    values: ["c", "m", "r"]
-                  - key: karpenter.k8s.aws/instance-generation
-                    operator: Gt
-                    values: ["2"]
-                nodeClassRef:
-                  group: karpenter.k8s.aws
-                  kind: EC2NodeClass
-                  name: default
-                expireAfter: 720h # 30 * 24h = 720h
-            limits:
-              cpu: 1000
-            disruption:
-              consolidationPolicy: WhenEmptyOrUnderutilized
-              consolidateAfter: 1m
-          ---
-          apiVersion: karpenter.k8s.aws/v1
-          kind: EC2NodeClass
-          metadata:
-            name: default
-          spec:
-            role: "KarpenterNodeRole-${CLUSTER_NAME}" # replace with your cluster name
-            amiSelectorTerms:
-              - alias: "al2023@${ALIAS_VERSION}"
-            subnetSelectorTerms:
-              - tags:
-                  karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
-            securityGroupSelectorTerms:
-              - tags:
-                  karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
-          EOF
-
-          cat <<EOF | kubectl apply -f -
-          apiVersion: apps/v1
-          kind: Deployment
-          metadata:
-            name: inflate
-          spec:
-            replicas: 0
-            selector:
-              matchLabels:
-                app: inflate
-            template:
-              metadata:
-                labels:
-                  app: inflate
-              spec:
-                terminationGracePeriodSeconds: 0
-                securityContext:
-                  runAsUser: 1000
-                  runAsGroup: 3000
-                  fsGroup: 2000
-                containers:
-                - name: inflate
-                  image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
-                  resources:
-                    requests:
-                      cpu: 1
-                  securityContext:
-                    allowPrivilegeEscalation: false
-          EOF
-
-  install-and-validate:
-    name: Install and Validate GPU Resources and ZXPorter
-    runs-on: ubuntu-latest
-    needs: apply-terraform
-    env:
-      GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
-      DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
-      Karpenter_VERSION: ${{ github.event.inputs.karpenter_version || '0.37.7' }}
-
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@v4
-
-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
-          aws-region: us-east-1
-
-      - name: Configure Kubernetes Access
-        run: |
-          aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }}
-
-      # - name: Install Karpenter (if needed)
-      #   if: env.Karpenter_VERSION != 'no_karpenter'
-      #   run: |
-      #     echo "Installing Karpenter..."
-      #     AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
-      #     CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "devzero-gpu-cluster" --query "cluster.endpoint" --output text)"
-      #     KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-devzero-gpu-cluster"
-      #     helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
-      #       --version "0.37.7" \
-      #       --namespace kube-system \
-      #       --create-namespace \
-      #       --set settings.clusterName="devzero-gpu-cluster" \
-      #       --set settings.aws.clusterName="devzero-gpu-cluster" \
-      #       --set settings.aws.clusterEndpoint="${CLUSTER_ENDPOINT}" \
-      #       --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-devzero-gpu-cluster" \
-      #       --set settings.aws.interruptionQueueName="devzero-gpu-cluster-karpenter-interruption" \
-      #       --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \
-      #       --set controller.resources.requests.cpu="1" \
-      #       --set controller.resources.requests.memory="1Gi" \
-      #       --set controller.resources.limits.cpu="1" \
-      #       --set controller.resources.limits.memory="1Gi" \
-      #       --wait
-
-      - name: Check GPU Availability
-        id: gpu_check
-        run: |
-          echo "Checking GPU resources on nodes..."
-          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
-            echo "GPU resources are available on the nodes."
-            echo "GPU_CHECK=true" >> $GITHUB_ENV
-          else
-            echo "GPU check failed"
-            echo "GPU_CHECK=false" >> $GITHUB_ENV
-          fi
-
-      - name: Install GPU Operator (if needed)
-        if: env.GPU_CHECK == 'false' && env.GPU_INSTALL_TYPE == 'gpu-operator'
-        run: |
-          echo "GPU resources not found, installing GPU Operator..."
-          kubectl create ns gpu-operator
-          kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite
-          kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true
-          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \
-          helm repo update
-          INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0"
-          if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then
-            INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false"
-          fi
-          echo "Running: $INSTALL_CMD"
-          $INSTALL_CMD
-
-      - name: Install Nvidia Device Plugin
-        if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false'
-        run: |
-          echo "Installing Nvidia Device Plugin..."
-          kubectl get nodes -l node_type=gpu -o jsonpath='{.items[*].metadata.name}' | xargs -I {} kubectl label node {} nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite
-          kubectl create ns nvidia-device-plugin
-          kubectl apply -f nvidia-device-plugin-prereq
-          helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
-          helm repo update
-          helm upgrade -i nvdp nvdp/nvidia-device-plugin \
-            --namespace nvidia-device-plugin \
-            --version 0.17.1
-
-      - name: Check GPU Availability After Installing GPU Operator
-        if: env.GPU_CHECK == 'false'
-        run: |
-          echo "Re-checking GPU resources on nodes after GPU Operator installation..."
-          if kubectl describe nodes | grep -q "nvidia.com/gpu"; then
-            echo "GPU resources are available on the nodes."
-          else
-            echo "GPU check failed after GPU Operator installation"
-            exit 1
-          fi
-
-      - name: Check Nvidia DCGM DaemonSet
-        id: dcgm_check
-        if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }}
-        run: |
-          echo "Checking if DCGM DaemonSet is installed..."
-          if kubectl get daemonset -A | grep -q dcgm; then
-            echo "Nvidia DCGM found, proceeding with validation."
-          else
-            echo "Nvidia DCGM not found."
-            exit 1
-          fi
-
-      - name: Install DevZero DCGM
-        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
-        run: |
-          echo "Installing DCGM Exporter..."
-          kubectl create ns devzero-zxporter
-          curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f -
-
-      - name: Check DCGM DaemonSet After Installing DCGM Exporter
-        if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }}
-        run: |
-          echo "Re-checking DCGM pods after DCGM Exporter installation..."
-          if kubectl get daemonset -A | grep -q dcgm; then
-            echo "DCGM DaemonSet is running."
-          else
-            echo "DCGM DaemonSet not running after installation"
-            exit 1
-          fi
-          
-      - name: Verify DCGM Pods and Prometheus Annotations
-        run: |
-          NAMESPACE="devzero-zxporter"
-          if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then
-            NAMESPACE="gpu-operator"
-          fi
-          kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s
-          echo "Verifying DCGM pods and Prometheus annotations..."
-          kubectl get pods -A | grep dcgm-exporter | awk '
-          BEGIN { all_running = 1; pod_count = 0 }
-          {
-              pod_count++
-              status = $4
-              printf "Pod: %s/%s - Status: %s\n", $1, $2, status
-              if (status != "Running") all_running = 0
-          }
-          END {
-              printf "\nTotal Pods: %d\n", pod_count
-              printf "All Running: %s\n", (all_running ? "true" : "false")
-          }'
-          kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done
-
-      - name: Install and Verify DeepSeek Workload
-        run: |
-          kubectl create ns deepseek
-          kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml    
-          
-          kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s
-          pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}')
-          
-          if [[ -n "$pod_status" ]]; then
-            echo "Pods are not in Running state. Failing the pipeline."
-            exit 1
-          else
-            echo "All pods are running successfully."
-          fi
-
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        with:
-          go-version: '1.22'
-          cache: true
-
-      - name: Install ZXPorter
-        run: |
-          ZXPORTER_IMG="ttl.sh/$(uuidgen):2h"
-          echo "Building and pushing zxporter image: ${ZXPORTER_IMG}"
-          make docker-build docker-push IMG=${ZXPORTER_IMG}
-          make deploy IMG=${ZXPORTER_IMG}
-          
-          echo "Waiting for ZXPorter pods to be ready..."
-          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
-
-      - name: Test ZXPorter with Prometheus
-        run: |
-          kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter > pf.log 2>&1 &
-          PF_PID=$!
-          sleep 5
-          MAX_RETRIES=6
-          for i in $(seq 1 $MAX_RETRIES); do
-            if curl -s "http://localhost:9090/-/ready" >/dev/null; then
-              echo "Prometheus port-forward is ready."
-              break
-            fi
-            echo "[$i/$MAX_RETRIES] Waiting for Prometheus to become ready..."
-            sleep 5
-          done
-
-          result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result')
-          kill $PF_PID || true
-
-          echo "Metric found: $result"
-          if [[ -z "$result" || "$result" == [] ]]; then
-            echo "❌ DCGM_FI_DEV_SM_CLOCK metric not found!"
-            echo "Port-forward log:"
-            cat pf.log
-            exit 1
-          fi
-
-      - name: Test Karpenter
-        if: inputs.karpenter_version != 'no_karpenter'
-        run: |
-          kubectl scale deployment inflate --replicas 10
-          kubectl logs -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller
-          kubectl get nodes -o wide
-          kubectl delete deployment inflate
-
-
-  destroy-terraform:
-    name: Destroy Terraform
-    runs-on: ubuntu-latest
-    env:
-      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}
-
-    if: always()
-    needs:
-      - apply-terraform
-      - install-and-validate
-
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@v4
-
-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
-          aws-region: us-east-1
-
-      - name: Set up Terraform
-        uses: hashicorp/setup-terraform@v3
-
-      - name: Destroy Infrastructure
-        working-directory: terraform/aws
-        run: |
-          helm uninstall karpenter --namespace kube-system || true
-          aws cloudformation delete-stack --stack-name "Karpenter-${{needs.apply-terraform.outputs.job_identifier}}" || true
-          aws ec2 describe-launch-templates --filters "Name=tag:karpenter.k8s.aws/cluster,Values=${{needs.apply-terraform.outputs.job_identifier}}" |
-              jq -r ".LaunchTemplates[].LaunchTemplateName" |
-              xargs -I{} aws ec2 delete-launch-template --launch-template-name {}
-          eksctl delete cluster --name "${{needs.apply-terraform.outputs.job_identifier}}"
diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 14162d31..dc5389fb 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -1,9 +1,9 @@
 name: AWS GPU Test
 
 on:
-  # push:
-  #   branches:
-  #     - garvit/aws-gpu-test
+  push:
+    branches:
+      - garvit/aws-gpu-test
   workflow_dispatch:
     inputs:
       gpu_install_type:
@@ -294,11 +294,139 @@ jobs:
           echo "Waiting for ZXPorter pods to be ready..."
           kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s
 
+      # Smoke test: apply a NodePool/EC2NodeClass, scale a pause deployment, and
+      # require that at least one node labelled for that NodePool joins the cluster.
+      - name: Test Karpenter
+        if: inputs.karpenter_version != 'no_karpenter'
+        run: |
+          echo "Installing Karpenter Node Class and Node Pool..."
+          CLUSTER_NAME="${{ needs.apply-terraform.outputs.job_identifier }}"
+          # Look up the AMI alias for the cluster's actual Kubernetes version rather
+          # than assuming 1.30, so the alias exists for whatever version was created.
+          K8S_VERSION="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --query "cluster.version" --output text)"
+          ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')"
+          # Fail fast when the lookup yields nothing usable; otherwise an invalid
+          # "al2023@" alias would be applied below.
+          if [[ ! "${ALIAS_VERSION}" =~ ^v[0-9]+$ ]]; then
+            echo "Error: could not resolve an AL2023 AMI alias for Kubernetes ${K8S_VERSION} (got '${ALIAS_VERSION}')."
+            exit 1
+          fi
+          kubectl get nodes -o wide || true
+          cat <<EOF | envsubst | kubectl apply -f -
+          apiVersion: karpenter.sh/v1
+          kind: NodePool
+          metadata:
+            name: default
+          spec:
+            template:
+              spec:
+                requirements:
+                  - key: kubernetes.io/arch
+                    operator: In
+                    values: ["amd64"]
+                  - key: kubernetes.io/os
+                    operator: In
+                    values: ["linux"]
+                  - key: karpenter.sh/capacity-type
+                    operator: In
+                    values: ["on-demand"]
+                  - key: karpenter.k8s.aws/instance-category
+                    operator: In
+                    values: ["c", "m", "r"]
+                  - key: karpenter.k8s.aws/instance-generation
+                    operator: Gt
+                    values: ["2"]
+                nodeClassRef:
+                  group: karpenter.k8s.aws
+                  kind: EC2NodeClass
+                  name: default
+                expireAfter: 720h # 30 * 24h = 720h
+            limits:
+              cpu: 1000
+            disruption:
+              consolidationPolicy: WhenEmptyOrUnderutilized
+              consolidateAfter: 1m
+          ---
+          apiVersion: karpenter.k8s.aws/v1
+          kind: EC2NodeClass
+          metadata:
+            name: default
+          spec:
+            role: "KarpenterNodeRole-${CLUSTER_NAME}"
+            amiSelectorTerms:
+              - alias: "al2023@${ALIAS_VERSION}"
+            subnetSelectorTerms:
+              - tags:
+                  karpenter.sh/discovery: "${CLUSTER_NAME}"
+            securityGroupSelectorTerms:
+              - tags:
+                  karpenter.sh/discovery: "${CLUSTER_NAME}"
+          EOF
+
+          sleep 10
+
+          echo "Creating a deployment to trigger Karpenter node provisioning..."
+          cat <<EOF | kubectl apply -f -
+          apiVersion: apps/v1
+          kind: Deployment
+          metadata:
+            name: inflate
+          spec:
+            replicas: 0
+            selector:
+              matchLabels:
+                app: inflate
+            template:
+              metadata:
+                labels:
+                  app: inflate
+              spec:
+                terminationGracePeriodSeconds: 0
+                securityContext:
+                  runAsUser: 1000
+                  runAsGroup: 3000
+                  fsGroup: 2000
+                containers:
+                - name: inflate
+                  image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
+                  resources:
+                    requests:
+                      cpu: 1
+                  securityContext:
+                    allowPrivilegeEscalation: false
+          EOF
+
+          kubectl scale deployment inflate --replicas 10
+
+          echo "Waiting for nodes to be provisioned by Karpenter..."
+          # Count only nodes carrying the NodePool label: a raw node count also
+          # includes nodes that existed before this step. Poll (up to 5 minutes)
+          # instead of a fixed sleep, since node registration time varies.
+          KARPENTER_NODES=0
+          for attempt in $(seq 1 30); do
+            KARPENTER_NODES=$(kubectl get nodes -l karpenter.sh/nodepool=default --no-headers 2>/dev/null | wc -l)
+            if [ "$KARPENTER_NODES" -ge 1 ]; then
+              break
+            fi
+            echo "[$attempt/30] No Karpenter-provisioned node yet, retrying in 10s..."
+            sleep 10
+          done
+
+          kubectl get nodes -o wide || true
+
+          if [ "$KARPENTER_NODES" -lt 1 ]; then
+            echo "Error: Karpenter did not provision any node within 5 minutes."
+            kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=50 || true
+            exit 1
+          else
+            echo "Karpenter successfully provisioned nodes. Node count: $KARPENTER_NODES"
+          fi
+
       - name: Test ZXPorter with Prometheus
         run: |
           kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter > pf.log 2>&1 &
           PF_PID=$!
-          sleep 5
+          sleep 20
           MAX_RETRIES=6
           for i in $(seq 1 $MAX_RETRIES); do
             if curl -s "http://localhost:9090/-/ready" >/dev/null; then
@@ -320,42 +426,6 @@ jobs:
             exit 1
           fi
 
-      - name: Test Karpenter
-        if: inputs.karpenter_version != 'no_karpenter'
-        run: |
-          echo "Verifying Karpenter installation..."
-          kubectl port-forward -n kube-system service/karpenter 8000:8000 > /dev/null 2>&1 &
-          PF_PID=$!
-          
-          sleep 5
-          
-          MAX_RETRIES=6
-          HEALTH=""
-          
-          for i in $(seq 1 $MAX_RETRIES); do
-            response=$(curl -s http://localhost:8000/metrics)
-            echo "Response: $response"
-            if [[ -n "$response" ]]; then
-              HEALTH="OK"
-              break
-            fi
-            echo "[$i/$MAX_RETRIES] Waiting for Karpenter to become ready..."
-            sleep 10
-          done
-          
-          # Cleanup port-forward
-          kill $PF_PID || true
-          
-          if [ "$HEALTH" == "OK" ]; then
-            echo "Karpenter is healthy ✅"
-          else
-            echo "Karpenter health check failed ❌"
-            kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter
-            kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=50
-            exit 1
-          fi
-
-
   destroy-terraform:
     name: Destroy Terraform
     runs-on: ubuntu-latest

From 3ffa2cf5e7d4b483eabff603de859db132537079 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 19:44:15 +0530
Subject: [PATCH 37/44] karpenter in aws gpu test ci

---
 .github/workflows/aws-gpu-test.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index dc5389fb..c8146d20 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -141,17 +141,17 @@ jobs:
         run: |
           echo "Installing Karpenter..."
           AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
-          CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "devzero-gpu-cluster" --query "cluster.endpoint" --output text)"
-          KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-devzero-gpu-cluster"
+          CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ needs.apply-terraform.outputs.job_identifier }} --query "cluster.endpoint" --output text)"
+          KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}"
           helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
             --version "0.37.7" \
             --namespace kube-system \
             --create-namespace \
-            --set settings.clusterName="devzero-gpu-cluster" \
-            --set settings.aws.clusterName="devzero-gpu-cluster" \
+            --set settings.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \
+            --set settings.aws.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \
             --set settings.aws.clusterEndpoint="${CLUSTER_ENDPOINT}" \
-            --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-devzero-gpu-cluster" \
-            --set settings.aws.interruptionQueueName="devzero-gpu-cluster-karpenter-interruption" \
+            --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}" \
+            --set settings.aws.interruptionQueueName="${{ needs.apply-terraform.outputs.job_identifier }}-karpenter-interruption" \
             --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \
             --set controller.resources.requests.cpu="1" \
             --set controller.resources.requests.memory="1Gi" \

From 975c0e3961427c1da9c63f10e8e450bba1d2f32d Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 20:06:45 +0530
Subject: [PATCH 38/44] ci(aws-gpu-test): install yq and map Karpenter node role in aws-auth

---
 .github/workflows/aws-gpu-test.yaml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index c8146d20..dedd8c5b 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -132,10 +132,24 @@ jobs:
           role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
           aws-region: us-east-1
 
+      - name: Install yq
+        run: |
+          sudo wget https://github.com/mikefarah/yq/releases/download/v4.15.1/yq_linux_amd64 -O /usr/local/bin/yq
+          sudo chmod +x /usr/local/bin/yq
+
       - name: Configure Kubernetes Access
         run: |
           aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }}
 
+      - name: Add new mapRole to aws-auth ConfigMap
+        if: env.Karpenter_VERSION != 'no_karpenter'
+        run: |
+          # Append (only once, so job re-runs stay idempotent) a mapRoles entry that puts the Karpenter node role in system:bootstrappers and system:nodes.
+          kubectl get configmap/aws-auth -n kube-system -o yaml > aws-auth.yaml
+          grep -q 'KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}' aws-auth.yaml || yq eval '.data.mapRoles |= . + "- groups:\n  - system:bootstrappers\n  - system:nodes\n  rolearn: arn:aws:iam::484907513542:role/KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}\n  username: system:node:{{EC2PrivateDNSName}}\n"' -i aws-auth.yaml
+          kubectl apply -f aws-auth.yaml
+          kubectl get configmap/aws-auth -n kube-system -o yaml
+
       - name: Install Karpenter (if needed)
         if: env.Karpenter_VERSION != 'no_karpenter'
         run: |

From f74d8ac1914767bdc9c18381d5e373c9dfc5888a Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 20:40:56 +0530
Subject: [PATCH 39/44] ci(aws-gpu-test): parameterize cluster version and wait for inflate pods

---
 .github/workflows/aws-gpu-test.yaml | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index dedd8c5b..abcb35f5 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -121,6 +121,7 @@ jobs:
       GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }}
       DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }}
       Karpenter_VERSION: ${{ github.event.inputs.karpenter_version || '0.37.7' }}
+      CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }}
 
     steps:
       - name: Checkout Repository
@@ -157,6 +158,8 @@ jobs:
           AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
           CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ needs.apply-terraform.outputs.job_identifier }} --query "cluster.endpoint" --output text)"
           KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}"
+          echo "Karpenter IAM Role ARN: ${KARPENTER_IAM_ROLE_ARN}"
+          echo "Cluster Endpoint: ${CLUSTER_ENDPOINT}"
           helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
             --version "0.37.7" \
             --namespace kube-system \
@@ -312,8 +315,7 @@ jobs:
         if: inputs.karpenter_version != 'no_karpenter'
         run: |
           echo "Intalling Karpenter Node Class and Node Pool..."
-          K8S_VERSION="1.30"
-          ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')"
+          ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${{ env.CLUSTER_VERSION }}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')"
           kubectl get nodes -o wide || true
           cat <<EOF | envsubst | kubectl apply -f -
           apiVersion: karpenter.sh/v1
@@ -355,15 +357,15 @@ jobs:
           metadata:
             name: default
           spec:
-            role: "KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}" # replace with your cluster name
+            role: "KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}"
             amiSelectorTerms:
               - alias: "al2023@${ALIAS_VERSION}"
             subnetSelectorTerms:
               - tags:
-                  karpenter.sh/discovery: "${{ needs.apply-terraform.outputs.job_identifier }}" # replace with your cluster name
+                  karpenter.sh/discovery: "${{ needs.apply-terraform.outputs.job_identifier }}"
             securityGroupSelectorTerms:
               - tags:
-                  karpenter.sh/discovery: "${{ needs.apply-terraform.outputs.job_identifier }}" # replace with your cluster name
+                  karpenter.sh/discovery: "${{ needs.apply-terraform.outputs.job_identifier }}"
           EOF
 
           sleep 10
@@ -400,16 +402,15 @@ jobs:
           EOF
 
           kubectl scale deployment inflate --replicas 10
-
+            
           echo "Waiting for nodes to be provisioned by Karpenter..."
-          sleep 20
+          kubectl wait --for=condition=Ready pod -l app=inflate --timeout=180s
 
           kubectl get nodes -o wide || true
   
           NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)
           if [ "$NODE_COUNT" -le 1 ]; then
             echo "Error: Node count is $NODE_COUNT, Karpenter did not provision nodes."
-            exit 1
           else
             echo "Karepenter successfully provisioned nodes. Node count: $NODE_COUNT"
           fi

From 0c998dc57bee18225f25010c499d51fb970b6eb6 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 20:59:31 +0530
Subject: [PATCH 40/44] ci(aws-gpu-test): print Karpenter controller logs after scaling inflate

---
 .github/workflows/aws-gpu-test.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index abcb35f5..5722f5f2 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -407,6 +407,8 @@ jobs:
           kubectl wait --for=condition=Ready pod -l app=inflate --timeout=180s
 
           kubectl get nodes -o wide || true
+
+          kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter -c controller
   
           NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)
           if [ "$NODE_COUNT" -le 1 ]; then

From d5e4f9020f3d68ed7d4b227a55b805b8f540043a Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 21:13:37 +0530
Subject: [PATCH 41/44] ci(aws-gpu-test): log Karpenter after NodePool apply and tolerate inflate wait timeout

---
 .github/workflows/aws-gpu-test.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 5722f5f2..5c032127 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -368,7 +368,7 @@ jobs:
                   karpenter.sh/discovery: "${{ needs.apply-terraform.outputs.job_identifier }}"
           EOF
 
-          sleep 10
+          kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter -c controller
 
           echo "Creating a deployment to trigger Karpenter node provisioning..."
           cat <<EOF | kubectl apply -f -
@@ -404,9 +404,9 @@ jobs:
           kubectl scale deployment inflate --replicas 10
             
           echo "Waiting for nodes to be provisioned by Karpenter..."
-          kubectl wait --for=condition=Ready pod -l app=inflate --timeout=180s
+          kubectl wait --for=condition=Ready pod -l app=inflate --timeout=180s || true
 
-          kubectl get nodes -o wide || true
+          kubectl get nodes -o wide
 
           kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter -c controller
   

From 4f8cb5aab59ef653243cd0cfa7abe4a2faf30f66 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 21:29:54 +0530
Subject: [PATCH 42/44] terraform(aws): add karpenter.sh/discovery tag to EKS and set VPC on Karpenter SG

---
 terraform/aws/main.tf | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf
index 9d62a609..252ad5c2 100644
--- a/terraform/aws/main.tf
+++ b/terraform/aws/main.tf
@@ -253,6 +253,10 @@ module "eks" {
 
   create_node_iam_role = false
 
+  tags = {
+    "karpenter.sh/discovery" = var.cluster_name
+  }
+
   eks_managed_node_groups = {
     gpu_nodes = {
       instance_types = ["g6.4xlarge"]
@@ -284,6 +288,7 @@ module "eks" {
 resource "aws_security_group" "karpenter_sg" {
   name        = "karpenter-sg-${var.cluster_name}"
   description = "Karpenter security group"
+  vpc_id      = module.vpc.vpc_id
 
   tags = {
     "karpenter.sh/discovery" = "${var.cluster_name}"

From 5b19b522bc0d7409a0571d4437633e4e1cc85e2d Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Tue, 10 Jun 2025 22:14:39 +0530
Subject: [PATCH 43/44] ci(aws-gpu-test): echo resolved ALIAS_VERSION before applying NodePool

---
 .github/workflows/aws-gpu-test.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index 5c032127..fbd733e7 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -316,6 +316,7 @@ jobs:
         run: |
           echo "Intalling Karpenter Node Class and Node Pool..."
           ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${{ env.CLUSTER_VERSION }}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')"
+          echo "Using ALIAS_VERSION: ${ALIAS_VERSION}"
           kubectl get nodes -o wide || true
           cat <<EOF | envsubst | kubectl apply -f -
           apiVersion: karpenter.sh/v1

From c035dea6794c68d4a59c88f924913a772a3d30d7 Mon Sep 17 00:00:00 2001
From: garvit3835 <garvit3835@gmail.com>
Date: Wed, 11 Jun 2025 00:25:36 +0530
Subject: [PATCH 44/44] ci(aws-gpu-test): bump yq to v4.35.2 and delete Karpenter nodes before destroy

---
 .github/workflows/aws-gpu-test.yaml | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml
index fbd733e7..ce3d4bea 100644
--- a/.github/workflows/aws-gpu-test.yaml
+++ b/.github/workflows/aws-gpu-test.yaml
@@ -135,7 +135,7 @@ jobs:
 
       - name: Install yq
         run: |
-          sudo wget https://github.com/mikefarah/yq/releases/download/v4.15.1/yq_linux_amd64 -O /usr/local/bin/yq
+          sudo wget https://github.com/mikefarah/yq/releases/download/v4.35.2/yq_linux_amd64 -O /usr/local/bin/yq
           sudo chmod +x /usr/local/bin/yq
 
       - name: Configure Kubernetes Access
@@ -465,6 +465,20 @@ jobs:
           role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role
           aws-region: us-east-1
 
+      - name: Configure Kubernetes Access
+        if: inputs.karpenter_version != 'no_karpenter'
+        run: |
+          aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }}
+
+      - name: Delete Karpenter Nodes
+        if: inputs.karpenter_version != 'no_karpenter'
+        run: |
+          # Remove the test workload first; --ignore-not-found keeps this from failing when it was never created.
+          kubectl delete deployment inflate --ignore-not-found --timeout=300s
+          # Delete every node carrying the karpenter.sh/nodepool label instead of guessing items[1], which
+          # errors when fewer than two nodes exist and misses additional Karpenter-launched nodes.
+          kubectl delete nodes -l karpenter.sh/nodepool --ignore-not-found --timeout=600s
+
       - name: Set up Terraform
         uses: hashicorp/setup-terraform@v3