From ac1780caaa6c3b08eb2749a9015677eaa37e8978 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 1 Jun 2025 14:38:29 +0530 Subject: [PATCH 01/44] ci for testing gpu metrics in eks --- .github/workflows/aws-gpu-test.yaml | 215 ++++++++++++++++++++++++++++ .gitignore | 5 + terraform/aws/main.tf | 57 ++++++++ terraform/aws/terraform.tfvars | 2 + terraform/aws/variables.tf | 9 ++ 5 files changed, 288 insertions(+) create mode 100644 .github/workflows/aws-gpu-test.yaml create mode 100644 terraform/aws/main.tf create mode 100644 terraform/aws/terraform.tfvars create mode 100644 terraform/aws/variables.tf diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml new file mode 100644 index 00000000..048ffa69 --- /dev/null +++ b/.github/workflows/aws-gpu-test.yaml @@ -0,0 +1,215 @@ +name: AWS Terraform EKS Setup and ZXPorter Installation + +on: + push: + branches: + - garvit/aws-gpu-test + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + setup-eks: + name: Setup EKS and Install ZXPorter + runs-on: ubuntu-latest + + outputs: + job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Configure AWS Credential + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Generate Unique Job Identifier + id: job-identifier + shell: bash + run: | + SHORT_SHA=$(git rev-parse --short HEAD) + JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}" + echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV + echo "::set-output name=job_identifier::${JOB_IDENTIFIER}" + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.5.7 + + - name: Apply Terraform + working-directory: terraform/aws + run: | + cat < backend_override.tf + terraform { + backend "s3" { + bucket = "zxporter-tf-state" + key = "${JOB_IDENTIFIER}/terraform.tfstate" + region = "us-east-1" + } + } + EOF + terraform init + terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" + + install-and-validate: + name: Install and Validate ZXPorter and GPU Resources + runs-on: ubuntu-latest + needs: setup-eks + + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Configure Kubernetes Access + run: | + aws eks update-kubeconfig --region us-east-1 --name ${{ needs.setup-eks.outputs.job_identifier }} + + - name: Check GPU Availability + id: gpu_check + run: | + echo "Checking GPU resources on nodes..." + if kubectl describe nodes | grep -q "nvidia.com/gpu"; then + echo "GPU resources are available on the nodes." + else + echo "GPU check failed" + exit 0 + fi + + - name: Install GPU Operator (if needed) + if: steps.gpu_check.outcome == 'success' + run: | + echo "GPU resources not found, installing GPU Operator..." + kubectl create ns gpu-operator + kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite + kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \ + helm repo update + helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0 + + - name: Check GPU Availability After Installing GPU Operator + if: steps.gpu_check.outcome == 'success' + run: | + echo "Re-checking GPU resources on nodes after GPU Operator installation..." + if kubectl describe nodes | grep -q "nvidia.com/gpu"; then + echo "GPU resources are available on the nodes." + else + echo "GPU check failed after GPU Operator installation" + exit 1 + fi + + - name: Check DCGM DaemonSet + id: dcgm_check + run: | + echo "Checking if DCGM DaemonSet is installed..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "DCGM DaemonSet is installed." + else + echo "DCGM DaemonSet not found" + exit 0 + fi + + - name: Install DCGM Exporter (if needed) + if: steps.dcgm_check.outcome == 'success' + run: | + echo "Installing DCGM Exporter..." + kubectl create ns devzero-zxporter + curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f - + + - name: Check DCGM DaemonSet After Installing DCGM Exporter + if: steps.dcgm_check.outcome == 'success' + run: | + echo "Re-checking DCGM pods after DCGM Exporter installation..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "DCGM DaemonSet is running." + else + echo "DCGM DaemonSet not running after installation" + exit 1 + fi + + - name: Verify DCGM Pods and Prometheus Annotations + run: | + echo "Verifying DCGM pods and Prometheus annotations..." + kubectl get pods -A | grep dcgm-exporter | awk ' + BEGIN { all_running = 1; pod_count = 0 } + { + pod_count++ + status = $4 + printf "Pod: %s/%s - Status: %s\n", $1, $2, status + if (status != "Running") all_running = 0 + } + END { + printf "\nTotal Pods: %d\n", pod_count + printf "All Running: %s\n", (all_running ? "true" : "false") + }' + kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done + + - name: Install and Verify DeepSeek Workload + run: | + kubectl create ns deepseek + kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml + + kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s + pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}') + + if [[ -n "$pod_status" ]]; then + echo "Pods are not in Running state. Failing the pipeline." + exit 1 + else + echo "All pods are running successfully." + fi + + - name: Install ZXPorter + run: | + curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \ + -H "X-Kube-Context-Name: $(kubectl config current-context)" \ + "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \ + kubectl apply -f - + + - name: Test ZXPorter with Prometheus + run: | + kubectl port-forward svc/prometheus-server 9090:80 -n devzero-zxporter & + sleep 5 + result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') + if [[ -z "$result" || "$result" == "null" ]]; then + echo "DCGM_FI_DEV_SM_CLOCK metric not found!" + exit 1 + fi + echo "Metric found: $result" + + destroy-terraform: + name: Destroy Infrastructure + runs-on: ubuntu-latest + if: always() + needs: install-and-validate + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Configure Terraform Backend + run: | + cat < backend_override.tf + terraform { + backend "s3" { + bucket = "zxporter-tf-state" + key = "${{ needs.setup-eks.outputs.job_identifier }}/terraform.tfstate" + region = "us-east-1" + } + } + EOF + terraform init -backend-config=backend_override.tf + + - name: Destroy Infrastructure + working-directory: terraform/aws + run: terraform destroy -auto-approve diff --git a/.gitignore b/.gitignore index f2f57448..e5cfe436 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,8 @@ config/**/charts *.swp *.swo *~ + +# Terraform files +*.tfstate +*.tfstate.backup +.terraform* diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf new file mode 100644 index 00000000..db292abf --- /dev/null +++ b/terraform/aws/main.tf @@ -0,0 +1,57 @@ +provider "aws" { + region = "us-east-1" +} + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + + name = "${var.cluster_name}-vpc" + cidr = "10.0.0.0/16" + + azs = ["us-east-1a", "us-east-1b"] + private_subnets = ["10.0.1.0/24", "10.0.2.0/24"] + public_subnets = ["10.0.101.0/24", "10.0.102.0/24"] + + enable_nat_gateway = true + single_nat_gateway = true + + # Required for EKS + enable_dns_hostnames = true + enable_dns_support = true +} + +module "eks" { + source = "terraform-aws-modules/eks/aws" + + cluster_name = var.cluster_name + cluster_version = var.cluster_version + + # Add VPC configuration + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + enable_irsa = true + + cluster_endpoint_public_access = true + enable_cluster_creator_admin_permissions = true + cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"] + + eks_managed_node_groups = { + gpu_nodes = { + instance_types = ["g6.4xlarge"] + desired_size = 1 + min_size = 1 + max_size = 1 + + ami_type = "AL2023_x86_64_NVIDIA" + + use_custom_launch_template = false + + disk_size = 200 + + labels = { + node_type = "gpu" + } + } + } +} diff --git a/terraform/aws/terraform.tfvars b/terraform/aws/terraform.tfvars new file mode 100644 index 00000000..e343f0bb --- /dev/null +++ b/terraform/aws/terraform.tfvars @@ -0,0 +1,2 @@ +cluster_name = "devzero-gpu-cluster" +cluster_version = "1.30" \ No newline at end of file diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf new file mode 100644 index 00000000..b9738fb3 --- /dev/null +++ b/terraform/aws/variables.tf @@ -0,0 +1,9 @@ +variable "cluster_name" { + description = "The name of the EKS cluster" + type = string +} + +variable "cluster_version" { + description = "The Kubernetes version for the EKS cluster" + type = string +} From 11cee4f8b25a41f31cf85a0794c32f195eb46fb4 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 1 Jun 2025 15:30:44 +0530 Subject: [PATCH 02/44] ci for testing gpu metrics in eks --- .github/workflows/aws-gpu-test.yaml | 35 +++++++++++++++++++---------- terraform/aws/main.tf | 10 +++++++++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 048ffa69..979fd718 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -11,8 +11,8 @@ permissions: contents: read jobs: - setup-eks: - name: Setup EKS and Install ZXPorter + apply-terraform: + name: Apply Terraform Configuration runs-on: ubuntu-latest outputs: @@ -58,7 +58,7 @@ jobs: terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" install-and-validate: - name: Install and Validate ZXPorter and GPU Resources + name: Install and Validate GPU Resources and ZXPorter runs-on: ubuntu-latest needs: setup-eks @@ -77,15 +77,17 @@ jobs: id: gpu_check run: | echo "Checking GPU resources on nodes..." + kubectl describe nodes | grep "nvidia.com/gpu" if kubectl describe nodes | grep -q "nvidia.com/gpu"; then echo "GPU resources are available on the nodes." + echo "true" >> $GITHUB_ENV else echo "GPU check failed" - exit 0 + echo "false" >> $GITHUB_ENV fi - name: Install GPU Operator (if needed) - if: steps.gpu_check.outcome == 'success' + if: env.GPU_CHECK == 'false' run: | echo "GPU resources not found, installing GPU Operator..." kubectl create ns gpu-operator @@ -96,9 +98,10 @@ jobs: helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0 - name: Check GPU Availability After Installing GPU Operator - if: steps.gpu_check.outcome == 'success' + if: env.GPU_CHECK == 'false' run: | echo "Re-checking GPU resources on nodes after GPU Operator installation..." + kubectl describe nodes | grep "nvidia.com/gpu" if kubectl describe nodes | grep -q "nvidia.com/gpu"; then echo "GPU resources are available on the nodes." else @@ -110,24 +113,27 @@ jobs: id: dcgm_check run: | echo "Checking if DCGM DaemonSet is installed..." + kubectl get daemonset -A | grep dcgm if kubectl get daemonset -A | grep -q dcgm; then - echo "DCGM DaemonSet is installed." + echo "DCGM DaemonSet is already installed." + echo "true" >> $GITHUB_ENV else - echo "DCGM DaemonSet not found" - exit 0 + echo "DCGM DaemonSet not found." + echo "false" >> $GITHUB_ENV fi - name: Install DCGM Exporter (if needed) - if: steps.dcgm_check.outcome == 'success' + if: env.DCGM_CHECK == 'false' run: | echo "Installing DCGM Exporter..." kubectl create ns devzero-zxporter curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f - - name: Check DCGM DaemonSet After Installing DCGM Exporter - if: steps.dcgm_check.outcome == 'success' + if: env.DCGM_CHECK == 'false' run: | echo "Re-checking DCGM pods after DCGM Exporter installation..." + kubectl get daemonset -A | grep dcgm if kubectl get daemonset -A | grep -q dcgm; then echo "DCGM DaemonSet is running." else @@ -186,7 +192,7 @@ jobs: echo "Metric found: $result" destroy-terraform: - name: Destroy Infrastructure + name: Destroy Terraform runs-on: ubuntu-latest if: always() needs: install-and-validate @@ -197,6 +203,11 @@ jobs: role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role aws-region: us-east-1 + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.5.7 + - name: Configure Terraform Backend run: | cat < backend_override.tf diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf index db292abf..e3ef7095 100644 --- a/terraform/aws/main.tf +++ b/terraform/aws/main.tf @@ -18,6 +18,16 @@ module "vpc" { # Required for EKS enable_dns_hostnames = true enable_dns_support = true + + public_subnet_tags = { + "kubernetes.io/cluster/${var.cluster_name}" = "shared" + "kubernetes.io/role/elb" = "1" + } + + private_subnet_tags = { + "kubernetes.io/cluster/${var.cluster_name}" = "shared" + "kubernetes.io/role/internal-elb" = "1" + } } module "eks" { From 0d4c5c98e82379ef5238defd6145ea84e7911a6b Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 1 Jun 2025 15:32:44 +0530 Subject: [PATCH 03/44] ci for testing gpu metrics in eks --- .github/workflows/aws-gpu-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 979fd718..8881c65b 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -60,7 +60,7 @@ jobs: install-and-validate: name: Install and Validate GPU Resources and ZXPorter runs-on: ubuntu-latest - needs: setup-eks + needs: apply-terraform steps: - name: Configure AWS Credentials From d67f0a5d270dca8299f3c2708f6a5e880481dce3 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 1 Jun 2025 16:02:05 +0530 Subject: [PATCH 04/44] ci for testing gpu metrics in eks --- .github/workflows/aws-gpu-test.yaml | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 8881c65b..927e0a6b 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -12,7 +12,7 @@ permissions: jobs: apply-terraform: - name: Apply Terraform Configuration + name: Apply Terraform runs-on: ubuntu-latest outputs: @@ -71,7 +71,7 @@ jobs: - name: Configure Kubernetes Access run: | - aws eks update-kubeconfig --region us-east-1 --name ${{ needs.setup-eks.outputs.job_identifier }} + aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }} - name: Check GPU Availability id: gpu_check @@ -197,6 +197,9 @@ jobs: if: always() needs: install-and-validate steps: + - name: Checkout Repository + uses: actions/checkout@v4 + - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4 with: @@ -208,19 +211,10 @@ jobs: with: terraform_version: 1.5.7 - - name: Configure Terraform Backend - run: | - cat < backend_override.tf - terraform { - backend "s3" { - bucket = "zxporter-tf-state" - key = "${{ needs.setup-eks.outputs.job_identifier }}/terraform.tfstate" - region = "us-east-1" - } - } - EOF - terraform init -backend-config=backend_override.tf - - name: Destroy Infrastructure working-directory: terraform/aws - run: terraform destroy -auto-approve + run: | + terraform init -backend-config="bucket=zxporter-tf-state" \ + -backend-config="key=${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate" \ + -backend-config="region=us-east-1" + terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" From bd128a52213458bc2918ede42a64487aad4973a9 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 1 Jun 2025 16:39:42 +0530 Subject: [PATCH 05/44] ci for testing gpu metrics in eks --- .github/workflows/aws-gpu-test.yaml | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 927e0a6b..a7e54c4b 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -35,7 +35,7 @@ jobs: SHORT_SHA=$(git rev-parse --short HEAD) JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}" echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV - echo "::set-output name=job_identifier::${JOB_IDENTIFIER}" + echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT - name: Set up Terraform uses: hashicorp/setup-terraform@v3 @@ -77,7 +77,6 @@ jobs: id: gpu_check run: | echo "Checking GPU resources on nodes..." - kubectl describe nodes | grep "nvidia.com/gpu" if kubectl describe nodes | grep -q "nvidia.com/gpu"; then echo "GPU resources are available on the nodes." echo "true" >> $GITHUB_ENV @@ -101,7 +100,6 @@ jobs: if: env.GPU_CHECK == 'false' run: | echo "Re-checking GPU resources on nodes after GPU Operator installation..." - kubectl describe nodes | grep "nvidia.com/gpu" if kubectl describe nodes | grep -q "nvidia.com/gpu"; then echo "GPU resources are available on the nodes." else @@ -113,7 +111,6 @@ jobs: id: dcgm_check run: | echo "Checking if DCGM DaemonSet is installed..." - kubectl get daemonset -A | grep dcgm if kubectl get daemonset -A | grep -q dcgm; then echo "DCGM DaemonSet is already installed." echo "true" >> $GITHUB_ENV @@ -133,7 +130,6 @@ jobs: if: env.DCGM_CHECK == 'false' run: | echo "Re-checking DCGM pods after DCGM Exporter installation..." - kubectl get daemonset -A | grep dcgm if kubectl get daemonset -A | grep -q dcgm; then echo "DCGM DaemonSet is running." else @@ -214,7 +210,14 @@ jobs: - name: Destroy Infrastructure working-directory: terraform/aws run: | - terraform init -backend-config="bucket=zxporter-tf-state" \ - -backend-config="key=${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate" \ - -backend-config="region=us-east-1" + cat < backend_override.tf + terraform { + backend "s3" { + bucket = "zxporter-tf-state" + key = "${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate" + region = "us-east-1" + } + } + EOF + terraform init terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" From 2cc613d5e3d8586c1b9e205498dd4355dc8348f7 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 1 Jun 2025 17:05:22 +0530 Subject: [PATCH 06/44] ci for testing gpu metrics in eks --- .github/workflows/aws-gpu-test.yaml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index a7e54c4b..218087ef 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -1,4 +1,4 @@ -name: AWS Terraform EKS Setup and ZXPorter Installation +name: AWS GPU Test on: push: @@ -79,10 +79,10 @@ jobs: echo "Checking GPU resources on nodes..." if kubectl describe nodes | grep -q "nvidia.com/gpu"; then echo "GPU resources are available on the nodes." - echo "true" >> $GITHUB_ENV + echo "GPU_CHECK=true" >> $GITHUB_ENV # Corrected! else echo "GPU check failed" - echo "false" >> $GITHUB_ENV + echo "GPU_CHECK=false" >> $GITHUB_ENV # Corrected! fi - name: Install GPU Operator (if needed) @@ -113,10 +113,10 @@ jobs: echo "Checking if DCGM DaemonSet is installed..." if kubectl get daemonset -A | grep -q dcgm; then echo "DCGM DaemonSet is already installed." - echo "true" >> $GITHUB_ENV + echo "DCGM_CHECK=true" >> $GITHUB_ENV else echo "DCGM DaemonSet not found." - echo "false" >> $GITHUB_ENV + echo "DCGM_CHECK=false" >> $GITHUB_ENV fi - name: Install DCGM Exporter (if needed) @@ -191,7 +191,10 @@ jobs: name: Destroy Terraform runs-on: ubuntu-latest if: always() - needs: install-and-validate + needs: + - apply-terraform + - install-and-validate + steps: - name: Checkout Repository uses: actions/checkout@v4 From d060fefed870b01516a88d97725da4e755c633a9 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 1 Jun 2025 18:11:11 +0530 Subject: [PATCH 07/44] ci for testing gpu metrics in eks --- .github/workflows/aws-gpu-test.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 218087ef..268c16ac 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -79,10 +79,10 @@ jobs: echo "Checking GPU resources on nodes..." if kubectl describe nodes | grep -q "nvidia.com/gpu"; then echo "GPU resources are available on the nodes." - echo "GPU_CHECK=true" >> $GITHUB_ENV # Corrected! + echo "GPU_CHECK=true" >> $GITHUB_ENV else echo "GPU check failed" - echo "GPU_CHECK=false" >> $GITHUB_ENV # Corrected! + echo "GPU_CHECK=false" >> $GITHUB_ENV fi - name: Install GPU Operator (if needed) @@ -139,6 +139,7 @@ jobs: - name: Verify DCGM Pods and Prometheus Annotations run: | + kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s echo "Verifying DCGM pods and Prometheus annotations..." kubectl get pods -A | grep dcgm-exporter | awk ' BEGIN { all_running = 1; pod_count = 0 } @@ -175,6 +176,9 @@ jobs: -H "X-Kube-Context-Name: $(kubectl config current-context)" \ "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \ kubectl apply -f - + + echo "Waiting for ZXPorter pods to be ready..." + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s - name: Test ZXPorter with Prometheus run: | From 921d7522a87671effb1782e3ac75ebb69bee45f8 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 1 Jun 2025 19:04:09 +0530 Subject: [PATCH 08/44] ci for testing gpu metrics in eks --- .github/workflows/aws-gpu-test.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 268c16ac..46699d5d 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -1,9 +1,6 @@ name: AWS GPU Test on: - push: - branches: - - garvit/aws-gpu-test workflow_dispatch: permissions: From cc7157ecb4fe14823d935763dda059d74b67c93b Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 3 Jun 2025 13:00:15 +0530 Subject: [PATCH 09/44] update in gpu test ci --- .github/workflows/aws-gpu-test.yaml | 23 ++++++++--- dist/install.yaml | 60 ++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 6 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 46699d5d..ab9ff3e2 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -1,6 +1,9 @@ name: AWS GPU Test on: + push: + branches: + - garvit/aws-gpu-test workflow_dispatch: permissions: @@ -150,7 +153,7 @@ jobs: printf "\nTotal Pods: %d\n", pod_count printf "All Running: %s\n", (all_running ? "true" : "false") }' - kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done + kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/port: "9400" --overwrite; done - name: Install and Verify DeepSeek Workload run: | @@ -167,12 +170,22 @@ jobs: echo "All pods are running successfully." fi + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.22' + cache: true + - name: Install ZXPorter run: | - curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \ - -H "X-Kube-Context-Name: $(kubectl config current-context)" \ - "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \ - kubectl apply -f - + ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" + echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" + make docker-build docker-push IMG=${ZXPORTER_IMG} + make deploy IMG=${ZXPORTER_IMG} + # curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \ + # -H "X-Kube-Context-Name: $(kubectl config current-context)" \ + # "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \ + # kubectl apply -f - echo "Waiting for ZXPorter pods to be ready..." kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s diff --git a/dist/install.yaml b/dist/install.yaml index 48c3461d..b3e6a2e2 100644 --- a/dist/install.yaml +++ b/dist/install.yaml @@ -117,6 +117,64 @@ data: scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node recording_rules.yml: | {} rules: | @@ -1171,4 +1229,4 @@ spec: volumes: - configMap: name: devzero-zxporter-env-config - name: config-volume + name: config-volume \ No newline at end of file From f06cebf29fba506edbc11040a83c7192ded47452 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 3 Jun 2025 18:16:15 +0530 Subject: [PATCH 10/44] matrix in ci for devzero and nvidia dcgm --- .github/workflows/aws-gpu-test.yaml | 69 ++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index ab9ff3e2..29320756 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -14,6 +14,9 @@ jobs: apply-terraform: name: Apply Terraform runs-on: ubuntu-latest + strategy: + matrix: + dcgm_install_type: [nvidia-dcgm, devzero-dcgm] outputs: job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} @@ -33,7 +36,12 @@ jobs: shell: bash run: | SHORT_SHA=$(git rev-parse --short HEAD) - JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}" + if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then + SUFFIX="nd" + else + SUFFIX="dd" + fi + JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}" echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT @@ -60,7 +68,10 @@ jobs: install-and-validate: name: Install and Validate GPU Resources and ZXPorter runs-on: ubuntu-latest - needs: apply-terraform + needs: apply-terraform + strategy: + matrix: + dcgm_install_type: [nvidia-dcgm, devzero-dcgm] steps: - name: Configure AWS Credentials @@ -87,6 +98,8 @@ jobs: - name: Install GPU Operator (if needed) if: env.GPU_CHECK == 'false' + env: + DCGM_INSTALL_TYPE: ${{ matrix.dcgm_install_type }} run: | echo "GPU resources not found, installing GPU Operator..." kubectl create ns gpu-operator @@ -94,7 +107,12 @@ jobs: kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \ helm repo update - helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0 + INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0" + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then + INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false" + fi + echo "Running: $INSTALL_CMD" + $INSTALL_CMD - name: Check GPU Availability After Installing GPU Operator if: env.GPU_CHECK == 'false' @@ -107,27 +125,28 @@ jobs: exit 1 fi - - name: Check DCGM DaemonSet + - name: Check Nvidia DCGM DaemonSet id: dcgm_check + if: matrix.dcgm_install_type == 'nvidia-dcgm' run: | echo "Checking if DCGM DaemonSet is installed..." if kubectl get daemonset -A | grep -q dcgm; then - echo "DCGM DaemonSet is already installed." - echo "DCGM_CHECK=true" >> $GITHUB_ENV + echo "Nvidia DCGM found, proceeding with validation." + echo "SKIP_INSTALL=false" >> $GITHUB_ENV else - echo "DCGM DaemonSet not found." - echo "DCGM_CHECK=false" >> $GITHUB_ENV + echo "Nvidia DCGM not found, skipping install and proceeding to destroy." + echo "SKIP_INSTALL=true" >> $GITHUB_ENV fi - - name: Install DCGM Exporter (if needed) - if: env.DCGM_CHECK == 'false' + - name: Install DevZero DCGM (only for devzero-dcgm) + if: matrix.dcgm_install_type == 'devzero-dcgm' run: | echo "Installing DCGM Exporter..." kubectl create ns devzero-zxporter curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f - - name: Check DCGM DaemonSet After Installing DCGM Exporter - if: env.DCGM_CHECK == 'false' + if: matrix.dcgm_install_type == 'devzero-dcgm' run: | echo "Re-checking DCGM pods after DCGM Exporter installation..." if kubectl get daemonset -A | grep -q dcgm; then @@ -138,6 +157,7 @@ jobs: fi - name: Verify DCGM Pods and Prometheus Annotations + if: env.SKIP_INSTALL != 'true' run: | kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s echo "Verifying DCGM pods and Prometheus annotations..." @@ -153,9 +173,10 @@ jobs: printf "\nTotal Pods: %d\n", pod_count printf "All Running: %s\n", (all_running ? "true" : "false") }' - kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/port: "9400" --overwrite; done + kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done - name: Install and Verify DeepSeek Workload + if: env.SKIP_INSTALL != 'true' run: | kubectl create ns deepseek kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml @@ -172,30 +193,33 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 + if: env.SKIP_INSTALL != 'true' with: go-version: '1.22' cache: true - name: Install ZXPorter + if: env.SKIP_INSTALL != 'true' run: | - ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" - echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" - make docker-build docker-push IMG=${ZXPORTER_IMG} - make deploy IMG=${ZXPORTER_IMG} - # curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \ - # -H "X-Kube-Context-Name: $(kubectl config current-context)" \ - # "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \ - # kubectl apply -f - + # ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" + # echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" + # make docker-build docker-push IMG=${ZXPORTER_IMG} + # make deploy IMG=${ZXPORTER_IMG} + curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \ + -H "X-Kube-Context-Name: $(kubectl config current-context)" \ + "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \ + kubectl apply -f - echo "Waiting for ZXPorter pods to be ready..." kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s - name: Test ZXPorter with Prometheus + if: env.SKIP_INSTALL != 'true' run: | kubectl port-forward svc/prometheus-server 9090:80 -n devzero-zxporter & sleep 5 result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') - if [[ -z "$result" || "$result" == "null" ]]; then + if [[ -z "$result" || "$result" == [] ]]; then echo "DCGM_FI_DEV_SM_CLOCK metric not found!" exit 1 fi @@ -204,6 +228,9 @@ jobs: destroy-terraform: name: Destroy Terraform runs-on: ubuntu-latest + strategy: + matrix: + dcgm_install_type: [nvidia-dcgm, devzero-dcgm] if: always() needs: - apply-terraform From db37092e0eaedd270e9ad77e3fb1b2ebc7b493c7 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 3 Jun 2025 18:35:00 +0530 Subject: [PATCH 11/44] matrix in ci for devzero and nvidia dcgm --- .github/workflows/aws-gpu-test.yaml | 39 ++++++++++++++++------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 29320756..f36a86f3 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -5,6 +5,15 @@ on: branches: - garvit/aws-gpu-test workflow_dispatch: + inputs: + dcgm_install_type: + description: 'DCGM install type' + required: false + default: 'devzero-dcgm' + type: choice + options: + - nvidia-dcgm + - devzero-dcgm permissions: id-token: write @@ -14,9 +23,8 @@ jobs: apply-terraform: name: Apply Terraform runs-on: ubuntu-latest - strategy: - matrix: - dcgm_install_type: [nvidia-dcgm, devzero-dcgm] + env: + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} outputs: job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} @@ -36,10 +44,10 @@ jobs: shell: bash run: | SHORT_SHA=$(git rev-parse --short HEAD) - if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then - SUFFIX="nd" - else + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then SUFFIX="dd" + else + SUFFIX="nd" fi JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}" echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV @@ -69,9 +77,8 @@ jobs: name: Install and Validate GPU Resources and ZXPorter runs-on: ubuntu-latest needs: apply-terraform - strategy: - matrix: - dcgm_install_type: [nvidia-dcgm, devzero-dcgm] + env: + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} steps: - name: Configure AWS Credentials @@ -98,8 +105,6 @@ jobs: - name: Install GPU Operator (if needed) if: env.GPU_CHECK == 'false' - env: - DCGM_INSTALL_TYPE: ${{ matrix.dcgm_install_type }} run: | echo "GPU resources not found, installing GPU Operator..." kubectl create ns gpu-operator @@ -127,7 +132,7 @@ jobs: - name: Check Nvidia DCGM DaemonSet id: dcgm_check - if: matrix.dcgm_install_type == 'nvidia-dcgm' + if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} run: | echo "Checking if DCGM DaemonSet is installed..." if kubectl get daemonset -A | grep -q dcgm; then @@ -139,14 +144,14 @@ jobs: fi - name: Install DevZero DCGM (only for devzero-dcgm) - if: matrix.dcgm_install_type == 'devzero-dcgm' + if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} run: | echo "Installing DCGM Exporter..." kubectl create ns devzero-zxporter curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f - - name: Check DCGM DaemonSet After Installing DCGM Exporter - if: matrix.dcgm_install_type == 'devzero-dcgm' + if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} run: | echo "Re-checking DCGM pods after DCGM Exporter installation..." if kubectl get daemonset -A | grep -q dcgm; then @@ -228,9 +233,9 @@ jobs: destroy-terraform: name: Destroy Terraform runs-on: ubuntu-latest - strategy: - matrix: - dcgm_install_type: [nvidia-dcgm, devzero-dcgm] + env: + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + if: always() needs: - apply-terraform From d211df09613b4aa1048d5b7a15bfd5eb74bf93c7 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 3 Jun 2025 19:03:57 +0530 Subject: [PATCH 12/44] matrix in ci for devzero and nvidia dcgm --- .github/workflows/aws-gpu-test.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index f36a86f3..f4e284dd 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -143,15 +143,15 @@ jobs: echo "SKIP_INSTALL=true" >> $GITHUB_ENV fi - - name: Install DevZero DCGM (only for devzero-dcgm) - if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} + - name: Install DevZero DCGM + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} run: | echo "Installing DCGM Exporter..." kubectl create ns devzero-zxporter curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f - - name: Check DCGM DaemonSet After Installing DCGM Exporter - if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} run: | echo "Re-checking DCGM pods after DCGM Exporter installation..." if kubectl get daemonset -A | grep -q dcgm; then From 8c4bed0f74f03884df11bd8e21b297edbbb190ba Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 3 Jun 2025 19:26:31 +0530 Subject: [PATCH 13/44] using makefile to install zxporter in ci --- .github/workflows/aws-gpu-test.yaml | 12 ++++-------- Makefile | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index f4e284dd..3bd1e552 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -206,14 +206,10 @@ jobs: - name: Install ZXPorter if: env.SKIP_INSTALL != 'true' run: | - # ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" - # echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" - # make docker-build docker-push IMG=${ZXPORTER_IMG} - # make deploy IMG=${ZXPORTER_IMG} - curl -XPOST -H 'Authorization: Bearer dzu-bdef3HBkpAs-SfpVcHXH0VJFhVibZ2qRCL1IRdYRlIs=' \ - -H "X-Kube-Context-Name: $(kubectl config current-context)" \ - "https://api.devzero.io/backend/v0/dakr/installer-manifest?cluster-provider=aws" | \ - kubectl apply -f - + ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" + echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" + make docker-build docker-push IMG=${ZXPORTER_IMG} + make deploy IMG=${ZXPORTER_IMG} echo "Waiting for ZXPorter pods to be ready..." kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s diff --git a/Makefile b/Makefile index a6be166d..a4bdfb11 100644 --- a/Makefile +++ b/Makefile @@ -125,11 +125,11 @@ help: ## Display this help. .PHONY: manifests manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. - $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases + $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases -w .PHONY: generate generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. - $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." + $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." -w .PHONY: fmt fmt: ## Run go fmt against code. From 9aeba3326a3fd246eca012636fb2feb05f8a5993 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 3 Jun 2025 20:29:02 +0530 Subject: [PATCH 14/44] fix in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 3bd1e552..c21d2f38 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -81,6 +81,9 @@ jobs: DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} steps: + - name: Checkout Repository + uses: actions/checkout@v4 + - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4 with: From a1b41d2f689bc14e6458c52ede668b178e2477af Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 3 Jun 2025 20:52:44 +0530 Subject: [PATCH 15/44] fix in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index c21d2f38..48b177f9 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -83,7 +83,7 @@ jobs: steps: - name: Checkout Repository uses: actions/checkout@v4 - + - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4 with: @@ -220,7 +220,7 @@ jobs: - name: Test ZXPorter with Prometheus if: env.SKIP_INSTALL != 'true' run: | - kubectl port-forward svc/prometheus-server 9090:80 -n devzero-zxporter & + kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter & sleep 5 result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') if [[ -z "$result" || "$result" == [] ]]; then From db0605c355817ad66a613f8241300169bbe719a3 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 3 Jun 2025 23:14:39 +0530 Subject: [PATCH 16/44] update in gpu test ci --- .github/workflows/aws-gpu-test.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 48b177f9..8f82c36a 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -209,10 +209,12 @@ jobs: - name: Install ZXPorter if: env.SKIP_INSTALL != 'true' run: | - ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" - echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" - make docker-build docker-push IMG=${ZXPORTER_IMG} - make deploy IMG=${ZXPORTER_IMG} + # ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" + # echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" + # make docker-build docker-push IMG=${ZXPORTER_IMG} + # make deploy IMG=${ZXPORTER_IMG} + + kubectl apply -f dist/install.yaml echo "Waiting for ZXPorter pods to be ready..." kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s From c6e19b9b41435f88a8da48f148d86107b456b563 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Wed, 4 Jun 2025 00:11:23 +0530 Subject: [PATCH 17/44] using makefile to install zxporter in ci --- .github/workflows/aws-gpu-test.yaml | 8 +- config/prometheus/hack.prometheus.values.yaml | 811 ++++++++++-------- 2 files changed, 462 insertions(+), 357 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 8f82c36a..ebf5bcae 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -209,10 +209,10 @@ jobs: - name: Install ZXPorter if: env.SKIP_INSTALL != 'true' run: | - # ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" - # echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" - # make docker-build docker-push IMG=${ZXPORTER_IMG} - # make deploy IMG=${ZXPORTER_IMG} + ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" + echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" + make docker-build docker-push IMG=${ZXPORTER_IMG} + make deploy IMG=${ZXPORTER_IMG} kubectl apply -f dist/install.yaml diff --git a/config/prometheus/hack.prometheus.values.yaml b/config/prometheus/hack.prometheus.values.yaml index b1975764..db227b61 100644 --- a/config/prometheus/hack.prometheus.values.yaml +++ b/config/prometheus/hack.prometheus.values.yaml @@ -51,344 +51,84 @@ kube-state-metrics: # - roles serverFiles: - prometheus.yml: + prometheus.yml: + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts scrape_configs: - job_name: prometheus static_configs: - targets: - localhost:9090 - - # A scrape configuration for running Prometheus on a Kubernetes cluster. - # This uses separate scrape configs for cluster components (i.e. API server, node) - # and services to allow each to use different authentication configs. - # - # Kubernetes labels will be added as Prometheus labels on metrics via the - # `labelmap` relabeling action. - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Scrape config for API servers. -# # -# # Kubernetes exposes API servers as endpoints to the default/kubernetes -# # service so this uses `endpoints` role and uses relabelling to only keep -# # the endpoints associated with the default/kubernetes service using the -# # default named port `https`. This works for single API server deployments as -# # well as HA API server deployments. -# - job_name: 'kubernetes-apiservers' -# -# kubernetes_sd_configs: -# - role: endpoints -# -# # Default to scraping over https. If required, just disable this or change to -# # `http`. -# scheme: https -# -# # This TLS & bearer token file config is used to connect to the actual scrape -# # endpoints for cluster components. This is separate to discovery auth -# # configuration because discovery & scraping are two separate concerns in -# # Prometheus. The discovery auth config is automatic if Prometheus runs inside -# # the cluster. Otherwise, more config options have to be provided within the -# # . -# tls_config: -# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt -# # If your node certificates are self-signed or use a different CA to the -# # master CA, then disable certificate verification below. Note that -# # certificate verification is an integral part of a secure infrastructure -# # so this should only be disabled in a controlled environment. You can -# # disable certificate verification by uncommenting the line below. -# # -# # insecure_skip_verify: true -# bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token -# -# # Keep only the default/kubernetes service endpoints for the https port. This -# # will add targets for each API server which Kubernetes adds an endpoint to -# # the default/kubernetes service. -# relabel_configs: -# - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ] -# action: keep -# regex: default;kubernetes;https - - - job_name: 'kubernetes-nodes' - - # Default to scraping over https. If required, just disable this or change to - # `http`. + - job_name: kubernetes-nodes scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - # insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - role: node - relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - - source_labels: [ __meta_kubernetes_node_name ] + - source_labels: + - __meta_kubernetes_node_name regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/$1/proxy/metrics - - - - job_name: 'kubernetes-nodes-cadvisor' - - # Default to scraping over https. If required, just disable this or change to - # `http`. + - job_name: kubernetes-nodes-cadvisor scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - # insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - role: node - - # This configuration will work only on kubelet 1.7.3+ - # As the scrape endpoints for cAdvisor have changed - # if you are using older version you need to change the replacement to - # replacement: /api/v1/nodes/$1:4194/proxy/metrics - # more info here https://github.com/coreos/prometheus-operator/issues/633 relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - - source_labels: [ __meta_kubernetes_node_name ] + - source_labels: + - __meta_kubernetes_node_name regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - - # Metric relabel configs to apply to samples before ingestion. - # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) - # metric_relabel_configs: - # - action: labeldrop - # regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone) - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Scrape config for service endpoints. -# # -# # The relabeling allows the actual service scrape endpoint to be configured -# # via the following annotations: -# # -# # * `prometheus.io/scrape`: Only scrape services that have a value of -# # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: If the metrics are exposed on a different port to the -# # service then set this appropriately. -# # * `prometheus.io/param_`: If the metrics endpoint uses parameters -# # then you can set any parameter -# - job_name: 'kubernetes-service-endpoints' -# honor_labels: true -# -# kubernetes_sd_configs: -# - role: endpoints -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] -# action: drop -# regex: true -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] -# action: replace -# target_label: __scheme__ -# regex: (https?) -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] -# action: replace -# target_label: __address__ -# regex: (.+?)(?::\d+)?;(\d+) -# replacement: $1:$2 -# - action: labelmap -# regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_service_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_service_name ] -# action: replace -# target_label: service -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node - - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Scrape config for slow service endpoints; same as above, but with a larger -# # timeout and a larger interval -# # -# # The relabeling allows the actual service scrape endpoint to be configured -# # via the following annotations: -# # -# # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: If the metrics are exposed on a different port to the -# # service then set this appropriately. -# # * `prometheus.io/param_`: If the metrics endpoint uses parameters -# # then you can set any parameter -# - job_name: 'kubernetes-service-endpoints-slow' -# honor_labels: true -# -# scrape_interval: 5m -# scrape_timeout: 30s -# -# kubernetes_sd_configs: -# - role: endpoints -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] -# action: replace -# target_label: __scheme__ -# regex: (https?) -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] -# action: replace -# target_label: __address__ -# regex: (.+?)(?::\d+)?;(\d+) -# replacement: $1:$2 -# - action: labelmap -# regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_service_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_service_name ] -# action: replace -# target_label: service -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node -# -# - job_name: 'prometheus-pushgateway' -# honor_labels: true -# -# kubernetes_sd_configs: -# - role: service -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] -# action: keep -# regex: pushgateway - - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Example scrape config for probing services via the Blackbox Exporter. -# # -# # The relabeling allows the actual service scrape endpoint to be configured -# # via the following annotations: -# # -# # * `prometheus.io/probe`: Only probe services that have a value of `true` -# - job_name: 'kubernetes-services' -# honor_labels: true -# -# metrics_path: /probe -# params: -# module: [ http_2xx ] -# -# kubernetes_sd_configs: -# - role: service -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] -# action: keep -# regex: true -# - source_labels: [ __address__ ] -# target_label: __param_target -# - target_label: __address__ -# replacement: blackbox -# - source_labels: [ __param_target ] -# target_label: instance -# - action: labelmap -# regex: __meta_kubernetes_service_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# target_label: namespace -# - source_labels: [ __meta_kubernetes_service_name ] -# target_label: service - - - # Example scrape config for pods - # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: - # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, - # except if `prometheus.io/scrape-slow` is set to `true` as well. - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. - - job_name: 'kubernetes-pods' + - job_name: kubernetes-pods honor_labels: true - kubernetes_sd_configs: - role: pod - relabel_configs: - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape action: keep regex: true - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow action: drop regex: true - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme action: replace regex: (https?) target_label: __scheme__ - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path action: replace - target_label: __metrics_path__ regex: (.+) - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] + target_label: __metrics_path__ + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip action: replace regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' + replacement: "[$2]:$1" target_label: __address__ - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip action: replace regex: (\d+);((([0-9]+?)(\.|$)){4}) replacement: $2:$1 @@ -398,77 +138,442 @@ serverFiles: replacement: __param_$1 - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [ __meta_kubernetes_namespace ] + - source_labels: + - __meta_kubernetes_namespace action: replace target_label: namespace - - source_labels: [ __meta_kubernetes_pod_name ] + - source_labels: + - __meta_kubernetes_pod_name action: replace target_label: pod - - source_labels: [ __meta_kubernetes_pod_phase ] + - source_labels: + - __meta_kubernetes_pod_phase regex: Pending|Succeeded|Failed|Completed action: drop - - source_labels: [ __meta_kubernetes_pod_node_name ] + - source_labels: + - __meta_kubernetes_pod_node_name action: replace target_label: node +# serverFiles: +# prometheus.yml: +# scrape_configs: +# - job_name: prometheus +# static_configs: +# - targets: +# - localhost:9090 + +# # A scrape configuration for running Prometheus on a Kubernetes cluster. +# # This uses separate scrape configs for cluster components (i.e. API server, node) +# # and services to allow each to use different authentication configs. +# # +# # Kubernetes labels will be added as Prometheus labels on metrics via the +# # `labelmap` relabeling action. + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Scrape config for API servers. +# # # +# # # Kubernetes exposes API servers as endpoints to the default/kubernetes +# # # service so this uses `endpoints` role and uses relabelling to only keep +# # # the endpoints associated with the default/kubernetes service using the +# # # default named port `https`. This works for single API server deployments as +# # # well as HA API server deployments. +# # - job_name: 'kubernetes-apiservers' +# # +# # kubernetes_sd_configs: +# # - role: endpoints +# # +# # # Default to scraping over https. If required, just disable this or change to +# # # `http`. +# # scheme: https +# # +# # # This TLS & bearer token file config is used to connect to the actual scrape +# # # endpoints for cluster components. This is separate to discovery auth +# # # configuration because discovery & scraping are two separate concerns in +# # # Prometheus. The discovery auth config is automatic if Prometheus runs inside +# # # the cluster. Otherwise, more config options have to be provided within the +# # # . +# # tls_config: +# # ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# # # If your node certificates are self-signed or use a different CA to the +# # # master CA, then disable certificate verification below. Note that +# # # certificate verification is an integral part of a secure infrastructure +# # # so this should only be disabled in a controlled environment. You can +# # # disable certificate verification by uncommenting the line below. +# # # +# # # insecure_skip_verify: true +# # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token +# # +# # # Keep only the default/kubernetes service endpoints for the https port. This +# # # will add targets for each API server which Kubernetes adds an endpoint to +# # # the default/kubernetes service. +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ] +# # action: keep +# # regex: default;kubernetes;https + +# - job_name: 'kubernetes-nodes' + +# # Default to scraping over https. If required, just disable this or change to +# # `http`. +# scheme: https + +# # This TLS & bearer token file config is used to connect to the actual scrape +# # endpoints for cluster components. This is separate to discovery auth +# # configuration because discovery & scraping are two separate concerns in +# # Prometheus. The discovery auth config is automatic if Prometheus runs inside +# # the cluster. Otherwise, more config options have to be provided within the +# # . +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# # If your node certificates are self-signed or use a different CA to the +# # master CA, then disable certificate verification below. Note that +# # certificate verification is an integral part of a secure infrastructure +# # so this should only be disabled in a controlled environment. You can +# # disable certificate verification by uncommenting the line below. +# # +# # insecure_skip_verify: true +# bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +# kubernetes_sd_configs: +# - role: node + +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) +# - target_label: __address__ +# replacement: kubernetes.default.svc:443 +# - source_labels: [ __meta_kubernetes_node_name ] +# regex: (.+) +# target_label: __metrics_path__ +# replacement: /api/v1/nodes/$1/proxy/metrics + + +# - job_name: 'kubernetes-nodes-cadvisor' + +# # Default to scraping over https. If required, just disable this or change to +# # `http`. +# scheme: https + +# # This TLS & bearer token file config is used to connect to the actual scrape +# # endpoints for cluster components. This is separate to discovery auth +# # configuration because discovery & scraping are two separate concerns in +# # Prometheus. The discovery auth config is automatic if Prometheus runs inside +# # the cluster. Otherwise, more config options have to be provided within the +# # . +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# # If your node certificates are self-signed or use a different CA to the +# # master CA, then disable certificate verification below. Note that +# # certificate verification is an integral part of a secure infrastructure +# # so this should only be disabled in a controlled environment. You can +# # disable certificate verification by uncommenting the line below. +# # +# # insecure_skip_verify: true +# bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +# kubernetes_sd_configs: +# - role: node + +# # This configuration will work only on kubelet 1.7.3+ +# # As the scrape endpoints for cAdvisor have changed +# # if you are using older version you need to change the replacement to +# # replacement: /api/v1/nodes/$1:4194/proxy/metrics +# # more info here https://github.com/coreos/prometheus-operator/issues/633 +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) +# - target_label: __address__ +# replacement: kubernetes.default.svc:443 +# - source_labels: [ __meta_kubernetes_node_name ] +# regex: (.+) +# target_label: __metrics_path__ +# replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + +# # Metric relabel configs to apply to samples before ingestion. +# # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) +# # metric_relabel_configs: +# # - action: labeldrop +# # regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone) + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Scrape config for service endpoints. +# # # +# # # The relabeling allows the actual service scrape endpoint to be configured +# # # via the following annotations: +# # # +# # # * `prometheus.io/scrape`: Only scrape services that have a value of +# # # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. +# # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # # * `prometheus.io/port`: If the metrics are exposed on a different port to the +# # # service then set this appropriately. +# # # * `prometheus.io/param_`: If the metrics endpoint uses parameters +# # # then you can set any parameter +# # - job_name: 'kubernetes-service-endpoints' +# # honor_labels: true +# # +# # kubernetes_sd_configs: +# # - role: endpoints +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape ] +# # action: keep +# # regex: true +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] +# # action: drop +# # regex: true +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] +# # action: replace +# # target_label: __scheme__ +# # regex: (https?) +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] +# # action: replace +# # target_label: __metrics_path__ +# # regex: (.+) +# # - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] +# # action: replace +# # target_label: __address__ +# # regex: (.+?)(?::\d+)?;(\d+) +# # replacement: $1:$2 +# # - action: labelmap +# # regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) +# # replacement: __param_$1 +# # - action: labelmap +# # regex: __meta_kubernetes_service_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # action: replace +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_service_name ] +# # action: replace +# # target_label: service +# # - source_labels: [ __meta_kubernetes_pod_node_name ] +# # action: replace +# # target_label: node + + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Scrape config for slow service endpoints; same as above, but with a larger +# # # timeout and a larger interval +# # # +# # # The relabeling allows the actual service scrape endpoint to be configured +# # # via the following annotations: +# # # +# # # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` +# # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # # * `prometheus.io/port`: If the metrics are exposed on a different port to the +# # # service then set this appropriately. +# # # * `prometheus.io/param_`: If the metrics endpoint uses parameters +# # # then you can set any parameter +# # - job_name: 'kubernetes-service-endpoints-slow' +# # honor_labels: true +# # +# # scrape_interval: 5m +# # scrape_timeout: 30s +# # +# # kubernetes_sd_configs: +# # - role: endpoints +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] +# # action: keep +# # regex: true +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] +# # action: replace +# # target_label: __scheme__ +# # regex: (https?) +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] +# # action: replace +# # target_label: __metrics_path__ +# # regex: (.+) +# # - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] +# # action: replace +# # target_label: __address__ +# # regex: (.+?)(?::\d+)?;(\d+) +# # replacement: $1:$2 +# # - action: labelmap +# # regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) +# # replacement: __param_$1 +# # - action: labelmap +# # regex: __meta_kubernetes_service_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # action: replace +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_service_name ] +# # action: replace +# # target_label: service +# # - source_labels: [ __meta_kubernetes_pod_node_name ] +# # action: replace +# # target_label: node +# # +# # - job_name: 'prometheus-pushgateway' +# # honor_labels: true +# # +# # kubernetes_sd_configs: +# # - role: service +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] +# # action: keep +# # regex: pushgateway + + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Example scrape config for probing services via the Blackbox Exporter. +# # # +# # # The relabeling allows the actual service scrape endpoint to be configured +# # # via the following annotations: +# # # +# # # * `prometheus.io/probe`: Only probe services that have a value of `true` +# # - job_name: 'kubernetes-services' +# # honor_labels: true +# # +# # metrics_path: /probe +# # params: +# # module: [ http_2xx ] +# # +# # kubernetes_sd_configs: +# # - role: service +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] +# # action: keep +# # regex: true +# # - source_labels: [ __address__ ] +# # target_label: __param_target +# # - target_label: __address__ +# # replacement: blackbox +# # - source_labels: [ __param_target ] +# # target_label: instance +# # - action: labelmap +# # regex: __meta_kubernetes_service_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_service_name ] +# # target_label: service + + +# # Example scrape config for pods +# # +# # The relabeling allows the actual pod scrape endpoint to be configured via the +# # following annotations: +# # +# # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, +# # except if `prometheus.io/scrape-slow` is set to `true` as well. +# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. +# - job_name: 'kubernetes-pods' +# honor_labels: true + +# kubernetes_sd_configs: +# - role: pod + +# relabel_configs: +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ] +# action: keep +# regex: true +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] +# action: drop +# regex: true +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] +# action: replace +# regex: (https?) +# target_label: __scheme__ +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] +# action: replace +# target_label: __metrics_path__ +# regex: (.+) +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# action: replace +# regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) +# replacement: '[$2]:$1' +# target_label: __address__ +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# action: replace +# regex: (\d+);((([0-9]+?)(\.|$)){4}) +# replacement: $2:$1 +# target_label: __address__ +# - action: labelmap +# regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) +# replacement: __param_$1 +# - action: labelmap +# regex: __meta_kubernetes_pod_label_(.+) +# - source_labels: [ __meta_kubernetes_namespace ] +# action: replace +# target_label: namespace +# - source_labels: [ __meta_kubernetes_pod_name ] +# action: replace +# target_label: pod +# - source_labels: [ __meta_kubernetes_pod_phase ] +# regex: Pending|Succeeded|Failed|Completed +# action: drop +# - source_labels: [ __meta_kubernetes_pod_node_name ] +# action: replace +# target_label: node + -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Example Scrape config for pods which should be scraped slower. An useful example -# # would be stackriver-exporter which queries an API on every scrape of the pod -# # -# # The relabeling allows the actual pod scrape endpoint to be configured via the -# # following annotations: -# # -# # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. -# - job_name: 'kubernetes-pods-slow' -# honor_labels: true -# -# scrape_interval: 5m -# scrape_timeout: 30s -# -# kubernetes_sd_configs: -# - role: pod -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] -# action: replace -# regex: (https?) -# target_label: __scheme__ -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] -# action: replace -# regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) -# replacement: '[$2]:$1' -# target_label: __address__ -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] -# action: replace -# regex: (\d+);((([0-9]+?)(\.|$)){4}) -# replacement: $2:$1 -# target_label: __address__ -# - action: labelmap -# regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_pod_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_pod_name ] -# action: replace -# target_label: pod -# - source_labels: [ __meta_kubernetes_pod_phase ] -# regex: Pending|Succeeded|Failed|Completed -# action: drop -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Example Scrape config for pods which should be scraped slower. An useful example +# # # would be stackriver-exporter which queries an API on every scrape of the pod +# # # +# # # The relabeling allows the actual pod scrape endpoint to be configured via the +# # # following annotations: +# # # +# # # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` +# # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. +# # - job_name: 'kubernetes-pods-slow' +# # honor_labels: true +# # +# # scrape_interval: 5m +# # scrape_timeout: 30s +# # +# # kubernetes_sd_configs: +# # - role: pod +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] +# # action: keep +# # regex: true +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] +# # action: replace +# # regex: (https?) +# # target_label: __scheme__ +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] +# # action: replace +# # target_label: __metrics_path__ +# # regex: (.+) +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# # action: replace +# # regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) +# # replacement: '[$2]:$1' +# # target_label: __address__ +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# # action: replace +# # regex: (\d+);((([0-9]+?)(\.|$)){4}) +# # replacement: $2:$1 +# # target_label: __address__ +# # - action: labelmap +# # regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) +# # replacement: __param_$1 +# # - action: labelmap +# # regex: __meta_kubernetes_pod_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # action: replace +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_pod_name ] +# # action: replace +# # target_label: pod +# # - source_labels: [ __meta_kubernetes_pod_phase ] +# # regex: Pending|Succeeded|Failed|Completed +# # action: drop +# # - source_labels: [ __meta_kubernetes_pod_node_name ] +# # action: replace +# # target_label: node From 44113e8e4de42d26e0f9e22b190444270b52e113 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Fri, 6 Jun 2025 15:20:35 +0530 Subject: [PATCH 18/44] fixes in aws-gpu-test ci --- .github/workflows/aws-gpu-test.yaml | 23 ++- .github/workflows/gcp-gpu-test.yaml | 302 ++++++++++++++++++++++++++++ 2 files changed, 320 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/gcp-gpu-test.yaml diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index ebf5bcae..9305e9bd 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -14,6 +14,20 @@ on: options: - nvidia-dcgm - devzero-dcgm + cluster_version: + description: 'Kubernetes cluster version' + required: false + default: '1.30' + type: choice + options: + - '1.26' + - '1.27' + - '1.28' + - '1.29' + - '1.30' + - '1.31' + - '1.32' + - '1.33' permissions: id-token: write @@ -25,6 +39,7 @@ jobs: runs-on: ubuntu-latest env: DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} outputs: job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} @@ -71,7 +86,7 @@ jobs: } EOF terraform init - terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" + terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" -var='cluster_version=${{ env.CLUSTER_VERSION }}' install-and-validate: name: Install and Validate GPU Resources and ZXPorter @@ -213,8 +228,6 @@ jobs: echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" make docker-build docker-push IMG=${ZXPORTER_IMG} make deploy IMG=${ZXPORTER_IMG} - - kubectl apply -f dist/install.yaml echo "Waiting for ZXPorter pods to be ready..." kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s @@ -235,7 +248,7 @@ jobs: name: Destroy Terraform runs-on: ubuntu-latest env: - DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} if: always() needs: @@ -270,4 +283,4 @@ jobs: } EOF terraform init - terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" + terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" -var='cluster_version=${{ env.CLUSTER_VERSION }}' diff --git a/.github/workflows/gcp-gpu-test.yaml b/.github/workflows/gcp-gpu-test.yaml new file mode 100644 index 00000000..6c089b92 --- /dev/null +++ b/.github/workflows/gcp-gpu-test.yaml @@ -0,0 +1,302 @@ +name: GCP GPU Test + +on: + push: + branches: + - garvit/gcp-gpu-test + workflow_dispatch: + inputs: + dcgm_install_type: + description: 'DCGM install type' + required: false + default: 'devzero-dcgm' + type: choice + options: + - nvidia-dcgm + - devzero-dcgm + +permissions: + id-token: write + contents: read + +jobs: + apply-terraform: + name: Apply Terraform + runs-on: ubuntu-latest + env: + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + + outputs: + job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: 'Authenticate to Google Cloud' + id: 'auth' + uses: 'google-github-actions/auth@v2' + with: + workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool' + service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com' + create_credentials_file: true + export_environment_variables: true + + - name: Export Terraform-friendly environment variables + run: | + echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + + - name: Generate Unique Job Identifier + id: job-identifier + shell: bash + run: | + SHORT_SHA=$(git rev-parse --short HEAD) + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then + SUFFIX="dd" + else + SUFFIX="nd" + fi + JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}" + echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV + echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: "1.11.3" + + - name: Apply Terraform + working-directory: terraform/gcp + run: | + cat < backend_override.tf + terraform { + backend "gcs" { + bucket = "zxporter-tf-state" + prefix = "${JOB_IDENTIFIER}/terraform.tfstate" + } + } + EOF + terraform init + terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" + + install-and-validate: + name: Install and Validate GPU Resources and ZXPorter + runs-on: ubuntu-latest + needs: apply-terraform + env: + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: 'Authenticate to Google Cloud' + id: 'auth' + uses: 'google-github-actions/auth@v2' + with: + workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool' + service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com' + create_credentials_file: true + export_environment_variables: true + + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v2' + with: + version: '>= 363.0.0' + + - name: Install gke-gcloud-auth-plugin + run: | + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list + curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg + sudo apt-get update + sudo apt-get install -y google-cloud-sdk-gke-gcloud-auth-plugin + + - name: Configure Kubernetes Access + run: | + gcloud container clusters get-credentials ${{ needs.apply-terraform.outputs.job_identifier }} --zone us-central1 --project devzero-self-hosted + + - name: Check GPU Availability + id: gpu_check + run: | + echo "Checking GPU resources on nodes..." + if kubectl describe nodes | grep -q "nvidia.com/gpu"; then + echo "GPU resources are available on the nodes." + echo "GPU_CHECK=true" >> $GITHUB_ENV + else + echo "GPU check failed" + echo "GPU_CHECK=false" >> $GITHUB_ENV + fi + + - name: Install GPU Operator (if needed) + if: env.GPU_CHECK == 'false' + run: | + echo "GPU resources not found, installing GPU Operator..." + kubectl create ns gpu-operator + kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite + kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \ + helm repo update + INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0" + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then + INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false" + fi + echo "Running: $INSTALL_CMD" + $INSTALL_CMD + + - name: Check GPU Availability After Installing GPU Operator + if: env.GPU_CHECK == 'false' + run: | + echo "Re-checking GPU resources on nodes after GPU Operator installation..." + if kubectl describe nodes | grep -q "nvidia.com/gpu"; then + echo "GPU resources are available on the nodes." + else + echo "GPU check failed after GPU Operator installation" + exit 1 + fi + + - name: Check Nvidia DCGM DaemonSet + id: dcgm_check + if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} + run: | + echo "Checking if DCGM DaemonSet is installed..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "Nvidia DCGM found, proceeding with validation." + echo "SKIP_INSTALL=false" >> $GITHUB_ENV + else + echo "Nvidia DCGM not found, skipping install and proceeding to destroy." + echo "SKIP_INSTALL=true" >> $GITHUB_ENV + fi + + - name: Install DevZero DCGM + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} + run: | + echo "Installing DCGM Exporter..." + kubectl create ns devzero-zxporter + curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/gke.yml | kubectl apply -f - + + - name: Check DCGM DaemonSet After Installing DCGM Exporter + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} + run: | + echo "Re-checking DCGM pods after DCGM Exporter installation..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "DCGM DaemonSet is running." + else + echo "DCGM DaemonSet not running after installation" + exit 1 + fi + + - name: Verify DCGM Pods and Prometheus Annotations + if: env.SKIP_INSTALL != 'true' + run: | + kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s + echo "Verifying DCGM pods and Prometheus annotations..." + kubectl get pods -A | grep dcgm-exporter | awk ' + BEGIN { all_running = 1; pod_count = 0 } + { + pod_count++ + status = $4 + printf "Pod: %s/%s - Status: %s\n", $1, $2, status + if (status != "Running") all_running = 0 + } + END { + printf "\nTotal Pods: %d\n", pod_count + printf "All Running: %s\n", (all_running ? "true" : "false") + }' + kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done + + - name: Install and Verify DeepSeek Workload + if: env.SKIP_INSTALL != 'true' + run: | + kubectl create ns deepseek + kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml + + kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s + pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}') + + if [[ -n "$pod_status" ]]; then + echo "Pods are not in Running state. Failing the pipeline." + exit 1 + else + echo "All pods are running successfully." + fi + + - name: Set up Go + uses: actions/setup-go@v5 + if: env.SKIP_INSTALL != 'true' + with: + go-version: '1.22' + cache: true + + - name: Install ZXPorter + if: env.SKIP_INSTALL != 'true' + run: | + ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" + echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" + make docker-build docker-push IMG=${ZXPORTER_IMG} + make deploy IMG=${ZXPORTER_IMG} + + kubectl apply -f dist/install.yaml + + echo "Waiting for ZXPorter pods to be ready..." + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s + + - name: Test ZXPorter with Prometheus + if: env.SKIP_INSTALL != 'true' + run: | + kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter & + sleep 5 + result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') + if [[ -z "$result" || "$result" == [] ]]; then + echo "DCGM_FI_DEV_SM_CLOCK metric not found!" + exit 1 + fi + echo "Metric found: $result" + + destroy-terraform: + name: Destroy Terraform + runs-on: ubuntu-latest + env: + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + + if: always() + needs: + - apply-terraform + - install-and-validate + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: 'Authenticate to Google Cloud' + id: 'auth' + uses: 'google-github-actions/auth@v2' + with: + workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool' + service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com' + create_credentials_file: true + export_environment_variables: true + + - name: Export Terraform-friendly environment variables + run: | + echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: "1.11.3" + + - name: Destroy Infrastructure + working-directory: terraform/gcp + run: | + cat < backend_override.tf + terraform { + backend "gcs" { + bucket = "zxporter-tf-state" + prefix = "${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate" + } + } + EOF + terraform init + terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" \ No newline at end of file From bcdc404f13f53c2b244679623acd4f0ce99e6fe2 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Fri, 6 Jun 2025 16:25:31 +0530 Subject: [PATCH 19/44] fix in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 9305e9bd..64c6903a 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -236,7 +236,7 @@ jobs: if: env.SKIP_INSTALL != 'true' run: | kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter & - sleep 5 + sleep 10 result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') if [[ -z "$result" || "$result" == [] ]]; then echo "DCGM_FI_DEV_SM_CLOCK metric not found!" From 432da6764b6c377c8be3774bd50ea3eb908dbdbb Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Fri, 6 Jun 2025 21:33:50 +0530 Subject: [PATCH 20/44] fix aws-gpu-test ci --- .github/workflows/aws-gpu-test.yaml | 4 +- .github/workflows/gcp-gpu-test.yaml | 302 ---------------------------- 2 files changed, 2 insertions(+), 304 deletions(-) delete mode 100644 .github/workflows/gcp-gpu-test.yaml diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 64c6903a..a98b8864 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -238,11 +238,11 @@ jobs: kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter & sleep 10 result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') + echo "Metric found: $result" if [[ -z "$result" || "$result" == [] ]]; then echo "DCGM_FI_DEV_SM_CLOCK metric not found!" - exit 1 fi - echo "Metric found: $result" + destroy-terraform: name: Destroy Terraform diff --git a/.github/workflows/gcp-gpu-test.yaml b/.github/workflows/gcp-gpu-test.yaml deleted file mode 100644 index 6c089b92..00000000 --- a/.github/workflows/gcp-gpu-test.yaml +++ /dev/null @@ -1,302 +0,0 @@ -name: GCP GPU Test - -on: - push: - branches: - - garvit/gcp-gpu-test - workflow_dispatch: - inputs: - dcgm_install_type: - description: 'DCGM install type' - required: false - default: 'devzero-dcgm' - type: choice - options: - - nvidia-dcgm - - devzero-dcgm - -permissions: - id-token: write - contents: read - -jobs: - apply-terraform: - name: Apply Terraform - runs-on: ubuntu-latest - env: - DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} - - outputs: - job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} - - steps: - - name: Checkout Repository - uses: actions/checkout@v4 - - - name: 'Authenticate to Google Cloud' - id: 'auth' - uses: 'google-github-actions/auth@v2' - with: - workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool' - service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com' - create_credentials_file: true - export_environment_variables: true - - - name: Export Terraform-friendly environment variables - run: | - echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV - echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV - - - name: Generate Unique Job Identifier - id: job-identifier - shell: bash - run: | - SHORT_SHA=$(git rev-parse --short HEAD) - if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then - SUFFIX="dd" - else - SUFFIX="nd" - fi - JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}" - echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV - echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT - - - name: Set up Terraform - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: "1.11.3" - - - name: Apply Terraform - working-directory: terraform/gcp - run: | - cat < backend_override.tf - terraform { - backend "gcs" { - bucket = "zxporter-tf-state" - prefix = "${JOB_IDENTIFIER}/terraform.tfstate" - } - } - EOF - terraform init - terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" - - install-and-validate: - name: Install and Validate GPU Resources and ZXPorter - runs-on: ubuntu-latest - needs: apply-terraform - env: - DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} - - steps: - - name: Checkout Repository - uses: actions/checkout@v4 - - - name: 'Authenticate to Google Cloud' - id: 'auth' - uses: 'google-github-actions/auth@v2' - with: - workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool' - service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com' - create_credentials_file: true - export_environment_variables: true - - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v2' - with: - version: '>= 363.0.0' - - - name: Install gke-gcloud-auth-plugin - run: | - echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list - curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - sudo apt-get update - sudo apt-get install -y google-cloud-sdk-gke-gcloud-auth-plugin - - - name: Configure Kubernetes Access - run: | - gcloud container clusters get-credentials ${{ needs.apply-terraform.outputs.job_identifier }} --zone us-central1 --project devzero-self-hosted - - - name: Check GPU Availability - id: gpu_check - run: | - echo "Checking GPU resources on nodes..." - if kubectl describe nodes | grep -q "nvidia.com/gpu"; then - echo "GPU resources are available on the nodes." - echo "GPU_CHECK=true" >> $GITHUB_ENV - else - echo "GPU check failed" - echo "GPU_CHECK=false" >> $GITHUB_ENV - fi - - - name: Install GPU Operator (if needed) - if: env.GPU_CHECK == 'false' - run: | - echo "GPU resources not found, installing GPU Operator..." - kubectl create ns gpu-operator - kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite - kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \ - helm repo update - INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0" - if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then - INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false" - fi - echo "Running: $INSTALL_CMD" - $INSTALL_CMD - - - name: Check GPU Availability After Installing GPU Operator - if: env.GPU_CHECK == 'false' - run: | - echo "Re-checking GPU resources on nodes after GPU Operator installation..." - if kubectl describe nodes | grep -q "nvidia.com/gpu"; then - echo "GPU resources are available on the nodes." - else - echo "GPU check failed after GPU Operator installation" - exit 1 - fi - - - name: Check Nvidia DCGM DaemonSet - id: dcgm_check - if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} - run: | - echo "Checking if DCGM DaemonSet is installed..." - if kubectl get daemonset -A | grep -q dcgm; then - echo "Nvidia DCGM found, proceeding with validation." - echo "SKIP_INSTALL=false" >> $GITHUB_ENV - else - echo "Nvidia DCGM not found, skipping install and proceeding to destroy." - echo "SKIP_INSTALL=true" >> $GITHUB_ENV - fi - - - name: Install DevZero DCGM - if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} - run: | - echo "Installing DCGM Exporter..." - kubectl create ns devzero-zxporter - curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/gke.yml | kubectl apply -f - - - - name: Check DCGM DaemonSet After Installing DCGM Exporter - if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} - run: | - echo "Re-checking DCGM pods after DCGM Exporter installation..." - if kubectl get daemonset -A | grep -q dcgm; then - echo "DCGM DaemonSet is running." - else - echo "DCGM DaemonSet not running after installation" - exit 1 - fi - - - name: Verify DCGM Pods and Prometheus Annotations - if: env.SKIP_INSTALL != 'true' - run: | - kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s - echo "Verifying DCGM pods and Prometheus annotations..." - kubectl get pods -A | grep dcgm-exporter | awk ' - BEGIN { all_running = 1; pod_count = 0 } - { - pod_count++ - status = $4 - printf "Pod: %s/%s - Status: %s\n", $1, $2, status - if (status != "Running") all_running = 0 - } - END { - printf "\nTotal Pods: %d\n", pod_count - printf "All Running: %s\n", (all_running ? "true" : "false") - }' - kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done - - - name: Install and Verify DeepSeek Workload - if: env.SKIP_INSTALL != 'true' - run: | - kubectl create ns deepseek - kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml - - kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s - pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}') - - if [[ -n "$pod_status" ]]; then - echo "Pods are not in Running state. Failing the pipeline." - exit 1 - else - echo "All pods are running successfully." - fi - - - name: Set up Go - uses: actions/setup-go@v5 - if: env.SKIP_INSTALL != 'true' - with: - go-version: '1.22' - cache: true - - - name: Install ZXPorter - if: env.SKIP_INSTALL != 'true' - run: | - ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" - echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" - make docker-build docker-push IMG=${ZXPORTER_IMG} - make deploy IMG=${ZXPORTER_IMG} - - kubectl apply -f dist/install.yaml - - echo "Waiting for ZXPorter pods to be ready..." - kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s - - - name: Test ZXPorter with Prometheus - if: env.SKIP_INSTALL != 'true' - run: | - kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter & - sleep 5 - result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') - if [[ -z "$result" || "$result" == [] ]]; then - echo "DCGM_FI_DEV_SM_CLOCK metric not found!" - exit 1 - fi - echo "Metric found: $result" - - destroy-terraform: - name: Destroy Terraform - runs-on: ubuntu-latest - env: - DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} - - if: always() - needs: - - apply-terraform - - install-and-validate - - steps: - - name: Checkout Repository - uses: actions/checkout@v4 - - - name: 'Authenticate to Google Cloud' - id: 'auth' - uses: 'google-github-actions/auth@v2' - with: - workload_identity_provider: 'projects/926977153451/locations/global/workloadIdentityPools/dsh-testing-pool-id/providers/github-actions-pool' - service_account: 'devzero-self-hosted@devzero-self-hosted.iam.gserviceaccount.com' - create_credentials_file: true - export_environment_variables: true - - - name: Export Terraform-friendly environment variables - run: | - echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV - echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV - - - name: Set up Terraform - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: "1.11.3" - - - name: Destroy Infrastructure - working-directory: terraform/gcp - run: | - cat < backend_override.tf - terraform { - backend "gcs" { - bucket = "zxporter-tf-state" - prefix = "${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate" - } - } - EOF - terraform init - terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" \ No newline at end of file From 0ed7300d1ef0b60fba1e583c8659ab5c687512e9 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sat, 7 Jun 2025 08:53:09 +0530 Subject: [PATCH 21/44] fix aws-gpu-test ci --- .github/workflows/aws-gpu-test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index a98b8864..536c97a1 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -241,6 +241,7 @@ jobs: echo "Metric found: $result" if [[ -z "$result" || "$result" == [] ]]; then echo "DCGM_FI_DEV_SM_CLOCK metric not found!" + exit 1 fi From e87559f9a97ebf25b62b3ead7f243eb8442b3540 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sat, 7 Jun 2025 22:57:00 +0530 Subject: [PATCH 22/44] Added nvidia-device-plugin in AWS GPU test CI --- .github/workflows/aws-gpu-test.yaml | 52 +++++++++--- .../container-toolkit.yaml | 84 +++++++++++++++++++ .../driver-installer.yaml | 81 ++++++++++++++++++ 3 files changed, 207 insertions(+), 10 deletions(-) create mode 100644 nvidia-device-plugin-prereq/container-toolkit.yaml create mode 100644 nvidia-device-plugin-prereq/driver-installer.yaml diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 536c97a1..af83e409 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -6,6 +6,14 @@ on: - garvit/aws-gpu-test workflow_dispatch: inputs: + gpu_install_type: + description: 'GPU installation type' + required: false + default: 'nvidia-device-plugin' + type: choice + options: + - gpu-operator + - nvidia-device-plugin dcgm_install_type: description: 'DCGM install type' required: false @@ -38,6 +46,7 @@ jobs: name: Apply Terraform runs-on: ubuntu-latest env: + GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} @@ -45,6 +54,16 @@ jobs: job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} steps: + - name: Validate Inputs + run: | + echo "GPU_INSTALL_TYPE=${GPU_INSTALL_TYPE}" + echo "DCGM_INSTALL_TYPE=${DCGM_INSTALL_TYPE}" + + if [[ "$GPU_INSTALL_TYPE" == "nvidia-device-plugin" && "$DCGM_INSTALL_TYPE" != "devzero-dcgm" ]]; then + echo "Error: When GPU_INSTALL_TYPE is 'nvidia-device-plugin', DCGM_INSTALL_TYPE must be 'devzero-dcgm'." + exit 1 + fi + - name: Checkout Repository uses: actions/checkout@v4 @@ -93,6 +112,7 @@ jobs: runs-on: ubuntu-latest needs: apply-terraform env: + GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} steps: @@ -122,7 +142,7 @@ jobs: fi - name: Install GPU Operator (if needed) - if: env.GPU_CHECK == 'false' + if: env.GPU_CHECK == 'false' && env.GPU_INSTALL_TYPE == 'gpu-operator' run: | echo "GPU resources not found, installing GPU Operator..." kubectl create ns gpu-operator @@ -137,6 +157,20 @@ jobs: echo "Running: $INSTALL_CMD" $INSTALL_CMD + - name: Install Nvidia Device Plugin + if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false' + run: | + echo "Installing Nvidia Device Plugin..." + kubectl label node "$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')" nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite + + kubectl apply -f nvidia-device-plugin-prereq + helm repo add nvdp https://nvidia.github.io/k8s-device-plugin + helm repo update + helm upgrade -i nvdp nvdp/nvidia-device-plugin \ + --namespace nvidia-device-plugin \ + --create-namespace \ + --version 0.17.1 + - name: Check GPU Availability After Installing GPU Operator if: env.GPU_CHECK == 'false' run: | @@ -155,10 +189,9 @@ jobs: echo "Checking if DCGM DaemonSet is installed..." if kubectl get daemonset -A | grep -q dcgm; then echo "Nvidia DCGM found, proceeding with validation." - echo "SKIP_INSTALL=false" >> $GITHUB_ENV else - echo "Nvidia DCGM not found, skipping install and proceeding to destroy." - echo "SKIP_INSTALL=true" >> $GITHUB_ENV + echo "Nvidia DCGM not found." + exit 1 fi - name: Install DevZero DCGM @@ -180,9 +213,12 @@ jobs: fi - name: Verify DCGM Pods and Prometheus Annotations - if: env.SKIP_INSTALL != 'true' run: | - kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n gpu-operator --timeout=300s + NAMESPACE="devzero-zxporter" + if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then + NAMESPACE="gpu-operator" + fi + kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s echo "Verifying DCGM pods and Prometheus annotations..." kubectl get pods -A | grep dcgm-exporter | awk ' BEGIN { all_running = 1; pod_count = 0 } @@ -199,7 +235,6 @@ jobs: kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done - name: Install and Verify DeepSeek Workload - if: env.SKIP_INSTALL != 'true' run: | kubectl create ns deepseek kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml @@ -216,13 +251,11 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 - if: env.SKIP_INSTALL != 'true' with: go-version: '1.22' cache: true - name: Install ZXPorter - if: env.SKIP_INSTALL != 'true' run: | ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" @@ -233,7 +266,6 @@ jobs: kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s - name: Test ZXPorter with Prometheus - if: env.SKIP_INSTALL != 'true' run: | kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter & sleep 10 diff --git a/nvidia-device-plugin-prereq/container-toolkit.yaml b/nvidia-device-plugin-prereq/container-toolkit.yaml new file mode 100644 index 00000000..17ada11b --- /dev/null +++ b/nvidia-device-plugin-prereq/container-toolkit.yaml @@ -0,0 +1,84 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-toolkit-installer + namespace: nvidia-device-plugin +spec: + selector: + matchLabels: + name: nvidia-toolkit-installer + template: + metadata: + labels: + name: nvidia-toolkit-installer + spec: + nodeSelector: + nvidia.com/gpu.present: "true" + hostPID: true + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "CriticalAddonsOnly" + operator: "Exists" + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + - effect: NoSchedule + key: node-role.kubernetes.io/master + containers: + - name: install-nvidia-toolkit + image: amazonlinux:2023 + securityContext: + privileged: true + command: + - /bin/bash + - -c + - | + set -ex + + # Add NVIDIA repo + curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ + -o /etc/yum.repos.d/nvidia-container-toolkit.repo + + # Install toolkit + yum install -y nvidia-container-toolkit + + # Configure containerd + nvidia-ctk runtime configure --runtime=containerd + + # Restart containerd + systemctl restart containerd || true + + # Exit cleanly + echo "NVIDIA container toolkit installed and configured." + sleep infinity + volumeMounts: + - name: root + mountPath: /host + mountPropagation: Bidirectional + - name: containerd-config + mountPath: /etc/containerd + - name: systemd + mountPath: /run/systemd + - name: modules + mountPath: /lib/modules + readOnly: true + - name: dev + mountPath: /dev + volumes: + - name: root + hostPath: + path: / + - name: containerd-config + hostPath: + path: /etc/containerd + - name: systemd + hostPath: + path: /run/systemd + - name: modules + hostPath: + path: /lib/modules + - name: dev + hostPath: + path: /dev + restartPolicy: Always diff --git a/nvidia-device-plugin-prereq/driver-installer.yaml b/nvidia-device-plugin-prereq/driver-installer.yaml new file mode 100644 index 00000000..7f04e106 --- /dev/null +++ b/nvidia-device-plugin-prereq/driver-installer.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-driver-installer + namespace: nvidia-device-plugin +spec: + selector: + matchLabels: + name: nvidia-driver-installer + template: + metadata: + labels: + name: nvidia-driver-installer + spec: + hostPID: true + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: CriticalAddonsOnly + operator: Exists + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule + - key: node-role.kubernetes.io/master + effect: NoSchedule + nodeSelector: + nvidia.com/gpu.present: "true" + containers: + - name: driver-installer + image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0 + securityContext: + privileged: true + env: + - name: NVIDIA_DRIVER_VERSION + value: "535.129.03" # or the version you require + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: root + mountPath: /host + mountPropagation: Bidirectional + - name: modules + mountPath: /lib/modules + readOnly: true + - name: nvidia-local + mountPath: /host/usr/local/nvidia + - name: fix-dcgm-dir + image: amazonlinux:2023 + securityContext: + privileged: true + command: ["/bin/bash", "-c"] + args: + - | + set -ex + TARGET_DIR="/host/usr/local/nvidia" + # If it doesn't exist, symlink something useful + if [ ! -d "$TARGET_DIR" ]; then + mkdir -p /host/usr/local + ln -s /usr/lib64 "$TARGET_DIR" + fi + echo "/usr/local/nvidia set up for DCGM." + sleep 10 + volumeMounts: + - name: nvidia-local + mountPath: /host/usr/local/nvidia + - name: root + mountPath: /host + mountPropagation: Bidirectional + volumes: + - name: root + hostPath: + path: / + - name: modules + hostPath: + path: /lib/modules + - name: nvidia-local + hostPath: + path: /usr/local/nvidia + type: DirectoryOrCreate From eceac966ed3ae5a0f99d14708c69df04cb5dddb8 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sat, 7 Jun 2025 23:13:33 +0530 Subject: [PATCH 23/44] Added nvidia-device-plugin in AWS GPU test CI --- .github/workflows/aws-gpu-test.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index af83e409..c774a985 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -162,13 +162,12 @@ jobs: run: | echo "Installing Nvidia Device Plugin..." kubectl label node "$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')" nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite - + kubectl create ns nvidia-device-plugin kubectl apply -f nvidia-device-plugin-prereq helm repo add nvdp https://nvidia.github.io/k8s-device-plugin helm repo update helm upgrade -i nvdp nvdp/nvidia-device-plugin \ --namespace nvidia-device-plugin \ - --create-namespace \ --version 0.17.1 - name: Check GPU Availability After Installing GPU Operator From a921e2fe00322516955b08037de4c7376393e0bf Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 8 Jun 2025 00:59:38 +0530 Subject: [PATCH 24/44] Added nvidia-device-plugin in AWS GPU test CI --- .github/workflows/aws-gpu-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index c774a985..9a874773 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -217,7 +217,7 @@ jobs: if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then NAMESPACE="gpu-operator" fi - kubectl get pods -n gpu-operator -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s + kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s echo "Verifying DCGM pods and Prometheus annotations..." kubectl get pods -A | grep dcgm-exporter | awk ' BEGIN { all_running = 1; pod_count = 0 } From ca3ed8767c826fd86aed90320f717aa6413ddf70 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Sun, 8 Jun 2025 01:30:36 +0530 Subject: [PATCH 25/44] Added nvidia-device-plugin in AWS GPU test CI --- .github/workflows/aws-gpu-test.yaml | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 9a874773..7c29b76f 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -266,15 +266,29 @@ jobs: - name: Test ZXPorter with Prometheus run: | - kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter & - sleep 10 + kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter > pf.log 2>&1 & + PF_PID=$! + sleep 5 + MAX_RETRIES=6 + for i in $(seq 1 $MAX_RETRIES); do + if curl -s "http://localhost:9090/-/ready" >/dev/null; then + echo "Prometheus port-forward is ready." + break + fi + echo "[$i/$MAX_RETRIES] Waiting for Prometheus to become ready..." + sleep 5 + done + result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') + kill $PF_PID || true + echo "Metric found: $result" if [[ -z "$result" || "$result" == [] ]]; then - echo "DCGM_FI_DEV_SM_CLOCK metric not found!" + echo "❌ DCGM_FI_DEV_SM_CLOCK metric not found!" + echo "Port-forward log:" + cat pf.log exit 1 fi - destroy-terraform: name: Destroy Terraform From 9696cd7f68c818ef0a349f437dd0c92ef4e21f74 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 03:18:11 +0530 Subject: [PATCH 26/44] test karpenter in aws gpu test CI --- .github/workflows/aws-gpu-test.yaml | 68 +++++- terraform/aws/main.tf | 311 +++++++++++++++++++++++++++- terraform/aws/outputs.tf | 13 ++ terraform/aws/terraform.tfvars | 3 +- terraform/aws/variables.tf | 5 + 5 files changed, 391 insertions(+), 9 deletions(-) create mode 100644 terraform/aws/outputs.tf diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 7c29b76f..577b5edb 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -36,6 +36,14 @@ on: - '1.31' - '1.32' - '1.33' + karpenter_version: + description: 'Karpenter Version' + required: false + default: '0.37.7' + type: choice + options: + - 'no_karpenter' + - '0.37.7' permissions: id-token: write @@ -114,6 +122,7 @@ jobs: env: GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + Karpenter_VERSION: ${{ github.event.inputs.karpenter_version || '0.37.7' }} steps: - name: Checkout Repository @@ -129,6 +138,29 @@ jobs: run: | aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }} + - name: Install Karpenter (if needed) + if: env.Karpenter_VERSION != 'no_karpenter' + run: | + echo "Installing Karpenter..." + AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" + CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${{ needs.apply-terraform.outputs.job_identifier }}" --query "cluster.endpoint" --output text)" + KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}" + helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \ + --version "${{ env.KARPENTER_VERSION}}" \ + --namespace kube-system \ + --create-namespace \ + --set settings.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.clusterEndpoint="${CLUSTER_ENDPOINT}" \ + --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.interruptionQueueName="${{ needs.apply-terraform.outputs.job_identifier }}-karpenter-interruption" \ + --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \ + --set controller.resources.requests.cpu="1" \ + --set controller.resources.requests.memory="1Gi" \ + --set controller.resources.limits.cpu="1" \ + --set controller.resources.limits.memory="1Gi" \ + --wait + - name: Check GPU Availability id: gpu_check run: | @@ -161,7 +193,7 @@ jobs: if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false' run: | echo "Installing Nvidia Device Plugin..." - kubectl label node "$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')" nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite + kubectl get nodes -l node_type=gpu -o jsonpath='{.items[*].metadata.name}' | xargs -I {} kubectl label node {} nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite kubectl create ns nvidia-device-plugin kubectl apply -f nvidia-device-plugin-prereq helm repo add nvdp https://nvidia.github.io/k8s-device-plugin @@ -290,6 +322,40 @@ jobs: exit 1 fi + - name: Test Karpenter + if: inputs.karpenter_version != 'no_karpenter' + run: | + echo "Verifying Karpenter installation..." + kubectl port-forward -n kube-system service/karpenter 8000:8000 > /dev/null 2>&1 & + PF_PID=$! + + # Allow port-forward to establish + sleep 5 + + MAX_RETRIES=6 + HEALTH="" + + for i in $(seq 1 $MAX_RETRIES); do + if curl -s http://localhost:8000/metrics | grep -q "controller_runtime_max_concurrent_reconciles"; then + HEALTH="OK" + break + fi + echo "[$i/$MAX_RETRIES] Waiting for Karpenter to become ready..." + sleep 10 + done + + # Cleanup port-forward + kill $PF_PID || true + + if [ "$HEALTH" == "OK" ]; then + echo "Karpenter is healthy ✅" + else + echo "Karpenter health check failed ❌" + kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter + kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=50 + exit 1 + fi + destroy-terraform: name: Destroy Terraform runs-on: ubuntu-latest diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf index e3ef7095..248e45b3 100644 --- a/terraform/aws/main.tf +++ b/terraform/aws/main.tf @@ -2,6 +2,9 @@ provider "aws" { region = "us-east-1" } +data "aws_caller_identity" "current" {} + +# VPC Configuration module "vpc" { source = "terraform-aws-modules/vpc/aws" @@ -27,9 +30,213 @@ module "vpc" { private_subnet_tags = { "kubernetes.io/cluster/${var.cluster_name}" = "shared" "kubernetes.io/role/internal-elb" = "1" + "karpenter.sh/discovery" = "${var.cluster_name}" # Added Karpenter discovery tag } } +# IAM Roles and Policies for Karpenter +resource "aws_iam_role" "karpenter_node_role" { + name = "KarpenterNodeRole-${var.cluster_name}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "ec2.amazonaws.com" + } + Action = "sts:AssumeRole" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_role_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_ssm_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_registry_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryPullOnly" +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_admin_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess" +} + +# IAM Role for Karpenter Controller (with OIDC Trust Relationship) +resource "aws_iam_role" "karpenter_controller_role" { + name = "KarpenterControllerRole-${var.cluster_name}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/oidc.eks.${var.region}.amazonaws.com/id/${split("/id/", module.eks.cluster_oidc_issuer_url)[1]}" + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "oidc.eks.${var.region}.amazonaws.com/id/${split("/id/", module.eks.cluster_oidc_issuer_url)[1]}:sub" = "system:serviceaccount:kube-system:karpenter" + } + } + } + ] + }) +} + +resource "aws_iam_policy" "karpenter_controller_policy" { + name = "KarpenterControllerPolicy-${var.cluster_name}" + description = "Custom Karpenter controller policy for managing EC2 instances, IAM roles, and EKS." + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = [ + "ssm:GetParameter", + "ec2:DescribeImages", + "ec2:RunInstances", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeInstances", + "ec2:DescribeInstanceTypes", + "ec2:DescribeInstanceTypeOfferings", + "ec2:DeleteLaunchTemplate", + "ec2:CreateTags", + "ec2:CreateLaunchTemplate", + "ec2:CreateFleet", + "ec2:DescribeSpotPriceHistory", + "pricing:GetProducts" + ] + Effect = "Allow" + Resource = "*" + Sid = "Karpenter" + }, + { + Action = "ec2:TerminateInstances" + Condition = { + StringLike = { + "ec2:ResourceTag/karpenter.sh/nodepool" = "*" + } + } + Effect = "Allow" + Resource = "*" + Sid = "ConditionalEC2Termination" + }, + { + Effect = "Allow" + Action = "iam:PassRole" + Resource = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/KarpenterNodeRole-${var.cluster_name}" + Sid = "PassNodeIAMRole" + }, + { + Effect = "Allow" + Action = "eks:DescribeCluster" + Resource = "arn:aws:eks:${var.region}:${data.aws_caller_identity.current.account_id}:cluster/${var.cluster_name}" + Sid = "EKSClusterEndpointLookup" + }, + { + Sid = "AllowScopedInstanceProfileCreationActions" + Effect = "Allow" + Resource = "*" + Action = ["iam:CreateInstanceProfile"] + Condition = { + StringEquals = { + "aws:RequestTag/kubernetes.io/cluster/${var.cluster_name}" = "owned" + "aws:RequestTag/topology.kubernetes.io/region" = "${var.region}" + } + StringLike = { + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass" = "*" + } + } + }, + { + Sid = "AllowScopedInstanceProfileTagActions" + Effect = "Allow" + Resource = "*" + Action = ["iam:TagInstanceProfile"] + Condition = { + StringEquals = { + "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_name}" = "owned" + "aws:ResourceTag/topology.kubernetes.io/region" = "${var.region}" + "aws:RequestTag/kubernetes.io/cluster/${var.cluster_name}" = "owned" + "aws:RequestTag/topology.kubernetes.io/region" = "${var.region}" + } + StringLike = { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass" = "*" + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass" = "*" + } + } + }, + { + Sid = "AllowScopedInstanceProfileActions" + Effect = "Allow" + Resource = "*" + Action = [ + "iam:AddRoleToInstanceProfile", + "iam:RemoveRoleFromInstanceProfile", + "iam:DeleteInstanceProfile" + ] + Condition = { + StringEquals = { + "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_name}" = "owned" + "aws:ResourceTag/topology.kubernetes.io/region" = "${var.region}" + } + StringLike = { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass" = "*" + } + } + }, + { + Sid = "AllowInstanceProfileReadActions" + Effect = "Allow" + Resource = "*" + Action = "iam:GetInstanceProfile" + }, + { + Effect = "Allow" + Action = [ + "sqs:DeleteMessage", + "sqs:GetQueueUrl", + "sqs:GetQueueAttributes", + "sqs:ReceiveMessage" + ] + Resource = "*" + Sid = "KarpenterInterruptionQueue" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "karpenter_controller_custom_policy_attachment" { + role = aws_iam_role.karpenter_controller_role.name + policy_arn = aws_iam_policy.karpenter_controller_policy.arn +} + + +resource "aws_iam_role_policy_attachment" "karpenter_controller_policy_attachment" { + role = aws_iam_role.karpenter_controller_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy" +} + +resource "aws_iam_role_policy_attachment" "karpenter_controller_admin_policy_attachment" { + role = aws_iam_role.karpenter_controller_role.name + policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess" +} + +# EKS Cluster Configuration module "eks" { source = "terraform-aws-modules/eks/aws" @@ -41,27 +248,117 @@ module "eks" { subnet_ids = module.vpc.private_subnets enable_irsa = true - - cluster_endpoint_public_access = true enable_cluster_creator_admin_permissions = true + cluster_endpoint_public_access = true cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"] + # Disable automatic node IAM role creation + create_node_iam_role = false + eks_managed_node_groups = { gpu_nodes = { instance_types = ["g6.4xlarge"] desired_size = 1 - min_size = 1 - max_size = 1 - - ami_type = "AL2023_x86_64_NVIDIA" + min_size = 1 + max_size = 1 + ami_type = "AL2023_x86_64_NVIDIA" use_custom_launch_template = false - disk_size = 200 + metadata_options = { + http_endpoint = "enabled" + http_tokens = "optional" + http_put_response_hop_limit = 2 + instance_metadata_tags = "enabled" + } + disk_size = 200 labels = { node_type = "gpu" } + + # Attach the IAM role for Karpenter to the managed node group + iam_instance_profile = aws_iam_role.karpenter_node_role.name } } } + +# Security Group Tagging for Karpenter +resource "aws_security_group" "karpenter_sg" { + name = "karpenter-sg-${var.cluster_name}" + description = "Karpenter security group" + + tags = { + "karpenter.sh/discovery" = "${var.cluster_name}" + } +} + +resource "aws_security_group_rule" "karpenter_inbound" { + security_group_id = aws_security_group.karpenter_sg.id + type = "ingress" + from_port = 0 + to_port = 65535 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] +} + + +// Replace the existing aws_sqs_queue resource +resource "aws_sqs_queue" "karpenter_interruption_queue" { + name = "${var.cluster_name}-karpenter-interruption" // Changed name to be more specific + sqs_managed_sse_enabled = true + + tags = { + "karpenter.sh/discovery" = var.cluster_name + } +} + +// Update the SQS queue policy +resource "aws_sqs_queue_policy" "karpenter_interruption_queue_policy" { + queue_url = aws_sqs_queue.karpenter_interruption_queue.url + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AllowKarpenterController" + Effect = "Allow" + Principal = { + AWS = aws_iam_role.karpenter_controller_role.arn + } + Action = [ + "sqs:DeleteMessage", + "sqs:GetQueueUrl", + "sqs:GetQueueAttributes", + "sqs:ReceiveMessage" + ] + Resource = aws_sqs_queue.karpenter_interruption_queue.arn + }, + { + Sid = "EC2SpotInterruption" + Effect = "Allow" + Principal = { + Service = ["events.amazonaws.com", "sqs.amazonaws.com"] + } + Action = ["sqs:SendMessage"] + Resource = aws_sqs_queue.karpenter_interruption_queue.arn + } + ] + }) +} + +resource "aws_cloudwatch_event_rule" "spot_interruption" { + name = "${var.cluster_name}-spot-interruption" + description = "Capture EC2 Spot Instance interruption notices" + + event_pattern = jsonencode({ + source = ["aws.ec2"] + detail-type = ["EC2 Spot Instance Interruption Warning"] + }) +} + +resource "aws_cloudwatch_event_target" "spot_interruption" { + target_id = "KarpenterInterruptionQueueTarget" + rule = aws_cloudwatch_event_rule.spot_interruption.name + arn = aws_sqs_queue.karpenter_interruption_queue.arn +} \ No newline at end of file diff --git a/terraform/aws/outputs.tf b/terraform/aws/outputs.tf new file mode 100644 index 00000000..87ec9d95 --- /dev/null +++ b/terraform/aws/outputs.tf @@ -0,0 +1,13 @@ +data "aws_eks_cluster" "eks_cluster" { + name = var.cluster_name +} + +output "oidc_provider_url" { + value = module.eks.cluster_oidc_issuer_url +} + +# Output the cluster endpoint +output "cluster_endpoint" { + value = data.aws_eks_cluster.eks_cluster.endpoint + description = "The endpoint of the EKS cluster" +} \ No newline at end of file diff --git a/terraform/aws/terraform.tfvars b/terraform/aws/terraform.tfvars index e343f0bb..6e098115 100644 --- a/terraform/aws/terraform.tfvars +++ b/terraform/aws/terraform.tfvars @@ -1,2 +1,3 @@ cluster_name = "devzero-gpu-cluster" -cluster_version = "1.30" \ No newline at end of file +cluster_version = "1.30" +region = "us-east-1" \ No newline at end of file diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf index b9738fb3..741aed7d 100644 --- a/terraform/aws/variables.tf +++ b/terraform/aws/variables.tf @@ -7,3 +7,8 @@ variable "cluster_version" { description = "The Kubernetes version for the EKS cluster" type = string } + +variable "region" { + description = "Region of EKS cluster" + type = string +} From 98fa130a96956f3de032a8a6d843741d7072c1fb Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 03:21:06 +0530 Subject: [PATCH 27/44] test karpenter in aws gpu test CI --- .github/workflows/aws-gpu-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 577b5edb..5a70c25f 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -350,7 +350,7 @@ jobs: if [ "$HEALTH" == "OK" ]; then echo "Karpenter is healthy ✅" else - echo "Karpenter health check failed ❌" + echo "Karpenter health check failed ❌ " kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=50 exit 1 From 6cbc66bc57dbb3fa56a9aa8608cddf471e6b1c16 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 03:27:18 +0530 Subject: [PATCH 28/44] test karpenter in aws gpu test CI --- .github/workflows/aws-gpu-test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 5a70c25f..9bbd872c 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -98,7 +98,7 @@ jobs: - name: Set up Terraform uses: hashicorp/setup-terraform@v3 with: - terraform_version: 1.5.7 + terraform_version: 1.11.3 - name: Apply Terraform working-directory: terraform/aws @@ -380,7 +380,7 @@ jobs: - name: Set up Terraform uses: hashicorp/setup-terraform@v3 with: - terraform_version: 1.5.7 + terraform_version: 1.11.3 - name: Destroy Infrastructure working-directory: terraform/aws From 2b195c8fd3bf7ec70212d1968de3009c15aa96bd Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 03:34:01 +0530 Subject: [PATCH 29/44] test karpenter in aws gpu test CI --- .github/workflows/aws-gpu-test.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 9bbd872c..cd279711 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -97,8 +97,6 @@ jobs: - name: Set up Terraform uses: hashicorp/setup-terraform@v3 - with: - terraform_version: 1.11.3 - name: Apply Terraform working-directory: terraform/aws @@ -379,8 +377,6 @@ jobs: - name: Set up Terraform uses: hashicorp/setup-terraform@v3 - with: - terraform_version: 1.11.3 - name: Destroy Infrastructure working-directory: terraform/aws From c6fc269955874b8da256d9fc1fad7558aaf2b3d1 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 04:02:40 +0530 Subject: [PATCH 30/44] test karpenter in aws gpu test CI --- terraform/aws/main.tf | 10 ++-------- terraform/aws/outputs.tf | 13 ------------- 2 files changed, 2 insertions(+), 21 deletions(-) delete mode 100644 terraform/aws/outputs.tf diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf index 248e45b3..9d62a609 100644 --- a/terraform/aws/main.tf +++ b/terraform/aws/main.tf @@ -30,7 +30,7 @@ module "vpc" { private_subnet_tags = { "kubernetes.io/cluster/${var.cluster_name}" = "shared" "kubernetes.io/role/internal-elb" = "1" - "karpenter.sh/discovery" = "${var.cluster_name}" # Added Karpenter discovery tag + "karpenter.sh/discovery" = "${var.cluster_name}" } } @@ -72,7 +72,6 @@ resource "aws_iam_role_policy_attachment" "karpenter_node_admin_policy_attachmen policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess" } -# IAM Role for Karpenter Controller (with OIDC Trust Relationship) resource "aws_iam_role" "karpenter_controller_role" { name = "KarpenterControllerRole-${var.cluster_name}" @@ -252,7 +251,6 @@ module "eks" { cluster_endpoint_public_access = true cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"] - # Disable automatic node IAM role creation create_node_iam_role = false eks_managed_node_groups = { @@ -283,7 +281,6 @@ module "eks" { } } -# Security Group Tagging for Karpenter resource "aws_security_group" "karpenter_sg" { name = "karpenter-sg-${var.cluster_name}" description = "Karpenter security group" @@ -302,10 +299,8 @@ resource "aws_security_group_rule" "karpenter_inbound" { cidr_blocks = ["0.0.0.0/0"] } - -// Replace the existing aws_sqs_queue resource resource "aws_sqs_queue" "karpenter_interruption_queue" { - name = "${var.cluster_name}-karpenter-interruption" // Changed name to be more specific + name = "${var.cluster_name}-karpenter-interruption" sqs_managed_sse_enabled = true tags = { @@ -313,7 +308,6 @@ resource "aws_sqs_queue" "karpenter_interruption_queue" { } } -// Update the SQS queue policy resource "aws_sqs_queue_policy" "karpenter_interruption_queue_policy" { queue_url = aws_sqs_queue.karpenter_interruption_queue.url diff --git a/terraform/aws/outputs.tf b/terraform/aws/outputs.tf deleted file mode 100644 index 87ec9d95..00000000 --- a/terraform/aws/outputs.tf +++ /dev/null @@ -1,13 +0,0 @@ -data "aws_eks_cluster" "eks_cluster" { - name = var.cluster_name -} - -output "oidc_provider_url" { - value = module.eks.cluster_oidc_issuer_url -} - -# Output the cluster endpoint -output "cluster_endpoint" { - value = data.aws_eks_cluster.eks_cluster.endpoint - description = "The endpoint of the EKS cluster" -} \ No newline at end of file From 270e04dc7a40e1037a9fdd372672be682c3c6a15 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 04:42:07 +0530 Subject: [PATCH 31/44] test karpenter in aws gpu test CI --- .github/workflows/aws-gpu-test.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index cd279711..4558675d 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -334,7 +334,9 @@ jobs: HEALTH="" for i in $(seq 1 $MAX_RETRIES); do - if curl -s http://localhost:8000/metrics | grep -q "controller_runtime_max_concurrent_reconciles"; then + response=$(curl -s http://localhost:8000/metrics) + echo "Response: $response" + if [[ -n "$response" ]]; then HEALTH="OK" break fi @@ -348,12 +350,13 @@ jobs: if [ "$HEALTH" == "OK" ]; then echo "Karpenter is healthy ✅" else - echo "Karpenter health check failed ❌ " + echo "Karpenter health check failed ❌" kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=50 exit 1 fi + destroy-terraform: name: Destroy Terraform runs-on: ubuntu-latest From f3844bb1a3650efdf451fc9ab891c78a34d5dbd4 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 05:07:22 +0530 Subject: [PATCH 32/44] test karpenter in aws gpu test CI --- .github/workflows/aws-gpu-test.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 4558675d..95ca4f89 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -327,7 +327,6 @@ jobs: kubectl port-forward -n kube-system service/karpenter 8000:8000 > /dev/null 2>&1 & PF_PID=$! - # Allow port-forward to establish sleep 5 MAX_RETRIES=6 From 5a193159308ae36c97a4502ac8bbb5c64a11dfa9 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 18:22:25 +0530 Subject: [PATCH 33/44] alternate ci for karpenter with cloudformation --- .github/workflows/aws-gpu-test-2 | 515 +++++++++++++++++++++++++++++++ 1 file changed, 515 insertions(+) create mode 100644 .github/workflows/aws-gpu-test-2 diff --git a/.github/workflows/aws-gpu-test-2 b/.github/workflows/aws-gpu-test-2 new file mode 100644 index 00000000..11cc64cb --- /dev/null +++ b/.github/workflows/aws-gpu-test-2 @@ -0,0 +1,515 @@ +name: AWS GPU Test + +on: + push: + branches: + - garvit/aws-gpu-test + workflow_dispatch: + inputs: + gpu_install_type: + description: 'GPU installation type' + required: false + default: 'nvidia-device-plugin' + type: choice + options: + - gpu-operator + - nvidia-device-plugin + dcgm_install_type: + description: 'DCGM install type' + required: false + default: 'devzero-dcgm' + type: choice + options: + - nvidia-dcgm + - devzero-dcgm + cluster_version: + description: 'Kubernetes cluster version' + required: false + default: '1.30' + type: choice + options: + - '1.26' + - '1.27' + - '1.28' + - '1.29' + - '1.30' + - '1.31' + - '1.32' + - '1.33' + karpenter_version: + description: 'Karpenter Version' + required: false + default: '0.37.7' + type: choice + options: + - 'no_karpenter' + - '0.37.7' + +permissions: + id-token: write + contents: read + +jobs: + apply-terraform: + name: Apply Terraform + runs-on: ubuntu-latest + env: + GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} + + outputs: + job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} + + steps: + - name: Validate Inputs + run: | + echo "GPU_INSTALL_TYPE=${GPU_INSTALL_TYPE}" + echo "DCGM_INSTALL_TYPE=${DCGM_INSTALL_TYPE}" + + if [[ "$GPU_INSTALL_TYPE" == "nvidia-device-plugin" && "$DCGM_INSTALL_TYPE" != "devzero-dcgm" ]]; then + echo "Error: When GPU_INSTALL_TYPE is 'nvidia-device-plugin', DCGM_INSTALL_TYPE must be 'devzero-dcgm'." + exit 1 + fi + + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Configure AWS Credential + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Generate Unique Job Identifier + id: job-identifier + shell: bash + run: | + SHORT_SHA=$(git rev-parse --short HEAD) + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then + SUFFIX="dd" + else + SUFFIX="nd" + fi + JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}" + echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV + echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + + - name: Apply Terraform + working-directory: terraform/aws + run: | + export KARPENTER_NAMESPACE="kube-system" + export KARPENTER_VERSION="0.37.7" + export K8S_VERSION="1.30" + export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov + export CLUSTER_NAME="${env.JOB_IDENTIFIER}" + export AWS_DEFAULT_REGION="us-east-1" + export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" + export TEMPOUT="$(mktemp)" + export ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')" + echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${K8S_VERSION}" "${CLUSTER_NAME}" "${AWS_DEFAULT_REGION}" "${AWS_ACCOUNT_ID}" "${TEMPOUT}" "${ALIAS_VERSION}" + + curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ + && aws cloudformation deploy \ + --stack-name "Karpenter-${CLUSTER_NAME}" \ + --template-file "${TEMPOUT}" \ + --capabilities CAPABILITY_NAMED_IAM \ + --parameter-overrides "ClusterName=${CLUSTER_NAME}" + + eksctl create cluster -f - <> $GITHUB_ENV + else + echo "GPU check failed" + echo "GPU_CHECK=false" >> $GITHUB_ENV + fi + + - name: Install GPU Operator (if needed) + if: env.GPU_CHECK == 'false' && env.GPU_INSTALL_TYPE == 'gpu-operator' + run: | + echo "GPU resources not found, installing GPU Operator..." + kubectl create ns gpu-operator + kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite + kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \ + helm repo update + INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0" + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then + INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false" + fi + echo "Running: $INSTALL_CMD" + $INSTALL_CMD + + - name: Install Nvidia Device Plugin + if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false' + run: | + echo "Installing Nvidia Device Plugin..." + kubectl get nodes -l node_type=gpu -o jsonpath='{.items[*].metadata.name}' | xargs -I {} kubectl label node {} nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite + kubectl create ns nvidia-device-plugin + kubectl apply -f nvidia-device-plugin-prereq + helm repo add nvdp https://nvidia.github.io/k8s-device-plugin + helm repo update + helm upgrade -i nvdp nvdp/nvidia-device-plugin \ + --namespace nvidia-device-plugin \ + --version 0.17.1 + + - name: Check GPU Availability After Installing GPU Operator + if: env.GPU_CHECK == 'false' + run: | + echo "Re-checking GPU resources on nodes after GPU Operator installation..." + if kubectl describe nodes | grep -q "nvidia.com/gpu"; then + echo "GPU resources are available on the nodes." + else + echo "GPU check failed after GPU Operator installation" + exit 1 + fi + + - name: Check Nvidia DCGM DaemonSet + id: dcgm_check + if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} + run: | + echo "Checking if DCGM DaemonSet is installed..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "Nvidia DCGM found, proceeding with validation." + else + echo "Nvidia DCGM not found." + exit 1 + fi + + - name: Install DevZero DCGM + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} + run: | + echo "Installing DCGM Exporter..." + kubectl create ns devzero-zxporter + curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f - + + - name: Check DCGM DaemonSet After Installing DCGM Exporter + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} + run: | + echo "Re-checking DCGM pods after DCGM Exporter installation..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "DCGM DaemonSet is running." + else + echo "DCGM DaemonSet not running after installation" + exit 1 + fi + + - name: Verify DCGM Pods and Prometheus Annotations + run: | + NAMESPACE="devzero-zxporter" + if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then + NAMESPACE="gpu-operator" + fi + kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s + echo "Verifying DCGM pods and Prometheus annotations..." + kubectl get pods -A | grep dcgm-exporter | awk ' + BEGIN { all_running = 1; pod_count = 0 } + { + pod_count++ + status = $4 + printf "Pod: %s/%s - Status: %s\n", $1, $2, status + if (status != "Running") all_running = 0 + } + END { + printf "\nTotal Pods: %d\n", pod_count + printf "All Running: %s\n", (all_running ? "true" : "false") + }' + kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done + + - name: Install and Verify DeepSeek Workload + run: | + kubectl create ns deepseek + kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml + + kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s + pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}') + + if [[ -n "$pod_status" ]]; then + echo "Pods are not in Running state. Failing the pipeline." + exit 1 + else + echo "All pods are running successfully." + fi + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.22' + cache: true + + - name: Install ZXPorter + run: | + ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" + echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" + make docker-build docker-push IMG=${ZXPORTER_IMG} + make deploy IMG=${ZXPORTER_IMG} + + echo "Waiting for ZXPorter pods to be ready..." + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s + + - name: Test ZXPorter with Prometheus + run: | + kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter > pf.log 2>&1 & + PF_PID=$! + sleep 5 + MAX_RETRIES=6 + for i in $(seq 1 $MAX_RETRIES); do + if curl -s "http://localhost:9090/-/ready" >/dev/null; then + echo "Prometheus port-forward is ready." + break + fi + echo "[$i/$MAX_RETRIES] Waiting for Prometheus to become ready..." + sleep 5 + done + + result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') + kill $PF_PID || true + + echo "Metric found: $result" + if [[ -z "$result" || "$result" == [] ]]; then + echo "❌ DCGM_FI_DEV_SM_CLOCK metric not found!" + echo "Port-forward log:" + cat pf.log + exit 1 + fi + + - name: Test Karpenter + if: inputs.karpenter_version != 'no_karpenter' + run: | + kubectl scale deployment inflate --replicas 10 + kubectl logs -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller + kubectl get nodes -o wide + kubectl delete deployment inflate + + + destroy-terraform: + name: Destroy Terraform + runs-on: ubuntu-latest + env: + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} + + if: always() + needs: + - apply-terraform + - install-and-validate + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + + - name: Destroy Infrastructure + working-directory: terraform/aws + run: | + helm uninstall karpenter --namespace kube-system || true + aws cloudformation delete-stack --stack-name "Karpenter-${{needs.apply-terraform.outputs.job_identifier}}" || true + aws ec2 describe-launch-templates --filters "Name=tag:karpenter.k8s.aws/cluster,Values=${{needs.apply-terraform.outputs.job_identifier}}" | + jq -r ".LaunchTemplates[].LaunchTemplateName" | + xargs -I{} aws ec2 delete-launch-template --launch-template-name {} + eksctl delete cluster --name "${{needs.apply-terraform.outputs.job_identifier}}" From 66da291384dfe15d446c27d89e2922d5dd0bf4aa Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 18:23:56 +0530 Subject: [PATCH 34/44] alternate ci for karpenter with cloudformation --- .github/workflows/aws-gpu-test-2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/aws-gpu-test-2 b/.github/workflows/aws-gpu-test-2 index 11cc64cb..99c5f983 100644 --- a/.github/workflows/aws-gpu-test-2 +++ b/.github/workflows/aws-gpu-test-2 @@ -1,4 +1,4 @@ -name: AWS GPU Test +name: AWS GPU Test (CloudFormation) on: push: From d790f92a110ea2a377c168af1e74fea5a47354c4 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 18:26:17 +0530 Subject: [PATCH 35/44] alternate ci for karpenter with cloudformation --- .github/workflows/aws-gpu-test.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 95ca4f89..14162d31 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -1,9 +1,9 @@ name: AWS GPU Test on: - push: - branches: - - garvit/aws-gpu-test + # push: + # branches: + # - garvit/aws-gpu-test workflow_dispatch: inputs: gpu_install_type: @@ -141,17 +141,17 @@ jobs: run: | echo "Installing Karpenter..." AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" - CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${{ needs.apply-terraform.outputs.job_identifier }}" --query "cluster.endpoint" --output text)" - KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}" + CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "devzero-gpu-cluster" --query "cluster.endpoint" --output text)" + KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-devzero-gpu-cluster" helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \ - --version "${{ env.KARPENTER_VERSION}}" \ + --version "0.37.7" \ --namespace kube-system \ --create-namespace \ - --set settings.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ - --set settings.aws.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.clusterName="devzero-gpu-cluster" \ + --set settings.aws.clusterName="devzero-gpu-cluster" \ --set settings.aws.clusterEndpoint="${CLUSTER_ENDPOINT}" \ - --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}" \ - --set settings.aws.interruptionQueueName="${{ needs.apply-terraform.outputs.job_identifier }}-karpenter-interruption" \ + --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-devzero-gpu-cluster" \ + --set settings.aws.interruptionQueueName="devzero-gpu-cluster-karpenter-interruption" \ --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \ --set controller.resources.requests.cpu="1" \ --set controller.resources.requests.memory="1Gi" \ From 6d70a3405e9fdcab79e46f479179f10fbba2e82b Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 19:31:43 +0530 Subject: [PATCH 36/44] karpenter in aws gpu test ci --- .github/workflows/aws-gpu-test-2 | 515 ---------------------------- .github/workflows/aws-gpu-test.yaml | 150 +++++--- 2 files changed, 110 insertions(+), 555 deletions(-) delete mode 100644 .github/workflows/aws-gpu-test-2 diff --git a/.github/workflows/aws-gpu-test-2 b/.github/workflows/aws-gpu-test-2 deleted file mode 100644 index 99c5f983..00000000 --- a/.github/workflows/aws-gpu-test-2 +++ /dev/null @@ -1,515 +0,0 @@ -name: AWS GPU Test (CloudFormation) - -on: - push: - branches: - - garvit/aws-gpu-test - workflow_dispatch: - inputs: - gpu_install_type: - description: 'GPU installation type' - required: false - default: 'nvidia-device-plugin' - type: choice - options: - - gpu-operator - - nvidia-device-plugin - dcgm_install_type: - description: 'DCGM install type' - required: false - default: 'devzero-dcgm' - type: choice - options: - - nvidia-dcgm - - devzero-dcgm - cluster_version: - description: 'Kubernetes cluster version' - required: false - default: '1.30' - type: choice - options: - - '1.26' - - '1.27' - - '1.28' - - '1.29' - - '1.30' - - '1.31' - - '1.32' - - '1.33' - karpenter_version: - description: 'Karpenter Version' - required: false - default: '0.37.7' - type: choice - options: - - 'no_karpenter' - - '0.37.7' - -permissions: - id-token: write - contents: read - -jobs: - apply-terraform: - name: Apply Terraform - runs-on: ubuntu-latest - env: - GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} - DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} - CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} - - outputs: - job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} - - steps: - - name: Validate Inputs - run: | - echo "GPU_INSTALL_TYPE=${GPU_INSTALL_TYPE}" - echo "DCGM_INSTALL_TYPE=${DCGM_INSTALL_TYPE}" - - if [[ "$GPU_INSTALL_TYPE" == "nvidia-device-plugin" && "$DCGM_INSTALL_TYPE" != "devzero-dcgm" ]]; then - echo "Error: When GPU_INSTALL_TYPE is 'nvidia-device-plugin', DCGM_INSTALL_TYPE must be 'devzero-dcgm'." - exit 1 - fi - - - name: Checkout Repository - uses: actions/checkout@v4 - - - name: Configure AWS Credential - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role - aws-region: us-east-1 - - - name: Generate Unique Job Identifier - id: job-identifier - shell: bash - run: | - SHORT_SHA=$(git rev-parse --short HEAD) - if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then - SUFFIX="dd" - else - SUFFIX="nd" - fi - JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}" - echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV - echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT - - - name: Set up Terraform - uses: hashicorp/setup-terraform@v3 - - - name: Apply Terraform - working-directory: terraform/aws - run: | - export KARPENTER_NAMESPACE="kube-system" - export KARPENTER_VERSION="0.37.7" - export K8S_VERSION="1.30" - export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov - export CLUSTER_NAME="${env.JOB_IDENTIFIER}" - export AWS_DEFAULT_REGION="us-east-1" - export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" - export TEMPOUT="$(mktemp)" - export ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')" - echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${K8S_VERSION}" "${CLUSTER_NAME}" "${AWS_DEFAULT_REGION}" "${AWS_ACCOUNT_ID}" "${TEMPOUT}" "${ALIAS_VERSION}" - - curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ - && aws cloudformation deploy \ - --stack-name "Karpenter-${CLUSTER_NAME}" \ - --template-file "${TEMPOUT}" \ - --capabilities CAPABILITY_NAMED_IAM \ - --parameter-overrides "ClusterName=${CLUSTER_NAME}" - - eksctl create cluster -f - <> $GITHUB_ENV - else - echo "GPU check failed" - echo "GPU_CHECK=false" >> $GITHUB_ENV - fi - - - name: Install GPU Operator (if needed) - if: env.GPU_CHECK == 'false' && env.GPU_INSTALL_TYPE == 'gpu-operator' - run: | - echo "GPU resources not found, installing GPU Operator..." - kubectl create ns gpu-operator - kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite - kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \ - helm repo update - INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0" - if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then - INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false" - fi - echo "Running: $INSTALL_CMD" - $INSTALL_CMD - - - name: Install Nvidia Device Plugin - if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false' - run: | - echo "Installing Nvidia Device Plugin..." - kubectl get nodes -l node_type=gpu -o jsonpath='{.items[*].metadata.name}' | xargs -I {} kubectl label node {} nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite - kubectl create ns nvidia-device-plugin - kubectl apply -f nvidia-device-plugin-prereq - helm repo add nvdp https://nvidia.github.io/k8s-device-plugin - helm repo update - helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --namespace nvidia-device-plugin \ - --version 0.17.1 - - - name: Check GPU Availability After Installing GPU Operator - if: env.GPU_CHECK == 'false' - run: | - echo "Re-checking GPU resources on nodes after GPU Operator installation..." - if kubectl describe nodes | grep -q "nvidia.com/gpu"; then - echo "GPU resources are available on the nodes." - else - echo "GPU check failed after GPU Operator installation" - exit 1 - fi - - - name: Check Nvidia DCGM DaemonSet - id: dcgm_check - if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} - run: | - echo "Checking if DCGM DaemonSet is installed..." - if kubectl get daemonset -A | grep -q dcgm; then - echo "Nvidia DCGM found, proceeding with validation." - else - echo "Nvidia DCGM not found." - exit 1 - fi - - - name: Install DevZero DCGM - if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} - run: | - echo "Installing DCGM Exporter..." - kubectl create ns devzero-zxporter - curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f - - - - name: Check DCGM DaemonSet After Installing DCGM Exporter - if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} - run: | - echo "Re-checking DCGM pods after DCGM Exporter installation..." - if kubectl get daemonset -A | grep -q dcgm; then - echo "DCGM DaemonSet is running." - else - echo "DCGM DaemonSet not running after installation" - exit 1 - fi - - - name: Verify DCGM Pods and Prometheus Annotations - run: | - NAMESPACE="devzero-zxporter" - if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then - NAMESPACE="gpu-operator" - fi - kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s - echo "Verifying DCGM pods and Prometheus annotations..." - kubectl get pods -A | grep dcgm-exporter | awk ' - BEGIN { all_running = 1; pod_count = 0 } - { - pod_count++ - status = $4 - printf "Pod: %s/%s - Status: %s\n", $1, $2, status - if (status != "Running") all_running = 0 - } - END { - printf "\nTotal Pods: %d\n", pod_count - printf "All Running: %s\n", (all_running ? "true" : "false") - }' - kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done - - - name: Install and Verify DeepSeek Workload - run: | - kubectl create ns deepseek - kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml - - kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s - pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}') - - if [[ -n "$pod_status" ]]; then - echo "Pods are not in Running state. Failing the pipeline." - exit 1 - else - echo "All pods are running successfully." - fi - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.22' - cache: true - - - name: Install ZXPorter - run: | - ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" - echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" - make docker-build docker-push IMG=${ZXPORTER_IMG} - make deploy IMG=${ZXPORTER_IMG} - - echo "Waiting for ZXPorter pods to be ready..." - kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s - - - name: Test ZXPorter with Prometheus - run: | - kubectl port-forward svc/prometheus-dz-prometheus-server 9090:80 -n devzero-zxporter > pf.log 2>&1 & - PF_PID=$! - sleep 5 - MAX_RETRIES=6 - for i in $(seq 1 $MAX_RETRIES); do - if curl -s "http://localhost:9090/-/ready" >/dev/null; then - echo "Prometheus port-forward is ready." - break - fi - echo "[$i/$MAX_RETRIES] Waiting for Prometheus to become ready..." - sleep 5 - done - - result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') - kill $PF_PID || true - - echo "Metric found: $result" - if [[ -z "$result" || "$result" == [] ]]; then - echo "❌ DCGM_FI_DEV_SM_CLOCK metric not found!" - echo "Port-forward log:" - cat pf.log - exit 1 - fi - - - name: Test Karpenter - if: inputs.karpenter_version != 'no_karpenter' - run: | - kubectl scale deployment inflate --replicas 10 - kubectl logs -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller - kubectl get nodes -o wide - kubectl delete deployment inflate - - - destroy-terraform: - name: Destroy Terraform - runs-on: ubuntu-latest - env: - CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} - - if: always() - needs: - - apply-terraform - - install-and-validate - - steps: - - name: Checkout Repository - uses: actions/checkout@v4 - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role - aws-region: us-east-1 - - - name: Set up Terraform - uses: hashicorp/setup-terraform@v3 - - - name: Destroy Infrastructure - working-directory: terraform/aws - run: | - helm uninstall karpenter --namespace kube-system || true - aws cloudformation delete-stack --stack-name "Karpenter-${{needs.apply-terraform.outputs.job_identifier}}" || true - aws ec2 describe-launch-templates --filters "Name=tag:karpenter.k8s.aws/cluster,Values=${{needs.apply-terraform.outputs.job_identifier}}" | - jq -r ".LaunchTemplates[].LaunchTemplateName" | - xargs -I{} aws ec2 delete-launch-template --launch-template-name {} - eksctl delete cluster --name "${{needs.apply-terraform.outputs.job_identifier}}" diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 14162d31..dc5389fb 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -1,9 +1,9 @@ name: AWS GPU Test on: - # push: - # branches: - # - garvit/aws-gpu-test + push: + branches: + - garvit/aws-gpu-test workflow_dispatch: inputs: gpu_install_type: @@ -294,11 +294,117 @@ jobs: echo "Waiting for ZXPorter pods to be ready..." kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s + - name: Test Karpenter + if: inputs.karpenter_version != 'no_karpenter' + run: | + echo "Intalling Karpenter Node Class and Node Pool..." + K8S_VERSION="1.30" + ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')" + kubectl get nodes -o wide || true + cat < pf.log 2>&1 & PF_PID=$! - sleep 5 + sleep 20 MAX_RETRIES=6 for i in $(seq 1 $MAX_RETRIES); do if curl -s "http://localhost:9090/-/ready" >/dev/null; then @@ -320,42 +426,6 @@ jobs: exit 1 fi - - name: Test Karpenter - if: inputs.karpenter_version != 'no_karpenter' - run: | - echo "Verifying Karpenter installation..." - kubectl port-forward -n kube-system service/karpenter 8000:8000 > /dev/null 2>&1 & - PF_PID=$! - - sleep 5 - - MAX_RETRIES=6 - HEALTH="" - - for i in $(seq 1 $MAX_RETRIES); do - response=$(curl -s http://localhost:8000/metrics) - echo "Response: $response" - if [[ -n "$response" ]]; then - HEALTH="OK" - break - fi - echo "[$i/$MAX_RETRIES] Waiting for Karpenter to become ready..." - sleep 10 - done - - # Cleanup port-forward - kill $PF_PID || true - - if [ "$HEALTH" == "OK" ]; then - echo "Karpenter is healthy ✅" - else - echo "Karpenter health check failed ❌" - kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter - kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=50 - exit 1 - fi - - destroy-terraform: name: Destroy Terraform runs-on: ubuntu-latest From 3ffa2cf5e7d4b483eabff603de859db132537079 Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 19:44:15 +0530 Subject: [PATCH 37/44] karpenter in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index dc5389fb..c8146d20 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -141,17 +141,17 @@ jobs: run: | echo "Installing Karpenter..." AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" - CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "devzero-gpu-cluster" --query "cluster.endpoint" --output text)" - KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-devzero-gpu-cluster" + CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ needs.apply-terraform.outputs.job_identifier }} --query "cluster.endpoint" --output text)" + KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}" helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \ --version "0.37.7" \ --namespace kube-system \ --create-namespace \ - --set settings.clusterName="devzero-gpu-cluster" \ - --set settings.aws.clusterName="devzero-gpu-cluster" \ + --set settings.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ --set settings.aws.clusterEndpoint="${CLUSTER_ENDPOINT}" \ - --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-devzero-gpu-cluster" \ - --set settings.aws.interruptionQueueName="devzero-gpu-cluster-karpenter-interruption" \ + --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.interruptionQueueName="${{ needs.apply-terraform.outputs.job_identifier }}-karpenter-interruption" \ --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \ --set controller.resources.requests.cpu="1" \ --set controller.resources.requests.memory="1Gi" \ From 975c0e3961427c1da9c63f10e8e450bba1d2f32d Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 20:06:45 +0530 Subject: [PATCH 38/44] karpenter in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index c8146d20..dedd8c5b 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -132,10 +132,24 @@ jobs: role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role aws-region: us-east-1 + - name: Install yq + run: | + sudo wget https://github.com/mikefarah/yq/releases/download/v4.15.1/yq_linux_amd64 -O /usr/local/bin/yq + sudo chmod +x /usr/local/bin/yq + - name: Configure Kubernetes Access run: | aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }} + - name: Add new mapRole to aws-auth ConfigMap + if: env.Karpenter_VERSION != 'no_karpenter' + run: | + NEW_MAPROLE='- groups:\n - system:bootstrappers\n - system:nodes\n rolearn: arn:aws:iam::484907513542:role/KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}\n username: system:node:{{EC2PrivateDNSName}}' + kubectl get configmap/aws-auth -n kube-system -o yaml > aws-auth.yaml + yq eval '.data.mapRoles |= . + "- groups:\n - system:bootstrappers\n - system:nodes\n rolearn: arn:aws:iam::484907513542:role/KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}\n username: system:node:{{EC2PrivateDNSName}}\n"' -i aws-auth.yaml + kubectl apply -f aws-auth.yaml + kubectl get configmap/aws-auth -n kube-system -o yaml + - name: Install Karpenter (if needed) if: env.Karpenter_VERSION != 'no_karpenter' run: | From f74d8ac1914767bdc9c18381d5e373c9dfc5888a Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 20:40:56 +0530 Subject: [PATCH 39/44] karpenter in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index dedd8c5b..abcb35f5 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -121,6 +121,7 @@ jobs: GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} Karpenter_VERSION: ${{ github.event.inputs.karpenter_version || '0.37.7' }} + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} steps: - name: Checkout Repository @@ -157,6 +158,8 @@ jobs: AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ needs.apply-terraform.outputs.job_identifier }} --query "cluster.endpoint" --output text)" KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}" + echo "Karpenter IAM Role ARN: ${KARPENTER_IAM_ROLE_ARN}" + echo "Cluster Endpoint: ${CLUSTER_ENDPOINT}" helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \ --version "0.37.7" \ --namespace kube-system \ @@ -312,8 +315,7 @@ jobs: if: inputs.karpenter_version != 'no_karpenter' run: | echo "Intalling Karpenter Node Class and Node Pool..." - K8S_VERSION="1.30" - ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')" + ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${{ env.CLUSTER_VERSION }}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')" kubectl get nodes -o wide || true cat < Date: Tue, 10 Jun 2025 20:59:31 +0530 Subject: [PATCH 40/44] karpenter in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index abcb35f5..5722f5f2 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -407,6 +407,8 @@ jobs: kubectl wait --for=condition=Ready pod -l app=inflate --timeout=180s kubectl get nodes -o wide || true + + kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter -c controller NODE_COUNT=$(kubectl get nodes --no-headers | wc -l) if [ "$NODE_COUNT" -le 1 ]; then From d5e4f9020f3d68ed7d4b227a55b805b8f540043a Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 21:13:37 +0530 Subject: [PATCH 41/44] karpenter in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 5722f5f2..5c032127 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -368,7 +368,7 @@ jobs: karpenter.sh/discovery: "${{ needs.apply-terraform.outputs.job_identifier }}" EOF - sleep 10 + kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter -c controller echo "Creating a deployment to trigger Karpenter node provisioning..." cat < Date: Tue, 10 Jun 2025 21:29:54 +0530 Subject: [PATCH 42/44] karpenter in aws gpu test ci --- terraform/aws/main.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf index 9d62a609..252ad5c2 100644 --- a/terraform/aws/main.tf +++ b/terraform/aws/main.tf @@ -253,6 +253,10 @@ module "eks" { create_node_iam_role = false + tags = { + "karpenter.sh/discovery" = var.cluster_name + } + eks_managed_node_groups = { gpu_nodes = { instance_types = ["g6.4xlarge"] @@ -284,6 +288,7 @@ module "eks" { resource "aws_security_group" "karpenter_sg" { name = "karpenter-sg-${var.cluster_name}" description = "Karpenter security group" + vpc_id = module.vpc.vpc_id tags = { "karpenter.sh/discovery" = "${var.cluster_name}" From 5b19b522bc0d7409a0571d4437633e4e1cc85e2d Mon Sep 17 00:00:00 2001 From: garvit3835 Date: Tue, 10 Jun 2025 22:14:39 +0530 Subject: [PATCH 43/44] karpenter in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index 5c032127..fbd733e7 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -316,6 +316,7 @@ jobs: run: | echo "Intalling Karpenter Node Class and Node Pool..." ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${{ env.CLUSTER_VERSION }}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')" + echo "Using ALIAS_VERSION: ${ALIAS_VERSION}" kubectl get nodes -o wide || true cat < Date: Wed, 11 Jun 2025 00:25:36 +0530 Subject: [PATCH 44/44] karpenter in aws gpu test ci --- .github/workflows/aws-gpu-test.yaml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml index fbd733e7..ce3d4bea 100644 --- a/.github/workflows/aws-gpu-test.yaml +++ b/.github/workflows/aws-gpu-test.yaml @@ -135,7 +135,7 @@ jobs: - name: Install yq run: | - sudo wget https://github.com/mikefarah/yq/releases/download/v4.15.1/yq_linux_amd64 -O /usr/local/bin/yq + sudo wget https://github.com/mikefarah/yq/releases/download/v4.35.2/yq_linux_amd64 -O /usr/local/bin/yq sudo chmod +x /usr/local/bin/yq - name: Configure Kubernetes Access @@ -465,6 +465,20 @@ jobs: role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role aws-region: us-east-1 + - name: Configure Kubernetes Access + if: inputs.karpenter_version != 'no_karpenter' + run: | + aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }} + + - name: Delete Karpenter Nodes + if: inputs.karpenter_version != 'no_karpenter' + run: | + kubectl delete deployment inflate + kubectl wait --for=delete deployment/inflate --timeout=300s + NODE_NAME=$(kubectl get nodes --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[1].metadata.name}') + kubectl delete node "${NODE_NAME}" + + - name: Set up Terraform uses: hashicorp/setup-terraform@v3