diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml new file mode 100644 index 00000000..ce3d4bea --- /dev/null +++ b/.github/workflows/aws-gpu-test.yaml @@ -0,0 +1,498 @@ +name: AWS GPU Test + +on: + push: + branches: + - garvit/aws-gpu-test + workflow_dispatch: + inputs: + gpu_install_type: + description: 'GPU installation type' + required: false + default: 'nvidia-device-plugin' + type: choice + options: + - gpu-operator + - nvidia-device-plugin + dcgm_install_type: + description: 'DCGM install type' + required: false + default: 'devzero-dcgm' + type: choice + options: + - nvidia-dcgm + - devzero-dcgm + cluster_version: + description: 'Kubernetes cluster version' + required: false + default: '1.30' + type: choice + options: + - '1.26' + - '1.27' + - '1.28' + - '1.29' + - '1.30' + - '1.31' + - '1.32' + - '1.33' + karpenter_version: + description: 'Karpenter Version' + required: false + default: '0.37.7' + type: choice + options: + - 'no_karpenter' + - '0.37.7' + +permissions: + id-token: write + contents: read + +jobs: + apply-terraform: + name: Apply Terraform + runs-on: ubuntu-latest + env: + GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} + + outputs: + job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} + + steps: + - name: Validate Inputs + run: | + echo "GPU_INSTALL_TYPE=${GPU_INSTALL_TYPE}" + echo "DCGM_INSTALL_TYPE=${DCGM_INSTALL_TYPE}" + + if [[ "$GPU_INSTALL_TYPE" == "nvidia-device-plugin" && "$DCGM_INSTALL_TYPE" != "devzero-dcgm" ]]; then + echo "Error: When GPU_INSTALL_TYPE is 'nvidia-device-plugin', DCGM_INSTALL_TYPE must be 'devzero-dcgm'." 
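+            # The standalone device plugin ships no DCGM exporter, so GPU telemetry in that mode has to come from the devzero-dcgm installer.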
+ exit 1 + fi + + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Generate Unique Job Identifier + id: job-identifier + shell: bash + run: | + SHORT_SHA=$(git rev-parse --short HEAD) + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then + SUFFIX="dd" + else + SUFFIX="nd" + fi + JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}" + echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV + echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + + - name: Apply Terraform + working-directory: terraform/aws + run: | + cat <<EOF > backend_override.tf + terraform { + backend "s3" { + bucket = "zxporter-tf-state" + key = "${JOB_IDENTIFIER}/terraform.tfstate" + region = "us-east-1" + } + } + EOF + terraform init + terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" -var='cluster_version=${{ env.CLUSTER_VERSION }}' + + install-and-validate: + name: Install and Validate GPU Resources and ZXPorter + runs-on: ubuntu-latest + needs: apply-terraform + env: + GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + KARPENTER_VERSION: ${{ github.event.inputs.karpenter_version || '0.37.7' }} + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Install yq + run: | + sudo wget https://github.com/mikefarah/yq/releases/download/v4.35.2/yq_linux_amd64 -O /usr/local/bin/yq + sudo chmod +x /usr/local/bin/yq + + - name: Configure Kubernetes Access + run: | + aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }} + + - name: Add new mapRole to aws-auth ConfigMap + if: env.KARPENTER_VERSION != 'no_karpenter' + run: | + kubectl get configmap/aws-auth -n kube-system -o yaml > aws-auth.yaml + yq eval '.data.mapRoles |= . + "- groups:\n - system:bootstrappers\n - system:nodes\n rolearn: arn:aws:iam::484907513542:role/KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}\n username: system:node:{{EC2PrivateDNSName}}\n"' -i aws-auth.yaml + kubectl apply -f aws-auth.yaml + kubectl get configmap/aws-auth -n kube-system -o yaml + + - name: Install Karpenter (if needed) + if: env.KARPENTER_VERSION != 'no_karpenter' + run: | + echo "Installing Karpenter..."
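+          # Resolve the account ID and API endpoint so the chart can be wired to the Terraform-created IRSA controller role and this cluster's API server.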
+ AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" + CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ needs.apply-terraform.outputs.job_identifier }} --query "cluster.endpoint" --output text)" + KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}" + echo "Karpenter IAM Role ARN: ${KARPENTER_IAM_ROLE_ARN}" + echo "Cluster Endpoint: ${CLUSTER_ENDPOINT}" + helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \ + --version "${KARPENTER_VERSION}" \ + --namespace kube-system \ + --create-namespace \ + --set settings.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.clusterEndpoint="${CLUSTER_ENDPOINT}" \ + --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.interruptionQueueName="${{ needs.apply-terraform.outputs.job_identifier }}-karpenter-interruption" \ + --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \ + --set controller.resources.requests.cpu="1" \ + --set controller.resources.requests.memory="1Gi" \ + --set controller.resources.limits.cpu="1" \ + --set controller.resources.limits.memory="1Gi" \ + --wait + + - name: Check GPU Availability + id: gpu_check + run: | + echo "Checking GPU resources on nodes..." + if kubectl describe nodes | grep -q "nvidia.com/gpu"; then + echo "GPU resources are available on the nodes." + echo "GPU_CHECK=true" >> $GITHUB_ENV + else + echo "GPU check failed" + echo "GPU_CHECK=false" >> $GITHUB_ENV + fi + + - name: Install GPU Operator (if needed) + if: env.GPU_CHECK == 'false' && env.GPU_INSTALL_TYPE == 'gpu-operator' + run: | + echo "GPU resources not found, installing GPU Operator..." + kubectl create ns gpu-operator + kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite + kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \ + helm repo update + INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0" + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then + INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false" + fi + echo "Running: $INSTALL_CMD" + $INSTALL_CMD + + - name: Install Nvidia Device Plugin + if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false' + run: | + echo "Installing Nvidia Device Plugin..." + kubectl get nodes -l node_type=gpu -o jsonpath='{.items[*].metadata.name}' | xargs -I {} kubectl label node {} nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite + kubectl create ns nvidia-device-plugin + kubectl apply -f nvidia-device-plugin-prereq + helm repo add nvdp https://nvidia.github.io/k8s-device-plugin + helm repo update + helm upgrade -i nvdp nvdp/nvidia-device-plugin \ + --namespace nvidia-device-plugin \ + --version 0.17.1 + + - name: Check GPU Availability After GPU Installation + if: env.GPU_CHECK == 'false' + run: | + echo "Re-checking GPU resources on nodes after GPU installation..." + if kubectl describe nodes | grep -q "nvidia.com/gpu"; then + echo "GPU resources are available on the nodes."
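+            # "nvidia.com/gpu" appearing in node capacity means the driver and device plugin have registered the GPUs with the kubelet.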
+ else + echo "GPU check failed after GPU installation" + exit 1 + fi + + - name: Check Nvidia DCGM DaemonSet + id: dcgm_check + if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} + run: | + echo "Checking if DCGM DaemonSet is installed..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "Nvidia DCGM found, proceeding with validation." + else + echo "Nvidia DCGM not found." + exit 1 + fi + + - name: Install DevZero DCGM + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} + run: | + echo "Installing DCGM Exporter..." + kubectl create ns devzero-zxporter + curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f - + + - name: Check DCGM DaemonSet After Installing DCGM Exporter + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} + run: | + echo "Re-checking DCGM pods after DCGM Exporter installation..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "DCGM DaemonSet is running." + else + echo "DCGM DaemonSet not running after installation" + exit 1 + fi + + - name: Verify DCGM Pods and Prometheus Annotations + run: | + NAMESPACE="devzero-zxporter" + if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then + NAMESPACE="gpu-operator" + fi + kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s + echo "Verifying DCGM pods and Prometheus annotations..." + kubectl get pods -A | grep dcgm-exporter | awk ' + BEGIN { all_running = 1; pod_count = 0 } + { + pod_count++ + status = $4 + printf "Pod: %s/%s - Status: %s\n", $1, $2, status + if (status != "Running") all_running = 0 + } + END { + printf "\nTotal Pods: %d\n", pod_count + printf "All Running: %s\n", (all_running ? "true" : "false") + }' + kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done + + - name: Install and Verify DeepSeek Workload + run: | + kubectl create ns deepseek + kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml + + kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s + pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}') + + if [[ -n "$pod_status" ]]; then + echo "Pods are not in Running state. Failing the pipeline." + exit 1 + else + echo "All pods are running successfully." + fi + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.22' + cache: true + + - name: Install ZXPorter + run: | + ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" + echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" + make docker-build docker-push IMG=${ZXPORTER_IMG} + make deploy IMG=${ZXPORTER_IMG} + + echo "Waiting for ZXPorter pods to be ready..." + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s + + - name: Test Karpenter + if: inputs.karpenter_version != 'no_karpenter' + run: | + echo "Installing Karpenter Node Class and Node Pool..."
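+          # Derive the AMI alias release (e.g. v20240807) from the SSM recommended image; presumably this pins the Karpenter node template (the manifest below was lost in extraction) to a matching AL2023 AMI.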
+ ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${{ env.CLUSTER_VERSION }}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')" + echo "Using ALIAS_VERSION: ${ALIAS_VERSION}" + kubectl get nodes -o wide || true + cat < pf.log 2>&1 & + PF_PID=$! + sleep 20 + MAX_RETRIES=6 + for i in $(seq 1 $MAX_RETRIES); do + if curl -s "http://localhost:9090/-/ready" >/dev/null; then + echo "Prometheus port-forward is ready." + break + fi + echo "[$i/$MAX_RETRIES] Waiting for Prometheus to become ready..." + sleep 5 + done + + result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') + kill $PF_PID || true + + echo "Metric found: $result" + if [[ -z "$result" || "$result" == "[]" ]]; then + echo "❌ DCGM_FI_DEV_SM_CLOCK metric not found!" + echo "Port-forward log:" + cat pf.log + exit 1 + fi + + destroy-terraform: + name: Destroy Terraform + runs-on: ubuntu-latest + env: + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} + + if: always() + needs: + - apply-terraform + - install-and-validate + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Configure Kubernetes Access + if: inputs.karpenter_version != 'no_karpenter' + run: | + aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }} + + - name: Delete Karpenter Nodes + if: inputs.karpenter_version != 'no_karpenter' + run: | + kubectl delete deployment inflate + kubectl wait --for=delete deployment/inflate --timeout=300s + NODE_NAME=$(kubectl get nodes --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[1].metadata.name}') + kubectl delete node "${NODE_NAME}" + + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + + - name: Destroy Infrastructure + working-directory: terraform/aws + run: | + cat <<EOF > backend_override.tf + terraform { + backend "s3" { + bucket = "zxporter-tf-state" + key = "${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate" + region = "us-east-1" + } + } + EOF + terraform init + terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" -var='cluster_version=${{ env.CLUSTER_VERSION }}' diff --git a/.gitignore b/.gitignore index f2f57448..e5cfe436 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,8 @@ config/**/charts *.swp *.swo *~ + +# Terraform files +*.tfstate +*.tfstate.backup +.terraform* diff --git a/Makefile b/Makefile index a6be166d..a4bdfb11 100644 --- a/Makefile +++ b/Makefile @@ -125,11 +125,11 @@ help: ## Display this help. .PHONY: manifests manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. - $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases + $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases -w .PHONY: generate generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. - $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."
+ $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." -w .PHONY: fmt fmt: ## Run go fmt against code. diff --git a/config/prometheus/hack.prometheus.values.yaml b/config/prometheus/hack.prometheus.values.yaml index b1975764..db227b61 100644 --- a/config/prometheus/hack.prometheus.values.yaml +++ b/config/prometheus/hack.prometheus.values.yaml @@ -51,344 +51,84 @@ kube-state-metrics: # - roles serverFiles: - prometheus.yml: + prometheus.yml: + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts scrape_configs: - job_name: prometheus static_configs: - targets: - localhost:9090 - - # A scrape configuration for running Prometheus on a Kubernetes cluster. - # This uses separate scrape configs for cluster components (i.e. API server, node) - # and services to allow each to use different authentication configs. - # - # Kubernetes labels will be added as Prometheus labels on metrics via the - # `labelmap` relabeling action. - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Scrape config for API servers. -# # -# # Kubernetes exposes API servers as endpoints to the default/kubernetes -# # service so this uses `endpoints` role and uses relabelling to only keep -# # the endpoints associated with the default/kubernetes service using the -# # default named port `https`. This works for single API server deployments as -# # well as HA API server deployments. -# - job_name: 'kubernetes-apiservers' -# -# kubernetes_sd_configs: -# - role: endpoints -# -# # Default to scraping over https. If required, just disable this or change to -# # `http`. -# scheme: https -# -# # This TLS & bearer token file config is used to connect to the actual scrape -# # endpoints for cluster components. This is separate to discovery auth -# # configuration because discovery & scraping are two separate concerns in -# # Prometheus. The discovery auth config is automatic if Prometheus runs inside -# # the cluster. Otherwise, more config options have to be provided within the -# # . -# tls_config: -# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt -# # If your node certificates are self-signed or use a different CA to the -# # master CA, then disable certificate verification below. Note that -# # certificate verification is an integral part of a secure infrastructure -# # so this should only be disabled in a controlled environment. You can -# # disable certificate verification by uncommenting the line below. -# # -# # insecure_skip_verify: true -# bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token -# -# # Keep only the default/kubernetes service endpoints for the https port. This -# # will add targets for each API server which Kubernetes adds an endpoint to -# # the default/kubernetes service. -# relabel_configs: -# - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ] -# action: keep -# regex: default;kubernetes;https - - - job_name: 'kubernetes-nodes' - - # Default to scraping over https. If required, just disable this or change to - # `http`. + - job_name: kubernetes-nodes scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. 
Otherwise, more config options have to be provided within the - # . tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - # insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - role: node - relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - - source_labels: [ __meta_kubernetes_node_name ] + - source_labels: + - __meta_kubernetes_node_name regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/$1/proxy/metrics - - - - job_name: 'kubernetes-nodes-cadvisor' - - # Default to scraping over https. If required, just disable this or change to - # `http`. + - job_name: kubernetes-nodes-cadvisor scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - # insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - role: node - - # This configuration will work only on kubelet 1.7.3+ - # As the scrape endpoints for cAdvisor have changed - # if you are using older version you need to change the replacement to - # replacement: /api/v1/nodes/$1:4194/proxy/metrics - # more info here https://github.com/coreos/prometheus-operator/issues/633 relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - - source_labels: [ __meta_kubernetes_node_name ] + - source_labels: + - __meta_kubernetes_node_name regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - - # Metric relabel configs to apply to samples before ingestion. - # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) - # metric_relabel_configs: - # - action: labeldrop - # regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone) - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Scrape config for service endpoints. 
-# # -# # The relabeling allows the actual service scrape endpoint to be configured -# # via the following annotations: -# # -# # * `prometheus.io/scrape`: Only scrape services that have a value of -# # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: If the metrics are exposed on a different port to the -# # service then set this appropriately. -# # * `prometheus.io/param_`: If the metrics endpoint uses parameters -# # then you can set any parameter -# - job_name: 'kubernetes-service-endpoints' -# honor_labels: true -# -# kubernetes_sd_configs: -# - role: endpoints -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] -# action: drop -# regex: true -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] -# action: replace -# target_label: __scheme__ -# regex: (https?) -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] -# action: replace -# target_label: __address__ -# regex: (.+?)(?::\d+)?;(\d+) -# replacement: $1:$2 -# - action: labelmap -# regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_service_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_service_name ] -# action: replace -# target_label: service -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node - - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Scrape config for slow service endpoints; same as above, but with a larger -# # timeout and a larger interval -# # -# # The relabeling allows the actual service scrape endpoint to be configured -# # via the following annotations: -# # -# # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: If the metrics are exposed on a different port to the -# # service then set this appropriately. -# # * `prometheus.io/param_`: If the metrics endpoint uses parameters -# # then you can set any parameter -# - job_name: 'kubernetes-service-endpoints-slow' -# honor_labels: true -# -# scrape_interval: 5m -# scrape_timeout: 30s -# -# kubernetes_sd_configs: -# - role: endpoints -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] -# action: replace -# target_label: __scheme__ -# regex: (https?) 
-# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] -# action: replace -# target_label: __address__ -# regex: (.+?)(?::\d+)?;(\d+) -# replacement: $1:$2 -# - action: labelmap -# regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_service_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_service_name ] -# action: replace -# target_label: service -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node -# -# - job_name: 'prometheus-pushgateway' -# honor_labels: true -# -# kubernetes_sd_configs: -# - role: service -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] -# action: keep -# regex: pushgateway - - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Example scrape config for probing services via the Blackbox Exporter. -# # -# # The relabeling allows the actual service scrape endpoint to be configured -# # via the following annotations: -# # -# # * `prometheus.io/probe`: Only probe services that have a value of `true` -# - job_name: 'kubernetes-services' -# honor_labels: true -# -# metrics_path: /probe -# params: -# module: [ http_2xx ] -# -# kubernetes_sd_configs: -# - role: service -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] -# action: keep -# regex: true -# - source_labels: [ __address__ ] -# target_label: __param_target -# - target_label: __address__ -# replacement: blackbox -# - source_labels: [ __param_target ] -# target_label: instance -# - action: labelmap -# regex: __meta_kubernetes_service_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# target_label: namespace -# - source_labels: [ __meta_kubernetes_service_name ] -# target_label: service - - - # Example scrape config for pods - # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: - # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, - # except if `prometheus.io/scrape-slow` is set to `true` as well. - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. - - job_name: 'kubernetes-pods' + - job_name: kubernetes-pods honor_labels: true - kubernetes_sd_configs: - role: pod - relabel_configs: - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape action: keep regex: true - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow action: drop regex: true - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme action: replace regex: (https?) 
target_label: __scheme__ - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path action: replace - target_label: __metrics_path__ regex: (.+) - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] + target_label: __metrics_path__ + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip action: replace regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' + replacement: "[$2]:$1" target_label: __address__ - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip action: replace regex: (\d+);((([0-9]+?)(\.|$)){4}) replacement: $2:$1 @@ -398,77 +138,442 @@ serverFiles: replacement: __param_$1 - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [ __meta_kubernetes_namespace ] + - source_labels: + - __meta_kubernetes_namespace action: replace target_label: namespace - - source_labels: [ __meta_kubernetes_pod_name ] + - source_labels: + - __meta_kubernetes_pod_name action: replace target_label: pod - - source_labels: [ __meta_kubernetes_pod_phase ] + - source_labels: + - __meta_kubernetes_pod_phase regex: Pending|Succeeded|Failed|Completed action: drop - - source_labels: [ __meta_kubernetes_pod_node_name ] + - source_labels: + - __meta_kubernetes_pod_node_name action: replace target_label: node +# serverFiles: +# prometheus.yml: +# scrape_configs: +# - job_name: prometheus +# static_configs: +# - targets: +# - localhost:9090 + +# # A scrape configuration for running Prometheus on a Kubernetes cluster. +# # This uses separate scrape configs for cluster components (i.e. API server, node) +# # and services to allow each to use different authentication configs. +# # +# # Kubernetes labels will be added as Prometheus labels on metrics via the +# # `labelmap` relabeling action. + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Scrape config for API servers. +# # # +# # # Kubernetes exposes API servers as endpoints to the default/kubernetes +# # # service so this uses `endpoints` role and uses relabelling to only keep +# # # the endpoints associated with the default/kubernetes service using the +# # # default named port `https`. This works for single API server deployments as +# # # well as HA API server deployments. +# # - job_name: 'kubernetes-apiservers' +# # +# # kubernetes_sd_configs: +# # - role: endpoints +# # +# # # Default to scraping over https. If required, just disable this or change to +# # # `http`. +# # scheme: https +# # +# # # This TLS & bearer token file config is used to connect to the actual scrape +# # # endpoints for cluster components. This is separate to discovery auth +# # # configuration because discovery & scraping are two separate concerns in +# # # Prometheus. The discovery auth config is automatic if Prometheus runs inside +# # # the cluster. Otherwise, more config options have to be provided within the +# # # . +# # tls_config: +# # ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# # # If your node certificates are self-signed or use a different CA to the +# # # master CA, then disable certificate verification below. 
Note that +# # # certificate verification is an integral part of a secure infrastructure +# # # so this should only be disabled in a controlled environment. You can +# # # disable certificate verification by uncommenting the line below. +# # # +# # # insecure_skip_verify: true +# # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token +# # +# # # Keep only the default/kubernetes service endpoints for the https port. This +# # # will add targets for each API server which Kubernetes adds an endpoint to +# # # the default/kubernetes service. +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ] +# # action: keep +# # regex: default;kubernetes;https + +# - job_name: 'kubernetes-nodes' + +# # Default to scraping over https. If required, just disable this or change to +# # `http`. +# scheme: https + +# # This TLS & bearer token file config is used to connect to the actual scrape +# # endpoints for cluster components. This is separate to discovery auth +# # configuration because discovery & scraping are two separate concerns in +# # Prometheus. The discovery auth config is automatic if Prometheus runs inside +# # the cluster. Otherwise, more config options have to be provided within the +# # . +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# # If your node certificates are self-signed or use a different CA to the +# # master CA, then disable certificate verification below. Note that +# # certificate verification is an integral part of a secure infrastructure +# # so this should only be disabled in a controlled environment. You can +# # disable certificate verification by uncommenting the line below. +# # +# # insecure_skip_verify: true +# bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +# kubernetes_sd_configs: +# - role: node + +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) +# - target_label: __address__ +# replacement: kubernetes.default.svc:443 +# - source_labels: [ __meta_kubernetes_node_name ] +# regex: (.+) +# target_label: __metrics_path__ +# replacement: /api/v1/nodes/$1/proxy/metrics + + +# - job_name: 'kubernetes-nodes-cadvisor' + +# # Default to scraping over https. If required, just disable this or change to +# # `http`. +# scheme: https + +# # This TLS & bearer token file config is used to connect to the actual scrape +# # endpoints for cluster components. This is separate to discovery auth +# # configuration because discovery & scraping are two separate concerns in +# # Prometheus. The discovery auth config is automatic if Prometheus runs inside +# # the cluster. Otherwise, more config options have to be provided within the +# # . +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# # If your node certificates are self-signed or use a different CA to the +# # master CA, then disable certificate verification below. Note that +# # certificate verification is an integral part of a secure infrastructure +# # so this should only be disabled in a controlled environment. You can +# # disable certificate verification by uncommenting the line below. 
+# # +# # insecure_skip_verify: true +# bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +# kubernetes_sd_configs: +# - role: node + +# # This configuration will work only on kubelet 1.7.3+ +# # As the scrape endpoints for cAdvisor have changed +# # if you are using older version you need to change the replacement to +# # replacement: /api/v1/nodes/$1:4194/proxy/metrics +# # more info here https://github.com/coreos/prometheus-operator/issues/633 +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) +# - target_label: __address__ +# replacement: kubernetes.default.svc:443 +# - source_labels: [ __meta_kubernetes_node_name ] +# regex: (.+) +# target_label: __metrics_path__ +# replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + +# # Metric relabel configs to apply to samples before ingestion. +# # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) +# # metric_relabel_configs: +# # - action: labeldrop +# # regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone) + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Scrape config for service endpoints. +# # # +# # # The relabeling allows the actual service scrape endpoint to be configured +# # # via the following annotations: +# # # +# # # * `prometheus.io/scrape`: Only scrape services that have a value of +# # # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. +# # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # # * `prometheus.io/port`: If the metrics are exposed on a different port to the +# # # service then set this appropriately. +# # # * `prometheus.io/param_`: If the metrics endpoint uses parameters +# # # then you can set any parameter +# # - job_name: 'kubernetes-service-endpoints' +# # honor_labels: true +# # +# # kubernetes_sd_configs: +# # - role: endpoints +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape ] +# # action: keep +# # regex: true +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] +# # action: drop +# # regex: true +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] +# # action: replace +# # target_label: __scheme__ +# # regex: (https?) 
+# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] +# # action: replace +# # target_label: __metrics_path__ +# # regex: (.+) +# # - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] +# # action: replace +# # target_label: __address__ +# # regex: (.+?)(?::\d+)?;(\d+) +# # replacement: $1:$2 +# # - action: labelmap +# # regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) +# # replacement: __param_$1 +# # - action: labelmap +# # regex: __meta_kubernetes_service_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # action: replace +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_service_name ] +# # action: replace +# # target_label: service +# # - source_labels: [ __meta_kubernetes_pod_node_name ] +# # action: replace +# # target_label: node + + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Scrape config for slow service endpoints; same as above, but with a larger +# # # timeout and a larger interval +# # # +# # # The relabeling allows the actual service scrape endpoint to be configured +# # # via the following annotations: +# # # +# # # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` +# # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # # * `prometheus.io/port`: If the metrics are exposed on a different port to the +# # # service then set this appropriately. +# # # * `prometheus.io/param_`: If the metrics endpoint uses parameters +# # # then you can set any parameter +# # - job_name: 'kubernetes-service-endpoints-slow' +# # honor_labels: true +# # +# # scrape_interval: 5m +# # scrape_timeout: 30s +# # +# # kubernetes_sd_configs: +# # - role: endpoints +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] +# # action: keep +# # regex: true +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] +# # action: replace +# # target_label: __scheme__ +# # regex: (https?) +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] +# # action: replace +# # target_label: __metrics_path__ +# # regex: (.+) +# # - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] +# # action: replace +# # target_label: __address__ +# # regex: (.+?)(?::\d+)?;(\d+) +# # replacement: $1:$2 +# # - action: labelmap +# # regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) +# # replacement: __param_$1 +# # - action: labelmap +# # regex: __meta_kubernetes_service_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # action: replace +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_service_name ] +# # action: replace +# # target_label: service +# # - source_labels: [ __meta_kubernetes_pod_node_name ] +# # action: replace +# # target_label: node +# # +# # - job_name: 'prometheus-pushgateway' +# # honor_labels: true +# # +# # kubernetes_sd_configs: +# # - role: service +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] +# # action: keep +# # regex: pushgateway + + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Example scrape config for probing services via the Blackbox Exporter. 
+# # # +# # # The relabeling allows the actual service scrape endpoint to be configured +# # # via the following annotations: +# # # +# # # * `prometheus.io/probe`: Only probe services that have a value of `true` +# # - job_name: 'kubernetes-services' +# # honor_labels: true +# # +# # metrics_path: /probe +# # params: +# # module: [ http_2xx ] +# # +# # kubernetes_sd_configs: +# # - role: service +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] +# # action: keep +# # regex: true +# # - source_labels: [ __address__ ] +# # target_label: __param_target +# # - target_label: __address__ +# # replacement: blackbox +# # - source_labels: [ __param_target ] +# # target_label: instance +# # - action: labelmap +# # regex: __meta_kubernetes_service_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_service_name ] +# # target_label: service + + +# # Example scrape config for pods +# # +# # The relabeling allows the actual pod scrape endpoint to be configured via the +# # following annotations: +# # +# # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, +# # except if `prometheus.io/scrape-slow` is set to `true` as well. +# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. +# - job_name: 'kubernetes-pods' +# honor_labels: true + +# kubernetes_sd_configs: +# - role: pod + +# relabel_configs: +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ] +# action: keep +# regex: true +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] +# action: drop +# regex: true +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] +# action: replace +# regex: (https?) +# target_label: __scheme__ +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] +# action: replace +# target_label: __metrics_path__ +# regex: (.+) +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# action: replace +# regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) +# replacement: '[$2]:$1' +# target_label: __address__ +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# action: replace +# regex: (\d+);((([0-9]+?)(\.|$)){4}) +# replacement: $2:$1 +# target_label: __address__ +# - action: labelmap +# regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) +# replacement: __param_$1 +# - action: labelmap +# regex: __meta_kubernetes_pod_label_(.+) +# - source_labels: [ __meta_kubernetes_namespace ] +# action: replace +# target_label: namespace +# - source_labels: [ __meta_kubernetes_pod_name ] +# action: replace +# target_label: pod +# - source_labels: [ __meta_kubernetes_pod_phase ] +# regex: Pending|Succeeded|Failed|Completed +# action: drop +# - source_labels: [ __meta_kubernetes_pod_node_name ] +# action: replace +# target_label: node + -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Example Scrape config for pods which should be scraped slower. 
An useful example -# # would be stackriver-exporter which queries an API on every scrape of the pod -# # -# # The relabeling allows the actual pod scrape endpoint to be configured via the -# # following annotations: -# # -# # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. -# - job_name: 'kubernetes-pods-slow' -# honor_labels: true -# -# scrape_interval: 5m -# scrape_timeout: 30s -# -# kubernetes_sd_configs: -# - role: pod -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] -# action: replace -# regex: (https?) -# target_label: __scheme__ -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] -# action: replace -# regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) -# replacement: '[$2]:$1' -# target_label: __address__ -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] -# action: replace -# regex: (\d+);((([0-9]+?)(\.|$)){4}) -# replacement: $2:$1 -# target_label: __address__ -# - action: labelmap -# regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_pod_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_pod_name ] -# action: replace -# target_label: pod -# - source_labels: [ __meta_kubernetes_pod_phase ] -# regex: Pending|Succeeded|Failed|Completed -# action: drop -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Example Scrape config for pods which should be scraped slower. An useful example +# # # would be stackriver-exporter which queries an API on every scrape of the pod +# # # +# # # The relabeling allows the actual pod scrape endpoint to be configured via the +# # # following annotations: +# # # +# # # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` +# # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. +# # - job_name: 'kubernetes-pods-slow' +# # honor_labels: true +# # +# # scrape_interval: 5m +# # scrape_timeout: 30s +# # +# # kubernetes_sd_configs: +# # - role: pod +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] +# # action: keep +# # regex: true +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] +# # action: replace +# # regex: (https?) 
+# # target_label: __scheme__ +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] +# # action: replace +# # target_label: __metrics_path__ +# # regex: (.+) +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# # action: replace +# # regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) +# # replacement: '[$2]:$1' +# # target_label: __address__ +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# # action: replace +# # regex: (\d+);((([0-9]+?)(\.|$)){4}) +# # replacement: $2:$1 +# # target_label: __address__ +# # - action: labelmap +# # regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) +# # replacement: __param_$1 +# # - action: labelmap +# # regex: __meta_kubernetes_pod_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # action: replace +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_pod_name ] +# # action: replace +# # target_label: pod +# # - source_labels: [ __meta_kubernetes_pod_phase ] +# # regex: Pending|Succeeded|Failed|Completed +# # action: drop +# # - source_labels: [ __meta_kubernetes_pod_node_name ] +# # action: replace +# # target_label: node diff --git a/dist/install.yaml b/dist/install.yaml index c41fa0dc..b3e6a2e2 100644 --- a/dist/install.yaml +++ b/dist/install.yaml @@ -1229,4 +1229,4 @@ spec: volumes: - configMap: name: devzero-zxporter-env-config - name: config-volume + name: config-volume \ No newline at end of file diff --git a/nvidia-device-plugin-prereq/container-toolkit.yaml b/nvidia-device-plugin-prereq/container-toolkit.yaml new file mode 100644 index 00000000..17ada11b --- /dev/null +++ b/nvidia-device-plugin-prereq/container-toolkit.yaml @@ -0,0 +1,84 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-toolkit-installer + namespace: nvidia-device-plugin +spec: + selector: + matchLabels: + name: nvidia-toolkit-installer + template: + metadata: + labels: + name: nvidia-toolkit-installer + spec: + nodeSelector: + nvidia.com/gpu.present: "true" + hostPID: true + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "CriticalAddonsOnly" + operator: "Exists" + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + - effect: NoSchedule + key: node-role.kubernetes.io/master + containers: + - name: install-nvidia-toolkit + image: amazonlinux:2023 + securityContext: + privileged: true + command: + - /bin/bash + - -c + - | + set -ex + + # Add NVIDIA repo + curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ + -o /etc/yum.repos.d/nvidia-container-toolkit.repo + + # Install toolkit + yum install -y nvidia-container-toolkit + + # Configure containerd + nvidia-ctk runtime configure --runtime=containerd + + # Restart containerd + systemctl restart containerd || true + + # Exit cleanly + echo "NVIDIA container toolkit installed and configured." 
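+              # Keep the container alive after a successful install so the DaemonSet pod stays Running instead of restart-looping.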
+ sleep infinity + volumeMounts: + - name: root + mountPath: /host + mountPropagation: Bidirectional + - name: containerd-config + mountPath: /etc/containerd + - name: systemd + mountPath: /run/systemd + - name: modules + mountPath: /lib/modules + readOnly: true + - name: dev + mountPath: /dev + volumes: + - name: root + hostPath: + path: / + - name: containerd-config + hostPath: + path: /etc/containerd + - name: systemd + hostPath: + path: /run/systemd + - name: modules + hostPath: + path: /lib/modules + - name: dev + hostPath: + path: /dev + restartPolicy: Always diff --git a/nvidia-device-plugin-prereq/driver-installer.yaml b/nvidia-device-plugin-prereq/driver-installer.yaml new file mode 100644 index 00000000..7f04e106 --- /dev/null +++ b/nvidia-device-plugin-prereq/driver-installer.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-driver-installer + namespace: nvidia-device-plugin +spec: + selector: + matchLabels: + name: nvidia-driver-installer + template: + metadata: + labels: + name: nvidia-driver-installer + spec: + hostPID: true + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: CriticalAddonsOnly + operator: Exists + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule + - key: node-role.kubernetes.io/master + effect: NoSchedule + nodeSelector: + nvidia.com/gpu.present: "true" + containers: + - name: driver-installer + image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0 + securityContext: + privileged: true + env: + - name: NVIDIA_DRIVER_VERSION + value: "535.129.03" # or the version you require + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: root + mountPath: /host + mountPropagation: Bidirectional + - name: modules + mountPath: /lib/modules + readOnly: true + - name: nvidia-local + mountPath: /host/usr/local/nvidia + - name: fix-dcgm-dir + image: amazonlinux:2023 + securityContext: + privileged: true + command: ["/bin/bash", "-c"] + args: + - | + set -ex + TARGET_DIR="/host/usr/local/nvidia" + # If it doesn't exist, symlink something useful + if [ ! -d "$TARGET_DIR" ]; then + mkdir -p /host/usr/local + ln -s /usr/lib64 "$TARGET_DIR" + fi + echo "/usr/local/nvidia set up for DCGM." 
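+            # Brief pause before exiting; with restartPolicy: Always this container simply re-runs and re-verifies the symlink.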
+ sleep 10 + volumeMounts: + - name: nvidia-local + mountPath: /host/usr/local/nvidia + - name: root + mountPath: /host + mountPropagation: Bidirectional + volumes: + - name: root + hostPath: + path: / + - name: modules + hostPath: + path: /lib/modules + - name: nvidia-local + hostPath: + path: /usr/local/nvidia + type: DirectoryOrCreate diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf new file mode 100644 index 00000000..252ad5c2 --- /dev/null +++ b/terraform/aws/main.tf @@ -0,0 +1,363 @@ +provider "aws" { + region = "us-east-1" +} + +data "aws_caller_identity" "current" {} + +# VPC Configuration +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + + name = "${var.cluster_name}-vpc" + cidr = "10.0.0.0/16" + + azs = ["us-east-1a", "us-east-1b"] + private_subnets = ["10.0.1.0/24", "10.0.2.0/24"] + public_subnets = ["10.0.101.0/24", "10.0.102.0/24"] + + enable_nat_gateway = true + single_nat_gateway = true + + # Required for EKS + enable_dns_hostnames = true + enable_dns_support = true + + public_subnet_tags = { + "kubernetes.io/cluster/${var.cluster_name}" = "shared" + "kubernetes.io/role/elb" = "1" + } + + private_subnet_tags = { + "kubernetes.io/cluster/${var.cluster_name}" = "shared" + "kubernetes.io/role/internal-elb" = "1" + "karpenter.sh/discovery" = "${var.cluster_name}" + } +} + +# IAM Roles and Policies for Karpenter +resource "aws_iam_role" "karpenter_node_role" { + name = "KarpenterNodeRole-${var.cluster_name}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "ec2.amazonaws.com" + } + Action = "sts:AssumeRole" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_role_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_ssm_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_registry_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryPullOnly" +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_admin_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess" +} + +resource "aws_iam_role" "karpenter_controller_role" { + name = "KarpenterControllerRole-${var.cluster_name}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/oidc.eks.${var.region}.amazonaws.com/id/${split("/id/", module.eks.cluster_oidc_issuer_url)[1]}" + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "oidc.eks.${var.region}.amazonaws.com/id/${split("/id/", module.eks.cluster_oidc_issuer_url)[1]}:sub" = "system:serviceaccount:kube-system:karpenter" + } + } + } + ] + }) +} + +resource "aws_iam_policy" "karpenter_controller_policy" { + name = "KarpenterControllerPolicy-${var.cluster_name}" + description = "Custom Karpenter controller policy for managing EC2 instances, IAM roles, and EKS." 
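+  # Broadly mirrors the upstream Karpenter controller policy: scoped provisioning actions, conditional instance termination, instance-profile management, and interruption-queue access.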
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = [
+          "ssm:GetParameter",
+          "ec2:DescribeImages",
+          "ec2:RunInstances",
+          "ec2:DescribeSubnets",
+          "ec2:DescribeSecurityGroups",
+          "ec2:DescribeLaunchTemplates",
+          "ec2:DescribeInstances",
+          "ec2:DescribeInstanceTypes",
+          "ec2:DescribeInstanceTypeOfferings",
+          "ec2:DeleteLaunchTemplate",
+          "ec2:CreateTags",
+          "ec2:CreateLaunchTemplate",
+          "ec2:CreateFleet",
+          "ec2:DescribeSpotPriceHistory",
+          "pricing:GetProducts"
+        ]
+        Effect   = "Allow"
+        Resource = "*"
+        Sid      = "Karpenter"
+      },
+      {
+        Action = "ec2:TerminateInstances"
+        Condition = {
+          StringLike = {
+            "ec2:ResourceTag/karpenter.sh/nodepool" = "*"
+          }
+        }
+        Effect   = "Allow"
+        Resource = "*"
+        Sid      = "ConditionalEC2Termination"
+      },
+      {
+        Effect   = "Allow"
+        Action   = "iam:PassRole"
+        Resource = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/KarpenterNodeRole-${var.cluster_name}"
+        Sid      = "PassNodeIAMRole"
+      },
+      {
+        Effect   = "Allow"
+        Action   = "eks:DescribeCluster"
+        Resource = "arn:aws:eks:${var.region}:${data.aws_caller_identity.current.account_id}:cluster/${var.cluster_name}"
+        Sid      = "EKSClusterEndpointLookup"
+      },
+      {
+        Sid      = "AllowScopedInstanceProfileCreationActions"
+        Effect   = "Allow"
+        Resource = "*"
+        Action   = ["iam:CreateInstanceProfile"]
+        Condition = {
+          StringEquals = {
+            "aws:RequestTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:RequestTag/topology.kubernetes.io/region"             = var.region
+          }
+          StringLike = {
+            "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+          }
+        }
+      },
+      {
+        Sid      = "AllowScopedInstanceProfileTagActions"
+        Effect   = "Allow"
+        Resource = "*"
+        Action   = ["iam:TagInstanceProfile"]
+        Condition = {
+          StringEquals = {
+            "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:ResourceTag/topology.kubernetes.io/region"             = var.region
+            "aws:RequestTag/kubernetes.io/cluster/${var.cluster_name}"  = "owned"
+            "aws:RequestTag/topology.kubernetes.io/region"              = var.region
+          }
+          StringLike = {
+            "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+            "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass"  = "*"
+          }
+        }
+      },
+      {
+        Sid      = "AllowScopedInstanceProfileActions"
+        Effect   = "Allow"
+        Resource = "*"
+        Action = [
+          "iam:AddRoleToInstanceProfile",
+          "iam:RemoveRoleFromInstanceProfile",
+          "iam:DeleteInstanceProfile"
+        ]
+        Condition = {
+          StringEquals = {
+            "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:ResourceTag/topology.kubernetes.io/region"             = var.region
+          }
+          StringLike = {
+            "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+          }
+        }
+      },
+      {
+        Sid      = "AllowInstanceProfileReadActions"
+        Effect   = "Allow"
+        Resource = "*"
+        Action   = "iam:GetInstanceProfile"
+      },
+      {
+        Effect = "Allow"
+        Action = [
+          "sqs:DeleteMessage",
+          "sqs:GetQueueUrl",
+          "sqs:GetQueueAttributes",
+          "sqs:ReceiveMessage"
+        ]
+        Resource = "*"
+        Sid      = "KarpenterInterruptionQueue"
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_controller_custom_policy_attachment" {
+  role       = aws_iam_role.karpenter_controller_role.name
+  policy_arn = aws_iam_policy.karpenter_controller_policy.arn
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_controller_policy_attachment" {
+  role       = aws_iam_role.karpenter_controller_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_controller_admin_policy_attachment" {
+  role       = aws_iam_role.karpenter_controller_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
+}
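+
+# AdministratorAccess on top of the scoped policy above makes the scoped
+# statements redundant in practice; presumably it is attached so a failed CI
+# run points at Karpenter itself rather than IAM, but it should not be copied
+# into a production setup.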
"arn:aws:iam::aws:policy/AdministratorAccess" +} + +# EKS Cluster Configuration +module "eks" { + source = "terraform-aws-modules/eks/aws" + + cluster_name = var.cluster_name + cluster_version = var.cluster_version + + # Add VPC configuration + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + enable_irsa = true + enable_cluster_creator_admin_permissions = true + cluster_endpoint_public_access = true + cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"] + + create_node_iam_role = false + + tags = { + "karpenter.sh/discovery" = var.cluster_name + } + + eks_managed_node_groups = { + gpu_nodes = { + instance_types = ["g6.4xlarge"] + desired_size = 1 + min_size = 1 + max_size = 1 + + ami_type = "AL2023_x86_64_NVIDIA" + use_custom_launch_template = false + + metadata_options = { + http_endpoint = "enabled" + http_tokens = "optional" + http_put_response_hop_limit = 2 + instance_metadata_tags = "enabled" + } + + disk_size = 200 + labels = { + node_type = "gpu" + } + + # Attach the IAM role for Karpenter to the managed node group + iam_instance_profile = aws_iam_role.karpenter_node_role.name + } + } +} + +resource "aws_security_group" "karpenter_sg" { + name = "karpenter-sg-${var.cluster_name}" + description = "Karpenter security group" + vpc_id = module.vpc.vpc_id + + tags = { + "karpenter.sh/discovery" = "${var.cluster_name}" + } +} + +resource "aws_security_group_rule" "karpenter_inbound" { + security_group_id = aws_security_group.karpenter_sg.id + type = "ingress" + from_port = 0 + to_port = 65535 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] +} + +resource "aws_sqs_queue" "karpenter_interruption_queue" { + name = "${var.cluster_name}-karpenter-interruption" + sqs_managed_sse_enabled = true + + tags = { + "karpenter.sh/discovery" = var.cluster_name + } +} + +resource "aws_sqs_queue_policy" "karpenter_interruption_queue_policy" { + queue_url = aws_sqs_queue.karpenter_interruption_queue.url + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AllowKarpenterController" + Effect = "Allow" + Principal = { + AWS = aws_iam_role.karpenter_controller_role.arn + } + Action = [ + "sqs:DeleteMessage", + "sqs:GetQueueUrl", + "sqs:GetQueueAttributes", + "sqs:ReceiveMessage" + ] + Resource = aws_sqs_queue.karpenter_interruption_queue.arn + }, + { + Sid = "EC2SpotInterruption" + Effect = "Allow" + Principal = { + Service = ["events.amazonaws.com", "sqs.amazonaws.com"] + } + Action = ["sqs:SendMessage"] + Resource = aws_sqs_queue.karpenter_interruption_queue.arn + } + ] + }) +} + +resource "aws_cloudwatch_event_rule" "spot_interruption" { + name = "${var.cluster_name}-spot-interruption" + description = "Capture EC2 Spot Instance interruption notices" + + event_pattern = jsonencode({ + source = ["aws.ec2"] + detail-type = ["EC2 Spot Instance Interruption Warning"] + }) +} + +resource "aws_cloudwatch_event_target" "spot_interruption" { + target_id = "KarpenterInterruptionQueueTarget" + rule = aws_cloudwatch_event_rule.spot_interruption.name + arn = aws_sqs_queue.karpenter_interruption_queue.arn +} \ No newline at end of file diff --git a/terraform/aws/terraform.tfvars b/terraform/aws/terraform.tfvars new file mode 100644 index 00000000..6e098115 --- /dev/null +++ b/terraform/aws/terraform.tfvars @@ -0,0 +1,3 @@ +cluster_name = "devzero-gpu-cluster" +cluster_version = "1.30" +region = "us-east-1" \ No newline at end of file diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf new file mode 100644 index 00000000..741aed7d --- /dev/null +++ 
+cluster_name    = "devzero-gpu-cluster"
+cluster_version = "1.30"
+region          = "us-east-1"
\ No newline at end of file
diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf
new file mode 100644
index 00000000..741aed7d
--- /dev/null
+++ b/terraform/aws/variables.tf
@@ -0,0 +1,14 @@
+variable "cluster_name" {
+  description = "The name of the EKS cluster"
+  type        = string
+}
+
+variable "cluster_version" {
+  description = "The Kubernetes version for the EKS cluster"
+  type        = string
+}
+
+variable "region" {
+  description = "The AWS region of the EKS cluster"
+  type        = string
+}
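+
+# Optional hardening (an assumption, not required by this change): pinning the
+# AWS provider guards CI runs against provider drift. The v20+ line of
+# terraform-aws-modules/eks, which enable_cluster_creator_admin_permissions in
+# main.tf relies on, assumes a v5 AWS provider, so a conservative floor is:
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 5.0"
+    }
+  }
+}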