diff --git a/.github/workflows/aws-gpu-test.yaml b/.github/workflows/aws-gpu-test.yaml new file mode 100644 index 00000000..ce3d4bea --- /dev/null +++ b/.github/workflows/aws-gpu-test.yaml @@ -0,0 +1,498 @@ +name: AWS GPU Test + +on: + push: + branches: + - garvit/aws-gpu-test + workflow_dispatch: + inputs: + gpu_install_type: + description: 'GPU installation type' + required: false + default: 'nvidia-device-plugin' + type: choice + options: + - gpu-operator + - nvidia-device-plugin + dcgm_install_type: + description: 'DCGM install type' + required: false + default: 'devzero-dcgm' + type: choice + options: + - nvidia-dcgm + - devzero-dcgm + cluster_version: + description: 'Kubernetes cluster version' + required: false + default: '1.30' + type: choice + options: + - '1.26' + - '1.27' + - '1.28' + - '1.29' + - '1.30' + - '1.31' + - '1.32' + - '1.33' + karpenter_version: + description: 'Karpenter Version' + required: false + default: '0.37.7' + type: choice + options: + - 'no_karpenter' + - '0.37.7' + +permissions: + id-token: write + contents: read + +jobs: + apply-terraform: + name: Apply Terraform + runs-on: ubuntu-latest + env: + GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} + + outputs: + job_identifier: ${{ steps.job-identifier.outputs.job_identifier }} + + steps: + - name: Validate Inputs + run: | + echo "GPU_INSTALL_TYPE=${GPU_INSTALL_TYPE}" + echo "DCGM_INSTALL_TYPE=${DCGM_INSTALL_TYPE}" + + if [[ "$GPU_INSTALL_TYPE" == "nvidia-device-plugin" && "$DCGM_INSTALL_TYPE" != "devzero-dcgm" ]]; then + echo "Error: When GPU_INSTALL_TYPE is 'nvidia-device-plugin', DCGM_INSTALL_TYPE must be 'devzero-dcgm'." 
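+            # The standalone device plugin ships no DCGM exporter, so GPU telemetry in that mode has to come from the devzero-dcgm installer.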
+ exit 1 + fi + + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Generate Unique Job Identifier + id: job-identifier + shell: bash + run: | + SHORT_SHA=$(git rev-parse --short HEAD) + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then + SUFFIX="dd" + else + SUFFIX="nd" + fi + JOB_IDENTIFIER="gh-ci-ro-${SHORT_SHA}-${SUFFIX}" + echo "JOB_IDENTIFIER=${JOB_IDENTIFIER}" >> $GITHUB_ENV + echo "job_identifier=${JOB_IDENTIFIER}" >> $GITHUB_OUTPUT + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + + - name: Apply Terraform + working-directory: terraform/aws + run: | + cat <<EOF > backend_override.tf + terraform { + backend "s3" { + bucket = "zxporter-tf-state" + key = "${JOB_IDENTIFIER}/terraform.tfstate" + region = "us-east-1" + } + } + EOF + terraform init + terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER" -var='cluster_version=${{ env.CLUSTER_VERSION }}' + + install-and-validate: + name: Install and Validate GPU Resources and ZXPorter + runs-on: ubuntu-latest + needs: apply-terraform + env: + GPU_INSTALL_TYPE: ${{ github.event.inputs.gpu_install_type || 'nvidia-device-plugin' }} + DCGM_INSTALL_TYPE: ${{ github.event.inputs.dcgm_install_type || 'devzero-dcgm' }} + KARPENTER_VERSION: ${{ github.event.inputs.karpenter_version || '0.37.7' }} + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Install yq + run: | + sudo wget https://github.com/mikefarah/yq/releases/download/v4.35.2/yq_linux_amd64 -O /usr/local/bin/yq + sudo chmod +x /usr/local/bin/yq + + - name: Configure Kubernetes Access + run: | + aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }} + + - name: Add new mapRole to aws-auth ConfigMap + if: env.KARPENTER_VERSION != 'no_karpenter' + run: | + kubectl get configmap/aws-auth -n kube-system -o yaml > aws-auth.yaml + yq eval '.data.mapRoles |= . + "- groups:\n - system:bootstrappers\n - system:nodes\n rolearn: arn:aws:iam::484907513542:role/KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}\n username: system:node:{{EC2PrivateDNSName}}\n"' -i aws-auth.yaml + kubectl apply -f aws-auth.yaml + kubectl get configmap/aws-auth -n kube-system -o yaml + + - name: Install Karpenter (if needed) + if: env.KARPENTER_VERSION != 'no_karpenter' + run: | + echo "Installing Karpenter..."
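+          # Resolve the account ID and API endpoint so the chart can be wired to the Terraform-created IRSA controller role and this cluster's API server.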
+ AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" + CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ needs.apply-terraform.outputs.job_identifier }} --query "cluster.endpoint" --output text)" + KARPENTER_IAM_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterControllerRole-${{ needs.apply-terraform.outputs.job_identifier }}" + echo "Karpenter IAM Role ARN: ${KARPENTER_IAM_ROLE_ARN}" + echo "Cluster Endpoint: ${CLUSTER_ENDPOINT}" + helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \ + --version "${KARPENTER_VERSION}" \ + --namespace kube-system \ + --create-namespace \ + --set settings.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.clusterName="${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.clusterEndpoint="${CLUSTER_ENDPOINT}" \ + --set settings.aws.defaultInstanceProfile="KarpenterNodeRole-${{ needs.apply-terraform.outputs.job_identifier }}" \ + --set settings.aws.interruptionQueueName="${{ needs.apply-terraform.outputs.job_identifier }}-karpenter-interruption" \ + --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \ + --set controller.resources.requests.cpu="1" \ + --set controller.resources.requests.memory="1Gi" \ + --set controller.resources.limits.cpu="1" \ + --set controller.resources.limits.memory="1Gi" \ + --wait + + - name: Check GPU Availability + id: gpu_check + run: | + echo "Checking GPU resources on nodes..." + if kubectl describe nodes | grep -q "nvidia.com/gpu"; then + echo "GPU resources are available on the nodes." + echo "GPU_CHECK=true" >> $GITHUB_ENV + else + echo "GPU check failed" + echo "GPU_CHECK=false" >> $GITHUB_ENV + fi + + - name: Install GPU Operator (if needed) + if: env.GPU_CHECK == 'false' && env.GPU_INSTALL_TYPE == 'gpu-operator' + run: | + echo "GPU resources not found, installing GPU Operator..." + kubectl create ns gpu-operator + kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged --overwrite + kubectl get nodes -o json | jq '.items[].metadata.labels | keys | any(startswith("feature.node.kubernetes.io"))' || true + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && \ + helm repo update + INSTALL_CMD="helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v25.3.0" + if [[ "$DCGM_INSTALL_TYPE" == "devzero-dcgm" ]]; then + INSTALL_CMD="$INSTALL_CMD --set dcgmExporter.enabled=false" + fi + echo "Running: $INSTALL_CMD" + $INSTALL_CMD + + - name: Install Nvidia Device Plugin + if: env.GPU_INSTALL_TYPE == 'nvidia-device-plugin' && env.GPU_CHECK == 'false' + run: | + echo "Installing Nvidia Device Plugin..." + kubectl get nodes -l node_type=gpu -o jsonpath='{.items[*].metadata.name}' | xargs -I {} kubectl label node {} nvidia.com/gpu=true nvidia.com/mps.capable=true nvidia.com/gpu.present=true --overwrite + kubectl create ns nvidia-device-plugin + kubectl apply -f nvidia-device-plugin-prereq + helm repo add nvdp https://nvidia.github.io/k8s-device-plugin + helm repo update + helm upgrade -i nvdp nvdp/nvidia-device-plugin \ + --namespace nvidia-device-plugin \ + --version 0.17.1 + + - name: Check GPU Availability After GPU Installation + if: env.GPU_CHECK == 'false' + run: | + echo "Re-checking GPU resources on nodes after GPU installation..." + if kubectl describe nodes | grep -q "nvidia.com/gpu"; then + echo "GPU resources are available on the nodes."
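+            # "nvidia.com/gpu" appearing in node capacity means the driver and device plugin have registered the GPUs with the kubelet.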
+ else + echo "GPU check failed after GPU installation" + exit 1 + fi + + - name: Check Nvidia DCGM DaemonSet + id: dcgm_check + if: ${{ env.DCGM_INSTALL_TYPE == 'nvidia-dcgm' }} + run: | + echo "Checking if DCGM DaemonSet is installed..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "Nvidia DCGM found, proceeding with validation." + else + echo "Nvidia DCGM not found." + exit 1 + fi + + - name: Install DevZero DCGM + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} + run: | + echo "Installing DCGM Exporter..." + kubectl create ns devzero-zxporter + curl https://raw.githubusercontent.com/devzero-inc/zxporter/refs/heads/main/dcgm-installers/eks.yml | kubectl apply -f - + + - name: Check DCGM DaemonSet After Installing DCGM Exporter + if: ${{ env.DCGM_INSTALL_TYPE == 'devzero-dcgm' }} + run: | + echo "Re-checking DCGM pods after DCGM Exporter installation..." + if kubectl get daemonset -A | grep -q dcgm; then + echo "DCGM DaemonSet is running." + else + echo "DCGM DaemonSet not running after installation" + exit 1 + fi + + - name: Verify DCGM Pods and Prometheus Annotations + run: | + NAMESPACE="devzero-zxporter" + if [[ "$DCGM_INSTALL_TYPE" == "nvidia-dcgm" ]]; then + NAMESPACE="gpu-operator" + fi + kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep dcgm-exporter | xargs -r -I {} kubectl wait --for=condition=Ready pod {} -n $NAMESPACE --timeout=300s + echo "Verifying DCGM pods and Prometheus annotations..." + kubectl get pods -A | grep dcgm-exporter | awk ' + BEGIN { all_running = 1; pod_count = 0 } + { + pod_count++ + status = $4 + printf "Pod: %s/%s - Status: %s\n", $1, $2, status + if (status != "Running") all_running = 0 + } + END { + printf "\nTotal Pods: %d\n", pod_count + printf "All Running: %s\n", (all_running ? "true" : "false") + }' + kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.name | contains("dcgm-exporter")) | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace pod; do kubectl annotate pod $pod -n $namespace prometheus.io/scrape=true --overwrite; done + + - name: Install and Verify DeepSeek Workload + run: | + kubectl create ns deepseek + kubectl apply -f https://gist.githubusercontent.com/Tzvonimir/a168dcc1515d3bf89254c34010e16d37/raw/4b154383f4e254c9490d4815e85aa5f574eb26eb/install-test-deepseek.yaml + + kubectl wait --for=condition=ready pod -n deepseek --all --timeout=600s + pod_status=$(kubectl get pods -n deepseek --field-selector=status.phase!=Running -o jsonpath='{.items[*].status.phase}') + + if [[ -n "$pod_status" ]]; then + echo "Pods are not in Running state. Failing the pipeline." + exit 1 + else + echo "All pods are running successfully." + fi + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.22' + cache: true + + - name: Install ZXPorter + run: | + ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" + echo "Building and pushing zxporter image: ${ZXPORTER_IMG}" + make docker-build docker-push IMG=${ZXPORTER_IMG} + make deploy IMG=${ZXPORTER_IMG} + + echo "Waiting for ZXPorter pods to be ready..." + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server -n devzero-zxporter --timeout=300s + + - name: Test Karpenter + if: inputs.karpenter_version != 'no_karpenter' + run: | + echo "Installing Karpenter Node Class and Node Pool..."
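+          # Derive the AMI alias release (e.g. v20240807) from the SSM recommended image; presumably this pins the Karpenter node template (the manifest below was lost in extraction) to a matching AL2023 AMI.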
+ ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${{ env.CLUSTER_VERSION }}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')" + echo "Using ALIAS_VERSION: ${ALIAS_VERSION}" + kubectl get nodes -o wide || true + cat < pf.log 2>&1 & + PF_PID=$! + sleep 20 + MAX_RETRIES=6 + for i in $(seq 1 $MAX_RETRIES); do + if curl -s "http://localhost:9090/-/ready" >/dev/null; then + echo "Prometheus port-forward is ready." + break + fi + echo "[$i/$MAX_RETRIES] Waiting for Prometheus to become ready..." + sleep 5 + done + + result=$(curl -s "http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_SM_CLOCK" | jq -r '.data.result') + kill $PF_PID || true + + echo "Metric found: $result" + if [[ -z "$result" || "$result" == "[]" ]]; then + echo "❌ DCGM_FI_DEV_SM_CLOCK metric not found!" + echo "Port-forward log:" + cat pf.log + exit 1 + fi + + destroy-terraform: + name: Destroy Terraform + runs-on: ubuntu-latest + env: + CLUSTER_VERSION: ${{ github.event.inputs.cluster_version || '1.30' }} + + if: always() + needs: + - apply-terraform + - install-and-validate + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::484907513542:role/github-actions-oidc-role + aws-region: us-east-1 + + - name: Configure Kubernetes Access + if: inputs.karpenter_version != 'no_karpenter' + run: | + aws eks update-kubeconfig --region us-east-1 --name ${{ needs.apply-terraform.outputs.job_identifier }} + + - name: Delete Karpenter Nodes + if: inputs.karpenter_version != 'no_karpenter' + run: | + kubectl delete deployment inflate + kubectl wait --for=delete deployment/inflate --timeout=300s + NODE_NAME=$(kubectl get nodes --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[1].metadata.name}') + kubectl delete node "${NODE_NAME}" + + + - name: Set up Terraform + uses: hashicorp/setup-terraform@v3 + + - name: Destroy Infrastructure + working-directory: terraform/aws + run: | + cat <<EOF > backend_override.tf + terraform { + backend "s3" { + bucket = "zxporter-tf-state" + key = "${{ needs.apply-terraform.outputs.job_identifier }}/terraform.tfstate" + region = "us-east-1" + } + } + EOF + terraform init + terraform destroy -auto-approve -var="cluster_name=${{ needs.apply-terraform.outputs.job_identifier }}" -var='cluster_version=${{ env.CLUSTER_VERSION }}' diff --git a/.gitignore b/.gitignore index f2f57448..e5cfe436 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,8 @@ config/**/charts *.swp *.swo *~ + +# Terraform files +*.tfstate +*.tfstate.backup +.terraform* diff --git a/Makefile b/Makefile index a6be166d..a4bdfb11 100644 --- a/Makefile +++ b/Makefile @@ -125,11 +125,11 @@ help: ## Display this help. .PHONY: manifests manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. - $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases + $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases -w .PHONY: generate generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. - $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."
+ $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." -w .PHONY: fmt fmt: ## Run go fmt against code. diff --git a/config/prometheus/hack.prometheus.values.yaml b/config/prometheus/hack.prometheus.values.yaml index b1975764..db227b61 100644 --- a/config/prometheus/hack.prometheus.values.yaml +++ b/config/prometheus/hack.prometheus.values.yaml @@ -51,344 +51,84 @@ kube-state-metrics: # - roles serverFiles: - prometheus.yml: + prometheus.yml: + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts scrape_configs: - job_name: prometheus static_configs: - targets: - localhost:9090 - - # A scrape configuration for running Prometheus on a Kubernetes cluster. - # This uses separate scrape configs for cluster components (i.e. API server, node) - # and services to allow each to use different authentication configs. - # - # Kubernetes labels will be added as Prometheus labels on metrics via the - # `labelmap` relabeling action. - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Scrape config for API servers. -# # -# # Kubernetes exposes API servers as endpoints to the default/kubernetes -# # service so this uses `endpoints` role and uses relabelling to only keep -# # the endpoints associated with the default/kubernetes service using the -# # default named port `https`. This works for single API server deployments as -# # well as HA API server deployments. -# - job_name: 'kubernetes-apiservers' -# -# kubernetes_sd_configs: -# - role: endpoints -# -# # Default to scraping over https. If required, just disable this or change to -# # `http`. -# scheme: https -# -# # This TLS & bearer token file config is used to connect to the actual scrape -# # endpoints for cluster components. This is separate to discovery auth -# # configuration because discovery & scraping are two separate concerns in -# # Prometheus. The discovery auth config is automatic if Prometheus runs inside -# # the cluster. Otherwise, more config options have to be provided within the -# # . -# tls_config: -# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt -# # If your node certificates are self-signed or use a different CA to the -# # master CA, then disable certificate verification below. Note that -# # certificate verification is an integral part of a secure infrastructure -# # so this should only be disabled in a controlled environment. You can -# # disable certificate verification by uncommenting the line below. -# # -# # insecure_skip_verify: true -# bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token -# -# # Keep only the default/kubernetes service endpoints for the https port. This -# # will add targets for each API server which Kubernetes adds an endpoint to -# # the default/kubernetes service. -# relabel_configs: -# - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ] -# action: keep -# regex: default;kubernetes;https - - - job_name: 'kubernetes-nodes' - - # Default to scraping over https. If required, just disable this or change to - # `http`. + - job_name: kubernetes-nodes scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. 
Otherwise, more config options have to be provided within the - # . tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - # insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - role: node - relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - - source_labels: [ __meta_kubernetes_node_name ] + - source_labels: + - __meta_kubernetes_node_name regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/$1/proxy/metrics - - - - job_name: 'kubernetes-nodes-cadvisor' - - # Default to scraping over https. If required, just disable this or change to - # `http`. + - job_name: kubernetes-nodes-cadvisor scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - # insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - role: node - - # This configuration will work only on kubelet 1.7.3+ - # As the scrape endpoints for cAdvisor have changed - # if you are using older version you need to change the replacement to - # replacement: /api/v1/nodes/$1:4194/proxy/metrics - # more info here https://github.com/coreos/prometheus-operator/issues/633 relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - - source_labels: [ __meta_kubernetes_node_name ] + - source_labels: + - __meta_kubernetes_node_name regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - - # Metric relabel configs to apply to samples before ingestion. - # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) - # metric_relabel_configs: - # - action: labeldrop - # regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone) - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Scrape config for service endpoints. 
-# # -# # The relabeling allows the actual service scrape endpoint to be configured -# # via the following annotations: -# # -# # * `prometheus.io/scrape`: Only scrape services that have a value of -# # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: If the metrics are exposed on a different port to the -# # service then set this appropriately. -# # * `prometheus.io/param_`: If the metrics endpoint uses parameters -# # then you can set any parameter -# - job_name: 'kubernetes-service-endpoints' -# honor_labels: true -# -# kubernetes_sd_configs: -# - role: endpoints -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] -# action: drop -# regex: true -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] -# action: replace -# target_label: __scheme__ -# regex: (https?) -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] -# action: replace -# target_label: __address__ -# regex: (.+?)(?::\d+)?;(\d+) -# replacement: $1:$2 -# - action: labelmap -# regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_service_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_service_name ] -# action: replace -# target_label: service -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node - - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Scrape config for slow service endpoints; same as above, but with a larger -# # timeout and a larger interval -# # -# # The relabeling allows the actual service scrape endpoint to be configured -# # via the following annotations: -# # -# # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: If the metrics are exposed on a different port to the -# # service then set this appropriately. -# # * `prometheus.io/param_`: If the metrics endpoint uses parameters -# # then you can set any parameter -# - job_name: 'kubernetes-service-endpoints-slow' -# honor_labels: true -# -# scrape_interval: 5m -# scrape_timeout: 30s -# -# kubernetes_sd_configs: -# - role: endpoints -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] -# action: replace -# target_label: __scheme__ -# regex: (https?) 
-# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] -# action: replace -# target_label: __address__ -# regex: (.+?)(?::\d+)?;(\d+) -# replacement: $1:$2 -# - action: labelmap -# regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_service_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_service_name ] -# action: replace -# target_label: service -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node -# -# - job_name: 'prometheus-pushgateway' -# honor_labels: true -# -# kubernetes_sd_configs: -# - role: service -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] -# action: keep -# regex: pushgateway - - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Example scrape config for probing services via the Blackbox Exporter. -# # -# # The relabeling allows the actual service scrape endpoint to be configured -# # via the following annotations: -# # -# # * `prometheus.io/probe`: Only probe services that have a value of `true` -# - job_name: 'kubernetes-services' -# honor_labels: true -# -# metrics_path: /probe -# params: -# module: [ http_2xx ] -# -# kubernetes_sd_configs: -# - role: service -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] -# action: keep -# regex: true -# - source_labels: [ __address__ ] -# target_label: __param_target -# - target_label: __address__ -# replacement: blackbox -# - source_labels: [ __param_target ] -# target_label: instance -# - action: labelmap -# regex: __meta_kubernetes_service_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# target_label: namespace -# - source_labels: [ __meta_kubernetes_service_name ] -# target_label: service - - - # Example scrape config for pods - # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: - # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, - # except if `prometheus.io/scrape-slow` is set to `true` as well. - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. - - job_name: 'kubernetes-pods' + - job_name: kubernetes-pods honor_labels: true - kubernetes_sd_configs: - role: pod - relabel_configs: - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape action: keep regex: true - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow action: drop regex: true - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme action: replace regex: (https?) 
target_label: __scheme__ - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path action: replace - target_label: __metrics_path__ regex: (.+) - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] + target_label: __metrics_path__ + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip action: replace regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' + replacement: "[$2]:$1" target_label: __address__ - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] + - source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip action: replace regex: (\d+);((([0-9]+?)(\.|$)){4}) replacement: $2:$1 @@ -398,77 +138,442 @@ serverFiles: replacement: __param_$1 - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [ __meta_kubernetes_namespace ] + - source_labels: + - __meta_kubernetes_namespace action: replace target_label: namespace - - source_labels: [ __meta_kubernetes_pod_name ] + - source_labels: + - __meta_kubernetes_pod_name action: replace target_label: pod - - source_labels: [ __meta_kubernetes_pod_phase ] + - source_labels: + - __meta_kubernetes_pod_phase regex: Pending|Succeeded|Failed|Completed action: drop - - source_labels: [ __meta_kubernetes_pod_node_name ] + - source_labels: + - __meta_kubernetes_pod_node_name action: replace target_label: node +# serverFiles: +# prometheus.yml: +# scrape_configs: +# - job_name: prometheus +# static_configs: +# - targets: +# - localhost:9090 + +# # A scrape configuration for running Prometheus on a Kubernetes cluster. +# # This uses separate scrape configs for cluster components (i.e. API server, node) +# # and services to allow each to use different authentication configs. +# # +# # Kubernetes labels will be added as Prometheus labels on metrics via the +# # `labelmap` relabeling action. + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Scrape config for API servers. +# # # +# # # Kubernetes exposes API servers as endpoints to the default/kubernetes +# # # service so this uses `endpoints` role and uses relabelling to only keep +# # # the endpoints associated with the default/kubernetes service using the +# # # default named port `https`. This works for single API server deployments as +# # # well as HA API server deployments. +# # - job_name: 'kubernetes-apiservers' +# # +# # kubernetes_sd_configs: +# # - role: endpoints +# # +# # # Default to scraping over https. If required, just disable this or change to +# # # `http`. +# # scheme: https +# # +# # # This TLS & bearer token file config is used to connect to the actual scrape +# # # endpoints for cluster components. This is separate to discovery auth +# # # configuration because discovery & scraping are two separate concerns in +# # # Prometheus. The discovery auth config is automatic if Prometheus runs inside +# # # the cluster. Otherwise, more config options have to be provided within the +# # # . +# # tls_config: +# # ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# # # If your node certificates are self-signed or use a different CA to the +# # # master CA, then disable certificate verification below. 
Note that +# # # certificate verification is an integral part of a secure infrastructure +# # # so this should only be disabled in a controlled environment. You can +# # # disable certificate verification by uncommenting the line below. +# # # +# # # insecure_skip_verify: true +# # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token +# # +# # # Keep only the default/kubernetes service endpoints for the https port. This +# # # will add targets for each API server which Kubernetes adds an endpoint to +# # # the default/kubernetes service. +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ] +# # action: keep +# # regex: default;kubernetes;https + +# - job_name: 'kubernetes-nodes' + +# # Default to scraping over https. If required, just disable this or change to +# # `http`. +# scheme: https + +# # This TLS & bearer token file config is used to connect to the actual scrape +# # endpoints for cluster components. This is separate to discovery auth +# # configuration because discovery & scraping are two separate concerns in +# # Prometheus. The discovery auth config is automatic if Prometheus runs inside +# # the cluster. Otherwise, more config options have to be provided within the +# # . +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# # If your node certificates are self-signed or use a different CA to the +# # master CA, then disable certificate verification below. Note that +# # certificate verification is an integral part of a secure infrastructure +# # so this should only be disabled in a controlled environment. You can +# # disable certificate verification by uncommenting the line below. +# # +# # insecure_skip_verify: true +# bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +# kubernetes_sd_configs: +# - role: node + +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) +# - target_label: __address__ +# replacement: kubernetes.default.svc:443 +# - source_labels: [ __meta_kubernetes_node_name ] +# regex: (.+) +# target_label: __metrics_path__ +# replacement: /api/v1/nodes/$1/proxy/metrics + + +# - job_name: 'kubernetes-nodes-cadvisor' + +# # Default to scraping over https. If required, just disable this or change to +# # `http`. +# scheme: https + +# # This TLS & bearer token file config is used to connect to the actual scrape +# # endpoints for cluster components. This is separate to discovery auth +# # configuration because discovery & scraping are two separate concerns in +# # Prometheus. The discovery auth config is automatic if Prometheus runs inside +# # the cluster. Otherwise, more config options have to be provided within the +# # . +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# # If your node certificates are self-signed or use a different CA to the +# # master CA, then disable certificate verification below. Note that +# # certificate verification is an integral part of a secure infrastructure +# # so this should only be disabled in a controlled environment. You can +# # disable certificate verification by uncommenting the line below. 
+# # +# # insecure_skip_verify: true +# bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +# kubernetes_sd_configs: +# - role: node + +# # This configuration will work only on kubelet 1.7.3+ +# # As the scrape endpoints for cAdvisor have changed +# # if you are using older version you need to change the replacement to +# # replacement: /api/v1/nodes/$1:4194/proxy/metrics +# # more info here https://github.com/coreos/prometheus-operator/issues/633 +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) +# - target_label: __address__ +# replacement: kubernetes.default.svc:443 +# - source_labels: [ __meta_kubernetes_node_name ] +# regex: (.+) +# target_label: __metrics_path__ +# replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + +# # Metric relabel configs to apply to samples before ingestion. +# # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) +# # metric_relabel_configs: +# # - action: labeldrop +# # regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone) + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Scrape config for service endpoints. +# # # +# # # The relabeling allows the actual service scrape endpoint to be configured +# # # via the following annotations: +# # # +# # # * `prometheus.io/scrape`: Only scrape services that have a value of +# # # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. +# # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # # * `prometheus.io/port`: If the metrics are exposed on a different port to the +# # # service then set this appropriately. +# # # * `prometheus.io/param_`: If the metrics endpoint uses parameters +# # # then you can set any parameter +# # - job_name: 'kubernetes-service-endpoints' +# # honor_labels: true +# # +# # kubernetes_sd_configs: +# # - role: endpoints +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape ] +# # action: keep +# # regex: true +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] +# # action: drop +# # regex: true +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] +# # action: replace +# # target_label: __scheme__ +# # regex: (https?) 
+# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] +# # action: replace +# # target_label: __metrics_path__ +# # regex: (.+) +# # - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] +# # action: replace +# # target_label: __address__ +# # regex: (.+?)(?::\d+)?;(\d+) +# # replacement: $1:$2 +# # - action: labelmap +# # regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) +# # replacement: __param_$1 +# # - action: labelmap +# # regex: __meta_kubernetes_service_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # action: replace +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_service_name ] +# # action: replace +# # target_label: service +# # - source_labels: [ __meta_kubernetes_pod_node_name ] +# # action: replace +# # target_label: node + + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Scrape config for slow service endpoints; same as above, but with a larger +# # # timeout and a larger interval +# # # +# # # The relabeling allows the actual service scrape endpoint to be configured +# # # via the following annotations: +# # # +# # # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` +# # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # # * `prometheus.io/port`: If the metrics are exposed on a different port to the +# # # service then set this appropriately. +# # # * `prometheus.io/param_`: If the metrics endpoint uses parameters +# # # then you can set any parameter +# # - job_name: 'kubernetes-service-endpoints-slow' +# # honor_labels: true +# # +# # scrape_interval: 5m +# # scrape_timeout: 30s +# # +# # kubernetes_sd_configs: +# # - role: endpoints +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] +# # action: keep +# # regex: true +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] +# # action: replace +# # target_label: __scheme__ +# # regex: (https?) +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] +# # action: replace +# # target_label: __metrics_path__ +# # regex: (.+) +# # - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] +# # action: replace +# # target_label: __address__ +# # regex: (.+?)(?::\d+)?;(\d+) +# # replacement: $1:$2 +# # - action: labelmap +# # regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) +# # replacement: __param_$1 +# # - action: labelmap +# # regex: __meta_kubernetes_service_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # action: replace +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_service_name ] +# # action: replace +# # target_label: service +# # - source_labels: [ __meta_kubernetes_pod_node_name ] +# # action: replace +# # target_label: node +# # +# # - job_name: 'prometheus-pushgateway' +# # honor_labels: true +# # +# # kubernetes_sd_configs: +# # - role: service +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] +# # action: keep +# # regex: pushgateway + + +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Example scrape config for probing services via the Blackbox Exporter. 
+# # # +# # # The relabeling allows the actual service scrape endpoint to be configured +# # # via the following annotations: +# # # +# # # * `prometheus.io/probe`: Only probe services that have a value of `true` +# # - job_name: 'kubernetes-services' +# # honor_labels: true +# # +# # metrics_path: /probe +# # params: +# # module: [ http_2xx ] +# # +# # kubernetes_sd_configs: +# # - role: service +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] +# # action: keep +# # regex: true +# # - source_labels: [ __address__ ] +# # target_label: __param_target +# # - target_label: __address__ +# # replacement: blackbox +# # - source_labels: [ __param_target ] +# # target_label: instance +# # - action: labelmap +# # regex: __meta_kubernetes_service_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_service_name ] +# # target_label: service + + +# # Example scrape config for pods +# # +# # The relabeling allows the actual pod scrape endpoint to be configured via the +# # following annotations: +# # +# # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, +# # except if `prometheus.io/scrape-slow` is set to `true` as well. +# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. +# - job_name: 'kubernetes-pods' +# honor_labels: true + +# kubernetes_sd_configs: +# - role: pod + +# relabel_configs: +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ] +# action: keep +# regex: true +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] +# action: drop +# regex: true +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] +# action: replace +# regex: (https?) +# target_label: __scheme__ +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] +# action: replace +# target_label: __metrics_path__ +# regex: (.+) +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# action: replace +# regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) +# replacement: '[$2]:$1' +# target_label: __address__ +# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# action: replace +# regex: (\d+);((([0-9]+?)(\.|$)){4}) +# replacement: $2:$1 +# target_label: __address__ +# - action: labelmap +# regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) +# replacement: __param_$1 +# - action: labelmap +# regex: __meta_kubernetes_pod_label_(.+) +# - source_labels: [ __meta_kubernetes_namespace ] +# action: replace +# target_label: namespace +# - source_labels: [ __meta_kubernetes_pod_name ] +# action: replace +# target_label: pod +# - source_labels: [ __meta_kubernetes_pod_phase ] +# regex: Pending|Succeeded|Failed|Completed +# action: drop +# - source_labels: [ __meta_kubernetes_pod_node_name ] +# action: replace +# target_label: node + -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Example Scrape config for pods which should be scraped slower. 
An useful example -# # would be stackriver-exporter which queries an API on every scrape of the pod -# # -# # The relabeling allows the actual pod scrape endpoint to be configured via the -# # following annotations: -# # -# # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. -# - job_name: 'kubernetes-pods-slow' -# honor_labels: true -# -# scrape_interval: 5m -# scrape_timeout: 30s -# -# kubernetes_sd_configs: -# - role: pod -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] -# action: replace -# regex: (https?) -# target_label: __scheme__ -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] -# action: replace -# regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) -# replacement: '[$2]:$1' -# target_label: __address__ -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] -# action: replace -# regex: (\d+);((([0-9]+?)(\.|$)){4}) -# replacement: $2:$1 -# target_label: __address__ -# - action: labelmap -# regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_pod_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_pod_name ] -# action: replace -# target_label: pod -# - source_labels: [ __meta_kubernetes_pod_phase ] -# regex: Pending|Succeeded|Failed|Completed -# action: drop -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node +# ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING +# # # Example Scrape config for pods which should be scraped slower. An useful example +# # # would be stackriver-exporter which queries an API on every scrape of the pod +# # # +# # # The relabeling allows the actual pod scrape endpoint to be configured via the +# # # following annotations: +# # # +# # # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` +# # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# # # to set this to `https` & most likely set the `tls_config` of the scrape config. +# # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# # # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. +# # - job_name: 'kubernetes-pods-slow' +# # honor_labels: true +# # +# # scrape_interval: 5m +# # scrape_timeout: 30s +# # +# # kubernetes_sd_configs: +# # - role: pod +# # +# # relabel_configs: +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] +# # action: keep +# # regex: true +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] +# # action: replace +# # regex: (https?) 
+# # target_label: __scheme__ +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] +# # action: replace +# # target_label: __metrics_path__ +# # regex: (.+) +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# # action: replace +# # regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) +# # replacement: '[$2]:$1' +# # target_label: __address__ +# # - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] +# # action: replace +# # regex: (\d+);((([0-9]+?)(\.|$)){4}) +# # replacement: $2:$1 +# # target_label: __address__ +# # - action: labelmap +# # regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) +# # replacement: __param_$1 +# # - action: labelmap +# # regex: __meta_kubernetes_pod_label_(.+) +# # - source_labels: [ __meta_kubernetes_namespace ] +# # action: replace +# # target_label: namespace +# # - source_labels: [ __meta_kubernetes_pod_name ] +# # action: replace +# # target_label: pod +# # - source_labels: [ __meta_kubernetes_pod_phase ] +# # regex: Pending|Succeeded|Failed|Completed +# # action: drop +# # - source_labels: [ __meta_kubernetes_pod_node_name ] +# # action: replace +# # target_label: node diff --git a/dist/install.yaml b/dist/install.yaml index c41fa0dc..b3e6a2e2 100644 --- a/dist/install.yaml +++ b/dist/install.yaml @@ -1229,4 +1229,4 @@ spec: volumes: - configMap: name: devzero-zxporter-env-config - name: config-volume + name: config-volume \ No newline at end of file diff --git a/nvidia-device-plugin-prereq/container-toolkit.yaml b/nvidia-device-plugin-prereq/container-toolkit.yaml new file mode 100644 index 00000000..17ada11b --- /dev/null +++ b/nvidia-device-plugin-prereq/container-toolkit.yaml @@ -0,0 +1,84 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-toolkit-installer + namespace: nvidia-device-plugin +spec: + selector: + matchLabels: + name: nvidia-toolkit-installer + template: + metadata: + labels: + name: nvidia-toolkit-installer + spec: + nodeSelector: + nvidia.com/gpu.present: "true" + hostPID: true + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "CriticalAddonsOnly" + operator: "Exists" + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + - effect: NoSchedule + key: node-role.kubernetes.io/master + containers: + - name: install-nvidia-toolkit + image: amazonlinux:2023 + securityContext: + privileged: true + command: + - /bin/bash + - -c + - | + set -ex + + # Add NVIDIA repo + curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ + -o /etc/yum.repos.d/nvidia-container-toolkit.repo + + # Install toolkit + yum install -y nvidia-container-toolkit + + # Configure containerd + nvidia-ctk runtime configure --runtime=containerd + + # Restart containerd + systemctl restart containerd || true + + # Exit cleanly + echo "NVIDIA container toolkit installed and configured." 
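+              # Keep the container alive after a successful install so the DaemonSet pod stays Running instead of restart-looping.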
+ sleep infinity + volumeMounts: + - name: root + mountPath: /host + mountPropagation: Bidirectional + - name: containerd-config + mountPath: /etc/containerd + - name: systemd + mountPath: /run/systemd + - name: modules + mountPath: /lib/modules + readOnly: true + - name: dev + mountPath: /dev + volumes: + - name: root + hostPath: + path: / + - name: containerd-config + hostPath: + path: /etc/containerd + - name: systemd + hostPath: + path: /run/systemd + - name: modules + hostPath: + path: /lib/modules + - name: dev + hostPath: + path: /dev + restartPolicy: Always diff --git a/nvidia-device-plugin-prereq/driver-installer.yaml b/nvidia-device-plugin-prereq/driver-installer.yaml new file mode 100644 index 00000000..7f04e106 --- /dev/null +++ b/nvidia-device-plugin-prereq/driver-installer.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-driver-installer + namespace: nvidia-device-plugin +spec: + selector: + matchLabels: + name: nvidia-driver-installer + template: + metadata: + labels: + name: nvidia-driver-installer + spec: + hostPID: true + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: CriticalAddonsOnly + operator: Exists + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule + - key: node-role.kubernetes.io/master + effect: NoSchedule + nodeSelector: + nvidia.com/gpu.present: "true" + containers: + - name: driver-installer + image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0 + securityContext: + privileged: true + env: + - name: NVIDIA_DRIVER_VERSION + value: "535.129.03" # or the version you require + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: root + mountPath: /host + mountPropagation: Bidirectional + - name: modules + mountPath: /lib/modules + readOnly: true + - name: nvidia-local + mountPath: /host/usr/local/nvidia + - name: fix-dcgm-dir + image: amazonlinux:2023 + securityContext: + privileged: true + command: ["/bin/bash", "-c"] + args: + - | + set -ex + TARGET_DIR="/host/usr/local/nvidia" + # If it doesn't exist, symlink something useful + if [ ! -d "$TARGET_DIR" ]; then + mkdir -p /host/usr/local + ln -s /usr/lib64 "$TARGET_DIR" + fi + echo "/usr/local/nvidia set up for DCGM." 
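+            # Brief pause before exiting; with restartPolicy: Always this container simply re-runs and re-verifies the symlink.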
+ sleep 10 + volumeMounts: + - name: nvidia-local + mountPath: /host/usr/local/nvidia + - name: root + mountPath: /host + mountPropagation: Bidirectional + volumes: + - name: root + hostPath: + path: / + - name: modules + hostPath: + path: /lib/modules + - name: nvidia-local + hostPath: + path: /usr/local/nvidia + type: DirectoryOrCreate diff --git a/terraform/aws/main.tf b/terraform/aws/main.tf new file mode 100644 index 00000000..252ad5c2 --- /dev/null +++ b/terraform/aws/main.tf @@ -0,0 +1,363 @@ +provider "aws" { + region = "us-east-1" +} + +data "aws_caller_identity" "current" {} + +# VPC Configuration +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + + name = "${var.cluster_name}-vpc" + cidr = "10.0.0.0/16" + + azs = ["us-east-1a", "us-east-1b"] + private_subnets = ["10.0.1.0/24", "10.0.2.0/24"] + public_subnets = ["10.0.101.0/24", "10.0.102.0/24"] + + enable_nat_gateway = true + single_nat_gateway = true + + # Required for EKS + enable_dns_hostnames = true + enable_dns_support = true + + public_subnet_tags = { + "kubernetes.io/cluster/${var.cluster_name}" = "shared" + "kubernetes.io/role/elb" = "1" + } + + private_subnet_tags = { + "kubernetes.io/cluster/${var.cluster_name}" = "shared" + "kubernetes.io/role/internal-elb" = "1" + "karpenter.sh/discovery" = "${var.cluster_name}" + } +} + +# IAM Roles and Policies for Karpenter +resource "aws_iam_role" "karpenter_node_role" { + name = "KarpenterNodeRole-${var.cluster_name}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "ec2.amazonaws.com" + } + Action = "sts:AssumeRole" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_role_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_ssm_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_registry_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryPullOnly" +} + +resource "aws_iam_role_policy_attachment" "karpenter_node_admin_policy_attachment" { + role = aws_iam_role.karpenter_node_role.name + policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess" +} + +resource "aws_iam_role" "karpenter_controller_role" { + name = "KarpenterControllerRole-${var.cluster_name}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/oidc.eks.${var.region}.amazonaws.com/id/${split("/id/", module.eks.cluster_oidc_issuer_url)[1]}" + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "oidc.eks.${var.region}.amazonaws.com/id/${split("/id/", module.eks.cluster_oidc_issuer_url)[1]}:sub" = "system:serviceaccount:kube-system:karpenter" + } + } + } + ] + }) +} + +resource "aws_iam_policy" "karpenter_controller_policy" { + name = "KarpenterControllerPolicy-${var.cluster_name}" + description = "Custom Karpenter controller policy for managing EC2 instances, IAM roles, and EKS." 
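+  # Broadly mirrors the upstream Karpenter controller policy: scoped provisioning actions, conditional instance termination, instance-profile management, and interruption-queue access.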
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = [
+          "ssm:GetParameter",
+          "ec2:DescribeImages",
+          "ec2:RunInstances",
+          "ec2:DescribeSubnets",
+          "ec2:DescribeSecurityGroups",
+          "ec2:DescribeLaunchTemplates",
+          "ec2:DescribeInstances",
+          "ec2:DescribeInstanceTypes",
+          "ec2:DescribeInstanceTypeOfferings",
+          "ec2:DeleteLaunchTemplate",
+          "ec2:CreateTags",
+          "ec2:CreateLaunchTemplate",
+          "ec2:CreateFleet",
+          "ec2:DescribeSpotPriceHistory",
+          "pricing:GetProducts"
+        ]
+        Effect   = "Allow"
+        Resource = "*"
+        Sid      = "Karpenter"
+      },
+      {
+        Action = "ec2:TerminateInstances"
+        Condition = {
+          StringLike = {
+            "ec2:ResourceTag/karpenter.sh/nodepool" = "*"
+          }
+        }
+        Effect   = "Allow"
+        Resource = "*"
+        Sid      = "ConditionalEC2Termination"
+      },
+      {
+        Effect   = "Allow"
+        Action   = "iam:PassRole"
+        Resource = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/KarpenterNodeRole-${var.cluster_name}"
+        Sid      = "PassNodeIAMRole"
+      },
+      {
+        Effect   = "Allow"
+        Action   = "eks:DescribeCluster"
+        Resource = "arn:aws:eks:${var.region}:${data.aws_caller_identity.current.account_id}:cluster/${var.cluster_name}"
+        Sid      = "EKSClusterEndpointLookup"
+      },
+      {
+        Sid      = "AllowScopedInstanceProfileCreationActions"
+        Effect   = "Allow"
+        Resource = "*"
+        Action   = ["iam:CreateInstanceProfile"]
+        Condition = {
+          StringEquals = {
+            "aws:RequestTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:RequestTag/topology.kubernetes.io/region"             = var.region
+          }
+          StringLike = {
+            "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+          }
+        }
+      },
+      {
+        Sid      = "AllowScopedInstanceProfileTagActions"
+        Effect   = "Allow"
+        Resource = "*"
+        Action   = ["iam:TagInstanceProfile"]
+        Condition = {
+          StringEquals = {
+            "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:ResourceTag/topology.kubernetes.io/region"             = var.region
+            "aws:RequestTag/kubernetes.io/cluster/${var.cluster_name}"  = "owned"
+            "aws:RequestTag/topology.kubernetes.io/region"              = var.region
+          }
+          StringLike = {
+            "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+            "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass"  = "*"
+          }
+        }
+      },
+      {
+        Sid      = "AllowScopedInstanceProfileActions"
+        Effect   = "Allow"
+        Resource = "*"
+        Action = [
+          "iam:AddRoleToInstanceProfile",
+          "iam:RemoveRoleFromInstanceProfile",
+          "iam:DeleteInstanceProfile"
+        ]
+        Condition = {
+          StringEquals = {
+            "aws:ResourceTag/kubernetes.io/cluster/${var.cluster_name}" = "owned"
+            "aws:ResourceTag/topology.kubernetes.io/region"             = var.region
+          }
+          StringLike = {
+            "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass" = "*"
+          }
+        }
+      },
+      {
+        Sid      = "AllowInstanceProfileReadActions"
+        Effect   = "Allow"
+        Resource = "*"
+        Action   = "iam:GetInstanceProfile"
+      },
+      {
+        Effect = "Allow"
+        Action = [
+          "sqs:DeleteMessage",
+          "sqs:GetQueueUrl",
+          "sqs:GetQueueAttributes",
+          "sqs:ReceiveMessage"
+        ]
+        Resource = "*"
+        Sid      = "KarpenterInterruptionQueue"
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_controller_custom_policy_attachment" {
+  role       = aws_iam_role.karpenter_controller_role.name
+  policy_arn = aws_iam_policy.karpenter_controller_policy.arn
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_controller_policy_attachment" {
+  role       = aws_iam_role.karpenter_controller_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
+}
+
+resource "aws_iam_role_policy_attachment" "karpenter_controller_admin_policy_attachment" {
+  role       = aws_iam_role.karpenter_controller_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
+}
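+
+# AdministratorAccess on top of the scoped policy above makes the scoped
+# statements redundant in practice; presumably it is attached so a failed CI
+# run points at Karpenter itself rather than IAM, but it should not be copied
+# into a production setup.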
"arn:aws:iam::aws:policy/AdministratorAccess" +} + +# EKS Cluster Configuration +module "eks" { + source = "terraform-aws-modules/eks/aws" + + cluster_name = var.cluster_name + cluster_version = var.cluster_version + + # Add VPC configuration + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + enable_irsa = true + enable_cluster_creator_admin_permissions = true + cluster_endpoint_public_access = true + cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"] + + create_node_iam_role = false + + tags = { + "karpenter.sh/discovery" = var.cluster_name + } + + eks_managed_node_groups = { + gpu_nodes = { + instance_types = ["g6.4xlarge"] + desired_size = 1 + min_size = 1 + max_size = 1 + + ami_type = "AL2023_x86_64_NVIDIA" + use_custom_launch_template = false + + metadata_options = { + http_endpoint = "enabled" + http_tokens = "optional" + http_put_response_hop_limit = 2 + instance_metadata_tags = "enabled" + } + + disk_size = 200 + labels = { + node_type = "gpu" + } + + # Attach the IAM role for Karpenter to the managed node group + iam_instance_profile = aws_iam_role.karpenter_node_role.name + } + } +} + +resource "aws_security_group" "karpenter_sg" { + name = "karpenter-sg-${var.cluster_name}" + description = "Karpenter security group" + vpc_id = module.vpc.vpc_id + + tags = { + "karpenter.sh/discovery" = "${var.cluster_name}" + } +} + +resource "aws_security_group_rule" "karpenter_inbound" { + security_group_id = aws_security_group.karpenter_sg.id + type = "ingress" + from_port = 0 + to_port = 65535 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] +} + +resource "aws_sqs_queue" "karpenter_interruption_queue" { + name = "${var.cluster_name}-karpenter-interruption" + sqs_managed_sse_enabled = true + + tags = { + "karpenter.sh/discovery" = var.cluster_name + } +} + +resource "aws_sqs_queue_policy" "karpenter_interruption_queue_policy" { + queue_url = aws_sqs_queue.karpenter_interruption_queue.url + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AllowKarpenterController" + Effect = "Allow" + Principal = { + AWS = aws_iam_role.karpenter_controller_role.arn + } + Action = [ + "sqs:DeleteMessage", + "sqs:GetQueueUrl", + "sqs:GetQueueAttributes", + "sqs:ReceiveMessage" + ] + Resource = aws_sqs_queue.karpenter_interruption_queue.arn + }, + { + Sid = "EC2SpotInterruption" + Effect = "Allow" + Principal = { + Service = ["events.amazonaws.com", "sqs.amazonaws.com"] + } + Action = ["sqs:SendMessage"] + Resource = aws_sqs_queue.karpenter_interruption_queue.arn + } + ] + }) +} + +resource "aws_cloudwatch_event_rule" "spot_interruption" { + name = "${var.cluster_name}-spot-interruption" + description = "Capture EC2 Spot Instance interruption notices" + + event_pattern = jsonencode({ + source = ["aws.ec2"] + detail-type = ["EC2 Spot Instance Interruption Warning"] + }) +} + +resource "aws_cloudwatch_event_target" "spot_interruption" { + target_id = "KarpenterInterruptionQueueTarget" + rule = aws_cloudwatch_event_rule.spot_interruption.name + arn = aws_sqs_queue.karpenter_interruption_queue.arn +} \ No newline at end of file diff --git a/terraform/aws/terraform.tfvars b/terraform/aws/terraform.tfvars new file mode 100644 index 00000000..6e098115 --- /dev/null +++ b/terraform/aws/terraform.tfvars @@ -0,0 +1,3 @@ +cluster_name = "devzero-gpu-cluster" +cluster_version = "1.30" +region = "us-east-1" \ No newline at end of file diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf new file mode 100644 index 00000000..741aed7d --- /dev/null +++ 
+cluster_name    = "devzero-gpu-cluster"
+cluster_version = "1.30"
+region          = "us-east-1"
\ No newline at end of file
diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf
new file mode 100644
index 00000000..741aed7d
--- /dev/null
+++ b/terraform/aws/variables.tf
@@ -0,0 +1,14 @@
+variable "cluster_name" {
+  description = "The name of the EKS cluster"
+  type        = string
+}
+
+variable "cluster_version" {
+  description = "The Kubernetes version for the EKS cluster"
+  type        = string
+}
+
+variable "region" {
+  description = "The AWS region of the EKS cluster"
+  type        = string
+}
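+
+# Optional hardening (an assumption, not required by this change): pinning the
+# AWS provider guards CI runs against provider drift. The v20+ line of
+# terraform-aws-modules/eks, which enable_cluster_creator_admin_permissions in
+# main.tf relies on, assumes a v5 AWS provider, so a conservative floor is:
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 5.0"
+    }
+  }
+}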