From 4bd05da4e59b12b6946eea2253c7de4496a833b0 Mon Sep 17 00:00:00 2001
From: Rahul Shetty
Date: Fri, 24 Apr 2026 14:23:17 +0530
Subject: [PATCH 1/2] changes for implementing node overprovisioner

Signed-off-by: Rahul Shetty
---
 .../overprovisioner/deployment.yaml           | 67 +++++++++++++++++++
 .../components/overprovisioner/namespace.yaml |  6 ++
 .../overprovisioner/priorityclass.yaml        | 12 ++++
 .../overprovisioner/prometheusrule.yaml       | 36 ++++++++++
 4 files changed, 121 insertions(+)
 create mode 100644 components/manifests/components/overprovisioner/deployment.yaml
 create mode 100644 components/manifests/components/overprovisioner/namespace.yaml
 create mode 100644 components/manifests/components/overprovisioner/priorityclass.yaml
 create mode 100644 components/manifests/components/overprovisioner/prometheusrule.yaml

diff --git a/components/manifests/components/overprovisioner/deployment.yaml b/components/manifests/components/overprovisioner/deployment.yaml
new file mode 100644
index 000000000..88bce97a2
--- /dev/null
+++ b/components/manifests/components/overprovisioner/deployment.yaml
@@ -0,0 +1,67 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: acp-overprovisioner
+  namespace: acp-overprovisioner
+  labels:
+    app: acp-overprovisioner
+    app.kubernetes.io/part-of: acp-overprovisioner
+spec:
+  # ── Tunable: number of spare runner-sized slots to keep warm ──
+  # Each replica reserves capacity equivalent to one agentic session runner pod.
+  # Increase to handle larger bursts; decrease to reduce idle cost.
+  # Quick adjustment: kubectl scale deployment/acp-overprovisioner -n acp-overprovisioner --replicas=<N>
+  replicas: 5
+  selector:
+    matchLabels:
+      app: acp-overprovisioner
+  template:
+    metadata:
+      labels:
+        app: acp-overprovisioner
+      annotations:
+        # Explicitly allows the cluster autoscaler to evict these pods so that
+        # placeholders never keep an otherwise idle node from being scaled down.
+        cluster-autoscaler.kubernetes.io/safe-to-evict: "true"
+    spec:
+      # Priority -10: any pod with default (0) or higher priority preempts these.
+      # When a runner pod needs capacity, the scheduler evicts a placeholder
+      # instantly, and the evicted placeholder triggers the autoscaler to
+      # replenish the buffer by adding a new node.
+      priorityClassName: acp-overprovisioning
+      # Evict immediately — no graceful shutdown needed for a pause container.
+      terminationGracePeriodSeconds: 0
+      # Spread placeholders across nodes so spare capacity is distributed.
+      # ScheduleAnyway makes this best-effort — won't block scheduling if
+      # there are fewer nodes than replicas.
+      topologySpreadConstraints:
+        - maxSkew: 1
+          topologyKey: kubernetes.io/hostname
+          whenUnsatisfiable: ScheduleAnyway
+          labelSelector:
+            matchLabels:
+              app: acp-overprovisioner
+      securityContext:
+        runAsNonRoot: true
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: pause
+          image: registry.k8s.io/pause:3.9
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
+          resources:
+            requests:
+              # ── Tunable: must match runner pod resource requests ──
+              # These values mirror the agentic session runner container requests
+              # (see operator/internal/handlers/sessions.go defaults).
+              # When a placeholder is evicted, the freed capacity is exactly
+              # what a runner pod needs to start immediately.
+              cpu: "500m"
+              memory: "512Mi"
+            # No limits set intentionally — keeps QoS class as Burstable,
+            # making these pods easier to evict. The pause container uses
+            # zero actual CPU/memory; only requests matter for scheduling.
diff --git a/components/manifests/components/overprovisioner/namespace.yaml b/components/manifests/components/overprovisioner/namespace.yaml
new file mode 100644
index 000000000..5f70b3561
--- /dev/null
+++ b/components/manifests/components/overprovisioner/namespace.yaml
@@ -0,0 +1,6 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: acp-overprovisioner
+  labels:
+    app.kubernetes.io/part-of: acp-overprovisioner
diff --git a/components/manifests/components/overprovisioner/priorityclass.yaml b/components/manifests/components/overprovisioner/priorityclass.yaml
new file mode 100644
index 000000000..f96985a3e
--- /dev/null
+++ b/components/manifests/components/overprovisioner/priorityclass.yaml
@@ -0,0 +1,12 @@
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: acp-overprovisioning
+  labels:
+    app.kubernetes.io/part-of: acp-overprovisioner
+value: -10
+globalDefault: false
+description: >-
+  Low-priority class for overprovisioning placeholder pods.
+  These pods reserve capacity for agentic session runners and
+  are preempted immediately when real workloads need scheduling.
diff --git a/components/manifests/components/overprovisioner/prometheusrule.yaml b/components/manifests/components/overprovisioner/prometheusrule.yaml
new file mode 100644
index 000000000..55d89e031
--- /dev/null
+++ b/components/manifests/components/overprovisioner/prometheusrule.yaml
@@ -0,0 +1,36 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: acp-overprovisioner
+  namespace: acp-overprovisioner
+  labels:
+    app.kubernetes.io/part-of: acp-overprovisioner
+spec:
+  groups:
+    - name: acp-overprovisioner
+      rules:
+        - alert: OverprovisionerReplicasMissing
+          expr: |
+            kube_deployment_status_available_replicas{namespace="acp-overprovisioner", deployment="acp-overprovisioner"}
+              < kube_deployment_spec_replicas{namespace="acp-overprovisioner", deployment="acp-overprovisioner"}
+          for: 15m
+          labels:
+            severity: warning
+          annotations:
+            summary: Overprovisioner has fewer available replicas than desired.
+            description: >-
+              The acp-overprovisioner Deployment has had fewer available replicas
+              than desired for more than 15 minutes. Spare capacity for agentic
+              sessions may be reduced.
+        - alert: OverprovisionerDeploymentMissing
+          expr: |
+            absent(kube_deployment_status_available_replicas{namespace="acp-overprovisioner", deployment="acp-overprovisioner"})
+          for: 30m
+          labels:
+            severity: critical
+          annotations:
+            summary: Overprovisioner deployment is missing.
+            description: >-
+              The acp-overprovisioner Deployment has been absent for more than
+              30 minutes. No spare capacity is being reserved for agentic
+              session runners, which may result in slow pod startup times.
From ef5bba0d1f232672d7c2618048e06862e9f62137 Mon Sep 17 00:00:00 2001
From: Rahul Shetty <160733420+rh-rahulshetty@users.noreply.github.com>
Date: Fri, 24 Apr 2026 23:35:27 +0530
Subject: [PATCH 2/2] Delete components/manifests/components/overprovisioner/prometheusrule.yaml

---
 .../overprovisioner/prometheusrule.yaml | 36 -------------------
 1 file changed, 36 deletions(-)
 delete mode 100644 components/manifests/components/overprovisioner/prometheusrule.yaml

diff --git a/components/manifests/components/overprovisioner/prometheusrule.yaml b/components/manifests/components/overprovisioner/prometheusrule.yaml
deleted file mode 100644
index 55d89e031..000000000
--- a/components/manifests/components/overprovisioner/prometheusrule.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: acp-overprovisioner
-  namespace: acp-overprovisioner
-  labels:
-    app.kubernetes.io/part-of: acp-overprovisioner
-spec:
-  groups:
-    - name: acp-overprovisioner
-      rules:
-        - alert: OverprovisionerReplicasMissing
-          expr: |
-            kube_deployment_status_available_replicas{namespace="acp-overprovisioner", deployment="acp-overprovisioner"}
-              < kube_deployment_spec_replicas{namespace="acp-overprovisioner", deployment="acp-overprovisioner"}
-          for: 15m
-          labels:
-            severity: warning
-          annotations:
-            summary: Overprovisioner has fewer available replicas than desired.
-            description: >-
-              The acp-overprovisioner Deployment has had fewer available replicas
-              than desired for more than 15 minutes. Spare capacity for agentic
-              sessions may be reduced.
-        - alert: OverprovisionerDeploymentMissing
-          expr: |
-            absent(kube_deployment_status_available_replicas{namespace="acp-overprovisioner", deployment="acp-overprovisioner"})
-          for: 30m
-          labels:
-            severity: critical
-          annotations:
-            summary: Overprovisioner deployment is missing.
-            description: >-
-              The acp-overprovisioner Deployment has been absent for more than
-              30 minutes. No spare capacity is being reserved for agentic
-              session runners, which may result in slow pod startup times.