
Commit d9a5ba6

k8s-bench | health probe troubleshooting eval task #145 (#163)
* feat: add health probe troubleshooting eval task
* fix: modify path for probes and use kubectl wait
* fix: refactor for consistency with other tasks, add probes exists check and decrease restarts to 1
* chore: decrease restart check to 1
* refactor: improve README & add missing flags for examples
1 parent 8ae3880 commit d9a5ba6

File tree (5 files changed, +174 -2 lines changed):

- k8s-bench/README.md
- k8s-bench/tasks/fix-probes/cleanup.sh
- k8s-bench/tasks/fix-probes/setup.sh
- k8s-bench/tasks/fix-probes/task.yaml
- k8s-bench/tasks/fix-probes/verify.sh

k8s-bench/README.md (+62 -2)
@@ -8,10 +8,54 @@
 ```sh
 # build the k8s-bench binary
 go build
+```
+
+#### Run Subcommand
+
+The `run` subcommand executes the benchmark evaluations.
+
+```sh
+# Basic usage with mandatory output directory
+./k8s-bench run --agent-bin <path/to/kubectl-ai/binary> --output-dir .build/k8sbench
 
 # Run evaluation for scale related tasks
-./k8s-bench run --agent-bin <path/to/kubectl-ai/binary> --task-pattern scale --kubeconfig <path/to/kubeconfig>
+./k8s-bench run --agent-bin <path/to/kubectl-ai/binary> --task-pattern scale --kubeconfig <path/to/kubeconfig> --output-dir .build/k8sbench
+
+# Run evaluation for a specific LLM provider and model with tool use shim disabled
+./k8s-bench run --llm-provider=grok --models=grok-3-beta --agent-bin ../kubectl-ai --task-pattern=fix-probes --enable-tool-use-shim=false --output-dir .build/k8sbench
+
+# Run evaluation with all available options
+./k8s-bench run \
+  --agent-bin <path/to/kubectl-ai/binary> \
+  --kubeconfig ~/.kube/config \
+  --tasks-dir ./tasks \
+  --task-pattern fix \
+  --llm-provider gemini \
+  --models gemini-2.5-pro-preview-03-25,gemini-1.5-pro-latest \
+  --enable-tool-use-shim true \
+  --quiet true \
+  --output-dir .build/k8sbench
+```
 
+#### Available flags for `run` subcommand:
+
+| Flag | Description | Default | Required |
+|------|-------------|---------|----------|
+| `--agent-bin` | Path to kubectl-ai binary | - | Yes |
+| `--output-dir` | Directory to write results to | - | Yes |
+| `--tasks-dir` | Directory containing evaluation tasks | ./tasks | No |
+| `--kubeconfig` | Path to kubeconfig file | ~/.kube/config | No |
+| `--task-pattern` | Pattern to filter tasks (e.g. 'pod' or 'redis') | - | No |
+| `--llm-provider` | Specific LLM provider to evaluate (e.g. 'gemini' or 'ollama') | gemini | No |
+| `--models` | Comma-separated list of models to evaluate | gemini-2.5-pro-preview-03-25 | No |
+| `--enable-tool-use-shim` | Enable tool use shim | true | No |
+| `--quiet` | Quiet mode (non-interactive mode) | true | No |
+
+#### Analyze Subcommand
+
+The `analyze` subcommand processes results from previous runs:
+
+```sh
 # Analyze previous evaluation results and output in markdown format (default)
 ./k8s-bench analyze --input-dir .build/k8sbench
 
@@ -20,8 +64,24 @@ go build
 
 # Save analysis results to a file
 ./k8s-bench analyze --input-dir .build/k8sbench --results-filepath ./results.md
+
+# Analyze with all available options
+./k8s-bench analyze \
+  --input-dir .build/k8sbench \
+  --output-format markdown \
+  --ignore-tool-use-shim true \
+  --results-filepath ./detailed-analysis.md
 ```
 
+#### Available flags for `analyze` subcommand:
+
+| Flag | Description | Default | Required |
+|------|-------------|---------|----------|
+| `--input-dir` | Directory containing evaluation results | - | Yes |
+| `--output-format` | Output format (markdown or json) | markdown | No |
+| `--ignore-tool-use-shim` | Ignore tool use shim in result grouping | true | No |
+| `--results-filepath` | Optional file path to write results to | - | No |
+
 Running the benchmark with the `run` subcommand will produce results as below:
 
 ```sh
@@ -37,4 +97,4 @@ Task: scale-down-deployment
 gemini-2.0-flash-thinking-exp-01-21: true
 ```
 
-The `analyze` subcommand will gather the results from previous runs and display them in a tabular format with emoji indicators for success (✅) and failure (❌).
+The `analyze` subcommand will gather the results from previous runs and display them in a tabular format with emoji indicators for success (✅) and failure (❌).

k8s-bench/tasks/fix-probes/cleanup.sh (new file, +6)

@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Delete the namespace which will remove all resources created for this task
+kubectl delete namespace health-check --ignore-not-found
+
+echo "Cleanup completed"

k8s-bench/tasks/fix-probes/setup.sh (new file, +49)

@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Delete namespace if exists and create a fresh one
+kubectl delete namespace health-check --ignore-not-found
+kubectl create namespace health-check
+
+# Create a deployment with problematic health checks
+cat <<YAML | kubectl apply -f -
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: webapp
+  namespace: health-check
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: webapp
+  template:
+    metadata:
+      labels:
+        app: webapp
+    spec:
+      containers:
+      - name: webapp
+        image: nginx:latest
+        ports:
+        - containerPort: 80
+        # The problem: incorrect health probes causing restarts
+        livenessProbe:
+          httpGet:
+            path: /get_status # Path doesn't exist
+            port: 80
+          initialDelaySeconds: 5
+          periodSeconds: 5
+        readinessProbe:
+          httpGet:
+            path: /is_ready # Path doesn't exist
+            port: 80
+          initialDelaySeconds: 5
+          periodSeconds: 5
+YAML
+
+# Create a service for the webapp
+kubectl create service clusterip webapp -n health-check --tcp=80:80
+
+# Wait for the pod to start and begin restarting due to failed probes
+echo "Waiting for pod to start and begin failing health checks..."
+kubectl wait --for=condition=Available=False --timeout=30s deployment/webapp -n health-check || true
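
For context, the stock `nginx` image only serves content at `/`, so the probe paths above (`/get_status`, `/is_ready`) always return 404 and the kubelet keeps restarting the container. One way an agent could repair this, shown purely as an illustrative sketch and not part of this commit, is to re-point both probes at `/`:

```sh
# Illustrative fix only (not part of this commit): point both probes at "/",
# which the stock nginx image does serve, while leaving the probes in place.
kubectl patch deployment webapp -n health-check --type=json -p='[
  {"op": "replace", "path": "/spec/template/spec/containers/0/livenessProbe/httpGet/path", "value": "/"},
  {"op": "replace", "path": "/spec/template/spec/containers/0/readinessProbe/httpGet/path", "value": "/"}
]'
```

Editing the pod template rolls out a fresh pod whose restart count starts at 0, which matters for the verifier further below.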

k8s-bench/tasks/fix-probes/task.yaml (new file, +6)

@@ -0,0 +1,6 @@
+script:
+  - prompt: "Please fix the health check issue with the deployment named 'webapp' in namespace 'health-check'"
+setup: "setup.sh"
+verifier: "verify.sh"
+cleanup: "cleanup.sh"
+difficulty: "medium"
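
The `setup`, `verifier`, and `cleanup` fields reference the scripts in this directory. To run only this task, the `--task-pattern` flag documented in the README changes above can select it by name; a minimal invocation (binary path illustrative) might be:

```sh
# Run just the fix-probes task with the default provider/model
./k8s-bench run --agent-bin <path/to/kubectl-ai/binary> --task-pattern fix-probes --output-dir .build/k8sbench
```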

k8s-bench/tasks/fix-probes/verify.sh (new file, +51)

@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Check if the pod is in Running state with Ready status
+echo "Checking if the pod is running and ready..."
+
+# Wait up to 30 seconds for pod to become ready using kubectl wait
+if kubectl wait --for=condition=Ready pod -l app=webapp -n health-check --timeout=30s; then
+  echo "Success: Pod is now Ready"
+
+  # Check if probes exist at all
+  LIVENESS_EXISTS=$(kubectl get deploy webapp -n health-check -o jsonpath='{.spec.template.spec.containers[0].livenessProbe}')
+  READINESS_EXISTS=$(kubectl get deploy webapp -n health-check -o jsonpath='{.spec.template.spec.containers[0].readinessProbe}')
+
+  if [ -z "$LIVENESS_EXISTS" ] || [ -z "$READINESS_EXISTS" ]; then
+    echo "Failure: One or both probes have been removed completely."
+    echo "Probes should be fixed, not removed."
+    exit 1
+  fi
+
+  # Get the current probe configurations
+  LIVENESS_PATH=$(kubectl get deploy webapp -n health-check -o jsonpath='{.spec.template.spec.containers[0].livenessProbe.httpGet.path}')
+  READINESS_PATH=$(kubectl get deploy webapp -n health-check -o jsonpath='{.spec.template.spec.containers[0].readinessProbe.httpGet.path}')
+
+  echo "Current liveness probe path: $LIVENESS_PATH"
+  echo "Current readiness probe path: $READINESS_PATH"
+
+  # Verify the probes are not using the nonexistent paths and have valid paths set
+  if [ "$LIVENESS_PATH" != "/get_status" ] && [ "$READINESS_PATH" != "/is_ready" ] && \
+     [ ! -z "$LIVENESS_PATH" ] && [ ! -z "$READINESS_PATH" ]; then
+    echo "Success: Both probe paths have been fixed"
+
+    # Check if pod is stable with no recent restarts
+    RESTARTS=$(kubectl get pods -n health-check -l app=webapp -o jsonpath='{.items[0].status.containerStatuses[0].restartCount}')
+    if [ "$RESTARTS" -lt 1 ]; then
+      echo "Success: Pod is stable with acceptable number of restarts"
+      exit 0
+    else
+      echo "Failure: Pod has too many restarts: $RESTARTS"
+      exit 1
+    fi
+  else
+    echo "Failure: One or both probe paths are still incorrect or missing:"
+    echo "Liveness path: $LIVENESS_PATH"
+    echo "Readiness path: $READINESS_PATH"
+    exit 1
+  fi
+else
+  echo "Failure: Pod is not Ready after waiting"
+  kubectl get pods -n health-check -l app=webapp
+  exit 1
+fi
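
Because the task scripts are plain bash wrappers around `kubectl`, they can also be exercised by hand against a disposable cluster to sanity-check the verifier, independent of the harness. A rough walk-through, assuming a reachable cluster and some fix such as the illustrative patch shown earlier:

```sh
cd k8s-bench/tasks/fix-probes

# Create the namespace and the deliberately broken deployment
bash setup.sh

# ...apply a fix to the webapp deployment here...

# Check whether the fix satisfies the task; exit code 0 means pass
bash verify.sh && echo "PASS" || echo "FAIL"

# Tear down the health-check namespace
bash cleanup.sh
```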
