From aae3956f205410c952d14800dccf44e07789e292 Mon Sep 17 00:00:00 2001
From: Subhash Khileri
Date: Mon, 27 Apr 2026 14:15:18 +0530
Subject: [PATCH] feat: add diagnostic log collection on test failure

Adds collectDiagnosticLogs() to KubernetesClientHelper that captures cluster
state (events, pods, deployments, statefulsets, routes, per-container pod
logs) to files on test failure. TeardownReporter now tracks failed projects
and collects diagnostics before namespace deletion. Log collection runs on
both CI and local; namespace deletion remains CI-only.

Bumps version to 1.1.34.
---
 docs/.vitepress/config.ts                  |   2 +-
 docs/changelog.md                          |  17 ++-
 docs/guide/core-concepts/error-handling.md |   8 ++
 docs/guide/utilities/kubernetes-client.md  |  38 +++++++
 docs/overlay/reference/troubleshooting.md  |  34 ++++++
 package.json                               |   2 +-
 src/playwright/teardown-reporter.ts        |  50 ++++++---
 src/utils/kubernetes-client.ts             | 119 ++++++++++++++++++++-
 8 files changed, 249 insertions(+), 21 deletions(-)

diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts
index 04df3c2..7997a0e 100644
--- a/docs/.vitepress/config.ts
+++ b/docs/.vitepress/config.ts
@@ -33,7 +33,7 @@ export default defineConfig({
       { text: "Examples", link: "/examples/" },
       { text: "Overlay Testing", link: "/overlay/" },
       {
-        text: "v1.1.33",
+        text: "v1.1.34",
         items: [{ text: "Changelog", link: "/changelog" }],
       },
     ],
diff --git a/docs/changelog.md b/docs/changelog.md
index 1ea1c57..41a6393 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -2,12 +2,27 @@
 
 All notable changes to this project will be documented in this file.
 
-## [1.1.33] - Current
+## [1.1.34] - Current
+
+### Added
+
+- **Diagnostic log collection on failure**: `collectDiagnosticLogs(namespace, outputDir?)` on `KubernetesClientHelper` captures comprehensive cluster state (events, pod status, deployments, statefulsets, routes, and per-container pod logs including init containers and previous restarts) to files under `node_modules/.cache/e2e-test-results/logs/<namespace>/`. Uses `kubectl` for cross-platform compatibility. Empty files (e.g. no previous logs) are not created.
+- **TeardownReporter collects diagnostics on test failure**: When any test in a project fails, the teardown reporter automatically calls `collectDiagnosticLogs` before namespace deletion. Diagnostic collection runs on both CI and local; namespace deletion remains CI-only.
+- **Per-container pod log collection**: Logs are collected per-container (init + app containers) instead of `--all-containers`, which fails entirely if any container hasn't started. Files saved to `pods/<pod-name>/<container>.log` and `pods/<pod-name>/<container>.previous.log`.
+
+### Changed
+
+- **TeardownReporter tracks test failures**: Added `_projectsWithFailures` set to track which projects had test failures, so diagnostic logs are only collected when needed.
+- **TeardownReporter active on non-CI**: The reporter now processes `onTestEnd`/`onEnd` regardless of `CI` env var. Log collection runs always; namespace deletion is still gated on `CI=true`.
+
+## [1.1.33]
 
 ### Added
 
 - **Automatic Vault secret loading for local development**: Set `VAULT=1` or `VAULT=true` to automatically fetch secrets from HashiCorp Vault during global setup. Handles OIDC login, fetches global and per-workspace secrets, and injects them into `process.env`. Only secret key names are logged, never values. Configurable via `VAULT_ADDR` and `VAULT_BASE_PATH` env vars. Logs a Slack channel (`#rhdh-e2e-tests`) when permission is denied.
 
+## [1.1.32]
+
 ### Fixed
 
 - **Normalize `-dynamic` suffix in `extractPluginName`**: Plugins whose metadata `dynamicArtifact` is a local path (ending in `-dynamic`) were not matched during PR OCI resolution or config injection, because the metadata map key included the `-dynamic` suffix while OCI URL lookups did not. `extractPluginName` now strips the `-dynamic` suffix so local paths and OCI refs for the same logical plugin produce the same key. ([RHDHBUGS-2987](https://issues.redhat.com/browse/RHDHBUGS-2987))
diff --git a/docs/guide/core-concepts/error-handling.md b/docs/guide/core-concepts/error-handling.md
index 91add9a..a3e99cd 100644
--- a/docs/guide/core-concepts/error-handling.md
+++ b/docs/guide/core-concepts/error-handling.md
@@ -280,6 +280,14 @@ await page.click('button[data-testid="save"]');
 await expect(page.getByText("Saved")).toBeVisible();
 ```
 
+## Cluster Diagnostic Logs
+
+When tests fail, the framework automatically collects cluster diagnostics (pod logs, events, deployments) to `node_modules/.cache/e2e-test-results/logs/<project-name>/`. This includes per-container logs for all pods (init and app containers), with previous restart logs when available.
+
+Check these files first when debugging deployment or pod failures — they're often more useful than Playwright's HTML report for infrastructure issues.
+
+See [Kubernetes Client — Diagnostic Log Collection](/guide/utilities/kubernetes-client#diagnostic-log-collection) for the full list of collected resources and API details.
+
 ## Error Handling Checklist
 
 - [ ] Use specific error messages that include context
diff --git a/docs/guide/utilities/kubernetes-client.md b/docs/guide/utilities/kubernetes-client.md
index 3be61af..c4ff805 100644
--- a/docs/guide/utilities/kubernetes-client.md
+++ b/docs/guide/utilities/kubernetes-client.md
@@ -121,6 +121,44 @@ When a failure is detected, the method:
 2. Fetches container logs via `oc logs`
 3. Throws an error with the failure details
 
+## Diagnostic Log Collection
+
+### `collectDiagnosticLogs(namespace, outputDir?)`
+
+Collects comprehensive cluster diagnostics and saves them to files. Uses `kubectl` for cross-platform compatibility (OpenShift, EKS, GKE, etc.). OpenShift-specific resources (routes) are collected on a best-effort basis.
+
+```typescript
+await k8sClient.collectDiagnosticLogs("my-namespace");
+// Saves to: node_modules/.cache/e2e-test-results/logs/my-namespace/
+
+// Or with a custom output directory:
+await k8sClient.collectDiagnosticLogs("my-namespace", "/tmp/debug-logs");
+```
+
+**Collected resources:**
+
+| File | Content |
+|------|---------|
+| `events.txt` | Namespace events sorted by timestamp |
+| `pods.txt` | Pod status (`kubectl get pods -o wide`) |
+| `describe-pods.txt` | Full pod descriptions |
+| `deployments.txt` | Deployment status |
+| `describe-deployments.txt` | Full deployment descriptions |
+| `statefulsets.txt` | StatefulSet status |
+| `routes.txt` | OpenShift routes (skipped on non-OpenShift clusters) |
+| `pods/<pod-name>/<container>.log` | Current logs per container (init + app) |
+| `pods/<pod-name>/<container>.previous.log` | Previous restart logs (only if pod restarted) |
+
+**Key behaviors:**
+- Logs are collected per-container rather than `--all-containers`, so a failed init container doesn't block collection of other container logs
+- Empty files are not created (e.g., when there are no previous logs)
+- Resource types that don't exist on the cluster (e.g., routes on non-OpenShift) are silently skipped
+- All resource collection runs in parallel via `Promise.allSettled`
+
+**Automatic collection on test failure:**
+
+In the overlay testing flow, you don't need to call this manually. The built-in `TeardownReporter` automatically calls `collectDiagnosticLogs` for any project that had test failures. This works on both CI and local runs.
+
 ## Deployment Operations
 
 ### `scaleDeployment(namespace, name, replicas)`
diff --git a/docs/overlay/reference/troubleshooting.md b/docs/overlay/reference/troubleshooting.md
index 64ceffa..781676c 100644
--- a/docs/overlay/reference/troubleshooting.md
+++ b/docs/overlay/reference/troubleshooting.md
@@ -271,6 +271,40 @@ oc login --token=<token> --server=<server>
 - Check route/service configuration
 - Verify network policies
 
+## Diagnostic Logs
+
+When tests fail, the `TeardownReporter` automatically collects cluster diagnostics and saves them to:
+
+```
+node_modules/.cache/e2e-test-results/logs/<project-name>/
+├── events.txt                 # Namespace events (sorted by time)
+├── pods.txt                   # Pod status
+├── describe-pods.txt          # Full pod descriptions
+├── deployments.txt            # Deployment status
+├── describe-deployments.txt
+├── statefulsets.txt
+├── routes.txt                 # OpenShift routes
+└── pods/
+    └── <pod-name>/
+        ├── <container>.log            # Current logs
+        └── <container>.previous.log   # Previous restart logs
+```
+
+This runs automatically on **both CI and local** — no configuration needed. Namespace deletion remains CI-only.
+
+**When using `run-e2e.sh`**, logs are written relative to the repo root. When running from a workspace (`cd workspaces/my-plugin/e2e-tests && yarn test`), they're relative to the `e2e-tests/` directory.
+
+**Logs are only collected for projects with failures.** If all tests pass, no diagnostic logs are written.
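+
+If your suite does not go through the overlay flow's default Playwright setup, the `TeardownReporter` must be registered as a reporter for this automatic collection to happen. A minimal sketch is below; the reporter module specifier used here is an assumption, so check the package's `exports` map for the exact subpath:
+
+```typescript
+// playwright.config.ts (sketch only; the reporter subpath below is assumed,
+// not something this package documents).
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  reporter: [
+    ["html"],
+    // Collects cluster diagnostics for failed projects and, on CI only,
+    // deletes the per-project namespaces afterwards.
+    ["@red-hat-developer-hub/e2e-test-utils/playwright/teardown-reporter"],
+  ],
+});
+```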
+
+To collect diagnostics manually (e.g., from a custom script):
+
+```typescript
+import { KubernetesClientHelper } from "@red-hat-developer-hub/e2e-test-utils/utils";
+
+const k8sClient = new KubernetesClientHelper();
+await k8sClient.collectDiagnosticLogs("my-namespace", "./my-logs");
+```
+
 ## Debugging Tips
 
 ### Use Headed Mode
diff --git a/package.json b/package.json
index d08a109..884f180 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@red-hat-developer-hub/e2e-test-utils",
-  "version": "1.1.33",
+  "version": "1.1.34",
   "description": "Test utilities for RHDH E2E tests",
   "license": "Apache-2.0",
   "repository": {
diff --git a/src/playwright/teardown-reporter.ts b/src/playwright/teardown-reporter.ts
index ed2f4ea..2f71029 100644
--- a/src/playwright/teardown-reporter.ts
+++ b/src/playwright/teardown-reporter.ts
@@ -4,6 +4,7 @@ import type {
   TestCase,
   TestResult,
 } from "@playwright/test/reporter";
+import path from "path";
 import { KubernetesClientHelper } from "../utils/kubernetes-client.js";
 import { getTeardownNamespaces } from "./teardown-namespaces.js";
 
@@ -18,7 +19,8 @@ import { getTeardownNamespaces } from "./teardown-namespaces.js";
  * Falls back in onEnd() to clean up any projects that didn't complete naturally
  * (e.g., interrupted runs, maxFailures).
  *
- * Only active when process.env.CI === "true".
+ * Diagnostic log collection runs always (CI and local).
+ * Namespace deletion only runs when process.env.CI === "true".
  *
  * By default, deletes the namespace matching the project name.
  * For custom namespaces, consumers can register them via registerTeardownNamespace().
@@ -26,6 +28,7 @@
 export default class TeardownReporter implements Reporter {
   private _projectTestCounts = new Map<string, number>();
   private _projectCompleted = new Map<string, number>();
+  private _projectsWithFailures = new Set<string>();
   private _pendingDeletions = new Map<string, Promise<void>>();
 
   onBegin(_config: unknown, suite: Suite): void {
@@ -42,8 +45,6 @@
   }
 
   onTestEnd(test: TestCase, result: TestResult): void {
-    if (process.env.CI !== "true") return;
-
     const project = test.parent.project();
     if (!project) return;
 
@@ -55,10 +56,15 @@
     if (!isDone) return;
 
     const name = project.name;
+
+    if (result.status !== "passed" && result.status !== "skipped") {
+      this._projectsWithFailures.add(name);
+    }
+
     const completed = (this._projectCompleted.get(name) ?? 0) + 1;
     this._projectCompleted.set(name, completed);
 
-    // Start deletion immediately (fire-and-forget here, awaited in onEnd)
+    // Start cleanup immediately (fire-and-forget here, awaited in onEnd)
     if (
       completed === this._projectTestCounts.get(name) &&
       !this._pendingDeletions.has(name)
     ) {
@@ -68,15 +74,14 @@
   }
 
   async onEnd(): Promise<void> {
-    if (process.env.CI !== "true") return;
-
-    // Await all in-flight deletions started from onTestEnd
+    // Await all in-flight cleanups started from onTestEnd
     await Promise.all(this._pendingDeletions.values());
 
     // Fallback: clean up projects that didn't complete naturally
-    // (e.g., interrupted run, maxFailures hit)
+    // (e.g., interrupted run, maxFailures hit) — always collect diagnostics
     for (const [project] of this._projectTestCounts) {
       if (!this._pendingDeletions.has(project)) {
+        this._projectsWithFailures.add(project);
         await this._deleteProjectNamespaces(project);
       }
     }
@@ -88,7 +93,7 @@
       k8sClient = new KubernetesClientHelper();
     } catch (error) {
       console.error(
-        `[TeardownReporter] Cannot connect to cluster, skipping teardown:`,
+        `[TeardownReporter] Cannot connect to cluster, skipping cleanup:`,
         error,
       );
       return;
@@ -98,11 +103,28 @@
     const namespaces =
       customNamespaces.length > 0 ? customNamespaces : [projectName];
 
-    for (const ns of namespaces) {
-      console.log(
-        `[TeardownReporter] Deleting namespace "${ns}" (project: ${projectName})`,
-      );
-      await k8sClient.deleteNamespace(ns);
+    // Collect diagnostic logs on failure (always, regardless of CI)
+    if (this._projectsWithFailures.has(projectName)) {
+      for (const ns of namespaces) {
+        const outputDir = path.join(
+          "node_modules",
+          ".cache",
+          "e2e-test-results",
+          "logs",
+          projectName,
+        );
+        await k8sClient.collectDiagnosticLogs(ns, outputDir);
+      }
+    }
+
+    // Delete namespaces only in CI
+    if (process.env.CI === "true") {
+      for (const ns of namespaces) {
+        console.log(
+          `[TeardownReporter] Deleting namespace "${ns}" (project: ${projectName})`,
+        );
+        await k8sClient.deleteNamespace(ns);
+      }
     }
   }
 }
diff --git a/src/utils/kubernetes-client.ts b/src/utils/kubernetes-client.ts
index ccfed3d..542e448 100644
--- a/src/utils/kubernetes-client.ts
+++ b/src/utils/kubernetes-client.ts
@@ -629,15 +629,12 @@ class KubernetesClientHelper {
       await new Promise((r) => setTimeout(r, pollIntervalMs));
     }
 
-    // Timeout reached - collect diagnostic info before throwing
+    // Timeout reached - print diagnostics to stdio before throwing
     console.log(`\n[K8sHelper] ═══ Pod Diagnostics (timeout reached) ═══`);
     try {
       console.log(`\n[K8sHelper] ─── Pod Status ───`);
       await $`oc get pods -n ${namespace} -l ${labelSelector} -o wide`;
 
-      console.log(`\n[K8sHelper] ─── Namespace Events ───`);
-      await $`oc get events -n ${namespace} --sort-by='.lastTimestamp'`;
-
       console.log(`\n[K8sHelper] ─── Pod Logs ───`);
       await $`oc logs -n ${namespace} -l ${labelSelector} --all-containers --tail=100 2>&1 || true`;
     } catch {
@@ -650,6 +647,120 @@ class KubernetesClientHelper {
     );
   }
 
+  /**
+   * Collects diagnostic logs for all resources in a namespace and saves them as files.
+   * Uses kubectl for cross-platform compatibility (works on OpenShift, EKS, GKE, etc.).
+   * OpenShift-specific resources (routes) are collected on a best-effort basis.
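+   *
+   * Illustrative usage (the namespace value below is just an example):
+   * ```ts
+   * const k8s = new KubernetesClientHelper();
+   * await k8s.collectDiagnosticLogs("rhdh-e2e");
+   * // writes under node_modules/.cache/e2e-test-results/logs/rhdh-e2e/
+   * ```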
+   *
+   * @param namespace - Namespace to collect diagnostics from
+   * @param outputDir - Directory to write log files to (defaults to `node_modules/.cache/e2e-test-results/logs/<namespace>`)
+   */
+  async collectDiagnosticLogs(
+    namespace: string,
+    outputDir: string = path.join(
+      "node_modules",
+      ".cache",
+      "e2e-test-results",
+      "logs",
+      namespace,
+    ),
+  ): Promise<void> {
+    fs.mkdirSync(outputDir, { recursive: true });
+    console.log(
+      `[K8sHelper] Collecting diagnostic logs for "${namespace}" → ${outputDir}`,
+    );
+    const quiet = $({
+      stdio: ["pipe", "pipe", "pipe"],
+      timeout: "20s",
+    });
+
+    const save = async (filePath: string, cmd: Promise<{ stdout: string }>) => {
+      try {
+        const result = await cmd;
+        fs.mkdirSync(path.dirname(filePath), { recursive: true });
+        fs.writeFileSync(filePath, result.stdout);
+      } catch {
+        // ignore — resource type may not exist on this cluster
+      }
+    };
+
+    await Promise.allSettled([
+      save(
+        path.join(outputDir, "events.txt"),
+        quiet`kubectl get events -n ${namespace} --sort-by='.lastTimestamp'`,
+      ),
+      save(
+        path.join(outputDir, "pods.txt"),
+        quiet`kubectl get pods -n ${namespace} -o wide`,
+      ),
+      save(
+        path.join(outputDir, "describe-pods.txt"),
+        quiet`kubectl describe pods -n ${namespace}`,
+      ),
+      save(
+        path.join(outputDir, "deployments.txt"),
+        quiet`kubectl get deployments -n ${namespace} -o wide`,
+      ),
+      save(
+        path.join(outputDir, "describe-deployments.txt"),
+        quiet`kubectl describe deployments -n ${namespace}`,
+      ),
+      save(
+        path.join(outputDir, "statefulsets.txt"),
+        quiet`kubectl get statefulsets -n ${namespace} -o wide`,
+      ),
+      save(
+        path.join(outputDir, "routes.txt"),
+        quiet`kubectl get routes -n ${namespace} -o wide`,
+      ),
+    ]);
+
+    try {
+      const pods = (await this._k8sApi.listNamespacedPod({ namespace })).items;
+      const saveLogs = async (
+        filePath: string,
+        cmd: Promise<{ stdout: string }>,
+      ) => {
+        try {
+          const result = await cmd;
+          if (result.stdout.trim()) {
+            fs.mkdirSync(path.dirname(filePath), { recursive: true });
+            fs.writeFileSync(filePath, result.stdout);
+          }
+        } catch {
+          // ignore — container may not have started or no previous logs
+        }
+      };
+
+      await Promise.allSettled(
+        pods
+          .filter((pod) => pod.metadata?.name)
+          .flatMap((pod) => {
+            const podName = pod.metadata!.name!;
+            const podDir = path.join(outputDir, "pods", podName);
+            const containers = [
+              ...(pod.spec?.initContainers ?? []),
+              ...(pod.spec?.containers ?? []),
+            ];
+            return containers
+              .filter((c) => c.name)
+              .flatMap((c) => [
+                saveLogs(
+                  path.join(podDir, `${c.name}.log`),
+                  quiet`kubectl logs ${podName} -n ${namespace} -c ${c.name}`,
+                ),
+                saveLogs(
+                  path.join(podDir, `${c.name}.previous.log`),
+                  quiet`kubectl logs ${podName} -n ${namespace} -c ${c.name} --previous`,
+                ),
+              ]);
+          }),
+      );
+    } catch {
+      // ignore
+    }
+  }
+
   /**
    * Check if a pod is in a failure state. Returns failure info or null if healthy.
    */