From aae3956f205410c952d14800dccf44e07789e292 Mon Sep 17 00:00:00 2001
From: Subhash Khileri
Date: Mon, 27 Apr 2026 14:15:18 +0530
Subject: [PATCH] feat: add diagnostic log collection on test failure

Adds collectDiagnosticLogs() to KubernetesClientHelper that captures cluster
state (events, pods, deployments, statefulsets, routes, per-container pod
logs) to files on test failure. TeardownReporter now tracks failed projects
and collects diagnostics before namespace deletion. Log collection runs on
both CI and local; namespace deletion remains CI-only.

Bumps version to 1.1.34.
---
 docs/.vitepress/config.ts                  |   2 +-
 docs/changelog.md                          |  17 ++-
 docs/guide/core-concepts/error-handling.md |   8 ++
 docs/guide/utilities/kubernetes-client.md  |  38 +++++++
 docs/overlay/reference/troubleshooting.md  |  34 ++++++
 package.json                               |   2 +-
 src/playwright/teardown-reporter.ts        |  50 ++++++---
 src/utils/kubernetes-client.ts             | 119 ++++++++++++++++++++-
 8 files changed, 249 insertions(+), 21 deletions(-)

diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts
index 04df3c2..7997a0e 100644
--- a/docs/.vitepress/config.ts
+++ b/docs/.vitepress/config.ts
@@ -33,7 +33,7 @@ export default defineConfig({
       { text: "Examples", link: "/examples/" },
       { text: "Overlay Testing", link: "/overlay/" },
       {
-        text: "v1.1.33",
+        text: "v1.1.34",
         items: [{ text: "Changelog", link: "/changelog" }],
       },
     ],
diff --git a/docs/changelog.md b/docs/changelog.md
index 1ea1c57..41a6393 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -2,12 +2,27 @@
 
 All notable changes to this project will be documented in this file.
 
-## [1.1.33] - Current
+## [1.1.34] - Current
+
+### Added
+
+- **Diagnostic log collection on failure**: `collectDiagnosticLogs(namespace, outputDir?)` on `KubernetesClientHelper` captures comprehensive cluster state (events, pod status, deployments, statefulsets, routes, and per-container pod logs including init containers and previous restarts) to files under `node_modules/.cache/e2e-test-results/logs/<namespace>/`. Uses `kubectl` for cross-platform compatibility. Empty files (e.g. no previous logs) are not created.
+- **TeardownReporter collects diagnostics on test failure**: When any test in a project fails, the teardown reporter automatically calls `collectDiagnosticLogs` before namespace deletion. Diagnostic collection runs on both CI and local; namespace deletion remains CI-only.
+- **Per-container pod log collection**: Logs are collected per-container (init + app containers) instead of `--all-containers`, which fails entirely if any container hasn't started. Files saved to `pods/<pod-name>/<container>.log` and `pods/<pod-name>/<container>.previous.log`.
+
+### Changed
+
+- **TeardownReporter tracks test failures**: Added `_projectsWithFailures` set to track which projects had test failures, so diagnostic logs are only collected when needed.
+- **TeardownReporter active on non-CI**: The reporter now processes `onTestEnd`/`onEnd` regardless of `CI` env var. Log collection runs always; namespace deletion is still gated on `CI=true`.
+
+## [1.1.33]
 
 ### Added
 
 - **Automatic Vault secret loading for local development**: Set `VAULT=1` or `VAULT=true` to automatically fetch secrets from HashiCorp Vault during global setup. Handles OIDC login, fetches global and per-workspace secrets, and injects them into `process.env`. Only secret key names are logged, never values. Configurable via `VAULT_ADDR` and `VAULT_BASE_PATH` env vars. Logs a Slack channel (`#rhdh-e2e-tests`) when permission is denied.
 
+## [1.1.32]
+
 ### Fixed
 
 - **Normalize `-dynamic` suffix in `extractPluginName`**: Plugins whose metadata `dynamicArtifact` is a local path (ending in `-dynamic`) were not matched during PR OCI resolution or config injection, because the metadata map key included the `-dynamic` suffix while OCI URL lookups did not. `extractPluginName` now strips the `-dynamic` suffix so local paths and OCI refs for the same logical plugin produce the same key. ([RHDHBUGS-2987](https://issues.redhat.com/browse/RHDHBUGS-2987))
diff --git a/docs/guide/core-concepts/error-handling.md b/docs/guide/core-concepts/error-handling.md
index 91add9a..a3e99cd 100644
--- a/docs/guide/core-concepts/error-handling.md
+++ b/docs/guide/core-concepts/error-handling.md
@@ -280,6 +280,14 @@ await page.click('button[data-testid="save"]');
 await expect(page.getByText("Saved")).toBeVisible();
 ```
 
+## Cluster Diagnostic Logs
+
+When tests fail, the framework automatically collects cluster diagnostics (pod logs, events, deployments) to `node_modules/.cache/e2e-test-results/logs/<project-name>/`. This includes per-container logs for all pods (init and app containers), with previous restart logs when available.
+
+Check these files first when debugging deployment or pod failures — they're often more useful than Playwright's HTML report for infrastructure issues.
+
+See [Kubernetes Client — Diagnostic Log Collection](/guide/utilities/kubernetes-client#diagnostic-log-collection) for the full list of collected resources and API details.
+
 ## Error Handling Checklist
 
 - [ ] Use specific error messages that include context
diff --git a/docs/guide/utilities/kubernetes-client.md b/docs/guide/utilities/kubernetes-client.md
index 3be61af..c4ff805 100644
--- a/docs/guide/utilities/kubernetes-client.md
+++ b/docs/guide/utilities/kubernetes-client.md
@@ -121,6 +121,44 @@ When a failure is detected, the method:
 2. Fetches container logs via `oc logs`
 3. Throws an error with the failure details
 
+## Diagnostic Log Collection
+
+### `collectDiagnosticLogs(namespace, outputDir?)`
+
+Collects comprehensive cluster diagnostics and saves them to files. Uses `kubectl` for cross-platform compatibility (OpenShift, EKS, GKE, etc.). OpenShift-specific resources (routes) are collected on a best-effort basis.
+
+```typescript
+await k8sClient.collectDiagnosticLogs("my-namespace");
+// Saves to: node_modules/.cache/e2e-test-results/logs/my-namespace/
+
+// Or with a custom output directory:
+await k8sClient.collectDiagnosticLogs("my-namespace", "/tmp/debug-logs");
+```
+
+**Collected resources:**
+
+| File | Content |
+|------|---------|
+| `events.txt` | Namespace events sorted by timestamp |
+| `pods.txt` | Pod status (`kubectl get pods -o wide`) |
+| `describe-pods.txt` | Full pod descriptions |
+| `deployments.txt` | Deployment status |
+| `describe-deployments.txt` | Full deployment descriptions |
+| `statefulsets.txt` | StatefulSet status |
+| `routes.txt` | OpenShift routes (skipped on non-OpenShift clusters) |
+| `pods/<pod-name>/<container>.log` | Current logs per container (init + app) |
+| `pods/<pod-name>/<container>.previous.log` | Previous restart logs (only if pod restarted) |
+
+**Key behaviors:**
+- Logs are collected per-container rather than `--all-containers`, so a failed init container doesn't block collection of other container logs
+- Empty files are not created (e.g., when there are no previous logs)
+- Resource types that don't exist on the cluster (e.g., routes on non-OpenShift) are silently skipped
+- All resource collection runs in parallel via `Promise.allSettled`
+
+**Automatic collection on test failure:**
+
+In the overlay testing flow, you don't need to call this manually. The built-in `TeardownReporter` automatically calls `collectDiagnosticLogs` for any project that had test failures. This works on both CI and local runs.
+
 ## Deployment Operations
 
 ### `scaleDeployment(namespace, name, replicas)`
diff --git a/docs/overlay/reference/troubleshooting.md b/docs/overlay/reference/troubleshooting.md
index 64ceffa..781676c 100644
--- a/docs/overlay/reference/troubleshooting.md
+++ b/docs/overlay/reference/troubleshooting.md
@@ -271,6 +271,40 @@ oc login --token=<token> --server=<server>
 - Check route/service configuration
 - Verify network policies
 
+## Diagnostic Logs
+
+When tests fail, the `TeardownReporter` automatically collects cluster diagnostics and saves them to:
+
+```
+node_modules/.cache/e2e-test-results/logs/<project-name>/
+├── events.txt                 # Namespace events (sorted by time)
+├── pods.txt                   # Pod status
+├── describe-pods.txt          # Full pod descriptions
+├── deployments.txt            # Deployment status
+├── describe-deployments.txt
+├── statefulsets.txt
+├── routes.txt                 # OpenShift routes
+└── pods/
+    └── <pod-name>/
+        ├── <container>.log            # Current logs
+        └── <container>.previous.log   # Previous restart logs
+```
+
+This runs automatically on **both CI and local** — no configuration needed. Namespace deletion remains CI-only.
+
+**When using `run-e2e.sh`**, logs are written relative to the repo root. When running from a workspace (`cd workspaces/my-plugin/e2e-tests && yarn test`), they're relative to the `e2e-tests/` directory.
+
+**Logs are only collected for projects with failures.** If all tests pass, no diagnostic logs are written.
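+
+If your suite does not go through the overlay flow's default Playwright setup, the `TeardownReporter` must be registered as a reporter for this automatic collection to happen. A minimal sketch is below; the reporter module specifier used here is an assumption, so check the package's `exports` map for the exact subpath:
+
+```typescript
+// playwright.config.ts (sketch only; the reporter subpath below is assumed,
+// not something this package documents).
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  reporter: [
+    ["html"],
+    // Collects cluster diagnostics for failed projects and, on CI only,
+    // deletes the per-project namespaces afterwards.
+    ["@red-hat-developer-hub/e2e-test-utils/playwright/teardown-reporter"],
+  ],
+});
+```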
+
+To collect diagnostics manually (e.g., from a custom script):
+
+```typescript
+import { KubernetesClientHelper } from "@red-hat-developer-hub/e2e-test-utils/utils";
+
+const k8sClient = new KubernetesClientHelper();
+await k8sClient.collectDiagnosticLogs("my-namespace", "./my-logs");
+```
+
 ## Debugging Tips
 
 ### Use Headed Mode
diff --git a/package.json b/package.json
index d08a109..884f180 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@red-hat-developer-hub/e2e-test-utils",
-  "version": "1.1.33",
+  "version": "1.1.34",
   "description": "Test utilities for RHDH E2E tests",
   "license": "Apache-2.0",
   "repository": {
diff --git a/src/playwright/teardown-reporter.ts b/src/playwright/teardown-reporter.ts
index ed2f4ea..2f71029 100644
--- a/src/playwright/teardown-reporter.ts
+++ b/src/playwright/teardown-reporter.ts
@@ -4,6 +4,7 @@ import type {
   TestCase,
   TestResult,
 } from "@playwright/test/reporter";
+import path from "path";
 import { KubernetesClientHelper } from "../utils/kubernetes-client.js";
 import { getTeardownNamespaces } from "./teardown-namespaces.js";
 
@@ -18,7 +19,8 @@ import { getTeardownNamespaces } from "./teardown-namespaces.js";
  * Falls back in onEnd() to clean up any projects that didn't complete naturally
  * (e.g., interrupted runs, maxFailures).
  *
- * Only active when process.env.CI === "true".
+ * Diagnostic log collection runs always (CI and local).
+ * Namespace deletion only runs when process.env.CI === "true".
  *
  * By default, deletes the namespace matching the project name.
  * For custom namespaces, consumers can register them via registerTeardownNamespace().
@@ -26,6 +28,7 @@
 export default class TeardownReporter implements Reporter {
   private _projectTestCounts = new Map<string, number>();
   private _projectCompleted = new Map<string, number>();
+  private _projectsWithFailures = new Set<string>();
   private _pendingDeletions = new Map<string, Promise<void>>();
 
   onBegin(_config: unknown, suite: Suite): void {
@@ -42,8 +45,6 @@
   }
 
   onTestEnd(test: TestCase, result: TestResult): void {
-    if (process.env.CI !== "true") return;
-
     const project = test.parent.project();
     if (!project) return;
 
@@ -55,10 +56,15 @@
     if (!isDone) return;
 
     const name = project.name;
+
+    if (result.status !== "passed" && result.status !== "skipped") {
+      this._projectsWithFailures.add(name);
+    }
+
     const completed = (this._projectCompleted.get(name) ?? 0) + 1;
     this._projectCompleted.set(name, completed);
 
-    // Start deletion immediately (fire-and-forget here, awaited in onEnd)
+    // Start cleanup immediately (fire-and-forget here, awaited in onEnd)
     if (
       completed === this._projectTestCounts.get(name) &&
       !this._pendingDeletions.has(name)
     ) {
@@ -68,15 +74,14 @@
   }
 
   async onEnd(): Promise<void> {
-    if (process.env.CI !== "true") return;
-
-    // Await all in-flight deletions started from onTestEnd
+    // Await all in-flight cleanups started from onTestEnd
     await Promise.all(this._pendingDeletions.values());
 
     // Fallback: clean up projects that didn't complete naturally
-    // (e.g., interrupted run, maxFailures hit)
+    // (e.g., interrupted run, maxFailures hit) — always collect diagnostics
     for (const [project] of this._projectTestCounts) {
       if (!this._pendingDeletions.has(project)) {
+        this._projectsWithFailures.add(project);
         await this._deleteProjectNamespaces(project);
       }
     }
@@ -88,7 +93,7 @@
       k8sClient = new KubernetesClientHelper();
     } catch (error) {
       console.error(
-        `[TeardownReporter] Cannot connect to cluster, skipping teardown:`,
+        `[TeardownReporter] Cannot connect to cluster, skipping cleanup:`,
         error,
       );
       return;
@@ -98,11 +103,28 @@
     const namespaces =
       customNamespaces.length > 0 ? customNamespaces : [projectName];
 
-    for (const ns of namespaces) {
-      console.log(
-        `[TeardownReporter] Deleting namespace "${ns}" (project: ${projectName})`,
-      );
-      await k8sClient.deleteNamespace(ns);
+    // Collect diagnostic logs on failure (always, regardless of CI)
+    if (this._projectsWithFailures.has(projectName)) {
+      for (const ns of namespaces) {
+        const outputDir = path.join(
+          "node_modules",
+          ".cache",
+          "e2e-test-results",
+          "logs",
+          projectName,
+        );
+        await k8sClient.collectDiagnosticLogs(ns, outputDir);
+      }
+    }
+
+    // Delete namespaces only in CI
+    if (process.env.CI === "true") {
+      for (const ns of namespaces) {
+        console.log(
+          `[TeardownReporter] Deleting namespace "${ns}" (project: ${projectName})`,
+        );
+        await k8sClient.deleteNamespace(ns);
+      }
     }
   }
 }
diff --git a/src/utils/kubernetes-client.ts b/src/utils/kubernetes-client.ts
index ccfed3d..542e448 100644
--- a/src/utils/kubernetes-client.ts
+++ b/src/utils/kubernetes-client.ts
@@ -629,15 +629,12 @@ class KubernetesClientHelper {
       await new Promise((r) => setTimeout(r, pollIntervalMs));
     }
 
-    // Timeout reached - collect diagnostic info before throwing
+    // Timeout reached - print diagnostics to stdio before throwing
     console.log(`\n[K8sHelper] ═══ Pod Diagnostics (timeout reached) ═══`);
     try {
       console.log(`\n[K8sHelper] ─── Pod Status ───`);
       await $`oc get pods -n ${namespace} -l ${labelSelector} -o wide`;
 
-      console.log(`\n[K8sHelper] ─── Namespace Events ───`);
-      await $`oc get events -n ${namespace} --sort-by='.lastTimestamp'`;
-
       console.log(`\n[K8sHelper] ─── Pod Logs ───`);
       await $`oc logs -n ${namespace} -l ${labelSelector} --all-containers --tail=100 2>&1 || true`;
     } catch {
@@ -650,6 +647,120 @@ class KubernetesClientHelper {
     );
   }
 
+  /**
+   * Collects diagnostic logs for all resources in a namespace and saves them as files.
+   * Uses kubectl for cross-platform compatibility (works on OpenShift, EKS, GKE, etc.).
+   * OpenShift-specific resources (routes) are collected on a best-effort basis.
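+   *
+   * Illustrative usage (the namespace value below is just an example):
+   * ```ts
+   * const k8s = new KubernetesClientHelper();
+   * await k8s.collectDiagnosticLogs("rhdh-e2e");
+   * // writes under node_modules/.cache/e2e-test-results/logs/rhdh-e2e/
+   * ```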
+   *
+   * @param namespace - Namespace to collect diagnostics from
+   * @param outputDir - Directory to write log files to (defaults to `node_modules/.cache/e2e-test-results/logs/<namespace>`)
+   */
+  async collectDiagnosticLogs(
+    namespace: string,
+    outputDir: string = path.join(
+      "node_modules",
+      ".cache",
+      "e2e-test-results",
+      "logs",
+      namespace,
+    ),
+  ): Promise<void> {
+    fs.mkdirSync(outputDir, { recursive: true });
+    console.log(
+      `[K8sHelper] Collecting diagnostic logs for "${namespace}" → ${outputDir}`,
+    );
+    const quiet = $({
+      stdio: ["pipe", "pipe", "pipe"],
+      timeout: "20s",
+    });
+
+    const save = async (filePath: string, cmd: Promise<{ stdout: string }>) => {
+      try {
+        const result = await cmd;
+        fs.mkdirSync(path.dirname(filePath), { recursive: true });
+        fs.writeFileSync(filePath, result.stdout);
+      } catch {
+        // ignore — resource type may not exist on this cluster
+      }
+    };
+
+    await Promise.allSettled([
+      save(
+        path.join(outputDir, "events.txt"),
+        quiet`kubectl get events -n ${namespace} --sort-by='.lastTimestamp'`,
+      ),
+      save(
+        path.join(outputDir, "pods.txt"),
+        quiet`kubectl get pods -n ${namespace} -o wide`,
+      ),
+      save(
+        path.join(outputDir, "describe-pods.txt"),
+        quiet`kubectl describe pods -n ${namespace}`,
+      ),
+      save(
+        path.join(outputDir, "deployments.txt"),
+        quiet`kubectl get deployments -n ${namespace} -o wide`,
+      ),
+      save(
+        path.join(outputDir, "describe-deployments.txt"),
+        quiet`kubectl describe deployments -n ${namespace}`,
+      ),
+      save(
+        path.join(outputDir, "statefulsets.txt"),
+        quiet`kubectl get statefulsets -n ${namespace} -o wide`,
+      ),
+      save(
+        path.join(outputDir, "routes.txt"),
+        quiet`kubectl get routes -n ${namespace} -o wide`,
+      ),
+    ]);
+
+    try {
+      const pods = (await this._k8sApi.listNamespacedPod({ namespace })).items;
+      const saveLogs = async (
+        filePath: string,
+        cmd: Promise<{ stdout: string }>,
+      ) => {
+        try {
+          const result = await cmd;
+          if (result.stdout.trim()) {
+            fs.mkdirSync(path.dirname(filePath), { recursive: true });
+            fs.writeFileSync(filePath, result.stdout);
+          }
+        } catch {
+          // ignore — container may not have started or no previous logs
+        }
+      };
+
+      await Promise.allSettled(
+        pods
+          .filter((pod) => pod.metadata?.name)
+          .flatMap((pod) => {
+            const podName = pod.metadata!.name!;
+            const podDir = path.join(outputDir, "pods", podName);
+            const containers = [
+              ...(pod.spec?.initContainers ?? []),
+              ...(pod.spec?.containers ?? []),
+            ];
+            return containers
+              .filter((c) => c.name)
+              .flatMap((c) => [
+                saveLogs(
+                  path.join(podDir, `${c.name}.log`),
+                  quiet`kubectl logs ${podName} -n ${namespace} -c ${c.name}`,
+                ),
+                saveLogs(
+                  path.join(podDir, `${c.name}.previous.log`),
+                  quiet`kubectl logs ${podName} -n ${namespace} -c ${c.name} --previous`,
+                ),
+              ]);
+          }),
+      );
+    } catch {
+      // ignore
+    }
+  }
+
   /**
    * Check if a pod is in a failure state. Returns failure info or null if healthy.
    */