From 5ffe0fada02f3d65a61dbb70f55b4b01fbd62043 Mon Sep 17 00:00:00 2001 From: Jonathan Haas Date: Sun, 8 Mar 2026 19:45:24 -0700 Subject: [PATCH 1/4] ci: add nightly no-cache checks, flake artifacts, RC smoke, and failure triage --- .github/scripts/apply_branch_protection.sh | 1 + .github/scripts/collect_flake_metrics.sh | 124 ++++++++++++++++ .github/scripts/create_ci_failure_issue.sh | 97 ++++++++++++ .github/workflows/ci.yml | 100 ++++++++++++- .github/workflows/release-candidate-smoke.yml | 140 ++++++++++++++++++ AGENTS.md | 3 + README.md | 4 + 7 files changed, 463 insertions(+), 6 deletions(-) create mode 100755 .github/scripts/collect_flake_metrics.sh create mode 100755 .github/scripts/create_ci_failure_issue.sh create mode 100644 .github/workflows/release-candidate-smoke.yml diff --git a/.github/scripts/apply_branch_protection.sh b/.github/scripts/apply_branch_protection.sh index d9c2988..f85e3fd 100755 --- a/.github/scripts/apply_branch_protection.sh +++ b/.github/scripts/apply_branch_protection.sh @@ -25,6 +25,7 @@ cat >"$payload_file" </dev/null 2>&1; then + echo "Missing dependency: $1" >&2 + exit 1 + fi +} + +require_bin gh +require_bin jq + +if [[ -z "${GH_TOKEN:-}" ]]; then + echo "GH_TOKEN is required" >&2 + exit 1 +fi +if [[ -z "${REPO:-}" ]]; then + echo "REPO is required (owner/repo)" >&2 + exit 1 +fi +if [[ -z "${RUN_ID:-}" ]]; then + echo "RUN_ID is required" >&2 + exit 1 +fi + +output_dir="${OUTPUT_DIR:-flake-metrics}" +workflow_file="${WORKFLOW_FILE:-ci.yml}" +branch="${BRANCH:-main}" +trend_limit="${TREND_RUN_LIMIT:-30}" + +mkdir -p "${output_dir}" + +jobs_file="$(mktemp)" +runs_file="$(mktemp)" +trap 'rm -f "${jobs_file}" "${runs_file}"' EXIT + +# Current run snapshot for integration/perf-smoke status and timing. +gh api "repos/${REPO}/actions/runs/${RUN_ID}/jobs?per_page=100" >"${jobs_file}" + +jq -n \ + --arg generated_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg repo "${REPO}" \ + --argjson run_id "${RUN_ID}" \ + --arg run_url "https://github.com/${REPO}/actions/runs/${RUN_ID}" \ + --slurpfile jobs "${jobs_file}" \ + '{ + generated_at: $generated_at, + repository: $repo, + run_id: $run_id, + run_url: $run_url, + jobs: ($jobs[0].jobs + | map(select(.name == "integration" or .name == "perf-smoke") + | { + name: .name, + status: .status, + conclusion: .conclusion, + started_at: .started_at, + completed_at: .completed_at, + duration_seconds: (if (.started_at != null and .completed_at != null) + then ((.completed_at | fromdateiso8601) - (.started_at | fromdateiso8601)) + else null + end) + })) + }' >"${output_dir}/current.json" + +printf 'run_id,run_number,run_created_at,run_conclusion,run_url,job_name,job_conclusion,job_started_at,job_completed_at,duration_seconds\n' >"${output_dir}/trend.csv" + +# Trend window over recent CI runs on main. +gh api "repos/${REPO}/actions/workflows/${workflow_file}/runs?branch=${branch}&per_page=${trend_limit}" >"${runs_file}" + +while IFS=$'\t' read -r run_id run_number run_created_at run_conclusion run_url; do + run_jobs_file="$(mktemp)" + gh api "repos/${REPO}/actions/runs/${run_id}/jobs?per_page=100" >"${run_jobs_file}" + + jq -r \ + --arg run_id "${run_id}" \ + --arg run_number "${run_number}" \ + --arg run_created_at "${run_created_at}" \ + --arg run_conclusion "${run_conclusion}" \ + --arg run_url "${run_url}" \ + '.jobs[] + | select(.name == "integration" or .name == "perf-smoke") + | [ + $run_id, + $run_number, + $run_created_at, + $run_conclusion, + $run_url, + .name, + (.conclusion // "unknown"), + (.started_at // ""), + (.completed_at // ""), + (if (.started_at != null and .completed_at != null) + then (((.completed_at | fromdateiso8601) - (.started_at | fromdateiso8601)) | tostring) + else "" + end) + ] + | @csv' "${run_jobs_file}" >>"${output_dir}/trend.csv" + + rm -f "${run_jobs_file}" +done < <(jq -r '.workflow_runs[] | [.id, .run_number, .created_at, .conclusion, .html_url] | @tsv' "${runs_file}") + +jq -n \ + --arg generated_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg repo "${REPO}" \ + --arg branch "${branch}" \ + --argjson run_id "${RUN_ID}" \ + --arg workflow_file "${workflow_file}" \ + --arg trend_csv "${output_dir}/trend.csv" \ + --arg current_json "${output_dir}/current.json" \ + '{ + generated_at: $generated_at, + repository: $repo, + branch: $branch, + source_run_id: $run_id, + workflow_file: $workflow_file, + artifacts: { + current: $current_json, + trend: $trend_csv + } + }' >"${output_dir}/manifest.json" + +echo "Flake metrics written to ${output_dir}" diff --git a/.github/scripts/create_ci_failure_issue.sh b/.github/scripts/create_ci_failure_issue.sh new file mode 100755 index 0000000..502b116 --- /dev/null +++ b/.github/scripts/create_ci_failure_issue.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -euo pipefail + +require_bin() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "Missing dependency: $1" >&2 + exit 1 + fi +} + +require_bin gh +require_bin jq + +if [[ -z "${GH_TOKEN:-}" ]]; then + echo "GH_TOKEN is required" >&2 + exit 1 +fi +if [[ -z "${REPO:-}" ]]; then + echo "REPO is required (owner/repo)" >&2 + exit 1 +fi +if [[ -z "${RUN_ID:-}" ]]; then + echo "RUN_ID is required" >&2 + exit 1 +fi + +run_file="$(mktemp)" +jobs_file="$(mktemp)" +failed_file="$(mktemp)" +body_file="$(mktemp)" +work_dir="$(mktemp -d)" +trap 'rm -f "${run_file}" "${jobs_file}" "${failed_file}" "${body_file}"; rm -rf "${work_dir}"' EXIT + +gh api "repos/${REPO}/actions/runs/${RUN_ID}" >"${run_file}" +gh api "repos/${REPO}/actions/runs/${RUN_ID}/jobs?per_page=100" >"${jobs_file}" + +jq -r '.jobs[] | select(.conclusion == "failure") | [.id, .name, .html_url] | @tsv' "${jobs_file}" >"${failed_file}" + +if [[ ! -s "${failed_file}" ]]; then + echo "No failed jobs found for run ${RUN_ID}; skipping issue creation" + exit 0 +fi + +title="CI failure on main: run ${RUN_ID}" +existing_issue="$(gh issue list -R "${REPO}" --state open --search "\"${title}\" in:title" --json number --jq '.[0].number // empty')" +if [[ -n "${existing_issue}" ]]; then + echo "Issue #${existing_issue} already exists for run ${RUN_ID}; skipping" + exit 0 +fi + +# Ensure label exists for triage routing. +gh label create ci-failure -R "${REPO}" --description "Automated CI failure reports on main" --color B60205 2>/dev/null || true + +run_url="$(jq -r '.html_url' "${run_file}")" +head_sha="$(jq -r '.head_sha' "${run_file}")" +run_attempt="$(jq -r '.run_attempt' "${run_file}")" +created_at="$(jq -r '.created_at' "${run_file}")" + +{ + echo "Automated CI failure report for a \\`main\\` push run." + echo + echo "- Run: ${run_url}" + echo "- Run ID: ${RUN_ID}" + echo "- Attempt: ${run_attempt}" + echo "- Head SHA: \\`${head_sha}\\`" + echo "- Created At: ${created_at}" + echo + echo "## Failed Jobs" + while IFS=$'\t' read -r job_id job_name job_url; do + echo "- [${job_name}](${job_url}) (job id: ${job_id})" + done <"${failed_file}" +} >"${body_file}" + +while IFS=$'\t' read -r job_id job_name _job_url; do + log_file="${work_dir}/job-${job_id}.log" + if ! gh run view "${RUN_ID}" -R "${REPO}" --job "${job_id}" --log-failed >"${log_file}" 2>/dev/null; then + gh run view "${RUN_ID}" -R "${REPO}" --job "${job_id}" --log >"${log_file}" 2>/dev/null || true + fi + + { + echo + echo "
${job_name} failed log (truncated)" + echo + echo "\\`\\`\\`text" + if [[ -s "${log_file}" ]]; then + head -n 220 "${log_file}" | sed 's/\r$//' + else + echo "No log output available for job ${job_id}." + fi + echo "\\`\\`\\`" + echo + echo "
" + } >>"${body_file}" +done <"${failed_file}" + +issue_url="$(gh issue create -R "${REPO}" --title "${title}" --body-file "${body_file}" --label ci-failure)" +echo "Created CI failure issue: ${issue_url}" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index eb5bab3..5532493 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,6 +4,10 @@ on: push: branches: ["main"] pull_request: + workflow_dispatch: + schedule: + # Nightly cache-bypass run to detect toolchain/cache drift. + - cron: "17 9 * * *" concurrency: group: ci-${{ github.ref }} @@ -12,6 +16,7 @@ concurrency: env: GOTOOLCHAIN: go1.26.1 TOOLS_BIN: ${{ github.workspace }}/.cache/tools/bin + DISABLE_TOOL_CACHE: ${{ github.event_name == 'schedule' && 'true' || 'false' }} STATICCHECK_VERSION: v0.6.1 GOSEC_VERSION: v2.23.0 GOVULNCHECK_VERSION: v1.1.4 @@ -62,14 +67,21 @@ jobs: go-version: "1.26.1" - name: Restore tool cache + if: env.DISABLE_TOOL_CACHE != 'true' id: staticcheck-cache uses: actions/cache@v4 with: path: ${{ env.TOOLS_BIN }} key: staticcheck-${{ runner.os }}-${{ env.GOTOOLCHAIN }}-${{ env.STATICCHECK_VERSION }} - - name: Install staticcheck - if: steps.staticcheck-cache.outputs.cache-hit != 'true' + - name: Install staticcheck (no cache) + if: env.DISABLE_TOOL_CACHE == 'true' + run: | + mkdir -p "${TOOLS_BIN}" + GOBIN="${TOOLS_BIN}" go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION} + + - name: Install staticcheck (cache miss) + if: env.DISABLE_TOOL_CACHE != 'true' && steps.staticcheck-cache.outputs.cache-hit != 'true' run: | mkdir -p "${TOOLS_BIN}" GOBIN="${TOOLS_BIN}" go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION} @@ -162,14 +174,21 @@ jobs: go-version: "1.26.1" - name: Restore tool cache + if: env.DISABLE_TOOL_CACHE != 'true' id: gosec-cache uses: actions/cache@v4 with: path: ${{ env.TOOLS_BIN }} key: gosec-${{ runner.os }}-${{ env.GOTOOLCHAIN }}-${{ env.GOSEC_VERSION }} - - name: Install gosec - if: steps.gosec-cache.outputs.cache-hit != 'true' + - name: Install gosec (no cache) + if: env.DISABLE_TOOL_CACHE == 'true' + run: | + mkdir -p "${TOOLS_BIN}" + GOBIN="${TOOLS_BIN}" go install github.com/securego/gosec/v2/cmd/gosec@${GOSEC_VERSION} + + - name: Install gosec (cache miss) + if: env.DISABLE_TOOL_CACHE != 'true' && steps.gosec-cache.outputs.cache-hit != 'true' run: | mkdir -p "${TOOLS_BIN}" GOBIN="${TOOLS_BIN}" go install github.com/securego/gosec/v2/cmd/gosec@${GOSEC_VERSION} @@ -191,14 +210,21 @@ jobs: go-version: "1.26.1" - name: Restore tool cache + if: env.DISABLE_TOOL_CACHE != 'true' id: govulncheck-cache uses: actions/cache@v4 with: path: ${{ env.TOOLS_BIN }} key: govulncheck-${{ runner.os }}-${{ env.GOTOOLCHAIN }}-${{ env.GOVULNCHECK_VERSION }} - - name: Install govulncheck - if: steps.govulncheck-cache.outputs.cache-hit != 'true' + - name: Install govulncheck (no cache) + if: env.DISABLE_TOOL_CACHE == 'true' + run: | + mkdir -p "${TOOLS_BIN}" + GOBIN="${TOOLS_BIN}" go install golang.org/x/vuln/cmd/govulncheck@${GOVULNCHECK_VERSION} + + - name: Install govulncheck (cache miss) + if: env.DISABLE_TOOL_CACHE != 'true' && steps.govulncheck-cache.outputs.cache-hit != 'true' run: | mkdir -p "${TOOLS_BIN}" GOBIN="${TOOLS_BIN}" go install golang.org/x/vuln/cmd/govulncheck@${GOVULNCHECK_VERSION} @@ -387,3 +413,65 @@ jobs: if: always() run: | docker rm -f tap-nats tap-clickhouse || true + + flake-tracker: + if: always() + needs: + - integration + - perf-smoke + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Collect CI flake metrics + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + RUN_ID: ${{ github.run_id }} + BRANCH: main + TREND_RUN_LIMIT: "30" + run: .github/scripts/collect_flake_metrics.sh + + - name: Upload flake metrics artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: ci-flake-metrics-${{ github.run_id }} + path: flake-metrics/ + if-no-files-found: error + retention-days: 30 + + create-failure-issue: + if: ${{ always() && github.event_name == 'push' && github.ref == 'refs/heads/main' && contains(needs.*.result, 'failure') }} + needs: + - test + - staticcheck + - openapi-contract + - config-lint + - docker-build + - helm-lint + - security-gosec + - security-govulncheck + - security-trivy + - security-sbom + - perf-smoke + - integration + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + issues: write + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Create issue with failed logs + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + RUN_ID: ${{ github.run_id }} + run: .github/scripts/create_ci_failure_issue.sh diff --git a/.github/workflows/release-candidate-smoke.yml b/.github/workflows/release-candidate-smoke.yml new file mode 100644 index 0000000..b35e0ad --- /dev/null +++ b/.github/workflows/release-candidate-smoke.yml @@ -0,0 +1,140 @@ +name: Release Candidate Smoke + +on: + workflow_dispatch: + inputs: + ref: + description: Git ref (branch, tag, or SHA) to validate + required: false + default: main + release_name: + description: Helm release name for smoke install + required: false + default: ensemble-tap + namespace: + description: Kubernetes namespace for smoke install + required: false + default: ensemble-rc + +permissions: + contents: read + +env: + GOTOOLCHAIN: go1.26.1 + TAP_IMAGE_REPO: ensemble-tap + NATS_IMAGE: nats:2.12.4-alpine + +jobs: + rc-smoke: + runs-on: ubuntu-latest + timeout-minutes: 35 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: "1.26.1" + + - name: Setup Helm + uses: azure/setup-helm@v4 + + - name: Setup kind + uses: helm/kind-action@v1 + with: + cluster_name: rc-smoke + + - name: Build release-candidate image + env: + IMAGE_TAG: rc-${{ github.run_id }}-${{ github.run_attempt }} + run: | + docker build -t "${TAP_IMAGE_REPO}:${IMAGE_TAG}" . + echo "IMAGE_TAG=${IMAGE_TAG}" >> "$GITHUB_ENV" + + - name: Load image into kind + run: kind load docker-image "${TAP_IMAGE_REPO}:${IMAGE_TAG}" --name rc-smoke + + - name: Deploy NATS JetStream dependency + run: | + kubectl create namespace "${{ inputs.namespace }}" --dry-run=client -o yaml | kubectl apply -f - + cat < Date: Sun, 8 Mar 2026 19:50:19 -0700 Subject: [PATCH 2/4] ci: isolate nightly concurrency from push runs --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5532493..5aebded 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,7 @@ on: - cron: "17 9 * * *" concurrency: - group: ci-${{ github.ref }} + group: ci-${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} cancel-in-progress: true env: From 0d4e1aa1319d3544a1c8caff1452c82bbe17fa04 Mon Sep 17 00:00:00 2001 From: Jonathan Haas Date: Sun, 8 Mar 2026 19:54:51 -0700 Subject: [PATCH 3/4] ci: require branch protection checks via GitHub Actions check-runs --- .github/scripts/apply_branch_protection.sh | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/scripts/apply_branch_protection.sh b/.github/scripts/apply_branch_protection.sh index f85e3fd..2aa33ca 100755 --- a/.github/scripts/apply_branch_protection.sh +++ b/.github/scripts/apply_branch_protection.sh @@ -21,19 +21,19 @@ cat >"$payload_file" < Date: Sun, 8 Mar 2026 19:58:44 -0700 Subject: [PATCH 4/4] ci: group main failure reports into one triage issue --- .github/scripts/create_ci_failure_issue.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/scripts/create_ci_failure_issue.sh b/.github/scripts/create_ci_failure_issue.sh index 502b116..52e2810 100755 --- a/.github/scripts/create_ci_failure_issue.sh +++ b/.github/scripts/create_ci_failure_issue.sh @@ -41,12 +41,8 @@ if [[ ! -s "${failed_file}" ]]; then exit 0 fi -title="CI failure on main: run ${RUN_ID}" +title="CI failure on main" existing_issue="$(gh issue list -R "${REPO}" --state open --search "\"${title}\" in:title" --json number --jq '.[0].number // empty')" -if [[ -n "${existing_issue}" ]]; then - echo "Issue #${existing_issue} already exists for run ${RUN_ID}; skipping" - exit 0 -fi # Ensure label exists for triage routing. gh label create ci-failure -R "${REPO}" --description "Automated CI failure reports on main" --color B60205 2>/dev/null || true @@ -93,5 +89,11 @@ while IFS=$'\t' read -r job_id job_name _job_url; do } >>"${body_file}" done <"${failed_file}" +if [[ -n "${existing_issue}" ]]; then + gh issue comment "${existing_issue}" -R "${REPO}" --body-file "${body_file}" >/dev/null + echo "Added CI failure details to existing issue #${existing_issue}" + exit 0 +fi + issue_url="$(gh issue create -R "${REPO}" --title "${title}" --body-file "${body_file}" --label ci-failure)" echo "Created CI failure issue: ${issue_url}"