From da72e05c92b6cbc1e26ccfa78bb988253c03c528 Mon Sep 17 00:00:00 2001
From: JacobPEvans <20714140+JacobPEvans@users.noreply.github.com>
Date: Wed, 29 Apr 2026 03:11:17 -0400
Subject: [PATCH 1/2] feat(ci): add _ci-gate.yml shared workflow with queue watchdog
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Centralizes the conditional-required-check / Merge Gatekeeper pattern that
previously lived as a duplicated ~110-line `ci-gate.yml` in every consumer
repo.

New reusable workflow with four job groups:

- changes: dorny/paths-filter on caller-supplied filter YAML
- conditional checks (nix_validate, markdown_lint, file_size,
  python_security): each gated on (input toggle && filter output)
- watchdog: sleeps queue_timeout_minutes (default 10) then cancels any
  sibling job still in `queued` state via the GitHub API
- gate (name "Merge Gate"): re-actors/alls-green aggregator

The watchdog closes a class of bugs first observed on nix-home#205: when a
`needs:` dependency is stuck queued (e.g. a self-hosted runner never claims
it), the dependent gate job — even with `if: !cancelled()` — never
schedules, leaving the required `Merge Gate` status absent forever and the
PR unmergeable. By forcing a terminal state, the watchdog guarantees the
gate always evaluates within queue_timeout_minutes + watchdog runtime.

Also adds `timeout-minutes: 30` to _nix-validate.yml's validate job so a
wedged running job (different failure mode, same symptom) also terminates
cleanly.

Watchdog logic lives in scripts/ci-gate-watchdog.sh, sparse-checked-out at
runtime so the YAML stays under the inline-script-guard threshold and the
logic is shellcheck-testable in isolation.

Branch protection unchanged: the gate job's `name: Merge Gate` matches
existing required_status_checks rulesets across all consumer repos.

Follow-up PRs will migrate consumer repos to thin callers of this workflow,
starting with ai-assistant-instructions as the canary.

(claude)
---
 .github/workflows/_ci-gate.yml      | 176 ++++++++++++++++++++++++++++
 .github/workflows/_nix-validate.yml |   4 +
 scripts/ci-gate-watchdog.sh         |  41 +++++++
 3 files changed, 221 insertions(+)
 create mode 100644 .github/workflows/_ci-gate.yml
 create mode 100755 scripts/ci-gate-watchdog.sh

diff --git a/.github/workflows/_ci-gate.yml b/.github/workflows/_ci-gate.yml
new file mode 100644
index 0000000..5428edc
--- /dev/null
+++ b/.github/workflows/_ci-gate.yml
@@ -0,0 +1,176 @@
+# Reusable: CI Gate (Merge Gatekeeper)
+#
+# Centralizes the conditional-required-check pattern that previously lived as
+# duplicated `ci-gate.yml` files in every consumer repo.
+#
+# What this workflow does:
+#
+#   changes   -> dorny/paths-filter on caller-supplied filters
+#   <checks>  -> per-check reusable workflows, gated on (input toggle && filter)
+#   watchdog  -> sleeps queue_timeout_minutes, cancels any sibling job still
+#                in `queued` state so the gate is never blocked indefinitely
+#   gate      -> name MUST stay "Merge Gate" (matches branch protection rulesets);
+#                aggregates results via re-actors/alls-green
+#
+# Why the watchdog exists:
+#
+#   GitHub Actions only schedules a `needs:` dependent once every upstream job
+#   reaches a *terminal* state (success/failure/cancelled/skipped). A job stuck
+#   in `queued` (e.g. self-hosted runner never picks it up) is NOT terminal, so
+#   `gate` never runs and the required `Merge Gate` status never reports. The
+#   watchdog forces a terminal state by cancelling stuck queued jobs.
+#
+# Filter name convention (caller defines these in `filters:` input):
+#
+#   nix       -> gates `nix_validate` and `file_size`
+#   markdown  -> gates `markdown_lint` and `file_size`
+#   python    -> gates `python_security`
+#
+# Callers may include additional filters; this workflow ignores them. To add
+# a new conditional check, add: input toggle, conditional job, and entry in
+# the gate's `needs:` and `allowed-skips`.
+
+name: _ci-gate
+
+on:
+  workflow_call:
+    inputs:
+      filters:
+        description: >-
+          YAML body for dorny/paths-filter. Use the convention names listed
+          in this file's header to enable the matching conditional check.
+        type: string
+        required: true
+
+      nix_validate:
+        description: Enable `Nix Validate` (gated on `nix` filter)
+        type: boolean
+        default: false
+      markdown_lint:
+        description: Enable `Markdown Lint` (gated on `markdown` filter)
+        type: boolean
+        default: false
+      file_size:
+        description: Enable `File Size` (gated on `nix` OR `markdown` filter)
+        type: boolean
+        default: false
+      python_security:
+        description: Enable `Python Security` (gated on `python` filter)
+        type: boolean
+        default: false
+      python_security_dirs:
+        description: Space-separated dirs passed to _python-security.yml
+        type: string
+        default: ''
+
+      queue_timeout_minutes:
+        description: >-
+          Minutes the watchdog waits before cancelling sibling jobs still in
+          `queued` state (running jobs are left alone). Default 10. Keep it
+          under ~25: the watchdog job itself is hard-capped at 30 minutes.
+        type: number
+        default: 10
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  # ============================================================================
+  # CHANGE DETECTION
+  # ============================================================================
+  changes:
+    name: Detect Changes
+    runs-on: ubuntu-latest
+    outputs:
+      nix: ${{ steps.filter.outputs.nix }}
+      markdown: ${{ steps.filter.outputs.markdown }}
+      python: ${{ steps.filter.outputs.python }}
+    steps:
+      - uses: actions/checkout@v6
+      - uses: dorny/paths-filter@v4
+        id: filter
+        with:
+          filters: ${{ inputs.filters }}
+
+  # ============================================================================
+  # CONDITIONAL CHECKS
+  # Each is gated on `inputs.<check>` AND `needs.changes.outputs.<filter>`.
+  # Disabled or untouched paths skip; `re-actors/alls-green` accepts skips.
+  # ============================================================================
+  nix-validate:
+    name: Nix Validate
+    needs: changes
+    if: ${{ inputs.nix_validate && needs.changes.outputs.nix == 'true' }}
+    uses: JacobPEvans/.github/.github/workflows/_nix-validate.yml@main
+
+  markdown-lint:
+    name: Markdown Lint
+    needs: changes
+    if: ${{ inputs.markdown_lint && needs.changes.outputs.markdown == 'true' }}
+    uses: JacobPEvans/.github/.github/workflows/_markdown-lint.yml@main
+
+  file-size:
+    name: File Size
+    needs: changes
+    if: ${{ inputs.file_size && (needs.changes.outputs.nix == 'true' || needs.changes.outputs.markdown == 'true') }}
+    uses: JacobPEvans/.github/.github/workflows/_file-size.yml@main
+
+  python-security:
+    name: Python Security
+    needs: changes
+    if: ${{ inputs.python_security && needs.changes.outputs.python == 'true' }}
+    uses: JacobPEvans/.github/.github/workflows/_python-security.yml@main
+    with:
+      python-dirs: ${{ inputs.python_security_dirs }}
+    secrets: inherit
+
+  # ============================================================================
+  # WATCHDOG
+  # Forces a terminal state for queued sibling jobs after queue_timeout_minutes.
+  # Best-effort: failed cancel calls are ignored; its job is to unstick the gate.
+  # ============================================================================
+  watchdog:
+    name: Queue Watchdog
+    needs: changes
+    runs-on: ubuntu-latest
+    permissions:
+      actions: write # required to cancel sibling jobs
+    # Hardcoded ceiling — GHA expressions don't support arithmetic so this
+    # can't be derived from `queue_timeout_minutes`. 30 min handles up to a
+    # ~25-minute queue_timeout with comfortable margin.
+    timeout-minutes: 30
+    steps:
+      - name: Sparse-checkout watchdog script from this repo
+        uses: actions/checkout@v6
+        with:
+          repository: JacobPEvans/.github
+          ref: main
+          path: .gh-shared
+          sparse-checkout: scripts/ci-gate-watchdog.sh
+          sparse-checkout-cone-mode: false
+      - name: Sleep, then cancel any sibling job still queued
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          QUEUE_TIMEOUT_MINUTES: ${{ inputs.queue_timeout_minutes }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+        run: bash .gh-shared/scripts/ci-gate-watchdog.sh
+
+  # ============================================================================
+  # MERGE GATE — the only check that branch protection requires.
+  # `name:` MUST stay "Merge Gate" (matches existing required_status_checks).
+  # ============================================================================
+  gate:
+    name: Merge Gate
+    needs: [changes, watchdog, nix-validate, markdown-lint, file-size, python-security]
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Aggregate results
+        uses: re-actors/alls-green@release/v1
+        with:
+          # `watchdog` is best-effort: its own failure must not block the gate.
+          allowed-failures: watchdog
+          allowed-skips: nix-validate, markdown-lint, file-size, python-security
+          jobs: ${{ toJSON(needs) }}
diff --git a/.github/workflows/_nix-validate.yml b/.github/workflows/_nix-validate.yml
index c370270..a0b6bda 100644
--- a/.github/workflows/_nix-validate.yml
+++ b/.github/workflows/_nix-validate.yml
@@ -25,6 +25,10 @@ jobs:
   validate:
     name: Validate
     runs-on: ${{ inputs.runner_label }}
+    # Hard cap so a wedged `nix flake check` (network hang, runner stall) can't
+    # block `Merge Gate` indefinitely. Queue-stuck handling lives in the
+    # _ci-gate watchdog; this is the running-job side of defense in depth.
+    timeout-minutes: 30
     steps:
       - name: Checkout
         uses: actions/checkout@v6
diff --git a/scripts/ci-gate-watchdog.sh b/scripts/ci-gate-watchdog.sh
new file mode 100755
index 0000000..4266a7c
--- /dev/null
+++ b/scripts/ci-gate-watchdog.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# ci-gate-watchdog.sh — invoked by the `watchdog` job in _ci-gate.yml.
+#
+# After QUEUE_TIMEOUT_MINUTES, cancels every sibling job in the current
+# workflow run that is still in `queued` state, except for "Queue Watchdog"
+# (this script's own job) and "Merge Gate" (the gate job, which is still
+# pending here because `needs: watchdog`). Cancellation forces a terminal
+# state so `gate` can finally schedule and `re-actors/alls-green` can
+# evaluate, ensuring the required `Merge Gate` status always reports.
+#
+# Required env:
+#   GH_TOKEN              — GitHub token with actions:write on the run
+#   QUEUE_TIMEOUT_MINUTES — minutes to wait before evaluating queued jobs
+#   REPO                  — owner/repo of the current workflow run
+#   RUN_ID                — workflow run id
+
+set -euo pipefail
+
+: "${GH_TOKEN:?required}"  # `:?` aborts the script if the variable is unset or empty
+: "${QUEUE_TIMEOUT_MINUTES:?required}"
+: "${REPO:?required}"
+: "${RUN_ID:?required}"
+
+sleep "$((QUEUE_TIMEOUT_MINUTES * 60))"
+
+# Names whose `status == "queued"` is intentional at this point in the run.
+# The watchdog itself is mid-execution; the gate is awaiting watchdog.
+exempt_filter='.name != "Queue Watchdog" and .name != "Merge Gate"'
+
+stuck=$(gh api "repos/${REPO}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+  --jq ".jobs[] | select(.status == \"queued\" and ${exempt_filter}) | \"\(.id)\t\(.name)\"")
+
+if [ -z "$stuck" ]; then
+  echo "No stuck queued jobs found."
+  exit 0
+fi
+
+while IFS=$'\t' read -r job_id job_name; do
+  echo "Cancelling stuck queued job: ${job_name} (id=${job_id})"
+  gh api -X POST "repos/${REPO}/actions/jobs/${job_id}/cancel" || true  # best-effort: one failed cancel must not stop the loop
+done <<< "$stuck"

From 68cc3ac71924ddf9c665a392b2df6bad3db20f8d Mon Sep 17 00:00:00 2001
From: JacobPEvans <20714140+JacobPEvans@users.noreply.github.com>
Date: Wed, 29 Apr 2026 03:54:35 -0400
Subject: [PATCH 2/2] fix(ci): address Copilot review on watchdog + gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- gate: spell the condition as `always() && !cancelled()` so it is explicit
  that the gate runs even when a dependency fails (`!cancelled()` alone
  already replaces the implicit `success()` check, so behavior is unchanged)
- watchdog: add `contents: read` permission so sparse-checkout succeeds
  when job-level perms override workflow-level perms
- watchdog script: use awk for float-safe minute→second conversion
- watchdog script: `gh api --paginate` to catch queued jobs beyond the
  first 100 in large runs

(claude)
---
 .github/workflows/_ci-gate.yml | 5 +++--
 scripts/ci-gate-watchdog.sh    | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/_ci-gate.yml b/.github/workflows/_ci-gate.yml
index 5428edc..cd3123c 100644
--- a/.github/workflows/_ci-gate.yml
+++ b/.github/workflows/_ci-gate.yml
@@ -135,7 +135,8 @@ jobs:
     needs: changes
     runs-on: ubuntu-latest
     permissions:
-      actions: write # required to cancel sibling jobs
+      actions: write  # required to cancel sibling jobs
+      contents: read  # required for actions/checkout sparse-checkout
     # Hardcoded ceiling — GHA expressions don't support arithmetic so this
     # can't be derived from `queue_timeout_minutes`. 30 min handles up to a
     # ~25-minute queue_timeout with comfortable margin.
@@ -164,7 +165,7 @@ jobs:
   gate:
     name: Merge Gate
     needs: [changes, watchdog, nix-validate, markdown-lint, file-size, python-security]
-    if: ${{ !cancelled() }}
+    if: ${{ always() && !cancelled() }}
     runs-on: ubuntu-latest
     steps:
       - name: Aggregate results
diff --git a/scripts/ci-gate-watchdog.sh b/scripts/ci-gate-watchdog.sh
index 4266a7c..7f4a448 100755
--- a/scripts/ci-gate-watchdog.sh
+++ b/scripts/ci-gate-watchdog.sh
@@ -21,13 +21,13 @@ set -euo pipefail
 : "${REPO:?required}"
 : "${RUN_ID:?required}"
 
-sleep "$((QUEUE_TIMEOUT_MINUTES * 60))"
+sleep "$(awk -v minutes="$QUEUE_TIMEOUT_MINUTES" 'BEGIN { printf "%d", minutes * 60 }')"  # -v passes the value as data, not awk code
 
 # Names whose `status == "queued"` is intentional at this point in the run.
 # The watchdog itself is mid-execution; the gate is awaiting watchdog.
 exempt_filter='.name != "Queue Watchdog" and .name != "Merge Gate"'
 
-stuck=$(gh api "repos/${REPO}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+stuck=$(gh api --paginate "repos/${REPO}/actions/runs/${RUN_ID}/jobs?per_page=100" \
   --jq ".jobs[] | select(.status == \"queued\" and ${exempt_filter}) | \"\(.id)\t\(.name)\"")
 
 if [ -z "$stuck" ]; then