diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 60a13bf..5584728 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -295,6 +295,35 @@ jobs:
           echo "All SSH attempts failed"
           exit 1
 
+      - name: Pause queue worker for deploy
+        if: env.ENVIRONMENT == 'production'
+        # When staging gets its own WP backend, also gate this step on
+        # env.ENVIRONMENT == 'staging' and route to the staging WP REST URL.
+        # Today staging shares the production backend, so only production
+        # deploys touch FPM hard enough to need the pause.
+        env:
+          WP_REST_URL: ${{ vars.WP_REST_URL }}
+          WP_APP_USERNAME: ${{ secrets.WP_APP_USERNAME }}
+          WP_APP_PASSWORD: ${{ secrets.WP_APP_PASSWORD }}
+        run: |
+          AUTH=$(echo -n "$WP_APP_USERNAME:$WP_APP_PASSWORD" | base64 | tr -d '\n')  # GNU base64 wraps at 76 cols; a newline would corrupt the header
+          for attempt in 1 2 3; do
+            STATUS=$(curl -sS -o /tmp/maint-begin.json -w '%{http_code}' \
+              --connect-timeout 10 --max-time 30 \
+              -X POST "$WP_REST_URL/cdcf/v1/maintenance" \
+              -H "Authorization: Basic $AUTH" \
+              -H "Content-Type: application/json" \
+              -d '{"action":"begin","duration_seconds":300}') || true  # under bash -e a curl transport error would kill the step before the retry; STATUS is then "000"
+            if [ "$STATUS" = "200" ]; then
+              echo "Worker paused: $(cat /tmp/maint-begin.json)"
+              exit 0
+            fi
+            echo "Maintenance begin attempt $attempt failed (HTTP $STATUS)"
+            [ "$attempt" -lt 3 ] && sleep 5
+          done
+          echo "::error::Failed to pause queue worker; aborting deploy."
+          exit 1
+
       - name: Extract WP theme and plugin bundles
         if: env.ENVIRONMENT == 'production'
         env:
@@ -381,3 +410,29 @@ jobs:
             echo "One or more plugin activations failed; failing the deploy."
exit 1 fi + + - name: Resume queue worker after deploy + if: always() && env.ENVIRONMENT == 'production' + env: + WP_REST_URL: ${{ vars.WP_REST_URL }} + WP_APP_USERNAME: ${{ secrets.WP_APP_USERNAME }} + WP_APP_PASSWORD: ${{ secrets.WP_APP_PASSWORD }} + run: | + AUTH=$(echo -n "$WP_APP_USERNAME:$WP_APP_PASSWORD" | base64) + for attempt in 1 2 3; do + STATUS=$(curl -sS -o /dev/null -w '%{http_code}' \ + --connect-timeout 10 --max-time 30 \ + -X POST "$WP_REST_URL/cdcf/v1/maintenance" \ + -H "Authorization: Basic $AUTH" \ + -H "Content-Type: application/json" \ + -d '{"action":"end"}') + if [ "$STATUS" = "200" ]; then + echo "Worker resumed." + exit 0 + fi + echo "Maintenance end attempt $attempt failed (HTTP $STATUS)" + [ "$attempt" -lt 3 ] && sleep 5 + done + # Don't exit 1 here — the TTL self-heals within 600s and failing + # this step would mask the real outcome of the deploy. + echo "::warning::Failed to resume queue worker; will self-heal via TTL within 600s." diff --git a/AGENTS.md b/AGENTS.md index ed3076e..cdbd753 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -74,7 +74,7 @@ Uses `@import 'tailwindcss'` (not `@tailwind` directives). Custom utilities via ## REST API Endpoints (`cdcf/v1`) -All endpoints require Application Password authentication (`edit_posts` capability). +All endpoints require Application Password authentication. Most endpoints require `edit_posts` capability; `/process-queue` and `/maintenance` require `manage_options` (administrator) — see the row notes where capability differs. 
| Method | Route | Description | |--------|-------|-------------| @@ -85,6 +85,7 @@ All endpoints require Application Password authentication (`edit_posts` capabili | `POST` | `/team-member` | Create a team member with auto-translation and About page linking (see below) | | `POST` | `/community-channel` | Create a community channel with auto-translation and Community page linking (see below) | | `POST` | `/local-group` | Create a local group with auto-translation and Community page linking (see below) | +| `POST` | `/maintenance` | Pause or resume the cdcf-queue-worker by setting/clearing a Redis flag. Body: `action` is `"begin"` or `"end"`; optional `duration_seconds` is clamped server-side to 60–600. Requires administrator (`manage_options`) capability. | | `POST` | `/academic-collaboration` | Create an academic collaboration with auto-translation and Community page linking (see below) | ### `POST /team-member` @@ -169,6 +170,10 @@ scripts/.venv/bin/python scripts/cdcf_api.py rest-get wp/v2/posts --params '{"pe # Cache revalidation scripts/.venv/bin/python scripts/cdcf_api.py revalidate --path /about + +# Pause/resume the queue worker (used by the deploy workflow) +scripts/.venv/bin/python scripts/cdcf_api.py maintenance --action begin --duration 300 +scripts/.venv/bin/python scripts/cdcf_api.py maintenance --action end ``` See `docs/python-api-client.md` for full documentation of all commands and library usage. 
diff --git a/docs/redis-queue-worker.md b/docs/redis-queue-worker.md index f4cef5f..069d326 100644 --- a/docs/redis-queue-worker.md +++ b/docs/redis-queue-worker.md @@ -33,7 +33,7 @@ Queue Worker (systemd service) - Redis server running on the production host - The `redis-queue` and `cdcf-redis-translations` WordPress plugins activated - A WordPress user with `manage_options` capability and an Application Password -- `curl` and `python3` available on the server +- `curl`, `python3`, and `redis-cli` available on the server (`redis-cli` is typically provided by the `redis-tools` package on Debian/Ubuntu or `redis` on RHEL-derived distros; the worker uses it to poll the maintenance flag — see "Maintenance mode" below) ## 1. Install the worker script @@ -147,6 +147,63 @@ sudo chmod +x /usr/local/bin/cdcf_queue_worker.sh sudo systemctl restart cdcf-queue-worker ``` +## Maintenance mode + +The worker can be paused via a Redis flag. While the flag is set, the worker skips both `process_one` and `run_daily_tasks` and just sleeps `POLL_INTERVAL` seconds per cycle. This is used by the production deploy workflow to prevent the worker's parallel POSTs from competing with deploy-time WP traffic for FPM workers. 
+
+### Setting and clearing the flag
+
+Via the WP REST API (the way the deploy workflow does it):
+
+```bash
+# Pause for 300 seconds
+curl -u "$WP_APP_USERNAME:$WP_APP_PASSWORD" \
+  -X POST "$WP_REST_URL/cdcf/v1/maintenance" \
+  -H "Content-Type: application/json" \
+  -d '{"action":"begin","duration_seconds":300}'
+
+# Resume
+curl -u "$WP_APP_USERNAME:$WP_APP_PASSWORD" \
+  -X POST "$WP_REST_URL/cdcf/v1/maintenance" \
+  -H "Content-Type: application/json" \
+  -d '{"action":"end"}'
+```
+
+Or via the Python CLI:
+
+```bash
+scripts/.venv/bin/python scripts/cdcf_api.py maintenance --action begin --duration 300
+scripts/.venv/bin/python scripts/cdcf_api.py maintenance --action end
+```
+
+Or directly via `redis-cli` on the VPS (operator-only):
+
+```bash
+redis-cli SETEX cdcf:maintenance:until 300 1
+redis-cli DEL cdcf:maintenance:until
+```
+
+When the flag is set through the REST endpoint (or the Python CLI, which calls that endpoint), the server clamps the TTL to `[60, 600]` seconds, so if `end` is never called the flag self-expires within at most 600 seconds. A direct `redis-cli SETEX` bypasses the clamp and uses exactly the TTL you pass, so keep it within the same range.
+
+### Expected log output
+
+The worker logs exactly one line per transition, never per cycle:
+
+```
+2026-05-02T12:00:00+00:00 Entering maintenance mode (worker paused)
+2026-05-02T12:02:30+00:00 Exiting maintenance mode (worker resumed)
+2026-05-02T12:02:45+00:00 Processed 3 job(s)
+```
+
+### Verifying during a deploy
+
+```bash
+journalctl -u cdcf-queue-worker -f
+# Expect one "Entering" then one "Exiting" line bracketing the deploy.
+# Compare 504 counts in the WP access log before/after the deploy day
+# to baseline (~10/day; bad days hit 200+).
+```
+
 ## Troubleshooting
 
 ### Worker logs show "WARNING: unexpected response"
diff --git a/scripts/cdcf_api.py b/scripts/cdcf_api.py
index 156e01e..5f4a05f 100644
--- a/scripts/cdcf_api.py
+++ b/scripts/cdcf_api.py
@@ -355,6 +355,26 @@ def revalidate(self, path: str | None = None, tags: list[str] | None = None) ->
         resp.raise_for_status()
         return resp.json()
 
+    # -- Maintenance Flag --
+
+    def maintenance(self, action: str, duration_seconds: int = 300) -> dict:
+        """POST /cdcf/v1/maintenance — set or clear the deploy-time maintenance flag.
+
+        action: 'begin' (sets cdcf:maintenance:until in Redis with a clamped
+                TTL) or 'end' (deletes the key). Any other value raises
+                ValueError before a request is made; the CLI additionally
+                restricts --action to these two choices via argparse.
+        duration_seconds: coerced with int() and sent only for 'begin'; the
+                server clamps it to [60, 600]. Not sent for 'end'.
+        Returns the value of self._wp_post() for the request (annotated dict).
+        """
+        if action not in ("begin", "end"):
+            raise ValueError(f"action must be 'begin' or 'end', got {action!r}")
+        payload: dict = {"action": action}
+        if action == "begin":
+            payload["duration_seconds"] = int(duration_seconds)
+        return self._wp_post("cdcf/v1/maintenance", payload)
+
 
 # ---------------------------------------------------------------------------
 # CLI
@@ -472,6 +492,14 @@ def _build_parser() -> argparse.ArgumentParser:
     p.add_argument("--path", help="Path to revalidate")
     p.add_argument("--tags", nargs="*", help="Cache tags to revalidate")
 
+    # maintenance
+    p = sub.add_parser("maintenance",
+                       help="Pause/resume the cdcf-queue-worker via Redis flag")
+    p.add_argument("--action", required=True, choices=["begin", "end"])
+    p.add_argument("--duration", type=int, default=300,
+                   help="Seconds to pause for (clamped server-side to 60-600). "
+                        "Only used with --action begin. Default: 300")
+
     # -- Post Meta / ACF Fields --
 
     # get-post
@@ -628,6 +656,9 @@ def _run_cli(args: argparse.Namespace, client: CdcfClient) -> dict:
     if cmd == "revalidate":
         return client.revalidate(path=args.path, tags=args.tags)
 
+    if cmd == "maintenance":
+        return client.maintenance(args.action, args.duration)
+
     if cmd == "get-post":
         return client.get_post(args.post_id, args.post_type)
 
diff --git a/scripts/cdcf_queue_worker.sh b/scripts/cdcf_queue_worker.sh
index bd0bfef..1076ecc 100755
--- a/scripts/cdcf_queue_worker.sh
+++ b/scripts/cdcf_queue_worker.sh
@@ -75,6 +75,14 @@ if [ -z "$WP_REST_URL" ] || [ -z "$WP_APP_USERNAME" ] || [ -z "$WP_APP_PASSWORD"
     exit 1
 fi
 
+# Required for in_maintenance() — without this the worker silently
+# ignores the deploy-time pause flag and keeps hitting FPM during
+# deploys (the very condition this whole feature is meant to prevent).
+if ! command -v redis-cli >/dev/null 2>&1; then
+    echo "ERROR: redis-cli not found in PATH. Install it (Debian/Ubuntu: 'sudo apt install redis-tools'; RHEL: 'sudo dnf install redis')."
+    exit 1
+fi
+
 ENDPOINT="${WP_REST_URL}/cdcf/v1/process-queue"
 AUTH=$(echo -n "${WP_APP_USERNAME}:${WP_APP_PASSWORD}" | base64)
 
@@ -133,6 +141,16 @@ except:
     fi
 }
 
+# ─── Maintenance flag check ──────────────────────────────────────────
+# Returns 0 (true) only when redis-cli exits cleanly and EXISTS prints "1";
+# returns 1 (false) otherwise. A failing redis-cli counts as "not in
+# maintenance" so a Redis outage does not stall the worker indefinitely.
+in_maintenance() {
+    local result
+    result=$(redis-cli -h 127.0.0.1 -p 6379 -n 0 EXISTS cdcf:maintenance:until 2>/dev/null) || return 1
+    [ "$result" = "1" ]
+}
+
 # ─── Queue processing ────────────────────────────────────────────────
 # process_one fires a single REST call and logs the result.
 
@@ -187,7 +205,22 @@ except:
     fi
 }
 
+IN_MAINTENANCE=0
 while true; do
+    if in_maintenance; then
+        if [ "$IN_MAINTENANCE" = "0" ]; then
+            echo "$(date -Iseconds) Entering maintenance mode (worker paused)"
+            IN_MAINTENANCE=1
+        fi
+        sleep "${POLL_INTERVAL}"
+        continue
+    fi
+
+    if [ "$IN_MAINTENANCE" = "1" ]; then
+        echo "$(date -Iseconds) Exiting maintenance mode (worker resumed)"
+        IN_MAINTENANCE=0
+    fi
+
     run_daily_tasks
 
     if [ "$CONCURRENCY" -le 1 ]; then
diff --git a/wordpress/plugins/cdcf-redis-translations/cdcf-redis-translations.php b/wordpress/plugins/cdcf-redis-translations/cdcf-redis-translations.php
index 6036da3..6ca2c05 100644
--- a/wordpress/plugins/cdcf-redis-translations/cdcf-redis-translations.php
+++ b/wordpress/plugins/cdcf-redis-translations/cdcf-redis-translations.php
@@ -40,4 +40,89 @@
             'batch_size' => ['required' => false, 'type' => 'integer', 'default' => 10, 'sanitize_callback' => 'absint'],
         ],
     ]);
+
+    // POST /cdcf/v1/maintenance — pause or resume the queue worker.
+    // Body: action is 'begin' or 'end'; optional duration_seconds is
+    // clamped to [60, 600]. 'begin' SETEXes cdcf:maintenance:until and
+    // 'end' DELs it. Responds 400 for an invalid action and 500 when
+    // Redis is unavailable or the write fails.
+    register_rest_route('cdcf/v1', '/maintenance', [
+        'methods' => 'POST',
+        'permission_callback' => function () {
+            return current_user_can('manage_options');
+        },
+        'callback' => function (WP_REST_Request $request) {
+            $action = $request['action'] ?? '';
+            if ($action !== 'begin' && $action !== 'end') {
+                return new WP_Error(
+                    'invalid_action',
+                    "action must be 'begin' or 'end'",
+                    ['status' => 400]
+                );
+            }
+
+            if (class_exists('Redis') === false) {
+                return new WP_Error(
+                    'redis_unavailable',
+                    'PHP Redis extension not installed',
+                    ['status' => 500]
+                );
+            }
+
+            // phpredis can throw RedisException from any command, not only
+            // connect(), so every Redis call sits inside this try block and
+            // a dropped connection becomes a structured redis_unavailable error.
+            try {
+                $redis = new Redis();
+                if ($redis->connect('127.0.0.1', 6379, 1.0) === false) {
+                    return new WP_Error(
+                        'redis_unavailable',
+                        'Could not connect to Redis at 127.0.0.1:6379',
+                        ['status' => 500]
+                    );
+                }
+
+                if ($action === 'end') {
+                    if ($redis->del('cdcf:maintenance:until') === false) {
+                        return new WP_Error(
+                            'redis_write_failed',
+                            'Failed to clear maintenance flag in Redis',
+                            ['status' => 500]
+                        );
+                    }
+                    return new WP_REST_Response(['ok' => true], 200);
+                }
+
+                // action === 'begin'
+                $duration = (int) ($request['duration_seconds'] ?? 300);
+                $duration = max(60, min(600, $duration));
+                if ($redis->setex('cdcf:maintenance:until', $duration, '1') === false) {
+                    return new WP_Error(
+                        'redis_write_failed',
+                        'Failed to set maintenance flag in Redis',
+                        ['status' => 500]
+                    );
+                }
+                return new WP_REST_Response([
+                    'ok' => true,
+                    'until' => time() + $duration,
+                    'duration' => $duration,
+                ], 200);
+            } catch (\Throwable $e) {
+                return new WP_Error('redis_unavailable', $e->getMessage(), ['status' => 500]);
+            }
+        },
+        'args' => [
+            'action' => [
+                'required' => true,
+                'type' => 'string',
+            ],
+            'duration_seconds' => [
+                'required' => false,
+                'type' => 'integer',
+                'default' => 300,
+                'sanitize_callback' => 'absint',
+            ],
+        ],
+    ]);
 });