Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 20 additions & 51 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,65 +80,34 @@ jobs:
cache-from: type=gha
cache-to: type=gha,mode=max

# Upload the shared deploy script to the VPS so the following SSH step can
# execute it there instead of carrying the deploy logic inline.
- name: Copy deploy script to VPS
uses: appleboy/scp-action@v1
with:
host: ${{ secrets.VPS_HOST }}
username: ${{ secrets.VPS_USER }}
key: ${{ secrets.VPS_SSH_KEY }}
source: scripts/deploy-with-rollback.sh
target: ~/app
# Presumably drops the leading "scripts/" path component; the SSH step
# expects the file at ~/app/deploy-with-rollback.sh.
strip_components: 1

- name: Deploy to VPS via SSH
uses: appleboy/ssh-action@v1
env:
IMAGE: ghcr.io/${{ github.repository }}:${{ steps.version.outputs.version }}
PORT: ${{ secrets.APP_PORT || '3000' }}
with:
host: ${{ secrets.VPS_HOST }}
username: ${{ secrets.VPS_USER }}
key: ${{ secrets.VPS_SSH_KEY }}
envs: IMAGE,PORT
script: |
set -e
IMAGE="ghcr.io/${{ github.repository }}:${{ steps.version.outputs.version }}"
PORT="${{ secrets.APP_PORT || '3000' }}"
mkdir -p ~/app
cd ~/app

# Capture currently running image for potential rollback
PREV_IMAGE=""
if docker compose ps -q app >/dev/null 2>&1; then
CID=$(docker compose ps -q app || true)
if [ -n "$CID" ]; then
PREV_IMAGE=$(docker inspect --format '{{.Config.Image}}' "$CID" 2>/dev/null || true)
fi
fi
echo "Previous image: ${PREV_IMAGE:-<none>}"

write_compose() {
local img="$1"
cat > ~/app/docker-compose.yml << EOF
services:
app:
image: $img
env_file: $HOME/.env.app
ports:
- "${PORT}:${PORT}"
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "-qO", "/dev/null", "http://localhost:${PORT}/health"]
interval: 10s
timeout: 5s
retries: 3
start_period: 10s
EOF
}

write_compose "$IMAGE"
docker compose pull

if docker compose up -d --wait; then
echo "Deploy succeeded."
docker image prune -f
else
echo "::error::Deploy health check failed."
if [ -n "$PREV_IMAGE" ] && [ "$PREV_IMAGE" != "$IMAGE" ]; then
echo "Rolling back to $PREV_IMAGE"
write_compose "$PREV_IMAGE"
docker compose up -d --wait || echo "::error::Rollback also failed — manual intervention required."
else
echo "No previous image available to roll back to."
fi
exit 1
fi
chmod +x ~/app/deploy-with-rollback.sh
IMAGE="$IMAGE" \
PORT="$PORT" \
DEPLOY_DIR="$HOME/app" \
ENV_FILE="$HOME/.env.app" \
~/app/deploy-with-rollback.sh

- name: Clean up old GHCR images
uses: actions/delete-package-versions@v5
Expand Down
11 changes: 11 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,14 @@ jobs:
echo "::error::Docker image exceeds 500 MB (${SIZE_MB} MB) — consider multi-stage build"
exit 1
fi

rollback-integration-test:
# Regression test for scripts/deploy-with-rollback.sh. Exercises the
# same script the CD workflow calls over SSH, but with local images
# and local docker compose so no secrets/registry are required.
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6

# The test script builds its own fixture images from tests/fixtures and
# removes them (and its temporary work dir) in an EXIT trap.
- name: Run rollback integration test
run: bash tests/rollback-integration.sh
97 changes: 97 additions & 0 deletions scripts/deploy-with-rollback.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env bash
# Deploy a container image via docker compose with automatic rollback on
# health-check failure.
#
# This script is shared between:
# - .github/workflows/cd.yml (runs on the VPS over SSH during production deploy)
# - .github/workflows/ci.yml (runs locally on the CI runner as a regression test)
#
# Responsibilities:
# 1. Detect the image currently running for the "app" service (if any) and
# remember it as the rollback target.
# 2. Write a docker-compose.yml that points at the new image.
# 3. `docker compose up -d --wait` — Docker blocks until the container is
# healthy. If unhealthy, the command fails.
# 4. On failure, rewrite the compose file with the previous image and
# restart. The script exits non-zero whenever the new image fails to
# deploy, whether or not a rollback target existed or the rollback
# itself succeeded.
#
# Required env vars:
# IMAGE Container image to deploy (e.g. ghcr.io/org/app:1.2.3)
# PORT Port the app listens on (host and container)
# DEPLOY_DIR Directory that holds docker-compose.yml (created if missing)
#
# Optional env vars:
# ENV_FILE Path to an env_file for the compose service. Empty disables it.
# SKIP_PULL If "1", skip `docker compose pull` (useful when the test has
# already loaded a local image that is not in a registry).

# Strict mode: stop on the first failing command, on any unset variable,
# and on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Mandatory inputs — the shell aborts with the given message when one of
# them is unset or empty.
: "${IMAGE:?IMAGE is required}"
: "${PORT:?PORT is required}"
: "${DEPLOY_DIR:?DEPLOY_DIR is required}"

# Optional inputs — fall back to "no env_file" and "do pull".
: "${ENV_FILE:=}"
: "${SKIP_PULL:=0}"

# All later docker compose calls run from inside the deploy directory.
mkdir -p "$DEPLOY_DIR"
cd "$DEPLOY_DIR"

#######################################
# Write ./docker-compose.yml describing a single "app" service.
#
# The file is regenerated from scratch on every call, so calling it again
# with a different image is how both deploy and rollback switch versions.
# Keys are emitted with explicit two-space nesting so that image, ports,
# healthcheck, etc. are children of services.app rather than its siblings.
#
# Globals:
#   PORT      (read) port published on the host and probed by the healthcheck
#   ENV_FILE  (read) optional env_file path; the key is omitted when empty
# Arguments:
#   $1 - image reference to put in the service definition
# Outputs:
#   Overwrites docker-compose.yml in the current directory.
#######################################
write_compose() {
  local img="$1"
  {
    printf 'services:\n'
    printf '  app:\n'
    printf '    image: %s\n' "$img"
    if [[ -n "$ENV_FILE" ]]; then
      printf '    env_file: %s\n' "$ENV_FILE"
    fi
    printf '    environment:\n'
    printf '      PORT: "%s"\n' "$PORT"
    printf '    ports:\n'
    printf '      - "%s:%s"\n' "$PORT" "$PORT"
    printf '    restart: unless-stopped\n'
    printf '    healthcheck:\n'
    # CMD-SHELL so the "|| exit 1" is interpreted by a shell in the container.
    printf '      test: ["CMD-SHELL", "wget -qO /dev/null http://localhost:%s/health || exit 1"]\n' "$PORT"
    printf '      interval: 5s\n'
    printf '      timeout: 3s\n'
    printf '      retries: 3\n'
    printf '      start_period: 5s\n'
  } > docker-compose.yml
}

# Capture the image of the currently running "app" container (if any) so it
# can be used as the rollback target. Every lookup failure is tolerated and
# simply leaves PREV_IMAGE empty, which later means "nothing to roll back to".
PREV_IMAGE=""
if docker compose ps -q app >/dev/null 2>&1; then
  CID="$(docker compose ps -q app || true)"
  if [ -n "$CID" ]; then
    PREV_IMAGE="$(docker inspect --format '{{.Config.Image}}' "$CID" 2>/dev/null || true)"
  fi
fi
echo "Previous image: ${PREV_IMAGE:-<none>}"
echo "Target image: ${IMAGE}"

# Point the compose file at the new image.
write_compose "$IMAGE"

if [ "$SKIP_PULL" != "1" ]; then
  # Handle a pull failure explicitly instead of letting errexit abort here:
  # the running container has not been touched yet, so put the compose file
  # back to the image that is actually running before giving up.
  if ! docker compose pull; then
    echo "::error::Failed to pull $IMAGE."
    if [ -n "$PREV_IMAGE" ]; then
      write_compose "$PREV_IMAGE"
    fi
    exit 1
  fi
fi

# --wait blocks until the service is running/healthy and fails otherwise.
if docker compose up -d --wait; then
  echo "Deploy succeeded."
  # Best-effort cleanup of dangling images; never fail the deploy over it.
  docker image prune -f >/dev/null 2>&1 || true
  exit 0
fi

echo "::error::Deploy health check failed."
if [ -n "$PREV_IMAGE" ] && [ "$PREV_IMAGE" != "$IMAGE" ]; then
  echo "Rolling back to $PREV_IMAGE"
  write_compose "$PREV_IMAGE"
  if docker compose up -d --wait; then
    echo "Rollback succeeded."
  else
    echo "::error::Rollback also failed — manual intervention required."
  fi
else
  echo "No previous image available to roll back to."
fi
# The requested deploy failed, so report failure even after a good rollback.
exit 1
7 changes: 7 additions & 0 deletions tests/fixtures/bad-app/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Fixture image for the rollback integration test: the deliberately broken app.
FROM node:20-alpine
WORKDIR /app
COPY server.js .
EXPOSE 3000
# Probe /health on $PORT (3000 when PORT is unset). server.js in this fixture
# answers /health with HTTP 500, so this probe is expected to fail.
HEALTHCHECK --interval=5s --timeout=3s --retries=3 --start-period=5s \
CMD wget -qO /dev/null http://localhost:${PORT:-3000}/health || exit 1
CMD ["node", "server.js"]
14 changes: 14 additions & 0 deletions tests/fixtures/bad-app/server.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Deliberately unhealthy fixture server.
// Every request is answered with HTTP 500; /health additionally carries a
// JSON body identifying this build as "bad". The rollback integration test
// deploys this image to provoke a failed health check and a rollback.
const { createServer } = require('http');

const listenPort = process.env.PORT || 3000;

function handleRequest(request, response) {
  if (request.url !== '/health') {
    response.writeHead(500);
    response.end('bad');
    return;
  }
  const payload = JSON.stringify({ status: 'broken', build: 'bad' });
  response.writeHead(500, { 'Content-Type': 'application/json' });
  response.end(payload);
}

const server = createServer(handleRequest);
server.listen(listenPort, () => {
  console.log(`bad app on ${listenPort}`);
});
7 changes: 7 additions & 0 deletions tests/fixtures/good-app/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Fixture image for the rollback integration test: the healthy app.
FROM node:20-alpine
WORKDIR /app
COPY server.js .
EXPOSE 3000
# Probe /health on $PORT (3000 when PORT is unset). server.js in this fixture
# answers /health with HTTP 200.
HEALTHCHECK --interval=5s --timeout=3s --retries=3 --start-period=5s \
CMD wget -qO /dev/null http://localhost:${PORT:-3000}/health || exit 1
CMD ["node", "server.js"]
13 changes: 13 additions & 0 deletions tests/fixtures/good-app/server.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Healthy fixture server.
// Every request is answered with HTTP 200; /health additionally carries a
// JSON body identifying this build as "good". The rollback integration test
// uses it as the known-good image for deploy and rollback.
const { createServer } = require('http');

const listenPort = process.env.PORT || 3000;

function handleRequest(request, response) {
  if (request.url !== '/health') {
    response.writeHead(200);
    response.end('good');
    return;
  }
  const payload = JSON.stringify({ status: 'ok', build: 'good' });
  response.writeHead(200, { 'Content-Type': 'application/json' });
  response.end(payload);
}

const server = createServer(handleRequest);
server.listen(listenPort, () => {
  console.log(`good app on ${listenPort}`);
});
113 changes: 113 additions & 0 deletions tests/rollback-integration.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env bash
# Integration test for scripts/deploy-with-rollback.sh.
#
# Runs entirely on the CI runner using local `docker compose` — no SSH, no
# registry. Builds two local images (good, bad) and exercises the full
# deploy flow the CD workflow uses in production.
#
# Scenarios covered:
# 1. First deploy of the good image succeeds.
# 2. Attempting to deploy the bad image over the good one fails AND
# rollback restores the good image. After the attempt, /health must
# still return 200 from the good image.
# 3. First deploy of a bad image (no previous image) fails with a
# non-zero exit and leaves nothing healthy — the script must not
# silently swallow the failure.

# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Local-only image tags built from tests/fixtures; never pushed anywhere.
GOOD_IMAGE="rollback-test/good:1"
BAD_IMAGE="rollback-test/bad:1"

# Host port for the app under test; can be overridden from the environment.
: "${PORT:=38080}"

# Resolve the repository root relative to this script, then the script
# under test inside it.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
SCRIPT="${REPO_ROOT}/scripts/deploy-with-rollback.sh"

# Scratch deploy directory that will hold the generated docker-compose.yml.
WORK_DIR="$(mktemp -d)"

# Report a passed check on stdout. $1 is the description of the check.
pass() {
  printf 'PASS: %s\n' "$1"
}
# Report a failed check on stderr and terminate the test run with status 1.
# $1 is the description of what went wrong.
fail() {
  printf 'FAIL: %s\n' "$1" >&2
  exit 1
}

# Best-effort teardown, registered to run on every exit path:
#   - stop the compose project if a compose file was ever written,
#   - delete the scratch deploy directory,
#   - remove the two fixture images.
# Docker failures are deliberately ignored so teardown never masks the
# real test result.
cleanup() {
  local compose_file="${WORK_DIR}/docker-compose.yml"
  if [ -f "$compose_file" ]; then
    (
      if cd "$WORK_DIR"; then
        docker compose down -v --remove-orphans >/dev/null 2>&1 || true
      fi
    )
  fi
  rm -rf "$WORK_DIR"
  docker rmi -f "$GOOD_IMAGE" "$BAD_IMAGE" >/dev/null 2>&1 || true
}
trap cleanup EXIT

#######################################
# Poll the app's /health endpoint until it reports the expected build.
#
# Makes up to 10 attempts, one second apart. A failed or empty curl response
# counts as "not yet" rather than an error.
#
# Globals:
#   PORT (read) host port the app is published on
# Arguments:
#   $1 - expected value of the "build" field in the JSON body
# Outputs:
#   On timeout, writes a diagnostic including the last body to stderr.
# Returns:
#   0 as soon as the body contains "build":"<expected>", 1 after 10 misses.
#######################################
check_health() {
  local expected_build="$1"
  # Keep the response local so repeated calls do not leak state into the
  # calling script.
  local body=""
  local attempt
  for ((attempt = 1; attempt <= 10; attempt++)); do
    body="$(curl -sf "http://127.0.0.1:${PORT}/health" 2>/dev/null || true)"
    # Here-string instead of `echo | grep -q`: with pipefail enabled, a
    # SIGPIPE on the writer side could turn a real match into a failure.
    if grep -q "\"build\":\"${expected_build}\"" <<<"$body"; then
      return 0
    fi
    sleep 1
  done
  echo "health check did not return build=${expected_build} (last body: ${body:-<empty>})" >&2
  return 1
}

echo "==> Building fixture images"
fixtures_dir="${REPO_ROOT}/tests/fixtures"
docker build --tag "$GOOD_IMAGE" "${fixtures_dir}/good-app" >/dev/null
docker build --tag "$BAD_IMAGE" "${fixtures_dir}/bad-app" >/dev/null

# ---------------------------------------------------------------------------
# Scenario 1: with nothing deployed yet, the good image must come up cleanly
# and serve its own /health response.
# ---------------------------------------------------------------------------
echo "==> Scenario 1: deploy good image (first deploy)"
IMAGE="$GOOD_IMAGE" \
  PORT="$PORT" \
  DEPLOY_DIR="$WORK_DIR" \
  SKIP_PULL=1 \
  bash "$SCRIPT" \
  || fail "good image deploy returned non-zero"
pass "good image deploy returned 0"

if ! check_health good; then
  fail "good image not serving /health"
fi
pass "good image /health responds with build=good"

# ---------------------------------------------------------------------------
# Scenario 2: pushing the bad image over the running good one must be
# reported as a failure, and the good image must be serving again afterwards.
# ---------------------------------------------------------------------------
echo "==> Scenario 2: deploy bad image, expect rollback to good"
# Record the exit status without letting errexit stop the test.
deploy_rc=0
IMAGE="$BAD_IMAGE" \
  PORT="$PORT" \
  DEPLOY_DIR="$WORK_DIR" \
  SKIP_PULL=1 \
  bash "$SCRIPT" \
  || deploy_rc=$?
[ "$deploy_rc" -ne 0 ] \
  || fail "bad image deploy returned 0 — rollback script failed to detect failure"
pass "bad image deploy returned non-zero ($deploy_rc)"

# The rollback must have put the good build back behind /health.
if ! check_health good; then
  fail "rollback did not restore the good image"
fi
pass "rollback restored good image — /health still serves build=good"

# The compose file left on disk must reference the good image again.
grep -q "image: $GOOD_IMAGE" "$WORK_DIR/docker-compose.yml" \
  || fail "compose file does not point to good image after rollback"
pass "compose file was rewritten to the good image after rollback"

# Reset to a clean slate so scenario 3 starts with no previous image.
(
  if cd "$WORK_DIR"; then
    docker compose down -v --remove-orphans >/dev/null 2>&1 || true
  fi
)
rm -f "$WORK_DIR/docker-compose.yml"

# ---------------------------------------------------------------------------
# Scenario 3: with nothing to roll back to, deploying the bad image must
# still surface a non-zero exit status.
# ---------------------------------------------------------------------------
echo "==> Scenario 3: first deploy of bad image (no previous) — expect failure"
# Record the exit status without letting errexit stop the test.
rc=0
IMAGE="$BAD_IMAGE" \
  PORT="$PORT" \
  DEPLOY_DIR="$WORK_DIR" \
  SKIP_PULL=1 \
  bash "$SCRIPT" \
  || rc=$?
[ "$rc" -ne 0 ] \
  || fail "bad image first deploy returned 0 — failure was swallowed"
pass "bad image first deploy returned non-zero ($rc)"

echo ""
echo "All rollback integration scenarios passed."
Loading