feat(deploy): add deployment script library foundation #5

Workflow file for this run

name: Deploy Docker Compose

Check failure on line 1 in .github/workflows/deploy.yml

GitHub Actions / .github/workflows/deploy.yml

Invalid workflow file

(Line: 1214, Col: 14): Exceeded max expression length 21000
# Future Refactoring Recommendations:
# 1. Extract bash logic into separate composite actions or standalone scripts
# - Retry mechanisms (retry.sh, ssh_retry) could be a reusable composite action
# - Process_stack function could be extracted to a standalone script
# - Validation functions could be moved to a validation composite action
# 2. Consider splitting deploy and rollback into separate reusable workflows
# - Would improve readability and make each workflow easier to maintain
# - Could share common functionality via composite actions
# 3. Abstract common patterns into reusable functions
# - Exit code and log handling patterns appear multiple times
# - Could create helper functions for result parsing and analysis
# Note: Current implementation prioritizes single-file simplicity for easier debugging
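# Illustrative sketch (not part of this workflow): if the retry helpers were
# extracted as suggested above, a composite action wrapping retry.sh might look
# roughly like the following (hypothetical path .github/actions/retry/action.yml):
#
#   name: "Retry command"
#   description: "Run a shell command with retries and exponential backoff"
#   inputs:
#     command:
#       description: "Command to run"
#       required: true
#     max-attempts:
#       description: "Maximum number of attempts"
#       default: "3"
#     delay:
#       description: "Initial delay in seconds (doubles after each failure)"
#       default: "5"
#   runs:
#     using: "composite"
#     steps:
#       - shell: bash
#         run: |
#           source "${{ github.action_path }}/retry.sh"
#           retry "${{ inputs.max-attempts }}" "${{ inputs.delay }}" "${{ inputs.command }}"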
on:
workflow_call:
inputs:
args:
description: "docker compose up -d arguments"
required: false
type: string
stacks:
description: "JSON array of stack names to deploy"
required: true
type: string
webhook-url:
description: "1Password reference to Discord webhook URL"
required: true
type: string
repo-name:
description: "Repository display name for notifications"
required: true
type: string
target-ref:
description: "Git reference to checkout on remote server"
required: true
type: string
has-dockge:
description: "Whether this deployment includes Dockge"
required: false
type: boolean
default: false
force-deploy:
description: "Force deployment even if repository is already at target commit"
required: false
type: boolean
default: false
health-check-timeout:
description: "Health check timeout in seconds (default: 180)"
required: false
type: number
default: 180
health-check-command-timeout:
description: "Individual health check command timeout in seconds (default: 15)"
required: false
type: number
default: 15
critical-services:
description: "JSON array of critical service names that should trigger early exit on failure"
required: false
type: string
default: '[]'
git-fetch-timeout:
description: "Git fetch operation timeout in seconds (default: 300)"
required: false
type: number
default: 300
git-checkout-timeout:
description: "Git checkout operation timeout in seconds (default: 60)"
required: false
type: number
default: 60
image-pull-timeout:
description: "Docker image pull timeout in seconds (default: 600)"
required: false
type: number
default: 600
service-startup-timeout:
description: "Service startup timeout in seconds (default: 300)"
required: false
type: number
default: 300
validation-env-timeout:
description: "Environment validation timeout in seconds (default: 30)"
required: false
type: number
default: 30
validation-syntax-timeout:
description: "Syntax validation timeout in seconds (default: 60)"
required: false
type: number
default: 60
discord-user-id:
description: "Discord user ID to mention in failure notifications (e.g., '<@123456789>')"
required: false
type: string
default: ''
jobs:
deploy:
runs-on: ubuntu-24.04
if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
timeout-minutes: 40 # Overall job timeout
outputs:
previous_sha: ${{ steps.backup.outputs.previous_sha }}
deployment_needed: ${{ steps.backup.outputs.deployment_needed }}
deleted_files: ${{ steps.changed-files.outputs.deleted_files }}
deploy_status: ${{ steps.deploy.outcome }}
health_status: ${{ steps.health.outcome }}
cleanup_status: ${{ steps.cleanup.outcome }}
rollback_status: ${{ steps.rollback.outcome }}
rollback_health_status: ${{ steps.rollback-health.outcome }}
discovered_rollback_stacks: ${{ steps.rollback.outputs.discovered_rollback_stacks }}
healthy_stacks: ${{ steps.health.outputs.healthy_stacks }}
degraded_stacks: ${{ steps.health.outputs.degraded_stacks }}
failed_stacks: ${{ steps.health.outputs.failed_stacks }}
total_containers: ${{ steps.health.outputs.total_containers }}
running_containers: ${{ steps.health.outputs.running_containers }}
success_rate: ${{ steps.health.outputs.success_rate }}
rollback_healthy_stacks: ${{ steps.rollback-health.outputs.rollback_healthy_stacks }}
rollback_degraded_stacks: ${{ steps.rollback-health.outputs.rollback_degraded_stacks }}
rollback_failed_stacks: ${{ steps.rollback-health.outputs.rollback_failed_stacks }}
rollback_total_containers: ${{ steps.rollback-health.outputs.rollback_total_containers }}
rollback_running_containers: ${{ steps.rollback-health.outputs.rollback_running_containers }}
rollback_success_rate: ${{ steps.rollback-health.outputs.rollback_success_rate }}
removed_stacks: ${{ steps.cleanup-removed.outputs.removed_stacks }}
has_removed_stacks: ${{ steps.cleanup-removed.outputs.has_removed_stacks }}
steps:
- name: Validate and sanitize inputs
run: |
# Validate stacks parameter is valid JSON
echo '${{ inputs.stacks }}' | jq -r '.[]' >/dev/null || {
echo "::error::Invalid stacks JSON format: ${{ inputs.stacks }}"
exit 1
}
# Validate stack names contain only safe characters
echo '${{ inputs.stacks }}' | jq -r '.[]' | while read -r stack; do
if [[ ! "$stack" =~ ^[a-zA-Z0-9_-]+$ ]]; then
echo "::error::Invalid stack name: $stack. Only alphanumeric, underscore, and hyphen allowed."
exit 1
fi
# Check stack name length
if [ ${#stack} -gt 50 ]; then
echo "::error::Stack name too long: $stack (max 50 characters)"
exit 1
fi
done
# Validate target-ref format
TARGET_REF="${{ inputs.target-ref }}"
# Check if it's a valid commit SHA (7-40 hex chars) or branch/tag name
if [[ "$TARGET_REF" =~ ^[a-fA-F0-9]{7,40}$ ]] || [[ "$TARGET_REF" =~ ^[a-zA-Z0-9_-]+$ ]] || [[ "$TARGET_REF" =~ ^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$ ]]; then
echo "✅ Target-ref format valid: $TARGET_REF"
else
echo "::error::Invalid target-ref format: $TARGET_REF"
echo "::error::Expected: commit SHA (7-40 hex chars) or branch/tag name"
exit 1
fi
# Validate and sanitize compose args
COMPOSE_ARGS="${{ inputs.args }}"
if [[ -n "$COMPOSE_ARGS" ]]; then
# Check for dangerous characters and patterns
if [[ "$COMPOSE_ARGS" =~ [\;\&\|\`\$\\] ]]; then
echo "::error::Compose args contain potentially dangerous characters: $COMPOSE_ARGS"
echo "::error::Prohibited characters: ; & | \` $ \\"
exit 1
fi
# Check for suspicious patterns
if [[ "$COMPOSE_ARGS" =~ (rm|kill|shutdown|reboot|format|dd|\>|\<|sudo|su) ]]; then
echo "::error::Compose args contain prohibited commands: $COMPOSE_ARGS"
exit 1
fi
# Validate against known docker compose options - allow hyphens, spaces, and equals for arguments
if [[ "$COMPOSE_ARGS" =~ ^[a-zA-Z0-9[:space:]_=.-]+$ ]]; then
echo "✅ Compose args format valid: $COMPOSE_ARGS"
else
echo "::error::Compose args contain invalid characters: $COMPOSE_ARGS"
exit 1
fi
fi
# Validate webhook URL format
WEBHOOK_URL="${{ inputs.webhook-url }}"
if [[ ! "$WEBHOOK_URL" =~ ^op://[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$ ]]; then
echo "::error::Invalid webhook URL format: $WEBHOOK_URL"
echo "::error::Expected format: op://vault/item/field"
exit 1
fi
# Validate repo name
REPO_NAME="${{ inputs.repo-name }}"
if [[ ! "$REPO_NAME" =~ ^[a-zA-Z0-9_-]+$ ]] || [ ${#REPO_NAME} -gt 100 ]; then
echo "::error::Invalid repo name: $REPO_NAME"
echo "::error::Must be alphanumeric with hyphens/underscores, max 100 chars"
exit 1
fi
echo "✅ All input validation passed"
- name: Display version information
run: |
echo "📋 Workflow Version Information"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Repository: ${{ inputs.repo-name }}"
echo "Target ref: ${{ inputs.target-ref }}"
echo "Stacks: ${{ inputs.stacks }}"
echo "Runner: ${{ runner.os }} ${{ runner.arch }}"
echo ""
echo "ℹ️ Reusable workflow SHA shown in 'Uses:' line above"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
- name: Verify required tools
run: |
echo "🔍 Verifying required tools are available..."
# Check for jq (required for JSON parsing)
if ! command -v jq &> /dev/null; then
echo "::error::jq is not installed. Installing..."
sudo apt-get update -qq
sudo apt-get install -y jq
fi
echo "✅ jq version: $(jq --version)"
# Check for timeout (required for command timeouts)
if ! command -v timeout &> /dev/null; then
echo "::error::timeout command is not available"
exit 1
fi
echo "✅ timeout is available (part of coreutils)"
# Check for readarray (Bash 4.0+ built-in for array operations)
if ! declare -p BASH_VERSION &> /dev/null || [ "${BASH_VERSION%%.*}" -lt 4 ]; then
echo "::error::Bash 4.0+ is required (current: ${BASH_VERSION:-unknown})"
exit 1
fi
echo "✅ Bash version: $BASH_VERSION (supports readarray and associative arrays)"
echo "✅ All required tools verified"
- name: Setup retry mechanism
run: |
# Create retry function for bash commands
cat > /tmp/retry.sh << 'EOF'
#!/bin/bash
retry() {
local max_attempts=$1
local delay=$2
local command="${@:3}"
local attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Attempt $attempt of $max_attempts: $command"
if eval "$command"; then
echo "✅ Command succeeded on attempt $attempt"
return 0
else
echo "❌ Command failed on attempt $attempt"
if [ $attempt -lt $max_attempts ]; then
echo "⏳ Waiting ${delay}s before retry..."
sleep $delay
delay=$((delay * 2)) # Exponential backoff
fi
attempt=$((attempt + 1))
fi
done
echo "💥 Command failed after $max_attempts attempts"
return 1
}
# Create SSH retry function with specific error handling
ssh_retry() {
local max_attempts=$1
local delay=$2
local ssh_cmd="${@:3}"
local attempt=1
local last_exit_code=1
while [ $attempt -le $max_attempts ]; do
echo "SSH Attempt $attempt of $max_attempts" >&2
if eval "$ssh_cmd"; then
echo "✅ SSH command succeeded on attempt $attempt" >&2
return 0
else
last_exit_code=$?
echo "❌ SSH command failed on attempt $attempt (exit code: $last_exit_code)" >&2
# Check for specific SSH errors
case $last_exit_code in
255) echo "SSH connection error - network/auth issue" >&2 ;;
1) echo "General SSH error" >&2 ;;
*) echo "Unknown error code: $last_exit_code" >&2 ;;
esac
if [ $attempt -lt $max_attempts ]; then
echo "⏳ Waiting ${delay}s before SSH retry..." >&2
sleep $delay
fi
attempt=$((attempt + 1))
fi
done
echo "💥 SSH command failed after $max_attempts attempts (final exit code: $last_exit_code)" >&2
return $last_exit_code
}
EOF
chmod +x /tmp/retry.sh
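# Example usage (illustrative): later steps source this file and wrap flaky
# commands, e.g.
#   source /tmp/retry.sh
#   retry 3 5 "curl -fsS https://example.com/health"            # 3 attempts, 5s initial delay, doubling
#   ssh_retry 3 10 "ssh deployment-server 'docker compose ps'"  # surfaces SSH exit codes (255 = connection/auth)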
- name: Cache deployment tools
uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
with:
path: |
~/.cache/pip
~/.cache/docker
~/.ssh
key: deploy-tools-${{ runner.os }}-v1
restore-keys: |
deploy-tools-${{ runner.os }}-
- name: Configure 1Password Service Account
uses: 1password/load-secrets-action/configure@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
service-account-token: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
- name: Load Tailscale credentials
id: load-tailscale-credentials
uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
unset-previous: true
env:
TAILSCALE_OAUTH_CLIENT_ID: "op://Docker/tailscale-oauth/client_id"
TAILSCALE_OAUTH_SECRET: "op://Docker/tailscale-oauth/secret"
- name: Connect to Tailnet
uses: tailscale/github-action@53acf823325fe9ca47f4cdaa951f90b4b0de5bb9 # v4.1.1
with:
oauth-client-id: ${{ steps.load-tailscale-credentials.outputs.TAILSCALE_OAUTH_CLIENT_ID }}
oauth-secret: ${{ steps.load-tailscale-credentials.outputs.TAILSCALE_OAUTH_SECRET }}
tags: tag:ci
ping: ${{ secrets.SSH_HOST }}
- name: Unload Tailscale credentials
uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
unset-previous: true
- name: Optimize SSH connections
run: |
# Configure SSH connection multiplexing for better performance
mkdir -p ~/.ssh
cat >> ~/.ssh/config << EOF
Host deployment-server
HostName ${{ secrets.SSH_HOST }}
User ${{ secrets.SSH_USER }}
ControlMaster auto
ControlPath ~/.ssh/sockets/%r@%h:%p
ControlPersist 300
ServerAliveInterval 30
ServerAliveCountMax 3
Compression yes
TCPKeepAlive yes
EOF
# Create control socket directory and pre-establish SSH connection
mkdir -p ~/.ssh/sockets
echo "🔗 Pre-establishing SSH connection for multiplexing..."
ssh -o "StrictHostKeyChecking no" deployment-server -O check 2>/dev/null || \
ssh -o "StrictHostKeyChecking no" deployment-server -O forward -N &
# Give the connection a moment to establish
sleep 2
echo "✅ SSH connection optimization configured"
- name: Checkout repository for change detection
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0 # Fetch full history for accurate change detection
- name: Determine previous deployment SHA
id: determine-previous
run: |
# Use retry mechanism for SSH connection
source /tmp/retry.sh
# Get current deployment SHA with error handling
echo "🔍 Checking current deployment SHA for change detection..."
if CURRENT_SHA=$(ssh_retry 3 5 "ssh -o 'StrictHostKeyChecking no' ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} 'cd /opt/compose && git rev-parse HEAD 2>/dev/null'"); then
# Validate SHA format
if [[ "$CURRENT_SHA" =~ ^[a-fA-F0-9]{40}$ ]]; then
echo "✅ Current deployed SHA: $CURRENT_SHA"
echo "previous_sha=$CURRENT_SHA" >> "$GITHUB_OUTPUT"
else
echo "⚠️ Invalid SHA format from server: $CURRENT_SHA"
echo "⚠️ Using HEAD^ as fallback for change detection"
echo "previous_sha=HEAD^" >> "$GITHUB_OUTPUT"
fi
else
echo "⚠️ Could not retrieve current deployment SHA - using HEAD^ for change detection"
echo "previous_sha=HEAD^" >> "$GITHUB_OUTPUT"
fi
- name: Get changed files for removal detection
id: changed-files
if: steps.determine-previous.outputs.previous_sha != inputs.target-ref
continue-on-error: true
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
with:
json: true
sha: ${{ inputs.target-ref }}
base_sha: ${{ steps.determine-previous.outputs.previous_sha }}
- name: Store current deployment for rollback
id: backup
run: |
echo "::group::Preparing deployment backup"
# Use retry mechanism for SSH connection
source /tmp/retry.sh
# Get current deployment SHA with error handling
echo "🔍 Checking current deployment SHA..."
if CURRENT_SHA=$(ssh_retry 3 5 "ssh -o 'StrictHostKeyChecking no' ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} 'cd /opt/compose && git rev-parse HEAD 2>/dev/null'"); then
# Validate SHA format
if [[ "$CURRENT_SHA" =~ ^[a-fA-F0-9]{40}$ ]]; then
echo "✅ Current deployed SHA: $CURRENT_SHA"
else
echo "⚠️ Invalid SHA format from server: $CURRENT_SHA"
CURRENT_SHA="unknown"
fi
else
echo "⚠️ Could not retrieve current deployment SHA - assuming first deployment"
CURRENT_SHA="unknown"
fi
TARGET_REF="${{ inputs.target-ref }}"
echo "🎯 Target deployment ref: $TARGET_REF"
# Resolve target ref to SHA if it's not already a SHA
if [[ "$TARGET_REF" =~ ^[a-fA-F0-9]{40}$ ]]; then
TARGET_SHA="$TARGET_REF"
echo "✅ Target ref is already a full SHA"
elif [[ "$TARGET_REF" =~ ^[a-fA-F0-9]{7,39}$ ]]; then
TARGET_SHA="$TARGET_REF"
echo "✅ Target ref is a short SHA, will resolve on server"
else
TARGET_SHA="$TARGET_REF"
echo "✅ Target ref is a branch/tag name, will resolve on server"
fi
# Set outputs with proper validation
echo "previous_sha=${CURRENT_SHA}" >> "$GITHUB_OUTPUT"
if [ "$CURRENT_SHA" = "$TARGET_SHA" ] && [ "${{ inputs.force-deploy }}" != "true" ]; then
echo "⚠️ Repository is already at target commit - no deployment needed"
echo "deployment_needed=false" >> "$GITHUB_OUTPUT"
elif [ "$CURRENT_SHA" = "$TARGET_SHA" ] && [ "${{ inputs.force-deploy }}" = "true" ]; then
echo "🔄 Force deployment requested - proceeding despite same commit"
echo "deployment_needed=true" >> "$GITHUB_OUTPUT"
else
echo "✅ Deployment needed - proceeding with update"
echo "deployment_needed=true" >> "$GITHUB_OUTPUT"
fi
echo "::endgroup::"
# ================================================================
# STACK REMOVAL DETECTION AND CLEANUP
# ================================================================
# Automatically detect and clean up Docker stacks that have been
# removed from the repository using three independent detection methods.
#
# Detection Methods:
# 1. Git Diff: Compares current deployed SHA vs target SHA
# 2. Tree Comparison: Compares target commit tree vs server filesystem
# (catches removals from previous undeployed commits)
# 3. Discovery Analysis: Analyzes tj-actions/changed-files output
# (validates removals from GitHub perspective)
#
# Process:
# 1. Run all three detection methods independently on deployment server
# 2. Fail deployment if ANY detection method encounters errors (fail-safe)
# 3. Aggregate results using union approach (remove anything found by any method)
# 4. Deduplicate and validate stack names
# 5. Run 'docker compose down' for each removed stack
# 6. Fail deployment if any cleanup fails
# 7. Send Discord notification listing removed stacks
#
# Design: docs/plans/2025-12-06-enhanced-stack-removal-detection-design.md
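# Worked example (illustrative): if git diff finds {stack-a, stack-b}, tree
# comparison finds {stack-b, stack-c}, and discovery analysis finds nothing,
# the aggregated removal set (union, deduplicated) is {stack-a, stack-b, stack-c}
# and 'docker compose down' runs once for each of those three stacks.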
- name: Detect and clean up removed stacks
id: cleanup-removed
if: steps.backup.outputs.deployment_needed == 'true'
continue-on-error: false
run: |
# Source retry functions
source /tmp/retry.sh
# === DETECTION FUNCTION: GIT DIFF ===
# Purpose: Detect stacks removed between two git commits
# Inputs: $1=current_sha, $2=target_ref
# Output: Newline-separated list of stack names (stdout)
# Returns: 0 on success, 1 on error
detect_removed_stacks_gitdiff() {
local current_sha="$1"
local target_ref="$2"
# Build detection script
local detect_script
detect_script=$(cat << 'DETECT_EOF'
set -e
CURRENT_SHA="$1"
TARGET_REF="$2"
cd /opt/compose
# Fetch target ref to ensure we have it
if ! git fetch origin "$TARGET_REF" 2>/dev/null; then
echo "⚠️ Failed to fetch target ref, trying general fetch..." >&2
if ! git fetch 2>/dev/null; then
echo "::error::Failed to fetch repository updates" >&2
exit 1
fi
fi
# Resolve target ref to SHA for comparison
TARGET_SHA=$(git rev-parse "$TARGET_REF" 2>/dev/null || echo "$TARGET_REF")
# Validate both SHAs exist
if ! git cat-file -e "$CURRENT_SHA" 2>/dev/null; then
echo "::warning::Current SHA $CURRENT_SHA not found in repository (may have been replaced by force-push)" >&2
echo " Skipping git diff detection, will rely on tree comparison method" >&2
exit 1
fi
if ! git cat-file -e "$TARGET_SHA" 2>/dev/null; then
echo "::warning::Target SHA $TARGET_SHA not found in repository" >&2
echo " Skipping git diff detection, will rely on tree comparison method" >&2
exit 1
fi
# Find deleted compose.yaml files between current and target
git diff --diff-filter=D --name-only "$CURRENT_SHA" "$TARGET_SHA" 2>/dev/null | \
grep -E '^[^/]+/compose\.yaml$' | \
sed 's|/compose\.yaml||' || echo ""
DETECT_EOF
)
# Execute detection script on remote server
echo "$detect_script" | ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$current_sha\" \"$target_ref\""
}
# === DETECTION FUNCTION: TREE COMPARISON ===
# Purpose: Detect stacks on server filesystem missing from target commit tree
# Inputs: $1=target_ref
# Output: Newline-separated list of stack names (stdout)
# Returns: 0 on success, 1 on error
detect_removed_stacks_tree() {
local target_ref="$1"
# Build detection script
local detect_script
detect_script=$(cat << 'DETECT_TREE_EOF'
set -e
TARGET_REF="$1"
cd /opt/compose
# Fetch target ref to ensure we have it
if ! git fetch origin "$TARGET_REF" 2>/dev/null; then
echo "⚠️ Failed to fetch target ref, trying general fetch..." >&2
if ! git fetch 2>/dev/null; then
echo "::error::Failed to fetch repository updates" >&2
exit 1
fi
fi
# Resolve target ref to SHA
TARGET_SHA=$(git rev-parse "$TARGET_REF" 2>/dev/null || echo "$TARGET_REF")
# Validate target SHA exists
if ! git cat-file -e "$TARGET_SHA" 2>/dev/null; then
echo "::error::Target SHA $TARGET_SHA not found in repository" >&2
exit 1
fi
# Get directories in target commit (one level deep, directories only)
COMMIT_DIRS=$(git ls-tree --name-only "$TARGET_SHA" 2>/dev/null | sort)
# Get directories on server filesystem (exclude .git and hidden dirs)
SERVER_DIRS=$(find /opt/compose -maxdepth 1 -mindepth 1 -type d ! -name '.*' -exec basename {} \; 2>/dev/null | sort)
# Find directories on server but not in commit
MISSING_IN_COMMIT=$(comm -13 <(echo "$COMMIT_DIRS") <(echo "$SERVER_DIRS"))
# Filter for directories with compose.yaml files
for dir in $MISSING_IN_COMMIT; do
if [ -f "/opt/compose/$dir/compose.yaml" ]; then
echo "$dir"
fi
done
DETECT_TREE_EOF
)
# Execute detection script on remote server
echo "$detect_script" | ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$target_ref\""
}
# === DETECTION FUNCTION: DISCOVERY ANALYSIS ===
# Purpose: Analyze deleted files from tj-actions/changed-files output
# Inputs: $1=deleted_files_json (JSON array from tj-actions/changed-files)
# Output: Newline-separated list of stack names (stdout)
# Returns: 0 on success, 1 on error
detect_removed_stacks_discovery() {
local deleted_files_json="$1"
# Build detection script
local detect_script
detect_script=$(cat << 'DETECT_DISCOVERY_EOF'
set -e
DELETED_FILES_JSON="$1"
# Parse JSON array and filter for compose.yaml deletions
# Pattern: one level deep only (stack-name/compose.yaml)
echo "$DELETED_FILES_JSON" | jq -r '.[]' 2>/dev/null | \
grep -E '^[^/]+/compose\.yaml$' | \
sed 's|/compose\.yaml||' || echo ""
DETECT_DISCOVERY_EOF
)
# Execute detection script on remote server
echo "$detect_script" | ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$deleted_files_json\""
}
# === AGGREGATION FUNCTION ===
# Purpose: Merge and deduplicate results from all three detection methods
# Inputs: $1=gitdiff_stacks, $2=tree_stacks, $3=discovery_stacks (newline-separated lists)
# Output: Deduplicated newline-separated list of stack names (stdout)
# Returns: 0 on success (empty string if all inputs empty, not an error)
aggregate_removed_stacks() {
local gitdiff_stacks="$1"
local tree_stacks="$2"
local discovery_stacks="$3"
# Concatenate all three lists, remove empty lines, sort and deduplicate
{
echo "$gitdiff_stacks"
echo "$tree_stacks"
echo "$discovery_stacks"
} | \
grep -v '^$' | \
sort -u | \
grep -E '^[a-zA-Z0-9_-]+$' || echo ""
}
# === CLEANUP FUNCTION ===
# Purpose: Clean up a single removed stack using docker compose down
# Inputs: $1=stack_name
# Returns: 0 on success, 1 on error
# Note: Requires OP_SERVICE_ACCOUNT_TOKEN from GitHub secrets
# Security: Token passed as environment variable via heredoc (not command-line args) to avoid exposure in process listings
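# (Illustration of the risk being avoided: invoking "bash -s $STACK $OP_TOKEN" would expose the
# token in 'ps' output on the server; exporting it inside the heredoc keeps it off the argument list.)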
cleanup_stack() {
local stack="$1"
local op_token="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Build cleanup script that expects OP_SERVICE_ACCOUNT_TOKEN from environment
local cleanup_script
cleanup_script=$(cat << 'CLEANUP_EOF'
STACK="$1"
# Check if stack directory exists
if [ ! -d "/opt/compose/$STACK" ]; then
echo "⚠️ Stack directory not found for $STACK - already fully removed"
exit 0
fi
cd "/opt/compose/$STACK"
# Check if compose.yaml exists
if [ ! -f compose.yaml ]; then
echo "⚠️ compose.yaml not found for $STACK - may have been manually removed"
exit 0
fi
# Run docker compose down with 1Password
# Note: OP_SERVICE_ACCOUNT_TOKEN is set by the wrapper script
if op run --env-file=/opt/compose/compose.env -- docker compose -f ./compose.yaml down; then
echo "✅ Successfully cleaned up $STACK"
else
echo "❌ Failed to clean up $STACK"
exit 1
fi
CLEANUP_EOF
)
# Execute cleanup script on remote server
# Token passed as environment variable via heredoc to avoid exposure in process args
ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$stack\"" <<EOF
export OP_SERVICE_ACCOUNT_TOKEN="$op_token"
$cleanup_script
EOF
}
# === MAIN EXECUTION ===
echo "::group::Detecting removed stacks"
CURRENT_SHA="${{ steps.backup.outputs.previous_sha }}"
TARGET_REF="${{ inputs.target-ref }}"
# Skip detection if this is the first deployment
if [ "$CURRENT_SHA" = "unknown" ]; then
echo "ℹ️ First deployment detected - no previous stacks to remove"
echo "removed_stacks=" >> "$GITHUB_OUTPUT"
echo "has_removed_stacks=false" >> "$GITHUB_OUTPUT"
echo "::endgroup::"
exit 0
fi
echo "📊 Comparing commits:"
echo " Current: $CURRENT_SHA"
echo " Target: $TARGET_REF"
echo "🔍 Checking for removed stacks..."
# Read deleted files from changed-files step (may be empty if step failed)
DELETED_FILES='${{ steps.changed-files.outputs.deleted_files }}'
# Check if changed-files step succeeded
if [ "${{ steps.changed-files.outcome }}" != "success" ]; then
echo "⚠️ Changed-files detection failed (likely due to missing git ref)"
echo " Proceeding with git diff and tree comparison methods only"
fi
echo "🔍 Running three detection methods..."
# Execute all three detection methods independently
echo " 1. Git diff detection (commit comparison)..."
GITDIFF_STACKS=$(detect_removed_stacks_gitdiff "$CURRENT_SHA" "$TARGET_REF") || GITDIFF_EXIT=$?
echo " 2. Tree comparison detection (filesystem vs commit)..."
TREE_STACKS=$(detect_removed_stacks_tree "$TARGET_REF") || TREE_EXIT=$?
echo " 3. Discovery analysis detection (changed files)..."
if [ "$DELETED_FILES" = "[]" ] || [ -z "$DELETED_FILES" ]; then
# Empty JSON array or empty string - no deleted files to analyze
echo " ℹ️ No deleted files detected - skipping discovery analysis"
DISCOVERY_STACKS=""
DISCOVERY_EXIT=0
else
DISCOVERY_STACKS=$(detect_removed_stacks_discovery "$DELETED_FILES") || DISCOVERY_EXIT=$?
fi
# Fail deployment if any detection method failed (fail-safe)
if [ "${GITDIFF_EXIT:-0}" -ne 0 ]; then
echo "::error::Git diff detection failed (exit code: $GITDIFF_EXIT)"
exit 1
fi
if [ "${TREE_EXIT:-0}" -ne 0 ]; then
echo "::error::Tree comparison detection failed (exit code: $TREE_EXIT)"
exit 1
fi
if [ "${DISCOVERY_EXIT:-0}" -ne 0 ]; then
echo "::error::Discovery analysis detection failed (exit code: $DISCOVERY_EXIT)"
exit 1
fi
echo "✅ All detection methods completed successfully"
# Aggregate results (union of all three methods)
echo "📊 Aggregating results..."
REMOVED_STACKS=$(aggregate_removed_stacks "$GITDIFF_STACKS" "$TREE_STACKS" "$DISCOVERY_STACKS")
# Debug logging
if [ -n "$GITDIFF_STACKS" ]; then
echo " Git diff found: $(echo "$GITDIFF_STACKS" | tr '\n' ', ' | sed 's/,$//')"
fi
if [ -n "$TREE_STACKS" ]; then
echo " Tree comparison found: $(echo "$TREE_STACKS" | tr '\n' ', ' | sed 's/,$//')"
fi
if [ -n "$DISCOVERY_STACKS" ]; then
echo " Discovery analysis found: $(echo "$DISCOVERY_STACKS" | tr '\n' ', ' | sed 's/,$//')"
fi
# Process results
if [ -z "$REMOVED_STACKS" ]; then
echo "✅ No stacks to remove"
echo "removed_stacks=" >> "$GITHUB_OUTPUT"
echo "has_removed_stacks=false" >> "$GITHUB_OUTPUT"
else
echo "🗑️ Found stacks to remove:"
echo "$REMOVED_STACKS" | while read -r stack; do
echo " - $stack"
done
# Convert to JSON array for output
REMOVED_JSON=$(echo "$REMOVED_STACKS" | jq -R -s -c 'split("\n") | map(select(length > 0))')
echo "removed_stacks=$REMOVED_JSON" >> "$GITHUB_OUTPUT"
echo "has_removed_stacks=true" >> "$GITHUB_OUTPUT"
# Cleanup each removed stack
echo ""
echo "::group::Cleaning up removed stacks"
CLEANUP_FAILED=false
while IFS= read -r stack; do
[ -z "$stack" ] && continue
echo "🧹 Cleaning up stack: $stack"
if ! cleanup_stack "$stack"; then
echo "💥 Cleanup failed for stack: $stack"
CLEANUP_FAILED=true
break
fi
done <<< "$REMOVED_STACKS"
echo "::endgroup::"
if [ "$CLEANUP_FAILED" = "true" ]; then
echo "::error::Stack cleanup failed - stopping deployment"
exit 1
fi
echo "✅ All removed stacks cleaned successfully"
fi
echo "::endgroup::"
- name: Notify removed stacks cleanup
if: steps.cleanup-removed.outputs.has_removed_stacks == 'true'
run: |
echo "📢 Sending cleanup notification to Discord..."
# Get webhook URL from 1Password
WEBHOOK_URL=$(op read "${{ inputs.webhook-url }}")
# Build removed stacks list and create JSON payload using jq for proper escaping
REMOVED_STACKS='${{ steps.cleanup-removed.outputs.removed_stacks }}'
STACK_LIST=$(echo "$REMOVED_STACKS" | jq -r '.[] | "- " + .')
TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
# Build JSON payload with jq to ensure proper escaping
PAYLOAD=$(jq -n \
--arg title "🗑️ Stack Cleanup - ${{ inputs.repo-name }}" \
--arg description "Removed stacks have been cleaned up before deployment" \
--arg stacks "$STACK_LIST" \
--arg target "${{ inputs.target-ref }}" \
--arg previous "${{ steps.backup.outputs.previous_sha }}" \
--arg timestamp "$TIMESTAMP" \
'{
embeds: [{
title: $title,
description: $description,
color: 16753920,
fields: [
{name: "Removed Stacks", value: $stacks},
{name: "Target Commit", value: ("`" + $target + "`")},
{name: "Previous Commit", value: ("`" + $previous + "`")}
],
timestamp: $timestamp
}]
}')
# Send Discord notification
curl -X POST "$WEBHOOK_URL" \
-H "Content-Type: application/json" \
-d "$PAYLOAD"
echo "✅ Cleanup notification sent"
- name: Deploy All Stacks
id: deploy
if: steps.backup.outputs.deployment_needed == 'true'
continue-on-error: true
run: |
echo "🚀 Deploying all stacks"
# Source retry functions
source /tmp/retry.sh
# Set error handling
set -e
trap 'echo "❌ Deployment failed at line $LINENO"' ERR
# Parse inputs outside SSH context
STACKS="${{ join(fromJson(inputs.stacks), ' ') }}"
HAS_DOCKGE="${{ inputs.has-dockge }}"
TARGET_REF="${{ inputs.target-ref }}"
COMPOSE_ARGS="${{ inputs.args || '' }}"
# Use retry mechanism and optimized deployment
ssh_retry 3 10 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s $STACKS \"$HAS_DOCKGE\" \"$TARGET_REF\" \"$COMPOSE_ARGS\"" << 'EOF'
set -e
# Performance optimizations
export DOCKER_BUILDKIT=1
export COMPOSE_DOCKER_CLI_BUILD=1
# Enable parallel image pulls
export COMPOSE_PARALLEL_LIMIT=8
# Get arguments passed to script (excluding sensitive OP_TOKEN)
# Arguments: stack1 stack2 stack3 ... HAS_DOCKGE TARGET_REF [COMPOSE_ARGS]
# COMPOSE_ARGS might be empty, so we need to handle variable arg count
TOTAL_ARGS=$#
# Find HAS_DOCKGE by looking for 'true' or 'false' in the args
HAS_DOCKGE=""
TARGET_REF=""
COMPOSE_ARGS=""
# The last few args should be: HAS_DOCKGE TARGET_REF [COMPOSE_ARGS]
# HAS_DOCKGE is always 'true' or 'false'
# TARGET_REF is a commit SHA (starts with letter/number)
# COMPOSE_ARGS is optional and could be empty
for i in $(seq 1 $TOTAL_ARGS); do
ARG="${!i}"
if [ "$ARG" = "true" ] || [ "$ARG" = "false" ]; then
HAS_DOCKGE="$ARG"
TARGET_REF="${@:$((i+1)):1}"
if [ $((i+2)) -le $TOTAL_ARGS ]; then
COMPOSE_ARGS="${@:$((i+2)):1}"
fi
# All args before this position are stack names
STACKS="${@:1:$((i-1))}"
break
fi
done
# Set OP_TOKEN via environment (passed separately)
export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Consolidate timeout values for easier maintenance
# These can be overridden by workflow inputs where available
GIT_FETCH_TIMEOUT=${{ inputs.git-fetch-timeout }}
GIT_CHECKOUT_TIMEOUT=${{ inputs.git-checkout-timeout }}
IMAGE_PULL_TIMEOUT=${{ inputs.image-pull-timeout }}
SERVICE_STARTUP_TIMEOUT=${{ inputs.service-startup-timeout }}
VALIDATION_ENV_TIMEOUT=${{ inputs.validation-env-timeout }}
VALIDATION_SYNTAX_TIMEOUT=${{ inputs.validation-syntax-timeout }}
if [ "$HAS_DOCKGE" = "true" ]; then
echo "🚀 Deploying Dockge..."
cd /opt/dockge
# Add timeout protection for Dockge operations
if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
echo "❌ Dockge image pull timed out after ${IMAGE_PULL_TIMEOUT}s"
exit 1
fi
if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
echo "❌ Dockge startup timed out after ${SERVICE_STARTUP_TIMEOUT}s"
exit 1
fi
echo "✅ Dockge deployed successfully"
fi
echo "Updating repository to $TARGET_REF..."
# Add timeout protection to git operations
if ! timeout $GIT_FETCH_TIMEOUT git -C /opt/compose/ fetch; then
echo "❌ Git fetch timed out after ${GIT_FETCH_TIMEOUT}s"
exit 1
fi
if ! timeout $GIT_CHECKOUT_TIMEOUT git -C /opt/compose/ checkout $TARGET_REF; then
echo "❌ Git checkout timed out after ${GIT_CHECKOUT_TIMEOUT}s"
exit 1
fi
echo "✅ Repository updated to $TARGET_REF"
# Shared function to deploy or rollback a single stack
# This eliminates code duplication between deploy and rollback operations
process_stack() {
local STACK=$1
local OPERATION=$2 # "deploy" or "rollback"
local LOGFILE="/tmp/${OPERATION}_${STACK}.log"
local EXITCODEFILE="/tmp/${OPERATION}_${STACK}.exitcode"
{
if [ "$OPERATION" = "deploy" ]; then
echo "🚀 Deploying $STACK..."
else
echo "🔄 Rolling back $STACK..."
fi
cd /opt/compose/$STACK
echo " Pulling images for $STACK..."
# Add timeout protection for image pull (default 600s, configurable via image-pull-timeout)
if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
echo "❌ Failed to pull images for $STACK during $OPERATION (timeout or error)"
exit 1
fi
echo " Starting services for $STACK..."
# Add timeout protection for service startup (default 300s, configurable via service-startup-timeout)
if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
echo "❌ Failed to start services for $STACK during $OPERATION (timeout or error)"
exit 1
fi
if [ "$OPERATION" = "deploy" ]; then
echo "✅ $STACK deployed successfully"
else
echo "✅ $STACK rolled back successfully"
fi
} > "$LOGFILE" 2>&1
# Capture and save exit code for robust error detection
local exit_code=$?
echo "$exit_code" > "$EXITCODEFILE"
return $exit_code
}
# Wrapper function for deploy (maintains backward compatibility)
deploy_stack() {
process_stack "$1" "deploy"
}
# Cleanup function for deploy logs
cleanup_deploy_logs() {
for STACK in $STACKS; do
rm -f "/tmp/deploy_${STACK}.log" 2>/dev/null
done
}
# Pre-deployment validation function
validate_all_stacks() {
echo "🔍 Pre-deployment validation of all stacks..."
local validation_failed=false
for STACK in $STACKS; do
echo " Validating $STACK..."
# Check if stack directory exists
if [ ! -d "/opt/compose/$STACK" ]; then
echo "❌ $STACK: Directory /opt/compose/$STACK not found"
validation_failed=true
continue
fi
cd "/opt/compose/$STACK" || {
echo "❌ $STACK: Cannot access directory"
validation_failed=true
continue
}
# Check if compose.yaml exists
if [ ! -f "compose.yaml" ]; then
echo "❌ $STACK: compose.yaml not found"
validation_failed=true
continue
fi
# Validate 1Password environment access and Docker Compose config
if ! timeout $VALIDATION_ENV_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services >/dev/null 2>&1; then
echo "❌ $STACK: Environment validation failed (1Password or compose config error)"
validation_failed=true
continue
fi
# Quick syntax validation
if ! timeout $VALIDATION_SYNTAX_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --quiet 2>/dev/null; then
echo "❌ $STACK: Docker Compose syntax validation failed"
validation_failed=true
continue
fi
echo "✅ $STACK: Pre-deployment validation passed"
done
if [ "$validation_failed" = true ]; then
echo "❌ Pre-deployment validation failed for one or more stacks"
echo " Stopping deployment to prevent extended failures"
return 1
fi
echo "✅ All stacks passed pre-deployment validation"
return 0
}
# Run pre-deployment validation
if ! validate_all_stacks; then
echo "DEPLOYMENT_STATUS=failed_validation" >> "$GITHUB_OUTPUT"
exit 1
fi
# Set trap for cleanup on exit
trap cleanup_deploy_logs EXIT
# Start all deployments in parallel
echo "🚀 Starting parallel deployment of all stacks..."
PIDS=""
# Simple approach - use for loop directly with unquoted variable
for STACK in $STACKS; do
echo "🚀 Deploying $STACK..."
deploy_stack "$STACK" &
PIDS="$PIDS $!"
echo "Started deployment of $STACK (PID: $!)"
done
# Wait for all deployments and collect results
echo "⏳ Waiting for all deployments to complete..."
FAILED_STACKS=""
# Enhanced parallel job monitoring with better error propagation
echo "⏳ Monitoring parallel deployments..."
DEPLOYED_STACKS=""
SUCCESSFUL_STACKS=""
DEPLOYMENT_ERRORS=""
# Wait for jobs individually to capture exit codes
for PID in $PIDS; do
if wait "$PID"; then
echo "✅ Deployment process $PID completed successfully"
else
EXIT_CODE=$?
echo "❌ Deployment process $PID failed with exit code $EXIT_CODE"
DEPLOYMENT_ERRORS="$DEPLOYMENT_ERRORS PID:$PID:$EXIT_CODE"
fi
done
# Enhanced result analysis using exit code files (more robust than log parsing)
for STACK in $STACKS; do
if [ -f "/tmp/deploy_${STACK}.log" ]; then
DEPLOYED_STACKS="$DEPLOYED_STACKS $STACK"
# Primary: Check exit code file for robust error detection
if [ -f "/tmp/deploy_${STACK}.exitcode" ]; then
EXIT_CODE=$(cat "/tmp/deploy_${STACK}.exitcode")
if [ "$EXIT_CODE" -eq 0 ]; then
SUCCESSFUL_STACKS="$SUCCESSFUL_STACKS $STACK"
else
FAILED_STACKS="$FAILED_STACKS $STACK"
echo "🔍 $STACK Error: Non-zero exit code ($EXIT_CODE)"
fi
else
# Fallback: Log-based error detection if exit code file is missing
echo "⚠️ $STACK: Exit code file missing - using less reliable log-based detection"
if grep -q "❌.*$STACK\|CRITICAL.*$STACK\|Failed.*$STACK\|Error.*$STACK" "/tmp/deploy_${STACK}.log"; then
FAILED_STACKS="$FAILED_STACKS $STACK"
# Extract specific error for reporting
STACK_ERROR=$(grep -E "❌.*$STACK|CRITICAL.*$STACK|Failed.*$STACK|Error.*$STACK" "/tmp/deploy_${STACK}.log" | head -1)
echo "🔍 $STACK Error: $STACK_ERROR"
elif grep -q "✅.*$STACK\|Successfully.*$STACK" "/tmp/deploy_${STACK}.log"; then
SUCCESSFUL_STACKS="$SUCCESSFUL_STACKS $STACK"
else
echo "⚠️ $STACK: No clear success/failure indicator in logs - treating as potential failure"
FAILED_STACKS="$FAILED_STACKS $STACK"
fi
fi
else
echo "⚠️ $STACK: No deployment log found - possible early failure"
FAILED_STACKS="$FAILED_STACKS $STACK"
fi
done
# Summary of deployment results
echo ""
echo "📊 Deployment Summary:"
echo " Successful: $(echo $SUCCESSFUL_STACKS | wc -w | tr -d ' ') stacks"
echo " Failed: $(echo $FAILED_STACKS | wc -w | tr -d ' ') stacks"
if [ -n "$DEPLOYMENT_ERRORS" ]; then
echo " Process errors: $DEPLOYMENT_ERRORS"
fi
# Display deployment logs with enhanced formatting
echo ""
echo "📋 Detailed Deployment Results:"
echo "════════════════════════════════════════════════════════════════"
for STACK in $STACKS; do
if [ -f "/tmp/deploy_${STACK}.log" ]; then
echo ""
echo "🔸 STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
cat "/tmp/deploy_${STACK}.log"
echo "────────────────────────────────────────────────────────────────"
else
echo ""
echo "🔸 STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
echo "⚠️ No deployment log found for $STACK"
echo "────────────────────────────────────────────────────────────────"
fi
done
echo "════════════════════════════════════════════════════════════════"
# Check if any deployments failed
if [ -z "$STACKS" ]; then
echo "💥 No stacks to deploy - STACKS variable is empty!"
exit 1
elif [ -z "$DEPLOYED_STACKS" ]; then
echo "💥 No stacks were actually deployed - check stack discovery!"
exit 1
elif [ -n "$FAILED_STACKS" ]; then
echo "💥 Deployments failed for:$FAILED_STACKS"
exit 1
fi
echo "🎉 All stacks deployed successfully in parallel!"
EOF
- name: Health Check All Services
id: health
if: steps.backup.outputs.deployment_needed == 'true' && steps.deploy.outcome == 'success'
run: |
echo "🔍 Health checking all services"
# Source retry functions
source /tmp/retry.sh
# Parse inputs outside SSH context
STACKS="${{ join(fromJSON(inputs.stacks), ' ') }}"
HAS_DOCKGE="${{ inputs.has-dockge }}"
# Execute health check and capture structured output
# Temporarily disable set -e to capture exit code from command substitution
# Use retry mechanism for health check
set +e
HEALTH_RESULT=$(ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} /bin/bash -s $STACKS \"$HAS_DOCKGE\"" << 'EOF'
set -e
# Get arguments passed to script (excluding sensitive OP_TOKEN)
TOTAL_ARGS=$#
# Find HAS_DOCKGE by looking for 'true' or 'false' in the args
HAS_DOCKGE=""
for i in $(seq 1 $TOTAL_ARGS); do
ARG="${!i}"
if [ "$ARG" = "true" ] || [ "$ARG" = "false" ]; then
HAS_DOCKGE="$ARG"
# All args before this position are stack names
STACKS="${@:1:$((i-1))}"
break
fi
done
# Set OP_TOKEN via environment (passed separately)
export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Set timeout configuration with defaults
HEALTH_CHECK_TIMEOUT=${{ inputs.health-check-timeout }}
HEALTH_CHECK_CMD_TIMEOUT=${{ inputs.health-check-command-timeout }}
# Enhanced health check with exponential backoff
echo "🔍 Starting enhanced health check with exponential backoff..."
# Health check function with retry logic
health_check_with_retry() {
local stack=$1
local logfile="/tmp/health_${stack}.log"
# Use configurable timeout with fallback to defaults
local timeout_seconds=${HEALTH_CHECK_TIMEOUT:-180}
local max_attempts=4
local wait_time=3
local attempt=1
local fast_fail_threshold=2 # Fast fail after 2 attempts if no progress
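# Backoff illustration: with wait_time=3 and max_attempts=4, failed attempts wait
# roughly 3s, 6s, then 12s before retrying (doubling each time, capped at 20s below)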
local start_time=$(date +%s)
# Create log file and redirect all output
exec 3>&1 4>&2
exec 1>"$logfile" 2>&1
# Ensure file descriptors are restored on function exit
trap 'exec 1>&3 2>&4 3>&- 4>&-' RETURN
echo "🕰️ Health check timeout configured: ${timeout_seconds}s"
echo "🔍 Health checking $stack with optimized retry logic..."
cd "/opt/compose/$stack" || {
echo "❌ $stack: Directory not found"
return 1
}
# Cache total service count (doesn't change during health check)
local total_count
total_count=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0")
if [ "$total_count" -eq 0 ]; then
echo "❌ $stack: No services defined in compose file"
return 1
fi
local previous_running=0
local no_progress_count=0
while [ $attempt -le $max_attempts ]; do
echo " Attempt $attempt/$max_attempts for $stack (wait: ${wait_time}s)"
# Get container status and health with error handling
local running_healthy running_starting running_unhealthy running_no_health
local exited_count restarting_count running_count
# Check overall timeout
local current_time=$(date +%s)
local elapsed=$((current_time - start_time))
if [ $elapsed -gt $timeout_seconds ]; then
echo "❌ $stack: Health check timed out after ${elapsed}s (limit: ${timeout_seconds}s)"
return 1
fi
# Get container state and health in one call using custom format
# Format: Service State Health (tab-separated)
local ps_output
ps_output=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps --format '{{.Service}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo "")
# Parse output to count different states and health conditions
running_healthy=0
running_starting=0
running_unhealthy=0
running_no_health=0
exited_count=0
restarting_count=0
while IFS=$'\t' read -r service state health; do
# Skip empty lines
[ -z "$service" ] && continue
case "$state" in
running)
case "$health" in
healthy)
running_healthy=$((running_healthy + 1))
;;
starting)
running_starting=$((running_starting + 1))
;;
unhealthy)
running_unhealthy=$((running_unhealthy + 1))
;;
*)
# No health check defined
running_no_health=$((running_no_health + 1))
;;
esac
;;
exited)
exited_count=$((exited_count + 1))
;;
restarting)
restarting_count=$((restarting_count + 1))
;;
esac
done <<< "$ps_output"
# Total running containers (all health states)
running_count=$((running_healthy + running_starting + running_unhealthy + running_no_health))
echo " $stack status: $running_count/$total_count running (healthy: $running_healthy, starting: $running_starting, unhealthy: $running_unhealthy, no-check: $running_no_health), exited: $exited_count, restarting: $restarting_count"
# Fast fail logic: if unhealthy or no progress with failures
if [ "$running_unhealthy" -gt 0 ] && [ $attempt -ge $fast_fail_threshold ]; then
echo "❌ $stack: Fast fail - $running_unhealthy unhealthy containers detected (attempt $attempt)"
return 1
elif [ $attempt -ge $fast_fail_threshold ] && [ "$running_count" -eq "$previous_running" ] && [ "$exited_count" -gt 0 ]; then
no_progress_count=$((no_progress_count + 1))
if [ $no_progress_count -ge 2 ]; then
echo "❌ $stack: Fast fail - no progress and containers failing (attempt $attempt)"
return 1
fi
else
no_progress_count=0
fi
# Calculate healthy containers (healthy + no health check defined)
local healthy_total=$((running_healthy + running_no_health))
# Success condition: all containers running and healthy (or no health check)
if [ "$healthy_total" -eq "$total_count" ] && [ "$total_count" -gt 0 ] && [ "$running_starting" -eq 0 ] && [ "$running_unhealthy" -eq 0 ] && [ "$exited_count" -eq 0 ] && [ "$restarting_count" -eq 0 ]; then
echo "✅ $stack: All $total_count services healthy"
return 0
# Degraded but stable: all running and healthy, but fewer than expected
elif [ "$healthy_total" -gt 0 ] && [ "$healthy_total" -eq "$running_count" ] && [ "$running_starting" -eq 0 ] && [ "$running_unhealthy" -eq 0 ] && [ "$exited_count" -eq 0 ] && [ "$restarting_count" -eq 0 ]; then
echo "⚠️ $stack: $healthy_total/$total_count services healthy (degraded but stable)"
return 2 # Degraded but acceptable
# Still starting: health checks initializing, allow retry
elif [ "$running_starting" -gt 0 ] && [ "$running_unhealthy" -eq 0 ] && [ $attempt -lt $max_attempts ]; then
echo " $stack: $running_starting services still initializing health checks..."
sleep $wait_time
wait_time=$((wait_time * 2))
if [ $wait_time -gt 20 ]; then
wait_time=20
fi
# Final attempt failure
elif [ $attempt -eq $max_attempts ]; then
if [ "$running_unhealthy" -gt 0 ]; then
echo "❌ $stack: Failed - $running_unhealthy services unhealthy after $max_attempts attempts"
elif [ "$running_starting" -gt 0 ]; then
echo "❌ $stack: Failed - $running_starting services still starting after $max_attempts attempts"
else
echo "❌ $stack: Failed after $max_attempts attempts ($running_count/$total_count running, $healthy_total healthy)"
fi
return 1
# Continue with exponential backoff
else
echo " $stack: Not ready yet, waiting ${wait_time}s..."
sleep $wait_time
wait_time=$((wait_time * 2))
if [ $wait_time -gt 20 ]; then
wait_time=20
fi
fi
previous_running=$running_count
attempt=$((attempt + 1))
done
}
FAILED_STACKS=""
DEGRADED_STACKS=""
HEALTHY_STACKS=""
TOTAL_CONTAINERS=0
RUNNING_CONTAINERS=0
if [ "$HAS_DOCKGE" = "true" ]; then
echo "🔍 Health checking Dockge with retry logic..."
cd /opt/dockge
# Retry logic for Dockge with health check verification
# Plain (non-local) variables: this block runs at the remote script's top level, where 'local' is invalid
dockge_max_attempts=3
dockge_attempt=1
dockge_wait=3
# Get total services
DOCKGE_TOTAL=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose config --services 2>/dev/null | wc -l | tr -d " " || echo "0")
while [ $dockge_attempt -le $dockge_max_attempts ]; do
# Get Dockge state and health
dockge_ps_output=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose ps --format '{{.Service}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo "")
# Parse health states
dockge_healthy=0
dockge_starting=0
dockge_unhealthy=0
dockge_no_health=0
while IFS=$'\t' read -r service state health; do
[ -z "$service" ] && continue
if [ "$state" = "running" ]; then
case "$health" in
healthy) dockge_healthy=$((dockge_healthy + 1)) ;;
starting) dockge_starting=$((dockge_starting + 1)) ;;
unhealthy) dockge_unhealthy=$((dockge_unhealthy + 1)) ;;
*) dockge_no_health=$((dockge_no_health + 1)) ;;
esac
fi
done <<< "$dockge_ps_output"
dockge_running=$((dockge_healthy + dockge_starting + dockge_unhealthy + dockge_no_health))
dockge_healthy_total=$((dockge_healthy + dockge_no_health))
echo " Dockge attempt $dockge_attempt/$dockge_max_attempts: $dockge_running/$DOCKGE_TOTAL running (healthy: $dockge_healthy, starting: $dockge_starting, unhealthy: $dockge_unhealthy, no-check: $dockge_no_health)"
# Success: all healthy
if [ "$dockge_healthy_total" -eq "$DOCKGE_TOTAL" ] && [ "$DOCKGE_TOTAL" -gt 0 ] && [ "$dockge_starting" -eq 0 ] && [ "$dockge_unhealthy" -eq 0 ]; then
break
# Unhealthy detected - fail
elif [ "$dockge_unhealthy" -gt 0 ]; then
echo " Dockge has $dockge_unhealthy unhealthy services"
break
# Degraded but stable: some healthy, final attempt
elif [ "$dockge_healthy_total" -gt 0 ] && [ "$dockge_unhealthy" -eq 0 ] && [ $dockge_attempt -eq $dockge_max_attempts ]; then
break
# Retry
elif [ $dockge_attempt -lt $dockge_max_attempts ]; then
echo " Dockge not ready, waiting ${dockge_wait}s..."
sleep $dockge_wait
dockge_wait=$((dockge_wait * 2))
fi
dockge_attempt=$((dockge_attempt + 1))
done
TOTAL_CONTAINERS=$((TOTAL_CONTAINERS + DOCKGE_TOTAL))
RUNNING_CONTAINERS=$((RUNNING_CONTAINERS + dockge_running))
if [ "$dockge_unhealthy" -gt 0 ]; then
echo "❌ Dockge: $dockge_unhealthy services unhealthy"
FAILED_STACKS="$FAILED_STACKS dockge"
elif [ "$dockge_running" -eq 0 ]; then
echo "❌ Dockge: 0/$DOCKGE_TOTAL services running"
FAILED_STACKS="$FAILED_STACKS dockge"
elif [ "$dockge_healthy_total" -eq "$DOCKGE_TOTAL" ]; then
echo "✅ Dockge: All $DOCKGE_TOTAL services healthy"
HEALTHY_STACKS="$HEALTHY_STACKS dockge"
else
echo "⚠️ Dockge: $dockge_healthy_total/$DOCKGE_TOTAL services healthy (degraded)"
DEGRADED_STACKS="$DEGRADED_STACKS dockge"
fi
fi
# Parse critical services list
# Note: CRITICAL_SERVICES contains stack names (not individual Docker service names)
# This matches stacks that are considered critical for the deployment
# Example: ["portainer", "dockge"] identifies these stacks as critical
CRITICAL_SERVICES_ARRAY=()
if [ -n "$CRITICAL_SERVICES" ] && [ "$CRITICAL_SERVICES" != "[]" ]; then
# Convert JSON array to bash array using jq for robust parsing and preserve spaces/special characters
readarray -t CRITICAL_SERVICES_ARRAY < <(echo "$CRITICAL_SERVICES" | jq -r '.[]')
echo "🚨 Critical stacks configured: ${CRITICAL_SERVICES_ARRAY[*]}"
fi
# Function to check if a stack is critical
# Parameter: stack name to check
# Returns: 0 if critical, 1 if not critical
is_critical_service() {
local stack_name=$1
for critical in "${CRITICAL_SERVICES_ARRAY[@]}"; do
if [ "$stack_name" = "$critical" ]; then
return 0
fi
done
return 1
}
# Enhanced health checks with sequential retry logic and early exit
echo "🔍 Starting enhanced health checks with retry logic..."
CRITICAL_FAILURE=false
# Disable exit on error for health checks to ensure we reach output section
set +e
# Check each stack with the new enhanced health check
for STACK in $STACKS; do
echo ""
echo "🔍 Checking stack: $STACK"
health_check_with_retry "$STACK"
HEALTH_RESULT=$?
case $HEALTH_RESULT in
0)
# Output already restored in health_check_with_retry
echo "✅ $STACK: Healthy"
HEALTHY_STACKS="$HEALTHY_STACKS $STACK"
;;
2)
# Output already restored in health_check_with_retry
echo "⚠️ $STACK: Degraded but stable"
DEGRADED_STACKS="$DEGRADED_STACKS $STACK"
# Check if degraded stack is critical
if is_critical_service "$STACK"; then
echo "🚨 CRITICAL SERVICE DEGRADED: $STACK"
echo " Continuing monitoring but flagging for attention"
fi
;;
*)
# For failures, output is already restored in health_check_with_retry
echo "❌ $STACK: Failed health check"
FAILED_STACKS="$FAILED_STACKS $STACK"
# Check if failed stack is critical - trigger early exit
if is_critical_service "$STACK"; then
echo "🚨 CRITICAL SERVICE FAILURE: $STACK"
echo " This is a critical service failure - triggering early exit"
echo " Remaining stacks will not be health checked"
CRITICAL_FAILURE=true
break
fi
;;
esac
done
# Count services across all stacks after health checks complete
echo ""
echo "📊 Counting services across all stacks..."
if [ -z "$STACKS" ]; then
echo "ERROR: STACKS variable is empty! Cannot count services."
echo "Will attempt to discover stacks from filesystem..."
DISCOVERED_STACKS=""
for dir in /opt/compose/*/; do
if [ -d "$dir" ] && [ -f "$dir/compose.yaml" ]; then
STACK_NAME=$(basename "$dir")
DISCOVERED_STACKS="$DISCOVERED_STACKS $STACK_NAME"
fi
done
STACKS=$(echo "$DISCOVERED_STACKS" | xargs)
echo "Discovered stacks: $STACKS"
fi
for STACK in $STACKS; do
STACK_RUNNING=$(cd /opt/compose/$STACK 2>/dev/null && op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps --services --filter "status=running" 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' 2>/dev/null | wc -l | tr -d " " || echo "0")
STACK_TOTAL=$(cd /opt/compose/$STACK 2>/dev/null && op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' 2>/dev/null | wc -l | tr -d " " || echo "0")
echo " $STACK: $STACK_RUNNING/$STACK_TOTAL services"
TOTAL_CONTAINERS=$((TOTAL_CONTAINERS + STACK_TOTAL))
RUNNING_CONTAINERS=$((RUNNING_CONTAINERS + STACK_RUNNING))
done
# Write outputs to temp file to ensure capture even if script exits early
TEMP_OUTPUT="/tmp/github_health_check_outputs.txt"
echo "healthy_stacks=$(echo $HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" > "$TEMP_OUTPUT"
echo "degraded_stacks=$(echo $DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" >> "$TEMP_OUTPUT"
echo "failed_stacks=$(echo $FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" >> "$TEMP_OUTPUT"
echo "total_containers=$TOTAL_CONTAINERS" >> "$TEMP_OUTPUT"
echo "running_containers=$RUNNING_CONTAINERS" >> "$TEMP_OUTPUT"
if [ "$TOTAL_CONTAINERS" -gt 0 ]; then
echo "success_rate=$(( RUNNING_CONTAINERS * 100 / TOTAL_CONTAINERS ))" >> "$TEMP_OUTPUT"
else
echo "success_rate=0" >> "$TEMP_OUTPUT"
fi
# Handle critical service failure
if [ "$CRITICAL_FAILURE" = true ]; then
echo ""
echo "❌ CRITICAL SERVICE FAILURE DETECTED"
echo " Deployment marked as failed due to critical service failure"
echo " Health check terminated early to prevent extended failure cycles"
# GITHUB_OUTPUT is not available on the remote host; the stack lists were already
# written to "$TEMP_OUTPUT" above, so emit them with the parse markers before the
# early exit so the runner-side failure handler can still capture them
echo "health_status=failed_critical" >> "$TEMP_OUTPUT"
echo "GITHUB_OUTPUT_START"
cat "$TEMP_OUTPUT"
echo "GITHUB_OUTPUT_END"
exit 1
fi
echo "📊 Total service count: $RUNNING_CONTAINERS/$TOTAL_CONTAINERS across all stacks"
# Display comprehensive health check results
echo ""
echo "📊 Health Check Summary:"
echo "════════════════════════"
echo "Total Services: $TOTAL_CONTAINERS"
echo "Running Services: $RUNNING_CONTAINERS"
if [ "$TOTAL_CONTAINERS" -gt 0 ]; then
echo "Success Rate: $(( RUNNING_CONTAINERS * 100 / TOTAL_CONTAINERS ))%"
else
echo "Success Rate: 0%"
fi
echo ""
# Display results by category
[ -n "$HEALTHY_STACKS" ] && echo "✅ Healthy Stacks: $(echo $HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
[ -n "$DEGRADED_STACKS" ] && echo "⚠️ Degraded Stacks: $(echo $DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
[ -n "$FAILED_STACKS" ] && echo "❌ Failed Stacks: $(echo $FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
echo ""
echo "📋 Detailed Health Check Results:"
echo "════════════════════════════════════════════════════════════════"
for STACK in $STACKS; do
if [ -f "/tmp/health_${STACK}.log" ]; then
echo ""
echo "🔸 STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
cat "/tmp/health_${STACK}.log"
echo "────────────────────────────────────────────────────────────────"
else
echo ""
echo "🔸 STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
echo "⚠️ No health check log found for $STACK"
echo "────────────────────────────────────────────────────────────────"
fi
done
echo "════════════════════════════════════════════════════════════════"
# Output results in parseable format (temp file already written earlier)
echo "GITHUB_OUTPUT_START"
cat "$TEMP_OUTPUT"
echo "GITHUB_OUTPUT_END"
set -e # Re-enable exit on error after outputs are written
# Determine final health status
if [ -n "$FAILED_STACKS" ]; then
echo ""
echo "💥 Health check failed - some stacks are not running"
exit 1
elif [ -n "$DEGRADED_STACKS" ]; then
echo ""
echo "⚠️ Health check passed with warnings - some services degraded"
exit 0
else
echo ""
echo "🎉 All services are fully healthy!"
exit 0
fi
EOF
)
HEALTH_EXIT_CODE=$?
set -e
# Check if health check command failed
if [ $HEALTH_EXIT_CODE -ne 0 ]; then
echo "::error::Health check failed with exit code: $HEALTH_EXIT_CODE"
echo "💥 Health check command failed - marking deployment as failed"
# Still extract outputs for debugging before failing
echo "$HEALTH_RESULT"
if echo "$HEALTH_RESULT" | grep -q "GITHUB_OUTPUT_START"; then
echo "$HEALTH_RESULT" | sed -n '/GITHUB_OUTPUT_START/,/GITHUB_OUTPUT_END/p' | grep -E "^(healthy_stacks|degraded_stacks|failed_stacks|total_containers|running_containers|success_rate)=" >> "$GITHUB_OUTPUT" || true
fi
exit 1
fi
# Extract health outputs from structured result
echo "$HEALTH_RESULT"
# Parse outputs without temporary files
if echo "$HEALTH_RESULT" | grep -q "GITHUB_OUTPUT_START"; then
echo "$HEALTH_RESULT" | sed -n '/GITHUB_OUTPUT_START/,/GITHUB_OUTPUT_END/p' | grep -E "^(healthy_stacks|degraded_stacks|failed_stacks|total_containers|running_containers|success_rate)=" >> "$GITHUB_OUTPUT"
else
echo "⚠️ GITHUB_OUTPUT_START marker not found, attempting to read from temp file..."
# Try to read from temp file on remote server
TEMP_FILE_CONTENT=$(ssh -o "StrictHostKeyChecking no" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} 'cat /tmp/github_health_check_outputs.txt 2>/dev/null' || echo "")
if [ -n "$TEMP_FILE_CONTENT" ]; then
echo "✅ Successfully read outputs from temp file"
echo "$TEMP_FILE_CONTENT" >> "$GITHUB_OUTPUT"
else
echo "❌ Could not read temp file, using fallback outputs"
# Fallback outputs if parsing fails
{
echo "healthy_stacks="
echo "degraded_stacks="
echo "failed_stacks="
echo "total_containers=0"
echo "running_containers=0"
echo "success_rate=0"
} >> "$GITHUB_OUTPUT"
fi
fi
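# Illustrative parse (hypothetical values): if HEALTH_RESULT contains
#   GITHUB_OUTPUT_START
#   healthy_stacks=immich, traefik
#   total_containers=12
#   GITHUB_OUTPUT_END
# only the whitelisted key=value lines between the markers are appended to $GITHUB_OUTPUT.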
- name: Cleanup unused images
id: cleanup
if: steps.backup.outputs.deployment_needed == 'true' && steps.deploy.outcome == 'success' && steps.health.outcome == 'success'
continue-on-error: true
run: |
echo "::group::Cleaning up unused Docker images"
ssh -o "StrictHostKeyChecking no" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} << EOF
echo "🧹 Cleaning up unused Docker images..."
docker image prune -f
echo "✅ Cleanup completed"
EOF
echo "::endgroup::"
- name: Rollback to Previous Version
id: rollback
if: steps.backup.outputs.deployment_needed == 'true' && (steps.deploy.outcome == 'failure' || steps.health.outcome == 'failure')
continue-on-error: true
run: |
echo "::group::Rolling back to previous deployment"
echo "🔄 **INITIATING ROLLBACK**"
echo "Previous SHA: ${{ steps.backup.outputs.previous_sha }}"
echo "Failed SHA: ${{ inputs.target-ref }}"
# Parse inputs outside SSH context
HAS_DOCKGE="${{ inputs.has-dockge }}"
PREVIOUS_SHA="${{ steps.backup.outputs.previous_sha }}"
COMPOSE_ARGS="${{ inputs.args || '' }}"
CRITICAL_SERVICES='${{ inputs.critical-services }}'
# Validate PREVIOUS_SHA before attempting rollback
if [ "$PREVIOUS_SHA" = "unknown" ] || [ -z "$PREVIOUS_SHA" ]; then
echo "❌ Cannot rollback: No previous deployment exists (first deployment)"
echo "::error::Rollback failed - no previous deployment to rollback to"
exit 1
fi
# Validate SHA format (full 40-char SHA)
if ! [[ "$PREVIOUS_SHA" =~ ^[a-fA-F0-9]{40}$ ]]; then
echo "❌ Cannot rollback: Invalid previous SHA format: $PREVIOUS_SHA"
echo "::error::Rollback failed - invalid SHA format"
exit 1
fi
echo "✅ Previous SHA validation passed: $PREVIOUS_SHA"
# Source retry functions
source /tmp/retry.sh
# Use retry mechanism for SSH connection (same as deploy) and capture output
ROLLBACK_RESULT=$(ssh_retry 3 10 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$HAS_DOCKGE\" \"$PREVIOUS_SHA\" \"$COMPOSE_ARGS\" \"$CRITICAL_SERVICES\"" << 'EOF'
set -e
# Get arguments passed to script (excluding sensitive OP_TOKEN)
HAS_DOCKGE="$1"
PREVIOUS_SHA="$2"
COMPOSE_ARGS="$3"
CRITICAL_SERVICES="$4"
# Export the 1Password service account token (injected into this script by the workflow template)
export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Consolidate timeout values for easier maintenance
# These can be overridden by workflow inputs where available
GIT_FETCH_TIMEOUT=${{ inputs.git-fetch-timeout }}
GIT_CHECKOUT_TIMEOUT=${{ inputs.git-checkout-timeout }}
IMAGE_PULL_TIMEOUT=${{ inputs.image-pull-timeout }}
SERVICE_STARTUP_TIMEOUT=${{ inputs.service-startup-timeout }}
VALIDATION_ENV_TIMEOUT=${{ inputs.validation-env-timeout }}
VALIDATION_SYNTAX_TIMEOUT=${{ inputs.validation-syntax-timeout }}
echo "🔄 Rolling back to $PREVIOUS_SHA..."
# Add timeout protection to git operations
if ! timeout $GIT_FETCH_TIMEOUT git -C /opt/compose/ fetch; then
echo "❌ Git fetch timed out after ${GIT_FETCH_TIMEOUT}s"
exit 1
fi
if ! timeout $GIT_CHECKOUT_TIMEOUT git -C /opt/compose/ checkout $PREVIOUS_SHA; then
echo "❌ Git checkout timed out after ${GIT_CHECKOUT_TIMEOUT}s"
exit 1
fi
echo "✅ Repository rolled back to $PREVIOUS_SHA"
# Dynamically discover stacks based on the previous commit's structure
echo "🔍 Discovering stacks in previous commit..."
ROLLBACK_STACKS_ARRAY=()
cd /opt/compose
for dir in */; do
if [[ -d "$dir" && (-f "$dir/compose.yml" || -f "$dir/compose.yaml") ]]; then
STACK_NAME=$(basename "$dir")
ROLLBACK_STACKS_ARRAY+=("$STACK_NAME")
echo " Found stack: $STACK_NAME"
fi
done
if [ ${#ROLLBACK_STACKS_ARRAY[@]} -eq 0 ]; then
echo "⚠️ No stacks found in previous commit - rollback cannot proceed"
exit 1
fi
# Stack names are joined with spaces, matching how stack lists are passed everywhere else in
# this workflow. A NUL delimiter cannot be used here because bash variables and command
# substitution strip NUL bytes; stack names containing whitespace are therefore unsupported.
ROLLBACK_STACKS="${ROLLBACK_STACKS_ARRAY[*]}"
echo "📋 Stacks to rollback: ${ROLLBACK_STACKS_ARRAY[*]}"
# Output discovered stacks for the rollback-health step (space-delimited, single line)
echo "DISCOVERED_ROLLBACK_STACKS=$ROLLBACK_STACKS"
# Deploy Dockge first if needed
if [ "$HAS_DOCKGE" = "true" ]; then
echo "🔄 Rolling back Dockge..."
cd /opt/dockge
# Add timeout protection for Dockge operations
if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
echo "❌ Dockge image pull timed out after ${IMAGE_PULL_TIMEOUT}s"
exit 1
fi
if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
echo "❌ Dockge startup timed out after ${SERVICE_STARTUP_TIMEOUT}s"
exit 1
fi
echo "✅ Dockge rolled back successfully"
fi
# Shared function to deploy or rollback a single stack
# This eliminates code duplication between deploy and rollback operations
process_stack() {
local STACK=$1
local OPERATION=$2 # "deploy" or "rollback"
local LOGFILE="/tmp/${OPERATION}_${STACK}.log"
local EXITCODEFILE="/tmp/${OPERATION}_${STACK}.exitcode"
local exit_code=0
# Run the operation in a subshell so an early exit/failure still lets us record its exit code
(
if [ "$OPERATION" = "deploy" ]; then
echo "🚀 Deploying $STACK..."
else
echo "🔄 Rolling back $STACK..."
fi
cd "/opt/compose/$STACK" || exit 1
echo " Pulling images for $STACK..."
# Add timeout protection for the image pull (default 600s, configurable via image-pull-timeout)
if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
echo "❌ Failed to pull images for $STACK during $OPERATION (timeout or error)"
exit 1
fi
echo " Starting services for $STACK..."
# Add timeout protection for service startup (default 300s, configurable via service-startup-timeout)
if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
echo "❌ Failed to start services for $STACK during $OPERATION (timeout or error)"
exit 1
fi
if [ "$OPERATION" = "deploy" ]; then
echo "✅ $STACK deployed successfully"
else
echo "✅ $STACK rolled back successfully"
fi
} > "$LOGFILE" 2>&1
# Capture and save exit code for robust error detection
local exit_code=$?
echo "$exit_code" > "$EXITCODEFILE"
return $exit_code
}
# Wrapper function for rollback (uses shared process_stack)
rollback_stack() {
process_stack "$1" "rollback"
}
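# Usage (illustrative, hypothetical stack name): rollback_stack "immich" writes
#   /tmp/rollback_immich.log and /tmp/rollback_immich.exitcode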
# Cleanup function for rollback logs
cleanup_rollback_logs() {
# ROLLBACK_STACKS is space-delimited (see stack discovery above)
for STACK in $ROLLBACK_STACKS; do
rm -f "/tmp/rollback_${STACK}.log" 2>/dev/null
done
}
# Pre-rollback validation function
validate_all_rollback_stacks() {
echo "🔍 Pre-rollback validation of all stacks..."
local validation_failed=false
for STACK in $ROLLBACK_STACKS; do
echo " Validating $STACK..."
# Check if stack directory exists
if [ ! -d "/opt/compose/$STACK" ]; then
echo "❌ $STACK: Directory /opt/compose/$STACK not found"
validation_failed=true
continue
fi
cd "/opt/compose/$STACK" || {
echo "❌ $STACK: Cannot access directory"
validation_failed=true
continue
}
# Check if compose.yaml or compose.yml exists and determine which to use
COMPOSE_FILE=""
if [ -f "compose.yaml" ]; then
COMPOSE_FILE="compose.yaml"
elif [ -f "compose.yml" ]; then
COMPOSE_FILE="compose.yml"
else
echo "❌ $STACK: neither compose.yaml nor compose.yml found"
validation_failed=true
continue
fi
# Validate 1Password environment access and Docker Compose config
if ! timeout $VALIDATION_ENV_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f "$COMPOSE_FILE" config --services >/dev/null 2>&1; then
echo "❌ $STACK: Environment validation failed (1Password or compose config error)"
validation_failed=true
continue
fi
# Quick syntax validation
if ! timeout $VALIDATION_SYNTAX_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f "$COMPOSE_FILE" config --quiet 2>/dev/null; then
echo "❌ $STACK: Docker Compose syntax validation failed"
validation_failed=true
continue
fi
echo "✅ $STACK: Pre-rollback validation passed"
done
if [ "$validation_failed" = true ]; then
echo "❌ Pre-rollback validation failed for one or more stacks"
echo " Stopping rollback to prevent extended failures"
return 1
fi
echo "✅ All stacks passed pre-rollback validation"
return 0
}
# Set trap for cleanup on exit
trap cleanup_rollback_logs EXIT
# Run pre-rollback validation
if ! validate_all_rollback_stacks; then
echo "ROLLBACK_STATUS=failed_validation" >> "$GITHUB_OUTPUT"
exit 1
fi
# Start all rollback deployments in parallel
echo "🔄 Starting parallel rollback of all stacks..."
ROLLBACK_PIDS=""
# Map each PID to its stack name for improved error reporting
# Note: associative arrays require Bash 4.0+ on the remote host (this script runs over SSH, not on the runner)
declare -A ROLLBACK_PID_TO_STACK
for STACK in $ROLLBACK_STACKS; do
echo "🔄 Rolling back $STACK..."
rollback_stack "$STACK" &
PID=$!
ROLLBACK_PIDS="$ROLLBACK_PIDS $PID"
ROLLBACK_PID_TO_STACK[$PID]=$STACK
echo "Started rollback of $STACK (PID: $PID)"
done
# Wait for all rollback deployments and collect results
echo "⏳ Waiting for all rollbacks to complete..."
FAILED_ROLLBACKS=""
ROLLBACK_ERRORS=""
# Enhanced parallel job monitoring with proper error propagation
echo "⏳ Monitoring parallel rollback operations..."
# Wait for jobs individually to capture exit codes and report stack names
for PID in $ROLLBACK_PIDS; do
STACK_NAME="${ROLLBACK_PID_TO_STACK[$PID]}"
if wait "$PID"; then
echo "✅ Rollback process $PID for stack $STACK_NAME completed successfully"
else
EXIT_CODE=$?
# Check if process was terminated by signal (exit code > 128)
if [ "$EXIT_CODE" -gt 128 ]; then
SIGNAL_NUM=$((EXIT_CODE - 128))
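# e.g. a process killed by SIGKILL exits with 137 (128 + 9); SIGTERM yields 143 (128 + 15)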
# Try to get signal name (works on most systems)
if command -v kill >/dev/null 2>&1; then
SIGNAL_NAME=$(kill -l $SIGNAL_NUM 2>/dev/null || echo "SIG$SIGNAL_NUM")
else
SIGNAL_NAME="SIG$SIGNAL_NUM"
fi
echo "❌ Rollback process $PID for stack $STACK_NAME was terminated by signal $SIGNAL_NUM ($SIGNAL_NAME)"
ROLLBACK_ERRORS="$ROLLBACK_ERRORS STACK:$STACK_NAME:PID:$PID:TERMINATED_BY_SIGNAL:$SIGNAL_NUM:$SIGNAL_NAME"
else
echo "❌ Rollback process $PID for stack $STACK_NAME failed with exit code $EXIT_CODE"
ROLLBACK_ERRORS="$ROLLBACK_ERRORS STACK:$STACK_NAME:PID:$PID:EXIT_CODE:$EXIT_CODE"
fi
fi
done
# Enhanced result analysis using exit code files (more robust than log parsing)
ROLLED_BACK_STACKS=""
SUCCESSFUL_ROLLBACKS=""
for STACK in $ROLLBACK_STACKS; do
if [ -f "/tmp/rollback_${STACK}.log" ]; then
ROLLED_BACK_STACKS="$ROLLED_BACK_STACKS $STACK"
# Primary: Check exit code file for robust error detection
if [ -f "/tmp/rollback_${STACK}.exitcode" ]; then
EXIT_CODE=$(cat "/tmp/rollback_${STACK}.exitcode")
if [ "$EXIT_CODE" -eq 0 ]; then
SUCCESSFUL_ROLLBACKS="$SUCCESSFUL_ROLLBACKS $STACK"
else
FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK"
echo "🔍 $STACK Rollback Error: Non-zero exit code ($EXIT_CODE)"
fi
else
# Fallback: Log-based error detection if exit code file is missing
echo "⚠️ $STACK: Exit code file missing - using less reliable log-based detection"
if grep -q "❌.*$STACK\|CRITICAL.*$STACK\|Failed.*$STACK\|Error.*$STACK" "/tmp/rollback_${STACK}.log"; then
FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK"
# Extract specific error for reporting
STACK_ERROR=$(grep -E "❌.*$STACK|CRITICAL.*$STACK|Failed.*$STACK|Error.*$STACK" "/tmp/rollback_${STACK}.log" | head -1)
echo "🔍 $STACK Rollback Error: $STACK_ERROR"
elif grep -q "✅.*$STACK\|Successfully.*$STACK" "/tmp/rollback_${STACK}.log"; then
SUCCESSFUL_ROLLBACKS="$SUCCESSFUL_ROLLBACKS $STACK"
else
echo "⚠️ $STACK: No clear success/failure indicator in logs - treating as potential failure"
FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK"
fi
fi
else
echo "⚠️ $STACK: No rollback log found - possible early failure"
FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK"
fi
done
# Summary of rollback results
echo ""
echo "📊 Rollback Summary:"
echo " Successful: $(echo $SUCCESSFUL_ROLLBACKS | wc -w | tr -d ' ') stacks"
echo " Failed: $(echo $FAILED_ROLLBACKS | wc -w | tr -d ' ') stacks"
if [ -n "$ROLLBACK_ERRORS" ]; then
echo " Process errors: $ROLLBACK_ERRORS"
fi
# Parse critical services list
# Note: CRITICAL_SERVICES contains stack names (not individual Docker service names)
# This matches stacks that are considered critical for the deployment
# Example: ["portainer", "dockge"] identifies these stacks as critical
CRITICAL_SERVICES_ARRAY=()
CRITICAL_FAILURE=false
if [ -n "$CRITICAL_SERVICES" ] && [ "$CRITICAL_SERVICES" != "[]" ]; then
# Convert JSON array to bash array using jq for robust parsing and preserve spaces/special characters
readarray -t CRITICAL_SERVICES_ARRAY < <(echo "$CRITICAL_SERVICES" | jq -r '.[]')
echo "🚨 Critical stacks configured: ${CRITICAL_SERVICES_ARRAY[*]}"
# Check if any failed rollback stack is critical
for FAILED_STACK in $FAILED_ROLLBACKS; do
for CRITICAL_STACK in "${CRITICAL_SERVICES_ARRAY[@]}"; do
if [ "$FAILED_STACK" = "$CRITICAL_STACK" ]; then
echo "🚨 CRITICAL STACK ROLLBACK FAILED: $FAILED_STACK"
echo " This is a critical stack - system may be in unsafe state"
CRITICAL_FAILURE=true
fi
done
done
fi
# Display all rollback logs
echo ""
echo "📋 Rollback Results:"
echo "════════════════════════════════════════════════════════════════"
for STACK in $ROLLBACK_STACKS; do
if [ -f "/tmp/rollback_${STACK}.log" ]; then
echo ""
echo "🔸 ROLLBACK STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
cat "/tmp/rollback_${STACK}.log"
echo "────────────────────────────────────────────────────────────────"
else
echo ""
echo "🔸 ROLLBACK STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
echo "⚠️ No rollback log found for $STACK"
echo "────────────────────────────────────────────────────────────────"
fi
done
echo "════════════════════════════════════════════════════════════════"
# Check if any rollbacks failed
if [ -z "$ROLLBACK_STACKS" ]; then
echo "💥 No stacks to rollback - ROLLBACK_STACKS variable is empty!"
exit 1
elif [ -z "$ROLLED_BACK_STACKS" ]; then
echo "💥 No stacks were actually rolled back - check stack discovery!"
exit 1
elif [ "$CRITICAL_FAILURE" = true ]; then
echo ""
echo "💥 CRITICAL SERVICE ROLLBACK FAILURE"
echo " One or more critical services failed to rollback"
echo " System may be in an unsafe state - manual intervention required"
echo " Failed critical services:$FAILED_ROLLBACKS"
exit 1
elif [ -n "$FAILED_ROLLBACKS" ]; then
echo "💥 Rollbacks failed for:$FAILED_ROLLBACKS"
exit 1
fi
echo "🎉 All stacks rolled back successfully!"
EOF
)
# Extract rollback result and discovered stacks
echo "$ROLLBACK_RESULT"
# Parse discovered stacks output for rollback-health step (simplified - no markers needed)
if echo "$ROLLBACK_RESULT" | grep -q "DISCOVERED_ROLLBACK_STACKS="; then
DISCOVERED_STACKS=$(echo "$ROLLBACK_RESULT" | grep "DISCOVERED_ROLLBACK_STACKS=" | cut -d'=' -f2-)
echo "discovered_rollback_stacks=$DISCOVERED_STACKS" >> "$GITHUB_OUTPUT"
echo "✅ Captured discovered rollback stacks: $DISCOVERED_STACKS"
else
echo "⚠️ Could not parse discovered stacks, will use input stacks as fallback"
echo "discovered_rollback_stacks=${{ join(fromJSON(inputs.stacks), ' ') }}" >> "$GITHUB_OUTPUT"
fi
echo "::endgroup::"
# Health check runs after rollback attempt regardless of rollback success/failure
# This is intentional: we need to know the final system state even if rollback fails
# Using 'conclusion != skipped' instead of 'outcome == success' ensures we get
# visibility into what services are running, which is critical for incident response
- name: Verify Rollback Health
id: rollback-health
if: steps.backup.outputs.deployment_needed == 'true' && (steps.deploy.outcome == 'failure' || steps.health.outcome == 'failure') && steps.rollback.conclusion != 'skipped'
continue-on-error: true
run: |
echo "🔍 Verifying rollback health status"
# Source retry functions
source /tmp/retry.sh
# Use discovered rollback stacks instead of input stacks
# This ensures we check the stacks that were actually rolled back (from previous commit)
DISCOVERED_STACKS="${{ steps.rollback.outputs.discovered_rollback_stacks }}"
if [ -n "$DISCOVERED_STACKS" ]; then
# Discovered stacks are already space-delimited; they are passed as individual SSH arguments below
STACKS="$DISCOVERED_STACKS"
echo "✅ Using discovered rollback stacks: $STACKS"
else
# Fallback to input stacks if discovery failed
STACKS="${{ join(fromJSON(inputs.stacks), ' ') }}"
echo "⚠️ Using input stacks as fallback: $STACKS"
fi
HAS_DOCKGE="${{ inputs.has-dockge }}"
# Execute rollback health check
ROLLBACK_HEALTH_RESULT=$(ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} /bin/bash -s $STACKS \"$HAS_DOCKGE\"" << 'EOF'
set -e
# Get arguments passed to script
TOTAL_ARGS=$#
# Find HAS_DOCKGE by looking for 'true' or 'false' in the args
HAS_DOCKGE=""
for i in $(seq 1 $TOTAL_ARGS); do
ARG="${!i}"
if [ "$ARG" = "true" ] || [ "$ARG" = "false" ]; then
HAS_DOCKGE="$ARG"
# All args before this position are stack names
STACKS="${@:1:$((i-1))}"
break
fi
done
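# Illustrative (hypothetical names): args "immich traefik false" → STACKS="immich traefik", HAS_DOCKGE="false"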
# Set OP_TOKEN via environment
export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Set configurable timeout for health check commands (default: 15 seconds)
HEALTH_CHECK_CMD_TIMEOUT=${{ inputs.health-check-command-timeout }}
if [ -z "$HEALTH_CHECK_CMD_TIMEOUT" ]; then
echo "ℹ️ HEALTH_CHECK_CMD_TIMEOUT not provided, using default 15 seconds"
HEALTH_CHECK_CMD_TIMEOUT=15
fi
# Validate that HEALTH_CHECK_CMD_TIMEOUT is an integer
if ! [[ "$HEALTH_CHECK_CMD_TIMEOUT" =~ ^[0-9]+$ ]]; then
echo "⚠️ HEALTH_CHECK_CMD_TIMEOUT ($HEALTH_CHECK_CMD_TIMEOUT) is not an integer, using default 15"
HEALTH_CHECK_CMD_TIMEOUT=15
fi
# Enforce minimum and maximum limits for HEALTH_CHECK_CMD_TIMEOUT
HEALTH_CHECK_CMD_TIMEOUT_MIN=5
HEALTH_CHECK_CMD_TIMEOUT_MAX=60
if [ "$HEALTH_CHECK_CMD_TIMEOUT" -lt "$HEALTH_CHECK_CMD_TIMEOUT_MIN" ]; then
echo "⚠️ HEALTH_CHECK_CMD_TIMEOUT ($HEALTH_CHECK_CMD_TIMEOUT) below minimum, using $HEALTH_CHECK_CMD_TIMEOUT_MIN"
HEALTH_CHECK_CMD_TIMEOUT=$HEALTH_CHECK_CMD_TIMEOUT_MIN
fi
if [ "$HEALTH_CHECK_CMD_TIMEOUT" -gt "$HEALTH_CHECK_CMD_TIMEOUT_MAX" ]; then
echo "⚠️ HEALTH_CHECK_CMD_TIMEOUT ($HEALTH_CHECK_CMD_TIMEOUT) above maximum, using $HEALTH_CHECK_CMD_TIMEOUT_MAX"
HEALTH_CHECK_CMD_TIMEOUT=$HEALTH_CHECK_CMD_TIMEOUT_MAX
fi
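# e.g. an input of 3 is raised to 5, 120 is capped at 60, and a non-numeric value falls back to 15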
echo "🔍 Verifying rollback health for all services..."
ROLLBACK_HEALTHY_STACKS=""
ROLLBACK_DEGRADED_STACKS=""
ROLLBACK_FAILED_STACKS=""
ROLLBACK_TOTAL_CONTAINERS=0
ROLLBACK_RUNNING_CONTAINERS=0
# Check Dockge health if applicable
if [ "$HAS_DOCKGE" = "true" ]; then
echo "🔍 Verifying Dockge rollback health..."
cd /opt/dockge
DOCKGE_RUNNING=$(op run --env-file=/opt/compose/compose.env -- docker compose ps --services --filter "status=running" | wc -l | tr -d " ")
DOCKGE_TOTAL=$(op run --env-file=/opt/compose/compose.env -- docker compose ps --services | wc -l | tr -d " ")
ROLLBACK_TOTAL_CONTAINERS=$((ROLLBACK_TOTAL_CONTAINERS + DOCKGE_TOTAL))
ROLLBACK_RUNNING_CONTAINERS=$((ROLLBACK_RUNNING_CONTAINERS + DOCKGE_RUNNING))
if [ "$DOCKGE_RUNNING" -eq 0 ]; then
echo "❌ Dockge rollback: 0/$DOCKGE_TOTAL services running"
ROLLBACK_FAILED_STACKS="$ROLLBACK_FAILED_STACKS dockge"
elif [ "$DOCKGE_RUNNING" -lt "$DOCKGE_TOTAL" ]; then
echo "⚠️ Dockge rollback: $DOCKGE_RUNNING/$DOCKGE_TOTAL services running (degraded)"
ROLLBACK_DEGRADED_STACKS="$ROLLBACK_DEGRADED_STACKS dockge"
else
echo "✅ Dockge rollback: All $DOCKGE_RUNNING services healthy"
ROLLBACK_HEALTHY_STACKS="$ROLLBACK_HEALTHY_STACKS dockge"
fi
fi
# Simple health check for each rolled back stack
for STACK in $STACKS; do
echo ""
echo "🔍 Verifying rollback health for stack: $STACK"
cd "/opt/compose/$STACK" || {
echo "❌ $STACK: Directory not accessible after rollback"
ROLLBACK_FAILED_STACKS="$ROLLBACK_FAILED_STACKS $STACK"
continue
}
# Get basic health status (no -f flag so docker compose auto-detects compose.yaml or compose.yml)
RUNNING_COUNT=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose ps --services --filter "status=running" 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0")
TOTAL_COUNT=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose config --services 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0")
ROLLBACK_TOTAL_CONTAINERS=$((ROLLBACK_TOTAL_CONTAINERS + TOTAL_COUNT))
ROLLBACK_RUNNING_CONTAINERS=$((ROLLBACK_RUNNING_CONTAINERS + RUNNING_COUNT))
if [ "$RUNNING_COUNT" -eq 0 ]; then
echo "❌ $STACK rollback: 0/$TOTAL_COUNT services running"
ROLLBACK_FAILED_STACKS="$ROLLBACK_FAILED_STACKS $STACK"
elif [ "$RUNNING_COUNT" -lt "$TOTAL_COUNT" ]; then
echo "⚠️ $STACK rollback: $RUNNING_COUNT/$TOTAL_COUNT services running (degraded)"
ROLLBACK_DEGRADED_STACKS="$ROLLBACK_DEGRADED_STACKS $STACK"
else
echo "✅ $STACK rollback: All $RUNNING_COUNT services healthy"
ROLLBACK_HEALTHY_STACKS="$ROLLBACK_HEALTHY_STACKS $STACK"
fi
done
# Calculate success rate
if [ "$ROLLBACK_TOTAL_CONTAINERS" -gt 0 ]; then
ROLLBACK_SUCCESS_RATE=$(( ROLLBACK_RUNNING_CONTAINERS * 100 / ROLLBACK_TOTAL_CONTAINERS ))
else
ROLLBACK_SUCCESS_RATE=0
fi
echo ""
echo "📊 Rollback Health Verification Summary:"
echo "════════════════════════════════════════"
echo "Total Services: $ROLLBACK_TOTAL_CONTAINERS"
echo "Running Services: $ROLLBACK_RUNNING_CONTAINERS"
echo "Success Rate: ${ROLLBACK_SUCCESS_RATE}%"
echo ""
[ -n "$ROLLBACK_HEALTHY_STACKS" ] && echo "✅ Healthy After Rollback: $(echo $ROLLBACK_HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
[ -n "$ROLLBACK_DEGRADED_STACKS" ] && echo "⚠️ Degraded After Rollback: $(echo $ROLLBACK_DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
[ -n "$ROLLBACK_FAILED_STACKS" ] && echo "❌ Failed After Rollback: $(echo $ROLLBACK_FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
# Output structured results (simplified - no markers needed)
echo "ROLLBACK_HEALTH_HEALTHY=$(echo $ROLLBACK_HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
echo "ROLLBACK_HEALTH_DEGRADED=$(echo $ROLLBACK_DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
echo "ROLLBACK_HEALTH_FAILED=$(echo $ROLLBACK_FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
echo "ROLLBACK_HEALTH_TOTAL_CONTAINERS=$ROLLBACK_TOTAL_CONTAINERS"
echo "ROLLBACK_HEALTH_RUNNING_CONTAINERS=$ROLLBACK_RUNNING_CONTAINERS"
echo "ROLLBACK_HEALTH_SUCCESS_RATE=$ROLLBACK_SUCCESS_RATE"
# Determine rollback verification status
if [ -n "$ROLLBACK_FAILED_STACKS" ]; then
echo ""
echo "⚠️ Rollback completed but some services are still unhealthy"
echo "Manual intervention may be required"
exit 0 # Don't fail the workflow, rollback itself was successful
else
echo ""
echo "🎉 Rollback verified - all services are healthy or degraded but stable"
exit 0
fi
EOF
)
# Extract rollback health outputs
echo "$ROLLBACK_HEALTH_RESULT"
# Parse rollback health outputs (simplified - direct variable extraction)
if echo "$ROLLBACK_HEALTH_RESULT" | grep -q "ROLLBACK_HEALTH_"; then
# Extract each output variable directly
HEALTHY=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_HEALTHY=" | cut -d'=' -f2-)
DEGRADED=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_DEGRADED=" | cut -d'=' -f2-)
FAILED=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_FAILED=" | cut -d'=' -f2-)
TOTAL=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_TOTAL_CONTAINERS=" | cut -d'=' -f2-)
RUNNING=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_RUNNING_CONTAINERS=" | cut -d'=' -f2-)
RATE=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_SUCCESS_RATE=" | cut -d'=' -f2-)
{
echo "rollback_healthy_stacks=${HEALTHY:-}"
echo "rollback_degraded_stacks=${DEGRADED:-}"
echo "rollback_failed_stacks=${FAILED:-}"
echo "rollback_total_containers=${TOTAL:-0}"
echo "rollback_running_containers=${RUNNING:-0}"
echo "rollback_success_rate=${RATE:-0}"
} >> "$GITHUB_OUTPUT"
else
# Fallback outputs if parsing fails
{
echo "rollback_healthy_stacks="
echo "rollback_degraded_stacks="
echo "rollback_failed_stacks="
echo "rollback_total_containers=0"
echo "rollback_running_containers=0"
echo "rollback_success_rate=0"
} >> "$GITHUB_OUTPUT"
fi
- name: Cleanup SSH connections
if: always()
run: |
# Close SSH connection multiplexing
echo "🧹 Cleaning up SSH connections..."
ssh -o "StrictHostKeyChecking no" deployment-server -O exit 2>/dev/null || true
# Clean up SSH control sockets
rm -f ~/.ssh/sockets/* 2>/dev/null || true
echo "✅ SSH cleanup completed"
- name: Report Deployment Status
if: always()
run: |
echo "::group::Deployment Summary"
# Parse stacks from JSON input and create display list
STACK_LIST="${{ join(fromJson(inputs.stacks), ', ') }}"
if [ "${{ inputs.has-dockge }}" = "true" ]; then
STACK_LIST="dockge, $STACK_LIST"
fi
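# Illustrative (hypothetical stacks): inputs.stacks='["immich","traefik"]' with has-dockge=true → "dockge, immich, traefik"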
if [ "${{ steps.backup.outputs.deployment_needed }}" != "true" ]; then
echo "ℹ️ **NO DEPLOYMENT NEEDED**"
echo "✅ Repository already at target commit"
echo "📋 Target stacks: $STACK_LIST"
echo "🔄 SHA: ${{ inputs.target-ref }}"
elif [ "${{ inputs.force-deploy }}" = "true" ] && [ "${{ steps.deploy.outcome }}" == "success" ] && [ "${{ steps.health.outcome }}" == "success" ]; then
echo "🔄 **FORCE DEPLOYMENT SUCCESSFUL**"
echo "✅ All stacks force-deployed and healthy"
echo "📋 Deployed stacks: $STACK_LIST"
echo "🔄 SHA: ${{ inputs.target-ref }}"
if [ "${{ steps.cleanup.outcome }}" == "success" ]; then
echo "🧹 Cleanup completed successfully"
fi
elif [ "${{ steps.deploy.outcome }}" == "success" ] && [ "${{ steps.health.outcome }}" == "success" ]; then
echo "🎉 **DEPLOYMENT SUCCESSFUL**"
echo "✅ All stacks deployed and healthy"
echo "📋 Deployed stacks: $STACK_LIST"
echo "🔄 SHA: ${{ inputs.target-ref }}"
if [ "${{ steps.cleanup.outcome }}" == "success" ]; then
echo "🧹 Cleanup completed successfully"
fi
else
echo "💥 **DEPLOYMENT FAILED**"
echo "❌ Deploy status: ${{ steps.deploy.outcome }}"
echo "❌ Health check status: ${{ steps.health.outcome }}"
if [ "${{ steps.rollback.outcome }}" == "success" ]; then
echo "🔄 Rollback completed successfully"
if [ "${{ steps.rollback-health.outcome }}" == "success" ]; then
echo "✅ Rollback verification passed"
elif [ "${{ steps.rollback-health.outcome }}" == "failure" ]; then
echo "⚠️ Rollback verification failed - manual intervention may be needed"
fi
else
echo "❌ Rollback status: ${{ steps.rollback.outcome }}"
fi
exit 1
fi
echo "::endgroup::"
notify:
name: Discord Notification
runs-on: ubuntu-24.04
needs: [deploy]
if: always()
steps:
- name: Configure 1Password Service Account
uses: 1password/load-secrets-action/configure@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
service-account-token: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
- name: Get commit message
id: commit-msg
run: |
COMMIT_MSG=$(curl -s -H "Authorization: token ${{ github.token }}" \
"https://api.github.com/repos/${{ github.repository }}/commits/${{ inputs.target-ref }}" \
| jq -r '.commit.message // "No commit message available"' \
| head -1)
SHORT_SHA="${{ inputs.target-ref }}"
SHORT_SHA="${SHORT_SHA:0:7}"
echo "message=$COMMIT_MSG" >> "$GITHUB_OUTPUT"
echo "short-sha=$SHORT_SHA" >> "$GITHUB_OUTPUT"
- name: Load Discord webhook and user ID
id: op-load-discord
uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
unset-previous: true
env:
DISCORD_WEBHOOK: ${{ inputs.webhook-url }}
DISCORD_USER_ID: ${{ inputs.discord-user-id != '' && inputs.discord-user-id || 'SKIP' }}
- name: Send Discord notification
uses: sarisia/actions-status-discord@b8381b25576cb341b2af39926ab42c5056cc44ed # v1.15.5
with:
webhook: ${{ steps.op-load-discord.outputs.DISCORD_WEBHOOK }}
status: ${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && 'success' || needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && 'success' || 'failure' }}
title: "🚀 ${{ inputs.repo-name }} • ${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && 'No Changes' || needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && 'Deployed' || needs.deploy.outputs.rollback_status == 'success' && 'Rolled Back' || 'Failed' }}"
description: |
${{ (needs.deploy.outputs.deploy_status == 'failure' || needs.deploy.outputs.health_status == 'failure' || needs.deploy.result == 'failure') && inputs.discord-user-id != '' && steps.op-load-discord.outputs.DISCORD_USER_ID != 'SKIP' && format('<@{0}> ', steps.op-load-discord.outputs.DISCORD_USER_ID) || '' }}${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && '📋 **Repository already at target commit**' ||
inputs.force-deploy == true && needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && '🔄 **Force deployment completed successfully**' ||
needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && '✅ **Deployment completed successfully**' ||
needs.deploy.outputs.rollback_status == 'success' && '🔄 **Deployment failed but rolled back successfully**' || '❌ **Deployment failed**' }}
${{ needs.deploy.outputs.deployment_needed == 'true' && needs.deploy.outputs.rollback_status != 'success' &&
format('**📊 Health Status**
🟢 Running: {0}/{1} services ({2}%)', needs.deploy.outputs.running_containers || '0',
needs.deploy.outputs.total_containers || '0', needs.deploy.outputs.success_rate || '0') ||
needs.deploy.outputs.rollback_status == 'success' && format('**📊 Rollback Health**
🟢 Running: {0}/{1} services ({2}%)',
needs.deploy.outputs.rollback_running_containers || '0', needs.deploy.outputs.rollback_total_containers || '0',
needs.deploy.outputs.rollback_success_rate || '0') || '' }}
${{ needs.deploy.outputs.rollback_status != 'success' && (needs.deploy.outputs.healthy_stacks != '' || needs.deploy.outputs.degraded_stacks != '' || needs.deploy.outputs.failed_stacks != '') &&
format('**🏷️ Stack Status**
{0}{1}{2}',
needs.deploy.outputs.healthy_stacks != '' && format('✅ {0}
', needs.deploy.outputs.healthy_stacks) || '',
needs.deploy.outputs.degraded_stacks != '' && format('⚠️ {0}
', needs.deploy.outputs.degraded_stacks) || '',
needs.deploy.outputs.failed_stacks != '' && format('❌ {0}', needs.deploy.outputs.failed_stacks) || '') || '' }}
${{ needs.deploy.outputs.rollback_status == 'success' && (needs.deploy.outputs.rollback_healthy_stacks != '' || needs.deploy.outputs.rollback_degraded_stacks != '' || needs.deploy.outputs.rollback_failed_stacks != '') &&
format('**🏷️ Rollback Stack Status**
{0}{1}{2}',
needs.deploy.outputs.rollback_healthy_stacks != '' && format('✅ {0}
', needs.deploy.outputs.rollback_healthy_stacks) || '',
needs.deploy.outputs.rollback_degraded_stacks != '' && format('⚠️ {0}
', needs.deploy.outputs.rollback_degraded_stacks) || '',
needs.deploy.outputs.rollback_failed_stacks != '' && format('❌ {0}', needs.deploy.outputs.rollback_failed_stacks) || '') || '' }}
${{ needs.deploy.outputs.deployment_needed == 'true' && format('**🔄 Pipeline Status**
{0} Deploy → {1} Health → {2} Cleanup{3}',
needs.deploy.outputs.deploy_status == 'success' && '✅' || '❌',
needs.deploy.outputs.health_status == 'success' && '✅' || needs.deploy.outputs.health_status == 'skipped' && '⏭️' || '❌',
needs.deploy.outputs.cleanup_status == 'success' && '✅' || needs.deploy.outputs.cleanup_status == 'skipped' && '⏭️' || '❌',
needs.deploy.outputs.rollback_status != 'skipped' && format(' → {0} Rollback{1}',
needs.deploy.outputs.rollback_status == 'success' && '✅' || '❌',
needs.deploy.outputs.rollback_health_status == 'success' && ' → ✅ Verify' ||
needs.deploy.outputs.rollback_health_status == 'failure' && ' → ❌ Verify' || '') || '') || '' }}
${{ github.event_name == 'workflow_dispatch' && '🔧 **Triggered manually**' || format('📝 **Commit:** [`{0}`](https://github.com/{1}/commit/{2}) {3}', steps.commit-msg.outputs.short-sha, github.repository, inputs.target-ref, steps.commit-msg.outputs.message) }}
**⏱️ Duration:** ${{ github.event_name != 'workflow_dispatch' && '3min' || 'Manual' }}
color: ${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && 0x6c757d ||
needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && 0x28a745 ||
needs.deploy.outputs.rollback_status == 'success' && 0xffc107 ||
needs.deploy.outputs.degraded_stacks != '' && 0xfd7e14 || 0xdc3545 }}
username: "Compose Deploy"
avatar_url: "https://cdn-icons-png.flaticon.com/512/919/919853.png"
- name: Unload Discord webhook
uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
unset-previous: true