feat(deploy): add deployment script library foundation #5
name: Deploy Docker Compose

# Future Refactoring Recommendations:
# 1. Extract bash logic into separate composite actions or standalone scripts
#    - Retry mechanisms (retry.sh, ssh_retry) could become a reusable composite action (see the sketch below)
#    - The process_stack function could be extracted to a standalone script
#    - Validation functions could be moved to a validation composite action
# 2. Consider splitting deploy and rollback into separate reusable workflows
#    - Would improve readability and make each workflow easier to maintain
#    - Could share common functionality via composite actions
# 3. Abstract common patterns into reusable functions
#    - Exit code and log handling patterns appear multiple times
#    - Could create helper functions for result parsing and analysis
# Note: The current implementation prioritizes single-file simplicity for easier debugging.
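#
# Illustrative sketch of recommendation 1 (hypothetical file, names, and inputs;
# untested and not part of this workflow):
#
#   # .github/actions/retry/action.yml
#   name: Retry command
#   inputs:
#     command:
#       required: true
#     max-attempts:
#       default: "3"
#     delay:
#       default: "5"
#   runs:
#     using: composite
#     steps:
#       - shell: bash
#         run: |
#           source "${{ github.action_path }}/retry.sh"
#           retry "${{ inputs.max-attempts }}" "${{ inputs.delay }}" "${{ inputs.command }}"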
on:
  workflow_call:
    inputs:
      args:
        description: "docker compose up -d arguments"
        required: false
        type: string
      stacks:
        description: "JSON array of stack names to deploy"
        required: true
        type: string
      webhook-url:
        description: "1Password reference to Discord webhook URL"
        required: true
        type: string
      repo-name:
        description: "Repository display name for notifications"
        required: true
        type: string
      target-ref:
        description: "Git reference to checkout on remote server"
        required: true
        type: string
      has-dockge:
        description: "Whether this deployment includes Dockge"
        required: false
        type: boolean
        default: false
      force-deploy:
        description: "Force deployment even if repository is already at target commit"
        required: false
        type: boolean
        default: false
      health-check-timeout:
        description: "Health check timeout in seconds (default: 180)"
        required: false
        type: number
        default: 180
      health-check-command-timeout:
        description: "Individual health check command timeout in seconds (default: 15)"
        required: false
        type: number
        default: 15
      critical-services:
        description: "JSON array of critical service names that should trigger early exit on failure"
        required: false
        type: string
        default: '[]'
      git-fetch-timeout:
        description: "Git fetch operation timeout in seconds (default: 300)"
        required: false
        type: number
        default: 300
      git-checkout-timeout:
        description: "Git checkout operation timeout in seconds (default: 60)"
        required: false
        type: number
        default: 60
      image-pull-timeout:
        description: "Docker image pull timeout in seconds (default: 600)"
        required: false
        type: number
        default: 600
      service-startup-timeout:
        description: "Service startup timeout in seconds (default: 300)"
        required: false
        type: number
        default: 300
      validation-env-timeout:
        description: "Environment validation timeout in seconds (default: 30)"
        required: false
        type: number
        default: 30
      validation-syntax-timeout:
        description: "Syntax validation timeout in seconds (default: 60)"
        required: false
        type: number
        default: 60
      discord-user-id:
        description: "Discord user ID to mention in failure notifications (e.g., '<@123456789>')"
        required: false
        type: string
        default: ''
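
# Call sketch (hypothetical caller; owner/repo, workflow filename, stack names,
# and vault path are assumptions, not defined by this workflow):
#
#   jobs:
#     deploy:
#       uses: <owner>/<repo>/.github/workflows/deploy-docker-compose.yaml@<sha>
#       with:
#         stacks: '["dockge", "monitoring"]'
#         repo-name: compose
#         target-ref: ${{ github.sha }}
#         webhook-url: op://Docker/discord/webhook-url
#       secrets: inherit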
jobs:
  deploy:
    runs-on: ubuntu-24.04
    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
    timeout-minutes: 40 # Overall job timeout
    outputs:
      previous_sha: ${{ steps.backup.outputs.previous_sha }}
      deployment_needed: ${{ steps.backup.outputs.deployment_needed }}
      deleted_files: ${{ steps.changed-files.outputs.deleted_files }}
      deploy_status: ${{ steps.deploy.outcome }}
      health_status: ${{ steps.health.outcome }}
      cleanup_status: ${{ steps.cleanup.outcome }}
      rollback_status: ${{ steps.rollback.outcome }}
      rollback_health_status: ${{ steps.rollback-health.outcome }}
      discovered_rollback_stacks: ${{ steps.rollback.outputs.discovered_rollback_stacks }}
      healthy_stacks: ${{ steps.health.outputs.healthy_stacks }}
      degraded_stacks: ${{ steps.health.outputs.degraded_stacks }}
      failed_stacks: ${{ steps.health.outputs.failed_stacks }}
      total_containers: ${{ steps.health.outputs.total_containers }}
      running_containers: ${{ steps.health.outputs.running_containers }}
      success_rate: ${{ steps.health.outputs.success_rate }}
      rollback_healthy_stacks: ${{ steps.rollback-health.outputs.rollback_healthy_stacks }}
      rollback_degraded_stacks: ${{ steps.rollback-health.outputs.rollback_degraded_stacks }}
      rollback_failed_stacks: ${{ steps.rollback-health.outputs.rollback_failed_stacks }}
      rollback_total_containers: ${{ steps.rollback-health.outputs.rollback_total_containers }}
      rollback_running_containers: ${{ steps.rollback-health.outputs.rollback_running_containers }}
      rollback_success_rate: ${{ steps.rollback-health.outputs.rollback_success_rate }}
      removed_stacks: ${{ steps.cleanup-removed.outputs.removed_stacks }}
      has_removed_stacks: ${{ steps.cleanup-removed.outputs.has_removed_stacks }}
    steps:
      - name: Validate and sanitize inputs
        run: |
          # Validate stacks parameter is valid JSON
          echo '${{ inputs.stacks }}' | jq -r '.[]' >/dev/null || {
            echo "::error::Invalid stacks JSON format: ${{ inputs.stacks }}"
            exit 1
          }
          # Validate stack names contain only safe characters
          echo '${{ inputs.stacks }}' | jq -r '.[]' | while read -r stack; do
            if [[ ! "$stack" =~ ^[a-zA-Z0-9_-]+$ ]]; then
              echo "::error::Invalid stack name: $stack. Only alphanumeric, underscore, and hyphen allowed."
              exit 1
            fi
            # Check stack name length
            if [ ${#stack} -gt 50 ]; then
              echo "::error::Stack name too long: $stack (max 50 characters)"
              exit 1
            fi
          done
          # Validate target-ref format
          TARGET_REF="${{ inputs.target-ref }}"
          # Check if it's a valid commit SHA (7-40 hex chars) or branch/tag name
          if [[ "$TARGET_REF" =~ ^[a-fA-F0-9]{7,40}$ ]] || [[ "$TARGET_REF" =~ ^[a-zA-Z0-9_-]+$ ]] || [[ "$TARGET_REF" =~ ^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$ ]]; then
            echo "✅ Target-ref format valid: $TARGET_REF"
          else
            echo "::error::Invalid target-ref format: $TARGET_REF"
            echo "::error::Expected: commit SHA (7-40 hex chars) or branch/tag name"
            exit 1
          fi
          # Validate and sanitize compose args
          COMPOSE_ARGS="${{ inputs.args }}"
          if [[ -n "$COMPOSE_ARGS" ]]; then
            # Check for dangerous characters and patterns
            if [[ "$COMPOSE_ARGS" =~ [\;\&\|\`\$\\] ]]; then
              echo "::error::Compose args contain potentially dangerous characters: $COMPOSE_ARGS"
              echo "::error::Prohibited characters: ; & | \` $ \\"
              exit 1
            fi
            # Check for suspicious patterns
            if [[ "$COMPOSE_ARGS" =~ (rm|kill|shutdown|reboot|format|dd|\>|\<|sudo|su) ]]; then
              echo "::error::Compose args contain prohibited commands: $COMPOSE_ARGS"
              exit 1
            fi
            # Validate against known docker compose options - allow hyphens, spaces, and equals for arguments
            if [[ "$COMPOSE_ARGS" =~ ^[a-zA-Z0-9[:space:]_=.-]+$ ]]; then
              echo "✅ Compose args format valid: $COMPOSE_ARGS"
            else
              echo "::error::Compose args contain invalid characters: $COMPOSE_ARGS"
              exit 1
            fi
          fi
          # Validate webhook URL format
          WEBHOOK_URL="${{ inputs.webhook-url }}"
          if [[ ! "$WEBHOOK_URL" =~ ^op://[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$ ]]; then
            echo "::error::Invalid webhook URL format: $WEBHOOK_URL"
            echo "::error::Expected format: op://vault/item/field"
            exit 1
          fi
          # Validate repo name
          REPO_NAME="${{ inputs.repo-name }}"
          if [[ ! "$REPO_NAME" =~ ^[a-zA-Z0-9_-]+$ ]] || [ ${#REPO_NAME} -gt 100 ]; then
            echo "::error::Invalid repo name: $REPO_NAME"
            echo "::error::Must be alphanumeric with hyphens/underscores, max 100 chars"
            exit 1
          fi
          echo "✅ All input validation passed"
      - name: Display version information
        run: |
          echo "📋 Workflow Version Information"
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo "Repository: ${{ inputs.repo-name }}"
          echo "Target ref: ${{ inputs.target-ref }}"
          echo "Stacks: ${{ inputs.stacks }}"
          echo "Runner: ${{ runner.os }} ${{ runner.arch }}"
          echo ""
          echo "ℹ️ Reusable workflow SHA shown in 'Uses:' line above"
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
      - name: Verify required tools
        run: |
          echo "🔍 Verifying required tools are available..."
          # Check for jq (required for JSON parsing)
          if ! command -v jq &> /dev/null; then
            echo "::warning::jq is not installed. Installing..."
            sudo apt-get update -qq
            sudo apt-get install -y jq
          fi
          echo "✅ jq version: $(jq --version)"
          # Check for timeout (required for command timeouts)
          if ! command -v timeout &> /dev/null; then
            echo "::error::timeout command is not available"
            exit 1
          fi
          echo "✅ timeout is available (part of coreutils)"
          # Check for Bash 4.0+ (needed for readarray and associative arrays)
          if [ -z "${BASH_VERSION:-}" ] || [ "${BASH_VERSION%%.*}" -lt 4 ]; then
            echo "::error::Bash 4.0+ is required (current: ${BASH_VERSION:-unknown})"
            exit 1
          fi
          echo "✅ Bash version: $BASH_VERSION (supports readarray and associative arrays)"
          echo "✅ All required tools verified"
      - name: Setup retry mechanism
        run: |
          # Create retry function for bash commands
          cat > /tmp/retry.sh << 'EOF'
          #!/bin/bash
          retry() {
            local max_attempts=$1
            local delay=$2
            local command="${*:3}"
            local attempt=1
            while [ $attempt -le $max_attempts ]; do
              echo "Attempt $attempt of $max_attempts: $command"
              if eval "$command"; then
                echo "✅ Command succeeded on attempt $attempt"
                return 0
              else
                echo "❌ Command failed on attempt $attempt"
                if [ $attempt -lt $max_attempts ]; then
                  echo "⏳ Waiting ${delay}s before retry..."
                  sleep $delay
                  delay=$((delay * 2)) # Exponential backoff
                fi
                attempt=$((attempt + 1))
              fi
            done
            echo "💥 Command failed after $max_attempts attempts"
            return 1
          }

          # SSH retry function with specific error handling
          ssh_retry() {
            local max_attempts=$1
            local delay=$2
            local ssh_cmd="${*:3}"
            local attempt=1
            local last_exit_code=1
            while [ $attempt -le $max_attempts ]; do
              echo "SSH Attempt $attempt of $max_attempts" >&2
              if eval "$ssh_cmd"; then
                echo "✅ SSH command succeeded on attempt $attempt" >&2
                return 0
              else
                last_exit_code=$?
                echo "❌ SSH command failed on attempt $attempt (exit code: $last_exit_code)" >&2
                # Check for specific SSH errors
                case $last_exit_code in
                  255) echo "SSH connection error - network/auth issue" >&2 ;;
                  1) echo "General SSH error" >&2 ;;
                  *) echo "Unknown error code: $last_exit_code" >&2 ;;
                esac
                if [ $attempt -lt $max_attempts ]; then
                  echo "⏳ Waiting ${delay}s before SSH retry..." >&2
                  sleep $delay
                fi
                attempt=$((attempt + 1))
              fi
            done
            echo "💥 SSH command failed after $max_attempts attempts (final exit code: $last_exit_code)" >&2
            return $last_exit_code
          }
          EOF
          chmod +x /tmp/retry.sh
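          # Usage sketch (illustrative): with exponential backoff, 'retry 3 5 <cmd>'
          # waits 5s and then 10s between attempts, e.g.:
          #   source /tmp/retry.sh
          #   retry 3 5 "curl -fsS http://localhost:8080/health"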
      - name: Cache deployment tools
        uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
        with:
          path: |
            ~/.cache/pip
            ~/.cache/docker
            ~/.ssh
          key: deploy-tools-${{ runner.os }}-v1
          restore-keys: |
            deploy-tools-${{ runner.os }}-
      - name: Configure 1Password Service Account
        uses: 1password/load-secrets-action/configure@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
        with:
          service-account-token: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
      - name: Load Tailscale credentials
        id: load-tailscale-credentials
        uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
        with:
          unset-previous: true
        env:
          TAILSCALE_OAUTH_CLIENT_ID: "op://Docker/tailscale-oauth/client_id"
          TAILSCALE_OAUTH_SECRET: "op://Docker/tailscale-oauth/secret"
      - name: Connect to Tailnet
        uses: tailscale/github-action@53acf823325fe9ca47f4cdaa951f90b4b0de5bb9 # v4.1.1
        with:
          oauth-client-id: ${{ steps.load-tailscale-credentials.outputs.TAILSCALE_OAUTH_CLIENT_ID }}
          oauth-secret: ${{ steps.load-tailscale-credentials.outputs.TAILSCALE_OAUTH_SECRET }}
          tags: tag:ci
          ping: ${{ secrets.SSH_HOST }}
      - name: Unload Tailscale credentials
        uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
        with:
          unset-previous: true
      - name: Optimize SSH connections
        run: |
          # Configure SSH connection multiplexing for better performance
          mkdir -p ~/.ssh
          cat >> ~/.ssh/config << EOF
          Host deployment-server
            HostName ${{ secrets.SSH_HOST }}
            User ${{ secrets.SSH_USER }}
            ControlMaster auto
            ControlPath ~/.ssh/sockets/%r@%h:%p
            ControlPersist 300
            ServerAliveInterval 30
            ServerAliveCountMax 3
            Compression yes
            TCPKeepAlive yes
          EOF
          # Create control socket directory and pre-establish SSH connection
          mkdir -p ~/.ssh/sockets
          echo "🔗 Pre-establishing SSH connection for multiplexing..."
          # If no master connection exists yet, start one in the background (-f) with
          # no remote command (-N); ControlMaster auto makes it the master
          ssh -o "StrictHostKeyChecking no" deployment-server -O check 2>/dev/null || \
            ssh -o "StrictHostKeyChecking no" -fN deployment-server
          # Give the connection a moment to establish
          sleep 2
          echo "✅ SSH connection optimization configured"
      - name: Checkout repository for change detection
        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
        with:
          fetch-depth: 0 # Fetch full history for accurate change detection
      - name: Determine previous deployment SHA
        id: determine-previous
        run: |
          # Use retry mechanism for SSH connection
          source /tmp/retry.sh
          # Get current deployment SHA with error handling
          echo "🔍 Checking current deployment SHA for change detection..."
          if CURRENT_SHA=$(ssh_retry 3 5 "ssh -o 'StrictHostKeyChecking no' ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} 'cd /opt/compose && git rev-parse HEAD 2>/dev/null'"); then
            # Validate SHA format
            if [[ "$CURRENT_SHA" =~ ^[a-fA-F0-9]{40}$ ]]; then
              echo "✅ Current deployed SHA: $CURRENT_SHA"
              echo "previous_sha=$CURRENT_SHA" >> "$GITHUB_OUTPUT"
            else
              echo "⚠️ Invalid SHA format from server: $CURRENT_SHA"
              echo "⚠️ Using HEAD^ as fallback for change detection"
              echo "previous_sha=HEAD^" >> "$GITHUB_OUTPUT"
            fi
          else
            echo "⚠️ Could not retrieve current deployment SHA - using HEAD^ for change detection"
            echo "previous_sha=HEAD^" >> "$GITHUB_OUTPUT"
          fi
      - name: Get changed files for removal detection
        id: changed-files
        if: steps.determine-previous.outputs.previous_sha != inputs.target-ref
        continue-on-error: true
        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
        with:
          json: true
          sha: ${{ inputs.target-ref }}
          base_sha: ${{ steps.determine-previous.outputs.previous_sha }}
      - name: Store current deployment for rollback
        id: backup
        run: |
          echo "::group::Preparing deployment backup"
          # Use retry mechanism for SSH connection
          source /tmp/retry.sh
          # Get current deployment SHA with error handling
          echo "🔍 Checking current deployment SHA..."
          if CURRENT_SHA=$(ssh_retry 3 5 "ssh -o 'StrictHostKeyChecking no' ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} 'cd /opt/compose && git rev-parse HEAD 2>/dev/null'"); then
            # Validate SHA format
            if [[ "$CURRENT_SHA" =~ ^[a-fA-F0-9]{40}$ ]]; then
              echo "✅ Current deployed SHA: $CURRENT_SHA"
            else
              echo "⚠️ Invalid SHA format from server: $CURRENT_SHA"
              CURRENT_SHA="unknown"
            fi
          else
            echo "⚠️ Could not retrieve current deployment SHA - assuming first deployment"
            CURRENT_SHA="unknown"
          fi
          TARGET_REF="${{ inputs.target-ref }}"
          echo "🎯 Target deployment ref: $TARGET_REF"
          # Resolve target ref to SHA if it's not already a SHA
          if [[ "$TARGET_REF" =~ ^[a-fA-F0-9]{40}$ ]]; then
            TARGET_SHA="$TARGET_REF"
            echo "✅ Target ref is already a full SHA"
          elif [[ "$TARGET_REF" =~ ^[a-fA-F0-9]{7,39}$ ]]; then
            TARGET_SHA="$TARGET_REF"
            echo "✅ Target ref is a short SHA, will resolve on server"
          else
            TARGET_SHA="$TARGET_REF"
            echo "✅ Target ref is a branch/tag name, will resolve on server"
          fi
          # Set outputs with proper validation
          echo "previous_sha=${CURRENT_SHA}" >> "$GITHUB_OUTPUT"
          if [ "$CURRENT_SHA" = "$TARGET_SHA" ] && [ "${{ inputs.force-deploy }}" != "true" ]; then
            echo "⚠️ Repository is already at target commit - no deployment needed"
            echo "deployment_needed=false" >> "$GITHUB_OUTPUT"
          elif [ "$CURRENT_SHA" = "$TARGET_SHA" ] && [ "${{ inputs.force-deploy }}" = "true" ]; then
            echo "🔄 Force deployment requested - proceeding despite same commit"
            echo "deployment_needed=true" >> "$GITHUB_OUTPUT"
          else
            echo "✅ Deployment needed - proceeding with update"
            echo "deployment_needed=true" >> "$GITHUB_OUTPUT"
          fi
          echo "::endgroup::"
      # ================================================================
      # STACK REMOVAL DETECTION AND CLEANUP
      # ================================================================
      # Automatically detect and clean up Docker stacks that have been
      # removed from the repository, using three independent detection methods.
      #
      # Detection Methods:
      #   1. Git Diff: compares the currently deployed SHA against the target SHA
      #   2. Tree Comparison: compares the target commit tree against the server
      #      filesystem (catches removals from previously undeployed commits)
      #   3. Discovery Analysis: analyzes tj-actions/changed-files output
      #      (validates removals from GitHub's perspective)
      #
      # Process (see the worked example below):
      #   1. Run all three detection methods independently on the deployment server
      #   2. Fail the deployment if ANY detection method encounters errors (fail-safe)
      #   3. Aggregate results as a union (remove anything found by any method)
      #   4. Deduplicate and validate stack names
      #   5. Run 'docker compose down' for each removed stack
      #   6. Fail the deployment if any cleanup fails
      #   7. Send a Discord notification listing the removed stacks
      #
      # Design: docs/plans/2025-12-06-enhanced-stack-removal-detection-design.md
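      #
      # Worked example (illustrative stack names): if git diff finds {media},
      # tree comparison finds {media, metrics}, and discovery finds nothing,
      # the union {media, metrics} is torn down; if any single method errors,
      # the whole deployment aborts rather than risking a partial cleanup.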
      - name: Detect and clean up removed stacks
        id: cleanup-removed
        if: steps.backup.outputs.deployment_needed == 'true'
        continue-on-error: false
        run: |
          # Source retry functions
          source /tmp/retry.sh
          # === DETECTION FUNCTION: GIT DIFF ===
          # Purpose: Detect stacks removed between two git commits
          # Inputs: $1=current_sha, $2=target_ref
          # Output: Newline-separated list of stack names (stdout)
          # Returns: 0 on success, 1 on error
          detect_removed_stacks_gitdiff() {
            local current_sha="$1"
            local target_ref="$2"
            # Build detection script
            local detect_script
            detect_script=$(cat << 'DETECT_EOF'
          set -e
          CURRENT_SHA="$1"
          TARGET_REF="$2"
          cd /opt/compose
          # Fetch target ref to ensure we have it
          if ! git fetch origin "$TARGET_REF" 2>/dev/null; then
            echo "⚠️ Failed to fetch target ref, trying general fetch..." >&2
            if ! git fetch 2>/dev/null; then
              echo "::error::Failed to fetch repository updates" >&2
              exit 1
            fi
          fi
          # Resolve target ref to SHA for comparison
          TARGET_SHA=$(git rev-parse "$TARGET_REF" 2>/dev/null || echo "$TARGET_REF")
          # Validate both SHAs exist
          if ! git cat-file -e "$CURRENT_SHA" 2>/dev/null; then
            echo "::warning::Current SHA $CURRENT_SHA not found in repository (may have been replaced by a force-push)" >&2
            echo " Skipping git diff detection, will rely on tree comparison method" >&2
            exit 1
          fi
          if ! git cat-file -e "$TARGET_SHA" 2>/dev/null; then
            echo "::warning::Target SHA $TARGET_SHA not found in repository" >&2
            echo " Skipping git diff detection, will rely on tree comparison method" >&2
            exit 1
          fi
          # Find deleted compose.yaml files between current and target
          git diff --diff-filter=D --name-only "$CURRENT_SHA" "$TARGET_SHA" 2>/dev/null | \
            grep -E '^[^/]+/compose\.yaml$' | \
            sed 's|/compose\.yaml||' || echo ""
          DETECT_EOF
            )
            # Execute detection script on remote server
            echo "$detect_script" | ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$current_sha\" \"$target_ref\""
          }
          # === DETECTION FUNCTION: TREE COMPARISON ===
          # Purpose: Detect stacks on the server filesystem that are missing from the target commit tree
          # Inputs: $1=target_ref
          # Output: Newline-separated list of stack names (stdout)
          # Returns: 0 on success, 1 on error
          detect_removed_stacks_tree() {
            local target_ref="$1"
            # Build detection script
            local detect_script
            detect_script=$(cat << 'DETECT_TREE_EOF'
          set -e
          TARGET_REF="$1"
          cd /opt/compose
          # Fetch target ref to ensure we have it
          if ! git fetch origin "$TARGET_REF" 2>/dev/null; then
            echo "⚠️ Failed to fetch target ref, trying general fetch..." >&2
            if ! git fetch 2>/dev/null; then
              echo "::error::Failed to fetch repository updates" >&2
              exit 1
            fi
          fi
          # Resolve target ref to SHA
          TARGET_SHA=$(git rev-parse "$TARGET_REF" 2>/dev/null || echo "$TARGET_REF")
          # Validate target SHA exists
          if ! git cat-file -e "$TARGET_SHA" 2>/dev/null; then
            echo "::error::Target SHA $TARGET_SHA not found in repository" >&2
            exit 1
          fi
          # Get directories in the target commit (one level deep, directories only)
          COMMIT_DIRS=$(git ls-tree --name-only "$TARGET_SHA" 2>/dev/null | sort)
          # Get directories on the server filesystem (exclude .git and hidden dirs)
          SERVER_DIRS=$(find /opt/compose -maxdepth 1 -mindepth 1 -type d ! -name '.*' -exec basename {} \; 2>/dev/null | sort)
          # Find directories on the server but not in the commit
          MISSING_IN_COMMIT=$(comm -13 <(echo "$COMMIT_DIRS") <(echo "$SERVER_DIRS"))
          # Keep only directories that contain a compose.yaml file
          for dir in $MISSING_IN_COMMIT; do
            if [ -f "/opt/compose/$dir/compose.yaml" ]; then
              echo "$dir"
            fi
          done
          DETECT_TREE_EOF
            )
            # Execute detection script on remote server
            echo "$detect_script" | ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$target_ref\""
          }
          # === DETECTION FUNCTION: DISCOVERY ANALYSIS ===
          # Purpose: Analyze deleted files from tj-actions/changed-files output
          # Inputs: $1=deleted_files_json (JSON array from tj-actions/changed-files)
          # Output: Newline-separated list of stack names (stdout)
          # Returns: 0 on success, 1 on error
          detect_removed_stacks_discovery() {
            local deleted_files_json="$1"
            # Build detection script
            local detect_script
            detect_script=$(cat << 'DETECT_DISCOVERY_EOF'
          set -e
          DELETED_FILES_JSON="$1"
          # Parse JSON array and filter for compose.yaml deletions
          # Pattern: one level deep only (stack-name/compose.yaml)
          echo "$DELETED_FILES_JSON" | jq -r '.[]' 2>/dev/null | \
            grep -E '^[^/]+/compose\.yaml$' | \
            sed 's|/compose\.yaml||' || echo ""
          DETECT_DISCOVERY_EOF
            )
            # Execute detection script on remote server
            echo "$detect_script" | ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$deleted_files_json\""
          }
          # === AGGREGATION FUNCTION ===
          # Purpose: Merge and deduplicate results from all three detection methods
          # Inputs: $1=gitdiff_stacks, $2=tree_stacks, $3=discovery_stacks (newline-separated lists)
          # Output: Deduplicated newline-separated list of stack names (stdout)
          # Returns: 0 on success (empty string if all inputs are empty; not an error)
          aggregate_removed_stacks() {
            local gitdiff_stacks="$1"
            local tree_stacks="$2"
            local discovery_stacks="$3"
            # Concatenate all three lists, drop empty lines, sort and deduplicate
            {
              echo "$gitdiff_stacks"
              echo "$tree_stacks"
              echo "$discovery_stacks"
            } | \
              grep -v '^$' | \
              sort -u | \
              grep -E '^[a-zA-Z0-9_-]+$' || echo ""
          }
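          # Example (illustrative): aggregate_removed_stacks $'media\nmetrics' $'media' ''
          # prints the deduplicated union:
          #   media
          #   metrics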
          # === CLEANUP FUNCTION ===
          # Purpose: Clean up a single removed stack using docker compose down
          # Inputs: $1=stack_name
          # Returns: 0 on success, 1 on error
          # Note: Requires OP_SERVICE_ACCOUNT_TOKEN from GitHub secrets
          # Security: Token passed as environment variable via heredoc (not command-line args) to avoid exposure in process listings
          cleanup_stack() {
            local stack="$1"
            local op_token="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
            # Build cleanup script that expects OP_SERVICE_ACCOUNT_TOKEN from environment
            local cleanup_script
            cleanup_script=$(cat << 'CLEANUP_EOF'
          STACK="$1"
          # Check if stack directory exists
          if [ ! -d "/opt/compose/$STACK" ]; then
            echo "⚠️ Stack directory not found for $STACK - already fully removed"
            exit 0
          fi
          cd "/opt/compose/$STACK"
          # Check if compose.yaml exists
          if [ ! -f compose.yaml ]; then
            echo "⚠️ compose.yaml not found for $STACK - may have been manually removed"
            exit 0
          fi
          # Run docker compose down with 1Password
          # Note: OP_SERVICE_ACCOUNT_TOKEN is set by the wrapper script
          if op run --env-file=/opt/compose/compose.env -- docker compose -f ./compose.yaml down; then
            echo "✅ Successfully cleaned up $STACK"
          else
            echo "❌ Failed to clean up $STACK"
            exit 1
          fi
          CLEANUP_EOF
            )
            # Execute cleanup script on remote server
            # Token passed as environment variable via heredoc to avoid exposure in process args
            ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$stack\"" <<EOF
          export OP_SERVICE_ACCOUNT_TOKEN="$op_token"
          $cleanup_script
          EOF
          }
          # === MAIN EXECUTION ===
          echo "::group::Detecting removed stacks"
          CURRENT_SHA="${{ steps.backup.outputs.previous_sha }}"
          TARGET_REF="${{ inputs.target-ref }}"
          # Skip detection if this is the first deployment
          if [ "$CURRENT_SHA" = "unknown" ]; then
            echo "ℹ️ First deployment detected - no previous stacks to remove"
            echo "removed_stacks=" >> "$GITHUB_OUTPUT"
            echo "has_removed_stacks=false" >> "$GITHUB_OUTPUT"
            echo "::endgroup::"
            exit 0
          fi
          echo "📊 Comparing commits:"
          echo " Current: $CURRENT_SHA"
          echo " Target: $TARGET_REF"
          echo "🔍 Checking for removed stacks..."
          # Read deleted files from changed-files step (may be empty if the step failed)
          DELETED_FILES='${{ steps.changed-files.outputs.deleted_files }}'
          # Check if the changed-files step succeeded
          if [ "${{ steps.changed-files.outcome }}" != "success" ]; then
            echo "⚠️ Changed-files detection failed (likely due to a missing git ref)"
            echo " Proceeding with git diff and tree comparison methods only"
          fi
          echo "🔍 Running three detection methods..."
          # Execute all three detection methods independently
          echo " 1. Git diff detection (commit comparison)..."
          GITDIFF_STACKS=$(detect_removed_stacks_gitdiff "$CURRENT_SHA" "$TARGET_REF") || GITDIFF_EXIT=$?
          echo " 2. Tree comparison detection (filesystem vs commit)..."
          TREE_STACKS=$(detect_removed_stacks_tree "$TARGET_REF") || TREE_EXIT=$?
          echo " 3. Discovery analysis detection (changed files)..."
          if [ "$DELETED_FILES" = "[]" ] || [ -z "$DELETED_FILES" ]; then
            # Empty JSON array or empty string - no deleted files to analyze
            echo " ℹ️ No deleted files detected - skipping discovery analysis"
            DISCOVERY_STACKS=""
            DISCOVERY_EXIT=0
          else
            DISCOVERY_STACKS=$(detect_removed_stacks_discovery "$DELETED_FILES") || DISCOVERY_EXIT=$?
          fi
          # Fail deployment if any detection method failed (fail-safe)
          if [ "${GITDIFF_EXIT:-0}" -ne 0 ]; then
            echo "::error::Git diff detection failed (exit code: $GITDIFF_EXIT)"
            exit 1
          fi
          if [ "${TREE_EXIT:-0}" -ne 0 ]; then
            echo "::error::Tree comparison detection failed (exit code: $TREE_EXIT)"
            exit 1
          fi
          if [ "${DISCOVERY_EXIT:-0}" -ne 0 ]; then
            echo "::error::Discovery analysis detection failed (exit code: $DISCOVERY_EXIT)"
            exit 1
          fi
          echo "✅ All detection methods completed successfully"
          # Aggregate results (union of all three methods)
          echo "📊 Aggregating results..."
          REMOVED_STACKS=$(aggregate_removed_stacks "$GITDIFF_STACKS" "$TREE_STACKS" "$DISCOVERY_STACKS")
          # Debug logging
          if [ -n "$GITDIFF_STACKS" ]; then
            echo " Git diff found: $(echo "$GITDIFF_STACKS" | tr '\n' ',' | sed 's/,$//')"
          fi
          if [ -n "$TREE_STACKS" ]; then
            echo " Tree comparison found: $(echo "$TREE_STACKS" | tr '\n' ',' | sed 's/,$//')"
          fi
          if [ -n "$DISCOVERY_STACKS" ]; then
            echo " Discovery analysis found: $(echo "$DISCOVERY_STACKS" | tr '\n' ',' | sed 's/,$//')"
          fi
          # Process results
          if [ -z "$REMOVED_STACKS" ]; then
            echo "✅ No stacks to remove"
            echo "removed_stacks=" >> "$GITHUB_OUTPUT"
            echo "has_removed_stacks=false" >> "$GITHUB_OUTPUT"
          else
            echo "🗑️ Found stacks to remove:"
            echo "$REMOVED_STACKS" | while read -r stack; do
              echo " - $stack"
            done
            # Convert to JSON array for output
            REMOVED_JSON=$(echo "$REMOVED_STACKS" | jq -R -s -c 'split("\n") | map(select(length > 0))')
            echo "removed_stacks=$REMOVED_JSON" >> "$GITHUB_OUTPUT"
            echo "has_removed_stacks=true" >> "$GITHUB_OUTPUT"
            # Clean up each removed stack
            echo ""
            echo "::group::Cleaning up removed stacks"
            CLEANUP_FAILED=false
            while IFS= read -r stack; do
              [ -z "$stack" ] && continue
              echo "🧹 Cleaning up stack: $stack"
              if ! cleanup_stack "$stack"; then
                echo "💥 Cleanup failed for stack: $stack"
                CLEANUP_FAILED=true
                break
              fi
            done <<< "$REMOVED_STACKS"
            echo "::endgroup::"
            if [ "$CLEANUP_FAILED" = "true" ]; then
              echo "::error::Stack cleanup failed - stopping deployment"
              exit 1
            fi
            echo "✅ All removed stacks cleaned up successfully"
          fi
          echo "::endgroup::"
      - name: Notify removed stacks cleanup
        if: steps.cleanup-removed.outputs.has_removed_stacks == 'true'
        run: |
          echo "📢 Sending cleanup notification to Discord..."
          # Get webhook URL from 1Password
          WEBHOOK_URL=$(op read "${{ inputs.webhook-url }}")
          # Build removed stacks list and create JSON payload using jq for proper escaping
          REMOVED_STACKS='${{ steps.cleanup-removed.outputs.removed_stacks }}'
          STACK_LIST=$(echo "$REMOVED_STACKS" | jq -r '.[] | "- " + .')
          TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
          # Build JSON payload with jq to ensure proper escaping
          PAYLOAD=$(jq -n \
            --arg title "🗑️ Stack Cleanup - ${{ inputs.repo-name }}" \
            --arg description "Removed stacks have been cleaned up before deployment" \
            --arg stacks "$STACK_LIST" \
            --arg target "${{ inputs.target-ref }}" \
            --arg previous "${{ steps.backup.outputs.previous_sha }}" \
            --arg timestamp "$TIMESTAMP" \
            '{
              embeds: [{
                title: $title,
                description: $description,
                color: 16753920,
                fields: [
                  {name: "Removed Stacks", value: $stacks},
                  {name: "Target Commit", value: ("`" + $target + "`")},
                  {name: "Previous Commit", value: ("`" + $previous + "`")}
                ],
                timestamp: $timestamp
              }]
            }')
          # Send Discord notification
          curl -X POST "$WEBHOOK_URL" \
            -H "Content-Type: application/json" \
            -d "$PAYLOAD"
          echo "✅ Cleanup notification sent"
      - name: Deploy All Stacks
        id: deploy
        if: steps.backup.outputs.deployment_needed == 'true'
        continue-on-error: true
        run: |
          echo "🚀 Deploying all stacks"
          # Source retry functions
          source /tmp/retry.sh
          # Set error handling
          set -e
          trap 'echo "❌ Deployment failed at line $LINENO"' ERR
          # Parse inputs outside SSH context
          STACKS="${{ join(fromJSON(inputs.stacks), ' ') }}"
          HAS_DOCKGE="${{ inputs.has-dockge }}"
          TARGET_REF="${{ inputs.target-ref }}"
          COMPOSE_ARGS="${{ inputs.args || '' }}"
          # Use retry mechanism and optimized deployment
          ssh_retry 3 10 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s $STACKS \"$HAS_DOCKGE\" \"$TARGET_REF\" \"$COMPOSE_ARGS\"" << 'EOF'
          set -e
          # Performance optimizations
          export DOCKER_BUILDKIT=1
          export COMPOSE_DOCKER_CLI_BUILD=1
          # Enable parallel image pulls
          export COMPOSE_PARALLEL_LIMIT=8
          # Get arguments passed to the script (the 1Password token is not passed as an argument)
          # Arguments: stack1 stack2 stack3 ... HAS_DOCKGE TARGET_REF [COMPOSE_ARGS]
          # COMPOSE_ARGS might be empty, so we need to handle a variable arg count
          TOTAL_ARGS=$#
          # Find HAS_DOCKGE by looking for 'true' or 'false' in the args
          HAS_DOCKGE=""
          TARGET_REF=""
          COMPOSE_ARGS=""
          # The last few args should be: HAS_DOCKGE TARGET_REF [COMPOSE_ARGS]
          # HAS_DOCKGE is always 'true' or 'false'
          # TARGET_REF is a commit SHA or branch/tag name
          # COMPOSE_ARGS is optional and could be empty
          for i in $(seq 1 $TOTAL_ARGS); do
            ARG="${!i}"
            if [ "$ARG" = "true" ] || [ "$ARG" = "false" ]; then
              HAS_DOCKGE="$ARG"
              TARGET_REF="${@:$((i+1)):1}"
              if [ $((i+2)) -le $TOTAL_ARGS ]; then
                COMPOSE_ARGS="${@:$((i+2)):1}"
              fi
              # All args before this position are stack names
              STACKS="${@:1:$((i-1))}"
              break
            fi
          done
          # Set the 1Password token via environment (expanded by Actions before the script is sent)
          export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
          # Consolidated timeout values for easier maintenance,
          # overridable via the workflow inputs
          GIT_FETCH_TIMEOUT=${{ inputs.git-fetch-timeout }}
          GIT_CHECKOUT_TIMEOUT=${{ inputs.git-checkout-timeout }}
          IMAGE_PULL_TIMEOUT=${{ inputs.image-pull-timeout }}
          SERVICE_STARTUP_TIMEOUT=${{ inputs.service-startup-timeout }}
          VALIDATION_ENV_TIMEOUT=${{ inputs.validation-env-timeout }}
          VALIDATION_SYNTAX_TIMEOUT=${{ inputs.validation-syntax-timeout }}
          if [ "$HAS_DOCKGE" = "true" ]; then
            echo "🚀 Deploying Dockge..."
            cd /opt/dockge
            # Add timeout protection for Dockge operations
            if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
              echo "❌ Dockge image pull timed out after ${IMAGE_PULL_TIMEOUT}s"
              exit 1
            fi
            if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
              echo "❌ Dockge startup timed out after ${SERVICE_STARTUP_TIMEOUT}s"
              exit 1
            fi
            echo "✅ Dockge deployed successfully"
          fi
          echo "Updating repository to $TARGET_REF..."
          # Add timeout protection to git operations
          if ! timeout $GIT_FETCH_TIMEOUT git -C /opt/compose/ fetch; then
            echo "❌ Git fetch timed out after ${GIT_FETCH_TIMEOUT}s"
            exit 1
          fi
          if ! timeout $GIT_CHECKOUT_TIMEOUT git -C /opt/compose/ checkout $TARGET_REF; then
            echo "❌ Git checkout timed out after ${GIT_CHECKOUT_TIMEOUT}s"
            exit 1
          fi
          echo "✅ Repository updated to $TARGET_REF"
          # Shared function to deploy or roll back a single stack
          # This eliminates code duplication between deploy and rollback operations
          process_stack() {
            local STACK=$1
            local OPERATION=$2 # "deploy" or "rollback"
            local LOGFILE="/tmp/${OPERATION}_${STACK}.log"
            local EXITCODEFILE="/tmp/${OPERATION}_${STACK}.exitcode"
            local exit_code=0
            # Run the operation in a subshell so an inner 'exit 1' cannot skip the
            # exit-code write below; '|| exit_code=$?' also keeps set -e from firing
            (
              if [ "$OPERATION" = "deploy" ]; then
                echo "🚀 Deploying $STACK..."
              else
                echo "🔄 Rolling back $STACK..."
              fi
              cd /opt/compose/$STACK
              echo " Pulling images for $STACK..."
              # Timeout protection for the image pull ($IMAGE_PULL_TIMEOUT seconds)
              if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
                echo "❌ Failed to pull images for $STACK during $OPERATION (timeout or error)"
                exit 1
              fi
              echo " Starting services for $STACK..."
              # Timeout protection for service startup ($SERVICE_STARTUP_TIMEOUT seconds)
              if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
                echo "❌ Failed to start services for $STACK during $OPERATION (timeout or error)"
                exit 1
              fi
              if [ "$OPERATION" = "deploy" ]; then
                echo "✅ $STACK deployed successfully"
              else
                echo "✅ $STACK rolled back successfully"
              fi
            ) > "$LOGFILE" 2>&1 || exit_code=$?
            # Save the exit code for robust error detection
            echo "$exit_code" > "$EXITCODEFILE"
            return $exit_code
          }
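          # Example (illustrative stack name): 'process_stack myapp deploy' writes all
          # progress to /tmp/deploy_myapp.log and its exit code to
          # /tmp/deploy_myapp.exitcode, which the result analysis below reads back.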
          # Wrapper function for deploy (maintains backward compatibility)
          deploy_stack() {
            process_stack "$1" "deploy"
          }
          # Cleanup function for deploy logs
          cleanup_deploy_logs() {
            for STACK in $STACKS; do
              rm -f "/tmp/deploy_${STACK}.log" 2>/dev/null
            done
          }
          # Pre-deployment validation function
          validate_all_stacks() {
            echo "🔍 Pre-deployment validation of all stacks..."
            local validation_failed=false
            for STACK in $STACKS; do
              echo " Validating $STACK..."
              # Check if the stack directory exists
              if [ ! -d "/opt/compose/$STACK" ]; then
                echo "❌ $STACK: Directory /opt/compose/$STACK not found"
                validation_failed=true
                continue
              fi
              cd "/opt/compose/$STACK" || {
                echo "❌ $STACK: Cannot access directory"
                validation_failed=true
                continue
              }
              # Check if compose.yaml exists
              if [ ! -f "compose.yaml" ]; then
                echo "❌ $STACK: compose.yaml not found"
                validation_failed=true
                continue
              fi
              # Validate 1Password environment access and Docker Compose config
              if ! timeout $VALIDATION_ENV_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services >/dev/null 2>&1; then
                echo "❌ $STACK: Environment validation failed (1Password or compose config error)"
                validation_failed=true
                continue
              fi
              # Quick syntax validation
              if ! timeout $VALIDATION_SYNTAX_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --quiet 2>/dev/null; then
                echo "❌ $STACK: Docker Compose syntax validation failed"
                validation_failed=true
                continue
              fi
              echo "✅ $STACK: Pre-deployment validation passed"
            done
            if [ "$validation_failed" = true ]; then
              echo "❌ Pre-deployment validation failed for one or more stacks"
              echo " Stopping deployment to prevent extended failures"
              return 1
            fi
            echo "✅ All stacks passed pre-deployment validation"
            return 0
          }
          # Run pre-deployment validation
          if ! validate_all_stacks; then
            # This runs on the remote server, where $GITHUB_OUTPUT does not exist;
            # surface the failure via a workflow-command error annotation instead
            echo "::error::Pre-deployment validation failed"
            exit 1
          fi
          # Set trap for cleanup on exit
          trap cleanup_deploy_logs EXIT
          # Start all deployments in parallel
          echo "🚀 Starting parallel deployment of all stacks..."
          PIDS=""
          # Simple approach - iterate directly over the unquoted, space-separated list
          for STACK in $STACKS; do
            echo "🚀 Deploying $STACK..."
            deploy_stack "$STACK" &
            PIDS="$PIDS $!"
            echo "Started deployment of $STACK (PID: $!)"
          done
          # Wait for all deployments and collect results
          echo "⏳ Waiting for all deployments to complete..."
          FAILED_STACKS=""
          # Enhanced parallel job monitoring with better error propagation
          echo "⏳ Monitoring parallel deployments..."
          DEPLOYED_STACKS=""
          SUCCESSFUL_STACKS=""
          DEPLOYMENT_ERRORS=""
          # Wait for jobs individually to capture exit codes
          for PID in $PIDS; do
            if wait "$PID"; then
              echo "✅ Deployment process $PID completed successfully"
            else
              EXIT_CODE=$?
              echo "❌ Deployment process $PID failed with exit code $EXIT_CODE"
              DEPLOYMENT_ERRORS="$DEPLOYMENT_ERRORS PID:$PID:$EXIT_CODE"
            fi
          done
          # Enhanced result analysis using exit code files (more robust than log parsing)
          for STACK in $STACKS; do
            if [ -f "/tmp/deploy_${STACK}.log" ]; then
              DEPLOYED_STACKS="$DEPLOYED_STACKS $STACK"
              # Primary: check the exit code file for robust error detection
              if [ -f "/tmp/deploy_${STACK}.exitcode" ]; then
                EXIT_CODE=$(cat "/tmp/deploy_${STACK}.exitcode")
                if [ "$EXIT_CODE" -eq 0 ]; then
                  SUCCESSFUL_STACKS="$SUCCESSFUL_STACKS $STACK"
                else
                  FAILED_STACKS="$FAILED_STACKS $STACK"
                  echo "🔍 $STACK Error: Non-zero exit code ($EXIT_CODE)"
                fi
              else
                # Fallback: log-based error detection if the exit code file is missing
                echo "⚠️ $STACK: Exit code file missing - using less reliable log-based detection"
                if grep -q "❌.*$STACK\|CRITICAL.*$STACK\|Failed.*$STACK\|Error.*$STACK" "/tmp/deploy_${STACK}.log"; then
                  FAILED_STACKS="$FAILED_STACKS $STACK"
                  # Extract the specific error for reporting
                  STACK_ERROR=$(grep -E "❌.*$STACK|CRITICAL.*$STACK|Failed.*$STACK|Error.*$STACK" "/tmp/deploy_${STACK}.log" | head -1)
                  echo "🔍 $STACK Error: $STACK_ERROR"
                elif grep -q "✅.*$STACK\|Successfully.*$STACK" "/tmp/deploy_${STACK}.log"; then
                  SUCCESSFUL_STACKS="$SUCCESSFUL_STACKS $STACK"
                else
                  echo "⚠️ $STACK: No clear success/failure indicator in logs - treating as potential failure"
                  FAILED_STACKS="$FAILED_STACKS $STACK"
                fi
              fi
            else
              echo "⚠️ $STACK: No deployment log found - possible early failure"
              FAILED_STACKS="$FAILED_STACKS $STACK"
            fi
          done
          # Summary of deployment results
          echo ""
          echo "📊 Deployment Summary:"
          echo " Successful: $(echo $SUCCESSFUL_STACKS | wc -w | tr -d ' ') stacks"
          echo " Failed: $(echo $FAILED_STACKS | wc -w | tr -d ' ') stacks"
          if [ -n "$DEPLOYMENT_ERRORS" ]; then
            echo " Process errors: $DEPLOYMENT_ERRORS"
          fi
          # Display deployment logs with enhanced formatting
          echo ""
          echo "📋 Detailed Deployment Results:"
          echo "════════════════════════════════════════════════════════════════"
          for STACK in $STACKS; do
            if [ -f "/tmp/deploy_${STACK}.log" ]; then
              echo ""
              echo "🔸 STACK: $STACK"
              echo "────────────────────────────────────────────────────────────────"
              cat "/tmp/deploy_${STACK}.log"
              echo "────────────────────────────────────────────────────────────────"
            else
              echo ""
              echo "🔸 STACK: $STACK"
              echo "────────────────────────────────────────────────────────────────"
              echo "⚠️ No deployment log found for $STACK"
              echo "────────────────────────────────────────────────────────────────"
            fi
          done
          echo "════════════════════════════════════════════════════════════════"
          # Check if any deployments failed
          if [ -z "$STACKS" ]; then
            echo "💥 No stacks to deploy - STACKS variable is empty!"
            exit 1
          elif [ -z "$DEPLOYED_STACKS" ]; then
            echo "💥 No stacks were actually deployed - check stack discovery!"
            exit 1
          elif [ -n "$FAILED_STACKS" ]; then
            echo "💥 Deployments failed for:$FAILED_STACKS"
            exit 1
          fi
          echo "🎉 All stacks deployed successfully in parallel!"
          EOF
      - name: Health Check All Services
        id: health
        if: steps.backup.outputs.deployment_needed == 'true' && steps.deploy.outcome == 'success'
        run: |
          echo "🔍 Health checking all services"
          # Source retry functions
          source /tmp/retry.sh
          # Parse inputs outside SSH context
          STACKS="${{ join(fromJSON(inputs.stacks), ' ') }}"
          HAS_DOCKGE="${{ inputs.has-dockge }}"
          # Execute health check and capture structured output
          # Temporarily disable set -e to capture the exit code from command substitution
          # Use retry mechanism for the health check
          set +e
          HEALTH_RESULT=$(ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} /bin/bash -s $STACKS \"$HAS_DOCKGE\"" << 'EOF'
          set -e
          # Get arguments passed to the script (stack names followed by HAS_DOCKGE)
          TOTAL_ARGS=$#
          # Find HAS_DOCKGE by looking for 'true' or 'false' in the args
          HAS_DOCKGE=""
          for i in $(seq 1 $TOTAL_ARGS); do
            ARG="${!i}"
            if [ "$ARG" = "true" ] || [ "$ARG" = "false" ]; then
              HAS_DOCKGE="$ARG"
              # All args before this position are stack names
              STACKS="${@:1:$((i-1))}"
              break
            fi
          done
          # Set the 1Password token via environment (expanded by Actions before the script is sent)
          export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
          # Set timeout configuration with defaults
          HEALTH_CHECK_TIMEOUT=${{ inputs.health-check-timeout }}
          HEALTH_CHECK_CMD_TIMEOUT=${{ inputs.health-check-command-timeout }}
          # Critical stacks list (JSON array), consumed by is_critical_service below
          CRITICAL_SERVICES='${{ inputs.critical-services }}'
          # Enhanced health check with exponential backoff
          echo "🔍 Starting enhanced health check with exponential backoff..."
          # Health check function with retry logic
          health_check_with_retry() {
            local stack=$1
            local logfile="/tmp/health_${stack}.log"
            # Use configurable timeout with fallback to defaults
            local timeout_seconds=${HEALTH_CHECK_TIMEOUT:-180}
            local max_attempts=4
            local wait_time=3
            local attempt=1
            local fast_fail_threshold=2 # Fast fail after 2 attempts if no progress
            local start_time=$(date +%s)
            # Create log file and redirect all output
            exec 3>&1 4>&2
            exec 1>"$logfile" 2>&1
            # Ensure file descriptors are restored on function exit
            trap 'exec 1>&3 2>&4 3>&- 4>&-' RETURN
            echo "🕰️ Health check timeout configured: ${timeout_seconds}s"
            echo "🔍 Health checking $stack with optimized retry logic..."
            cd "/opt/compose/$stack" || {
              echo "❌ $stack: Directory not found"
              return 1
            }
            # Cache total service count (doesn't change during the health check)
            local total_count
            total_count=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0")
            if [ "$total_count" -eq 0 ]; then
              echo "❌ $stack: No services defined in compose file"
              return 1
            fi
            local previous_running=0
            local no_progress_count=0
            while [ $attempt -le $max_attempts ]; do
              echo " Attempt $attempt/$max_attempts for $stack (wait: ${wait_time}s)"
              # Get container status and health with error handling
              local running_healthy running_starting running_unhealthy running_no_health
              local exited_count restarting_count running_count
              # Check overall timeout
              local current_time=$(date +%s)
              local elapsed=$((current_time - start_time))
              if [ $elapsed -gt $timeout_seconds ]; then
                echo "❌ $stack: Health check timed out after ${elapsed}s (limit: ${timeout_seconds}s)"
                return 1
              fi
              # Get container state and health in one call using a custom format
              # Format: Service State Health (tab-separated)
              local ps_output
              ps_output=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps --format '{{.Service}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo "")
              # Parse output to count the different states and health conditions
              running_healthy=0
              running_starting=0
              running_unhealthy=0
              running_no_health=0
              exited_count=0
              restarting_count=0
              while IFS=$'\t' read -r service state health; do
                # Skip empty lines
                [ -z "$service" ] && continue
                case "$state" in
                  running)
                    case "$health" in
                      healthy)
                        running_healthy=$((running_healthy + 1))
                        ;;
                      starting)
                        running_starting=$((running_starting + 1))
                        ;;
                      unhealthy)
                        running_unhealthy=$((running_unhealthy + 1))
                        ;;
                      *)
                        # No health check defined
                        running_no_health=$((running_no_health + 1))
                        ;;
                    esac
                    ;;
                  exited)
                    exited_count=$((exited_count + 1))
                    ;;
                  restarting)
                    restarting_count=$((restarting_count + 1))
                    ;;
                esac
              done <<< "$ps_output"
              # Total running containers (all health states)
              running_count=$((running_healthy + running_starting + running_unhealthy + running_no_health))
              echo " $stack status: $running_count/$total_count running (healthy: $running_healthy, starting: $running_starting, unhealthy: $running_unhealthy, no-check: $running_no_health), exited: $exited_count, restarting: $restarting_count"
              # Fast fail logic: unhealthy containers, or no progress with failures
              if [ "$running_unhealthy" -gt 0 ] && [ $attempt -ge $fast_fail_threshold ]; then
                echo "❌ $stack: Fast fail - $running_unhealthy unhealthy containers detected (attempt $attempt)"
                return 1
              elif [ $attempt -ge $fast_fail_threshold ] && [ "$running_count" -eq "$previous_running" ] && [ "$exited_count" -gt 0 ]; then
                no_progress_count=$((no_progress_count + 1))
                if [ $no_progress_count -ge 2 ]; then
                  echo "❌ $stack: Fast fail - no progress and containers failing (attempt $attempt)"
                  return 1
                fi
              else
                no_progress_count=0
              fi
              # Count healthy containers (healthy + no health check defined)
              local healthy_total=$((running_healthy + running_no_health))
              # Success condition: all containers running and healthy (or no health check)
              if [ "$healthy_total" -eq "$total_count" ] && [ "$total_count" -gt 0 ] && [ "$running_starting" -eq 0 ] && [ "$running_unhealthy" -eq 0 ] && [ "$exited_count" -eq 0 ] && [ "$restarting_count" -eq 0 ]; then
                echo "✅ $stack: All $total_count services healthy"
                return 0
              # Degraded but stable: all running containers healthy, but fewer than expected
              elif [ "$healthy_total" -gt 0 ] && [ "$healthy_total" -eq "$running_count" ] && [ "$running_starting" -eq 0 ] && [ "$running_unhealthy" -eq 0 ] && [ "$exited_count" -eq 0 ] && [ "$restarting_count" -eq 0 ]; then
                echo "⚠️ $stack: $healthy_total/$total_count services healthy (degraded but stable)"
                return 2 # Degraded but acceptable
              # Still starting: health checks initializing, allow retry
              elif [ "$running_starting" -gt 0 ] && [ "$running_unhealthy" -eq 0 ] && [ $attempt -lt $max_attempts ]; then
                echo " $stack: $running_starting services still initializing health checks..."
                sleep $wait_time
                wait_time=$((wait_time * 2))
                if [ $wait_time -gt 20 ]; then
                  wait_time=20
                fi
              # Final attempt failure
              elif [ $attempt -eq $max_attempts ]; then
                if [ "$running_unhealthy" -gt 0 ]; then
                  echo "❌ $stack: Failed - $running_unhealthy services unhealthy after $max_attempts attempts"
                elif [ "$running_starting" -gt 0 ]; then
                  echo "❌ $stack: Failed - $running_starting services still starting after $max_attempts attempts"
                else
                  echo "❌ $stack: Failed after $max_attempts attempts ($running_count/$total_count running, $healthy_total healthy)"
                fi
                return 1
              # Continue with exponential backoff
              else
                echo " $stack: Not ready yet, waiting ${wait_time}s..."
                sleep $wait_time
                wait_time=$((wait_time * 2))
                if [ $wait_time -gt 20 ]; then
                  wait_time=20
                fi
              fi
              previous_running=$running_count
              attempt=$((attempt + 1))
            done
          }
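          # Return convention (illustrative usage; 'myapp' is a hypothetical stack):
          # 0 = all services healthy, 2 = degraded but stable, 1 = failed or timed out
          #   health_check_with_retry myapp
          #   case $? in 0) echo healthy ;; 2) echo degraded ;; *) echo failed ;; esac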
          FAILED_STACKS=""
          DEGRADED_STACKS=""
          HEALTHY_STACKS=""
          TOTAL_CONTAINERS=0
          RUNNING_CONTAINERS=0
          if [ "$HAS_DOCKGE" = "true" ]; then
            echo "🔍 Health checking Dockge with retry logic..."
            cd /opt/dockge
            # Retry logic for Dockge with health check verification
            # ('local' is invalid at the top level of the script, so plain variables are used)
            dockge_max_attempts=3
            dockge_attempt=1
            dockge_wait=3
            dockge_healthy=0
            dockge_starting=0
            dockge_unhealthy=0
            dockge_no_health=0
            dockge_running=0
            dockge_healthy_total=0
            # Get total services
            DOCKGE_TOTAL=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose config --services 2>/dev/null | wc -l | tr -d " " || echo "0")
            while [ $dockge_attempt -le $dockge_max_attempts ]; do
              # Get Dockge state and health
              dockge_ps_output=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose ps --format '{{.Service}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo "")
              # Parse health states
              dockge_healthy=0
              dockge_starting=0
              dockge_unhealthy=0
              dockge_no_health=0
              while IFS=$'\t' read -r service state health; do
                [ -z "$service" ] && continue
                if [ "$state" = "running" ]; then
                  case "$health" in
                    healthy) dockge_healthy=$((dockge_healthy + 1)) ;;
                    starting) dockge_starting=$((dockge_starting + 1)) ;;
                    unhealthy) dockge_unhealthy=$((dockge_unhealthy + 1)) ;;
                    *) dockge_no_health=$((dockge_no_health + 1)) ;;
                  esac
                fi
              done <<< "$dockge_ps_output"
              dockge_running=$((dockge_healthy + dockge_starting + dockge_unhealthy + dockge_no_health))
              dockge_healthy_total=$((dockge_healthy + dockge_no_health))
              echo " Dockge attempt $dockge_attempt/$dockge_max_attempts: $dockge_running/$DOCKGE_TOTAL running (healthy: $dockge_healthy, starting: $dockge_starting, unhealthy: $dockge_unhealthy, no-check: $dockge_no_health)"
              # Success: all healthy
              if [ "$dockge_healthy_total" -eq "$DOCKGE_TOTAL" ] && [ "$DOCKGE_TOTAL" -gt 0 ] && [ "$dockge_starting" -eq 0 ] && [ "$dockge_unhealthy" -eq 0 ]; then
                break
              # Unhealthy detected - fail
              elif [ "$dockge_unhealthy" -gt 0 ]; then
                echo " Dockge has $dockge_unhealthy unhealthy services"
                break
              # Degraded but stable: some healthy, final attempt
              elif [ "$dockge_healthy_total" -gt 0 ] && [ "$dockge_unhealthy" -eq 0 ] && [ $dockge_attempt -eq $dockge_max_attempts ]; then
                break
              # Retry
              elif [ $dockge_attempt -lt $dockge_max_attempts ]; then
                echo " Dockge not ready, waiting ${dockge_wait}s..."
                sleep $dockge_wait
                dockge_wait=$((dockge_wait * 2))
              fi
              dockge_attempt=$((dockge_attempt + 1))
            done
            TOTAL_CONTAINERS=$((TOTAL_CONTAINERS + DOCKGE_TOTAL))
            RUNNING_CONTAINERS=$((RUNNING_CONTAINERS + dockge_running))
            if [ "$dockge_unhealthy" -gt 0 ]; then
              echo "❌ Dockge: $dockge_unhealthy services unhealthy"
              FAILED_STACKS="$FAILED_STACKS dockge"
            elif [ "$dockge_running" -eq 0 ]; then
              echo "❌ Dockge: 0/$DOCKGE_TOTAL services running"
              FAILED_STACKS="$FAILED_STACKS dockge"
            elif [ "$dockge_healthy_total" -eq "$DOCKGE_TOTAL" ]; then
              echo "✅ Dockge: All $DOCKGE_TOTAL services healthy"
              HEALTHY_STACKS="$HEALTHY_STACKS dockge"
            else
              echo "⚠️ Dockge: $dockge_healthy_total/$DOCKGE_TOTAL services healthy (degraded)"
              DEGRADED_STACKS="$DEGRADED_STACKS dockge"
            fi
          fi
| # Parse critical services list | ||
| # Note: CRITICAL_SERVICES contains stack names (not individual Docker service names) | ||
| # This matches stacks that are considered critical for the deployment | ||
| # Example: ["portainer", "dockge"] identifies these stacks as critical | ||
| CRITICAL_SERVICES_ARRAY=() | ||
| if [ -n "$CRITICAL_SERVICES" ] && [ "$CRITICAL_SERVICES" != "[]" ]; then | ||
| # Convert the JSON array to a bash array with jq for robust parsing that | ||
| # preserves spaces and special characters in stack names | ||
| readarray -t CRITICAL_SERVICES_ARRAY < <(echo "$CRITICAL_SERVICES" | jq -r '.[]') | ||
| echo "🚨 Critical stacks configured: ${CRITICAL_SERVICES_ARRAY[*]}" | ||
| fi | ||
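| # Illustrative parse (stack names taken from the example above): | ||
| #   CRITICAL_SERVICES='["portainer","dockge"]' | ||
| #   jq -r '.[]' emits one name per line, so readarray yields | ||
| #   CRITICAL_SERVICES_ARRAY=(portainer dockge) | ||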
| # Function to check if a stack is critical | ||
| # Parameter: stack name to check | ||
| # Returns: 0 if critical, 1 if not critical | ||
| is_critical_service() { | ||
| local stack_name=$1 | ||
| for critical in "${CRITICAL_SERVICES_ARRAY[@]}"; do | ||
| if [ "$stack_name" = "$critical" ]; then | ||
| return 0 | ||
| fi | ||
| done | ||
| return 1 | ||
| } | ||
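| # Usage sketch (hypothetical stack name): | ||
| #   if is_critical_service "portainer"; then echo "critical"; fi | ||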
| # Enhanced health checks with sequential retry logic and early exit | ||
| echo "🔍 Starting enhanced health checks with retry logic..." | ||
| CRITICAL_FAILURE=false | ||
| # Disable exit on error for health checks to ensure we reach output section | ||
| set +e | ||
| # Check each stack with the new enhanced health check | ||
| for STACK in $STACKS; do | ||
| echo "" | ||
| echo "🔍 Checking stack: $STACK" | ||
| health_check_with_retry "$STACK" | ||
| HEALTH_RESULT=$? | ||
| case $HEALTH_RESULT in | ||
| 0) | ||
| # Output already restored in health_check_with_retry | ||
| echo "✅ $STACK: Healthy" | ||
| HEALTHY_STACKS="$HEALTHY_STACKS $STACK" | ||
| ;; | ||
| 2) | ||
| # Output already restored in health_check_with_retry | ||
| echo "⚠️ $STACK: Degraded but stable" | ||
| DEGRADED_STACKS="$DEGRADED_STACKS $STACK" | ||
| # Check if degraded stack is critical | ||
| if is_critical_service "$STACK"; then | ||
| echo "🚨 CRITICAL SERVICE DEGRADED: $STACK" | ||
| echo " Continuing monitoring but flagging for attention" | ||
| fi | ||
| ;; | ||
| *) | ||
| # For failures, output is already restored in health_check_with_retry | ||
| echo "❌ $STACK: Failed health check" | ||
| FAILED_STACKS="$FAILED_STACKS $STACK" | ||
| # Check if failed stack is critical - trigger early exit | ||
| if is_critical_service "$STACK"; then | ||
| echo "🚨 CRITICAL SERVICE FAILURE: $STACK" | ||
| echo " This is a critical service failure - triggering early exit" | ||
| echo " Remaining stacks will not be health checked" | ||
| CRITICAL_FAILURE=true | ||
| break | ||
| fi | ||
| ;; | ||
| esac | ||
| done | ||
| # Count services across all stacks after health checks complete | ||
| echo "" | ||
| echo "📊 Counting services across all stacks..." | ||
| if [ -z "$STACKS" ]; then | ||
| echo "ERROR: STACKS variable is empty! Cannot count services." | ||
| echo "Will attempt to discover stacks from filesystem..." | ||
| DISCOVERED_STACKS="" | ||
| for dir in /opt/compose/*/; do | ||
| if [ -d "$dir" ] && [ -f "$dir/compose.yaml" ]; then | ||
| STACK_NAME=$(basename "$dir") | ||
| DISCOVERED_STACKS="$DISCOVERED_STACKS $STACK_NAME" | ||
| fi | ||
| done | ||
| STACKS=$(echo "$DISCOVERED_STACKS" | xargs) | ||
| echo "Discovered stacks: $STACKS" | ||
| fi | ||
| for STACK in $STACKS; do | ||
| STACK_RUNNING=$(cd "/opt/compose/$STACK" 2>/dev/null && op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps --services --filter "status=running" 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0") | ||
| STACK_TOTAL=$(cd "/opt/compose/$STACK" 2>/dev/null && op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0") | ||
| echo " $STACK: $STACK_RUNNING/$STACK_TOTAL services" | ||
| TOTAL_CONTAINERS=$((TOTAL_CONTAINERS + STACK_TOTAL)) | ||
| RUNNING_CONTAINERS=$((RUNNING_CONTAINERS + STACK_RUNNING)) | ||
| done | ||
| # Write outputs to temp file to ensure capture even if script exits early | ||
| TEMP_OUTPUT="/tmp/github_health_check_outputs.txt" | ||
| echo "healthy_stacks=$(echo $HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" > "$TEMP_OUTPUT" | ||
| echo "degraded_stacks=$(echo $DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" >> "$TEMP_OUTPUT" | ||
| echo "failed_stacks=$(echo $FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" >> "$TEMP_OUTPUT" | ||
| echo "total_containers=$TOTAL_CONTAINERS" >> "$TEMP_OUTPUT" | ||
| echo "running_containers=$RUNNING_CONTAINERS" >> "$TEMP_OUTPUT" | ||
| if [ "$TOTAL_CONTAINERS" -gt 0 ]; then | ||
| echo "success_rate=$(( RUNNING_CONTAINERS * 100 / TOTAL_CONTAINERS ))" >> "$TEMP_OUTPUT" | ||
| else | ||
| echo "success_rate=0" >> "$TEMP_OUTPUT" | ||
| fi | ||
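| # Example temp file contents (illustrative values only): | ||
| #   healthy_stacks=web, db | ||
| #   degraded_stacks= | ||
| #   failed_stacks=cache | ||
| #   total_containers=5 | ||
| #   running_containers=4 | ||
| #   success_rate=80 | ||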
| # Handle critical service failure | ||
| if [ "$CRITICAL_FAILURE" = true ]; then | ||
| echo "" | ||
| echo "❌ CRITICAL SERVICE FAILURE DETECTED" | ||
| echo " Deployment marked as failed due to critical service failure" | ||
| echo " Health check terminated early to prevent extended failure cycles" | ||
| # Emit outputs for early termination via the marker protocol: this script | ||
| # runs on the remote host, where the runner's $GITHUB_OUTPUT file does not | ||
| # exist, so the workflow side parses these marked lines instead | ||
| echo "health_status=failed_critical" >> "$TEMP_OUTPUT" | ||
| echo "GITHUB_OUTPUT_START" | ||
| cat "$TEMP_OUTPUT" | ||
| echo "GITHUB_OUTPUT_END" | ||
| exit 1 | ||
| fi | ||
| echo "📊 Total service count: $RUNNING_CONTAINERS/$TOTAL_CONTAINERS across all stacks" | ||
| # Display comprehensive health check results | ||
| echo "" | ||
| echo "📊 Health Check Summary:" | ||
| echo "════════════════════════" | ||
| echo "Total Services: $TOTAL_CONTAINERS" | ||
| echo "Running Services: $RUNNING_CONTAINERS" | ||
| if [ "$TOTAL_CONTAINERS" -gt 0 ]; then | ||
| echo "Success Rate: $(( RUNNING_CONTAINERS * 100 / TOTAL_CONTAINERS ))%" | ||
| else | ||
| echo "Success Rate: 0%" | ||
| fi | ||
| echo "" | ||
| # Display results by category | ||
| [ -n "$HEALTHY_STACKS" ] && echo "✅ Healthy Stacks: $(echo $HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" | ||
| [ -n "$DEGRADED_STACKS" ] && echo "⚠️ Degraded Stacks: $(echo $DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" | ||
| [ -n "$FAILED_STACKS" ] && echo "❌ Failed Stacks: $(echo $FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" | ||
| echo "" | ||
| echo "📋 Detailed Health Check Results:" | ||
| echo "════════════════════════════════════════════════════════════════" | ||
| for STACK in $STACKS; do | ||
| if [ -f "/tmp/health_${STACK}.log" ]; then | ||
| echo "" | ||
| echo "🔸 STACK: $STACK" | ||
| echo "────────────────────────────────────────────────────────────────" | ||
| cat "/tmp/health_${STACK}.log" | ||
| echo "────────────────────────────────────────────────────────────────" | ||
| else | ||
| echo "" | ||
| echo "🔸 STACK: $STACK" | ||
| echo "────────────────────────────────────────────────────────────────" | ||
| echo "⚠️ No health check log found for $STACK" | ||
| echo "────────────────────────────────────────────────────────────────" | ||
| fi | ||
| done | ||
| echo "════════════════════════════════════════════════════════════════" | ||
| # Output results in parseable format (temp file already written earlier) | ||
| echo "GITHUB_OUTPUT_START" | ||
| cat "$TEMP_OUTPUT" | ||
| echo "GITHUB_OUTPUT_END" | ||
| set -e # Re-enable exit on error after outputs are written | ||
| # Determine final health status | ||
| if [ -n "$FAILED_STACKS" ]; then | ||
| echo "" | ||
| echo "💥 Health check failed - some stacks are not running" | ||
| exit 1 | ||
| elif [ -n "$DEGRADED_STACKS" ]; then | ||
| echo "" | ||
| echo "⚠️ Health check passed with warnings - some services degraded" | ||
| exit 0 | ||
| else | ||
| echo "" | ||
| echo "🎉 All services are fully healthy!" | ||
| exit 0 | ||
| fi | ||
| EOF | ||
| ) | ||
| HEALTH_EXIT_CODE=$? | ||
| set -e | ||
| # Check if health check command failed | ||
| if [ $HEALTH_EXIT_CODE -ne 0 ]; then | ||
| echo "::error::Health check failed with exit code: $HEALTH_EXIT_CODE" | ||
| echo "💥 Health check command failed - marking deployment as failed" | ||
| # Still extract outputs for debugging before failing | ||
| echo "$HEALTH_RESULT" | ||
| if echo "$HEALTH_RESULT" | grep -q "GITHUB_OUTPUT_START"; then | ||
| echo "$HEALTH_RESULT" | sed -n '/GITHUB_OUTPUT_START/,/GITHUB_OUTPUT_END/p' | grep -E "^(healthy_stacks|degraded_stacks|failed_stacks|total_containers|running_containers|success_rate)=" >> "$GITHUB_OUTPUT" || true | ||
| fi | ||
| exit 1 | ||
| fi | ||
| # Extract health outputs from structured result | ||
| echo "$HEALTH_RESULT" | ||
| # Parse outputs without temporary files | ||
| if echo "$HEALTH_RESULT" | grep -q "GITHUB_OUTPUT_START"; then | ||
| echo "$HEALTH_RESULT" | sed -n '/GITHUB_OUTPUT_START/,/GITHUB_OUTPUT_END/p' | grep -E "^(healthy_stacks|degraded_stacks|failed_stacks|total_containers|running_containers|success_rate)=" >> "$GITHUB_OUTPUT" | ||
| else | ||
| echo "⚠️ GITHUB_OUTPUT_START marker not found, attempting to read from temp file..." | ||
| # Try to read from temp file on remote server | ||
| TEMP_FILE_CONTENT=$(ssh -o "StrictHostKeyChecking no" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} 'cat /tmp/github_health_check_outputs.txt 2>/dev/null' || echo "") | ||
| if [ -n "$TEMP_FILE_CONTENT" ]; then | ||
| echo "✅ Successfully read outputs from temp file" | ||
| echo "$TEMP_FILE_CONTENT" >> "$GITHUB_OUTPUT" | ||
| else | ||
| echo "❌ Could not read temp file, using fallback outputs" | ||
| # Fallback outputs if parsing fails | ||
| { | ||
| echo "healthy_stacks=" | ||
| echo "degraded_stacks=" | ||
| echo "failed_stacks=" | ||
| echo "total_containers=0" | ||
| echo "running_containers=0" | ||
| echo "success_rate=0" | ||
| } >> "$GITHUB_OUTPUT" | ||
| fi | ||
| fi | ||
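| # Note: 'sed -n "/GITHUB_OUTPUT_START/,/GITHUB_OUTPUT_END/p"' prints the | ||
| # inclusive marker range; the grep whitelist then keeps only the known | ||
| # key=value lines and drops the marker lines themselves | ||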
| - name: Cleanup unused images | ||
| id: cleanup | ||
| if: steps.backup.outputs.deployment_needed == 'true' && steps.deploy.outcome == 'success' && steps.health.outcome == 'success' | ||
| continue-on-error: true | ||
| run: | | ||
| echo "::group::Cleaning up unused Docker images" | ||
| ssh -o "StrictHostKeyChecking no" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} << EOF | ||
| echo "🧹 Cleaning up unused Docker images..." | ||
| docker image prune -f | ||
| echo "✅ Cleanup completed" | ||
| EOF | ||
| echo "::endgroup::" | ||
| - name: Rollback to Previous Version | ||
| id: rollback | ||
| if: steps.backup.outputs.deployment_needed == 'true' && (steps.deploy.outcome == 'failure' || steps.health.outcome == 'failure') | ||
| continue-on-error: true | ||
| run: | | ||
| echo "::group::Rolling back to previous deployment" | ||
| echo "🔄 **INITIATING ROLLBACK**" | ||
| echo "Previous SHA: ${{ steps.backup.outputs.previous_sha }}" | ||
| echo "Failed SHA: ${{ inputs.target-ref }}" | ||
| # Parse inputs outside SSH context | ||
| HAS_DOCKGE="${{ inputs.has-dockge }}" | ||
| PREVIOUS_SHA="${{ steps.backup.outputs.previous_sha }}" | ||
| COMPOSE_ARGS="${{ inputs.args || '' }}" | ||
| CRITICAL_SERVICES='${{ inputs.critical-services }}' | ||
| # Validate PREVIOUS_SHA before attempting rollback | ||
| if [ "$PREVIOUS_SHA" = "unknown" ] || [ -z "$PREVIOUS_SHA" ]; then | ||
| echo "❌ Cannot rollback: No previous deployment exists (first deployment)" | ||
| echo "::error::Rollback failed - no previous deployment to rollback to" | ||
| exit 1 | ||
| fi | ||
| # Validate SHA format (full 40-char SHA) | ||
| if ! [[ "$PREVIOUS_SHA" =~ ^[a-fA-F0-9]{40}$ ]]; then | ||
| echo "❌ Cannot rollback: Invalid previous SHA format: $PREVIOUS_SHA" | ||
| echo "::error::Rollback failed - invalid SHA format" | ||
| exit 1 | ||
| fi | ||
| echo "✅ Previous SHA validation passed: $PREVIOUS_SHA" | ||
| # Source retry functions | ||
| source /tmp/retry.sh | ||
| # Use retry mechanism for SSH connection (same as deploy) and capture output | ||
| ROLLBACK_RESULT=$(ssh_retry 3 10 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$HAS_DOCKGE\" \"$PREVIOUS_SHA\" \"$COMPOSE_ARGS\" \"$CRITICAL_SERVICES\"" << 'EOF' | ||
| set -e | ||
| # Get arguments passed to script (excluding sensitive OP_TOKEN) | ||
| HAS_DOCKGE="$1" | ||
| PREVIOUS_SHA="$2" | ||
| COMPOSE_ARGS="$3" | ||
| CRITICAL_SERVICES="$4" | ||
| # Set OP_TOKEN via environment (injected into the script text by the workflow) | ||
| export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}" | ||
| # Consolidate timeout values for easier maintenance | ||
| # These can be overridden by workflow inputs where available | ||
| GIT_FETCH_TIMEOUT=${{ inputs.git-fetch-timeout }} | ||
| GIT_CHECKOUT_TIMEOUT=${{ inputs.git-checkout-timeout }} | ||
| IMAGE_PULL_TIMEOUT=${{ inputs.image-pull-timeout }} | ||
| SERVICE_STARTUP_TIMEOUT=${{ inputs.service-startup-timeout }} | ||
| VALIDATION_ENV_TIMEOUT=${{ inputs.validation-env-timeout }} | ||
| VALIDATION_SYNTAX_TIMEOUT=${{ inputs.validation-syntax-timeout }} | ||
| echo "🔄 Rolling back to $PREVIOUS_SHA..." | ||
| # Add timeout protection to git operations | ||
| if ! timeout $GIT_FETCH_TIMEOUT git -C /opt/compose/ fetch; then | ||
| echo "❌ Git fetch failed or timed out (limit: ${GIT_FETCH_TIMEOUT}s)" | ||
| exit 1 | ||
| fi | ||
| if ! timeout $GIT_CHECKOUT_TIMEOUT git -C /opt/compose/ checkout $PREVIOUS_SHA; then | ||
| echo "❌ Git checkout failed or timed out (limit: ${GIT_CHECKOUT_TIMEOUT}s)" | ||
| exit 1 | ||
| fi | ||
| echo "✅ Repository rolled back to $PREVIOUS_SHA" | ||
| # Dynamically discover stacks based on the previous commit's structure | ||
| echo "🔍 Discovering stacks in previous commit..." | ||
| ROLLBACK_STACKS_ARRAY=() | ||
| cd /opt/compose | ||
| for dir in */; do | ||
| if [[ -d "$dir" && (-f "$dir/compose.yml" || -f "$dir/compose.yaml") ]]; then | ||
| STACK_NAME=$(basename "$dir") | ||
| ROLLBACK_STACKS_ARRAY+=("$STACK_NAME") | ||
| echo " Found stack: $STACK_NAME" | ||
| fi | ||
| done | ||
| if [ ${#ROLLBACK_STACKS_ARRAY[@]} -eq 0 ]; then | ||
| echo "⚠️ No stacks found in previous commit - rollback cannot proceed" | ||
| exit 1 | ||
| fi | ||
| # Use newline as the delimiter: bash variables and command substitution | ||
| # silently drop NUL bytes, so a NUL-delimited list would collapse into one | ||
| # undelimited string. Stack names are directory basenames, which cannot | ||
| # contain newlines, so newlines are safe here | ||
| ROLLBACK_STACKS=$(printf "%s\n" "${ROLLBACK_STACKS_ARRAY[@]}") | ||
| echo "📋 Stacks to rollback: ${ROLLBACK_STACKS_ARRAY[*]}" | ||
| # Output discovered stacks for the rollback-health step (space-delimited so | ||
| # the value survives the line-based grep/cut parsing on the runner side) | ||
| echo "DISCOVERED_ROLLBACK_STACKS=${ROLLBACK_STACKS_ARRAY[*]}" | ||
| # Deploy Dockge first if needed | ||
| if [ "$HAS_DOCKGE" = "true" ]; then | ||
| echo "🔄 Rolling back Dockge..." | ||
| cd /opt/dockge | ||
| # Add timeout protection for Dockge operations | ||
| if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then | ||
| echo "❌ Dockge image pull failed or timed out (limit: ${IMAGE_PULL_TIMEOUT}s)" | ||
| exit 1 | ||
| fi | ||
| if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then | ||
| echo "❌ Dockge startup failed or timed out (limit: ${SERVICE_STARTUP_TIMEOUT}s)" | ||
| exit 1 | ||
| fi | ||
| echo "✅ Dockge rolled back successfully" | ||
| fi | ||
| # Shared function to deploy or rollback a single stack | ||
| # This eliminates code duplication between deploy and rollback operations | ||
| process_stack() { | ||
| local STACK=$1 | ||
| local OPERATION=$2 # "deploy" or "rollback" | ||
| local LOGFILE="/tmp/${OPERATION}_${STACK}.log" | ||
| local EXITCODEFILE="/tmp/${OPERATION}_${STACK}.exitcode" | ||
| local exit_code=0 | ||
| # Run the work in a subshell so 'exit 1' aborts only this stack's operation | ||
| # and the exit-code bookkeeping after the subshell still runs | ||
| ( | ||
| if [ "$OPERATION" = "deploy" ]; then | ||
| echo "🚀 Deploying $STACK..." | ||
| else | ||
| echo "🔄 Rolling back $STACK..." | ||
| fi | ||
| cd "/opt/compose/$STACK" | ||
| echo " Pulling images for $STACK..." | ||
| # Add timeout protection (configurable via image-pull-timeout, default 600s) | ||
| if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then | ||
| echo "❌ Failed to pull images for $STACK during $OPERATION (timeout or error)" | ||
| exit 1 | ||
| fi | ||
| echo " Starting services for $STACK..." | ||
| # Add timeout protection (configurable via service-startup-timeout, default 300s) | ||
| if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then | ||
| echo "❌ Failed to start services for $STACK during $OPERATION (timeout or error)" | ||
| exit 1 | ||
| fi | ||
| if [ "$OPERATION" = "deploy" ]; then | ||
| echo "✅ $STACK deployed successfully" | ||
| else | ||
| echo "✅ $STACK rolled back successfully" | ||
| fi | ||
| } > "$LOGFILE" 2>&1 | ||
| # Capture and save exit code for robust error detection | ||
| local exit_code=$? | ||
| echo "$exit_code" > "$EXITCODEFILE" | ||
| return $exit_code | ||
| } | ||
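| # Usage sketch (hypothetical stack name "myapp"): | ||
| #   process_stack "myapp" "deploy"   # logs to /tmp/deploy_myapp.log, | ||
| #                                    # exit code in /tmp/deploy_myapp.exitcode | ||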
| # Wrapper function for rollback (uses shared process_stack) | ||
| rollback_stack() { | ||
| process_stack "$1" "rollback" | ||
| } | ||
| # Cleanup function for rollback logs | ||
| cleanup_rollback_logs() { | ||
| # Parse newline-delimited stacks into array | ||
| readarray -t ROLLBACK_STACKS_ARRAY <<< "$ROLLBACK_STACKS" | ||
| for STACK in "${ROLLBACK_STACKS_ARRAY[@]}"; do | ||
| rm -f "/tmp/rollback_${STACK}.log" 2>/dev/null | ||
| done | ||
| } | ||
| # Pre-rollback validation function | ||
| validate_all_rollback_stacks() { | ||
| echo "🔍 Pre-rollback validation of all stacks..." | ||
| local validation_failed=false | ||
| # Parse newline-delimited stacks into array | ||
| readarray -t ROLLBACK_STACKS_ARRAY <<< "$ROLLBACK_STACKS" | ||
| for STACK in "${ROLLBACK_STACKS_ARRAY[@]}"; do | ||
| echo " Validating $STACK..." | ||
| # Check if stack directory exists | ||
| if [ ! -d "/opt/compose/$STACK" ]; then | ||
| echo "❌ $STACK: Directory /opt/compose/$STACK not found" | ||
| validation_failed=true | ||
| continue | ||
| fi | ||
| cd "/opt/compose/$STACK" || { | ||
| echo "❌ $STACK: Cannot access directory" | ||
| validation_failed=true | ||
| continue | ||
| } | ||
| # Check if compose.yaml or compose.yml exists and determine which to use | ||
| COMPOSE_FILE="" | ||
| if [ -f "compose.yaml" ]; then | ||
| COMPOSE_FILE="compose.yaml" | ||
| elif [ -f "compose.yml" ]; then | ||
| COMPOSE_FILE="compose.yml" | ||
| else | ||
| echo "❌ $STACK: neither compose.yaml nor compose.yml found" | ||
| validation_failed=true | ||
| continue | ||
| fi | ||
| # Validate 1Password environment access and Docker Compose config | ||
| if ! timeout $VALIDATION_ENV_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f "$COMPOSE_FILE" config --services >/dev/null 2>&1; then | ||
| echo "❌ $STACK: Environment validation failed (1Password or compose config error)" | ||
| validation_failed=true | ||
| continue | ||
| fi | ||
| # Quick syntax validation | ||
| if ! timeout $VALIDATION_SYNTAX_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f "$COMPOSE_FILE" config --quiet 2>/dev/null; then | ||
| echo "❌ $STACK: Docker Compose syntax validation failed" | ||
| validation_failed=true | ||
| continue | ||
| fi | ||
| echo "✅ $STACK: Pre-rollback validation passed" | ||
| done | ||
| if [ "$validation_failed" = true ]; then | ||
| echo "❌ Pre-rollback validation failed for one or more stacks" | ||
| echo " Stopping rollback to prevent extended failures" | ||
| return 1 | ||
| fi | ||
| echo "✅ All stacks passed pre-rollback validation" | ||
| return 0 | ||
| } | ||
| # Set trap for cleanup on exit | ||
| trap cleanup_rollback_logs EXIT | ||
| # Run pre-rollback validation | ||
| if ! validate_all_rollback_stacks; then | ||
| echo "ROLLBACK_STATUS=failed_validation" >> "$GITHUB_OUTPUT" | ||
| exit 1 | ||
| fi | ||
| # Start all rollback deployments in parallel | ||
| echo "🔄 Starting parallel rollback of all stacks..." | ||
| ROLLBACK_PIDS="" | ||
| # Map each PID to its stack name for improved error reporting | ||
| # Note: Requires Bash 4.0+ for associative arrays (GitHub Actions runners use Bash 5.x) | ||
| declare -A ROLLBACK_PID_TO_STACK | ||
| # Parse newline-delimited stacks into array | ||
| readarray -t ROLLBACK_STACKS_ARRAY <<< "$ROLLBACK_STACKS" | ||
| for STACK in "${ROLLBACK_STACKS_ARRAY[@]}"; do | ||
| echo "🔄 Rolling back $STACK..." | ||
| rollback_stack "$STACK" & | ||
| PID=$! | ||
| ROLLBACK_PIDS="$ROLLBACK_PIDS $PID" | ||
| ROLLBACK_PID_TO_STACK[$PID]=$STACK | ||
| echo "Started rollback of $STACK (PID: $PID)" | ||
| done | ||
| # Wait for all rollback deployments and collect results | ||
| echo "⏳ Waiting for all rollbacks to complete..." | ||
| FAILED_ROLLBACKS="" | ||
| ROLLBACK_ERRORS="" | ||
| # Enhanced parallel job monitoring with proper error propagation: wait for | ||
| # jobs individually to capture exit codes and report stack names | ||
| for PID in $ROLLBACK_PIDS; do | ||
| STACK_NAME="${ROLLBACK_PID_TO_STACK[$PID]}" | ||
| if wait "$PID"; then | ||
| echo "✅ Rollback process $PID for stack $STACK_NAME completed successfully" | ||
| else | ||
| EXIT_CODE=$? | ||
| # Check if process was terminated by signal (exit code > 128) | ||
| if [ "$EXIT_CODE" -gt 128 ]; then | ||
| SIGNAL_NUM=$((EXIT_CODE - 128)) | ||
| # Try to get signal name (works on most systems) | ||
| if command -v kill >/dev/null 2>&1; then | ||
| SIGNAL_NAME=$(kill -l $SIGNAL_NUM 2>/dev/null || echo "SIG$SIGNAL_NUM") | ||
| else | ||
| SIGNAL_NAME="SIG$SIGNAL_NUM" | ||
| fi | ||
| echo "❌ Rollback process $PID for stack $STACK_NAME was terminated by signal $SIGNAL_NUM ($SIGNAL_NAME)" | ||
| ROLLBACK_ERRORS="$ROLLBACK_ERRORS STACK:$STACK_NAME:PID:$PID:TERMINATED_BY_SIGNAL:$SIGNAL_NUM:$SIGNAL_NAME" | ||
| else | ||
| echo "❌ Rollback process $PID for stack $STACK_NAME failed with exit code $EXIT_CODE" | ||
| ROLLBACK_ERRORS="$ROLLBACK_ERRORS STACK:$STACK_NAME:PID:$PID:EXIT_CODE:$EXIT_CODE" | ||
| fi | ||
| fi | ||
| done | ||
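| # Decoding illustration: a background job killed by SIGTERM exits with | ||
| # 128 + 15 = 143, so SIGNAL_NUM=15 and 'kill -l 15' prints TERM | ||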
| # Enhanced result analysis using exit code files (more robust than log parsing) | ||
| ROLLED_BACK_STACKS="" | ||
| SUCCESSFUL_ROLLBACKS="" | ||
| # Parse newline-delimited stacks into array | ||
| readarray -t ROLLBACK_STACKS_ARRAY <<< "$ROLLBACK_STACKS" | ||
| for STACK in "${ROLLBACK_STACKS_ARRAY[@]}"; do | ||
| if [ -f "/tmp/rollback_${STACK}.log" ]; then | ||
| ROLLED_BACK_STACKS="$ROLLED_BACK_STACKS $STACK" | ||
| # Primary: Check exit code file for robust error detection | ||
| if [ -f "/tmp/rollback_${STACK}.exitcode" ]; then | ||
| EXIT_CODE=$(cat "/tmp/rollback_${STACK}.exitcode") | ||
| if [ "$EXIT_CODE" -eq 0 ]; then | ||
| SUCCESSFUL_ROLLBACKS="$SUCCESSFUL_ROLLBACKS $STACK" | ||
| else | ||
| FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK" | ||
| echo "🔍 $STACK Rollback Error: Non-zero exit code ($EXIT_CODE)" | ||
| fi | ||
| else | ||
| # Fallback: Log-based error detection if exit code file is missing | ||
| echo "⚠️ $STACK: Exit code file missing - using less reliable log-based detection" | ||
| if grep -q "❌.*$STACK\|CRITICAL.*$STACK\|Failed.*$STACK\|Error.*$STACK" "/tmp/rollback_${STACK}.log"; then | ||
| FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK" | ||
| # Extract specific error for reporting | ||
| STACK_ERROR=$(grep -E "❌.*$STACK|CRITICAL.*$STACK|Failed.*$STACK|Error.*$STACK" "/tmp/rollback_${STACK}.log" | head -1) | ||
| echo "🔍 $STACK Rollback Error: $STACK_ERROR" | ||
| elif grep -q "✅.*$STACK\|Successfully.*$STACK" "/tmp/rollback_${STACK}.log"; then | ||
| SUCCESSFUL_ROLLBACKS="$SUCCESSFUL_ROLLBACKS $STACK" | ||
| else | ||
| echo "⚠️ $STACK: No clear success/failure indicator in logs - treating as potential failure" | ||
| FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK" | ||
| fi | ||
| fi | ||
| else | ||
| echo "⚠️ $STACK: No rollback log found - possible early failure" | ||
| FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK" | ||
| fi | ||
| done | ||
| # Summary of rollback results | ||
| echo "" | ||
| echo "📊 Rollback Summary:" | ||
| echo " Successful: $(echo $SUCCESSFUL_ROLLBACKS | wc -w | tr -d ' ') stacks" | ||
| echo " Failed: $(echo $FAILED_ROLLBACKS | wc -w | tr -d ' ') stacks" | ||
| if [ -n "$ROLLBACK_ERRORS" ]; then | ||
| echo " Process errors: $ROLLBACK_ERRORS" | ||
| fi | ||
| # Parse critical services list | ||
| # Note: CRITICAL_SERVICES contains stack names (not individual Docker service names) | ||
| # This matches stacks that are considered critical for the deployment | ||
| # Example: ["portainer", "dockge"] identifies these stacks as critical | ||
| CRITICAL_SERVICES_ARRAY=() | ||
| CRITICAL_FAILURE=false | ||
| if [ -n "$CRITICAL_SERVICES" ] && [ "$CRITICAL_SERVICES" != "[]" ]; then | ||
| # Convert the JSON array to a bash array with jq for robust parsing that | ||
| # preserves spaces and special characters in stack names | ||
| readarray -t CRITICAL_SERVICES_ARRAY < <(echo "$CRITICAL_SERVICES" | jq -r '.[]') | ||
| echo "🚨 Critical stacks configured: ${CRITICAL_SERVICES_ARRAY[*]}" | ||
| # Check if any failed rollback stack is critical | ||
| for FAILED_STACK in $FAILED_ROLLBACKS; do | ||
| for CRITICAL_STACK in "${CRITICAL_SERVICES_ARRAY[@]}"; do | ||
| if [ "$FAILED_STACK" = "$CRITICAL_STACK" ]; then | ||
| echo "🚨 CRITICAL STACK ROLLBACK FAILED: $FAILED_STACK" | ||
| echo " This is a critical stack - system may be in unsafe state" | ||
| CRITICAL_FAILURE=true | ||
| fi | ||
| done | ||
| done | ||
| fi | ||
| # Display all rollback logs | ||
| echo "" | ||
| echo "📋 Rollback Results:" | ||
| echo "════════════════════════════════════════════════════════════════" | ||
| # Parse newline-delimited stacks into array | ||
| readarray -t ROLLBACK_STACKS_ARRAY <<< "$ROLLBACK_STACKS" | ||
| for STACK in "${ROLLBACK_STACKS_ARRAY[@]}"; do | ||
| if [ -f "/tmp/rollback_${STACK}.log" ]; then | ||
| echo "" | ||
| echo "🔸 ROLLBACK STACK: $STACK" | ||
| echo "────────────────────────────────────────────────────────────────" | ||
| cat "/tmp/rollback_${STACK}.log" | ||
| echo "────────────────────────────────────────────────────────────────" | ||
| else | ||
| echo "" | ||
| echo "🔸 ROLLBACK STACK: $STACK" | ||
| echo "────────────────────────────────────────────────────────────────" | ||
| echo "⚠️ No rollback log found for $STACK" | ||
| echo "────────────────────────────────────────────────────────────────" | ||
| fi | ||
| done | ||
| echo "════════════════════════════════════════════════════════════════" | ||
| # Check if any rollbacks failed | ||
| if [ -z "$ROLLBACK_STACKS" ]; then | ||
| echo "💥 No stacks to rollback - ROLLBACK_STACKS variable is empty!" | ||
| exit 1 | ||
| elif [ -z "$ROLLED_BACK_STACKS" ]; then | ||
| echo "💥 No stacks were actually rolled back - check stack discovery!" | ||
| exit 1 | ||
| elif [ "$CRITICAL_FAILURE" = true ]; then | ||
| echo "" | ||
| echo "💥 CRITICAL SERVICE ROLLBACK FAILURE" | ||
| echo " One or more critical services failed to rollback" | ||
| echo " System may be in an unsafe state - manual intervention required" | ||
| echo " Failed critical services:$FAILED_ROLLBACKS" | ||
| exit 1 | ||
| elif [ -n "$FAILED_ROLLBACKS" ]; then | ||
| echo "💥 Rollbacks failed for:$FAILED_ROLLBACKS" | ||
| exit 1 | ||
| fi | ||
| echo "🎉 All stacks rolled back successfully!" | ||
| EOF | ||
| ) | ||
| # Extract rollback result and discovered stacks | ||
| echo "$ROLLBACK_RESULT" | ||
| # Parse discovered stacks output for rollback-health step (simplified - no markers needed) | ||
| if echo "$ROLLBACK_RESULT" | grep -q "DISCOVERED_ROLLBACK_STACKS="; then | ||
| DISCOVERED_STACKS=$(echo "$ROLLBACK_RESULT" | grep "DISCOVERED_ROLLBACK_STACKS=" | cut -d'=' -f2-) | ||
| echo "discovered_rollback_stacks=$DISCOVERED_STACKS" >> "$GITHUB_OUTPUT" | ||
| echo "✅ Captured discovered rollback stacks: $DISCOVERED_STACKS" | ||
| else | ||
| echo "⚠️ Could not parse discovered stacks, will use input stacks as fallback" | ||
| echo "discovered_rollback_stacks=${{ join(fromJSON(inputs.stacks), ' ') }}" >> "$GITHUB_OUTPUT" | ||
| fi | ||
| echo "::endgroup::" | ||
| # Health check runs after rollback attempt regardless of rollback success/failure | ||
| # This is intentional: we need to know the final system state even if rollback fails | ||
| # Using 'conclusion != skipped' instead of 'outcome == success' ensures we get | ||
| # visibility into what services are running, which is critical for incident response | ||
| - name: Verify Rollback Health | ||
| id: rollback-health | ||
| if: steps.backup.outputs.deployment_needed == 'true' && (steps.deploy.outcome == 'failure' || steps.health.outcome == 'failure') && steps.rollback.conclusion != 'skipped' | ||
| continue-on-error: true | ||
| run: | | ||
| echo "🔍 Verifying rollback health status" | ||
| # Source retry functions | ||
| source /tmp/retry.sh | ||
| # Use discovered rollback stacks instead of input stacks | ||
| # This ensures we check the stacks that were actually rolled back (from previous commit) | ||
| DISCOVERED_STACKS="${{ steps.rollback.outputs.discovered_rollback_stacks }}" | ||
| if [ -n "$DISCOVERED_STACKS" ]; then | ||
| # Convert null-delimited stacks to space-delimited for SSH arguments | ||
| readarray -d $'\0' -t STACKS_ARRAY <<< "$DISCOVERED_STACKS" | ||
| STACKS="${STACKS_ARRAY[*]}" | ||
| echo "✅ Using discovered rollback stacks: $STACKS" | ||
| else | ||
| # Fallback to input stacks if discovery failed | ||
| STACKS="${{ join(fromJSON(inputs.stacks), ' ') }}" | ||
| echo "⚠️ Using input stacks as fallback: $STACKS" | ||
| fi | ||
| HAS_DOCKGE="${{ inputs.has-dockge }}" | ||
| # Execute rollback health check | ||
| ROLLBACK_HEALTH_RESULT=$(ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} /bin/bash -s $STACKS \"$HAS_DOCKGE\"" << 'EOF' | ||
| set -e | ||
| # Get arguments passed to script | ||
| TOTAL_ARGS=$# | ||
| # Find HAS_DOCKGE by looking for 'true' or 'false' in the args | ||
| HAS_DOCKGE="" | ||
| for i in $(seq 1 $TOTAL_ARGS); do | ||
| ARG="${!i}" | ||
| if [ "$ARG" = "true" ] || [ "$ARG" = "false" ]; then | ||
| HAS_DOCKGE="$ARG" | ||
| # All args before this position are stack names | ||
| STACKS="${@:1:$((i-1))}" | ||
| break | ||
| fi | ||
| done | ||
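| # Parsing illustration (hypothetical args "web db true"): | ||
| #   STACKS="web db", HAS_DOCKGE="true" | ||
| # Caveat: a stack literally named "true" or "false" would end the scan early | ||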
| # Set OP_TOKEN via environment | ||
| export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}" | ||
| # Set configurable timeout for health check commands (default: 15 seconds) | ||
| HEALTH_CHECK_CMD_TIMEOUT=${{ inputs.health-check-command-timeout }} | ||
| if [ -z "$HEALTH_CHECK_CMD_TIMEOUT" ]; then | ||
| echo "ℹ️ HEALTH_CHECK_CMD_TIMEOUT not provided, using default 15 seconds" | ||
| HEALTH_CHECK_CMD_TIMEOUT=15 | ||
| fi | ||
| # Validate that HEALTH_CHECK_CMD_TIMEOUT is an integer | ||
| if ! [[ "$HEALTH_CHECK_CMD_TIMEOUT" =~ ^[0-9]+$ ]]; then | ||
| echo "⚠️ HEALTH_CHECK_CMD_TIMEOUT ($HEALTH_CHECK_CMD_TIMEOUT) is not an integer, using default 15" | ||
| HEALTH_CHECK_CMD_TIMEOUT=15 | ||
| fi | ||
| # Enforce minimum and maximum limits for HEALTH_CHECK_CMD_TIMEOUT | ||
| HEALTH_CHECK_CMD_TIMEOUT_MIN=5 | ||
| HEALTH_CHECK_CMD_TIMEOUT_MAX=60 | ||
| if [ "$HEALTH_CHECK_CMD_TIMEOUT" -lt "$HEALTH_CHECK_CMD_TIMEOUT_MIN" ]; then | ||
| echo "⚠️ HEALTH_CHECK_CMD_TIMEOUT ($HEALTH_CHECK_CMD_TIMEOUT) below minimum, using $HEALTH_CHECK_CMD_TIMEOUT_MIN" | ||
| HEALTH_CHECK_CMD_TIMEOUT=$HEALTH_CHECK_CMD_TIMEOUT_MIN | ||
| fi | ||
| if [ "$HEALTH_CHECK_CMD_TIMEOUT" -gt "$HEALTH_CHECK_CMD_TIMEOUT_MAX" ]; then | ||
| echo "⚠️ HEALTH_CHECK_CMD_TIMEOUT ($HEALTH_CHECK_CMD_TIMEOUT) above maximum, using $HEALTH_CHECK_CMD_TIMEOUT_MAX" | ||
| HEALTH_CHECK_CMD_TIMEOUT=$HEALTH_CHECK_CMD_TIMEOUT_MAX | ||
| fi | ||
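| # Clamping illustration: input 3 -> 5 (minimum), 120 -> 60 (maximum), | ||
| # "abc" -> 15 (non-integer falls back to the default above) | ||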
| echo "🔍 Verifying rollback health for all services..." | ||
| ROLLBACK_HEALTHY_STACKS="" | ||
| ROLLBACK_DEGRADED_STACKS="" | ||
| ROLLBACK_FAILED_STACKS="" | ||
| ROLLBACK_TOTAL_CONTAINERS=0 | ||
| ROLLBACK_RUNNING_CONTAINERS=0 | ||
| # Check Dockge health if applicable | ||
| if [ "$HAS_DOCKGE" = "true" ]; then | ||
| echo "🔍 Verifying Dockge rollback health..." | ||
| cd /opt/dockge | ||
| DOCKGE_RUNNING=$(op run --env-file=/opt/compose/compose.env -- docker compose ps --services --filter "status=running" | wc -l | tr -d " ") | ||
| # Use 'config --services' for the total so services that never started still count | ||
| DOCKGE_TOTAL=$(op run --env-file=/opt/compose/compose.env -- docker compose config --services | wc -l | tr -d " ") | ||
| ROLLBACK_TOTAL_CONTAINERS=$((ROLLBACK_TOTAL_CONTAINERS + DOCKGE_TOTAL)) | ||
| ROLLBACK_RUNNING_CONTAINERS=$((ROLLBACK_RUNNING_CONTAINERS + DOCKGE_RUNNING)) | ||
| if [ "$DOCKGE_RUNNING" -eq 0 ]; then | ||
| echo "❌ Dockge rollback: 0/$DOCKGE_TOTAL services running" | ||
| ROLLBACK_FAILED_STACKS="$ROLLBACK_FAILED_STACKS dockge" | ||
| elif [ "$DOCKGE_RUNNING" -lt "$DOCKGE_TOTAL" ]; then | ||
| echo "⚠️ Dockge rollback: $DOCKGE_RUNNING/$DOCKGE_TOTAL services running (degraded)" | ||
| ROLLBACK_DEGRADED_STACKS="$ROLLBACK_DEGRADED_STACKS dockge" | ||
| else | ||
| echo "✅ Dockge rollback: All $DOCKGE_RUNNING services healthy" | ||
| ROLLBACK_HEALTHY_STACKS="$ROLLBACK_HEALTHY_STACKS dockge" | ||
| fi | ||
| fi | ||
| # Simple health check for each rolled back stack | ||
| for STACK in $STACKS; do | ||
| echo "" | ||
| echo "🔍 Verifying rollback health for stack: $STACK" | ||
| cd "/opt/compose/$STACK" || { | ||
| echo "❌ $STACK: Directory not accessible after rollback" | ||
| ROLLBACK_FAILED_STACKS="$ROLLBACK_FAILED_STACKS $STACK" | ||
| continue | ||
| } | ||
| # Get basic health status | ||
| RUNNING_COUNT=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps --services --filter "status=running" 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0") | ||
| TOTAL_COUNT=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0") | ||
| ROLLBACK_TOTAL_CONTAINERS=$((ROLLBACK_TOTAL_CONTAINERS + TOTAL_COUNT)) | ||
| ROLLBACK_RUNNING_CONTAINERS=$((ROLLBACK_RUNNING_CONTAINERS + RUNNING_COUNT)) | ||
| if [ "$RUNNING_COUNT" -eq 0 ]; then | ||
| echo "❌ $STACK rollback: 0/$TOTAL_COUNT services running" | ||
| ROLLBACK_FAILED_STACKS="$ROLLBACK_FAILED_STACKS $STACK" | ||
| elif [ "$RUNNING_COUNT" -lt "$TOTAL_COUNT" ]; then | ||
| echo "⚠️ $STACK rollback: $RUNNING_COUNT/$TOTAL_COUNT services running (degraded)" | ||
| ROLLBACK_DEGRADED_STACKS="$ROLLBACK_DEGRADED_STACKS $STACK" | ||
| else | ||
| echo "✅ $STACK rollback: All $RUNNING_COUNT services healthy" | ||
| ROLLBACK_HEALTHY_STACKS="$ROLLBACK_HEALTHY_STACKS $STACK" | ||
| fi | ||
| done | ||
| # Calculate success rate | ||
| if [ "$ROLLBACK_TOTAL_CONTAINERS" -gt 0 ]; then | ||
| ROLLBACK_SUCCESS_RATE=$(( ROLLBACK_RUNNING_CONTAINERS * 100 / ROLLBACK_TOTAL_CONTAINERS )) | ||
| else | ||
| ROLLBACK_SUCCESS_RATE=0 | ||
| fi | ||
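| # Arithmetic note: bash integer division truncates, e.g. 9 of 10 running | ||
| # gives ROLLBACK_SUCCESS_RATE=90, while 2 of 3 gives 66 (not 67) | ||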
| echo "" | ||
| echo "📊 Rollback Health Verification Summary:" | ||
| echo "════════════════════════════════════════" | ||
| echo "Total Services: $ROLLBACK_TOTAL_CONTAINERS" | ||
| echo "Running Services: $ROLLBACK_RUNNING_CONTAINERS" | ||
| echo "Success Rate: ${ROLLBACK_SUCCESS_RATE}%" | ||
| echo "" | ||
| [ -n "$ROLLBACK_HEALTHY_STACKS" ] && echo "✅ Healthy After Rollback: $(echo $ROLLBACK_HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" | ||
| [ -n "$ROLLBACK_DEGRADED_STACKS" ] && echo "⚠️ Degraded After Rollback: $(echo $ROLLBACK_DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" | ||
| [ -n "$ROLLBACK_FAILED_STACKS" ] && echo "❌ Failed After Rollback: $(echo $ROLLBACK_FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" | ||
| # Output structured results (simplified - no markers needed) | ||
| echo "ROLLBACK_HEALTH_HEALTHY=$(echo $ROLLBACK_HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" | ||
| echo "ROLLBACK_HEALTH_DEGRADED=$(echo $ROLLBACK_DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" | ||
| echo "ROLLBACK_HEALTH_FAILED=$(echo $ROLLBACK_FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" | ||
| echo "ROLLBACK_HEALTH_TOTAL_CONTAINERS=$ROLLBACK_TOTAL_CONTAINERS" | ||
| echo "ROLLBACK_HEALTH_RUNNING_CONTAINERS=$ROLLBACK_RUNNING_CONTAINERS" | ||
| echo "ROLLBACK_HEALTH_SUCCESS_RATE=$ROLLBACK_SUCCESS_RATE" | ||
| # Determine rollback verification status | ||
| if [ -n "$ROLLBACK_FAILED_STACKS" ]; then | ||
| echo "" | ||
| echo "⚠️ Rollback completed but some services are still unhealthy" | ||
| echo "Manual intervention may be required" | ||
| exit 0 # Don't fail the workflow, rollback itself was successful | ||
| else | ||
| echo "" | ||
| echo "🎉 Rollback verified - all services are healthy or degraded but stable" | ||
| exit 0 | ||
| fi | ||
| EOF | ||
| ) | ||
| # Extract rollback health outputs | ||
| echo "$ROLLBACK_HEALTH_RESULT" | ||
| # Parse rollback health outputs (simplified - direct variable extraction) | ||
| if echo "$ROLLBACK_HEALTH_RESULT" | grep -q "ROLLBACK_HEALTH_"; then | ||
| # Extract each output variable directly | ||
| HEALTHY=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_HEALTHY=" | cut -d'=' -f2-) | ||
| DEGRADED=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_DEGRADED=" | cut -d'=' -f2-) | ||
| FAILED=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_FAILED=" | cut -d'=' -f2-) | ||
| TOTAL=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_TOTAL_CONTAINERS=" | cut -d'=' -f2-) | ||
| RUNNING=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_RUNNING_CONTAINERS=" | cut -d'=' -f2-) | ||
| RATE=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_SUCCESS_RATE=" | cut -d'=' -f2-) | ||
| { | ||
| echo "rollback_healthy_stacks=${HEALTHY:-}" | ||
| echo "rollback_degraded_stacks=${DEGRADED:-}" | ||
| echo "rollback_failed_stacks=${FAILED:-}" | ||
| echo "rollback_total_containers=${TOTAL:-0}" | ||
| echo "rollback_running_containers=${RUNNING:-0}" | ||
| echo "rollback_success_rate=${RATE:-0}" | ||
| } >> "$GITHUB_OUTPUT" | ||
| else | ||
| # Fallback outputs if parsing fails | ||
| { | ||
| echo "rollback_healthy_stacks=" | ||
| echo "rollback_degraded_stacks=" | ||
| echo "rollback_failed_stacks=" | ||
| echo "rollback_total_containers=0" | ||
| echo "rollback_running_containers=0" | ||
| echo "rollback_success_rate=0" | ||
| } >> "$GITHUB_OUTPUT" | ||
| fi | ||
| - name: Cleanup SSH connections | ||
| if: always() | ||
| run: | | ||
| # Close SSH connection multiplexing | ||
| echo "🧹 Cleaning up SSH connections..." | ||
| ssh -o "StrictHostKeyChecking no" deployment-server -O exit 2>/dev/null || true | ||
| # Clean up SSH control sockets | ||
| rm -f ~/.ssh/sockets/* 2>/dev/null || true | ||
| echo "✅ SSH cleanup completed" | ||
| - name: Report Deployment Status | ||
| if: always() | ||
| run: | | ||
| echo "::group::Deployment Summary" | ||
| # Parse stacks from JSON input and create display list | ||
| STACK_LIST="${{ join(fromJson(inputs.stacks), ', ') }}" | ||
| if [ "${{ inputs.has-dockge }}" = "true" ]; then | ||
| STACK_LIST="dockge, $STACK_LIST" | ||
| fi | ||
| if [ "${{ steps.backup.outputs.deployment_needed }}" != "true" ]; then | ||
| echo "ℹ️ **NO DEPLOYMENT NEEDED**" | ||
| echo "✅ Repository already at target commit" | ||
| echo "📋 Target stacks: $STACK_LIST" | ||
| echo "🔄 SHA: ${{ inputs.target-ref }}" | ||
| elif [ "${{ inputs.force-deploy }}" = "true" ] && [ "${{ steps.deploy.outcome }}" == "success" ] && [ "${{ steps.health.outcome }}" == "success" ]; then | ||
| echo "🔄 **FORCE DEPLOYMENT SUCCESSFUL**" | ||
| echo "✅ All stacks force-deployed and healthy" | ||
| echo "📋 Deployed stacks: $STACK_LIST" | ||
| echo "🔄 SHA: ${{ inputs.target-ref }}" | ||
| if [ "${{ steps.cleanup.outcome }}" == "success" ]; then | ||
| echo "🧹 Cleanup completed successfully" | ||
| fi | ||
| elif [ "${{ steps.deploy.outcome }}" == "success" ] && [ "${{ steps.health.outcome }}" == "success" ]; then | ||
| echo "🎉 **DEPLOYMENT SUCCESSFUL**" | ||
| echo "✅ All stacks deployed and healthy" | ||
| echo "📋 Deployed stacks: $STACK_LIST" | ||
| echo "🔄 SHA: ${{ inputs.target-ref }}" | ||
| if [ "${{ steps.cleanup.outcome }}" == "success" ]; then | ||
| echo "🧹 Cleanup completed successfully" | ||
| fi | ||
| else | ||
| echo "💥 **DEPLOYMENT FAILED**" | ||
| echo "❌ Deploy status: ${{ steps.deploy.outcome }}" | ||
| echo "❌ Health check status: ${{ steps.health.outcome }}" | ||
| if [ "${{ steps.rollback.outcome }}" == "success" ]; then | ||
| echo "🔄 Rollback completed successfully" | ||
| if [ "${{ steps.rollback-health.outcome }}" == "success" ]; then | ||
| echo "✅ Rollback verification passed" | ||
| elif [ "${{ steps.rollback-health.outcome }}" == "failure" ]; then | ||
| echo "⚠️ Rollback verification failed - manual intervention may be needed" | ||
| fi | ||
| else | ||
| echo "❌ Rollback status: ${{ steps.rollback.outcome }}" | ||
| fi | ||
| exit 1 | ||
| fi | ||
| echo "::endgroup::" | ||
| notify: | ||
| name: Discord Notification | ||
| runs-on: ubuntu-24.04 | ||
| needs: [deploy] | ||
| if: always() | ||
| steps: | ||
| - name: Configure 1Password Service Account | ||
| uses: 1password/load-secrets-action/configure@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0 | ||
| with: | ||
| service-account-token: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }} | ||
| - name: Get commit message | ||
| id: commit-msg | ||
| run: | | ||
| COMMIT_MSG=$(curl -s -H "Authorization: token ${{ github.token }}" \ | ||
| "https://api.github.com/repos/${{ github.repository }}/commits/${{ inputs.target-ref }}" \ | ||
| | jq -r '.commit.message // "No commit message available"' \ | ||
| | head -1) | ||
| SHORT_SHA="${{ inputs.target-ref }}" | ||
| SHORT_SHA="${SHORT_SHA:0:7}" | ||
| echo "message=$COMMIT_MSG" >> "$GITHUB_OUTPUT" | ||
| echo "short-sha=$SHORT_SHA" >> "$GITHUB_OUTPUT" | ||
| - name: Load Discord webhook and user ID | ||
| id: op-load-discord | ||
| uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0 | ||
| with: | ||
| unset-previous: true | ||
| env: | ||
| DISCORD_WEBHOOK: ${{ inputs.webhook-url }} | ||
| DISCORD_USER_ID: ${{ inputs.discord-user-id != '' && inputs.discord-user-id || 'SKIP' }} | ||
| - name: Send Discord notification | ||
| uses: sarisia/actions-status-discord@b8381b25576cb341b2af39926ab42c5056cc44ed # v1.15.5 | ||
| with: | ||
| webhook: ${{ steps.op-load-discord.outputs.DISCORD_WEBHOOK }} | ||
| status: ${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && 'success' || needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && 'success' || 'failure' }} | ||
| title: "🚀 ${{ inputs.repo-name }} • ${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && 'No Changes' || needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && 'Deployed' || needs.deploy.outputs.rollback_status == 'success' && 'Rolled Back' || 'Failed' }}" | ||
| description: | | ||
| ${{ (needs.deploy.outputs.deploy_status == 'failure' || needs.deploy.outputs.health_status == 'failure' || needs.deploy.result == 'failure') && inputs.discord-user-id != '' && steps.op-load-discord.outputs.DISCORD_USER_ID != 'SKIP' && format('<@{0}> ', steps.op-load-discord.outputs.DISCORD_USER_ID) || '' }}${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && '📋 **Repository already at target commit**' || | ||
| inputs.force-deploy == true && needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && '🔄 **Force deployment completed successfully**' || | ||
| needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && '✅ **Deployment completed successfully**' || | ||
| needs.deploy.outputs.rollback_status == 'success' && '🔄 **Deployment failed but rolled back successfully**' || '❌ **Deployment failed**' }} | ||
| ${{ needs.deploy.outputs.deployment_needed == 'true' && needs.deploy.outputs.rollback_status != 'success' && | ||
| format('**📊 Health Status** | ||
| 🟢 Running: {0}/{1} services ({2}%)', needs.deploy.outputs.running_containers || '0', | ||
| needs.deploy.outputs.total_containers || '0', needs.deploy.outputs.success_rate || '0') || | ||
| needs.deploy.outputs.rollback_status == 'success' && format('**📊 Rollback Health** | ||
| 🟢 Running: {0}/{1} services ({2}%)', | ||
| needs.deploy.outputs.rollback_running_containers || '0', needs.deploy.outputs.rollback_total_containers || '0', | ||
| needs.deploy.outputs.rollback_success_rate || '0') || '' }} | ||
| ${{ needs.deploy.outputs.rollback_status != 'success' && (needs.deploy.outputs.healthy_stacks != '' || needs.deploy.outputs.degraded_stacks != '' || needs.deploy.outputs.failed_stacks != '') && | ||
| format('**🏷️ Stack Status** | ||
| {0}{1}{2}', | ||
| needs.deploy.outputs.healthy_stacks != '' && format('✅ {0} | ||
| ', needs.deploy.outputs.healthy_stacks) || '', | ||
| needs.deploy.outputs.degraded_stacks != '' && format('⚠️ {0} | ||
| ', needs.deploy.outputs.degraded_stacks) || '', | ||
| needs.deploy.outputs.failed_stacks != '' && format('❌ {0}', needs.deploy.outputs.failed_stacks) || '') || '' }} | ||
| ${{ needs.deploy.outputs.rollback_status == 'success' && (needs.deploy.outputs.rollback_healthy_stacks != '' || needs.deploy.outputs.rollback_degraded_stacks != '' || needs.deploy.outputs.rollback_failed_stacks != '') && | ||
| format('**🏷️ Rollback Stack Status** | ||
| {0}{1}{2}', | ||
| needs.deploy.outputs.rollback_healthy_stacks != '' && format('✅ {0} | ||
| ', needs.deploy.outputs.rollback_healthy_stacks) || '', | ||
| needs.deploy.outputs.rollback_degraded_stacks != '' && format('⚠️ {0} | ||
| ', needs.deploy.outputs.rollback_degraded_stacks) || '', | ||
| needs.deploy.outputs.rollback_failed_stacks != '' && format('❌ {0}', needs.deploy.outputs.rollback_failed_stacks) || '') || '' }} | ||
| ${{ needs.deploy.outputs.deployment_needed == 'true' && format('**🔄 Pipeline Status** | ||
| {0} Deploy → {1} Health → {2} Cleanup{3}', | ||
| needs.deploy.outputs.deploy_status == 'success' && '✅' || '❌', | ||
| needs.deploy.outputs.health_status == 'success' && '✅' || needs.deploy.outputs.health_status == 'skipped' && '⏭️' || '❌', | ||
| needs.deploy.outputs.cleanup_status == 'success' && '✅' || needs.deploy.outputs.cleanup_status == 'skipped' && '⏭️' || '❌', | ||
| needs.deploy.outputs.rollback_status != 'skipped' && format(' → {0} Rollback{1}', | ||
| needs.deploy.outputs.rollback_status == 'success' && '✅' || '❌', | ||
| needs.deploy.outputs.rollback_health_status == 'success' && ' → ✅ Verify' || | ||
| needs.deploy.outputs.rollback_health_status == 'failure' && ' → ❌ Verify' || '') || '') || '' }} | ||
| ${{ github.event_name == 'workflow_dispatch' && '🔧 **Triggered manually**' || format('📝 **Commit:** [`{0}`](https://github.com/{1}/commit/{2}) {3}', steps.commit-msg.outputs.short-sha, github.repository, inputs.target-ref, steps.commit-msg.outputs.message) }} | ||
| **⏱️ Duration:** ${{ github.event_name != 'workflow_dispatch' && '3min' || 'Manual' }} | ||
| color: ${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && 0x6c757d || | ||
| needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && 0x28a745 || | ||
| needs.deploy.outputs.rollback_status == 'success' && 0xffc107 || | ||
| needs.deploy.outputs.degraded_stacks != '' && 0xfd7e14 || 0xdc3545 }} | ||
| username: "Compose Deploy" | ||
| avatar_url: "https://cdn-icons-png.flaticon.com/512/919/919853.png" | ||
| - name: Unload Discord webhook | ||
| uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0 | ||
| with: | ||
| unset-previous: true | ||