feat(deploy): add deployment script library foundation #5

Workflow file for this run

name: Deploy Docker Compose

Check failure on line 1 in .github/workflows/deploy.yml

GitHub Actions / .github/workflows/deploy.yml

Invalid workflow file

(Line: 1214, Col: 14): Exceeded max expression length 21000
# Future Refactoring Recommendations:
# 1. Extract bash logic into separate composite actions or standalone scripts
# - Retry mechanisms (retry.sh, ssh_retry) could be a reusable composite action
# - Process_stack function could be extracted to a standalone script
# - Validation functions could be moved to a validation composite action
# 2. Consider splitting deploy and rollback into separate reusable workflows
# - Would improve readability and make each workflow easier to maintain
# - Could share common functionality via composite actions
# 3. Abstract common patterns into reusable functions
# - Exit code and log handling patterns appear multiple times
# - Could create helper functions for result parsing and analysis
# Note: Current implementation prioritizes single-file simplicity for easier debugging
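# Illustrative sketch (not part of this workflow): if the retry helpers were
# extracted as suggested above, a composite action wrapping retry.sh might look
# roughly like the following (hypothetical path .github/actions/retry/action.yml):
#
#   name: "Retry command"
#   description: "Run a shell command with retries and exponential backoff"
#   inputs:
#     command:
#       description: "Command to run"
#       required: true
#     max-attempts:
#       description: "Maximum number of attempts"
#       default: "3"
#     delay:
#       description: "Initial delay in seconds (doubles after each failure)"
#       default: "5"
#   runs:
#     using: "composite"
#     steps:
#       - shell: bash
#         run: |
#           source "${{ github.action_path }}/retry.sh"
#           retry "${{ inputs.max-attempts }}" "${{ inputs.delay }}" "${{ inputs.command }}"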
on:
workflow_call:
inputs:
args:
description: "docker compose up -d arguments"
required: false
type: string
stacks:
description: "JSON array of stack names to deploy"
required: true
type: string
webhook-url:
description: "1Password reference to Discord webhook URL"
required: true
type: string
repo-name:
description: "Repository display name for notifications"
required: true
type: string
target-ref:
description: "Git reference to checkout on remote server"
required: true
type: string
has-dockge:
description: "Whether this deployment includes Dockge"
required: false
type: boolean
default: false
force-deploy:
description: "Force deployment even if repository is already at target commit"
required: false
type: boolean
default: false
health-check-timeout:
description: "Health check timeout in seconds (default: 180)"
required: false
type: number
default: 180
health-check-command-timeout:
description: "Individual health check command timeout in seconds (default: 15)"
required: false
type: number
default: 15
critical-services:
description: "JSON array of critical service names that should trigger early exit on failure"
required: false
type: string
default: '[]'
git-fetch-timeout:
description: "Git fetch operation timeout in seconds (default: 300)"
required: false
type: number
default: 300
git-checkout-timeout:
description: "Git checkout operation timeout in seconds (default: 60)"
required: false
type: number
default: 60
image-pull-timeout:
description: "Docker image pull timeout in seconds (default: 600)"
required: false
type: number
default: 600
service-startup-timeout:
description: "Service startup timeout in seconds (default: 300)"
required: false
type: number
default: 300
validation-env-timeout:
description: "Environment validation timeout in seconds (default: 30)"
required: false
type: number
default: 30
validation-syntax-timeout:
description: "Syntax validation timeout in seconds (default: 60)"
required: false
type: number
default: 60
discord-user-id:
description: "Discord user ID to mention in failure notifications (e.g., '<@123456789>')"
required: false
type: string
default: ''
jobs:
deploy:
runs-on: ubuntu-24.04
if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
timeout-minutes: 40 # Overall job timeout
outputs:
previous_sha: ${{ steps.backup.outputs.previous_sha }}
deployment_needed: ${{ steps.backup.outputs.deployment_needed }}
deleted_files: ${{ steps.changed-files.outputs.deleted_files }}
deploy_status: ${{ steps.deploy.outcome }}
health_status: ${{ steps.health.outcome }}
cleanup_status: ${{ steps.cleanup.outcome }}
rollback_status: ${{ steps.rollback.outcome }}
rollback_health_status: ${{ steps.rollback-health.outcome }}
discovered_rollback_stacks: ${{ steps.rollback.outputs.discovered_rollback_stacks }}
healthy_stacks: ${{ steps.health.outputs.healthy_stacks }}
degraded_stacks: ${{ steps.health.outputs.degraded_stacks }}
failed_stacks: ${{ steps.health.outputs.failed_stacks }}
total_containers: ${{ steps.health.outputs.total_containers }}
running_containers: ${{ steps.health.outputs.running_containers }}
success_rate: ${{ steps.health.outputs.success_rate }}
rollback_healthy_stacks: ${{ steps.rollback-health.outputs.rollback_healthy_stacks }}
rollback_degraded_stacks: ${{ steps.rollback-health.outputs.rollback_degraded_stacks }}
rollback_failed_stacks: ${{ steps.rollback-health.outputs.rollback_failed_stacks }}
rollback_total_containers: ${{ steps.rollback-health.outputs.rollback_total_containers }}
rollback_running_containers: ${{ steps.rollback-health.outputs.rollback_running_containers }}
rollback_success_rate: ${{ steps.rollback-health.outputs.rollback_success_rate }}
removed_stacks: ${{ steps.cleanup-removed.outputs.removed_stacks }}
has_removed_stacks: ${{ steps.cleanup-removed.outputs.has_removed_stacks }}
steps:
- name: Validate and sanitize inputs
run: |
# Validate stacks parameter is valid JSON
echo '${{ inputs.stacks }}' | jq -r '.[]' >/dev/null || {
echo "::error::Invalid stacks JSON format: ${{ inputs.stacks }}"
exit 1
}
# Validate stack names contain only safe characters
echo '${{ inputs.stacks }}' | jq -r '.[]' | while read -r stack; do
if [[ ! "$stack" =~ ^[a-zA-Z0-9_-]+$ ]]; then
echo "::error::Invalid stack name: $stack. Only alphanumeric, underscore, and hyphen allowed."
exit 1
fi
# Check stack name length
if [ ${#stack} -gt 50 ]; then
echo "::error::Stack name too long: $stack (max 50 characters)"
exit 1
fi
done
# Validate target-ref format
TARGET_REF="${{ inputs.target-ref }}"
# Check if it's a valid commit SHA (7-40 hex chars) or branch/tag name
if [[ "$TARGET_REF" =~ ^[a-fA-F0-9]{7,40}$ ]] || [[ "$TARGET_REF" =~ ^[a-zA-Z0-9_-]+$ ]] || [[ "$TARGET_REF" =~ ^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$ ]]; then
echo "✅ Target-ref format valid: $TARGET_REF"
else
echo "::error::Invalid target-ref format: $TARGET_REF"
echo "::error::Expected: commit SHA (7-40 hex chars) or branch/tag name"
exit 1
fi
# Validate and sanitize compose args
COMPOSE_ARGS="${{ inputs.args }}"
if [[ -n "$COMPOSE_ARGS" ]]; then
# Check for dangerous characters and patterns
if [[ "$COMPOSE_ARGS" =~ [\;\&\|\`\$\\] ]]; then
echo "::error::Compose args contain potentially dangerous characters: $COMPOSE_ARGS"
echo "::error::Prohibited characters: ; & | \` $ \\"
exit 1
fi
# Check for suspicious patterns
if [[ "$COMPOSE_ARGS" =~ (rm|kill|shutdown|reboot|format|dd|\>|\<|sudo|su) ]]; then
echo "::error::Compose args contain prohibited commands: $COMPOSE_ARGS"
exit 1
fi
# Validate against known docker compose options - allow hyphens, spaces, and equals for arguments
if [[ "$COMPOSE_ARGS" =~ ^[a-zA-Z0-9[:space:]_=.-]+$ ]]; then
echo "✅ Compose args format valid: $COMPOSE_ARGS"
else
echo "::error::Compose args contain invalid characters: $COMPOSE_ARGS"
exit 1
fi
fi
# Validate webhook URL format
WEBHOOK_URL="${{ inputs.webhook-url }}"
if [[ ! "$WEBHOOK_URL" =~ ^op://[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$ ]]; then
echo "::error::Invalid webhook URL format: $WEBHOOK_URL"
echo "::error::Expected format: op://vault/item/field"
exit 1
fi
# Validate repo name
REPO_NAME="${{ inputs.repo-name }}"
if [[ ! "$REPO_NAME" =~ ^[a-zA-Z0-9_-]+$ ]] || [ ${#REPO_NAME} -gt 100 ]; then
echo "::error::Invalid repo name: $REPO_NAME"
echo "::error::Must be alphanumeric with hyphens/underscores, max 100 chars"
exit 1
fi
echo "✅ All input validation passed"
- name: Display version information
run: |
echo "📋 Workflow Version Information"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Repository: ${{ inputs.repo-name }}"
echo "Target ref: ${{ inputs.target-ref }}"
echo "Stacks: ${{ inputs.stacks }}"
echo "Runner: ${{ runner.os }} ${{ runner.arch }}"
echo ""
echo "ℹ️ Reusable workflow SHA shown in 'Uses:' line above"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
- name: Verify required tools
run: |
echo "🔍 Verifying required tools are available..."
# Check for jq (required for JSON parsing)
if ! command -v jq &> /dev/null; then
echo "::error::jq is not installed. Installing..."
sudo apt-get update -qq
sudo apt-get install -y jq
fi
echo "✅ jq version: $(jq --version)"
# Check for timeout (required for command timeouts)
if ! command -v timeout &> /dev/null; then
echo "::error::timeout command is not available"
exit 1
fi
echo "✅ timeout is available (part of coreutils)"
# Check for readarray (Bash 4.0+ built-in for array operations)
if ! declare -p BASH_VERSION &> /dev/null || [ "${BASH_VERSION%%.*}" -lt 4 ]; then
echo "::error::Bash 4.0+ is required (current: ${BASH_VERSION:-unknown})"
exit 1
fi
echo "✅ Bash version: $BASH_VERSION (supports readarray and associative arrays)"
echo "✅ All required tools verified"
- name: Setup retry mechanism
run: |
# Create retry function for bash commands
cat > /tmp/retry.sh << 'EOF'
#!/bin/bash
retry() {
local max_attempts=$1
local delay=$2
local command="${@:3}"
local attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Attempt $attempt of $max_attempts: $command"
if eval "$command"; then
echo "✅ Command succeeded on attempt $attempt"
return 0
else
echo "❌ Command failed on attempt $attempt"
if [ $attempt -lt $max_attempts ]; then
echo "⏳ Waiting ${delay}s before retry..."
sleep $delay
delay=$((delay * 2)) # Exponential backoff
fi
attempt=$((attempt + 1))
fi
done
echo "💥 Command failed after $max_attempts attempts"
return 1
}
# Create SSH retry function with specific error handling
ssh_retry() {
local max_attempts=$1
local delay=$2
local ssh_cmd="${@:3}"
local attempt=1
local last_exit_code=1
while [ $attempt -le $max_attempts ]; do
echo "SSH Attempt $attempt of $max_attempts" >&2
if eval "$ssh_cmd"; then
echo "✅ SSH command succeeded on attempt $attempt" >&2
return 0
else
last_exit_code=$?
echo "❌ SSH command failed on attempt $attempt (exit code: $last_exit_code)" >&2
# Check for specific SSH errors
case $last_exit_code in
255) echo "SSH connection error - network/auth issue" >&2 ;;
1) echo "General SSH error" >&2 ;;
*) echo "Unknown error code: $last_exit_code" >&2 ;;
esac
if [ $attempt -lt $max_attempts ]; then
echo "⏳ Waiting ${delay}s before SSH retry..." >&2
sleep $delay
fi
attempt=$((attempt + 1))
fi
done
echo "💥 SSH command failed after $max_attempts attempts (final exit code: $last_exit_code)" >&2
return $last_exit_code
}
EOF
chmod +x /tmp/retry.sh
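# Example usage (illustrative): later steps source this file and wrap flaky
# commands, e.g.
#   source /tmp/retry.sh
#   retry 3 5 "curl -fsS https://example.com/health"            # 3 attempts, 5s initial delay, doubling
#   ssh_retry 3 10 "ssh deployment-server 'docker compose ps'"  # surfaces SSH exit codes (255 = connection/auth)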
- name: Cache deployment tools
uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
with:
path: |
~/.cache/pip
~/.cache/docker
~/.ssh
key: deploy-tools-${{ runner.os }}-v1
restore-keys: |
deploy-tools-${{ runner.os }}-
- name: Configure 1Password Service Account
uses: 1password/load-secrets-action/configure@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
service-account-token: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
- name: Load Tailscale credentials
id: load-tailscale-credentials
uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
unset-previous: true
env:
TAILSCALE_OAUTH_CLIENT_ID: "op://Docker/tailscale-oauth/client_id"
TAILSCALE_OAUTH_SECRET: "op://Docker/tailscale-oauth/secret"
- name: Connect to Tailnet
uses: tailscale/github-action@53acf823325fe9ca47f4cdaa951f90b4b0de5bb9 # v4.1.1
with:
oauth-client-id: ${{ steps.load-tailscale-credentials.outputs.TAILSCALE_OAUTH_CLIENT_ID }}
oauth-secret: ${{ steps.load-tailscale-credentials.outputs.TAILSCALE_OAUTH_SECRET }}
tags: tag:ci
ping: ${{ secrets.SSH_HOST }}
- name: Unload Tailscale credentials
uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
unset-previous: true
- name: Optimize SSH connections
run: |
# Configure SSH connection multiplexing for better performance
mkdir -p ~/.ssh
cat >> ~/.ssh/config << EOF
Host deployment-server
HostName ${{ secrets.SSH_HOST }}
User ${{ secrets.SSH_USER }}
ControlMaster auto
ControlPath ~/.ssh/sockets/%r@%h:%p
ControlPersist 300
ServerAliveInterval 30
ServerAliveCountMax 3
Compression yes
TCPKeepAlive yes
EOF
# Create control socket directory and pre-establish SSH connection
mkdir -p ~/.ssh/sockets
echo "🔗 Pre-establishing SSH connection for multiplexing..."
ssh -o "StrictHostKeyChecking no" deployment-server -O check 2>/dev/null || \
ssh -o "StrictHostKeyChecking no" deployment-server -O forward -N &
# Give the connection a moment to establish
sleep 2
echo "✅ SSH connection optimization configured"
- name: Checkout repository for change detection
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0 # Fetch full history for accurate change detection
- name: Determine previous deployment SHA
id: determine-previous
run: |
# Use retry mechanism for SSH connection
source /tmp/retry.sh
# Get current deployment SHA with error handling
echo "🔍 Checking current deployment SHA for change detection..."
if CURRENT_SHA=$(ssh_retry 3 5 "ssh -o 'StrictHostKeyChecking no' ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} 'cd /opt/compose && git rev-parse HEAD 2>/dev/null'"); then
# Validate SHA format
if [[ "$CURRENT_SHA" =~ ^[a-fA-F0-9]{40}$ ]]; then
echo "✅ Current deployed SHA: $CURRENT_SHA"
echo "previous_sha=$CURRENT_SHA" >> "$GITHUB_OUTPUT"
else
echo "⚠️ Invalid SHA format from server: $CURRENT_SHA"
echo "⚠️ Using HEAD^ as fallback for change detection"
echo "previous_sha=HEAD^" >> "$GITHUB_OUTPUT"
fi
else
echo "⚠️ Could not retrieve current deployment SHA - using HEAD^ for change detection"
echo "previous_sha=HEAD^" >> "$GITHUB_OUTPUT"
fi
- name: Get changed files for removal detection
id: changed-files
if: steps.determine-previous.outputs.previous_sha != inputs.target-ref
continue-on-error: true
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
with:
json: true
sha: ${{ inputs.target-ref }}
base_sha: ${{ steps.determine-previous.outputs.previous_sha }}
- name: Store current deployment for rollback
id: backup
run: |
echo "::group::Preparing deployment backup"
# Use retry mechanism for SSH connection
source /tmp/retry.sh
# Get current deployment SHA with error handling
echo "🔍 Checking current deployment SHA..."
if CURRENT_SHA=$(ssh_retry 3 5 "ssh -o 'StrictHostKeyChecking no' ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} 'cd /opt/compose && git rev-parse HEAD 2>/dev/null'"); then
# Validate SHA format
if [[ "$CURRENT_SHA" =~ ^[a-fA-F0-9]{40}$ ]]; then
echo "✅ Current deployed SHA: $CURRENT_SHA"
else
echo "⚠️ Invalid SHA format from server: $CURRENT_SHA"
CURRENT_SHA="unknown"
fi
else
echo "⚠️ Could not retrieve current deployment SHA - assuming first deployment"
CURRENT_SHA="unknown"
fi
TARGET_REF="${{ inputs.target-ref }}"
echo "🎯 Target deployment ref: $TARGET_REF"
# Resolve target ref to SHA if it's not already a SHA
if [[ "$TARGET_REF" =~ ^[a-fA-F0-9]{40}$ ]]; then
TARGET_SHA="$TARGET_REF"
echo "✅ Target ref is already a full SHA"
elif [[ "$TARGET_REF" =~ ^[a-fA-F0-9]{7,39}$ ]]; then
TARGET_SHA="$TARGET_REF"
echo "✅ Target ref is a short SHA, will resolve on server"
else
TARGET_SHA="$TARGET_REF"
echo "✅ Target ref is a branch/tag name, will resolve on server"
fi
# Set outputs with proper validation
echo "previous_sha=${CURRENT_SHA}" >> "$GITHUB_OUTPUT"
if [ "$CURRENT_SHA" = "$TARGET_SHA" ] && [ "${{ inputs.force-deploy }}" != "true" ]; then
echo "⚠️ Repository is already at target commit - no deployment needed"
echo "deployment_needed=false" >> "$GITHUB_OUTPUT"
elif [ "$CURRENT_SHA" = "$TARGET_SHA" ] && [ "${{ inputs.force-deploy }}" = "true" ]; then
echo "🔄 Force deployment requested - proceeding despite same commit"
echo "deployment_needed=true" >> "$GITHUB_OUTPUT"
else
echo "✅ Deployment needed - proceeding with update"
echo "deployment_needed=true" >> "$GITHUB_OUTPUT"
fi
echo "::endgroup::"
# ================================================================
# STACK REMOVAL DETECTION AND CLEANUP
# ================================================================
# Automatically detect and clean up Docker stacks that have been
# removed from the repository using three independent detection methods.
#
# Detection Methods:
# 1. Git Diff: Compares current deployed SHA vs target SHA
# 2. Tree Comparison: Compares target commit tree vs server filesystem
# (catches removals from previous undeployed commits)
# 3. Discovery Analysis: Analyzes tj-actions/changed-files output
# (validates removals from GitHub perspective)
#
# Process:
# 1. Run all three detection methods independently on deployment server
# 2. Fail deployment if ANY detection method encounters errors (fail-safe)
# 3. Aggregate results using union approach (remove anything found by any method)
# 4. Deduplicate and validate stack names
# 5. Run 'docker compose down' for each removed stack
# 6. Fail deployment if any cleanup fails
# 7. Send Discord notification listing removed stacks
#
# Design: docs/plans/2025-12-06-enhanced-stack-removal-detection-design.md
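# Worked example (illustrative): if git diff finds {stack-a, stack-b}, tree
# comparison finds {stack-b, stack-c}, and discovery analysis finds nothing,
# the aggregated removal set (union, deduplicated) is {stack-a, stack-b, stack-c}
# and 'docker compose down' runs once for each of those three stacks.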
- name: Detect and clean up removed stacks
id: cleanup-removed
if: steps.backup.outputs.deployment_needed == 'true'
continue-on-error: false
run: |
# Source retry functions
source /tmp/retry.sh
# === DETECTION FUNCTION: GIT DIFF ===
# Purpose: Detect stacks removed between two git commits
# Inputs: $1=current_sha, $2=target_ref
# Output: Newline-separated list of stack names (stdout)
# Returns: 0 on success, 1 on error
detect_removed_stacks_gitdiff() {
local current_sha="$1"
local target_ref="$2"
# Build detection script
local detect_script
detect_script=$(cat << 'DETECT_EOF'
set -e
CURRENT_SHA="$1"
TARGET_REF="$2"
cd /opt/compose
# Fetch target ref to ensure we have it
if ! git fetch origin "$TARGET_REF" 2>/dev/null; then
echo "⚠️ Failed to fetch target ref, trying general fetch..." >&2
if ! git fetch 2>/dev/null; then
echo "::error::Failed to fetch repository updates" >&2
exit 1
fi
fi
# Resolve target ref to SHA for comparison
TARGET_SHA=$(git rev-parse "$TARGET_REF" 2>/dev/null || echo "$TARGET_REF")
# Validate both SHAs exist
if ! git cat-file -e "$CURRENT_SHA" 2>/dev/null; then
echo "::warning::Current SHA $CURRENT_SHA not found in repository (may have been replaced by force-push)" >&2
echo " Skipping git diff detection, will rely on tree comparison method" >&2
exit 1
fi
if ! git cat-file -e "$TARGET_SHA" 2>/dev/null; then
echo "::warning::Target SHA $TARGET_SHA not found in repository" >&2
echo " Skipping git diff detection, will rely on tree comparison method" >&2
exit 1
fi
# Find deleted compose.yaml files between current and target
git diff --diff-filter=D --name-only "$CURRENT_SHA" "$TARGET_SHA" 2>/dev/null | \
grep -E '^[^/]+/compose\.yaml$' | \
sed 's|/compose\.yaml||' || echo ""
DETECT_EOF
)
# Execute detection script on remote server
echo "$detect_script" | ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$current_sha\" \"$target_ref\""
}
# === DETECTION FUNCTION: TREE COMPARISON ===
# Purpose: Detect stacks on server filesystem missing from target commit tree
# Inputs: $1=target_ref
# Output: Newline-separated list of stack names (stdout)
# Returns: 0 on success, 1 on error
detect_removed_stacks_tree() {
local target_ref="$1"
# Build detection script
local detect_script
detect_script=$(cat << 'DETECT_TREE_EOF'
set -e
TARGET_REF="$1"
cd /opt/compose
# Fetch target ref to ensure we have it
if ! git fetch origin "$TARGET_REF" 2>/dev/null; then
echo "⚠️ Failed to fetch target ref, trying general fetch..." >&2
if ! git fetch 2>/dev/null; then
echo "::error::Failed to fetch repository updates" >&2
exit 1
fi
fi
# Resolve target ref to SHA
TARGET_SHA=$(git rev-parse "$TARGET_REF" 2>/dev/null || echo "$TARGET_REF")
# Validate target SHA exists
if ! git cat-file -e "$TARGET_SHA" 2>/dev/null; then
echo "::error::Target SHA $TARGET_SHA not found in repository" >&2
exit 1
fi
# Get directories in target commit (one level deep, directories only)
COMMIT_DIRS=$(git ls-tree --name-only "$TARGET_SHA" 2>/dev/null | sort)
# Get directories on server filesystem (exclude .git and hidden dirs)
SERVER_DIRS=$(find /opt/compose -maxdepth 1 -mindepth 1 -type d ! -name '.*' -exec basename {} \; 2>/dev/null | sort)
# Find directories on server but not in commit
MISSING_IN_COMMIT=$(comm -13 <(echo "$COMMIT_DIRS") <(echo "$SERVER_DIRS"))
# Filter for directories with compose.yaml files
for dir in $MISSING_IN_COMMIT; do
if [ -f "/opt/compose/$dir/compose.yaml" ]; then
echo "$dir"
fi
done
DETECT_TREE_EOF
)
# Execute detection script on remote server
echo "$detect_script" | ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$target_ref\""
}
# === DETECTION FUNCTION: DISCOVERY ANALYSIS ===
# Purpose: Analyze deleted files from tj-actions/changed-files output
# Inputs: $1=deleted_files_json (JSON array from tj-actions/changed-files)
# Output: Newline-separated list of stack names (stdout)
# Returns: 0 on success, 1 on error
detect_removed_stacks_discovery() {
local deleted_files_json="$1"
# Build detection script
local detect_script
detect_script=$(cat << 'DETECT_DISCOVERY_EOF'
set -e
DELETED_FILES_JSON="$1"
# Parse JSON array and filter for compose.yaml deletions
# Pattern: one level deep only (stack-name/compose.yaml)
echo "$DELETED_FILES_JSON" | jq -r '.[]' 2>/dev/null | \
grep -E '^[^/]+/compose\.yaml$' | \
sed 's|/compose\.yaml||' || echo ""
DETECT_DISCOVERY_EOF
)
# Execute detection script on remote server
echo "$detect_script" | ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$deleted_files_json\""
}
# === AGGREGATION FUNCTION ===
# Purpose: Merge and deduplicate results from all three detection methods
# Inputs: $1=gitdiff_stacks, $2=tree_stacks, $3=discovery_stacks (newline-separated lists)
# Output: Deduplicated newline-separated list of stack names (stdout)
# Returns: 0 on success (empty string if all inputs empty, not an error)
aggregate_removed_stacks() {
local gitdiff_stacks="$1"
local tree_stacks="$2"
local discovery_stacks="$3"
# Concatenate all three lists, remove empty lines, sort and deduplicate
{
echo "$gitdiff_stacks"
echo "$tree_stacks"
echo "$discovery_stacks"
} | \
grep -v '^$' | \
sort -u | \
grep -E '^[a-zA-Z0-9_-]+$' || echo ""
}
# === CLEANUP FUNCTION ===
# Purpose: Clean up a single removed stack using docker compose down
# Inputs: $1=stack_name
# Returns: 0 on success, 1 on error
# Note: Requires OP_SERVICE_ACCOUNT_TOKEN from GitHub secrets
# Security: Token passed as environment variable via heredoc (not command-line args) to avoid exposure in process listings
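# (Illustration of the risk being avoided: invoking "bash -s $STACK $OP_TOKEN" would expose the
# token in 'ps' output on the server; exporting it inside the heredoc keeps it off the argument list.)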
cleanup_stack() {
local stack="$1"
local op_token="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Build cleanup script that expects OP_SERVICE_ACCOUNT_TOKEN from environment
local cleanup_script
cleanup_script=$(cat << 'CLEANUP_EOF'
STACK="$1"
# Check if stack directory exists
if [ ! -d "/opt/compose/$STACK" ]; then
echo "⚠️ Stack directory not found for $STACK - already fully removed"
exit 0
fi
cd "/opt/compose/$STACK"
# Check if compose.yaml exists
if [ ! -f compose.yaml ]; then
echo "⚠️ compose.yaml not found for $STACK - may have been manually removed"
exit 0
fi
# Run docker compose down with 1Password
# Note: OP_SERVICE_ACCOUNT_TOKEN is set by the wrapper script
if op run --env-file=/opt/compose/compose.env -- docker compose -f ./compose.yaml down; then
echo "✅ Successfully cleaned up $STACK"
else
echo "❌ Failed to clean up $STACK"
exit 1
fi
CLEANUP_EOF
)
# Execute cleanup script on remote server
# Token passed as environment variable via heredoc to avoid exposure in process args
ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$stack\"" <<EOF
export OP_SERVICE_ACCOUNT_TOKEN="$op_token"
$cleanup_script
EOF
}
# === MAIN EXECUTION ===
echo "::group::Detecting removed stacks"
CURRENT_SHA="${{ steps.backup.outputs.previous_sha }}"
TARGET_REF="${{ inputs.target-ref }}"
# Skip detection if this is the first deployment
if [ "$CURRENT_SHA" = "unknown" ]; then
echo "ℹ️ First deployment detected - no previous stacks to remove"
echo "removed_stacks=" >> "$GITHUB_OUTPUT"
echo "has_removed_stacks=false" >> "$GITHUB_OUTPUT"
echo "::endgroup::"
exit 0
fi
echo "📊 Comparing commits:"
echo " Current: $CURRENT_SHA"
echo " Target: $TARGET_REF"
echo "🔍 Checking for removed stacks..."
# Read deleted files from changed-files step (may be empty if step failed)
DELETED_FILES='${{ steps.changed-files.outputs.deleted_files }}'
# Check if changed-files step succeeded
if [ "${{ steps.changed-files.outcome }}" != "success" ]; then
echo "⚠️ Changed-files detection failed (likely due to missing git ref)"
echo " Proceeding with git diff and tree comparison methods only"
fi
echo "🔍 Running three detection methods..."
# Execute all three detection methods independently
echo " 1. Git diff detection (commit comparison)..."
GITDIFF_STACKS=$(detect_removed_stacks_gitdiff "$CURRENT_SHA" "$TARGET_REF") || GITDIFF_EXIT=$?
echo " 2. Tree comparison detection (filesystem vs commit)..."
TREE_STACKS=$(detect_removed_stacks_tree "$TARGET_REF") || TREE_EXIT=$?
echo " 3. Discovery analysis detection (changed files)..."
if [ "$DELETED_FILES" = "[]" ] || [ -z "$DELETED_FILES" ]; then
# Empty JSON array or empty string - no deleted files to analyze
echo " ℹ️ No deleted files detected - skipping discovery analysis"
DISCOVERY_STACKS=""
DISCOVERY_EXIT=0
else
DISCOVERY_STACKS=$(detect_removed_stacks_discovery "$DELETED_FILES") || DISCOVERY_EXIT=$?
fi
# Fail deployment if any detection method failed (fail-safe)
if [ "${GITDIFF_EXIT:-0}" -ne 0 ]; then
echo "::error::Git diff detection failed (exit code: $GITDIFF_EXIT)"
exit 1
fi
if [ "${TREE_EXIT:-0}" -ne 0 ]; then
echo "::error::Tree comparison detection failed (exit code: $TREE_EXIT)"
exit 1
fi
if [ "${DISCOVERY_EXIT:-0}" -ne 0 ]; then
echo "::error::Discovery analysis detection failed (exit code: $DISCOVERY_EXIT)"
exit 1
fi
echo "✅ All detection methods completed successfully"
# Aggregate results (union of all three methods)
echo "📊 Aggregating results..."
REMOVED_STACKS=$(aggregate_removed_stacks "$GITDIFF_STACKS" "$TREE_STACKS" "$DISCOVERY_STACKS")
# Debug logging
if [ -n "$GITDIFF_STACKS" ]; then
echo " Git diff found: $(echo "$GITDIFF_STACKS" | tr '\n' ', ' | sed 's/,$//')"
fi
if [ -n "$TREE_STACKS" ]; then
echo " Tree comparison found: $(echo "$TREE_STACKS" | tr '\n' ', ' | sed 's/,$//')"
fi
if [ -n "$DISCOVERY_STACKS" ]; then
echo " Discovery analysis found: $(echo "$DISCOVERY_STACKS" | tr '\n' ', ' | sed 's/,$//')"
fi
# Process results
if [ -z "$REMOVED_STACKS" ]; then
echo "✅ No stacks to remove"
echo "removed_stacks=" >> "$GITHUB_OUTPUT"
echo "has_removed_stacks=false" >> "$GITHUB_OUTPUT"
else
echo "🗑️ Found stacks to remove:"
echo "$REMOVED_STACKS" | while read -r stack; do
echo " - $stack"
done
# Convert to JSON array for output
REMOVED_JSON=$(echo "$REMOVED_STACKS" | jq -R -s -c 'split("\n") | map(select(length > 0))')
echo "removed_stacks=$REMOVED_JSON" >> "$GITHUB_OUTPUT"
echo "has_removed_stacks=true" >> "$GITHUB_OUTPUT"
# Cleanup each removed stack
echo ""
echo "::group::Cleaning up removed stacks"
CLEANUP_FAILED=false
while IFS= read -r stack; do
[ -z "$stack" ] && continue
echo "🧹 Cleaning up stack: $stack"
if ! cleanup_stack "$stack"; then
echo "💥 Cleanup failed for stack: $stack"
CLEANUP_FAILED=true
break
fi
done <<< "$REMOVED_STACKS"
echo "::endgroup::"
if [ "$CLEANUP_FAILED" = "true" ]; then
echo "::error::Stack cleanup failed - stopping deployment"
exit 1
fi
echo "✅ All removed stacks cleaned successfully"
fi
echo "::endgroup::"
- name: Notify removed stacks cleanup
if: steps.cleanup-removed.outputs.has_removed_stacks == 'true'
run: |
echo "📢 Sending cleanup notification to Discord..."
# Get webhook URL from 1Password
WEBHOOK_URL=$(op read "${{ inputs.webhook-url }}")
# Build removed stacks list and create JSON payload using jq for proper escaping
REMOVED_STACKS='${{ steps.cleanup-removed.outputs.removed_stacks }}'
STACK_LIST=$(echo "$REMOVED_STACKS" | jq -r '.[] | "- " + .')
TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
# Build JSON payload with jq to ensure proper escaping
PAYLOAD=$(jq -n \
--arg title "🗑️ Stack Cleanup - ${{ inputs.repo-name }}" \
--arg description "Removed stacks have been cleaned up before deployment" \
--arg stacks "$STACK_LIST" \
--arg target "${{ inputs.target-ref }}" \
--arg previous "${{ steps.backup.outputs.previous_sha }}" \
--arg timestamp "$TIMESTAMP" \
'{
embeds: [{
title: $title,
description: $description,
color: 16753920,
fields: [
{name: "Removed Stacks", value: $stacks},
{name: "Target Commit", value: ("`" + $target + "`")},
{name: "Previous Commit", value: ("`" + $previous + "`")}
],
timestamp: $timestamp
}]
}')
# Send Discord notification
curl -X POST "$WEBHOOK_URL" \
-H "Content-Type: application/json" \
-d "$PAYLOAD"
echo "✅ Cleanup notification sent"
- name: Deploy All Stacks
id: deploy
if: steps.backup.outputs.deployment_needed == 'true'
continue-on-error: true
run: |
echo "🚀 Deploying all stacks"
# Source retry functions
source /tmp/retry.sh
# Set error handling
set -e
trap 'echo "❌ Deployment failed at line $LINENO"' ERR
# Parse inputs outside SSH context
STACKS="${{ join(fromJson(inputs.stacks), ' ') }}"
HAS_DOCKGE="${{ inputs.has-dockge }}"
TARGET_REF="${{ inputs.target-ref }}"
COMPOSE_ARGS="${{ inputs.args || '' }}"
# Use retry mechanism and optimized deployment
ssh_retry 3 10 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s $STACKS \"$HAS_DOCKGE\" \"$TARGET_REF\" \"$COMPOSE_ARGS\"" << 'EOF'
set -e
# Performance optimizations
export DOCKER_BUILDKIT=1
export COMPOSE_DOCKER_CLI_BUILD=1
# Enable parallel image pulls
export COMPOSE_PARALLEL_LIMIT=8
# Get arguments passed to script (excluding sensitive OP_TOKEN)
# Arguments: stack1 stack2 stack3 ... HAS_DOCKGE TARGET_REF [COMPOSE_ARGS]
# COMPOSE_ARGS might be empty, so we need to handle variable arg count
TOTAL_ARGS=$#
# Find HAS_DOCKGE by looking for 'true' or 'false' in the args
HAS_DOCKGE=""
TARGET_REF=""
COMPOSE_ARGS=""
# The last few args should be: HAS_DOCKGE TARGET_REF [COMPOSE_ARGS]
# HAS_DOCKGE is always 'true' or 'false'
# TARGET_REF is a commit SHA (starts with letter/number)
# COMPOSE_ARGS is optional and could be empty
for i in $(seq 1 $TOTAL_ARGS); do
ARG="${!i}"
if [ "$ARG" = "true" ] || [ "$ARG" = "false" ]; then
HAS_DOCKGE="$ARG"
TARGET_REF="${@:$((i+1)):1}"
if [ $((i+2)) -le $TOTAL_ARGS ]; then
COMPOSE_ARGS="${@:$((i+2)):1}"
fi
# All args before this position are stack names
STACKS="${@:1:$((i-1))}"
break
fi
done
# Set OP_TOKEN via environment (passed separately)
export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Consolidate timeout values for easier maintenance
# These can be overridden by workflow inputs where available
GIT_FETCH_TIMEOUT=${{ inputs.git-fetch-timeout }}
GIT_CHECKOUT_TIMEOUT=${{ inputs.git-checkout-timeout }}
IMAGE_PULL_TIMEOUT=${{ inputs.image-pull-timeout }}
SERVICE_STARTUP_TIMEOUT=${{ inputs.service-startup-timeout }}
VALIDATION_ENV_TIMEOUT=${{ inputs.validation-env-timeout }}
VALIDATION_SYNTAX_TIMEOUT=${{ inputs.validation-syntax-timeout }}
if [ "$HAS_DOCKGE" = "true" ]; then
echo "🚀 Deploying Dockge..."
cd /opt/dockge
# Add timeout protection for Dockge operations
if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
echo "❌ Dockge image pull timed out after ${IMAGE_PULL_TIMEOUT}s"
exit 1
fi
if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
echo "❌ Dockge startup timed out after ${SERVICE_STARTUP_TIMEOUT}s"
exit 1
fi
echo "✅ Dockge deployed successfully"
fi
echo "Updating repository to $TARGET_REF..."
# Add timeout protection to git operations
if ! timeout $GIT_FETCH_TIMEOUT git -C /opt/compose/ fetch; then
echo "❌ Git fetch timed out after ${GIT_FETCH_TIMEOUT}s"
exit 1
fi
if ! timeout $GIT_CHECKOUT_TIMEOUT git -C /opt/compose/ checkout $TARGET_REF; then
echo "❌ Git checkout timed out after ${GIT_CHECKOUT_TIMEOUT}s"
exit 1
fi
echo "✅ Repository updated to $TARGET_REF"
# Shared function to deploy or rollback a single stack
# This eliminates code duplication between deploy and rollback operations
process_stack() {
local STACK=$1
local OPERATION=$2 # "deploy" or "rollback"
local LOGFILE="/tmp/${OPERATION}_${STACK}.log"
local EXITCODEFILE="/tmp/${OPERATION}_${STACK}.exitcode"
{
if [ "$OPERATION" = "deploy" ]; then
echo "🚀 Deploying $STACK..."
else
echo "🔄 Rolling back $STACK..."
fi
cd /opt/compose/$STACK
echo " Pulling images for $STACK..."
# Add timeout protection for image pull (default 600s, configurable via image-pull-timeout)
if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
echo "❌ Failed to pull images for $STACK during $OPERATION (timeout or error)"
exit 1
fi
echo " Starting services for $STACK..."
# Add timeout protection for service startup (default 300s, configurable via service-startup-timeout)
if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
echo "❌ Failed to start services for $STACK during $OPERATION (timeout or error)"
exit 1
fi
if [ "$OPERATION" = "deploy" ]; then
echo "✅ $STACK deployed successfully"
else
echo "✅ $STACK rolled back successfully"
fi
} > "$LOGFILE" 2>&1
# Capture and save exit code for robust error detection
local exit_code=$?
echo "$exit_code" > "$EXITCODEFILE"
return $exit_code
}
# Wrapper function for deploy (maintains backward compatibility)
deploy_stack() {
process_stack "$1" "deploy"
}
# Cleanup function for deploy logs
cleanup_deploy_logs() {
for STACK in $STACKS; do
rm -f "/tmp/deploy_${STACK}.log" 2>/dev/null
done
}
# Pre-deployment validation function
validate_all_stacks() {
echo "🔍 Pre-deployment validation of all stacks..."
local validation_failed=false
for STACK in $STACKS; do
echo " Validating $STACK..."
# Check if stack directory exists
if [ ! -d "/opt/compose/$STACK" ]; then
echo "❌ $STACK: Directory /opt/compose/$STACK not found"
validation_failed=true
continue
fi
cd "/opt/compose/$STACK" || {
echo "❌ $STACK: Cannot access directory"
validation_failed=true
continue
}
# Check if compose.yaml exists
if [ ! -f "compose.yaml" ]; then
echo "❌ $STACK: compose.yaml not found"
validation_failed=true
continue
fi
# Validate 1Password environment access and Docker Compose config
if ! timeout $VALIDATION_ENV_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services >/dev/null 2>&1; then
echo "❌ $STACK: Environment validation failed (1Password or compose config error)"
validation_failed=true
continue
fi
# Quick syntax validation
if ! timeout $VALIDATION_SYNTAX_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --quiet 2>/dev/null; then
echo "❌ $STACK: Docker Compose syntax validation failed"
validation_failed=true
continue
fi
echo "✅ $STACK: Pre-deployment validation passed"
done
if [ "$validation_failed" = true ]; then
echo "❌ Pre-deployment validation failed for one or more stacks"
echo " Stopping deployment to prevent extended failures"
return 1
fi
echo "✅ All stacks passed pre-deployment validation"
return 0
}
# Run pre-deployment validation
if ! validate_all_stacks; then
echo "DEPLOYMENT_STATUS=failed_validation" >> "$GITHUB_OUTPUT"
exit 1
fi
# Set trap for cleanup on exit
trap cleanup_deploy_logs EXIT
# Start all deployments in parallel
echo "🚀 Starting parallel deployment of all stacks..."
PIDS=""
# Simple approach - use for loop directly with unquoted variable
for STACK in $STACKS; do
echo "🚀 Deploying $STACK..."
deploy_stack "$STACK" &
PIDS="$PIDS $!"
echo "Started deployment of $STACK (PID: $!)"
done
# Wait for all deployments and collect results
echo "⏳ Waiting for all deployments to complete..."
FAILED_STACKS=""
# Enhanced parallel job monitoring with better error propagation
echo "⏳ Monitoring parallel deployments..."
DEPLOYED_STACKS=""
SUCCESSFUL_STACKS=""
DEPLOYMENT_ERRORS=""
# Wait for jobs individually to capture exit codes
for PID in $PIDS; do
if wait "$PID"; then
echo "✅ Deployment process $PID completed successfully"
else
EXIT_CODE=$?
echo "❌ Deployment process $PID failed with exit code $EXIT_CODE"
DEPLOYMENT_ERRORS="$DEPLOYMENT_ERRORS PID:$PID:$EXIT_CODE"
fi
done
# Enhanced result analysis using exit code files (more robust than log parsing)
for STACK in $STACKS; do
if [ -f "/tmp/deploy_${STACK}.log" ]; then
DEPLOYED_STACKS="$DEPLOYED_STACKS $STACK"
# Primary: Check exit code file for robust error detection
if [ -f "/tmp/deploy_${STACK}.exitcode" ]; then
EXIT_CODE=$(cat "/tmp/deploy_${STACK}.exitcode")
if [ "$EXIT_CODE" -eq 0 ]; then
SUCCESSFUL_STACKS="$SUCCESSFUL_STACKS $STACK"
else
FAILED_STACKS="$FAILED_STACKS $STACK"
echo "🔍 $STACK Error: Non-zero exit code ($EXIT_CODE)"
fi
else
# Fallback: Log-based error detection if exit code file is missing
echo "⚠️ $STACK: Exit code file missing - using less reliable log-based detection"
if grep -q "❌.*$STACK\|CRITICAL.*$STACK\|Failed.*$STACK\|Error.*$STACK" "/tmp/deploy_${STACK}.log"; then
FAILED_STACKS="$FAILED_STACKS $STACK"
# Extract specific error for reporting
STACK_ERROR=$(grep -E "❌.*$STACK|CRITICAL.*$STACK|Failed.*$STACK|Error.*$STACK" "/tmp/deploy_${STACK}.log" | head -1)
echo "🔍 $STACK Error: $STACK_ERROR"
elif grep -q "✅.*$STACK\|Successfully.*$STACK" "/tmp/deploy_${STACK}.log"; then
SUCCESSFUL_STACKS="$SUCCESSFUL_STACKS $STACK"
else
echo "⚠️ $STACK: No clear success/failure indicator in logs - treating as potential failure"
FAILED_STACKS="$FAILED_STACKS $STACK"
fi
fi
else
echo "⚠️ $STACK: No deployment log found - possible early failure"
FAILED_STACKS="$FAILED_STACKS $STACK"
fi
done
# Summary of deployment results
echo ""
echo "📊 Deployment Summary:"
echo " Successful: $(echo $SUCCESSFUL_STACKS | wc -w | tr -d ' ') stacks"
echo " Failed: $(echo $FAILED_STACKS | wc -w | tr -d ' ') stacks"
if [ -n "$DEPLOYMENT_ERRORS" ]; then
echo " Process errors: $DEPLOYMENT_ERRORS"
fi
# Display deployment logs with enhanced formatting
echo ""
echo "📋 Detailed Deployment Results:"
echo "════════════════════════════════════════════════════════════════"
for STACK in $STACKS; do
if [ -f "/tmp/deploy_${STACK}.log" ]; then
echo ""
echo "🔸 STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
cat "/tmp/deploy_${STACK}.log"
echo "────────────────────────────────────────────────────────────────"
else
echo ""
echo "🔸 STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
echo "⚠️ No deployment log found for $STACK"
echo "────────────────────────────────────────────────────────────────"
fi
done
echo "════════════════════════════════════════════════════════════════"
# Check if any deployments failed
if [ -z "$STACKS" ]; then
echo "💥 No stacks to deploy - STACKS variable is empty!"
exit 1
elif [ -z "$DEPLOYED_STACKS" ]; then
echo "💥 No stacks were actually deployed - check stack discovery!"
exit 1
elif [ -n "$FAILED_STACKS" ]; then
echo "💥 Deployments failed for:$FAILED_STACKS"
exit 1
fi
echo "🎉 All stacks deployed successfully in parallel!"
EOF
- name: Health Check All Services
id: health
if: steps.backup.outputs.deployment_needed == 'true' && steps.deploy.outcome == 'success'
run: |
echo "🔍 Health checking all services"
# Source retry functions
source /tmp/retry.sh
# Parse inputs outside SSH context
STACKS="${{ join(fromJSON(inputs.stacks), ' ') }}"
HAS_DOCKGE="${{ inputs.has-dockge }}"
# Execute health check and capture structured output
# Temporarily disable set -e to capture exit code from command substitution
# Use retry mechanism for health check
set +e
HEALTH_RESULT=$(ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} /bin/bash -s $STACKS \"$HAS_DOCKGE\"" << 'EOF'
set -e
# Get arguments passed to script (excluding sensitive OP_TOKEN)
TOTAL_ARGS=$#
# Find HAS_DOCKGE by looking for 'true' or 'false' in the args
HAS_DOCKGE=""
for i in $(seq 1 $TOTAL_ARGS); do
ARG="${!i}"
if [ "$ARG" = "true" ] || [ "$ARG" = "false" ]; then
HAS_DOCKGE="$ARG"
# All args before this position are stack names
STACKS="${@:1:$((i-1))}"
break
fi
done
# Set OP_TOKEN via environment (passed separately)
export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Set timeout configuration with defaults
HEALTH_CHECK_TIMEOUT=${{ inputs.health-check-timeout }}
HEALTH_CHECK_CMD_TIMEOUT=${{ inputs.health-check-command-timeout }}
# Enhanced health check with exponential backoff
echo "🔍 Starting enhanced health check with exponential backoff..."
# Health check function with retry logic
health_check_with_retry() {
local stack=$1
local logfile="/tmp/health_${stack}.log"
# Use configurable timeout with fallback to defaults
local timeout_seconds=${HEALTH_CHECK_TIMEOUT:-180}
local max_attempts=4
local wait_time=3
local attempt=1
local fast_fail_threshold=2 # Fast fail after 2 attempts if no progress
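# Backoff illustration: with wait_time=3 and max_attempts=4, failed attempts wait
# roughly 3s, 6s, then 12s before retrying (doubling each time, capped at 20s below)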
local start_time=$(date +%s)
# Create log file and redirect all output
exec 3>&1 4>&2
exec 1>"$logfile" 2>&1
# Ensure file descriptors are restored on function exit
trap 'exec 1>&3 2>&4 3>&- 4>&-' RETURN
echo "🕰️ Health check timeout configured: ${timeout_seconds}s"
echo "🔍 Health checking $stack with optimized retry logic..."
cd "/opt/compose/$stack" || {
echo "❌ $stack: Directory not found"
return 1
}
# Cache total service count (doesn't change during health check)
local total_count
total_count=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0")
if [ "$total_count" -eq 0 ]; then
echo "❌ $stack: No services defined in compose file"
return 1
fi
local previous_running=0
local no_progress_count=0
while [ $attempt -le $max_attempts ]; do
echo " Attempt $attempt/$max_attempts for $stack (wait: ${wait_time}s)"
# Get container status and health with error handling
local running_healthy running_starting running_unhealthy running_no_health
local exited_count restarting_count running_count
# Check overall timeout
local current_time=$(date +%s)
local elapsed=$((current_time - start_time))
if [ $elapsed -gt $timeout_seconds ]; then
echo "❌ $stack: Health check timed out after ${elapsed}s (limit: ${timeout_seconds}s)"
return 1
fi
# Get container state and health in one call using custom format
# Format: Service State Health (tab-separated)
local ps_output
ps_output=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps --format '{{.Service}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo "")
# Parse output to count different states and health conditions
running_healthy=0
running_starting=0
running_unhealthy=0
running_no_health=0
exited_count=0
restarting_count=0
while IFS=$'\t' read -r service state health; do
# Skip empty lines
[ -z "$service" ] && continue
case "$state" in
running)
case "$health" in
healthy)
running_healthy=$((running_healthy + 1))
;;
starting)
running_starting=$((running_starting + 1))
;;
unhealthy)
running_unhealthy=$((running_unhealthy + 1))
;;
*)
# No health check defined
running_no_health=$((running_no_health + 1))
;;
esac
;;
exited)
exited_count=$((exited_count + 1))
;;
restarting)
restarting_count=$((restarting_count + 1))
;;
esac
done <<< "$ps_output"
# Total running containers (all health states)
running_count=$((running_healthy + running_starting + running_unhealthy + running_no_health))
echo " $stack status: $running_count/$total_count running (healthy: $running_healthy, starting: $running_starting, unhealthy: $running_unhealthy, no-check: $running_no_health), exited: $exited_count, restarting: $restarting_count"
# Fast fail logic: if unhealthy or no progress with failures
if [ "$running_unhealthy" -gt 0 ] && [ $attempt -ge $fast_fail_threshold ]; then
echo "❌ $stack: Fast fail - $running_unhealthy unhealthy containers detected (attempt $attempt)"
return 1
elif [ $attempt -ge $fast_fail_threshold ] && [ "$running_count" -eq "$previous_running" ] && [ "$exited_count" -gt 0 ]; then
no_progress_count=$((no_progress_count + 1))
if [ $no_progress_count -ge 2 ]; then
echo "❌ $stack: Fast fail - no progress and containers failing (attempt $attempt)"
return 1
fi
else
no_progress_count=0
fi
# Calculate healthy containers (healthy + no health check defined)
local healthy_total=$((running_healthy + running_no_health))
# Success condition: all containers running and healthy (or no health check)
if [ "$healthy_total" -eq "$total_count" ] && [ "$total_count" -gt 0 ] && [ "$running_starting" -eq 0 ] && [ "$running_unhealthy" -eq 0 ] && [ "$exited_count" -eq 0 ] && [ "$restarting_count" -eq 0 ]; then
echo "✅ $stack: All $total_count services healthy"
return 0
# Degraded but stable: all running and healthy, but fewer than expected
elif [ "$healthy_total" -gt 0 ] && [ "$healthy_total" -eq "$running_count" ] && [ "$running_starting" -eq 0 ] && [ "$running_unhealthy" -eq 0 ] && [ "$exited_count" -eq 0 ] && [ "$restarting_count" -eq 0 ]; then
echo "⚠️ $stack: $healthy_total/$total_count services healthy (degraded but stable)"
return 2 # Degraded but acceptable
# Still starting: health checks initializing, allow retry
elif [ "$running_starting" -gt 0 ] && [ "$running_unhealthy" -eq 0 ] && [ $attempt -lt $max_attempts ]; then
echo " $stack: $running_starting services still initializing health checks..."
sleep $wait_time
wait_time=$((wait_time * 2))
if [ $wait_time -gt 20 ]; then
wait_time=20
fi
# Final attempt failure
elif [ $attempt -eq $max_attempts ]; then
if [ "$running_unhealthy" -gt 0 ]; then
echo "❌ $stack: Failed - $running_unhealthy services unhealthy after $max_attempts attempts"
elif [ "$running_starting" -gt 0 ]; then
echo "❌ $stack: Failed - $running_starting services still starting after $max_attempts attempts"
else
echo "❌ $stack: Failed after $max_attempts attempts ($running_count/$total_count running, $healthy_total healthy)"
fi
return 1
# Continue with exponential backoff
else
echo " $stack: Not ready yet, waiting ${wait_time}s..."
sleep $wait_time
wait_time=$((wait_time * 2))
if [ $wait_time -gt 20 ]; then
wait_time=20
fi
fi
previous_running=$running_count
attempt=$((attempt + 1))
done
}
FAILED_STACKS=""
DEGRADED_STACKS=""
HEALTHY_STACKS=""
TOTAL_CONTAINERS=0
RUNNING_CONTAINERS=0
if [ "$HAS_DOCKGE" = "true" ]; then
echo "🔍 Health checking Dockge with retry logic..."
cd /opt/dockge
# Retry logic for Dockge with health check verification
# Plain (non-local) variables: this block runs at the remote script's top level, where 'local' is invalid
dockge_max_attempts=3
dockge_attempt=1
dockge_wait=3
# Get total services
DOCKGE_TOTAL=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose config --services 2>/dev/null | wc -l | tr -d " " || echo "0")
while [ $dockge_attempt -le $dockge_max_attempts ]; do
# Get Dockge state and health
dockge_ps_output=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose ps --format '{{.Service}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo "")
# Parse health states
dockge_healthy=0
dockge_starting=0
dockge_unhealthy=0
dockge_no_health=0
while IFS=$'\t' read -r service state health; do
[ -z "$service" ] && continue
if [ "$state" = "running" ]; then
case "$health" in
healthy) dockge_healthy=$((dockge_healthy + 1)) ;;
starting) dockge_starting=$((dockge_starting + 1)) ;;
unhealthy) dockge_unhealthy=$((dockge_unhealthy + 1)) ;;
*) dockge_no_health=$((dockge_no_health + 1)) ;;
esac
fi
done <<< "$dockge_ps_output"
dockge_running=$((dockge_healthy + dockge_starting + dockge_unhealthy + dockge_no_health))
dockge_healthy_total=$((dockge_healthy + dockge_no_health))
echo " Dockge attempt $dockge_attempt/$dockge_max_attempts: $dockge_running/$DOCKGE_TOTAL running (healthy: $dockge_healthy, starting: $dockge_starting, unhealthy: $dockge_unhealthy, no-check: $dockge_no_health)"
# Success: all healthy
if [ "$dockge_healthy_total" -eq "$DOCKGE_TOTAL" ] && [ "$DOCKGE_TOTAL" -gt 0 ] && [ "$dockge_starting" -eq 0 ] && [ "$dockge_unhealthy" -eq 0 ]; then
break
# Unhealthy detected - fail
elif [ "$dockge_unhealthy" -gt 0 ]; then
echo " Dockge has $dockge_unhealthy unhealthy services"
break
# Degraded but stable: some healthy, final attempt
elif [ "$dockge_healthy_total" -gt 0 ] && [ "$dockge_unhealthy" -eq 0 ] && [ $dockge_attempt -eq $dockge_max_attempts ]; then
break
# Retry
elif [ $dockge_attempt -lt $dockge_max_attempts ]; then
echo " Dockge not ready, waiting ${dockge_wait}s..."
sleep $dockge_wait
dockge_wait=$((dockge_wait * 2))
fi
dockge_attempt=$((dockge_attempt + 1))
done
TOTAL_CONTAINERS=$((TOTAL_CONTAINERS + DOCKGE_TOTAL))
RUNNING_CONTAINERS=$((RUNNING_CONTAINERS + dockge_running))
if [ "$dockge_unhealthy" -gt 0 ]; then
echo "❌ Dockge: $dockge_unhealthy services unhealthy"
FAILED_STACKS="$FAILED_STACKS dockge"
elif [ "$dockge_running" -eq 0 ]; then
echo "❌ Dockge: 0/$DOCKGE_TOTAL services running"
FAILED_STACKS="$FAILED_STACKS dockge"
elif [ "$dockge_healthy_total" -eq "$DOCKGE_TOTAL" ]; then
echo "✅ Dockge: All $DOCKGE_TOTAL services healthy"
HEALTHY_STACKS="$HEALTHY_STACKS dockge"
else
echo "⚠️ Dockge: $dockge_healthy_total/$DOCKGE_TOTAL services healthy (degraded)"
DEGRADED_STACKS="$DEGRADED_STACKS dockge"
fi
fi
# Parse critical services list
# Note: CRITICAL_SERVICES contains stack names (not individual Docker service names)
# This matches stacks that are considered critical for the deployment
# Example: ["portainer", "dockge"] identifies these stacks as critical
CRITICAL_SERVICES_ARRAY=()
if [ -n "$CRITICAL_SERVICES" ] && [ "$CRITICAL_SERVICES" != "[]" ]; then
# Convert JSON array to bash array using jq for robust parsing and preserve spaces/special characters
readarray -t CRITICAL_SERVICES_ARRAY < <(echo "$CRITICAL_SERVICES" | jq -r '.[]')
echo "🚨 Critical stacks configured: ${CRITICAL_SERVICES_ARRAY[*]}"
fi
# Function to check if a stack is critical
# Parameter: stack name to check
# Returns: 0 if critical, 1 if not critical
is_critical_service() {
local stack_name=$1
for critical in "${CRITICAL_SERVICES_ARRAY[@]}"; do
if [ "$stack_name" = "$critical" ]; then
return 0
fi
done
return 1
}
# Enhanced health checks with sequential retry logic and early exit
echo "🔍 Starting enhanced health checks with retry logic..."
CRITICAL_FAILURE=false
# Disable exit on error for health checks to ensure we reach output section
set +e
# Check each stack with the new enhanced health check
for STACK in $STACKS; do
echo ""
echo "🔍 Checking stack: $STACK"
health_check_with_retry "$STACK"
HEALTH_RESULT=$?
case $HEALTH_RESULT in
0)
# Output already restored in health_check_with_retry
echo "✅ $STACK: Healthy"
HEALTHY_STACKS="$HEALTHY_STACKS $STACK"
;;
2)
# Output already restored in health_check_with_retry
echo "⚠️ $STACK: Degraded but stable"
DEGRADED_STACKS="$DEGRADED_STACKS $STACK"
# Check if degraded stack is critical
if is_critical_service "$STACK"; then
echo "🚨 CRITICAL SERVICE DEGRADED: $STACK"
echo " Continuing monitoring but flagging for attention"
fi
;;
*)
# For failures, output is already restored in health_check_with_retry
echo "❌ $STACK: Failed health check"
FAILED_STACKS="$FAILED_STACKS $STACK"
# Check if failed stack is critical - trigger early exit
if is_critical_service "$STACK"; then
echo "🚨 CRITICAL SERVICE FAILURE: $STACK"
echo " This is a critical service failure - triggering early exit"
echo " Remaining stacks will not be health checked"
CRITICAL_FAILURE=true
break
fi
;;
esac
done
# Count services across all stacks after health checks complete
echo ""
echo "📊 Counting services across all stacks..."
if [ -z "$STACKS" ]; then
echo "ERROR: STACKS variable is empty! Cannot count services."
echo "Will attempt to discover stacks from filesystem..."
DISCOVERED_STACKS=""
for dir in /opt/compose/*/; do
if [ -d "$dir" ] && [ -f "$dir/compose.yaml" ]; then
STACK_NAME=$(basename "$dir")
DISCOVERED_STACKS="$DISCOVERED_STACKS $STACK_NAME"
fi
done
STACKS=$(echo "$DISCOVERED_STACKS" | xargs)
echo "Discovered stacks: $STACKS"
fi
for STACK in $STACKS; do
STACK_RUNNING=$(cd /opt/compose/$STACK 2>/dev/null && op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps --services --filter "status=running" 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' 2>/dev/null | wc -l | tr -d " " || echo "0")
STACK_TOTAL=$(cd /opt/compose/$STACK 2>/dev/null && op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml config --services 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' 2>/dev/null | wc -l | tr -d " " || echo "0")
echo " $STACK: $STACK_RUNNING/$STACK_TOTAL services"
TOTAL_CONTAINERS=$((TOTAL_CONTAINERS + STACK_TOTAL))
RUNNING_CONTAINERS=$((RUNNING_CONTAINERS + STACK_RUNNING))
done
# Write outputs to temp file to ensure capture even if script exits early
TEMP_OUTPUT="/tmp/github_health_check_outputs.txt"
echo "healthy_stacks=$(echo $HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" > "$TEMP_OUTPUT"
echo "degraded_stacks=$(echo $DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" >> "$TEMP_OUTPUT"
echo "failed_stacks=$(echo $FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')" >> "$TEMP_OUTPUT"
echo "total_containers=$TOTAL_CONTAINERS" >> "$TEMP_OUTPUT"
echo "running_containers=$RUNNING_CONTAINERS" >> "$TEMP_OUTPUT"
if [ "$TOTAL_CONTAINERS" -gt 0 ]; then
echo "success_rate=$(( RUNNING_CONTAINERS * 100 / TOTAL_CONTAINERS ))" >> "$TEMP_OUTPUT"
else
echo "success_rate=0" >> "$TEMP_OUTPUT"
fi
# Handle critical service failure
if [ "$CRITICAL_FAILURE" = true ]; then
echo ""
echo "❌ CRITICAL SERVICE FAILURE DETECTED"
echo " Deployment marked as failed due to critical service failure"
echo " Health check terminated early to prevent extended failure cycles"
# GITHUB_OUTPUT is not available on the remote host; the stack lists were already
# written to "$TEMP_OUTPUT" above, so emit them with the parse markers before the
# early exit so the runner-side failure handler can still capture them
echo "health_status=failed_critical" >> "$TEMP_OUTPUT"
echo "GITHUB_OUTPUT_START"
cat "$TEMP_OUTPUT"
echo "GITHUB_OUTPUT_END"
exit 1
fi
echo "📊 Total service count: $RUNNING_CONTAINERS/$TOTAL_CONTAINERS across all stacks"
# Display comprehensive health check results
echo ""
echo "📊 Health Check Summary:"
echo "════════════════════════"
echo "Total Services: $TOTAL_CONTAINERS"
echo "Running Services: $RUNNING_CONTAINERS"
if [ "$TOTAL_CONTAINERS" -gt 0 ]; then
echo "Success Rate: $(( RUNNING_CONTAINERS * 100 / TOTAL_CONTAINERS ))%"
else
echo "Success Rate: 0%"
fi
echo ""
# Display results by category
[ -n "$HEALTHY_STACKS" ] && echo "✅ Healthy Stacks: $(echo $HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
[ -n "$DEGRADED_STACKS" ] && echo "⚠️ Degraded Stacks: $(echo $DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
[ -n "$FAILED_STACKS" ] && echo "❌ Failed Stacks: $(echo $FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
echo ""
echo "📋 Detailed Health Check Results:"
echo "════════════════════════════════════════════════════════════════"
for STACK in $STACKS; do
if [ -f "/tmp/health_${STACK}.log" ]; then
echo ""
echo "🔸 STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
cat "/tmp/health_${STACK}.log"
echo "────────────────────────────────────────────────────────────────"
else
echo ""
echo "🔸 STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
echo "⚠️ No health check log found for $STACK"
echo "────────────────────────────────────────────────────────────────"
fi
done
echo "════════════════════════════════════════════════════════════════"
# Output results in parseable format (temp file already written earlier)
echo "GITHUB_OUTPUT_START"
cat "$TEMP_OUTPUT"
echo "GITHUB_OUTPUT_END"
set -e # Re-enable exit on error after outputs are written
# Determine final health status
if [ -n "$FAILED_STACKS" ]; then
echo ""
echo "💥 Health check failed - some stacks are not running"
exit 1
elif [ -n "$DEGRADED_STACKS" ]; then
echo ""
echo "⚠️ Health check passed with warnings - some services degraded"
exit 0
else
echo ""
echo "🎉 All services are fully healthy!"
exit 0
fi
EOF
)
HEALTH_EXIT_CODE=$?
set -e
# Check if health check command failed
if [ $HEALTH_EXIT_CODE -ne 0 ]; then
echo "::error::Health check failed with exit code: $HEALTH_EXIT_CODE"
echo "💥 Health check command failed - marking deployment as failed"
# Still extract outputs for debugging before failing
echo "$HEALTH_RESULT"
if echo "$HEALTH_RESULT" | grep -q "GITHUB_OUTPUT_START"; then
echo "$HEALTH_RESULT" | sed -n '/GITHUB_OUTPUT_START/,/GITHUB_OUTPUT_END/p' | grep -E "^(healthy_stacks|degraded_stacks|failed_stacks|total_containers|running_containers|success_rate)=" >> "$GITHUB_OUTPUT" || true
fi
exit 1
fi
# Extract health outputs from structured result
echo "$HEALTH_RESULT"
# Parse outputs without temporary files
if echo "$HEALTH_RESULT" | grep -q "GITHUB_OUTPUT_START"; then
echo "$HEALTH_RESULT" | sed -n '/GITHUB_OUTPUT_START/,/GITHUB_OUTPUT_END/p' | grep -E "^(healthy_stacks|degraded_stacks|failed_stacks|total_containers|running_containers|success_rate)=" >> "$GITHUB_OUTPUT"
else
echo "⚠️ GITHUB_OUTPUT_START marker not found, attempting to read from temp file..."
# Try to read from temp file on remote server
TEMP_FILE_CONTENT=$(ssh -o "StrictHostKeyChecking no" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} 'cat /tmp/github_health_check_outputs.txt 2>/dev/null' || echo "")
if [ -n "$TEMP_FILE_CONTENT" ]; then
echo "✅ Successfully read outputs from temp file"
echo "$TEMP_FILE_CONTENT" >> "$GITHUB_OUTPUT"
else
echo "❌ Could not read temp file, using fallback outputs"
# Fallback outputs if parsing fails
{
echo "healthy_stacks="
echo "degraded_stacks="
echo "failed_stacks="
echo "total_containers=0"
echo "running_containers=0"
echo "success_rate=0"
} >> "$GITHUB_OUTPUT"
fi
fi
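# Illustrative parse (hypothetical values): if HEALTH_RESULT contains
#   GITHUB_OUTPUT_START
#   healthy_stacks=immich, traefik
#   total_containers=12
#   GITHUB_OUTPUT_END
# only the whitelisted key=value lines between the markers are appended to $GITHUB_OUTPUT.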
- name: Cleanup unused images
id: cleanup
if: steps.backup.outputs.deployment_needed == 'true' && steps.deploy.outcome == 'success' && steps.health.outcome == 'success'
continue-on-error: true
run: |
echo "::group::Cleaning up unused Docker images"
ssh -o "StrictHostKeyChecking no" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} << EOF
echo "🧹 Cleaning up unused Docker images..."
docker image prune -f
echo "✅ Cleanup completed"
EOF
echo "::endgroup::"
- name: Rollback to Previous Version
id: rollback
if: steps.backup.outputs.deployment_needed == 'true' && (steps.deploy.outcome == 'failure' || steps.health.outcome == 'failure')
continue-on-error: true
run: |
echo "::group::Rolling back to previous deployment"
echo "🔄 **INITIATING ROLLBACK**"
echo "Previous SHA: ${{ steps.backup.outputs.previous_sha }}"
echo "Failed SHA: ${{ inputs.target-ref }}"
# Parse inputs outside SSH context
HAS_DOCKGE="${{ inputs.has-dockge }}"
PREVIOUS_SHA="${{ steps.backup.outputs.previous_sha }}"
COMPOSE_ARGS="${{ inputs.args || '' }}"
CRITICAL_SERVICES='${{ inputs.critical-services }}'
# Validate PREVIOUS_SHA before attempting rollback
if [ "$PREVIOUS_SHA" = "unknown" ] || [ -z "$PREVIOUS_SHA" ]; then
echo "❌ Cannot rollback: No previous deployment exists (first deployment)"
echo "::error::Rollback failed - no previous deployment to rollback to"
exit 1
fi
# Validate SHA format (full 40-char SHA)
if ! [[ "$PREVIOUS_SHA" =~ ^[a-fA-F0-9]{40}$ ]]; then
echo "❌ Cannot rollback: Invalid previous SHA format: $PREVIOUS_SHA"
echo "::error::Rollback failed - invalid SHA format"
exit 1
fi
echo "✅ Previous SHA validation passed: $PREVIOUS_SHA"
# Source retry functions
source /tmp/retry.sh
# Use retry mechanism for SSH connection (same as deploy) and capture output
ROLLBACK_RESULT=$(ssh_retry 3 10 "ssh -o \"StrictHostKeyChecking no\" deployment-server /bin/bash -s \"$HAS_DOCKGE\" \"$PREVIOUS_SHA\" \"$COMPOSE_ARGS\" \"$CRITICAL_SERVICES\"" << 'EOF'
set -e
# Get arguments passed to script (excluding sensitive OP_TOKEN)
HAS_DOCKGE="$1"
PREVIOUS_SHA="$2"
COMPOSE_ARGS="$3"
CRITICAL_SERVICES="$4"
# Export the 1Password service account token (injected into this script by the workflow template)
export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Consolidate timeout values for easier maintenance
# These can be overridden by workflow inputs where available
GIT_FETCH_TIMEOUT=${{ inputs.git-fetch-timeout }}
GIT_CHECKOUT_TIMEOUT=${{ inputs.git-checkout-timeout }}
IMAGE_PULL_TIMEOUT=${{ inputs.image-pull-timeout }}
SERVICE_STARTUP_TIMEOUT=${{ inputs.service-startup-timeout }}
VALIDATION_ENV_TIMEOUT=${{ inputs.validation-env-timeout }}
VALIDATION_SYNTAX_TIMEOUT=${{ inputs.validation-syntax-timeout }}
echo "🔄 Rolling back to $PREVIOUS_SHA..."
# Add timeout protection to git operations
if ! timeout $GIT_FETCH_TIMEOUT git -C /opt/compose/ fetch; then
echo "❌ Git fetch timed out after ${GIT_FETCH_TIMEOUT}s"
exit 1
fi
if ! timeout $GIT_CHECKOUT_TIMEOUT git -C /opt/compose/ checkout $PREVIOUS_SHA; then
echo "❌ Git checkout timed out after ${GIT_CHECKOUT_TIMEOUT}s"
exit 1
fi
echo "✅ Repository rolled back to $PREVIOUS_SHA"
# Dynamically discover stacks based on the previous commit's structure
echo "🔍 Discovering stacks in previous commit..."
ROLLBACK_STACKS_ARRAY=()
cd /opt/compose
for dir in */; do
if [[ -d "$dir" && (-f "$dir/compose.yml" || -f "$dir/compose.yaml") ]]; then
STACK_NAME=$(basename "$dir")
ROLLBACK_STACKS_ARRAY+=("$STACK_NAME")
echo " Found stack: $STACK_NAME"
fi
done
if [ ${#ROLLBACK_STACKS_ARRAY[@]} -eq 0 ]; then
echo "⚠️ No stacks found in previous commit - rollback cannot proceed"
exit 1
fi
# Stack names are joined with spaces, matching how stack lists are passed everywhere else in
# this workflow. A NUL delimiter cannot be used here because bash variables and command
# substitution strip NUL bytes; stack names containing whitespace are therefore unsupported.
ROLLBACK_STACKS="${ROLLBACK_STACKS_ARRAY[*]}"
echo "📋 Stacks to rollback: ${ROLLBACK_STACKS_ARRAY[*]}"
# Output discovered stacks for the rollback-health step (space-delimited, single line)
echo "DISCOVERED_ROLLBACK_STACKS=$ROLLBACK_STACKS"
# Deploy Dockge first if needed
if [ "$HAS_DOCKGE" = "true" ]; then
echo "🔄 Rolling back Dockge..."
cd /opt/dockge
# Add timeout protection for Dockge operations
if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
echo "❌ Dockge image pull timed out after ${IMAGE_PULL_TIMEOUT}s"
exit 1
fi
if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
echo "❌ Dockge startup timed out after ${SERVICE_STARTUP_TIMEOUT}s"
exit 1
fi
echo "✅ Dockge rolled back successfully"
fi
# Shared function to deploy or rollback a single stack
# This eliminates code duplication between deploy and rollback operations
process_stack() {
local STACK=$1
local OPERATION=$2 # "deploy" or "rollback"
local LOGFILE="/tmp/${OPERATION}_${STACK}.log"
local EXITCODEFILE="/tmp/${OPERATION}_${STACK}.exitcode"
local exit_code=0
# Run the operation in a subshell so an early exit/failure still lets us record its exit code
(
if [ "$OPERATION" = "deploy" ]; then
echo "🚀 Deploying $STACK..."
else
echo "🔄 Rolling back $STACK..."
fi
cd "/opt/compose/$STACK" || exit 1
echo " Pulling images for $STACK..."
# Add timeout protection for the image pull (default 600s, configurable via image-pull-timeout)
if ! timeout $IMAGE_PULL_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose pull; then
echo "❌ Failed to pull images for $STACK during $OPERATION (timeout or error)"
exit 1
fi
echo " Starting services for $STACK..."
# Add timeout protection for service startup (default 300s, configurable via service-startup-timeout)
if ! timeout $SERVICE_STARTUP_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose up -d --remove-orphans $COMPOSE_ARGS; then
echo "❌ Failed to start services for $STACK during $OPERATION (timeout or error)"
exit 1
fi
if [ "$OPERATION" = "deploy" ]; then
echo "✅ $STACK deployed successfully"
else
echo "✅ $STACK rolled back successfully"
fi
} > "$LOGFILE" 2>&1
# Capture and save exit code for robust error detection
local exit_code=$?
echo "$exit_code" > "$EXITCODEFILE"
return $exit_code
}
# Wrapper function for rollback (uses shared process_stack)
rollback_stack() {
process_stack "$1" "rollback"
}
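# Usage (illustrative, hypothetical stack name): rollback_stack "immich" writes
#   /tmp/rollback_immich.log and /tmp/rollback_immich.exitcode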
# Cleanup function for rollback logs
cleanup_rollback_logs() {
# ROLLBACK_STACKS is space-delimited (see stack discovery above)
for STACK in $ROLLBACK_STACKS; do
rm -f "/tmp/rollback_${STACK}.log" 2>/dev/null
done
}
# Pre-rollback validation function
validate_all_rollback_stacks() {
echo "🔍 Pre-rollback validation of all stacks..."
local validation_failed=false
for STACK in $ROLLBACK_STACKS; do
echo " Validating $STACK..."
# Check if stack directory exists
if [ ! -d "/opt/compose/$STACK" ]; then
echo "❌ $STACK: Directory /opt/compose/$STACK not found"
validation_failed=true
continue
fi
cd "/opt/compose/$STACK" || {
echo "❌ $STACK: Cannot access directory"
validation_failed=true
continue
}
# Check if compose.yaml or compose.yml exists and determine which to use
COMPOSE_FILE=""
if [ -f "compose.yaml" ]; then
COMPOSE_FILE="compose.yaml"
elif [ -f "compose.yml" ]; then
COMPOSE_FILE="compose.yml"
else
echo "❌ $STACK: neither compose.yaml nor compose.yml found"
validation_failed=true
continue
fi
# Validate 1Password environment access and Docker Compose config
if ! timeout $VALIDATION_ENV_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f "$COMPOSE_FILE" config --services >/dev/null 2>&1; then
echo "❌ $STACK: Environment validation failed (1Password or compose config error)"
validation_failed=true
continue
fi
# Quick syntax validation
if ! timeout $VALIDATION_SYNTAX_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f "$COMPOSE_FILE" config --quiet 2>/dev/null; then
echo "❌ $STACK: Docker Compose syntax validation failed"
validation_failed=true
continue
fi
echo "✅ $STACK: Pre-rollback validation passed"
done
if [ "$validation_failed" = true ]; then
echo "❌ Pre-rollback validation failed for one or more stacks"
echo " Stopping rollback to prevent extended failures"
return 1
fi
echo "✅ All stacks passed pre-rollback validation"
return 0
}
# Set trap for cleanup on exit
trap cleanup_rollback_logs EXIT
# Run pre-rollback validation
if ! validate_all_rollback_stacks; then
echo "ROLLBACK_STATUS=failed_validation" >> "$GITHUB_OUTPUT"
exit 1
fi
# Start all rollback deployments in parallel
echo "🔄 Starting parallel rollback of all stacks..."
ROLLBACK_PIDS=""
# Map each PID to its stack name for improved error reporting
# Note: associative arrays require Bash 4.0+ on the remote host (this script runs over SSH, not on the runner)
declare -A ROLLBACK_PID_TO_STACK
for STACK in $ROLLBACK_STACKS; do
echo "🔄 Rolling back $STACK..."
rollback_stack "$STACK" &
PID=$!
ROLLBACK_PIDS="$ROLLBACK_PIDS $PID"
ROLLBACK_PID_TO_STACK[$PID]=$STACK
echo "Started rollback of $STACK (PID: $PID)"
done
# Wait for all rollback deployments and collect results
echo "⏳ Waiting for all rollbacks to complete..."
FAILED_ROLLBACKS=""
ROLLBACK_ERRORS=""
# Enhanced parallel job monitoring with proper error propagation
echo "⏳ Monitoring parallel rollback operations..."
# Wait for jobs individually to capture exit codes and report stack names
for PID in $ROLLBACK_PIDS; do
STACK_NAME="${ROLLBACK_PID_TO_STACK[$PID]}"
if wait "$PID"; then
echo "✅ Rollback process $PID for stack $STACK_NAME completed successfully"
else
EXIT_CODE=$?
# Check if process was terminated by signal (exit code > 128)
if [ "$EXIT_CODE" -gt 128 ]; then
SIGNAL_NUM=$((EXIT_CODE - 128))
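# e.g. a process killed by SIGKILL exits with 137 (128 + 9); SIGTERM yields 143 (128 + 15)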
# Try to get signal name (works on most systems)
if command -v kill >/dev/null 2>&1; then
SIGNAL_NAME=$(kill -l $SIGNAL_NUM 2>/dev/null || echo "SIG$SIGNAL_NUM")
else
SIGNAL_NAME="SIG$SIGNAL_NUM"
fi
echo "❌ Rollback process $PID for stack $STACK_NAME was terminated by signal $SIGNAL_NUM ($SIGNAL_NAME)"
ROLLBACK_ERRORS="$ROLLBACK_ERRORS STACK:$STACK_NAME:PID:$PID:TERMINATED_BY_SIGNAL:$SIGNAL_NUM:$SIGNAL_NAME"
else
echo "❌ Rollback process $PID for stack $STACK_NAME failed with exit code $EXIT_CODE"
ROLLBACK_ERRORS="$ROLLBACK_ERRORS STACK:$STACK_NAME:PID:$PID:EXIT_CODE:$EXIT_CODE"
fi
fi
done
# Enhanced result analysis using exit code files (more robust than log parsing)
ROLLED_BACK_STACKS=""
SUCCESSFUL_ROLLBACKS=""
for STACK in $ROLLBACK_STACKS; do
if [ -f "/tmp/rollback_${STACK}.log" ]; then
ROLLED_BACK_STACKS="$ROLLED_BACK_STACKS $STACK"
# Primary: Check exit code file for robust error detection
if [ -f "/tmp/rollback_${STACK}.exitcode" ]; then
EXIT_CODE=$(cat "/tmp/rollback_${STACK}.exitcode")
if [ "$EXIT_CODE" -eq 0 ]; then
SUCCESSFUL_ROLLBACKS="$SUCCESSFUL_ROLLBACKS $STACK"
else
FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK"
echo "🔍 $STACK Rollback Error: Non-zero exit code ($EXIT_CODE)"
fi
else
# Fallback: Log-based error detection if exit code file is missing
echo "⚠️ $STACK: Exit code file missing - using less reliable log-based detection"
if grep -q "❌.*$STACK\|CRITICAL.*$STACK\|Failed.*$STACK\|Error.*$STACK" "/tmp/rollback_${STACK}.log"; then
FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK"
# Extract specific error for reporting
STACK_ERROR=$(grep -E "❌.*$STACK|CRITICAL.*$STACK|Failed.*$STACK|Error.*$STACK" "/tmp/rollback_${STACK}.log" | head -1)
echo "🔍 $STACK Rollback Error: $STACK_ERROR"
elif grep -q "✅.*$STACK\|Successfully.*$STACK" "/tmp/rollback_${STACK}.log"; then
SUCCESSFUL_ROLLBACKS="$SUCCESSFUL_ROLLBACKS $STACK"
else
echo "⚠️ $STACK: No clear success/failure indicator in logs - treating as potential failure"
FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK"
fi
fi
else
echo "⚠️ $STACK: No rollback log found - possible early failure"
FAILED_ROLLBACKS="$FAILED_ROLLBACKS $STACK"
fi
done
# Summary of rollback results
echo ""
echo "📊 Rollback Summary:"
echo " Successful: $(echo $SUCCESSFUL_ROLLBACKS | wc -w | tr -d ' ') stacks"
echo " Failed: $(echo $FAILED_ROLLBACKS | wc -w | tr -d ' ') stacks"
if [ -n "$ROLLBACK_ERRORS" ]; then
echo " Process errors: $ROLLBACK_ERRORS"
fi
# Parse critical services list
# Note: CRITICAL_SERVICES contains stack names (not individual Docker service names)
# This matches stacks that are considered critical for the deployment
# Example: ["portainer", "dockge"] identifies these stacks as critical
CRITICAL_SERVICES_ARRAY=()
CRITICAL_FAILURE=false
if [ -n "$CRITICAL_SERVICES" ] && [ "$CRITICAL_SERVICES" != "[]" ]; then
# Convert JSON array to bash array using jq for robust parsing and preserve spaces/special characters
readarray -t CRITICAL_SERVICES_ARRAY < <(echo "$CRITICAL_SERVICES" | jq -r '.[]')
echo "🚨 Critical stacks configured: ${CRITICAL_SERVICES_ARRAY[*]}"
# Check if any failed rollback stack is critical
for FAILED_STACK in $FAILED_ROLLBACKS; do
for CRITICAL_STACK in "${CRITICAL_SERVICES_ARRAY[@]}"; do
if [ "$FAILED_STACK" = "$CRITICAL_STACK" ]; then
echo "🚨 CRITICAL STACK ROLLBACK FAILED: $FAILED_STACK"
echo " This is a critical stack - system may be in unsafe state"
CRITICAL_FAILURE=true
fi
done
done
fi
# Display all rollback logs
echo ""
echo "📋 Rollback Results:"
echo "════════════════════════════════════════════════════════════════"
for STACK in $ROLLBACK_STACKS; do
if [ -f "/tmp/rollback_${STACK}.log" ]; then
echo ""
echo "🔸 ROLLBACK STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
cat "/tmp/rollback_${STACK}.log"
echo "────────────────────────────────────────────────────────────────"
else
echo ""
echo "🔸 ROLLBACK STACK: $STACK"
echo "────────────────────────────────────────────────────────────────"
echo "⚠️ No rollback log found for $STACK"
echo "────────────────────────────────────────────────────────────────"
fi
done
echo "════════════════════════════════════════════════════════════════"
# Check if any rollbacks failed
if [ -z "$ROLLBACK_STACKS" ]; then
echo "💥 No stacks to rollback - ROLLBACK_STACKS variable is empty!"
exit 1
elif [ -z "$ROLLED_BACK_STACKS" ]; then
echo "💥 No stacks were actually rolled back - check stack discovery!"
exit 1
elif [ "$CRITICAL_FAILURE" = true ]; then
echo ""
echo "💥 CRITICAL SERVICE ROLLBACK FAILURE"
echo " One or more critical services failed to rollback"
echo " System may be in an unsafe state - manual intervention required"
echo " Failed critical services:$FAILED_ROLLBACKS"
exit 1
elif [ -n "$FAILED_ROLLBACKS" ]; then
echo "💥 Rollbacks failed for:$FAILED_ROLLBACKS"
exit 1
fi
echo "🎉 All stacks rolled back successfully!"
EOF
)
# Extract rollback result and discovered stacks
echo "$ROLLBACK_RESULT"
# Parse discovered stacks output for rollback-health step (simplified - no markers needed)
if echo "$ROLLBACK_RESULT" | grep -q "DISCOVERED_ROLLBACK_STACKS="; then
DISCOVERED_STACKS=$(echo "$ROLLBACK_RESULT" | grep "DISCOVERED_ROLLBACK_STACKS=" | cut -d'=' -f2-)
echo "discovered_rollback_stacks=$DISCOVERED_STACKS" >> "$GITHUB_OUTPUT"
echo "✅ Captured discovered rollback stacks: $DISCOVERED_STACKS"
else
echo "⚠️ Could not parse discovered stacks, will use input stacks as fallback"
echo "discovered_rollback_stacks=${{ join(fromJSON(inputs.stacks), ' ') }}" >> "$GITHUB_OUTPUT"
fi
echo "::endgroup::"
# Health check runs after rollback attempt regardless of rollback success/failure
# This is intentional: we need to know the final system state even if rollback fails
# Using 'conclusion != skipped' instead of 'outcome == success' ensures we get
# visibility into what services are running, which is critical for incident response
- name: Verify Rollback Health
id: rollback-health
if: steps.backup.outputs.deployment_needed == 'true' && (steps.deploy.outcome == 'failure' || steps.health.outcome == 'failure') && steps.rollback.conclusion != 'skipped'
continue-on-error: true
run: |
echo "🔍 Verifying rollback health status"
# Source retry functions
source /tmp/retry.sh
# Use discovered rollback stacks instead of input stacks
# This ensures we check the stacks that were actually rolled back (from previous commit)
DISCOVERED_STACKS="${{ steps.rollback.outputs.discovered_rollback_stacks }}"
if [ -n "$DISCOVERED_STACKS" ]; then
# Discovered stacks are already space-delimited; they are passed as individual SSH arguments below
STACKS="$DISCOVERED_STACKS"
echo "✅ Using discovered rollback stacks: $STACKS"
else
# Fallback to input stacks if discovery failed
STACKS="${{ join(fromJSON(inputs.stacks), ' ') }}"
echo "⚠️ Using input stacks as fallback: $STACKS"
fi
HAS_DOCKGE="${{ inputs.has-dockge }}"
# Execute rollback health check
ROLLBACK_HEALTH_RESULT=$(ssh_retry 3 5 "ssh -o \"StrictHostKeyChecking no\" ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }} /bin/bash -s $STACKS \"$HAS_DOCKGE\"" << 'EOF'
set -e
# Get arguments passed to script
TOTAL_ARGS=$#
# Find HAS_DOCKGE by looking for 'true' or 'false' in the args
HAS_DOCKGE=""
for i in $(seq 1 $TOTAL_ARGS); do
ARG="${!i}"
if [ "$ARG" = "true" ] || [ "$ARG" = "false" ]; then
HAS_DOCKGE="$ARG"
# All args before this position are stack names
STACKS="${@:1:$((i-1))}"
break
fi
done
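# Illustrative (hypothetical names): args "immich traefik false" → STACKS="immich traefik", HAS_DOCKGE="false"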
# Set OP_TOKEN via environment
export OP_SERVICE_ACCOUNT_TOKEN="${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}"
# Set configurable timeout for health check commands (default: 15 seconds)
HEALTH_CHECK_CMD_TIMEOUT=${{ inputs.health-check-command-timeout }}
if [ -z "$HEALTH_CHECK_CMD_TIMEOUT" ]; then
echo "ℹ️ HEALTH_CHECK_CMD_TIMEOUT not provided, using default 15 seconds"
HEALTH_CHECK_CMD_TIMEOUT=15
fi
# Validate that HEALTH_CHECK_CMD_TIMEOUT is an integer
if ! [[ "$HEALTH_CHECK_CMD_TIMEOUT" =~ ^[0-9]+$ ]]; then
echo "⚠️ HEALTH_CHECK_CMD_TIMEOUT ($HEALTH_CHECK_CMD_TIMEOUT) is not an integer, using default 15"
HEALTH_CHECK_CMD_TIMEOUT=15
fi
# Enforce minimum and maximum limits for HEALTH_CHECK_CMD_TIMEOUT
HEALTH_CHECK_CMD_TIMEOUT_MIN=5
HEALTH_CHECK_CMD_TIMEOUT_MAX=60
if [ "$HEALTH_CHECK_CMD_TIMEOUT" -lt "$HEALTH_CHECK_CMD_TIMEOUT_MIN" ]; then
echo "⚠️ HEALTH_CHECK_CMD_TIMEOUT ($HEALTH_CHECK_CMD_TIMEOUT) below minimum, using $HEALTH_CHECK_CMD_TIMEOUT_MIN"
HEALTH_CHECK_CMD_TIMEOUT=$HEALTH_CHECK_CMD_TIMEOUT_MIN
fi
if [ "$HEALTH_CHECK_CMD_TIMEOUT" -gt "$HEALTH_CHECK_CMD_TIMEOUT_MAX" ]; then
echo "⚠️ HEALTH_CHECK_CMD_TIMEOUT ($HEALTH_CHECK_CMD_TIMEOUT) above maximum, using $HEALTH_CHECK_CMD_TIMEOUT_MAX"
HEALTH_CHECK_CMD_TIMEOUT=$HEALTH_CHECK_CMD_TIMEOUT_MAX
fi
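# e.g. an input of 3 is raised to 5, 120 is capped at 60, and a non-numeric value falls back to 15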
echo "🔍 Verifying rollback health for all services..."
ROLLBACK_HEALTHY_STACKS=""
ROLLBACK_DEGRADED_STACKS=""
ROLLBACK_FAILED_STACKS=""
ROLLBACK_TOTAL_CONTAINERS=0
ROLLBACK_RUNNING_CONTAINERS=0
# Check Dockge health if applicable
if [ "$HAS_DOCKGE" = "true" ]; then
echo "🔍 Verifying Dockge rollback health..."
cd /opt/dockge
DOCKGE_RUNNING=$(op run --env-file=/opt/compose/compose.env -- docker compose ps --services --filter "status=running" | wc -l | tr -d " ")
DOCKGE_TOTAL=$(op run --env-file=/opt/compose/compose.env -- docker compose ps --services | wc -l | tr -d " ")
ROLLBACK_TOTAL_CONTAINERS=$((ROLLBACK_TOTAL_CONTAINERS + DOCKGE_TOTAL))
ROLLBACK_RUNNING_CONTAINERS=$((ROLLBACK_RUNNING_CONTAINERS + DOCKGE_RUNNING))
if [ "$DOCKGE_RUNNING" -eq 0 ]; then
echo "❌ Dockge rollback: 0/$DOCKGE_TOTAL services running"
ROLLBACK_FAILED_STACKS="$ROLLBACK_FAILED_STACKS dockge"
elif [ "$DOCKGE_RUNNING" -lt "$DOCKGE_TOTAL" ]; then
echo "⚠️ Dockge rollback: $DOCKGE_RUNNING/$DOCKGE_TOTAL services running (degraded)"
ROLLBACK_DEGRADED_STACKS="$ROLLBACK_DEGRADED_STACKS dockge"
else
echo "✅ Dockge rollback: All $DOCKGE_RUNNING services healthy"
ROLLBACK_HEALTHY_STACKS="$ROLLBACK_HEALTHY_STACKS dockge"
fi
fi
# Simple health check for each rolled back stack
for STACK in $STACKS; do
echo ""
echo "🔍 Verifying rollback health for stack: $STACK"
cd "/opt/compose/$STACK" || {
echo "❌ $STACK: Directory not accessible after rollback"
ROLLBACK_FAILED_STACKS="$ROLLBACK_FAILED_STACKS $STACK"
continue
}
# Get basic health status (no -f flag so docker compose auto-detects compose.yaml or compose.yml)
RUNNING_COUNT=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose ps --services --filter "status=running" 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0")
TOTAL_COUNT=$(timeout $HEALTH_CHECK_CMD_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose config --services 2>/dev/null | grep -E '^[a-zA-Z0-9_-]+$' | wc -l | tr -d " " || echo "0")
ROLLBACK_TOTAL_CONTAINERS=$((ROLLBACK_TOTAL_CONTAINERS + TOTAL_COUNT))
ROLLBACK_RUNNING_CONTAINERS=$((ROLLBACK_RUNNING_CONTAINERS + RUNNING_COUNT))
if [ "$RUNNING_COUNT" -eq 0 ]; then
echo "❌ $STACK rollback: 0/$TOTAL_COUNT services running"
ROLLBACK_FAILED_STACKS="$ROLLBACK_FAILED_STACKS $STACK"
elif [ "$RUNNING_COUNT" -lt "$TOTAL_COUNT" ]; then
echo "⚠️ $STACK rollback: $RUNNING_COUNT/$TOTAL_COUNT services running (degraded)"
ROLLBACK_DEGRADED_STACKS="$ROLLBACK_DEGRADED_STACKS $STACK"
else
echo "✅ $STACK rollback: All $RUNNING_COUNT services healthy"
ROLLBACK_HEALTHY_STACKS="$ROLLBACK_HEALTHY_STACKS $STACK"
fi
done
# Calculate success rate
if [ "$ROLLBACK_TOTAL_CONTAINERS" -gt 0 ]; then
ROLLBACK_SUCCESS_RATE=$(( ROLLBACK_RUNNING_CONTAINERS * 100 / ROLLBACK_TOTAL_CONTAINERS ))
else
ROLLBACK_SUCCESS_RATE=0
fi
echo ""
echo "📊 Rollback Health Verification Summary:"
echo "════════════════════════════════════════"
echo "Total Services: $ROLLBACK_TOTAL_CONTAINERS"
echo "Running Services: $ROLLBACK_RUNNING_CONTAINERS"
echo "Success Rate: ${ROLLBACK_SUCCESS_RATE}%"
echo ""
[ -n "$ROLLBACK_HEALTHY_STACKS" ] && echo "✅ Healthy After Rollback: $(echo $ROLLBACK_HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
[ -n "$ROLLBACK_DEGRADED_STACKS" ] && echo "⚠️ Degraded After Rollback: $(echo $ROLLBACK_DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
[ -n "$ROLLBACK_FAILED_STACKS" ] && echo "❌ Failed After Rollback: $(echo $ROLLBACK_FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
# Output structured results (simplified - no markers needed)
echo "ROLLBACK_HEALTH_HEALTHY=$(echo $ROLLBACK_HEALTHY_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
echo "ROLLBACK_HEALTH_DEGRADED=$(echo $ROLLBACK_DEGRADED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
echo "ROLLBACK_HEALTH_FAILED=$(echo $ROLLBACK_FAILED_STACKS | tr ' ' ',' | sed 's/^,//' | sed 's/,/, /g')"
echo "ROLLBACK_HEALTH_TOTAL_CONTAINERS=$ROLLBACK_TOTAL_CONTAINERS"
echo "ROLLBACK_HEALTH_RUNNING_CONTAINERS=$ROLLBACK_RUNNING_CONTAINERS"
echo "ROLLBACK_HEALTH_SUCCESS_RATE=$ROLLBACK_SUCCESS_RATE"
# Determine rollback verification status
if [ -n "$ROLLBACK_FAILED_STACKS" ]; then
echo ""
echo "⚠️ Rollback completed but some services are still unhealthy"
echo "Manual intervention may be required"
exit 0 # Don't fail the workflow, rollback itself was successful
else
echo ""
echo "🎉 Rollback verified - all services are healthy or degraded but stable"
exit 0
fi
EOF
)
# Extract rollback health outputs
echo "$ROLLBACK_HEALTH_RESULT"
# Parse rollback health outputs (simplified - direct variable extraction)
if echo "$ROLLBACK_HEALTH_RESULT" | grep -q "ROLLBACK_HEALTH_"; then
# Extract each output variable directly
HEALTHY=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_HEALTHY=" | cut -d'=' -f2-)
DEGRADED=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_DEGRADED=" | cut -d'=' -f2-)
FAILED=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_FAILED=" | cut -d'=' -f2-)
TOTAL=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_TOTAL_CONTAINERS=" | cut -d'=' -f2-)
RUNNING=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_RUNNING_CONTAINERS=" | cut -d'=' -f2-)
RATE=$(echo "$ROLLBACK_HEALTH_RESULT" | grep "ROLLBACK_HEALTH_SUCCESS_RATE=" | cut -d'=' -f2-)
{
echo "rollback_healthy_stacks=${HEALTHY:-}"
echo "rollback_degraded_stacks=${DEGRADED:-}"
echo "rollback_failed_stacks=${FAILED:-}"
echo "rollback_total_containers=${TOTAL:-0}"
echo "rollback_running_containers=${RUNNING:-0}"
echo "rollback_success_rate=${RATE:-0}"
} >> "$GITHUB_OUTPUT"
else
# Fallback outputs if parsing fails
{
echo "rollback_healthy_stacks="
echo "rollback_degraded_stacks="
echo "rollback_failed_stacks="
echo "rollback_total_containers=0"
echo "rollback_running_containers=0"
echo "rollback_success_rate=0"
} >> "$GITHUB_OUTPUT"
fi
- name: Cleanup SSH connections
if: always()
run: |
# Close SSH connection multiplexing
echo "🧹 Cleaning up SSH connections..."
ssh -o "StrictHostKeyChecking no" deployment-server -O exit 2>/dev/null || true
# Clean up SSH control sockets
rm -f ~/.ssh/sockets/* 2>/dev/null || true
echo "✅ SSH cleanup completed"
- name: Report Deployment Status
if: always()
run: |
echo "::group::Deployment Summary"
# Parse stacks from JSON input and create display list
STACK_LIST="${{ join(fromJson(inputs.stacks), ', ') }}"
if [ "${{ inputs.has-dockge }}" = "true" ]; then
STACK_LIST="dockge, $STACK_LIST"
fi
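# Illustrative (hypothetical stacks): inputs.stacks='["immich","traefik"]' with has-dockge=true → "dockge, immich, traefik"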
if [ "${{ steps.backup.outputs.deployment_needed }}" != "true" ]; then
echo "ℹ️ **NO DEPLOYMENT NEEDED**"
echo "✅ Repository already at target commit"
echo "📋 Target stacks: $STACK_LIST"
echo "🔄 SHA: ${{ inputs.target-ref }}"
elif [ "${{ inputs.force-deploy }}" = "true" ] && [ "${{ steps.deploy.outcome }}" == "success" ] && [ "${{ steps.health.outcome }}" == "success" ]; then
echo "🔄 **FORCE DEPLOYMENT SUCCESSFUL**"
echo "✅ All stacks force-deployed and healthy"
echo "📋 Deployed stacks: $STACK_LIST"
echo "🔄 SHA: ${{ inputs.target-ref }}"
if [ "${{ steps.cleanup.outcome }}" == "success" ]; then
echo "🧹 Cleanup completed successfully"
fi
elif [ "${{ steps.deploy.outcome }}" == "success" ] && [ "${{ steps.health.outcome }}" == "success" ]; then
echo "🎉 **DEPLOYMENT SUCCESSFUL**"
echo "✅ All stacks deployed and healthy"
echo "📋 Deployed stacks: $STACK_LIST"
echo "🔄 SHA: ${{ inputs.target-ref }}"
if [ "${{ steps.cleanup.outcome }}" == "success" ]; then
echo "🧹 Cleanup completed successfully"
fi
else
echo "💥 **DEPLOYMENT FAILED**"
echo "❌ Deploy status: ${{ steps.deploy.outcome }}"
echo "❌ Health check status: ${{ steps.health.outcome }}"
if [ "${{ steps.rollback.outcome }}" == "success" ]; then
echo "🔄 Rollback completed successfully"
if [ "${{ steps.rollback-health.outcome }}" == "success" ]; then
echo "✅ Rollback verification passed"
elif [ "${{ steps.rollback-health.outcome }}" == "failure" ]; then
echo "⚠️ Rollback verification failed - manual intervention may be needed"
fi
else
echo "❌ Rollback status: ${{ steps.rollback.outcome }}"
fi
exit 1
fi
echo "::endgroup::"
notify:
name: Discord Notification
runs-on: ubuntu-24.04
needs: [deploy]
if: always()
steps:
- name: Configure 1Password Service Account
uses: 1password/load-secrets-action/configure@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
service-account-token: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
- name: Get commit message
id: commit-msg
run: |
COMMIT_MSG=$(curl -s -H "Authorization: token ${{ github.token }}" \
"https://api.github.com/repos/${{ github.repository }}/commits/${{ inputs.target-ref }}" \
| jq -r '.commit.message // "No commit message available"' \
| head -1)
SHORT_SHA="${{ inputs.target-ref }}"
SHORT_SHA="${SHORT_SHA:0:7}"
echo "message=$COMMIT_MSG" >> "$GITHUB_OUTPUT"
echo "short-sha=$SHORT_SHA" >> "$GITHUB_OUTPUT"
- name: Load Discord webhook and user ID
id: op-load-discord
uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
unset-previous: true
env:
DISCORD_WEBHOOK: ${{ inputs.webhook-url }}
DISCORD_USER_ID: ${{ inputs.discord-user-id != '' && inputs.discord-user-id || 'SKIP' }}
- name: Send Discord notification
uses: sarisia/actions-status-discord@b8381b25576cb341b2af39926ab42c5056cc44ed # v1.15.5
with:
webhook: ${{ steps.op-load-discord.outputs.DISCORD_WEBHOOK }}
status: ${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && 'success' || needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && 'success' || 'failure' }}
title: "🚀 ${{ inputs.repo-name }} • ${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && 'No Changes' || needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && 'Deployed' || needs.deploy.outputs.rollback_status == 'success' && 'Rolled Back' || 'Failed' }}"
description: |
${{ (needs.deploy.outputs.deploy_status == 'failure' || needs.deploy.outputs.health_status == 'failure' || needs.deploy.result == 'failure') && inputs.discord-user-id != '' && steps.op-load-discord.outputs.DISCORD_USER_ID != 'SKIP' && format('<@{0}> ', steps.op-load-discord.outputs.DISCORD_USER_ID) || '' }}${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && '📋 **Repository already at target commit**' ||
inputs.force-deploy == true && needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && '🔄 **Force deployment completed successfully**' ||
needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && '✅ **Deployment completed successfully**' ||
needs.deploy.outputs.rollback_status == 'success' && '🔄 **Deployment failed but rolled back successfully**' || '❌ **Deployment failed**' }}
${{ needs.deploy.outputs.deployment_needed == 'true' && needs.deploy.outputs.rollback_status != 'success' &&
format('**📊 Health Status**
🟢 Running: {0}/{1} services ({2}%)', needs.deploy.outputs.running_containers || '0',
needs.deploy.outputs.total_containers || '0', needs.deploy.outputs.success_rate || '0') ||
needs.deploy.outputs.rollback_status == 'success' && format('**📊 Rollback Health**
🟢 Running: {0}/{1} services ({2}%)',
needs.deploy.outputs.rollback_running_containers || '0', needs.deploy.outputs.rollback_total_containers || '0',
needs.deploy.outputs.rollback_success_rate || '0') || '' }}
${{ needs.deploy.outputs.rollback_status != 'success' && (needs.deploy.outputs.healthy_stacks != '' || needs.deploy.outputs.degraded_stacks != '' || needs.deploy.outputs.failed_stacks != '') &&
format('**🏷️ Stack Status**
{0}{1}{2}',
needs.deploy.outputs.healthy_stacks != '' && format('✅ {0}
', needs.deploy.outputs.healthy_stacks) || '',
needs.deploy.outputs.degraded_stacks != '' && format('⚠️ {0}
', needs.deploy.outputs.degraded_stacks) || '',
needs.deploy.outputs.failed_stacks != '' && format('❌ {0}', needs.deploy.outputs.failed_stacks) || '') || '' }}
${{ needs.deploy.outputs.rollback_status == 'success' && (needs.deploy.outputs.rollback_healthy_stacks != '' || needs.deploy.outputs.rollback_degraded_stacks != '' || needs.deploy.outputs.rollback_failed_stacks != '') &&
format('**🏷️ Rollback Stack Status**
{0}{1}{2}',
needs.deploy.outputs.rollback_healthy_stacks != '' && format('✅ {0}
', needs.deploy.outputs.rollback_healthy_stacks) || '',
needs.deploy.outputs.rollback_degraded_stacks != '' && format('⚠️ {0}
', needs.deploy.outputs.rollback_degraded_stacks) || '',
needs.deploy.outputs.rollback_failed_stacks != '' && format('❌ {0}', needs.deploy.outputs.rollback_failed_stacks) || '') || '' }}
${{ needs.deploy.outputs.deployment_needed == 'true' && format('**🔄 Pipeline Status**
{0} Deploy → {1} Health → {2} Cleanup{3}',
needs.deploy.outputs.deploy_status == 'success' && '✅' || '❌',
needs.deploy.outputs.health_status == 'success' && '✅' || needs.deploy.outputs.health_status == 'skipped' && '⏭️' || '❌',
needs.deploy.outputs.cleanup_status == 'success' && '✅' || needs.deploy.outputs.cleanup_status == 'skipped' && '⏭️' || '❌',
needs.deploy.outputs.rollback_status != 'skipped' && format(' → {0} Rollback{1}',
needs.deploy.outputs.rollback_status == 'success' && '✅' || '❌',
needs.deploy.outputs.rollback_health_status == 'success' && ' → ✅ Verify' ||
needs.deploy.outputs.rollback_health_status == 'failure' && ' → ❌ Verify' || '') || '') || '' }}
${{ github.event_name == 'workflow_dispatch' && '🔧 **Triggered manually**' || format('📝 **Commit:** [`{0}`](https://github.com/{1}/commit/{2}) {3}', steps.commit-msg.outputs.short-sha, github.repository, inputs.target-ref, steps.commit-msg.outputs.message) }}
**⏱️ Duration:** ${{ github.event_name != 'workflow_dispatch' && '3min' || 'Manual' }}
color: ${{ needs.deploy.result == 'success' && needs.deploy.outputs.deployment_needed != 'true' && 0x6c757d ||
needs.deploy.outputs.deploy_status == 'success' && needs.deploy.outputs.health_status == 'success' && 0x28a745 ||
needs.deploy.outputs.rollback_status == 'success' && 0xffc107 ||
needs.deploy.outputs.degraded_stacks != '' && 0xfd7e14 || 0xdc3545 }}
username: "Compose Deploy"
avatar_url: "https://cdn-icons-png.flaticon.com/512/919/919853.png"
- name: Unload Discord webhook
uses: 1password/load-secrets-action@8d0d610af187e78a2772c2d18d627f4c52d3fbfb # v3.1.0
with:
unset-previous: true