diff --git a/.github/workflows/functional.yml b/.github/workflows/functional.yml index 63e8e4b1..fc734e72 100644 --- a/.github/workflows/functional.yml +++ b/.github/workflows/functional.yml @@ -289,3 +289,85 @@ jobs: env: PG_IMAGE: postgres:${{ matrix.pg_version }} run: docker compose down -v --remove-orphans 2>/dev/null || true + + functional-raft: + runs-on: ubuntu-latest + timeout-minutes: 30 + needs: build + + steps: + - uses: actions/checkout@v4 + + - name: Download orchestrator binary + uses: actions/download-artifact@v4 + with: + name: orchestrator-binary + path: bin + + - name: Make binary executable + run: chmod +x bin/orchestrator + + - name: Start MySQL infrastructure + working-directory: tests/functional + run: | + docker compose up -d mysql1 mysql2 mysql3 + echo "Waiting for MySQL to be healthy..." + timeout 120 bash -c ' + while true; do + HEALTHY=$(docker compose ps --format json 2>/dev/null | python3 -c " + import json, sys + healthy = 0 + for line in sys.stdin: + svc = json.loads(line) + if \"healthy\" in svc.get(\"Status\",\"\").lower(): + healthy += 1 + print(healthy) + " 2>/dev/null || echo "0") + if [ "$HEALTHY" -ge 3 ]; then + echo "All 3 MySQL services healthy" + exit 0 + fi + sleep 2 + done + ' || { echo "Timeout"; docker compose ps; docker compose logs --tail=30; exit 1; } + + - name: Setup replication + run: bash tests/functional/setup-replication.sh + + - name: Run Raft consensus tests + run: bash tests/functional/test-raft.sh + + - name: Collect Raft orchestrator logs + if: always() + working-directory: tests/functional + run: | + docker compose logs orchestrator-raft1 > /tmp/orchestrator-raft1.log 2>&1 || true + docker compose logs orchestrator-raft2 > /tmp/orchestrator-raft2.log 2>&1 || true + docker compose logs orchestrator-raft3 > /tmp/orchestrator-raft3.log 2>&1 || true + + - name: Upload Raft orchestrator logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: orchestrator-raft-logs + path: | + 
/tmp/orchestrator-raft1.log + /tmp/orchestrator-raft2.log + /tmp/orchestrator-raft3.log + + - name: Collect all docker logs on failure + if: failure() + working-directory: tests/functional + run: docker compose logs > /tmp/docker-compose-raft-logs.txt 2>&1 || true + + - name: Upload docker logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: docker-compose-raft-logs + path: /tmp/docker-compose-raft-logs.txt + + - name: Cleanup + if: always() + working-directory: tests/functional + run: docker compose down -v --remove-orphans 2>/dev/null || true diff --git a/go/raft/rel_store.go b/go/raft/rel_store.go index b9c25ee3..881063f1 100644 --- a/go/raft/rel_store.go +++ b/go/raft/rel_store.go @@ -36,7 +36,7 @@ var createQueries = []string{ log_index integer, term bigint not null, log_type int not null, - data blob not null, + data blob, PRIMARY KEY (log_index) ) `, @@ -44,7 +44,7 @@ var createQueries = []string{ CREATE TABLE IF NOT EXISTS raft_store ( store_id integer, store_key varbinary(512) not null, - store_value blob not null, + store_value blob, PRIMARY KEY (store_id) ) `, diff --git a/tests/functional/docker-compose.yml b/tests/functional/docker-compose.yml index 3bceac1a..1da3871f 100644 --- a/tests/functional/docker-compose.yml +++ b/tests/functional/docker-compose.yml @@ -205,6 +205,81 @@ services: aliases: - orchestrator-pg + orchestrator-raft1: + image: ubuntu:24.04 + hostname: orchestrator-raft1 + volumes: + - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro + - ../../resources:/orchestrator/resources:ro + - ./orchestrator-raft1.conf.json:/orchestrator/orchestrator.conf.json:ro + command: > + bash -c " + apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 && + mkdir -p /tmp/raft1 && + cd /orchestrator && + orchestrator -config orchestrator.conf.json http + " + ports: + - "3100:3099" + depends_on: + mysql1: + condition: service_healthy + networks: + orchnet: + ipv4_address: 172.30.0.40 + aliases: + - 
orchestrator-raft1 + + orchestrator-raft2: + image: ubuntu:24.04 + hostname: orchestrator-raft2 + volumes: + - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro + - ../../resources:/orchestrator/resources:ro + - ./orchestrator-raft2.conf.json:/orchestrator/orchestrator.conf.json:ro + command: > + bash -c " + apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 && + mkdir -p /tmp/raft2 && + cd /orchestrator && + orchestrator -config orchestrator.conf.json http + " + ports: + - "3101:3099" + depends_on: + mysql1: + condition: service_healthy + networks: + orchnet: + ipv4_address: 172.30.0.41 + aliases: + - orchestrator-raft2 + + orchestrator-raft3: + image: ubuntu:24.04 + hostname: orchestrator-raft3 + volumes: + - ../../bin/orchestrator:/usr/local/bin/orchestrator:ro + - ../../resources:/orchestrator/resources:ro + - ./orchestrator-raft3.conf.json:/orchestrator/orchestrator.conf.json:ro + command: > + bash -c " + apt-get update -qq && apt-get install -y -qq curl sqlite3 > /dev/null 2>&1 && + mkdir -p /tmp/raft3 && + cd /orchestrator && + orchestrator -config orchestrator.conf.json http + " + ports: + - "3102:3099" + depends_on: + mysql1: + condition: service_healthy + networks: + orchnet: + ipv4_address: 172.30.0.42 + aliases: + - orchestrator-raft3 + networks: orchnet: driver: bridge diff --git a/tests/functional/orchestrator-raft1.conf.json b/tests/functional/orchestrator-raft1.conf.json new file mode 100644 index 00000000..9871d3e9 --- /dev/null +++ b/tests/functional/orchestrator-raft1.conf.json @@ -0,0 +1,25 @@ +{ + "Debug": true, + "ListenAddress": ":3099", + "HTTPAdvertise": "http://172.30.0.40:3099", + "MySQLTopologyUser": "orchestrator", + "MySQLTopologyPassword": "orch_pass", + "MySQLOrchestratorHost": "", + "MySQLOrchestratorPort": 0, + "BackendDB": "sqlite", + "SQLite3DataFile": "/tmp/raft1/orchestrator.sqlite3", + "DiscoverByShowSlaveHosts": false, + "InstancePollSeconds": 5, + "RecoveryPeriodBlockSeconds": 10, + 
"RecoverMasterClusterFilters": [".*"], + "RecoverIntermediateMasterClusterFilters": [".*"], + "AutoPseudoGTID": false, + "DetectClusterAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "DetectInstanceAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "PrometheusEnabled": false, + "RaftEnabled": true, + "RaftDataDir": "/tmp/raft1", + "RaftBind": "172.30.0.40", + "DefaultRaftPort": 10008, + "RaftNodes": ["172.30.0.40", "172.30.0.41", "172.30.0.42"] +} diff --git a/tests/functional/orchestrator-raft2.conf.json b/tests/functional/orchestrator-raft2.conf.json new file mode 100644 index 00000000..51fbd241 --- /dev/null +++ b/tests/functional/orchestrator-raft2.conf.json @@ -0,0 +1,25 @@ +{ + "Debug": true, + "ListenAddress": ":3099", + "HTTPAdvertise": "http://172.30.0.41:3099", + "MySQLTopologyUser": "orchestrator", + "MySQLTopologyPassword": "orch_pass", + "MySQLOrchestratorHost": "", + "MySQLOrchestratorPort": 0, + "BackendDB": "sqlite", + "SQLite3DataFile": "/tmp/raft2/orchestrator.sqlite3", + "DiscoverByShowSlaveHosts": false, + "InstancePollSeconds": 5, + "RecoveryPeriodBlockSeconds": 10, + "RecoverMasterClusterFilters": [".*"], + "RecoverIntermediateMasterClusterFilters": [".*"], + "AutoPseudoGTID": false, + "DetectClusterAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "DetectInstanceAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "PrometheusEnabled": false, + "RaftEnabled": true, + "RaftDataDir": "/tmp/raft2", + "RaftBind": "172.30.0.41", + "DefaultRaftPort": 10008, + "RaftNodes": ["172.30.0.40", "172.30.0.41", "172.30.0.42"] +} diff --git a/tests/functional/orchestrator-raft3.conf.json b/tests/functional/orchestrator-raft3.conf.json new file mode 100644 index 00000000..3a93ad29 --- /dev/null +++ b/tests/functional/orchestrator-raft3.conf.json @@ -0,0 +1,25 @@ +{ + "Debug": true, + "ListenAddress": ":3099", + "HTTPAdvertise": "http://172.30.0.42:3099", + "MySQLTopologyUser": "orchestrator", + "MySQLTopologyPassword": "orch_pass", + 
"MySQLOrchestratorHost": "", + "MySQLOrchestratorPort": 0, + "BackendDB": "sqlite", + "SQLite3DataFile": "/tmp/raft3/orchestrator.sqlite3", + "DiscoverByShowSlaveHosts": false, + "InstancePollSeconds": 5, + "RecoveryPeriodBlockSeconds": 10, + "RecoverMasterClusterFilters": [".*"], + "RecoverIntermediateMasterClusterFilters": [".*"], + "AutoPseudoGTID": false, + "DetectClusterAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "DetectInstanceAliasQuery": "SELECT CONCAT(@@hostname, ':', @@port)", + "PrometheusEnabled": false, + "RaftEnabled": true, + "RaftDataDir": "/tmp/raft3", + "RaftBind": "172.30.0.42", + "DefaultRaftPort": 10008, + "RaftNodes": ["172.30.0.40", "172.30.0.41", "172.30.0.42"] +} diff --git a/tests/functional/test-raft.sh b/tests/functional/test-raft.sh new file mode 100755 index 00000000..2af3e895 --- /dev/null +++ b/tests/functional/test-raft.sh @@ -0,0 +1,350 @@ +#!/bin/bash +# Raft consensus tests -- verify leader election, failover, and follower redirect +set -uo pipefail +cd "$(dirname "$0")/../.." +source tests/functional/lib.sh + +echo "=== RAFT CONSENSUS TESTS ===" + +# Port mapping: raft1->3100, raft2->3101, raft3->3102 +RAFT_PORTS=(3100 3101 3102) +RAFT_NODES=(orchestrator-raft1 orchestrator-raft2 orchestrator-raft3) +COMPOSE_FILE="tests/functional/docker-compose.yml" + +# ============================================================ +# Phase 1: Cluster Formation & Leader Election +# ============================================================ +echo "" +echo "--- Phase 1: Cluster Formation & Leader Election ---" + +# Start node 1 first to let it bootstrap the cluster before other nodes join. +# Starting all 3 simultaneously causes each to call BootstrapCluster independently, +# creating conflicting initial states and perpetual election cycles. +echo "Starting first Raft node (bootstrap node)..." 
+docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft1 + +# Wait for node 1 to be reachable (includes apt-get install time) +echo "Waiting for bootstrap node to be ready (up to 90s)..." +BOOTSTRAP_READY=false +for i in $(seq 1 90); do + if curl -sf --max-time 5 "http://localhost:3100/api/raft-status" > /dev/null 2>&1; then + BOOTSTRAP_READY=true + echo "Bootstrap node ready after ${i}s" + break + fi + sleep 1 +done + +if ! $BOOTSTRAP_READY; then + fail "Bootstrap Raft node (orchestrator-raft1) not ready within 90s" + docker compose -f "$COMPOSE_FILE" logs orchestrator-raft1 2>/dev/null | tail -30 + summary +fi +pass "Bootstrap Raft node started and ready" + +# Now start the remaining nodes — they will find the bootstrapped cluster +echo "Starting remaining Raft nodes..." +docker compose -f "$COMPOSE_FILE" up -d orchestrator-raft2 orchestrator-raft3 + +# Wait for all 3 nodes to be reachable and for a leader to be elected +echo "Waiting for Raft cluster to form and elect a leader (up to 90s)..." +LEADER="" +for i in $(seq 1 90); do + ALL_UP=true + for port in "${RAFT_PORTS[@]}"; do + if ! 
curl -sf --max-time 10 "http://localhost:${port}/api/raft-leader" > /dev/null 2>&1; then + ALL_UP=false + break + fi + done + if $ALL_UP; then + # Check if all nodes agree on a leader + LEADER1=$(curl -sf --max-time 10 "http://localhost:3100/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + LEADER2=$(curl -sf --max-time 10 "http://localhost:3101/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + LEADER3=$(curl -sf --max-time 10 "http://localhost:3102/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ -n "$LEADER1" ] && [ "$LEADER1" = "$LEADER2" ] && [ "$LEADER2" = "$LEADER3" ]; then + LEADER="$LEADER1" + echo "Leader elected: $LEADER (after ${i}s)" + break + fi + fi + sleep 1 +done + +if [ -n "$LEADER" ]; then + pass "Raft leader elected: $LEADER" +else + fail "Raft leader not elected within 90s" + # Print debug info + for port in "${RAFT_PORTS[@]}"; do + echo " Node :${port} raft-status: $(curl -sf --max-time 10 http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')" + done +fi + +# Verify all nodes agree on the same leader +LEADERS_AGREE=true +for port in "${RAFT_PORTS[@]}"; do + NODE_LEADER=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ "$NODE_LEADER" != "$LEADER" ]; then + LEADERS_AGREE=false + break + fi +done +if $LEADERS_AGREE && [ -n "$LEADER" ]; then + pass "All 3 nodes agree on the same leader" +else + fail "Nodes do not agree on the leader" +fi + +# Verify exactly one node reports itself as Leader state +LEADER_COUNT=0 +for port in "${RAFT_PORTS[@]}"; do + STATE=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo 
"") + if [ "$STATE" = "Leader" ]; then + ((LEADER_COUNT++)) + fi +done +if [ "$LEADER_COUNT" -eq 1 ]; then + pass "Exactly one node is in Leader state" +else + fail "Expected 1 leader, found $LEADER_COUNT" +fi + +# ============================================================ +# Phase 2: Leader Serves Topology +# ============================================================ +echo "" +echo "--- Phase 2: Leader Serves Topology ---" + +# Determine leader port (map leader IP to host port) +LEADER_PORT="" +LEADER_INDEX="" +for idx in 0 1 2; do + STATE=$(curl -sf --max-time 10 "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ "$STATE" = "Leader" ]; then + LEADER_PORT="${RAFT_PORTS[$idx]}" + LEADER_INDEX=$idx + break + fi +done + +if [ -z "$LEADER_PORT" ]; then + fail "Could not identify leader port" +else + echo "Leader is on localhost:${LEADER_PORT} (${RAFT_NODES[$LEADER_INDEX]})" + + # Discover MySQL topology through the leader + echo "Discovering MySQL topology through the leader..." + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1 + + # Wait for topology discovery + echo "Waiting for topology discovery (up to 60s)..." 
+ CLUSTER_FOUND=false + for i in $(seq 1 60); do + CLUSTERS=$(curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]") + COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0") + if [ "$COUNT" -ge 1 ]; then + echo "Cluster discovered after ${i}s" + CLUSTER_FOUND=true + break + fi + # Re-seed discovery periodically + if [ "$((i % 10))" = "0" ]; then + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql1/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql2/3306" > /dev/null 2>&1 + curl -sf --max-time 10 "http://localhost:${LEADER_PORT}/api/discover/mysql3/3306" > /dev/null 2>&1 + fi + sleep 1 + done + + if $CLUSTER_FOUND; then + pass "Leader serves cluster data via /api/clusters" + else + fail "Leader did not return cluster data within 60s" + fi + + # Verify followers can also return cluster data (Raft replicates state) + FOLLOWER_HAS_DATA=true + for idx in 0 1 2; do + if [ "$idx" = "$LEADER_INDEX" ]; then + continue + fi + FPORT="${RAFT_PORTS[$idx]}" + # Followers may redirect or serve data directly; either is valid + FCLUSTERS=$(curl -sf --max-time 10 "http://localhost:${FPORT}/api/clusters" 2>/dev/null || echo "[]") + FCOUNT=$(echo "$FCLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0") + if [ "$FCOUNT" -lt 1 ]; then + FOLLOWER_HAS_DATA=false + echo " Follower on :${FPORT} returned $FCOUNT clusters" + fi + done + if $FOLLOWER_HAS_DATA; then + pass "Followers serve cluster data (Raft state replicated)" + else + # This is not necessarily a failure -- followers may need more time + skip "Some followers do not yet serve cluster data (may need more replication time)" + fi +fi + + # ============================================================ + # Phase 3: Leader Failure & Re-election + # ============================================================ 
+echo "" +echo "--- Phase 3: Leader Failure & Re-election ---" + +OLD_LEADER="$LEADER" +OLD_LEADER_NODE="" +if [ -n "$LEADER_INDEX" ]; then + OLD_LEADER_NODE="${RAFT_NODES[$LEADER_INDEX]}" +fi + +if [ -z "$OLD_LEADER_NODE" ]; then + fail "Cannot test leader failure: no leader identified" +else + echo "Stopping leader node: $OLD_LEADER_NODE" + docker compose -f "$COMPOSE_FILE" stop "$OLD_LEADER_NODE" + + # Determine which nodes are still running + REMAINING_PORTS=() + REMAINING_INDICES=() + for idx in 0 1 2; do + if [ "$idx" != "$LEADER_INDEX" ]; then + REMAINING_PORTS+=("${RAFT_PORTS[$idx]}") + REMAINING_INDICES+=("$idx") + fi + done + + # Wait for re-election + echo "Waiting for re-election (up to 60s)..." + NEW_LEADER="" + for i in $(seq 1 60); do + L1=$(curl -sf --max-time 10 "http://localhost:${REMAINING_PORTS[0]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + L2=$(curl -sf --max-time 10 "http://localhost:${REMAINING_PORTS[1]}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ -n "$L1" ] && [ "$L1" = "$L2" ] && [ "$L1" != "$OLD_LEADER" ]; then + NEW_LEADER="$L1" + echo "New leader elected: $NEW_LEADER (after ${i}s)" + break + fi + sleep 1 + done + + if [ -n "$NEW_LEADER" ]; then + pass "New leader elected after stopping old leader: $NEW_LEADER" + else + fail "No new leader elected within 60s" + for port in "${REMAINING_PORTS[@]}"; do + echo " Node :${port} status: $(curl -sf --max-time 10 http://localhost:${port}/api/raft-status 2>/dev/null || echo 'unreachable')" + done + fi + + # Verify new leader is different from old + if [ -n "$NEW_LEADER" ] && [ "$NEW_LEADER" != "$OLD_LEADER" ]; then + pass "New leader is different from old leader" + elif [ -n "$NEW_LEADER" ]; then + fail "New leader is the same as old leader (should not happen)" + fi + + # Verify new leader can serve API requests + if [ -n "$NEW_LEADER" ]; then + 
NEW_LEADER_PORT="" + for idx in "${REMAINING_INDICES[@]}"; do + STATE=$(curl -sf --max-time 10 "http://localhost:${RAFT_PORTS[$idx]}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ "$STATE" = "Leader" ]; then + NEW_LEADER_PORT="${RAFT_PORTS[$idx]}" + break + fi + done + if [ -n "$NEW_LEADER_PORT" ]; then + CLUSTERS=$(curl -sf --max-time 10 "http://localhost:${NEW_LEADER_PORT}/api/clusters" 2>/dev/null || echo "[]") + COUNT=$(echo "$CLUSTERS" | python3 -c "import json,sys; c=json.load(sys.stdin); print(len(c))" 2>/dev/null || echo "0") + if [ "$COUNT" -ge 1 ]; then + pass "New leader serves cluster data via API" + else + skip "New leader returned 0 clusters (state may not have fully replicated yet)" + fi + fi + fi + + # ============================================================ + # Phase 4: Node Rejoin + # ============================================================ + echo "" + echo "--- Phase 4: Node Rejoin ---" + + echo "Restarting stopped node: $OLD_LEADER_NODE" + docker compose -f "$COMPOSE_FILE" start "$OLD_LEADER_NODE" + + # Wait for the restarted node to rejoin + RESTARTED_PORT="${RAFT_PORTS[$LEADER_INDEX]}" + echo "Waiting for restarted node (:${RESTARTED_PORT}) to rejoin (up to 60s)..." 
+ REJOINED=false + for i in $(seq 1 60); do + RLEADER=$(curl -sf --max-time 10 "http://localhost:${RESTARTED_PORT}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ -n "$RLEADER" ] && [ "$RLEADER" = "$NEW_LEADER" ]; then + echo "Node rejoined after ${i}s" + REJOINED=true + break + fi + sleep 1 + done + + if $REJOINED; then + pass "Restarted node rejoined the cluster" + else + fail "Restarted node did not rejoin within 60s" + fi + + # Verify the restarted node is a follower (not a new leader) + RSTATE=$(curl -sf --max-time 10 "http://localhost:${RESTARTED_PORT}/api/raft-state" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ "$RSTATE" = "Follower" ]; then + pass "Restarted node is a Follower (stable leader)" + elif [ "$RSTATE" = "Leader" ]; then + # Leadership may have shifted -- still valid if all agree + skip "Restarted node became Leader (leadership may have shifted)" + else + fail "Restarted node in unexpected state: $RSTATE" + fi + + # Verify all 3 nodes agree on the current leader + ALL_AGREE=true + CURRENT_LEADER="" + for port in "${RAFT_PORTS[@]}"; do + NL=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-leader" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin))" 2>/dev/null || echo "") + if [ -z "$CURRENT_LEADER" ]; then + CURRENT_LEADER="$NL" + elif [ "$NL" != "$CURRENT_LEADER" ]; then + ALL_AGREE=false + fi + done + if $ALL_AGREE && [ -n "$CURRENT_LEADER" ]; then + pass "All 3 nodes agree on current leader after rejoin: $CURRENT_LEADER" + else + fail "Nodes do not agree on leader after rejoin" + fi + + # Verify cluster is healthy (all 3 nodes report healthy) + HEALTHY_COUNT=0 + for port in "${RAFT_PORTS[@]}"; do + HEALTH=$(curl -sf --max-time 10 "http://localhost:${port}/api/raft-health" 2>/dev/null || echo "") + if echo "$HEALTH" | grep -q "healthy"; then + ((HEALTHY_COUNT++)) + fi + done + if [ 
"$HEALTHY_COUNT" -eq 3 ]; then + pass "All 3 nodes report healthy" + else + skip "Only $HEALTHY_COUNT/3 nodes report healthy (may need more time)" + fi +fi + +# ============================================================ +# Cleanup +# ============================================================ +echo "" +echo "--- Cleanup ---" +docker compose -f "$COMPOSE_FILE" stop orchestrator-raft1 orchestrator-raft2 orchestrator-raft3 2>/dev/null || true +echo "Raft containers stopped." + +summary