From 84ef389a5e2d17b64ec7d32e1b9b2d5ff0865c4b Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 06:23:25 +0000 Subject: [PATCH 1/9] Add behavioral validation tests: topology, problems, operations (#88, #89, #90) Add three new test scripts that verify orchestrator actually works correctly, not just that API endpoints return 200: - test-topology-discovery.sh: validates orchestrator's view of the topology matches actual MySQL state (master/replica roles, read_only, GTID tracking, replication threads, instance freshness) - test-problem-detection.sh: verifies orchestrator detects and clears replication problems (stopped replication, writable replica, errant GTID) by inducing problems and checking the API response - test-topology-operations.sh: tests actual topology refactoring operations (relocate, move-up, repoint, set-read-only/writeable) and cross-references results with direct MySQL queries --- .github/workflows/functional.yml | 9 + tests/functional/test-problem-detection.sh | 197 ++++++++++++++++ tests/functional/test-topology-discovery.sh | 225 +++++++++++++++++++ tests/functional/test-topology-operations.sh | 180 +++++++++++++++ 4 files changed, 611 insertions(+) create mode 100755 tests/functional/test-problem-detection.sh create mode 100755 tests/functional/test-topology-discovery.sh create mode 100755 tests/functional/test-topology-operations.sh diff --git a/.github/workflows/functional.yml b/.github/workflows/functional.yml index 29f8343b..8ea3e1ee 100644 --- a/.github/workflows/functional.yml +++ b/.github/workflows/functional.yml @@ -107,6 +107,15 @@ jobs: - name: Run smoke tests run: bash tests/functional/test-smoke.sh + - name: Run topology discovery validation + run: bash tests/functional/test-topology-discovery.sh + + - name: Run problem detection tests + run: bash tests/functional/test-problem-detection.sh + + - name: Run topology operations tests + run: bash tests/functional/test-topology-operations.sh + - name: Run regression tests 
run: bash tests/functional/test-regression.sh diff --git a/tests/functional/test-problem-detection.sh b/tests/functional/test-problem-detection.sh new file mode 100755 index 00000000..0adfe263 --- /dev/null +++ b/tests/functional/test-problem-detection.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# Problem detection tests — verify orchestrator detects and clears +# replication problems correctly +set -uo pipefail # no -e: we handle failures ourselves +cd "$(dirname "$0")/../.." +source tests/functional/lib.sh + +echo "=== PROBLEM DETECTION TESTS ===" + +COMPOSE="docker compose -f tests/functional/docker-compose.yml" +STOP_SQL=$(mysql_stop_replica_sql) +START_SQL=$(mysql_start_replica_sql) + +wait_for_orchestrator || { echo "FATAL: Orchestrator not reachable"; exit 1; } +discover_topology "mysql1" + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 1: Detect stopped replication ---" + +# Stop replication on mysql2 +echo "Stopping replication on mysql2..." +$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "$STOP_SQL" 2>/dev/null + +# Wait for orchestrator to detect the problem (poll up to 30s) +echo "Waiting for orchestrator to detect stopped replication..." 
+DETECTED=false +for i in $(seq 1 30); do + PROBLEMS=$(curl -s --max-time 10 "$ORC_URL/api/problems" 2>/dev/null) + if echo "$PROBLEMS" | python3 -c " +import json, sys +problems = json.load(sys.stdin) +for p in problems: + h = p.get('Key', {}).get('Hostname', '') + if 'mysql2' in h: + sys.exit(0) +sys.exit(1) +" 2>/dev/null; then + DETECTED=true + echo "Problem detected after ${i}s" + break + fi + sleep 1 +done + +if [ "$DETECTED" = "true" ]; then + pass "Orchestrator detected stopped replication on mysql2" +else + fail "Orchestrator did not detect stopped replication on mysql2 within 30s" +fi + +# Verify the specific problem shows replication threads stopped +REPL_STATE=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql2/3306" 2>/dev/null | python3 -c " +import json, sys +inst = json.load(sys.stdin) +sql = inst.get('ReplicationSQLThreadRuning', 'unknown') +io = inst.get('ReplicationIOThreadRuning', 'unknown') +print(f'SQL={sql},IO={io}') +" 2>/dev/null || echo "unknown") + +if echo "$REPL_STATE" | grep -q "SQL=False"; then + pass "Orchestrator reports SQL thread stopped on mysql2" +else + fail "Orchestrator replication state for mysql2: $REPL_STATE" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 1b: Clear stopped replication ---" + +# Restart replication +echo "Restarting replication on mysql2..." +$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "$START_SQL" 2>/dev/null + +# Wait for orchestrator to see it clear +echo "Waiting for problem to clear..." 
+CLEARED=false +for i in $(seq 1 30); do + PROBLEMS=$(curl -s --max-time 10 "$ORC_URL/api/problems" 2>/dev/null) + HAS_MYSQL2=$(echo "$PROBLEMS" | python3 -c " +import json, sys +problems = json.load(sys.stdin) +for p in problems: + h = p.get('Key', {}).get('Hostname', '') + if 'mysql2' in h: + sys.exit(0) +sys.exit(1) +" 2>/dev/null && echo "yes" || echo "no") + if [ "$HAS_MYSQL2" = "no" ]; then + CLEARED=true + echo "Problem cleared after ${i}s" + break + fi + sleep 1 +done + +if [ "$CLEARED" = "true" ]; then + pass "Stopped replication problem cleared after restart" +else + fail "Stopped replication problem still reported after 30s" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 2: Detect read_only mismatch (writable replica) ---" + +# Make mysql2 writable (it should be read-only as a replica) +echo "Setting mysql2 read_only=0 (simulating writable replica)..." +$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "SET GLOBAL read_only=0" 2>/dev/null + +# Wait for orchestrator to detect the problem +echo "Waiting for orchestrator to detect writable replica..." +DETECTED=false +for i in $(seq 1 30); do + INST=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql2/3306" 2>/dev/null) + IS_RO=$(echo "$INST" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ReadOnly', True))" 2>/dev/null || echo "True") + if [ "$IS_RO" = "False" ]; then + DETECTED=true + echo "Writable replica detected after ${i}s" + break + fi + sleep 1 +done + +if [ "$DETECTED" = "true" ]; then + pass "Orchestrator detected mysql2 is writable (read_only=false while replicating)" +else + fail "Orchestrator did not detect writable replica within 30s" +fi + +# Restore read_only +echo "Restoring read_only=1 on mysql2..." 
+$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "SET GLOBAL read_only=1" 2>/dev/null + +# Wait for it to clear +CLEARED=false +for i in $(seq 1 15); do + IS_RO=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql2/3306" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin).get('ReadOnly', False))" 2>/dev/null || echo "False") + if [ "$IS_RO" = "True" ]; then + CLEARED=true + break + fi + sleep 1 +done + +if [ "$CLEARED" = "true" ]; then + pass "Writable replica problem cleared after restoring read_only" +else + fail "read_only still reported as false after 15s" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 3: Detect errant GTID ---" + +# Inject an errant transaction on mysql3 +echo "Injecting errant transaction on mysql3..." +$COMPOSE exec -T mysql3 mysql -uroot -ptestpass -e " + SET GLOBAL read_only=0; + SET GLOBAL super_read_only=0; + CREATE DATABASE IF NOT EXISTS errant_detect_test; + SET GLOBAL read_only=1; + SET GLOBAL super_read_only=1; +" 2>/dev/null + +# Wait for orchestrator to detect errant GTID +echo "Waiting for orchestrator to detect errant GTID..." 
+DETECTED=false +for i in $(seq 1 30); do + GTID_ERRANT=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql3/3306" 2>/dev/null | python3 -c " +import json, sys +inst = json.load(sys.stdin) +errant = inst.get('GtidErrant', '') +print(errant if errant else '') +" 2>/dev/null || echo "") + if [ -n "$GTID_ERRANT" ]; then + DETECTED=true + echo "Errant GTID detected after ${i}s: $GTID_ERRANT" + break + fi + sleep 1 +done + +if [ "$DETECTED" = "true" ]; then + pass "Orchestrator detected errant GTID on mysql3" +else + fail "Orchestrator did not detect errant GTID within 30s" +fi + +# Cleanup errant DB (GTID remains but that's OK) +$COMPOSE exec -T mysql3 mysql -uroot -ptestpass -e " + SET GLOBAL read_only=0; + DROP DATABASE IF EXISTS errant_detect_test; + SET GLOBAL read_only=1; +" 2>/dev/null + +summary diff --git a/tests/functional/test-topology-discovery.sh b/tests/functional/test-topology-discovery.sh new file mode 100755 index 00000000..f9324766 --- /dev/null +++ b/tests/functional/test-topology-discovery.sh @@ -0,0 +1,225 @@ +#!/bin/bash +# Topology discovery validation — verify orchestrator correctly discovers +# and represents the actual replication state +set -uo pipefail # no -e: we handle failures ourselves +cd "$(dirname "$0")/../.." 
+source tests/functional/lib.sh + +echo "=== TOPOLOGY DISCOVERY VALIDATION TESTS ===" + +COMPOSE="docker compose -f tests/functional/docker-compose.yml" + +wait_for_orchestrator || { echo "FATAL: Orchestrator not reachable"; exit 1; } +discover_topology "mysql1" + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 1: Correct instance count ---" + +INSTANCES=$(curl -s --max-time 10 "$ORC_URL/api/cluster/$CLUSTER_NAME" 2>/dev/null) +INST_COUNT=$(echo "$INSTANCES" | python3 -c "import json,sys; print(len(json.load(sys.stdin)))" 2>/dev/null || echo "0") + +if [ "$INST_COUNT" -eq 3 ]; then + pass "Cluster has exactly 3 instances" +else + fail "Cluster has $INST_COUNT instances (expected 3)" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 2: Master identification ---" + +# Get orchestrator's view of the master +MASTER_INFO=$(echo "$INSTANCES" | python3 -c " +import json, sys +instances = json.load(sys.stdin) +for inst in instances: + if not inst.get('ReadOnly', True): + print(json.dumps(inst)) + break +" 2>/dev/null) + +ORC_MASTER_HOST=$(echo "$MASTER_INFO" | python3 -c "import json,sys; print(json.load(sys.stdin)['Key']['Hostname'])" 2>/dev/null || echo "") +ORC_MASTER_RO=$(echo "$MASTER_INFO" | python3 -c "import json,sys; print(json.load(sys.stdin)['ReadOnly'])" 2>/dev/null || echo "") + +# Cross-reference with direct MySQL query +ACTUAL_RO=$($COMPOSE exec -T mysql1 mysql -uroot -ptestpass -Nse "SELECT @@read_only" 2>/dev/null | tr -d '[:space:]') + +if [ -n "$ORC_MASTER_HOST" ]; then + pass "Master identified: $ORC_MASTER_HOST" +else + fail "No master found in orchestrator data" +fi + +if [ "$ORC_MASTER_RO" = "False" ]; then + pass "Orchestrator reports master read_only=false" +else + fail "Orchestrator reports master read_only=$ORC_MASTER_RO (expected False)" +fi + +if [ "$ACTUAL_RO" = "0" ]; then + pass "Direct MySQL confirms mysql1 read_only=0" +else + fail "Direct MySQL: 
mysql1 read_only=$ACTUAL_RO (expected 0)" +fi + +# Verify master has no master key (is top of topology) +MASTER_MASTER=$(echo "$MASTER_INFO" | python3 -c " +import json, sys +inst = json.load(sys.stdin) +mk = inst.get('MasterKey', {}) +h = mk.get('Hostname', '') +p = mk.get('Port', 0) +# Empty or port 0 means no master +if not h or p == 0: + print('none') +else: + print(f'{h}:{p}') +" 2>/dev/null || echo "error") + +if [ "$MASTER_MASTER" = "none" ]; then + pass "Master has no upstream master (top of topology)" +else + # Self-reference is acceptable; an empty host must not match (empty grep pattern matches anything) + if [ -n "$ORC_MASTER_HOST" ] && echo "$MASTER_MASTER" | grep -qF -- "$ORC_MASTER_HOST"; then + pass "Master's MasterKey points to itself (acceptable)" + else + fail "Master has unexpected upstream: $MASTER_MASTER" + fi +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 3: Replica identification ---" + +# Get replica info from orchestrator +REPLICAS=$(echo "$INSTANCES" | python3 -c " +import json, sys +instances = json.load(sys.stdin) +for inst in instances: + if inst.get('ReadOnly', False): + h = inst['Key']['Hostname'] + mk = inst.get('MasterKey', {}).get('Hostname', '') + ro = inst.get('ReadOnly', False) + repl_running = inst.get('ReplicationSQLThreadRuning', False) and inst.get('ReplicationIOThreadRuning', False) + print(f'{h}|{mk}|{ro}|{repl_running}') +" 2>/dev/null) + +REPLICA_COUNT=$(echo "$REPLICAS" | grep -c '|' || true) +if [ "$REPLICA_COUNT" -ge 2 ]; then + pass "Found $REPLICA_COUNT replicas" +else + fail "Found $REPLICA_COUNT replicas (expected >= 2)" +fi + +# Verify each replica's master key points to actual master +while IFS='|' read -r RHOST RMASTER RRO RRUNNING; do + [ -z "$RHOST" ] && continue + + if [ "$RMASTER" = "$ORC_MASTER_HOST" ]; then + pass "Replica $RHOST: master=$RMASTER (correct)" + else + fail "Replica $RHOST: master=$RMASTER (expected $ORC_MASTER_HOST)" + fi + + if [ "$RRO" = "True" ]; then + pass "Replica $RHOST: 
read_only=true" + else + fail "Replica $RHOST: read_only=$RRO (expected True)" + fi + + if [ "$RRUNNING" = "True" ]; then + pass "Replica $RHOST: replication threads running" + else + fail "Replica $RHOST: replication threads not running" + fi +done <<< "$REPLICAS" + +# Cross-reference with direct MySQL +for REPLICA in mysql2 mysql3; do + ACTUAL_RO=$($COMPOSE exec -T "$REPLICA" mysql -uroot -ptestpass -Nse "SELECT @@read_only" 2>/dev/null | tr -d '[:space:]') + ACTUAL_SOURCE=$(mysql_source_host "$REPLICA") + if [ "$ACTUAL_RO" = "1" ]; then + pass "Direct MySQL: $REPLICA read_only=1" + else + fail "Direct MySQL: $REPLICA read_only=$ACTUAL_RO (expected 1)" + fi + if echo "$ACTUAL_SOURCE" | grep -qi "mysql1"; then + pass "Direct MySQL: $REPLICA replicates from mysql1" + else + fail "Direct MySQL: $REPLICA replicates from '$ACTUAL_SOURCE' (expected mysql1)" + fi +done + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 4: GTID tracking ---" + +HAS_GTID=$(echo "$INSTANCES" | python3 -c " +import json, sys +instances = json.load(sys.stdin) +for inst in instances: + gtid = inst.get('GtidPurged', '') or inst.get('ExecutedGtidSet', '') + if gtid: + print('yes') + sys.exit(0) +print('no') +" 2>/dev/null || echo "unknown") + +if [ "$HAS_GTID" = "yes" ]; then + pass "GTID data is being tracked by orchestrator" +else + fail "No GTID data found in orchestrator instance data" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 5: Instance freshness ---" + +# Verify LastChecked is recent (within last 120 seconds) +STALE=$(echo "$INSTANCES" | python3 -c " +import json, sys +from datetime import datetime, timezone, timedelta +instances = json.load(sys.stdin) +stale = [] +now = datetime.now(timezone.utc) +for inst in instances: + lc = inst.get('LastChecked', '') + if lc: + # Parse the timestamp (Go format: 2006-01-02T15:04:05Z) + try: + t = datetime.fromisoformat(lc.replace('Z', '+00:00')) + age 
= (now - t).total_seconds() + if age > 120: + stale.append(f\"{inst['Key']['Hostname']}:{age:.0f}s\") + except: + pass +print(','.join(stale) if stale else 'none') +" 2>/dev/null || echo "unknown") + +if [ "$STALE" = "none" ]; then + pass "All instances have recent LastChecked timestamps" +elif [ "$STALE" = "unknown" ]; then + skip "Could not parse LastChecked timestamps" +else + fail "Stale instances: $STALE" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 6: Uptime detection ---" + +ALL_UP=$(echo "$INSTANCES" | python3 -c " +import json, sys +instances = json.load(sys.stdin) +down = [inst['Key']['Hostname'] for inst in instances if inst.get('Uptime', 0) == 0] +print(','.join(down) if down else 'all_up') +" 2>/dev/null || echo "unknown") + +if [ "$ALL_UP" = "all_up" ]; then + pass "All instances report non-zero uptime" +else + fail "Instances with zero uptime: $ALL_UP" +fi + +summary diff --git a/tests/functional/test-topology-operations.sh b/tests/functional/test-topology-operations.sh new file mode 100755 index 00000000..8f57fdbb --- /dev/null +++ b/tests/functional/test-topology-operations.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# Topology refactoring operation tests — verify orchestrator can correctly +# move replicas between masters using GTID-based operations +set -uo pipefail # no -e: we handle failures ourselves +cd "$(dirname "$0")/../.." 
+source tests/functional/lib.sh + +echo "=== TOPOLOGY REFACTORING OPERATIONS TESTS ===" + +COMPOSE="docker compose -f tests/functional/docker-compose.yml" +STOP_SQL=$(mysql_stop_replica_sql) +START_SQL=$(mysql_start_replica_sql) +CHANGE_TO_MYSQL1=$(mysql_change_source_sql mysql1 3306 repl repl_pass) + +wait_for_orchestrator || { echo "FATAL: Orchestrator not reachable"; exit 1; } +discover_topology "mysql1" + +# Helper: poll orchestrator (1s interval, default 30 tries) until INSTANCE's MasterKey hostname matches EXPECTED_SOURCE; returns 0 on match, 1 on timeout +wait_for_source() { + local INSTANCE="$1" EXPECTED_SOURCE="$2" TIMEOUT="${3:-30}" i SOURCE + for i in $(seq 1 "$TIMEOUT"); do + SOURCE=$(curl -s --max-time 10 "$ORC_URL/api/instance/$INSTANCE/3306" 2>/dev/null | python3 -c " +import json, sys +inst = json.load(sys.stdin) +print(inst.get('MasterKey', {}).get('Hostname', '')) +" 2>/dev/null || echo "") + if echo "$SOURCE" | grep -qi "$EXPECTED_SOURCE"; then + return 0 + fi + sleep 1 + done + return 1 +} + +# Helper: restore flat topology (all replicas directly under mysql1) +restore_flat_topology() { + echo "Restoring flat topology..." + for REPLICA in mysql2 mysql3; do + $COMPOSE exec -T "$REPLICA" mysql -uroot -ptestpass -e " + $STOP_SQL; + $CHANGE_TO_MYSQL1; + $START_SQL; + " 2>/dev/null + done + # Re-seed discovery + curl -s --max-time 10 "$ORC_URL/api/discover/mysql1/3306" > /dev/null 2>&1 + curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1 + curl -s --max-time 10 "$ORC_URL/api/discover/mysql3/3306" > /dev/null 2>&1 + sleep 5 +} + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 1: Relocate replica (mysql3 under mysql2) ---" + +# Use the relocate API to move mysql3 under mysql2 +echo "Relocating mysql3 under mysql2 via API..." 
+RESULT=$(curl -s --max-time 30 "$ORC_URL/api/relocate/mysql3/3306/mysql2/3306" 2>/dev/null) +CODE=$(echo "$RESULT" | python3 -c "import json,sys; print(json.load(sys.stdin).get('Code',''))" 2>/dev/null || echo "") + +if [ "$CODE" = "OK" ]; then + pass "Relocate API returned OK" +else + fail "Relocate API returned: $CODE" "$(echo "$RESULT" | head -c 200)" +fi + +# Verify mysql3 now replicates from mysql2 +if wait_for_source mysql3 mysql2 15; then + pass "mysql3 now replicates from mysql2 (verified via API)" +else + fail "mysql3 did not move under mysql2 within 15s" +fi + +# Cross-reference with direct MySQL +ACTUAL_SOURCE=$(mysql_source_host mysql3) +if echo "$ACTUAL_SOURCE" | grep -qi "mysql2"; then + pass "Direct MySQL confirms mysql3 replicates from mysql2" +else + fail "Direct MySQL: mysql3 source is '$ACTUAL_SOURCE' (expected mysql2)" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 2: Move-up (mysql3 back under mysql1) ---" + +# Use move-up to move mysql3 from under mysql2 back to mysql1 +echo "Moving mysql3 up via API..." 
+RESULT=$(curl -s --max-time 30 "$ORC_URL/api/move-up/mysql3/3306" 2>/dev/null) +CODE=$(echo "$RESULT" | python3 -c "import json,sys; print(json.load(sys.stdin).get('Code',''))" 2>/dev/null || echo "") + +if [ "$CODE" = "OK" ]; then + pass "Move-up API returned OK" +else + fail "Move-up API returned: $CODE" "$(echo "$RESULT" | head -c 200)" +fi + +# Verify mysql3 now replicates from mysql1 +if wait_for_source mysql3 mysql1 15; then + pass "mysql3 now replicates from mysql1 after move-up" +else + fail "mysql3 did not move under mysql1 within 15s" +fi + +ACTUAL_SOURCE=$(mysql_source_host mysql3) +if echo "$ACTUAL_SOURCE" | grep -qi "mysql1"; then + pass "Direct MySQL confirms mysql3 replicates from mysql1 after move-up" +else + fail "Direct MySQL: mysql3 source is '$ACTUAL_SOURCE' (expected mysql1)" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 3: Repoint replica ---" + +# Move mysql3 under mysql2 using repoint (GTID-based) +echo "Repointing mysql3 to mysql2 via API..." 
+RESULT=$(curl -s --max-time 30 "$ORC_URL/api/repoint/mysql3/3306/mysql2/3306" 2>/dev/null) +CODE=$(echo "$RESULT" | python3 -c "import json,sys; print(json.load(sys.stdin).get('Code',''))" 2>/dev/null || echo "") + +if [ "$CODE" = "OK" ]; then + pass "Repoint API returned OK" +else + fail "Repoint API returned: $CODE" "$(echo "$RESULT" | head -c 200)" +fi + +if wait_for_source mysql3 mysql2 15; then + pass "mysql3 repoints to mysql2 (verified via API)" +else + fail "mysql3 did not repoint to mysql2 within 15s" +fi + +ACTUAL_SOURCE=$(mysql_source_host mysql3) +if echo "$ACTUAL_SOURCE" | grep -qi "mysql2"; then + pass "Direct MySQL confirms mysql3 replicates from mysql2 after repoint" +else + fail "Direct MySQL: mysql3 source is '$ACTUAL_SOURCE' (expected mysql2)" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Test 4: Set read-only via API ---" + +# Verify we can toggle read_only via the API +echo "Setting mysql2 writeable via API..." +RESULT=$(curl -s --max-time 10 "$ORC_URL/api/set-writeable/mysql2/3306" 2>/dev/null) + +sleep 2 +ACTUAL_RO=$($COMPOSE exec -T mysql2 mysql -uroot -ptestpass -Nse "SELECT @@read_only" 2>/dev/null | tr -d '[:space:]') +if [ "$ACTUAL_RO" = "0" ]; then + pass "set-writeable API: mysql2 is now writable (read_only=0)" +else + fail "set-writeable API: mysql2 read_only=$ACTUAL_RO (expected 0)" +fi + +echo "Setting mysql2 read-only via API..." 
+RESULT=$(curl -s --max-time 10 "$ORC_URL/api/set-read-only/mysql2/3306" 2>/dev/null) + +sleep 2 +ACTUAL_RO=$($COMPOSE exec -T mysql2 mysql -uroot -ptestpass -Nse "SELECT @@read_only" 2>/dev/null | tr -d '[:space:]') +if [ "$ACTUAL_RO" = "1" ]; then + pass "set-read-only API: mysql2 is now read-only (read_only=1)" +else + fail "set-read-only API: mysql2 read_only=$ACTUAL_RO (expected 1)" +fi + +# ---------------------------------------------------------------- +echo "" +echo "--- Cleanup: Restore flat topology ---" +restore_flat_topology + +# Verify clean state +SOURCE2=$(mysql_source_host mysql2) +SOURCE3=$(mysql_source_host mysql3) +if echo "$SOURCE2" | grep -qi "mysql1" && echo "$SOURCE3" | grep -qi "mysql1"; then + pass "Flat topology restored (mysql2 and mysql3 under mysql1)" +else + fail "Topology not fully restored: mysql2->$SOURCE2, mysql3->$SOURCE3" +fi + +summary From cbda41b944ba85e482fb605e3c1647c2d95df9f2 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 09:44:14 +0000 Subject: [PATCH 2/9] Clean up extra_db on mysql2 in named channels test The test created extra_db on mysql2 but only cleaned up the channel on mysql3. Add DROP DATABASE to prevent residual state. 
--- tests/functional/test-named-channels.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/functional/test-named-channels.sh b/tests/functional/test-named-channels.sh index f57adacb..430f73d8 100755 --- a/tests/functional/test-named-channels.sh +++ b/tests/functional/test-named-channels.sh @@ -108,4 +108,7 @@ else fail "Named channel 'extra' still present after cleanup" fi +# Clean up test database on mysql2 +$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "DROP DATABASE IF EXISTS extra_db;" 2>/dev/null + summary From a8afb3dcc68d1830b17a063174bd4193f136e597 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 10:12:03 +0000 Subject: [PATCH 3/9] Fix mysql_source_host: use tabular SHOW STATUS instead of \G MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The \G (vertical format) flag is unreliable when passed through docker exec pipes — it returns empty output on MySQL 8.4 and 9.x. Use tabular SHOW REPLICA STATUS instead; column 2 is always the source host. 
--- tests/functional/lib.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/functional/lib.sh b/tests/functional/lib.sh index 3968887e..003024dc 100755 --- a/tests/functional/lib.sh +++ b/tests/functional/lib.sh @@ -174,14 +174,16 @@ mysql_read_only() { mysql -uroot -ptestpass -Nse "SELECT @@read_only" 2>/dev/null } -# Get MySQL replication source +# Get MySQL replication source host +# Uses tabular SHOW STATUS (column 2 = host) instead of \G which is +# unreliable through docker exec pipes mysql_source_host() { local CONTAINER="$1" if mysql_is_57; then docker compose -f tests/functional/docker-compose.yml exec -T "$CONTAINER" \ - mysql -uroot -ptestpass -Nse "SHOW SLAVE STATUS\G" 2>/dev/null | grep "Master_Host" | awk '{print $2}' + mysql -uroot -ptestpass -Nse "SHOW SLAVE STATUS" 2>/dev/null | awk '{print $2; exit}' else docker compose -f tests/functional/docker-compose.yml exec -T "$CONTAINER" \ - mysql -uroot -ptestpass -Nse "SHOW REPLICA STATUS\G" 2>/dev/null | grep "Source_Host" | awk '{print $2}' + mysql -uroot -ptestpass -Nse "SHOW REPLICA STATUS" 2>/dev/null | awk '{print $2; exit}' fi } From 62b814e9d3607f220ec95931dbc468cabd3579dc Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 10:20:56 +0000 Subject: [PATCH 4/9] Fix mysql_source_host: use tab-delimited awk field split awk splits on whitespace by default, picking up "for" from "Waiting for source..." (column 1 = Replica_IO_State). Use -F'\t' to split on tabs so column 2 = Source_Host. 
--- tests/functional/lib.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/functional/lib.sh b/tests/functional/lib.sh index 003024dc..600f8752 100755 --- a/tests/functional/lib.sh +++ b/tests/functional/lib.sh @@ -175,15 +175,14 @@ mysql_read_only() { } # Get MySQL replication source host -# Uses tabular SHOW STATUS (column 2 = host) instead of \G which is -# unreliable through docker exec pipes +# Uses tab-separated SHOW STATUS — Source_Host is column 2 mysql_source_host() { local CONTAINER="$1" if mysql_is_57; then docker compose -f tests/functional/docker-compose.yml exec -T "$CONTAINER" \ - mysql -uroot -ptestpass -Nse "SHOW SLAVE STATUS" 2>/dev/null | awk '{print $2; exit}' + mysql -uroot -ptestpass -Nse "SHOW SLAVE STATUS" 2>/dev/null | awk -F'\t' '{print $2; exit}' else docker compose -f tests/functional/docker-compose.yml exec -T "$CONTAINER" \ - mysql -uroot -ptestpass -Nse "SHOW REPLICA STATUS" 2>/dev/null | awk '{print $2; exit}' + mysql -uroot -ptestpass -Nse "SHOW REPLICA STATUS" 2>/dev/null | awk -F'\t' '{print $2; exit}' fi } From 1d88f3f0bfd36e1de929cbde57ebd341b760ce7d Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 11:07:19 +0000 Subject: [PATCH 5/9] Fix problem detection tests: force re-discovery after state changes Orchestrator caches instance state and only refreshes on its poll interval. After STOP REPLICA / SET GLOBAL read_only / injecting errant GTIDs, the tests need to force a re-discover call so orchestrator updates its cached data before the test assertions check the API. 
--- tests/functional/test-problem-detection.sh | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/functional/test-problem-detection.sh b/tests/functional/test-problem-detection.sh index 0adfe263..e37bdc6e 100755 --- a/tests/functional/test-problem-detection.sh +++ b/tests/functional/test-problem-detection.sh @@ -22,6 +22,10 @@ echo "--- Test 1: Detect stopped replication ---" echo "Stopping replication on mysql2..." $COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "$STOP_SQL" 2>/dev/null +# Force re-discovery so orchestrator refreshes instance state immediately +curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1 +sleep 2 + # Wait for orchestrator to detect the problem (poll up to 30s) echo "Waiting for orchestrator to detect stopped replication..." DETECTED=false @@ -49,6 +53,10 @@ else fail "Orchestrator did not detect stopped replication on mysql2 within 30s" fi +# Force another re-discovery to ensure instance data is fresh +curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1 +sleep 2 + # Verify the specific problem shows replication threads stopped REPL_STATE=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql2/3306" 2>/dev/null | python3 -c " import json, sys @@ -72,6 +80,10 @@ echo "--- Test 1b: Clear stopped replication ---" echo "Restarting replication on mysql2..." $COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "$START_SQL" 2>/dev/null +# Force re-discovery so orchestrator refreshes instance state +curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1 +sleep 2 + # Wait for orchestrator to see it clear echo "Waiting for problem to clear..." CLEARED=false @@ -108,6 +120,10 @@ echo "--- Test 2: Detect read_only mismatch (writable replica) ---" echo "Setting mysql2 read_only=0 (simulating writable replica)..." 
$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "SET GLOBAL read_only=0" 2>/dev/null +# Force re-discovery so orchestrator refreshes instance state +curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1 +sleep 2 + # Wait for orchestrator to detect the problem echo "Waiting for orchestrator to detect writable replica..." DETECTED=false @@ -132,6 +148,10 @@ fi echo "Restoring read_only=1 on mysql2..." $COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "SET GLOBAL read_only=1" 2>/dev/null +# Force re-discovery +curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1 +sleep 2 + # Wait for it to clear CLEARED=false for i in $(seq 1 15); do @@ -163,6 +183,10 @@ $COMPOSE exec -T mysql3 mysql -uroot -ptestpass -e " SET GLOBAL super_read_only=1; " 2>/dev/null +# Force re-discovery so orchestrator picks up the errant GTID +curl -s --max-time 10 "$ORC_URL/api/discover/mysql3/3306" > /dev/null 2>&1 +sleep 2 + # Wait for orchestrator to detect errant GTID echo "Waiting for orchestrator to detect errant GTID..." 
DETECTED=false From 13ecfe5c5ae33a28a8d890fb506d7df87cdb465d Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 11:09:26 +0000 Subject: [PATCH 6/9] Fix named channels test: verify setup data, retry data replication - Verify data exists on mysql2 before setting up the extra channel - Replace single sleep+check with a 15s retry loop for data replication - Add better error messages for debugging if setup or replication fails --- tests/functional/test-named-channels.sh | 28 ++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/tests/functional/test-named-channels.sh b/tests/functional/test-named-channels.sh index 430f73d8..a59cc841 100755 --- a/tests/functional/test-named-channels.sh +++ b/tests/functional/test-named-channels.sh @@ -23,12 +23,22 @@ echo "" echo "--- Setup: Configure multi-source replication on mysql3 ---" # Create test database and table on mysql2 for the extra channel +# Note: mysql2 is a replica with read_only=ON, but root has SUPER privilege $COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e " CREATE DATABASE IF NOT EXISTS extra_db; CREATE TABLE IF NOT EXISTS extra_db.test (id INT PRIMARY KEY AUTO_INCREMENT, val VARCHAR(100)); INSERT INTO extra_db.test (val) VALUES ('channel-test'); " 2>/dev/null +# Verify data exists on mysql2 before setting up channel +DATA_ON_M2=$($COMPOSE exec -T mysql2 mysql -uroot -ptestpass -Nse \ + "SELECT val FROM extra_db.test LIMIT 1" 2>/dev/null | tr -d '[:space:]') +if [ "$DATA_ON_M2" = "channel-test" ]; then + echo " Verified data on mysql2: $DATA_ON_M2" +else + fail "Setup: test data not created on mysql2 (got: '$DATA_ON_M2')" "Check if writes are allowed on replica" +fi + # Add a named channel 'extra' on mysql3 replicating from mysql2 CHANGE_SQL=$(mysql_change_source_channel_sql mysql2 3306 repl repl_pass extra) START_SQL=$(mysql_start_replica_sql) @@ -49,15 +59,23 @@ else fail "Named channel 'extra' not running on mysql3 (status: $CHANNEL_STATUS)" fi -# Verify data 
replicated through the extra channel -sleep 2 -REPLICATED=$($COMPOSE exec -T mysql3 mysql -uroot -ptestpass -Nse \ - "SELECT val FROM extra_db.test LIMIT 1" 2>/dev/null | tr -d '[:space:]') +# Verify data replicated through the extra channel (poll up to 15s) +echo "Waiting for data to replicate through named channel..." +REPLICATED="" +for i in $(seq 1 15); do + REPLICATED=$($COMPOSE exec -T mysql3 mysql -uroot -ptestpass -Nse \ + "SELECT val FROM extra_db.test LIMIT 1" 2>/dev/null | tr -d '[:space:]') + if [ "$REPLICATED" = "channel-test" ]; then + break + fi + sleep 1 +done if [ "$REPLICATED" = "channel-test" ]; then pass "Data replicated through named channel 'extra'" else - fail "Data not replicated through named channel (got: $REPLICATED)" + fail "Data not replicated through named channel (got: '$REPLICATED')" \ + "Channel ON but data missing - check GTID sets on mysql2 vs mysql3" fi # ---------------------------------------------------------------- From 5766a6c1646116b43cd2bacb026fe81cf7811b62 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 12:10:50 +0000 Subject: [PATCH 7/9] Fix problem detection test 1b: check SQL thread state not /api/problems After STOP+START REPLICA, mysql2 may still appear in /api/problems due to other issues (replication lag, errant GTID from stop/start cycle). Instead of checking /api/problems for any mysql2 entry, verify that the SQL thread is running again via the instance API. Also force re-discovery on each poll iteration to ensure fresh state. 
--- tests/functional/test-problem-detection.sh | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/functional/test-problem-detection.sh b/tests/functional/test-problem-detection.sh index e37bdc6e..cd75a7b6 100755 --- a/tests/functional/test-problem-detection.sh +++ b/tests/functional/test-problem-detection.sh @@ -84,23 +84,23 @@ $COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "$START_SQL" 2>/dev/null curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1 sleep 2 -# Wait for orchestrator to see it clear -echo "Waiting for problem to clear..." +# Wait for orchestrator to see replication running again +echo "Waiting for replication to recover..." CLEARED=false for i in $(seq 1 30); do - PROBLEMS=$(curl -s --max-time 10 "$ORC_URL/api/problems" 2>/dev/null) - HAS_MYSQL2=$(echo "$PROBLEMS" | python3 -c " + # Force re-discovery each iteration + curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1 + REPL_STATE=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql2/3306" 2>/dev/null | python3 -c " import json, sys -problems = json.load(sys.stdin) -for p in problems: - h = p.get('Key', {}).get('Hostname', '') - if 'mysql2' in h: - sys.exit(0) -sys.exit(1) -" 2>/dev/null && echo "yes" || echo "no") - if [ "$HAS_MYSQL2" = "no" ]; then +inst = json.load(sys.stdin) +sql = inst.get('ReplicationSQLThreadRuning', False) +io = inst.get('ReplicationIOThreadRuning', False) +print(f'{sql}:{io}') +" 2>/dev/null || echo "False:False") + SQL_RUNNING=$(echo "$REPL_STATE" | cut -d: -f1) + if [ "$SQL_RUNNING" = "True" ]; then CLEARED=true - echo "Problem cleared after ${i}s" + echo "Replication recovered after ${i}s (SQL=True, IO=$IO)" break fi sleep 1 @@ -109,7 +109,7 @@ done if [ "$CLEARED" = "true" ]; then pass "Stopped replication problem cleared after restart" else - fail "Stopped replication problem still reported after 30s" + fail "Replication SQL thread not running on mysql2 after 30s (state: 
$REPL_STATE)" fi # ---------------------------------------------------------------- From 47988d9528559cebb88e49e3324cb5d04a4bc00d Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 12:18:59 +0000 Subject: [PATCH 8/9] Fix unbound variable: extract IO_RUNNING from REPL_STATE --- tests/functional/test-problem-detection.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/functional/test-problem-detection.sh b/tests/functional/test-problem-detection.sh index cd75a7b6..d54c9d14 100755 --- a/tests/functional/test-problem-detection.sh +++ b/tests/functional/test-problem-detection.sh @@ -98,9 +98,10 @@ io = inst.get('ReplicationIOThreadRuning', False) print(f'{sql}:{io}') " 2>/dev/null || echo "False:False") SQL_RUNNING=$(echo "$REPL_STATE" | cut -d: -f1) + IO_RUNNING=$(echo "$REPL_STATE" | cut -d: -f2) if [ "$SQL_RUNNING" = "True" ]; then CLEARED=true - echo "Replication recovered after ${i}s (SQL=True, IO=$IO)" + echo "Replication recovered after ${i}s (SQL=True, IO=${IO_RUNNING})" break fi sleep 1 From cd02379d3c73da295fd13add98a89bf921ace5f4 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 9 Apr 2026 12:32:04 +0000 Subject: [PATCH 9/9] Fix topology operations: wait for replication lag to clear before move-up --- tests/functional/test-topology-operations.sh | 23 ++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tests/functional/test-topology-operations.sh b/tests/functional/test-topology-operations.sh index 8f57fdbb..0846e009 100755 --- a/tests/functional/test-topology-operations.sh +++ b/tests/functional/test-topology-operations.sh @@ -83,10 +83,29 @@ fi echo "" echo "--- Test 2: Move-up (mysql3 back under mysql1) ---" +# Wait for replication to catch up after relocate before attempting move-up +echo "Waiting for replication to catch up after relocate..." 
+sleep 5 +# Re-seed discovery so orchestrator sees fresh lag values +curl -s --max-time 10 "$ORC_URL/api/discover/mysql3/3306" > /dev/null 2>&1 +curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1 +sleep 5 + # Use move-up to move mysql3 from under mysql2 back to mysql1 +# Retry up to 3 times to handle transient "lags too much" errors echo "Moving mysql3 up via API..." -RESULT=$(curl -s --max-time 30 "$ORC_URL/api/move-up/mysql3/3306" 2>/dev/null) -CODE=$(echo "$RESULT" | python3 -c "import json,sys; print(json.load(sys.stdin).get('Code',''))" 2>/dev/null || echo "") +CODE="" +RESULT="" +for attempt in 1 2 3; do + RESULT=$(curl -s --max-time 30 "$ORC_URL/api/move-up/mysql3/3306" 2>/dev/null) + CODE=$(echo "$RESULT" | python3 -c "import json,sys; print(json.load(sys.stdin).get('Code',''))" 2>/dev/null || echo "") + if [ "$CODE" = "OK" ]; then + break + fi + echo " Move-up attempt $attempt: $CODE — waiting for lag to clear..." + sleep 5 + curl -s --max-time 10 "$ORC_URL/api/discover/mysql3/3306" > /dev/null 2>&1 +done if [ "$CODE" = "OK" ]; then pass "Move-up API returned OK"