Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/functional.yml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,15 @@ jobs:
- name: Run smoke tests
run: bash tests/functional/test-smoke.sh

- name: Run topology discovery validation
run: bash tests/functional/test-topology-discovery.sh

- name: Run problem detection tests
run: bash tests/functional/test-problem-detection.sh

- name: Run topology operations tests
run: bash tests/functional/test-topology-operations.sh

- name: Run regression tests
run: bash tests/functional/test-regression.sh

Expand Down
7 changes: 4 additions & 3 deletions tests/functional/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,15 @@ mysql_read_only() {
mysql -uroot -ptestpass -Nse "SELECT @@read_only" 2>/dev/null
}

# Get MySQL replication source
# Get MySQL replication source host
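# Uses the tab-separated output of SHOW SLAVE STATUS / SHOW REPLICA STATUS (mysql -N -s, without \G) — the source host (Master_Host / Source_Host) is column 2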
mysql_source_host() {
local CONTAINER="$1"
if mysql_is_57; then
docker compose -f tests/functional/docker-compose.yml exec -T "$CONTAINER" \
mysql -uroot -ptestpass -Nse "SHOW SLAVE STATUS\G" 2>/dev/null | grep "Master_Host" | awk '{print $2}'
mysql -uroot -ptestpass -Nse "SHOW SLAVE STATUS" 2>/dev/null | awk -F'\t' '{print $2; exit}'
else
docker compose -f tests/functional/docker-compose.yml exec -T "$CONTAINER" \
mysql -uroot -ptestpass -Nse "SHOW REPLICA STATUS\G" 2>/dev/null | grep "Source_Host" | awk '{print $2}'
mysql -uroot -ptestpass -Nse "SHOW REPLICA STATUS" 2>/dev/null | awk -F'\t' '{print $2; exit}'
fi
}
31 changes: 26 additions & 5 deletions tests/functional/test-named-channels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,22 @@ echo ""
echo "--- Setup: Configure multi-source replication on mysql3 ---"

# Create test database and table on mysql2 for the extra channel
# Note: mysql2 is a replica with read_only=ON, but root has SUPER privilege, so these writes are still accepted (provided super_read_only is OFF)
$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "
CREATE DATABASE IF NOT EXISTS extra_db;
CREATE TABLE IF NOT EXISTS extra_db.test (id INT PRIMARY KEY AUTO_INCREMENT, val VARCHAR(100));
INSERT INTO extra_db.test (val) VALUES ('channel-test');
" 2>/dev/null

# Verify data exists on mysql2 before setting up channel
DATA_ON_M2=$($COMPOSE exec -T mysql2 mysql -uroot -ptestpass -Nse \
"SELECT val FROM extra_db.test LIMIT 1" 2>/dev/null | tr -d '[:space:]')
if [ "$DATA_ON_M2" = "channel-test" ]; then
echo " Verified data on mysql2: $DATA_ON_M2"
else
fail "Setup: test data not created on mysql2 (got: '$DATA_ON_M2')" "Check if writes are allowed on replica"
fi

# Add a named channel 'extra' on mysql3 replicating from mysql2
CHANGE_SQL=$(mysql_change_source_channel_sql mysql2 3306 repl repl_pass extra)
START_SQL=$(mysql_start_replica_sql)
Expand All @@ -49,15 +59,23 @@ else
fail "Named channel 'extra' not running on mysql3 (status: $CHANNEL_STATUS)"
fi

# Verify data replicated through the extra channel
sleep 2
REPLICATED=$($COMPOSE exec -T mysql3 mysql -uroot -ptestpass -Nse \
"SELECT val FROM extra_db.test LIMIT 1" 2>/dev/null | tr -d '[:space:]')
# Verify data replicated through the extra channel (poll up to 15s)
echo "Waiting for data to replicate through named channel..."
REPLICATED=""
for i in $(seq 1 15); do
REPLICATED=$($COMPOSE exec -T mysql3 mysql -uroot -ptestpass -Nse \
"SELECT val FROM extra_db.test LIMIT 1" 2>/dev/null | tr -d '[:space:]')
if [ "$REPLICATED" = "channel-test" ]; then
break
fi
sleep 1
done

if [ "$REPLICATED" = "channel-test" ]; then
pass "Data replicated through named channel 'extra'"
else
fail "Data not replicated through named channel (got: $REPLICATED)"
fail "Data not replicated through named channel (got: '$REPLICATED')" \
"Channel ON but data missing - check GTID sets on mysql2 vs mysql3"
fi

# ----------------------------------------------------------------
Expand Down Expand Up @@ -108,4 +126,7 @@ else
fail "Named channel 'extra' still present after cleanup"
fi

# Clean up test database on mysql2
$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "DROP DATABASE IF EXISTS extra_db;" 2>/dev/null

Comment on lines +129 to +131
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Handle cleanup failures explicitly to avoid cross-test contamination.

Line 130 suppresses stderr and does not check command success. If DROP DATABASE fails, stale state can leak into later functional tests in the same environment.

Suggested fix
 # Clean up test database on mysql2
-$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "DROP DATABASE IF EXISTS extra_db;" 2>/dev/null
+if ! $COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "DROP DATABASE IF EXISTS extra_db;" 2>/dev/null; then
+    fail "Cleanup: failed to drop extra_db on mysql2"
+else
+    pass "Cleanup: dropped extra_db on mysql2"
+fi
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Clean up test database on mysql2
$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "DROP DATABASE IF EXISTS extra_db;" 2>/dev/null
# Clean up test database on mysql2
if ! $COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "DROP DATABASE IF EXISTS extra_db;" 2>/dev/null; then
echo "Error: Failed to drop extra_db on mysql2"
exit 1
fi
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/functional/test-named-channels.sh` around lines 129 - 131, The DROP
DATABASE cleanup currently suppresses stderr and ignores failures for the
command `$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "DROP DATABASE IF
EXISTS extra_db;"`; change this so failures are detected and cause the test run
to fail: remove the `2>/dev/null` suppression, capture and check the command
exit status (or enable `set -e`/`set -o errexit` for the script), and on
non-zero exit print a clear error message and exit non-zero so stale state
cannot leak into later tests.

summary
222 changes: 222 additions & 0 deletions tests/functional/test-problem-detection.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
#!/bin/bash
# Problem detection tests — verify orchestrator detects and clears
# replication problems correctly
set -uo pipefail # no -e: we handle failures ourselves
cd "$(dirname "$0")/../.."
source tests/functional/lib.sh
Comment on lines +5 to +6
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Fail fast when bootstrap steps fail.

If cd or source fails, the script still continues (no set -e), which can cascade into misleading failures later.

Suggested fix
-cd "$(dirname "$0")/../.."
-source tests/functional/lib.sh
+cd "$(dirname "$0")/../.." || { echo "FATAL: unable to cd to repository root"; exit 1; }
+source tests/functional/lib.sh || { echo "FATAL: unable to load tests/functional/lib.sh"; exit 1; }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
cd "$(dirname "$0")/../.."
source tests/functional/lib.sh
cd "$(dirname "$0")/../.." || { echo "FATAL: unable to cd to repository root"; exit 1; }
source tests/functional/lib.sh || { echo "FATAL: unable to load tests/functional/lib.sh"; exit 1; }
🧰 Tools
🪛 Shellcheck (0.11.0)

[warning] 5-5: Use 'cd ... || exit' or 'cd ... || return' in case cd fails.

(SC2164)


[info] 6-6: Not following: tests/functional/lib.sh was not specified as input (see shellcheck -x).

(SC1091)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/functional/test-problem-detection.sh` around lines 5 - 6, The script
should fail fast if bootstrap steps fail: add a strict shell flag (e.g., set
-euo pipefail or at minimum set -e) at the top of the script (immediately after
the shebang) so that failures in the cd "$(dirname "$0")/../.." or source
tests/functional/lib.sh commands abort the run; ensure the flags are applied
before those two lines so any failure in cd or source stops the script.


echo "=== PROBLEM DETECTION TESTS ==="

COMPOSE="docker compose -f tests/functional/docker-compose.yml"
STOP_SQL=$(mysql_stop_replica_sql)
START_SQL=$(mysql_start_replica_sql)

wait_for_orchestrator || { echo "FATAL: Orchestrator not reachable"; exit 1; }
discover_topology "mysql1"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Treat topology discovery as a hard precondition.

Line 15 ignores discover_topology failure, so tests can run against partial/undiscovered state and produce noisy results.

Suggested fix
-discover_topology "mysql1"
+discover_topology "mysql1" || { echo "FATAL: initial topology discovery failed"; exit 1; }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
discover_topology "mysql1"
discover_topology "mysql1" || { echo "FATAL: initial topology discovery failed"; exit 1; }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/functional/test-problem-detection.sh` at line 15, discover_topology is
being treated as optional but must be a hard precondition; update the test to
fail immediately when discover_topology "mysql1" returns non-zero by checking
its exit status and exiting with a non-zero code and an explanatory stderr
message (e.g., run discover_topology "mysql1" and if it fails print "Topology
discovery failed for mysql1" to stderr and exit 1), or alternatively enable
strict shell error handling at the top of the script (set -euo pipefail) to
enforce failure; refer to the discover_topology invocation to locate where to
add the check.


# ----------------------------------------------------------------
echo ""
echo "--- Test 1: Detect stopped replication ---"

# Stop replication on mysql2
echo "Stopping replication on mysql2..."
$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "$STOP_SQL" 2>/dev/null
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Check exit codes for state-changing MySQL commands.

These commands mutate DB state but suppress stderr and do not verify success. If any command fails, subsequent assertions may incorrectly blame orchestrator behavior.

Suggested pattern
+# helper
+run_mysql() {
+    local host="$1"
+    local sql="$2"
+    if ! $COMPOSE exec -T "$host" mysql -uroot -ptestpass -e "$sql" >/dev/null; then
+        fail "MySQL command failed on ${host}" "$sql"
+        return 1
+    fi
+    return 0
+}
-$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "$STOP_SQL" 2>/dev/null
+run_mysql "mysql2" "$STOP_SQL" || exit 1

Apply the same pattern to the other mutation calls.

Also applies to: 81-81, 122-122, 150-150, 179-185, 216-220

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/functional/test-problem-detection.sh` at line 23, The mysql
state-changing exec lines (e.g. the call using $COMPOSE exec -T mysql2 mysql
-uroot -ptestpass -e "$STOP_SQL") suppress stderr and ignore exit codes; update
each such mutation call (including analogous START/INSERT/UPDATE/DELETE
commands) to capture stdout/stderr and check the command's exit status
immediately, and if non-zero print a descriptive error including the captured
stderr and exit non-zero to fail the test fast. Locate the commands by the
pattern "$COMPOSE exec -T mysql* mysql -uroot -ptestpass -e \"...\"" and modify
them to redirect/collect stderr (or assign output to a variable), test $? (or
the command result), and abort with an explanatory message when the command
fails.


# Force re-discovery so orchestrator refreshes instance state immediately
curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1
sleep 2

# Wait for orchestrator to detect the problem (poll up to 30s)
echo "Waiting for orchestrator to detect stopped replication..."
DETECTED=false
for i in $(seq 1 30); do
PROBLEMS=$(curl -s --max-time 10 "$ORC_URL/api/problems" 2>/dev/null)
if echo "$PROBLEMS" | python3 -c "
import json, sys
problems = json.load(sys.stdin)
for p in problems:
h = p.get('Key', {}).get('Hostname', '')
if 'mysql2' in h:
sys.exit(0)
sys.exit(1)
" 2>/dev/null; then
DETECTED=true
echo "Problem detected after ${i}s"
break
fi
sleep 1
done

if [ "$DETECTED" = "true" ]; then
pass "Orchestrator detected stopped replication on mysql2"
else
fail "Orchestrator did not detect stopped replication on mysql2 within 30s"
fi

# Force another re-discovery to ensure instance data is fresh
curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1
sleep 2

# Verify the specific problem shows replication threads stopped
REPL_STATE=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql2/3306" 2>/dev/null | python3 -c "
import json, sys
inst = json.load(sys.stdin)
sql = inst.get('ReplicationSQLThreadRuning', 'unknown')
io = inst.get('ReplicationIOThreadRuning', 'unknown')
print(f'SQL={sql},IO={io}')
" 2>/dev/null || echo "unknown")

if echo "$REPL_STATE" | grep -q "SQL=False"; then
pass "Orchestrator reports SQL thread stopped on mysql2"
else
fail "Orchestrator replication state for mysql2: $REPL_STATE"
fi

# ----------------------------------------------------------------
echo ""
echo "--- Test 1b: Clear stopped replication ---"

# Restart replication
echo "Restarting replication on mysql2..."
$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "$START_SQL" 2>/dev/null

# Force re-discovery so orchestrator refreshes instance state
curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1
sleep 2

# Wait for orchestrator to see replication running again
echo "Waiting for replication to recover..."
CLEARED=false
for i in $(seq 1 30); do
# Force re-discovery each iteration
curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1
REPL_STATE=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql2/3306" 2>/dev/null | python3 -c "
import json, sys
inst = json.load(sys.stdin)
sql = inst.get('ReplicationSQLThreadRuning', False)
io = inst.get('ReplicationIOThreadRuning', False)
print(f'{sql}:{io}')
" 2>/dev/null || echo "False:False")
SQL_RUNNING=$(echo "$REPL_STATE" | cut -d: -f1)
IO_RUNNING=$(echo "$REPL_STATE" | cut -d: -f2)
if [ "$SQL_RUNNING" = "True" ]; then
CLEARED=true
echo "Replication recovered after ${i}s (SQL=True, IO=${IO_RUNNING})"
break
Comment on lines +100 to +105
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Require both SQL and IO threads for “cleared” state.

Current clear condition only checks SQL=True; replication can still be unhealthy with IO=False, causing a false pass.

Suggested fix
-    if [ "$SQL_RUNNING" = "True" ]; then
+    if [ "$SQL_RUNNING" = "True" ] && [ "$IO_RUNNING" = "True" ]; then
         CLEARED=true
-        echo "Replication recovered after ${i}s (SQL=True, IO=${IO_RUNNING})"
+        echo "Replication recovered after ${i}s (SQL=True, IO=True)"
         break
     fi
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
SQL_RUNNING=$(echo "$REPL_STATE" | cut -d: -f1)
IO_RUNNING=$(echo "$REPL_STATE" | cut -d: -f2)
if [ "$SQL_RUNNING" = "True" ]; then
CLEARED=true
echo "Replication recovered after ${i}s (SQL=True, IO=${IO_RUNNING})"
break
SQL_RUNNING=$(echo "$REPL_STATE" | cut -d: -f1)
IO_RUNNING=$(echo "$REPL_STATE" | cut -d: -f2)
if [ "$SQL_RUNNING" = "True" ] && [ "$IO_RUNNING" = "True" ]; then
CLEARED=true
echo "Replication recovered after ${i}s (SQL=True, IO=True)"
break
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/functional/test-problem-detection.sh` around lines 100 - 105, The
current check marks CLEARED when only SQL_RUNNING is "True", which can pass
while IO_RUNNING is "False"; update the conditional that sets CLEARED (the block
using SQL_RUNNING, IO_RUNNING, CLEARED and REPL_STATE) to require both
SQL_RUNNING = "True" and IO_RUNNING = "True" before setting CLEARED and breaking
the loop, and adjust the echo message to reflect both thread states (e.g.,
"SQL=True, IO=True") so the test only passes when both threads are healthy.

fi
sleep 1
done

if [ "$CLEARED" = "true" ]; then
pass "Stopped replication problem cleared after restart"
else
fail "Replication SQL thread not running on mysql2 after 30s (state: $REPL_STATE)"
fi

# ----------------------------------------------------------------
echo ""
echo "--- Test 2: Detect read_only mismatch (writable replica) ---"

# Make mysql2 writable (it should be read-only as a replica)
echo "Setting mysql2 read_only=0 (simulating writable replica)..."
$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "SET GLOBAL read_only=0" 2>/dev/null

# Force re-discovery so orchestrator refreshes instance state
curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1
sleep 2

# Wait for orchestrator to detect the problem
echo "Waiting for orchestrator to detect writable replica..."
DETECTED=false
for i in $(seq 1 30); do
INST=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql2/3306" 2>/dev/null)
IS_RO=$(echo "$INST" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ReadOnly', True))" 2>/dev/null || echo "True")
if [ "$IS_RO" = "False" ]; then
DETECTED=true
echo "Writable replica detected after ${i}s"
break
fi
sleep 1
done

if [ "$DETECTED" = "true" ]; then
pass "Orchestrator detected mysql2 is writable (read_only=false while replicating)"
else
fail "Orchestrator did not detect writable replica within 30s"
fi

# Restore read_only
echo "Restoring read_only=1 on mysql2..."
$COMPOSE exec -T mysql2 mysql -uroot -ptestpass -e "SET GLOBAL read_only=1" 2>/dev/null

# Force re-discovery
curl -s --max-time 10 "$ORC_URL/api/discover/mysql2/3306" > /dev/null 2>&1
sleep 2

# Wait for it to clear
CLEARED=false
for i in $(seq 1 15); do
IS_RO=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql2/3306" 2>/dev/null | python3 -c "import json,sys; print(json.load(sys.stdin).get('ReadOnly', False))" 2>/dev/null || echo "False")
if [ "$IS_RO" = "True" ]; then
CLEARED=true
break
fi
sleep 1
done

if [ "$CLEARED" = "true" ]; then
pass "Writable replica problem cleared after restoring read_only"
else
fail "read_only still reported as false after 15s"
fi

# ----------------------------------------------------------------
echo ""
echo "--- Test 3: Detect errant GTID ---"

# Inject an errant transaction on mysql3
echo "Injecting errant transaction on mysql3..."
$COMPOSE exec -T mysql3 mysql -uroot -ptestpass -e "
SET GLOBAL read_only=0;
SET GLOBAL super_read_only=0;
CREATE DATABASE IF NOT EXISTS errant_detect_test;
SET GLOBAL read_only=1;
SET GLOBAL super_read_only=1;
" 2>/dev/null

# Force re-discovery so orchestrator picks up the errant GTID
curl -s --max-time 10 "$ORC_URL/api/discover/mysql3/3306" > /dev/null 2>&1
sleep 2

# Wait for orchestrator to detect errant GTID
echo "Waiting for orchestrator to detect errant GTID..."
DETECTED=false
for i in $(seq 1 30); do
GTID_ERRANT=$(curl -s --max-time 10 "$ORC_URL/api/instance/mysql3/3306" 2>/dev/null | python3 -c "
import json, sys
inst = json.load(sys.stdin)
errant = inst.get('GtidErrant', '')
print(errant if errant else '')
" 2>/dev/null || echo "")
if [ -n "$GTID_ERRANT" ]; then
DETECTED=true
echo "Errant GTID detected after ${i}s: $GTID_ERRANT"
break
fi
sleep 1
done

if [ "$DETECTED" = "true" ]; then
pass "Orchestrator detected errant GTID on mysql3"
else
fail "Orchestrator did not detect errant GTID within 30s"
fi

# Cleanup errant DB (GTID remains but that's OK)
$COMPOSE exec -T mysql3 mysql -uroot -ptestpass -e "
SET GLOBAL read_only=0;
DROP DATABASE IF EXISTS errant_detect_test;
SET GLOBAL read_only=1;
" 2>/dev/null
Comment on lines +216 to +220
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

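The cleanup for the errant GTID test does not restore mysql3 to its previous state, and any failure is hidden. After the injection at lines 158-164, super_read_only is set to 1. In MySQL, SET GLOBAL read_only=0 implicitly turns super_read_only off as well, so the DROP DATABASE itself can run; however, the final SET GLOBAL read_only=1 does not turn super_read_only back on, so the replica is left writable by SUPER users. Any error along the way is also suppressed by 2>/dev/null.

The cleanup should explicitly disable super_read_only before dropping the database and then restore it afterwards, so the instance ends in the same state it started in.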

Suggested change
$COMPOSE exec -T mysql3 mysql -uroot -ptestpass -e "
SET GLOBAL read_only=0;
DROP DATABASE IF EXISTS errant_detect_test;
SET GLOBAL read_only=1;
" 2>/dev/null
$COMPOSE exec -T mysql3 mysql -uroot -ptestpass -e "
SET GLOBAL read_only=0;
SET GLOBAL super_read_only=0;
DROP DATABASE IF EXISTS errant_detect_test;
SET GLOBAL read_only=1;
SET GLOBAL super_read_only=1;
" 2>/dev/null


summary
Loading
Loading