Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions kubernetes/lib/charms/mysql/v0/mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def __init__(
MAX_CONNECTIONS_FLOOR = 10
MIM_MEM_BUFFERS = 200 * BYTES_1MiB
ADMIN_PORT = 33062
FORCE_QUORUM_TIMEOUT = 300 # seconds

# Labels are not confidential
SECRET_INTERNAL_LABEL = "secret-id" # noqa: S105
Expand Down Expand Up @@ -2370,13 +2371,16 @@ def force_quorum_from_instance(self) -> None:
instance_def = (
f"{self.cluster_admin_user}:{self.cluster_admin_password}@{self.instance_address}"
)
# TODO: modify/expose timeout in the mysql-shell-client library
address = f"{instance_def}:3306"
command = "\n".join((
f"cluster = dba.get_cluster('{self.cluster_name}')",
f"cluster.force_quorum_using_partition_of('{address}')",
))

executor = self._build_cluster_tcp_executor(self.instance_address)
Comment on lines +2374 to +2381
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sinclert-canonical, I'm testing this here, but how should we expose the timeout at this level?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment, timeouts can only be applied on a per-operation basis, in the BaseExecutor subclasses (see code). That means that timeouts can only be applied when manually executing the Python / SQL operations via an executor class.

I originally thought about exposing this argument one level up the chain, in the client classes, but it feels noisy to do on a per-operation basis (i.e. every class method would take an additional argument), and a bit overkill to do in their constructors, as not every operation should have the same timeout.

Open to ideas.

try:
self._cluster_client_tcp.force_instance_quorum_into_cluster(
cluster_name=self.cluster_name,
instance_host=instance_def,
instance_port=str(3306),
)
executor.execute_py(command, timeout=FORCE_QUORUM_TIMEOUT)
except ExecutionError as e:
raise MySQLForceQuorumFromInstanceError() from e

Expand Down
21 changes: 11 additions & 10 deletions kubernetes/tests/integration/helpers_ha.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,17 @@ def check_mysql_units_writes_increment(
app_primary = get_mysql_primary_unit(juju, app_name, app_units[0])
app_max_value = get_mysql_max_written_value(juju, app_name, app_primary)

for unit_name in app_units:
for attempt in Retrying(
reraise=True,
stop=stop_after_delay(5 * MINUTE_SECS),
wait=wait_fixed(10),
):
with attempt:
unit_max_value = get_mysql_max_written_value(juju, app_name, unit_name)
assert unit_max_value > app_max_value, "Writes not incrementing"
app_max_value = unit_max_value
with update_interval(juju, "10s"):
for unit_name in app_units:
for attempt in Retrying(
reraise=True,
stop=stop_after_delay(5 * MINUTE_SECS),
wait=wait_fixed(10),
):
with attempt:
unit_max_value = get_mysql_max_written_value(juju, app_name, unit_name)
assert unit_max_value > app_max_value, "Writes not incrementing"
app_max_value = unit_max_value


def delete_k8s_pod(juju: Juju, unit_name: str) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def test_deploy_router_and_app(first_model: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
num_units=1,
trust=False,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import time
from collections.abc import Generator
from concurrent.futures import ThreadPoolExecutor
from contextlib import suppress

import jubilant_backports
Expand All @@ -32,6 +33,7 @@
MYSQL_APP_1 = "db1"
MYSQL_APP_2 = "db2"
MYSQL_TEST_APP_NAME = "mysql-test-app"
MYSQL_ROUTER = "mysql-router-k8s"

MINUTE_SECS = 60

Expand Down Expand Up @@ -105,14 +107,23 @@ def test_build_and_deploy(first_model: str, second_model: str, charm: str) -> No
)

logging.info("Waiting for the applications to settle")
model_1.wait(
ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1),
timeout=10 * MINUTE_SECS,
)
model_2.wait(
ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_2),
timeout=10 * MINUTE_SECS,
)

def wait_model_1():
model_1.wait(
ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_1),
timeout=10 * MINUTE_SECS,
)

def wait_model_2():
model_2.wait(
ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_APP_2),
timeout=10 * MINUTE_SECS,
)

with ThreadPoolExecutor() as executor:
futures = [executor.submit(wait_model_1), executor.submit(wait_model_2)]
for future in futures:
future.result()
Comment on lines +111 to +126
Copy link
Copy Markdown
Contributor

@astrojuanlu astrojuanlu Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious, what does this accomplish? Practically speaking, the execution won't go past this point until both .wait calls return, which is what the sequential code does too.


if path := os.getenv("DATA_SOURCE_PATH"):
logging.info("Loading test database")
Expand Down Expand Up @@ -157,16 +168,29 @@ def test_deploy_test_app(first_model: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
num_units=1,
constraints=constraints,
)

model_1.deploy(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

charm=MYSQL_ROUTER,
app=MYSQL_ROUTER,
base="ubuntu@22.04",
channel="8.0/edge",
num_units=1,
trust=True,
)

logging.info("Relating the test application")
model_1.integrate(
f"{MYSQL_APP_1}:database",
f"{MYSQL_ROUTER}:database",
f"{MYSQL_TEST_APP_NAME}:database",
)
model_1.integrate(
f"{MYSQL_ROUTER}:backend-database",
f"{MYSQL_APP_1}:database",
)

model_1.wait(
ready=wait_for_apps_status(jubilant_backports.all_active, MYSQL_TEST_APP_NAME),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
load_mysql_test_data,
update_interval,
wait_for_apps_status,
wait_for_unit_message,
wait_for_unit_status,
)

Expand All @@ -45,7 +44,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down Expand Up @@ -111,34 +110,33 @@ def test_cluster_failover_after_majority_loss(juju: Juju) -> None:

non_primary_units = app_units - {primary_unit}

unit_to_promote = non_primary_units.pop()
unit_to_survive = non_primary_units.pop()
units_to_freeze = [non_primary_units.pop(), primary_unit]

logging.info(f"Unit selected for promotion: {unit_to_promote}")
logging.info(f"Unit selected for promotion: {unit_to_survive}")

logging.info("Simulate quorum loss")
units_to_kill = [non_primary_units.pop(), primary_unit]
kill_pods(juju, units_to_kill)
logging.info("Simulating quorum loss via SIGSTOP on mysqld")
freeze_mysql(juju, units_to_freeze)

with update_interval(juju, "45s"):
logging.info("Waiting to settle in error state")
logging.info("Waiting for surviving unit to detect quorum loss")
juju.wait(
ready=lambda status: all((
wait_for_unit_status(app_name, unit_to_promote, "active")(status),
wait_for_unit_message(app_name, units_to_kill[0], "OFFLINE")(status),
wait_for_unit_message(app_name, units_to_kill[1], "OFFLINE")(status),
)),
timeout=15 * MINUTE_SECS,
delay=15,
ready=lambda status: wait_for_unit_status(app_name, unit_to_survive, "active")(status),
timeout=5 * MINUTE_SECS,
delay=10,
)

logging.info("Attempting to promote a unit to primary after quorum loss...")
logging.info("Attempting to promote surviving unit to primary after quorum loss...")
juju.run(
unit=unit_to_promote,
unit=unit_to_survive,
action="promote-to-primary",
params={"scope": "unit", "force": True},
wait=600,
)

logging.info("Resuming frozen units so they can rejoin the cluster")
unfreeze_mysql(juju, units_to_freeze)

with update_interval(juju, "15s"):
logging.info("Waiting for all units to become active after switchover...")
juju.wait(
Expand All @@ -147,20 +145,46 @@ def test_cluster_failover_after_majority_loss(juju: Juju) -> None:
delay=5,
)

assert get_mysql_primary_unit(juju, app_name) == unit_to_promote, "Failover failed"


def kill_pods(juju: Juju, unit_names: list[str]) -> None:
    """Force-delete the pods of the given units in one kubectl invocation.

    Every pod is passed to the same ``kubectl delete pod`` command, together
    with ``--grace-period=0 --force``, in the namespace named by ``juju.model``.

    Args:
        juju: Juju handle; its ``model`` attribute is used as the namespace.
        unit_names: Units whose pods should be deleted. Each name is mapped
            to a pod name with ``get_mysql_instance_label``.

    Raises:
        subprocess.CalledProcessError: If kubectl exits with a non-zero status.
    """
    kubectl_args = ["kubectl", "delete", "pod"]
    kubectl_args.extend(get_mysql_instance_label(name) for name in unit_names)
    kubectl_args += ["-n", juju.model, "--grace-period=0", "--force"]
    subprocess.run(kubectl_args, check=True)
assert get_mysql_primary_unit(juju, app_name) == unit_to_survive, "Failover failed"


def _signal_mysqld(juju: Juju, unit_names: list[str], signal_name: str) -> None:
    """Send ``signal_name`` to mysqld inside the mysql container of each unit.

    Runs ``kubectl exec`` once per unit, in order, and stops at the first
    failing command.

    Args:
        juju: Juju handle; ``juju.model`` is used as the namespace, falling
            back to ``"testing"`` when it is empty or unset.
        unit_names: Units to signal. Each name is mapped to a pod name with
            ``get_mysql_instance_label``.
        signal_name: Signal name as accepted by ``kill`` without the ``SIG``
            prefix (e.g. ``"STOP"`` or ``"CONT"``).

    Raises:
        subprocess.CalledProcessError: If any kubectl invocation exits with a
            non-zero status.
    """
    namespace = juju.model or "testing"
    # pgrep -x matches the exact process name, so only mysqld itself is signalled.
    shell_command = f"kill -{signal_name} $(pgrep -x mysqld)"
    for unit in unit_names:
        pod = get_mysql_instance_label(unit)
        subprocess.check_call(
            [
                "kubectl",
                "exec",
                pod,
                "-n",
                namespace,
                "-c",
                "mysql",
                "--",
                "bash",
                "-c",
                shell_command,
            ],
        )


def freeze_mysql(juju: Juju, unit_names: list[str]) -> None:
    """Freeze mysqld in the mysql container via SIGSTOP to simulate unreachable members.

    Args:
        juju: Juju handle used to resolve the namespace.
        unit_names: Units whose mysqld process should be stopped.

    Raises:
        subprocess.CalledProcessError: If kubectl exits with a non-zero status.
    """
    _signal_mysqld(juju, unit_names, "STOP")


def unfreeze_mysql(juju: Juju, unit_names: list[str]) -> None:
    """Resume frozen mysqld in the mysql container via SIGCONT.

    Args:
        juju: Juju handle used to resolve the namespace.
        unit_names: Units whose mysqld process should be resumed.

    Raises:
        subprocess.CalledProcessError: If kubectl exits with a non-zero status.
    """
    _signal_mysqld(juju, unit_names, "CONT")
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def test_deploy_highly_available_cluster_1(juju: Juju, charm: str) -> None:
charm="mysql-test-app",
app=MYSQL_TEST_APP_NAME_1,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down Expand Up @@ -89,7 +89,7 @@ def test_deploy_highly_available_cluster_2(juju: Juju, charm: str) -> None:
charm="mysql-test-app",
app=MYSQL_TEST_APP_NAME_2,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_deploy_highly_available_cluster(juju: Juju, charm: str) -> None:
charm=MYSQL_TEST_APP_NAME,
app=MYSQL_TEST_APP_NAME,
base="ubuntu@22.04",
channel="latest/edge",
channel="latest/edge/racing",
config={"sleep_interval": 300},
num_units=1,
constraints=constraints,
Expand Down
Loading
Loading