Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 56 additions & 10 deletions sunbeam-python/sunbeam/steps/k8s.py
Original file line number Diff line number Diff line change
Expand Up @@ -2021,9 +2021,12 @@ def __init__(
self.juju_app_name = "k8s"
self.coredns_namespace = "kube-system"
self.coredns_hpa = "ck-dns-coredns"
self.coredns_deployment = "coredns"
self.timeout = 180 # 3 minutes for helm upgrade
self.replica_count = 1

_COREDNS_POLL_INTERVAL = 10

def compute_coredns_replica_count(self, control_nodes: int) -> int:
"""Determine replica count for coredns."""
return 1 if control_nodes < 3 else 3
Expand Down Expand Up @@ -2120,22 +2123,65 @@ def run(self, context: StepContext) -> Result:
)
return Result(ResultType.FAILED, str(e))

# The helm upgrade modifies the CoreDNS pod template (resources), triggering
# a rolling restart. Wait for the k8s application to return to active before
# proceeding so that DNS is available when subsequent steps deploy OpenStack.
# Wait for CoreDNS pods to be ready rather than relying on the k8s
# charm status, which may not reflect external changes (helm upgrade)
# until the next update-status hook fires.
try:
self.jhelper.wait_application_ready(
self.juju_app_name,
self.deployment.openstack_machines_model,
accepted_status=["active"],
timeout=K8S_APP_TIMEOUT,
)
except TimeoutError as e:
self._wait_for_coredns_ready()
except (TimeoutError, K8SError) as e:
LOG.warning(str(e))
return Result(ResultType.FAILED, str(e))

return Result(ResultType.COMPLETED)

def _wait_for_coredns_ready(self) -> None:
"""Wait until CoreDNS deployment has at least replica_count available replicas.

Directly polls the CoreDNS Deployment status rather than relying on the
k8s charm status, which may not reflect external changes (e.g. a helm
upgrade) until the next update-status hook runs.
"""
deadline = time.monotonic() + K8S_APP_TIMEOUT
while time.monotonic() < deadline:
try:
deployment = self.kube.get(
apps_v1.Deployment, name=self.coredns_deployment
)
except l_exceptions.ApiError as e:
status = getattr(e, "status", None)
status_code = getattr(status, "code", None)
if status_code == 404:
raise K8SError(
f"CoreDNS deployment '{self.coredns_deployment}' not found"
) from e
if status_code is not None and 500 <= status_code < 600:
LOG.debug("Transient error while getting coredns deployment: %s", e)
time.sleep(self._COREDNS_POLL_INTERVAL)
continue
raise
available_replicas = (
deployment.status.availableReplicas
if deployment.status and deployment.status.availableReplicas is not None
else 0
)
if available_replicas >= self.replica_count:
LOG.debug(
"CoreDNS deployment ready: %d/%d replicas available",
available_replicas,
self.replica_count,
)
return
LOG.debug(
"Waiting for CoreDNS availableReplicas >= %d (current: %d)",
self.replica_count,
available_replicas,
)
time.sleep(self._COREDNS_POLL_INTERVAL)
raise TimeoutError(
f"CoreDNS deployment did not reach {self.replica_count} available"
f" replicas within {K8S_APP_TIMEOUT}s"
)


class PatchServiceExternalTrafficStep(BaseStep):
"""Patch service external traffic policy to Local.
Expand Down
152 changes: 127 additions & 25 deletions sunbeam-python/tests/unit/sunbeam/steps/test_k8s.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from sunbeam.errors import SunbeamException
from sunbeam.steps.k8s import (
CREDENTIAL_SUFFIX,
K8S_APP_TIMEOUT,
K8S_CLOUD_SUFFIX,
AddK8SCloudStep,
AddK8SCredentialStep,
Expand Down Expand Up @@ -1365,67 +1364,170 @@ def test_is_skip_control_nodes_removed(self, step, kube, step_context):

def test_run(self, step, jhelper):
jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=0)
result = step.run(None)
with patch.object(step, "_wait_for_coredns_ready"):
result = step.run(None)
assert result.result_type == ResultType.COMPLETED
jhelper.get_leader_unit.assert_called_once()
jhelper.run_cmd_on_machine_unit_payload.assert_called_once()
jhelper.wait_application_ready.assert_called_once()

def test_run_helm_upgrade_failed(self, step, jhelper):
jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=1)
result = step.run(None)
with patch.object(step, "_wait_for_coredns_ready") as mock_wait:
result = step.run(None)
assert result.result_type == ResultType.FAILED
jhelper.get_leader_unit.assert_called_once()
jhelper.run_cmd_on_machine_unit_payload.assert_called_once()
jhelper.wait_application_ready.assert_not_called()
mock_wait.assert_not_called()

def test_run_failed_on_juju_run_on_machine_unit(self, step, jhelper):
jhelper.run_cmd_on_machine_unit_payload.side_effect = JujuException(
"Not able to run command"
)
result = step.run(None)
with patch.object(step, "_wait_for_coredns_ready") as mock_wait:
result = step.run(None)
assert result.result_type == ResultType.FAILED
jhelper.get_leader_unit.assert_called_once()
jhelper.run_cmd_on_machine_unit_payload.assert_called_once()
jhelper.wait_application_ready.assert_not_called()
mock_wait.assert_not_called()

def test_run_leader_not_found(self, step, jhelper):
jhelper.get_leader_unit.side_effect = LeaderNotFoundException(
"Leader missing..."
)
result = step.run(None)
with patch.object(step, "_wait_for_coredns_ready") as mock_wait:
result = step.run(None)
assert result.result_type == ResultType.FAILED
jhelper.get_leader_unit.assert_called_once()
jhelper.run_cmd_on_machine_unit_payload.assert_not_called()
jhelper.wait_application_ready.assert_not_called()
mock_wait.assert_not_called()

def test_run_waits_for_k8s_application_ready(self, step, jhelper):
"""wait_application_ready must be called after a successful helm upgrade.
def test_run_waits_for_coredns_ready(self, step, jhelper):
"""_wait_for_coredns_ready must be called after a successful helm upgrade.

Ensures DNS is available before subsequent OpenStack deployment steps.
"""
jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=0)
result = step.run(None)
with patch.object(step, "_wait_for_coredns_ready") as mock_wait:
result = step.run(None)
assert result.result_type == ResultType.COMPLETED
jhelper.wait_application_ready.assert_called_once_with(
"k8s",
step.deployment.openstack_machines_model,
accepted_status=["active"],
timeout=K8S_APP_TIMEOUT,
)
mock_wait.assert_called_once()

def test_run_wait_application_ready_timeout(self, step, jhelper):
"""A TimeoutError from wait_application_ready must propagate as FAILED.
def test_run_coredns_ready_timeout(self, step, jhelper):
"""A TimeoutError from _wait_for_coredns_ready must propagate as FAILED.

Ensures the deployment does not silently proceed with DNS unavailable.
"""
jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=0)
jhelper.wait_application_ready.side_effect = TimeoutError(
"Timed out waiting for k8s to be ready"
)
result = step.run(None)
with patch.object(
step,
"_wait_for_coredns_ready",
side_effect=TimeoutError("timed out"),
):
result = step.run(None)
assert result.result_type == ResultType.FAILED

def test_run_coredns_deployment_not_found(self, step, jhelper):
"""A K8SError from _wait_for_coredns_ready must propagate as FAILED."""
jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=0)
with patch.object(
step,
"_wait_for_coredns_ready",
side_effect=K8SError("CoreDNS deployment 'ck-dns-coredns' not found"),
):
result = step.run(None)
assert result.result_type == ResultType.FAILED
jhelper.wait_application_ready.assert_called_once()

def test_wait_for_coredns_ready_when_already_ready(self, step, kube):
"""Returns immediately when availableReplicas >= replica_count."""
step.kube = kube
step.replica_count = 2
deployment = Mock()
deployment.status = Mock()
deployment.status.availableReplicas = 2
kube.get = Mock(return_value=deployment)
step._wait_for_coredns_ready() # must not raise
kube.get.assert_called_once()

def test_wait_for_coredns_ready_polls_until_ready(self, step, kube):
"""Polls until availableReplicas reaches replica_count."""
step.kube = kube
step.replica_count = 1
not_ready = Mock()
not_ready.status = Mock()
not_ready.status.availableReplicas = 0
ready = Mock()
ready.status = Mock()
ready.status.availableReplicas = 1
kube.get = Mock(side_effect=[not_ready, ready])
with patch("sunbeam.steps.k8s.time.sleep"):
step._wait_for_coredns_ready()
assert kube.get.call_count == 2

def test_wait_for_coredns_ready_retries_on_5xx_api_error(self, step, kube):
"""Retries only on transient 5xx errors."""
step.kube = kube
step.replica_count = 1
api_error = ApiError(
Mock(),
httpx.Response(
status_code=503,
content=json.dumps({"code": 503, "message": "service unavailable"}),
),
)
ready = Mock()
ready.status = Mock()
ready.status.availableReplicas = 1
kube.get = Mock(side_effect=[api_error, ready])
with patch("sunbeam.steps.k8s.time.sleep"):
step._wait_for_coredns_ready()
assert kube.get.call_count == 2

def test_wait_for_coredns_ready_fails_fast_on_404(self, step, kube):
"""Raises K8SError immediately when deployment is not found (404)."""
step.kube = kube
step.replica_count = 1
api_error = ApiError(
Mock(),
httpx.Response(
status_code=404,
content=json.dumps({"code": 404, "message": "not found"}),
),
)
kube.get = Mock(side_effect=api_error)
with pytest.raises(K8SError, match="not found"):
step._wait_for_coredns_ready()
kube.get.assert_called_once()

def test_wait_for_coredns_ready_reraises_non_transient_api_error(self, step, kube):
"""Re-raises ApiError for non-404, non-5xx status codes."""
step.kube = kube
step.replica_count = 1
api_error = ApiError(
Mock(),
httpx.Response(
status_code=403,
content=json.dumps({"code": 403, "message": "forbidden"}),
),
)
kube.get = Mock(side_effect=api_error)
with pytest.raises(ApiError):
step._wait_for_coredns_ready()
kube.get.assert_called_once()

def test_wait_for_coredns_ready_timeout(self, step, kube):
"""Raises TimeoutError when pods do not become ready in time."""
step.kube = kube
step.replica_count = 3
deployment = Mock()
deployment.status = Mock()
deployment.status.availableReplicas = 0
kube.get = Mock(return_value=deployment)
with (
patch("sunbeam.steps.k8s.K8S_APP_TIMEOUT", 0),
patch("sunbeam.steps.k8s.time.sleep"),
):
with pytest.raises(TimeoutError):
step._wait_for_coredns_ready()


class TestPatchServiceExternalTrafficStep:
Expand Down
Loading