From 8985db70fba87c575e37ff4fe560bc5417bc4788 Mon Sep 17 00:00:00 2001 From: Hemanth Nakkina Date: Fri, 24 Apr 2026 09:38:23 +0530 Subject: [PATCH] k8s: wait for CoreDNS pods available before proceeding PatchCoreDNSStep runs a helm upgrade to apply HPA and resource settings to CoreDNS. Previously it waited for the k8s Juju application to reach active status, but the k8s charm does not react to external changes until the next update-status hook fires, so the application could be seen as active while CoreDNS pods were still restarting. Replace the Juju application wait with a direct poll of the CoreDNS Deployment's availableReplicas field via the k8s API. The step now waits until availableReplicas >= minReplicas (replica_count), which ensures DNS is fully available before subsequent OpenStack deployment steps proceed. Fixes: LP#2148371 Assisted-By: Claude:claude-4.6-sonnet Signed-off-by: Hemanth Nakkina --- sunbeam-python/sunbeam/steps/k8s.py | 66 ++++++-- .../tests/unit/sunbeam/steps/test_k8s.py | 152 +++++++++++++++--- 2 files changed, 183 insertions(+), 35 deletions(-) diff --git a/sunbeam-python/sunbeam/steps/k8s.py b/sunbeam-python/sunbeam/steps/k8s.py index a64743ff7..6f0968a25 100644 --- a/sunbeam-python/sunbeam/steps/k8s.py +++ b/sunbeam-python/sunbeam/steps/k8s.py @@ -2021,9 +2021,12 @@ def __init__( self.juju_app_name = "k8s" self.coredns_namespace = "kube-system" self.coredns_hpa = "ck-dns-coredns" + self.coredns_deployment = "coredns" self.timeout = 180 # 3 minutes for helm upgrade self.replica_count = 1 + _COREDNS_POLL_INTERVAL = 10 + def compute_coredns_replica_count(self, control_nodes: int) -> int: """Determine replica count for coredns.""" return 1 if control_nodes < 3 else 3 @@ -2120,22 +2123,65 @@ def run(self, context: StepContext) -> Result: ) return Result(ResultType.FAILED, str(e)) - # The helm upgrade modifies the CoreDNS pod template (resources), triggering - # a rolling restart. Wait for the k8s application to return to active before - # proceeding so that DNS is available when subsequent steps deploy OpenStack. + # Wait for CoreDNS pods to be ready rather than relying on the k8s + # charm status, which may not reflect external changes (helm upgrade) + # until the next update-status hook fires. try: - self.jhelper.wait_application_ready( - self.juju_app_name, - self.deployment.openstack_machines_model, - accepted_status=["active"], - timeout=K8S_APP_TIMEOUT, - ) - except TimeoutError as e: + self._wait_for_coredns_ready() + except (TimeoutError, K8SError) as e: LOG.warning(str(e)) return Result(ResultType.FAILED, str(e)) return Result(ResultType.COMPLETED) + def _wait_for_coredns_ready(self) -> None: + """Wait until CoreDNS deployment has at least replica_count available replicas. + + Directly polls the CoreDNS Deployment status rather than relying on the + k8s charm status, which may not reflect external changes (e.g. a helm + upgrade) until the next update-status hook runs. + """ + deadline = time.monotonic() + K8S_APP_TIMEOUT + while time.monotonic() < deadline: + try: + deployment = self.kube.get( + apps_v1.Deployment, name=self.coredns_deployment + ) + except l_exceptions.ApiError as e: + status = getattr(e, "status", None) + status_code = getattr(status, "code", None) + if status_code == 404: + raise K8SError( + f"CoreDNS deployment '{self.coredns_deployment}' not found" + ) from e + if status_code is not None and 500 <= status_code < 600: + LOG.debug("Transient error while getting coredns deployment: %s", e) + time.sleep(self._COREDNS_POLL_INTERVAL) + continue + raise + available_replicas = ( + deployment.status.availableReplicas + if deployment.status and deployment.status.availableReplicas is not None + else 0 + ) + if available_replicas >= self.replica_count: + LOG.debug( + "CoreDNS deployment ready: %d/%d replicas available", + available_replicas, + self.replica_count, + ) + return + LOG.debug( + "Waiting for CoreDNS availableReplicas >= %d (current: %d)", + self.replica_count, + available_replicas, + ) + time.sleep(self._COREDNS_POLL_INTERVAL) + raise TimeoutError( + f"CoreDNS deployment did not reach {self.replica_count} available" + f" replicas within {K8S_APP_TIMEOUT}s" + ) + class PatchServiceExternalTrafficStep(BaseStep): """Patch service external traffic policy to Local. diff --git a/sunbeam-python/tests/unit/sunbeam/steps/test_k8s.py b/sunbeam-python/tests/unit/sunbeam/steps/test_k8s.py index 1fd5ebd15..2103d55e2 100644 --- a/sunbeam-python/tests/unit/sunbeam/steps/test_k8s.py +++ b/sunbeam-python/tests/unit/sunbeam/steps/test_k8s.py @@ -25,7 +25,6 @@ from sunbeam.errors import SunbeamException from sunbeam.steps.k8s import ( CREDENTIAL_SUFFIX, - K8S_APP_TIMEOUT, K8S_CLOUD_SUFFIX, AddK8SCloudStep, AddK8SCredentialStep, @@ -1365,67 +1364,170 @@ def test_is_skip_control_nodes_removed(self, step, kube, step_context): def test_run(self, step, jhelper): jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=0) - result = step.run(None) + with patch.object(step, "_wait_for_coredns_ready"): + result = step.run(None) assert result.result_type == ResultType.COMPLETED jhelper.get_leader_unit.assert_called_once() jhelper.run_cmd_on_machine_unit_payload.assert_called_once() - jhelper.wait_application_ready.assert_called_once() def test_run_helm_upgrade_failed(self, step, jhelper): jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=1) - result = step.run(None) + with patch.object(step, "_wait_for_coredns_ready") as mock_wait: + result = step.run(None) assert result.result_type == ResultType.FAILED jhelper.get_leader_unit.assert_called_once() jhelper.run_cmd_on_machine_unit_payload.assert_called_once() - jhelper.wait_application_ready.assert_not_called() + mock_wait.assert_not_called() def test_run_failed_on_juju_run_on_machine_unit(self, step, jhelper): jhelper.run_cmd_on_machine_unit_payload.side_effect = JujuException( "Not able to run command" ) - result = step.run(None) + with patch.object(step, "_wait_for_coredns_ready") as mock_wait: + result = step.run(None) assert result.result_type == ResultType.FAILED jhelper.get_leader_unit.assert_called_once() jhelper.run_cmd_on_machine_unit_payload.assert_called_once() - jhelper.wait_application_ready.assert_not_called() + mock_wait.assert_not_called() def test_run_leader_not_found(self, step, jhelper): jhelper.get_leader_unit.side_effect = LeaderNotFoundException( "Leader missing..." ) - result = step.run(None) + with patch.object(step, "_wait_for_coredns_ready") as mock_wait: + result = step.run(None) assert result.result_type == ResultType.FAILED jhelper.get_leader_unit.assert_called_once() jhelper.run_cmd_on_machine_unit_payload.assert_not_called() - jhelper.wait_application_ready.assert_not_called() + mock_wait.assert_not_called() - def test_run_waits_for_k8s_application_ready(self, step, jhelper): - """wait_application_ready must be called after a successful helm upgrade. + def test_run_waits_for_coredns_ready(self, step, jhelper): + """_wait_for_coredns_ready must be called after a successful helm upgrade. Ensures DNS is available before subsequent OpenStack deployment steps. """ jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=0) - result = step.run(None) + with patch.object(step, "_wait_for_coredns_ready") as mock_wait: + result = step.run(None) assert result.result_type == ResultType.COMPLETED - jhelper.wait_application_ready.assert_called_once_with( - "k8s", - step.deployment.openstack_machines_model, - accepted_status=["active"], - timeout=K8S_APP_TIMEOUT, - ) + mock_wait.assert_called_once() - def test_run_wait_application_ready_timeout(self, step, jhelper): - """A TimeoutError from wait_application_ready must propagate as FAILED. + def test_run_coredns_ready_timeout(self, step, jhelper): + """A TimeoutError from _wait_for_coredns_ready must propagate as FAILED. Ensures the deployment does not silently proceed with DNS unavailable. """ jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=0) - jhelper.wait_application_ready.side_effect = TimeoutError( - "Timed out waiting for k8s to be ready" - ) - result = step.run(None) + with patch.object( + step, + "_wait_for_coredns_ready", + side_effect=TimeoutError("timed out"), + ): + result = step.run(None) + assert result.result_type == ResultType.FAILED + + def test_run_coredns_deployment_not_found(self, step, jhelper): + """A K8SError from _wait_for_coredns_ready must propagate as FAILED.""" + jhelper.run_cmd_on_machine_unit_payload.return_value = Mock(return_code=0) + with patch.object( + step, + "_wait_for_coredns_ready", + side_effect=K8SError("CoreDNS deployment 'ck-dns-coredns' not found"), + ): + result = step.run(None) assert result.result_type == ResultType.FAILED - jhelper.wait_application_ready.assert_called_once() + + def test_wait_for_coredns_ready_when_already_ready(self, step, kube): + """Returns immediately when availableReplicas >= replica_count.""" + step.kube = kube + step.replica_count = 2 + deployment = Mock() + deployment.status = Mock() + deployment.status.availableReplicas = 2 + kube.get = Mock(return_value=deployment) + step._wait_for_coredns_ready() # must not raise + kube.get.assert_called_once() + + def test_wait_for_coredns_ready_polls_until_ready(self, step, kube): + """Polls until availableReplicas reaches replica_count.""" + step.kube = kube + step.replica_count = 1 + not_ready = Mock() + not_ready.status = Mock() + not_ready.status.availableReplicas = 0 + ready = Mock() + ready.status = Mock() + ready.status.availableReplicas = 1 + kube.get = Mock(side_effect=[not_ready, ready]) + with patch("sunbeam.steps.k8s.time.sleep"): + step._wait_for_coredns_ready() + assert kube.get.call_count == 2 + + def test_wait_for_coredns_ready_retries_on_5xx_api_error(self, step, kube): + """Retries only on transient 5xx errors.""" + step.kube = kube + step.replica_count = 1 + api_error = ApiError( + Mock(), + httpx.Response( + status_code=503, + content=json.dumps({"code": 503, "message": "service unavailable"}), + ), + ) + ready = Mock() + ready.status = Mock() + ready.status.availableReplicas = 1 + kube.get = Mock(side_effect=[api_error, ready]) + with patch("sunbeam.steps.k8s.time.sleep"): + step._wait_for_coredns_ready() + assert kube.get.call_count == 2 + + def test_wait_for_coredns_ready_fails_fast_on_404(self, step, kube): + """Raises K8SError immediately when deployment is not found (404).""" + step.kube = kube + step.replica_count = 1 + api_error = ApiError( + Mock(), + httpx.Response( + status_code=404, + content=json.dumps({"code": 404, "message": "not found"}), + ), + ) + kube.get = Mock(side_effect=api_error) + with pytest.raises(K8SError, match="not found"): + step._wait_for_coredns_ready() + kube.get.assert_called_once() + + def test_wait_for_coredns_ready_reraises_non_transient_api_error(self, step, kube): + """Re-raises ApiError for non-404, non-5xx status codes.""" + step.kube = kube + step.replica_count = 1 + api_error = ApiError( + Mock(), + httpx.Response( + status_code=403, + content=json.dumps({"code": 403, "message": "forbidden"}), + ), + ) + kube.get = Mock(side_effect=api_error) + with pytest.raises(ApiError): + step._wait_for_coredns_ready() + kube.get.assert_called_once() + + def test_wait_for_coredns_ready_timeout(self, step, kube): + """Raises TimeoutError when pods do not become ready in time.""" + step.kube = kube + step.replica_count = 3 + deployment = Mock() + deployment.status = Mock() + deployment.status.availableReplicas = 0 + kube.get = Mock(return_value=deployment) + with ( + patch("sunbeam.steps.k8s.K8S_APP_TIMEOUT", 0), + patch("sunbeam.steps.k8s.time.sleep"), + ): + with pytest.raises(TimeoutError): + step._wait_for_coredns_ready() class TestPatchServiceExternalTrafficStep: