From 5b462df46b91cb74fe64f933c50475cef30d0ede Mon Sep 17 00:00:00 2001 From: jansdhillon Date: Fri, 17 Apr 2026 15:26:43 -0600 Subject: [PATCH 1/3] fix(tests): make integration tests idempotent against live models [LNDENG-4308] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace all juju.ssh() calls with juju.exec() — JAAS does not proxy SSH to units, so juju.ssh is broken on live PS7 models. juju.exec runs the command directly over the Juju API. Affected: wait_for_service (helpers.py), test_landscape_schema_migrated, test_ubuntu_installer_attach_toggle_no_maintenance - Wrap test_modern_database_relation and test_legacy_db_relation in try/finally so restore_db_relations() is guaranteed to run even if the assertion fails, leaving the model in a known-good state. - Skip test_upgrade_action_updates_ppa when USE_HOST_JUJU_MODEL=True. This test mutates PPA state on the live unit and is unsuitable for live model runs; it belongs in a dedicated upgrade pipeline. - Fix USE_HOST_JUJU_MODEL and USE_HOST_LBAAS_MODEL to use bool(os.getenv(...)) instead of os.getenv(..., False) — the old form returned False or str, which made the type misleading even though truthiness was correct. --- tests/integration/conftest.py | 4 +- tests/integration/helpers.py | 2 +- tests/integration/test_bundle.py | 68 ++++++++++++++++++-------------- 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 56c4137a..c2deef59 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -20,13 +20,13 @@ WAIT_TIMEOUT_SECONDS = 60 * 20 # Landscape takes a long time to deploy. -USE_HOST_JUJU_MODEL = os.getenv("LANDSCAPE_CHARM_USE_HOST_JUJU_MODEL", False) +USE_HOST_JUJU_MODEL = bool(os.getenv("LANDSCAPE_CHARM_USE_HOST_JUJU_MODEL")) """ If `True`, return a reference the current Juju model on the host instead of a temporary model. 
""" -USE_HOST_LBAAS_MODEL = os.getenv("LANDSCAPE_CHARM_USE_HOST_LBAAS_MODEL", False) +USE_HOST_LBAAS_MODEL = bool(os.getenv("LANDSCAPE_CHARM_USE_HOST_LBAAS_MODEL")) """ If `True`, use existing LBaaS model instead of creating a temporary one. The model name should be set in `LBAAS_MODEL_NAME` environment variable. diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index e43d9da4..a669c5b9 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -157,7 +157,7 @@ def wait_for_service( last_exc: jubilant.CLIError | None = None while time.monotonic() < deadline: try: - juju.ssh(unit, f"systemctl is-active {service}.service") + juju.exec(f"systemctl is-active {service}.service", unit=unit) return except jubilant.CLIError as e: last_exc = e diff --git a/tests/integration/test_bundle.py b/tests/integration/test_bundle.py index d9f3b8de..e1bb1a67 100644 --- a/tests/integration/test_bundle.py +++ b/tests/integration/test_bundle.py @@ -12,6 +12,7 @@ import pytest from charm import DEFAULT_SERVICES, LANDSCAPE_UBUNTU_INSTALLER_ATTACH, LEADER_SERVICES +from tests.integration.conftest import USE_HOST_JUJU_MODEL from tests.integration.helpers import ( get_session, has_legacy_pg, @@ -203,21 +204,24 @@ def test_modern_database_relation(juju: jubilant.Juju, lbaas: jubilant.Juju): status = juju.status() initial_relations = set(status.apps["landscape-server"].relations) - if "db" in initial_relations: - juju.remove_relation("landscape-server:db", "postgresql:db-admin", force=True) - juju.wait(lambda status: not has_legacy_pg(juju), timeout=120) - - juju.integrate("landscape-server:database", "postgresql:database") + try: + if "db" in initial_relations: + juju.remove_relation( + "landscape-server:db", "postgresql:db-admin", force=True + ) + juju.wait(lambda status: not has_legacy_pg(juju), timeout=120) - elif "database" not in initial_relations: - juju.integrate("landscape-server:database", "postgresql:database") - juju.wait(lambda status: 
has_modern_pg(juju), timeout=120) + juju.integrate("landscape-server:database", "postgresql:database") - relations = set(juju.status().apps["landscape-server"].relations) + elif "database" not in initial_relations: + juju.integrate("landscape-server:database", "postgresql:database") + juju.wait(lambda status: has_modern_pg(juju), timeout=120) - assert "database" in relations + relations = set(juju.status().apps["landscape-server"].relations) - restore_db_relations(juju, initial_relations) + assert "database" in relations + finally: + restore_db_relations(juju, initial_relations) def test_legacy_db_relation(juju: jubilant.Juju, lbaas: jubilant.Juju): @@ -230,22 +234,23 @@ def test_legacy_db_relation(juju: jubilant.Juju, lbaas: jubilant.Juju): status = juju.status() initial_relations = set(status.apps["landscape-server"].relations) - if "database" in initial_relations: - juju.remove_relation( - "landscape-server:database", "postgresql:database", force=True - ) - juju.wait(lambda status: not has_modern_pg(juju), timeout=120) - juju.integrate("landscape-server:db", "postgresql:db-admin") - - elif "db" not in initial_relations: - juju.integrate("landscape-server:db", "postgresql:db-admin") - juju.wait(lambda status: has_legacy_pg(juju), timeout=120) + try: + if "database" in initial_relations: + juju.remove_relation( + "landscape-server:database", "postgresql:database", force=True + ) + juju.wait(lambda status: not has_modern_pg(juju), timeout=120) + juju.integrate("landscape-server:db", "postgresql:db-admin") - relations = set(juju.status().apps["landscape-server"].relations) + elif "db" not in initial_relations: + juju.integrate("landscape-server:db", "postgresql:db-admin") + juju.wait(lambda status: has_legacy_pg(juju), timeout=120) - assert "db" in relations + relations = set(juju.status().apps["landscape-server"].relations) - restore_db_relations(juju, initial_relations) + assert "db" in relations + finally: + restore_db_relations(juju, initial_relations) def 
test_pgbouncer_relation(juju: jubilant.Juju, bundle: None): @@ -301,12 +306,12 @@ def test_landscape_schema_migrated(juju: jubilant.Juju, bundle: None): host, port = stores["host"].split(":") password, user, dbname = stores["password"], stores["user"], stores["main"] - result = juju.ssh( - "landscape-server/leader", + result = juju.exec( f"PGPASSWORD={password} psql -h {host} -p {port} -U {user} -d {dbname}" ' -tAc "SELECT COUNT(*) FROM information_schema.tables' " WHERE table_schema = 'public' AND table_name = 'account';\"", - ).strip() + unit="landscape-server/leader", + ).stdout.strip() assert result == "1", ( "Expected the 'account' table to exist in the landscape database, " @@ -402,9 +407,9 @@ def test_ubuntu_installer_attach_toggle_no_maintenance( for name in status.apps["landscape-server"].units.keys(): with pytest.raises(Exception): - juju.ssh( - name, + juju.exec( f"systemctl is-active {LANDSCAPE_UBUNTU_INSTALLER_ATTACH}.service", + unit=name, ) finally: @@ -763,6 +768,11 @@ def test_upgrade_action_updates_ppa(juju: jubilant.Juju, bundle: None): sources before upgrading, so switching PPAs (ex. upgrade from self-hosted-24.04 to self-hosted-beta) works correctly. """ + if USE_HOST_JUJU_MODEL: + pytest.skip( + "test_upgrade_action_updates_ppa mutates PPA state and is not safe " + "to run against a live model. Run in a dedicated upgrade pipeline instead." 
+ ) juju.wait(jubilant.all_active, timeout=300) landscape_ppa = juju.config("landscape-server").get( From fdf068af9a2376891a5dffaf12a98420a30e5446 Mon Sep 17 00:00:00 2001 From: jansdhillon Date: Fri, 17 Apr 2026 16:50:39 -0600 Subject: [PATCH 2/3] fix: address review feedback on integration test idempotency [LNDENG-4308] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - test_modern_database_relation: add juju.wait(has_modern_pg) after integrating database relation when switching from legacy, so status is polled until the relation actually appears before asserting - test_legacy_db_relation: same — add juju.wait(has_legacy_pg) after integrating db relation when switching from modern - test_ubuntu_installer_attach_toggle_no_maintenance: narrow pytest.raises(Exception) to pytest.raises(jubilant.CLIError) so only the expected non-zero exit failure is caught and transport errors are not --- tests/integration/test_bundle.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_bundle.py b/tests/integration/test_bundle.py index e1bb1a67..95f5dcfe 100644 --- a/tests/integration/test_bundle.py +++ b/tests/integration/test_bundle.py @@ -212,6 +212,7 @@ def test_modern_database_relation(juju: jubilant.Juju, lbaas: jubilant.Juju): juju.wait(lambda status: not has_legacy_pg(juju), timeout=120) juju.integrate("landscape-server:database", "postgresql:database") + juju.wait(lambda status: has_modern_pg(juju), timeout=120) elif "database" not in initial_relations: juju.integrate("landscape-server:database", "postgresql:database") @@ -241,6 +242,7 @@ def test_legacy_db_relation(juju: jubilant.Juju, lbaas: jubilant.Juju): ) juju.wait(lambda status: not has_modern_pg(juju), timeout=120) juju.integrate("landscape-server:db", "postgresql:db-admin") + juju.wait(lambda status: has_legacy_pg(juju), timeout=120) elif "db" not in initial_relations: juju.integrate("landscape-server:db", "postgresql:db-admin") @@ 
-406,7 +408,7 @@ def test_ubuntu_installer_attach_toggle_no_maintenance( assert status.apps["landscape-server"].app_status.current == "active" for name in status.apps["landscape-server"].units.keys(): - with pytest.raises(Exception): + with pytest.raises(jubilant.CLIError): juju.exec( f"systemctl is-active {LANDSCAPE_UBUNTU_INSTALLER_ATTACH}.service", unit=name, From 8c216f9476a1f5bbd46b4783167c1961c3901a37 Mon Sep 17 00:00:00 2001 From: jansdhillon Date: Fri, 17 Apr 2026 21:42:40 -0600 Subject: [PATCH 3/3] feat(tests): add service placement, port, leadership, scale, and pg-failover tests [LNDENG-4308] - test_leader_services_not_on_non_leaders: assert LEADER_SERVICES are NOT active on non-leader units (explicit negative; safe on live models) - test_service_ports_listening: verify all-unit ports and leader-only ports via ss; guards against HAProxy charm config regressions (safe on live models) - test_leadership_change_service_follows: remove leader unit, verify LEADER_SERVICES migrate to new leader and DEFAULT_SERVICES stay up (skip on live model) - test_scale_out_and_in: add/remove a unit, verify service placement throughout (skip on live model) - test_postgresql_failover_services_recover: stop pg leader, verify landscape-server recovers automatically and appserver stays healthy (skip on live model) --- tests/integration/test_bundle.py | 237 +++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) diff --git a/tests/integration/test_bundle.py b/tests/integration/test_bundle.py index 95f5dcfe..bd70952b 100644 --- a/tests/integration/test_bundle.py +++ b/tests/integration/test_bundle.py @@ -811,3 +811,240 @@ def test_upgrade_action_updates_ppa(juju: jubilant.Juju, bundle: None): juju.run(unit_name, "upgrade") juju.run(unit_name, "resume") juju.wait(jubilant.all_active, timeout=300) + + +# --------------------------------------------------------------------------- +# Service placement and port tests +# 
--------------------------------------------------------------------------- + +_ALL_UNIT_PORTS = [8080, 8070, 8090, 9080] # appserver, pingserver, message-server, api +_LEADER_ONLY_PORTS = [9100, 9099] # package-upload, package-search + + +def test_leader_services_not_on_non_leaders( + juju: jubilant.Juju, lbaas: jubilant.Juju +): + """ + LEADER_SERVICES (package-search, package-upload) must NOT be active on + non-leader units. test_all_services_up only asserts the positive side; + this asserts the negative to catch misconfigured multi-unit deployments. + """ + status = juju.status() + units = status.apps["landscape-server"].units + + if len(units) <= 1: + pytest.skip("Need more than 1 unit to verify non-leader service placement") + + juju.wait(jubilant.all_active, timeout=300) + + for name, unit_status in units.items(): + if not unit_status.leader: + for service in LEADER_SERVICES: + with pytest.raises(jubilant.CLIError): + juju.exec( + f"systemctl is-active {service}.service", + unit=name, + ) + + +def test_service_ports_listening(juju: jubilant.Juju, lbaas: jubilant.Juju): + """ + Verify that services are listening on the expected ports on the expected units. + + All-unit services (appserver:8080, pingserver:8070, message-server:8090, + api:9080) must be open on every unit. Leader-only services + (package-upload:9100, package-search:9099) must be open only on the leader. + + This guards against HAProxy charm config changes silently dropping backend + ports, which previously nearly brought down prod. 
+ """ + juju.wait(jubilant.all_active, timeout=300) + + status = juju.status() + units = status.apps["landscape-server"].units + + for name, unit_status in units.items(): + for port in _ALL_UNIT_PORTS: + output = juju.exec(f"ss -tlnH 'sport = :{port}'", unit=name) + assert str(port) in output, ( + f"Expected port {port} to be listening on {name}, got: {output!r}" + ) + + for port in _LEADER_ONLY_PORTS: + output = juju.exec(f"ss -tlnH 'sport = :{port}'", unit=name) + if unit_status.leader: + assert str(port) in output, ( + f"Expected leader-only port {port} on leader {name}, " + f"got: {output!r}" + ) + else: + assert str(port) not in output, ( + f"Leader-only port {port} must not be open on non-leader {name}" + ) + + +# --------------------------------------------------------------------------- +# Destructive tests — skipped on live models +# --------------------------------------------------------------------------- + + +def test_leadership_change_service_follows( + juju: jubilant.Juju, lbaas: jubilant.Juju +): + """ + When the current leader unit is removed, a new leader is elected and + LEADER_SERVICES must start on that new leader while DEFAULT_SERVICES remain + active across all surviving units. + + Skipped on live models because it removes a unit temporarily. + """ + if USE_HOST_JUJU_MODEL: + pytest.skip( + "test_leadership_change_service_follows removes a unit; " + "skipped on live model." 
+ ) + + status = juju.status() + units = status.apps["landscape-server"].units + + if len(units) <= 1: + pytest.skip("Need more than 1 unit to test leadership change") + + juju.wait(jubilant.all_active, timeout=300) + + leader_name = next(n for n, u in units.items() if u.leader) + + try: + juju.cli("remove-unit", leader_name, "--no-wait") + + def new_leader_active(status): + app_units = status.apps["landscape-server"].units + return any( + u.leader and u.workload_status.current == "active" + for u in app_units.values() + ) + + juju.wait(new_leader_active, timeout=600) + + new_status = juju.status() + new_leader = next( + n + for n, u in new_status.apps["landscape-server"].units.items() + if u.leader + ) + + for service in LEADER_SERVICES: + wait_for_service(juju, new_leader, service) + + for name in new_status.apps["landscape-server"].units: + for service in DEFAULT_SERVICES: + wait_for_service(juju, name, service) + finally: + juju.cli("add-unit", "landscape-server", "-n", "1") + juju.wait(jubilant.all_active, timeout=600) + + +def test_scale_out_and_in(juju: jubilant.Juju, lbaas: jubilant.Juju): + """ + Adding a unit brings it to active with all DEFAULT_SERVICES running and + LEADER_SERVICES absent. Removing it leaves the model healthy. + + Skipped on live models. 
+ """ + if USE_HOST_JUJU_MODEL: + pytest.skip("test_scale_out_and_in adds/removes units; skipped on live model.") + + juju.wait(jubilant.all_active, timeout=300) + + original_count = len(juju.status().apps["landscape-server"].units) + + try: + juju.cli("add-unit", "landscape-server", "-n", "1") + juju.wait(jubilant.all_active, timeout=600) + + status = juju.status() + units = status.apps["landscape-server"].units + assert len(units) == original_count + 1, ( + "Expected one extra unit after scale-out" + ) + + new_unit = next( + n for n, u in units.items() if not u.leader and n not in { + nu for nu in units if units[nu].leader + } + ) + + for service in DEFAULT_SERVICES: + wait_for_service(juju, new_unit, service) + + for service in LEADER_SERVICES: + with pytest.raises(jubilant.CLIError): + juju.exec(f"systemctl is-active {service}.service", unit=new_unit) + + juju.cli("remove-unit", new_unit, "--no-wait") + juju.wait(jubilant.all_active, timeout=600) + + assert ( + len(juju.status().apps["landscape-server"].units) == original_count + ), "Unit count did not return to original after scale-in" + finally: + current = len(juju.status().apps["landscape-server"].units) + if current != original_count: + juju.cli( + "remove-unit", + sorted(juju.status().apps["landscape-server"].units)[-1], + "--no-wait", + ) + juju.wait(jubilant.all_active, timeout=600) + + +def test_postgresql_failover_services_recover( + juju: jubilant.Juju, lbaas: jubilant.Juju +): + """ + After a PostgreSQL leader failover, landscape-server services must recover + automatically without manual intervention and continue serving HTTP. + + Skipped on live models; intended for a dedicated failover pipeline. + """ + if USE_HOST_JUJU_MODEL: + pytest.skip( + "test_postgresql_failover_services_recover triggers a pg failover; " + "skipped on live model. Run in a dedicated failover pipeline." 
+ ) + + pg_status = juju.status() + pg_apps = [a for a in pg_status.apps if a.startswith("postgresql")] + if not pg_apps: + pytest.skip("No postgresql app found in model") + + pg_app = pg_apps[0] + pg_units = pg_status.apps[pg_app].units + pg_leader = next( + (n for n, u in pg_units.items() if u.leader), None + ) + if pg_leader is None or len(pg_units) <= 1: + pytest.skip("Need a multi-unit postgresql with a leader to test failover") + + juju.wait(jubilant.all_active, timeout=300) + + haproxy_ip = _haproxy_ip(juju, lbaas) + + try: + juju.exec("sudo systemctl stop postgresql", unit=pg_leader) + + juju.wait(jubilant.all_active, timeout=600) + + status = juju.status() + for name in status.apps["landscape-server"].units: + for service in DEFAULT_SERVICES: + wait_for_service(juju, name, service) + + session = get_session() + response = session.get(f"https://{haproxy_ip}/ping", verify=False, timeout=30) + assert response.status_code == 200, ( + f"appserver /ping returned {response.status_code} after pg failover" + ) + finally: + juju.exec("sudo systemctl start postgresql", unit=pg_leader) + juju.wait(jubilant.all_active, timeout=600)