-
Notifications
You must be signed in to change notification settings - Fork 12
fix(tests): make integration tests idempotent against live models [LNDENG-4308] #119
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5b462df
fdf068a
8c216f9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |
| import pytest | ||
|
|
||
| from charm import DEFAULT_SERVICES, LANDSCAPE_UBUNTU_INSTALLER_ATTACH, LEADER_SERVICES | ||
| from tests.integration.conftest import USE_HOST_JUJU_MODEL | ||
| from tests.integration.helpers import ( | ||
| get_session, | ||
| has_legacy_pg, | ||
|
|
@@ -203,21 +204,25 @@ def test_modern_database_relation(juju: jubilant.Juju, lbaas: jubilant.Juju): | |
| status = juju.status() | ||
| initial_relations = set(status.apps["landscape-server"].relations) | ||
|
|
||
| if "db" in initial_relations: | ||
| juju.remove_relation("landscape-server:db", "postgresql:db-admin", force=True) | ||
| juju.wait(lambda status: not has_legacy_pg(juju), timeout=120) | ||
|
|
||
| juju.integrate("landscape-server:database", "postgresql:database") | ||
| try: | ||
| if "db" in initial_relations: | ||
| juju.remove_relation( | ||
| "landscape-server:db", "postgresql:db-admin", force=True | ||
| ) | ||
| juju.wait(lambda status: not has_legacy_pg(juju), timeout=120) | ||
|
|
||
| elif "database" not in initial_relations: | ||
| juju.integrate("landscape-server:database", "postgresql:database") | ||
| juju.wait(lambda status: has_modern_pg(juju), timeout=120) | ||
| juju.integrate("landscape-server:database", "postgresql:database") | ||
| juju.wait(lambda status: has_modern_pg(juju), timeout=120) | ||
|
|
||
| relations = set(juju.status().apps["landscape-server"].relations) | ||
| elif "database" not in initial_relations: | ||
| juju.integrate("landscape-server:database", "postgresql:database") | ||
| juju.wait(lambda status: has_modern_pg(juju), timeout=120) | ||
|
|
||
| assert "database" in relations | ||
| relations = set(juju.status().apps["landscape-server"].relations) | ||
|
|
||
| restore_db_relations(juju, initial_relations) | ||
| assert "database" in relations | ||
| finally: | ||
| restore_db_relations(juju, initial_relations) | ||
|
|
||
|
|
||
| def test_legacy_db_relation(juju: jubilant.Juju, lbaas: jubilant.Juju): | ||
|
|
@@ -230,22 +235,24 @@ def test_legacy_db_relation(juju: jubilant.Juju, lbaas: jubilant.Juju): | |
| status = juju.status() | ||
| initial_relations = set(status.apps["landscape-server"].relations) | ||
|
|
||
| if "database" in initial_relations: | ||
| juju.remove_relation( | ||
| "landscape-server:database", "postgresql:database", force=True | ||
| ) | ||
| juju.wait(lambda status: not has_modern_pg(juju), timeout=120) | ||
| juju.integrate("landscape-server:db", "postgresql:db-admin") | ||
|
|
||
| elif "db" not in initial_relations: | ||
| juju.integrate("landscape-server:db", "postgresql:db-admin") | ||
| juju.wait(lambda status: has_legacy_pg(juju), timeout=120) | ||
| try: | ||
| if "database" in initial_relations: | ||
| juju.remove_relation( | ||
| "landscape-server:database", "postgresql:database", force=True | ||
| ) | ||
| juju.wait(lambda status: not has_modern_pg(juju), timeout=120) | ||
| juju.integrate("landscape-server:db", "postgresql:db-admin") | ||
| juju.wait(lambda status: has_legacy_pg(juju), timeout=120) | ||
|
|
||
| relations = set(juju.status().apps["landscape-server"].relations) | ||
| elif "db" not in initial_relations: | ||
| juju.integrate("landscape-server:db", "postgresql:db-admin") | ||
| juju.wait(lambda status: has_legacy_pg(juju), timeout=120) | ||
|
|
||
| assert "db" in relations | ||
| relations = set(juju.status().apps["landscape-server"].relations) | ||
|
Comment on lines +240 to +251
|
||
|
|
||
| restore_db_relations(juju, initial_relations) | ||
| assert "db" in relations | ||
| finally: | ||
| restore_db_relations(juju, initial_relations) | ||
|
|
||
|
|
||
| def test_pgbouncer_relation(juju: jubilant.Juju, bundle: None): | ||
|
|
@@ -301,12 +308,12 @@ def test_landscape_schema_migrated(juju: jubilant.Juju, bundle: None): | |
| host, port = stores["host"].split(":") | ||
| password, user, dbname = stores["password"], stores["user"], stores["main"] | ||
|
|
||
| result = juju.ssh( | ||
| "landscape-server/leader", | ||
| result = juju.exec( | ||
| f"PGPASSWORD={password} psql -h {host} -p {port} -U {user} -d {dbname}" | ||
| ' -tAc "SELECT COUNT(*) FROM information_schema.tables' | ||
| " WHERE table_schema = 'public' AND table_name = 'account';\"", | ||
| ).strip() | ||
| unit="landscape-server/leader", | ||
| ).stdout.strip() | ||
|
|
||
| assert result == "1", ( | ||
| "Expected the 'account' table to exist in the landscape database, " | ||
|
|
@@ -401,10 +408,10 @@ def test_ubuntu_installer_attach_toggle_no_maintenance( | |
| assert status.apps["landscape-server"].app_status.current == "active" | ||
|
|
||
| for name in status.apps["landscape-server"].units.keys(): | ||
| with pytest.raises(Exception): | ||
| juju.ssh( | ||
| name, | ||
| with pytest.raises(jubilant.CLIError): | ||
| juju.exec( | ||
| f"systemctl is-active {LANDSCAPE_UBUNTU_INSTALLER_ATTACH}.service", | ||
| unit=name, | ||
| ) | ||
|
|
||
| finally: | ||
|
|
@@ -763,6 +770,11 @@ def test_upgrade_action_updates_ppa(juju: jubilant.Juju, bundle: None): | |
| sources before upgrading, so switching PPAs (ex. upgrade from self-hosted-24.04 to | ||
| self-hosted-beta) works correctly. | ||
| """ | ||
| if USE_HOST_JUJU_MODEL: | ||
| pytest.skip( | ||
| "test_upgrade_action_updates_ppa mutates PPA state and is not safe " | ||
| "to run against a live model. Run in a dedicated upgrade pipeline instead." | ||
| ) | ||
| juju.wait(jubilant.all_active, timeout=300) | ||
|
|
||
| landscape_ppa = juju.config("landscape-server").get( | ||
|
|
@@ -799,3 +811,240 @@ def test_upgrade_action_updates_ppa(juju: jubilant.Juju, bundle: None): | |
| juju.run(unit_name, "upgrade") | ||
| juju.run(unit_name, "resume") | ||
| juju.wait(jubilant.all_active, timeout=300) | ||
|
|
||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Service placement and port tests | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| _ALL_UNIT_PORTS = [8080, 8070, 8090, 9080] # appserver, pingserver, message-server, api | ||
| _LEADER_ONLY_PORTS = [9100, 9099] # package-upload, package-search | ||
|
|
||
|
|
||
| def test_leader_services_not_on_non_leaders( | ||
| juju: jubilant.Juju, lbaas: jubilant.Juju | ||
| ): | ||
| """ | ||
| LEADER_SERVICES (package-search, package-upload) must NOT be active on | ||
| non-leader units. test_all_services_up only asserts the positive side; | ||
| this asserts the negative to catch misconfigured multi-unit deployments. | ||
| """ | ||
| status = juju.status() | ||
| units = status.apps["landscape-server"].units | ||
|
|
||
| if len(units) <= 1: | ||
| pytest.skip("Need more than 1 unit to verify non-leader service placement") | ||
|
|
||
| juju.wait(jubilant.all_active, timeout=300) | ||
|
|
||
| for name, unit_status in units.items(): | ||
| if not unit_status.leader: | ||
| for service in LEADER_SERVICES: | ||
| with pytest.raises(jubilant.CLIError): | ||
| juju.exec( | ||
| f"systemctl is-active {service}.service", | ||
| unit=name, | ||
| ) | ||
|
|
||
|
|
||
| def test_service_ports_listening(juju: jubilant.Juju, lbaas: jubilant.Juju): | ||
| """ | ||
| Verify that services are listening on the expected ports on the expected units. | ||
|
|
||
| All-unit services (appserver:8080, pingserver:8070, message-server:8090, | ||
| api:9080) must be open on every unit. Leader-only services | ||
| (package-upload:9100, package-search:9099) must be open only on the leader. | ||
|
|
||
| This guards against HAProxy charm config changes silently dropping backend | ||
| ports, which previously nearly brought down prod. | ||
| """ | ||
| juju.wait(jubilant.all_active, timeout=300) | ||
|
|
||
| status = juju.status() | ||
| units = status.apps["landscape-server"].units | ||
|
|
||
| for name, unit_status in units.items(): | ||
| for port in _ALL_UNIT_PORTS: | ||
| output = juju.exec(f"ss -tlnH 'sport = :{port}'", unit=name) | ||
| assert str(port) in output, ( | ||
| f"Expected port {port} to be listening on {name}, got: {output!r}" | ||
| ) | ||
|
|
||
| for port in _LEADER_ONLY_PORTS: | ||
| output = juju.exec(f"ss -tlnH 'sport = :{port}'", unit=name) | ||
| if unit_status.leader: | ||
| assert str(port) in output, ( | ||
| f"Expected leader-only port {port} on leader {name}, " | ||
| f"got: {output!r}" | ||
| ) | ||
| else: | ||
| assert str(port) not in output, ( | ||
| f"Leader-only port {port} must not be open on non-leader {name}" | ||
| ) | ||
|
|
||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Destructive tests — skipped on live models | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
|
|
||
| def test_leadership_change_service_follows( | ||
| juju: jubilant.Juju, lbaas: jubilant.Juju | ||
| ): | ||
| """ | ||
| When the current leader unit is removed, a new leader is elected and | ||
| LEADER_SERVICES must start on that new leader while DEFAULT_SERVICES remain | ||
| active across all surviving units. | ||
|
|
||
| Skipped on live models because it removes a unit temporarily. | ||
| """ | ||
| if USE_HOST_JUJU_MODEL: | ||
| pytest.skip( | ||
| "test_leadership_change_service_follows removes a unit; " | ||
| "skipped on live model." | ||
| ) | ||
|
|
||
| status = juju.status() | ||
| units = status.apps["landscape-server"].units | ||
|
|
||
| if len(units) <= 1: | ||
| pytest.skip("Need more than 1 unit to test leadership change") | ||
|
|
||
| juju.wait(jubilant.all_active, timeout=300) | ||
|
|
||
| leader_name = next(n for n, u in units.items() if u.leader) | ||
|
|
||
| try: | ||
| juju.cli("remove-unit", leader_name, "--no-wait") | ||
|
|
||
| def new_leader_active(status): | ||
| app_units = status.apps["landscape-server"].units | ||
| return any( | ||
| u.leader and u.workload_status.current == "active" | ||
| for u in app_units.values() | ||
| ) | ||
|
|
||
| juju.wait(new_leader_active, timeout=600) | ||
|
|
||
| new_status = juju.status() | ||
| new_leader = next( | ||
| n | ||
| for n, u in new_status.apps["landscape-server"].units.items() | ||
| if u.leader | ||
| ) | ||
|
|
||
| for service in LEADER_SERVICES: | ||
| wait_for_service(juju, new_leader, service) | ||
|
|
||
| for name in new_status.apps["landscape-server"].units: | ||
| for service in DEFAULT_SERVICES: | ||
| wait_for_service(juju, name, service) | ||
| finally: | ||
| juju.cli("add-unit", "landscape-server", "-n", "1") | ||
| juju.wait(jubilant.all_active, timeout=600) | ||
|
|
||
|
|
||
| def test_scale_out_and_in(juju: jubilant.Juju, lbaas: jubilant.Juju): | ||
| """ | ||
| Adding a unit brings it to active with all DEFAULT_SERVICES running and | ||
| LEADER_SERVICES absent. Removing it leaves the model healthy. | ||
|
|
||
| Skipped on live models. | ||
| """ | ||
| if USE_HOST_JUJU_MODEL: | ||
| pytest.skip("test_scale_out_and_in adds/removes units; skipped on live model.") | ||
|
|
||
| juju.wait(jubilant.all_active, timeout=300) | ||
|
|
||
| original_count = len(juju.status().apps["landscape-server"].units) | ||
|
|
||
| try: | ||
| juju.cli("add-unit", "landscape-server", "-n", "1") | ||
| juju.wait(jubilant.all_active, timeout=600) | ||
|
|
||
| status = juju.status() | ||
| units = status.apps["landscape-server"].units | ||
| assert len(units) == original_count + 1, ( | ||
| "Expected one extra unit after scale-out" | ||
| ) | ||
|
|
||
| new_unit = next( | ||
| n for n, u in units.items() if not u.leader and n not in { | ||
| nu for nu in units if units[nu].leader | ||
| } | ||
| ) | ||
|
|
||
| for service in DEFAULT_SERVICES: | ||
| wait_for_service(juju, new_unit, service) | ||
|
|
||
| for service in LEADER_SERVICES: | ||
| with pytest.raises(jubilant.CLIError): | ||
| juju.exec(f"systemctl is-active {service}.service", unit=new_unit) | ||
|
|
||
| juju.cli("remove-unit", new_unit, "--no-wait") | ||
| juju.wait(jubilant.all_active, timeout=600) | ||
|
|
||
| assert ( | ||
| len(juju.status().apps["landscape-server"].units) == original_count | ||
| ), "Unit count did not return to original after scale-in" | ||
| finally: | ||
| current = len(juju.status().apps["landscape-server"].units) | ||
| if current != original_count: | ||
| juju.cli( | ||
| "remove-unit", | ||
| sorted(juju.status().apps["landscape-server"].units)[-1], | ||
| "--no-wait", | ||
| ) | ||
| juju.wait(jubilant.all_active, timeout=600) | ||
|
|
||
|
|
||
| def test_postgresql_failover_services_recover( | ||
| juju: jubilant.Juju, lbaas: jubilant.Juju | ||
| ): | ||
| """ | ||
| After a PostgreSQL leader failover, landscape-server services must recover | ||
| automatically without manual intervention and continue serving HTTP. | ||
|
|
||
| Skipped on live models; intended for a dedicated failover pipeline. | ||
| """ | ||
| if USE_HOST_JUJU_MODEL: | ||
| pytest.skip( | ||
| "test_postgresql_failover_services_recover triggers a pg failover; " | ||
| "skipped on live model. Run in a dedicated failover pipeline." | ||
| ) | ||
|
|
||
| pg_status = juju.status() | ||
| pg_apps = [a for a in pg_status.apps if a.startswith("postgresql")] | ||
| if not pg_apps: | ||
| pytest.skip("No postgresql app found in model") | ||
|
|
||
| pg_app = pg_apps[0] | ||
| pg_units = pg_status.apps[pg_app].units | ||
| pg_leader = next( | ||
| (n for n, u in pg_units.items() if u.leader), None | ||
| ) | ||
| if pg_leader is None or len(pg_units) <= 1: | ||
| pytest.skip("Need a multi-unit postgresql with a leader to test failover") | ||
|
|
||
| juju.wait(jubilant.all_active, timeout=300) | ||
|
|
||
| haproxy_ip = _haproxy_ip(juju, lbaas) | ||
|
|
||
| try: | ||
| juju.exec("sudo systemctl stop postgresql", unit=pg_leader) | ||
|
|
||
| juju.wait(jubilant.all_active, timeout=600) | ||
|
|
||
| status = juju.status() | ||
| for name in status.apps["landscape-server"].units: | ||
| for service in DEFAULT_SERVICES: | ||
| wait_for_service(juju, name, service) | ||
|
|
||
| session = get_session() | ||
| response = session.get(f"https://{haproxy_ip}/ping", verify=False, timeout=30) | ||
| assert response.status_code == 200, ( | ||
| f"appserver /ping returned {response.status_code} after pg failover" | ||
| ) | ||
| finally: | ||
| juju.exec("sudo systemctl start postgresql", unit=pg_leader) | ||
| juju.wait(jubilant.all_active, timeout=600) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After switching from the legacy `db` relation to `database`, this branch calls `juju.integrate(...)` but doesn't wait for the modern relation to actually appear before reading `juju.status()` and asserting. This is inconsistent with the `elif` branch (which waits for `has_modern_pg`) and can lead to flaky failures if relation establishment is not immediate. Consider waiting for `has_modern_pg(juju)` (or `jubilant.all_active`) after the integrate in this branch as well.