Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions openstack_hypervisor/bridge_datapath.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@
import json
import logging
import subprocess
import time
import uuid
from collections.abc import Generator
from dataclasses import dataclass
from typing import TypedDict

DEFAULT_LAA_MAC_PREFIX = "0a:c5"
INTEGRATION_BRIDGE = "br-int"
SB_RENAME_SAFE_QUERY_TIMEOUT_S = 15
SB_RENAME_SAFE_QUERY_RETRIES = 3
SB_RENAME_SAFE_RETRY_DELAY_S = 1


@dataclass(frozen=True)
Expand Down Expand Up @@ -65,6 +69,18 @@ class OVSTimeoutError(OVSError, TimeoutError):
"""Raised when an OVS command times out."""


class OVNError(RuntimeError):
"""Common base class for OVN-related errors."""


class OVNCommandError(OVNError):
"""Raised when querying OVN state fails."""


class OVNTimeoutError(OVNError, TimeoutError):
"""Raised when an OVN command times out."""


def _normalize_ovs_vsctl_value(raw_value: str) -> str | None:
"""Normalize ovs-vsctl output values into plain strings."""
cleaned = raw_value.strip()
Expand Down Expand Up @@ -165,6 +181,11 @@ def with_timeout(self, timeout: int) -> Generator["OVSCli", None, None]:
finally:
self._timeout = original_timeout

@property
def timeout(self) -> int | None:
"""Return the configured default command timeout."""
return self._timeout

def commit(self, retry) -> str:
"""Execute all batched commands in a single ovs-vsctl transaction.

Expand Down Expand Up @@ -710,6 +731,152 @@ def get_dpdk_initialized(self) -> bool:
return False


class OVNSBCli:
"""Client for interacting with OVN Southbound via ovn-sbctl."""

def __init__(self, db: str, timeout: int | None = None):
self.db = db
self._timeout = timeout

def sbctl(
self,
*args: str,
timeout: int | None = None,
allow_stale_reads: bool = False,
) -> str:
"""Run ovn-sbctl with the provided arguments and return stdout."""
cmd = ["ovn-sbctl", f"--db={self.db}"]
timeout = timeout or self._timeout
if timeout is not None:
cmd.append(f"--timeout={timeout}")
if allow_stale_reads:
cmd.append("--no-leader-only")
cmd.extend(args)
logging.debug("Executing command: %s", " ".join(cmd))

try:
completed = subprocess.run( # nosec B603
cmd,
check=True,
capture_output=True,
text=True,
)
except FileNotFoundError as exc:
raise OVNCommandError("ovn-sbctl binary not found") from exc
except subprocess.CalledProcessError as exc: # pragma: no cover - defensive
stderr = (exc.stderr or "").strip()
stdout = (exc.stdout or "").strip()
details = stderr or stdout or f"Command failed with exit code {exc.returncode}"
if "Alarm clock" in details:
raise OVNTimeoutError(details) from exc
raise OVNCommandError(details) from exc

return completed.stdout

def rename_safety_query(self, *args: str, allow_stale_reads: bool = False) -> str:
"""Run an OVN SB query with the rename-safety freshness policy."""
if allow_stale_reads:
return self.sbctl(*args, allow_stale_reads=True)

attempts = SB_RENAME_SAFE_QUERY_RETRIES
if attempts < 1:
raise ValueError("SB_RENAME_SAFE_QUERY_RETRIES must be >= 1")

for attempt in range(1, attempts + 1):
try:
return self.sbctl(*args)
except OVNError:
if attempt == attempts:
raise
time.sleep(SB_RENAME_SAFE_RETRY_DELAY_S)


def _get_ovs_external_id(ovs_cli: OVSCli, key: str) -> str | None:
"""Read and normalize an OVS external_ids value."""
raw_value = ovs_cli.vsctl(
"get",
"open",
".",
f"external_ids:{key}",
skip_transaction=True,
)
return _normalize_ovs_vsctl_value(raw_value)


def _find_local_chassis_uuid(
ovn_sb_cli: OVNSBCli, system_id: str, allow_stale_sb_read: bool
) -> uuid.UUID | None:
"""Return the local chassis UUID when the node is registered in SB."""
chassis_output = ovn_sb_cli.rename_safety_query(
"--format=json",
"--columns=_uuid",
"find",
"Chassis",
f"name={system_id}",
allow_stale_reads=allow_stale_sb_read,
)
chassis_data = json.loads(chassis_output)
chassis_idx = chassis_data["headings"].index("_uuid")
if not chassis_data["data"]:
return None
return _parse_ovsdb_data(chassis_data["data"][0][chassis_idx])


def _chassis_has_port_bindings(
ovn_sb_cli: OVNSBCli, local_chassis: uuid.UUID, allow_stale_sb_read: bool
) -> bool:
"""Return whether any Port_Binding row is bound to the local chassis."""
bindings_output = ovn_sb_cli.rename_safety_query(
"--format=json",
"--columns=_uuid",
"find",
"Port_Binding",
f"chassis={local_chassis}",
allow_stale_reads=allow_stale_sb_read,
)
bindings_data = json.loads(bindings_output)
return bool(bindings_data["data"])


def is_bridge_rename_safe(ovs_cli: OVSCli, allow_stale_sb_read: bool = False) -> bool:
"""Return whether bridges can be destroyed and recreated safely."""
try:
ovn_remote = _get_ovs_external_id(ovs_cli, "ovn-remote")
if not ovn_remote:
return True

system_id = _get_ovs_external_id(ovs_cli, "system-id")
except OVSCommandError:
return False

if not system_id:
return True

ovn_sb_cli = OVNSBCli(
ovn_remote,
timeout=(
ovs_cli.timeout if ovs_cli.timeout is not None else SB_RENAME_SAFE_QUERY_TIMEOUT_S
),
)
try:
local_chassis = _find_local_chassis_uuid(
ovn_sb_cli, system_id, allow_stale_sb_read=allow_stale_sb_read
)
if local_chassis is None:
return True
if _chassis_has_port_bindings(
ovn_sb_cli,
local_chassis,
allow_stale_sb_read=allow_stale_sb_read,
):
return False
except (OVNError, KeyError, IndexError, json.JSONDecodeError, ValueError) as exc:
logging.warning("Unable to determine OVN bridge rename safety: %s", exc)
return False

return True


def resolve_bridge_mappings( # noqa: C901
external_bridge: str,
physnet_name: str,
Expand Down
30 changes: 30 additions & 0 deletions openstack_hypervisor/cli/hypervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
click_option_format,
)
from openstack_hypervisor.hooks import (
_configure_ovn_external_networking,
_dpdk_config_is_ready,
_get_configure_context,
_set_ovs_managed_by,
ovs_switch_socket,
ovs_switchd_ctl_socket,
)
Expand Down Expand Up @@ -178,3 +180,31 @@ def dpdk_ready() -> None:
else:
click.echo("DPDK configuration is NOT ready - external OVS restart may be required")
sys.exit(1)


@hypervisor.command("setup-bridge")
@click.option(
"--allow-stale-sb-read",
is_flag=True,
help="Allow OVN Southbound relay/follower reads when checking rename safety.",
)
def setup_bridge(allow_stale_sb_read: bool) -> None:
"""Apply OVN external bridge configuration."""
snap = Snap()
_set_ovs_managed_by(snap)
ovs_socket = ovs_switch_socket(snap)
switchd_ctl_socket = ovs_switchd_ctl_socket(snap)
ovs_cli = OVSCli(ovs_socket, switchd_ctl_socket)
context = _get_configure_context(snap)

if allow_stale_sb_read:
click.echo(
"Warning: allowing stale OVN Southbound reads may misjudge bridge rename safety."
)

_configure_ovn_external_networking(
snap,
ovs_cli,
context,
allow_stale_sb_read=allow_stale_sb_read,
)
67 changes: 66 additions & 1 deletion openstack_hypervisor/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
OVSCommandError,
OVSTimeoutError,
detect_current_mappings,
is_bridge_rename_safe,
resolve_bridge_mappings,
resolve_ovs_changes,
update_mappings_from_rename,
Expand Down Expand Up @@ -1000,6 +1001,39 @@ def _add_interface_to_bridge(ovs_cli: OVSCli, external_bridge: str, external_nic
)


def _bridge_can_be_adopted(
ovs_cli: OVSCli,
bridge: str,
external_nic: str | None,
existing_bridges: set[str] | None = None,
) -> bool:
"""Return whether an existing bridge can be adopted for a mapping."""
if existing_bridges is not None and bridge not in existing_bridges:
return True

try:
bridge_ifaces = ovs_cli.list_bridge_interfaces(bridge)
except OVSCommandError as exc:
logging.warning("Unable to inspect bridge %s for adoption: %s", bridge, exc)
return False

if external_nic:
return set(bridge_ifaces).issubset({external_nic})

return not bridge_ifaces


def _adopt_bridge_interface(ovs_cli: OVSCli, bridge: str, external_nic: str | None) -> None:
"""Mark an existing bridge interface as charm-managed for explicit adoption."""
if not external_nic:
return

if external_nic not in ovs_cli.list_bridge_interfaces(bridge):
return

ovs_cli.set("Port", external_nic, "external_ids", {"microstack-function": "ext-port"})


def _del_interface_from_bridge(ovs_cli: OVSCli, external_bridge: str, external_nic: str) -> None:
"""Remove an interface from a given bridge.

Expand Down Expand Up @@ -1701,7 +1735,7 @@ def get_machine_id() -> str:


def _configure_ovn_external_networking( # noqa: C901
snap: Snap, ovs_cli: OVSCli, context: dict
snap: Snap, ovs_cli: OVSCli, context: dict, allow_stale_sb_read: bool = False
) -> None:
"""Configure OVS/OVN external networking.

Expand Down Expand Up @@ -1742,6 +1776,37 @@ def _configure_ovn_external_networking( # noqa: C901
changes = resolve_ovs_changes(current_mappings, mappings)
logging.debug("OVS external networking changes: %s", changes)

if changes["renamed_bridges"] and is_bridge_rename_safe(
ovs_cli, allow_stale_sb_read=allow_stale_sb_read
):
existing_bridges = set(ovs_cli.list_bridges())
remaining_renames: list[tuple[str, str]] = []

for old_bridge, new_bridge in changes["renamed_bridges"]:
target_mapping = next((m for m in mappings if m.bridge == new_bridge), None)
if target_mapping and _bridge_can_be_adopted(
ovs_cli,
new_bridge,
target_mapping.interface,
existing_bridges,
):
if new_bridge in existing_bridges:
_adopt_bridge_interface(ovs_cli, new_bridge, target_mapping.interface)
if old_bridge not in changes["removed_bridges"]:
changes["removed_bridges"].append(old_bridge)
if new_bridge not in changes["added_bridges"]:
changes["added_bridges"].append(new_bridge)
continue

logging.info(
"Bridge %s cannot be adopted as %s, keeping current bridge name",
old_bridge,
new_bridge,
)
remaining_renames.append((old_bridge, new_bridge))

changes["renamed_bridges"] = remaining_renames

mappings = update_mappings_from_rename(mappings, changes["renamed_bridges"])

for bridge, change in changes["interface_changes"].items():
Expand Down
Loading
Loading