From 24143b3c014e92b4886f453e6f8ef1ea7406f3b2 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 3 Mar 2026 10:26:08 -0800 Subject: [PATCH 1/4] feat(gateway): add libkrun microVM gateway for hardware-isolated cluster bootstrap Introduce the navigator-gateway crate with safe Rust wrappers over the libkrun C FFI, enabling lightweight microVM execution via Apple Hypervisor.framework (macOS) or KVM (Linux). Key components: - navigator-gateway crate: KrunContextBuilder with RAII, ~15 FFI bindings, support for virtio-fs, console redirect, TSI control, and gvproxy virtio-net networking - nav gateway run: ad-hoc microVM execution (direct enter model) - nav gateway cluster: boots k3s inside a microVM with gvproxy networking, automatic rootfs extraction from Docker images, and port forwarding via gvproxy HTTP API - vm-init.sh: guest bootstrap script with DHCP networking, noop CNI plugin (kernel lacks bridge module), and tmpfs-backed k3s data dir - macOS codesigning with com.apple.security.hypervisor entitlement via auto-signing in scripts/bin/nav - navigator-cli build.rs for libkrun/libkrunfw rpath resolution --- Cargo.lock | 10 + crates/navigator-cli/Cargo.toml | 1 + crates/navigator-cli/build.rs | 41 ++ crates/navigator-cli/src/main.rs | 130 +++++ crates/navigator-cli/src/run.rs | 374 ++++++++++++ crates/navigator-gateway/Cargo.toml | 16 + crates/navigator-gateway/build.rs | 43 ++ crates/navigator-gateway/entitlements.plist | 8 + crates/navigator-gateway/src/context.rs | 600 ++++++++++++++++++++ crates/navigator-gateway/src/error.rs | 37 ++ crates/navigator-gateway/src/ffi.rs | 186 ++++++ crates/navigator-gateway/src/lib.rs | 68 +++ deploy/gateway/vm-init.sh | 215 +++++++ scripts/bin/nav | 7 + 14 files changed, 1736 insertions(+) create mode 100644 crates/navigator-cli/build.rs create mode 100644 crates/navigator-gateway/Cargo.toml create mode 100644 crates/navigator-gateway/build.rs create mode 100644 crates/navigator-gateway/entitlements.plist create mode 100644 
crates/navigator-gateway/src/context.rs create mode 100644 crates/navigator-gateway/src/error.rs create mode 100644 crates/navigator-gateway/src/ffi.rs create mode 100644 crates/navigator-gateway/src/lib.rs create mode 100755 deploy/gateway/vm-init.sh diff --git a/Cargo.lock b/Cargo.lock index 1e45d96f..0ababf5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2687,6 +2687,7 @@ dependencies = [ "miette", "navigator-bootstrap", "navigator-core", + "navigator-gateway", "navigator-policy", "navigator-providers", "navigator-tui", @@ -2726,6 +2727,15 @@ dependencies = [ "tonic-build", ] +[[package]] +name = "navigator-gateway" +version = "0.1.0" +dependencies = [ + "libc", + "thiserror 2.0.18", + "tracing", +] + [[package]] name = "navigator-policy" version = "0.1.0" diff --git a/crates/navigator-cli/Cargo.toml b/crates/navigator-cli/Cargo.toml index dd16246d..f1246e71 100644 --- a/crates/navigator-cli/Cargo.toml +++ b/crates/navigator-cli/Cargo.toml @@ -14,6 +14,7 @@ path = "src/main.rs" [dependencies] navigator-bootstrap = { path = "../navigator-bootstrap" } navigator-core = { path = "../navigator-core" } +navigator-gateway = { path = "../navigator-gateway" } navigator-policy = { path = "../navigator-policy" } navigator-providers = { path = "../navigator-providers" } navigator-tui = { path = "../navigator-tui" } diff --git a/crates/navigator-cli/build.rs b/crates/navigator-cli/build.rs new file mode 100644 index 00000000..182d9dc2 --- /dev/null +++ b/crates/navigator-cli/build.rs @@ -0,0 +1,41 @@ +use std::process::Command; + +fn main() { + // On macOS, embed rpath entries for libkrun and libkrunfw so the binary + // can find them at runtime without DYLD_LIBRARY_PATH. + // + // Background: navigator-gateway links against libkrun (a system cdylib + // installed via Homebrew). At runtime libkrun loads libkrunfw via dlopen. 
+ // The gateway crate's build.rs already emits link-search paths so the + // *linker* can find the dylibs, but cargo:rustc-link-arg from a library + // crate does NOT propagate to the final binary. We must emit the rpath + // flags from the binary crate's build.rs. + #[cfg(target_os = "macos")] + { + for formula in &["libkrun", "libkrunfw"] { + if let Some(lib_dir) = brew_lib_path(formula) { + println!("cargo:rustc-link-arg=-Wl,-rpath,{lib_dir}"); + } + } + } +} + +/// Ask Homebrew for the install prefix of a formula and return its `lib/` path. +#[cfg(target_os = "macos")] +fn brew_lib_path(formula: &str) -> Option { + let output = Command::new("brew") + .args(["--prefix", formula]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let prefix = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if prefix.is_empty() { + return None; + } + + Some(format!("{prefix}/lib")) +} diff --git a/crates/navigator-cli/src/main.rs b/crates/navigator-cli/src/main.rs index b2b17b9f..ffbcde19 100644 --- a/crates/navigator-cli/src/main.rs +++ b/crates/navigator-cli/src/main.rs @@ -119,6 +119,12 @@ enum Commands { command: ProviderCommands, }, + /// Hardware-isolated microVM gateway. + Gateway { + #[command(subcommand)] + command: GatewayCommands, + }, + /// Launch the Gator interactive TUI. Gator, @@ -468,6 +474,90 @@ enum ClusterAdminCommands { }, } +#[derive(Subcommand, Debug)] +enum GatewayCommands { + /// Run a command inside a hardware-isolated microVM. + /// + /// Boots a lightweight microVM using libkrun (Apple Hypervisor.framework on + /// macOS ARM64, KVM on Linux) and executes the specified command inside it. + /// The rootfs directory is mapped into the VM via virtio-fs. + /// + /// NOTE: This command takes over the current process. The process will exit + /// with the guest workload's exit code when the VM shuts down. + Run { + /// Path to the root filesystem directory (aarch64 Linux userspace). 
+ /// + /// Must contain the executable specified by EXEC_PATH. For a quick + /// start, download the Alpine minirootfs: + /// + /// curl -L https://dl-cdn.alpinelinux.org/alpine/v3.21/releases/aarch64/alpine-minirootfs-3.21.3-aarch64.tar.gz | tar xz -C ./rootfs + #[arg(long)] + rootfs: PathBuf, + + /// Number of virtual CPUs for the microVM. + #[arg(long, default_value_t = 1)] + vcpus: u8, + + /// Amount of RAM in MiB for the microVM. + #[arg(long, default_value_t = 128)] + mem: u32, + + /// Working directory inside the VM (relative to rootfs). + #[arg(long, default_value = "/")] + workdir: String, + + /// libkrun log level (0=Off, 1=Error, 2=Warn, 3=Info, 4=Debug, 5=Trace). + #[arg(long, default_value_t = 2)] + krun_log_level: u32, + + /// Path to the executable inside the rootfs. + exec_path: String, + + /// Arguments passed to the executable. + #[arg(trailing_var_arg = true)] + args: Vec<String>, + }, + + /// Boot the cluster container in a hardware-isolated microVM. + /// + /// Extracts a rootfs from the cluster Docker image, then boots k3s inside + /// a libkrun microVM with gvproxy port forwarding. k3s data lives on a + /// guest tmpfs and is lost when the VM stops. + /// The parent process stays alive to monitor the VM. + Cluster { + /// Cluster Docker image to extract rootfs from. + /// + /// Defaults to the same image used by `cluster admin deploy`. + #[arg(long)] + image: Option<String>, + + /// Host port for the navigator gateway (mapped to guest port 30051). + #[arg(long, default_value_t = 8080)] + port: u16, + + /// Host port for the k3s API server (mapped to guest port 6443). + /// If not set, the API server is not exposed to the host. + #[arg(long)] + kube_port: Option<u16>, + + /// Number of virtual CPUs for the microVM. + #[arg(long, default_value_t = 2)] + vcpus: u8, + + /// Amount of RAM in MiB for the microVM. + #[arg(long, default_value_t = 2048)] + mem: u32, + + /// Base directory for gateway cluster data (rootfs cache, k3s-state/, + /// console log, gvproxy sockets). Created if it doesn't exist. + /// Defaults to $XDG_DATA_HOME/navigator/gateway-cluster.
+ #[arg(long)] + state_dir: Option, + + /// libkrun log level (0=Off, 1=Error, 2=Warn, 3=Info, 4=Debug, 5=Trace). + #[arg(long, default_value_t = 2)] + krun_log_level: u32, + }, +} + #[derive(Subcommand, Debug)] enum SandboxCommands { /// Create a sandbox. @@ -1262,6 +1352,46 @@ async fn main() -> Result<()> { } } } + Some(Commands::Gateway { command }) => match command { + GatewayCommands::Run { + rootfs, + vcpus, + mem, + workdir, + krun_log_level, + exec_path, + args, + } => { + run::gateway_run( + &rootfs, + vcpus, + mem, + &workdir, + krun_log_level, + &exec_path, + &args, + )?; + } + GatewayCommands::Cluster { + image, + port, + kube_port, + vcpus, + mem, + state_dir, + krun_log_level, + } => { + run::gateway_cluster( + image.as_deref(), + port, + kube_port, + vcpus, + mem, + state_dir.as_deref(), + krun_log_level, + )?; + } + }, Some(Commands::Gator) => { let ctx = resolve_cluster(&cli.cluster)?; let tls = tls.with_cluster_name(&ctx.name); diff --git a/crates/navigator-cli/src/run.rs b/crates/navigator-cli/src/run.rs index 350a12ef..ae9b2620 100644 --- a/crates/navigator-cli/src/run.rs +++ b/crates/navigator-cli/src/run.rs @@ -3103,6 +3103,380 @@ fn print_log_line(log: &navigator_core::proto::SandboxLogLine) { } } +// --------------------------------------------------------------------------- +// Gateway (microVM) +// --------------------------------------------------------------------------- + +/// Boot a hardware-isolated microVM and execute a command inside it. +/// +/// This function **never returns on success** -- the libkrun VMM takes over +/// the process and calls `exit()` with the guest workload's exit code. 
+pub fn gateway_run( + rootfs: &Path, + vcpus: u8, + mem: u32, + workdir: &str, + log_level: u32, + exec_path: &str, + args: &[String], +) -> Result<()> { + use navigator_gateway::KrunContext; + + println!("Booting microVM..."); + println!(" rootfs: {}", rootfs.display()); + println!(" vcpus: {vcpus}"); + println!(" memory: {mem} MiB"); + println!(" workdir: {workdir}"); + println!(" exec: {exec_path}"); + if !args.is_empty() { + println!(" args: {}", args.join(" ")); + } + println!(); + + let arg_strs: Vec<&str> = args.iter().map(String::as_str).collect(); + + let ctx = KrunContext::builder() + .vcpus(vcpus) + .memory_mib(mem) + .rootfs(rootfs) + .workdir(workdir) + .exec(exec_path, &arg_strs) + .log_level(log_level) + .build() + .map_err(|e| miette::miette!("failed to configure microVM: {e}"))?; + + // This never returns on success -- the process exits with the guest's + // exit code. If it does return, it means something went wrong. + ctx.start_enter() + .map_err(|e| miette::miette!("failed to start microVM: {e}"))?; + + Ok(()) +} + +/// Boot the cluster container in a hardware-isolated microVM. +/// +/// This function: +/// 1. Extracts a rootfs from the cluster Docker image (if not already cached) +/// 2. Creates a persistent state directory for k3s data +/// 3. Boots k3s inside a libkrun microVM with port forwarding +/// 4. Waits for the child VM process to exit +#[allow(clippy::too_many_arguments)] +pub fn gateway_cluster( + image: Option<&str>, + gateway_port: u16, + kube_port: Option, + vcpus: u8, + mem: u32, + state_dir: Option<&Path>, + log_level: u32, +) -> Result<()> { + use navigator_gateway::KrunContext; + + // Resolve the cluster image. Priority: + // 1. Explicit --image flag + // 2. NAVIGATOR_CLUSTER_IMAGE env var + // 3. 
Default: navigator/cluster:dev (local build) + let resolved_image; + let image_ref = if let Some(img) = image { + img + } else if let Ok(img) = std::env::var("NAVIGATOR_CLUSTER_IMAGE") { + resolved_image = img; + &resolved_image + } else { + "navigator/cluster:dev" + }; + + // Determine directories. + let data_dir = if let Some(dir) = state_dir { + dir.to_path_buf() + } else { + let base = std::env::var("XDG_DATA_HOME") + .map(PathBuf::from) + .unwrap_or_else(|_| { + let home = std::env::var("HOME").unwrap_or_else(|_| ".".to_string()); + PathBuf::from(home).join(".local/share") + }); + base.join("navigator/gateway-cluster") + }; + let rootfs_dir = data_dir.join("rootfs"); + let k3s_state_dir = data_dir.join("k3s-state"); + let console_log = data_dir.join("console.log"); + + // Step 1: Extract rootfs from Docker image if not already present. + if !rootfs_dir.join("bin").is_dir() { + println!("Extracting rootfs from Docker image: {image_ref}"); + extract_rootfs_from_docker(image_ref, &rootfs_dir)?; + } else { + println!("Using cached rootfs: {}", rootfs_dir.display()); + } + + // Create k3s state directory. + std::fs::create_dir_all(&k3s_state_dir) + .map_err(|e| miette::miette!("failed to create k3s state dir: {e}"))?; + + // Step 2: Start gvproxy for virtio-net networking. + // + // gvproxy provides a user-mode network backend that gives the guest a real + // eth0 interface with DHCP (192.168.127.0/24). This replaces TSI, which + // breaks k3s by intercepting all localhost connections. + // + // Port forwarding is handled via gvproxy's HTTP API, not krun_set_port_map. + let gvproxy_sock = data_dir.join("gvproxy.sock"); + let gvproxy_api_sock = data_dir.join("gvproxy-api.sock"); + + // Clean up stale sockets from previous runs. 
+ let _ = std::fs::remove_file(&gvproxy_sock); + let _ = std::fs::remove_file(&gvproxy_api_sock); + + println!("Starting gvproxy network backend..."); + let mut gvproxy_child = Command::new("/opt/podman/bin/gvproxy") + .arg("-listen-vfkit") + .arg(format!("unixgram://{}", gvproxy_sock.display())) + .arg("-listen") + .arg(format!("unix://{}", gvproxy_api_sock.display())) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .spawn() + .map_err(|e| miette::miette!("failed to start gvproxy: {e}"))?; + + // Wait for gvproxy to create its sockets. + for _ in 0..20 { + if gvproxy_sock.exists() && gvproxy_api_sock.exists() { + break; + } + std::thread::sleep(Duration::from_millis(100)); + } + if !gvproxy_sock.exists() { + gvproxy_child.kill().ok(); + return Err(miette::miette!( + "gvproxy failed to create socket at {}", + gvproxy_sock.display() + )); + } + + // Step 3: Configure port forwarding via gvproxy HTTP API. + // + // gvproxy's DHCP server assigns guest IPs from 192.168.127.0/24. + // The gateway itself is .1, and the first (and only) DHCP client + // gets .3 (observed empirically — .2 may be reserved internally). + let guest_ip = "192.168.127.3"; + let forward_ports = |local_port: u16, remote_port: u16| -> Result<()> { + let body = format!( + r#"{{"local":":{}","remote":"{}:{}","protocol":"tcp"}}"#, + local_port, guest_ip, remote_port + ); + let output = Command::new("curl") + .args([ + "--unix-socket", + &gvproxy_api_sock.to_string_lossy(), + "-s", + "-X", + "POST", + "http://localhost/services/forwarder/expose", + "-d", + &body, + ]) + .output() + .map_err(|e| miette::miette!("failed to configure port forwarding: {e}"))?; + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(miette::miette!("gvproxy port forward failed: {stderr}")); + } + Ok(()) + }; + + // Forward the navigator gateway port. + forward_ports(gateway_port, 30051)?; + // Optionally forward the kube API port. 
+ if let Some(kp) = kube_port { + forward_ports(kp, 6443)?; + } + + // Step 4: Build the microVM configuration. + println!("Booting k3s cluster in microVM..."); + println!(" image: {image_ref}"); + println!(" rootfs: {}", rootfs_dir.display()); + println!(" vcpus: {vcpus}"); + println!(" memory: {mem} MiB"); + println!(" network: gvproxy (guest IP: {guest_ip})"); + println!(" gateway: localhost:{gateway_port} -> guest:30051"); + if let Some(kp) = kube_port { + println!(" kube API: localhost:{kp} -> guest:6443"); + } + println!(" state: {}", k3s_state_dir.display()); + println!(" console: {}", console_log.display()); + println!(); + + // vm-init.sh handles network setup then execs into k3s with these args. + // The libkrunfw kernel does not include netfilter/iptables modules, so we + // must disable kube-proxy and flannel (both require iptables). This is fine + // because the microVM only needs the API server + controllers for navigator. + // + // --data-dir /run/k3s puts k3s state on tmpfs. SQLite (used by kine) has + // file locking issues on virtio-fs, causing RBAC bootstrap timeouts. tmpfs + // provides proper POSIX locking and much faster I/O. State is lost on VM + // restart, but this is acceptable for development clusters. + let init_args: Vec<&str> = vec![ + "server", + "--data-dir=/run/k3s", + "--disable=traefik,servicelb,metrics-server", + "--disable-kube-proxy", + "--flannel-backend=none", + "--disable-network-policy", + "--tls-san=127.0.0.1", + "--tls-san=localhost", + "--tls-san=192.168.127.2", + "--tls-san=192.168.127.3", + ]; + + // Environment variables for the VM. 
+ let env_vars = vec![ + "HOME=/root".to_string(), + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(), + "TERM=xterm".to_string(), + ]; + + let builder = KrunContext::builder() + .vcpus(vcpus) + .memory_mib(mem) + .rootfs(&rootfs_dir) + .workdir("/") + .exec("/usr/local/bin/vm-init.sh", &init_args) + .env(Some(env_vars)) + .log_level(log_level) + .console_output(&console_log) + // Use gvproxy for networking — this disables TSI automatically. + // TSI cannot be used with k3s because it intercepts ALL guest inet + // connect() calls and proxies them to the host, which breaks the + // k3s API server's internal localhost connections. + .net_gvproxy(&gvproxy_sock); + + let ctx = builder + .build() + .map_err(|e| miette::miette!("failed to configure cluster microVM: {e}"))?; + + // Step 5: Fork and start the VM. + let child_pid = ctx + .fork_start() + .map_err(|e| miette::miette!("failed to start cluster microVM: {e}"))?; + + println!("microVM started (child PID: {child_pid})"); + println!("Waiting for k3s to become ready..."); + println!( + " (tail -f {} for VM console output)", + console_log.display() + ); + + // Step 6: Wait for the child process. + let status = wait_for_child(child_pid)?; + if status == 0 { + println!("microVM exited cleanly"); + } else { + eprintln!("microVM exited with status {status}"); + } + + // Clean up gvproxy. + gvproxy_child.kill().ok(); + gvproxy_child.wait().ok(); + let _ = std::fs::remove_file(&gvproxy_sock); + let _ = std::fs::remove_file(&gvproxy_api_sock); + + Ok(()) +} + +/// Extract a rootfs from a Docker image by creating a temporary container +/// and exporting its filesystem. +fn extract_rootfs_from_docker(image_ref: &str, rootfs_dir: &Path) -> Result<()> { + use std::process::Command; + + // Ensure the image exists locally. 
+ let pull_status = Command::new("docker") + .args(["image", "inspect", image_ref]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .map_err(|e| miette::miette!("failed to run docker: {e}"))?; + + if !pull_status.success() { + println!(" Pulling image {image_ref}..."); + let pull = Command::new("docker") + .args(["pull", image_ref]) + .status() + .map_err(|e| miette::miette!("docker pull failed: {e}"))?; + if !pull.success() { + return Err(miette::miette!("docker pull failed for {image_ref}")); + } + } + + // Create a temporary container (don't start it). + let container_name = format!("navigator-rootfs-extract-{}", std::process::id()); + let create = Command::new("docker") + .args(["create", "--name", &container_name, image_ref, "/bin/true"]) + .stdout(std::process::Stdio::null()) + .status() + .map_err(|e| miette::miette!("docker create failed: {e}"))?; + if !create.success() { + return Err(miette::miette!("docker create failed for {image_ref}")); + } + + // Export the container filesystem as a tar stream and extract it. + std::fs::create_dir_all(rootfs_dir) + .map_err(|e| miette::miette!("failed to create rootfs dir: {e}"))?; + + println!(" Exporting container filesystem..."); + let export = Command::new("docker") + .args(["export", &container_name]) + .stdout(std::process::Stdio::piped()) + .spawn() + .map_err(|e| miette::miette!("docker export failed: {e}"))?; + + let tar_status = Command::new("tar") + .args(["xf", "-", "-C"]) + .arg(rootfs_dir) + .stdin(export.stdout.unwrap()) + .status() + .map_err(|e| miette::miette!("tar extract failed: {e}"))?; + + if !tar_status.success() { + // Clean up on failure. + let _ = Command::new("docker") + .args(["rm", "-f", &container_name]) + .status(); + return Err(miette::miette!( + "failed to extract rootfs from Docker image" + )); + } + + // Clean up the temporary container. 
+ let _ = Command::new("docker") + .args(["rm", "-f", &container_name]) + .status(); + + // Copy the VM init script into the rootfs. This script sets up networking + // (dummy interface + default route) so k3s can find /proc/net/route, then + // execs into k3s. + let init_script = include_bytes!("../../../deploy/gateway/vm-init.sh"); + let init_path = rootfs_dir.join("usr/local/bin/vm-init.sh"); + std::fs::write(&init_path, init_script) + .map_err(|e| miette::miette!("failed to write vm-init.sh: {e}"))?; + + // Make executable (0o755). + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&init_path, std::fs::Permissions::from_mode(0o755)) + .map_err(|e| miette::miette!("failed to chmod vm-init.sh: {e}"))?; + } + + println!(" Rootfs extracted to {}", rootfs_dir.display()); + Ok(()) +} + +/// Wait for a child process to exit and return its exit status. +fn wait_for_child(pid: u32) -> Result { + navigator_gateway::wait_for_pid(pid).map_err(|e| miette::miette!("waitpid failed: {e}")) +} + #[cfg(test)] mod tests { use super::{inferred_provider_type, parse_credential_pairs, resolve_route_protocols}; diff --git a/crates/navigator-gateway/Cargo.toml b/crates/navigator-gateway/Cargo.toml new file mode 100644 index 00000000..668d3db3 --- /dev/null +++ b/crates/navigator-gateway/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "navigator-gateway" +description = "MicroVM gateway using libkrun for hardware-isolated process execution" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true + +[dependencies] +libc = "0.2" +thiserror = { workspace = true } +tracing = { workspace = true } + +[lints] +workspace = true diff --git a/crates/navigator-gateway/build.rs b/crates/navigator-gateway/build.rs new file mode 100644 index 00000000..b9f5d5b4 --- /dev/null +++ b/crates/navigator-gateway/build.rs @@ -0,0 +1,43 @@ +use std::process::Command; + +fn main() { + // Tell 
cargo to link against libkrun (the system dynamic library). + // On macOS this expects libkrun.dylib to be findable by the linker. + println!("cargo:rustc-link-lib=dylib=krun"); + + // Discover Homebrew install prefixes for libkrun and libkrunfw. + // We need both: + // - link-search: so the *linker* can find the .dylib at build time + // - link-arg -rpath: so the *dynamic linker* (dyld) can find them at runtime + // + // Without the rpath entries, the binary would require DYLD_LIBRARY_PATH + // to be set, which is fragile and easy to forget. + + for formula in &["libkrun", "libkrunfw"] { + if let Some(lib_dir) = brew_lib_path(formula) { + println!("cargo:rustc-link-search=native={lib_dir}"); + // NOTE: cargo:rustc-link-arg from a *library* crate does NOT + // propagate to the final binary. The rpath is set in + // navigator-cli's build.rs instead. + } + } +} + +/// Ask Homebrew for the install prefix of a formula and return its `lib/` path. +fn brew_lib_path(formula: &str) -> Option { + let output = Command::new("brew") + .args(["--prefix", formula]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let prefix = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if prefix.is_empty() { + return None; + } + + Some(format!("{prefix}/lib")) +} diff --git a/crates/navigator-gateway/entitlements.plist b/crates/navigator-gateway/entitlements.plist new file mode 100644 index 00000000..154f3308 --- /dev/null +++ b/crates/navigator-gateway/entitlements.plist @@ -0,0 +1,8 @@ + + + + + com.apple.security.hypervisor + + + diff --git a/crates/navigator-gateway/src/context.rs b/crates/navigator-gateway/src/context.rs new file mode 100644 index 00000000..a4e28ab3 --- /dev/null +++ b/crates/navigator-gateway/src/context.rs @@ -0,0 +1,600 @@ +//! Safe wrapper around the libkrun configuration context and VM lifecycle. +//! +//! The main entry point is [`KrunContextBuilder`], obtained via +//! [`KrunContext::builder()`]. 
After configuring the VM parameters, call +//! [`.build()`](KrunContextBuilder::build) to create a [`KrunContext`], then +//! [`.start_enter()`](KrunContext::start_enter) to boot the microVM in the +//! current process, or [`.fork_start()`](KrunContext::fork_start) to boot it +//! in a child process while the parent retains control. + +use std::ffi::{CString, c_char}; +use std::mem::ManuallyDrop; +use std::path::{Path, PathBuf}; +use std::ptr; + +use tracing::{debug, info}; + +use crate::error::GatewayError; +use crate::ffi; + +/// A configured libkrun microVM context, ready to be started. +/// +/// Owns the libkrun context ID and frees it on drop (unless consumed by +/// `start_enter`, which never returns). +pub struct KrunContext { + ctx_id: u32, +} + +impl KrunContext { + /// Create a new builder for configuring a microVM. + pub fn builder() -> KrunContextBuilder { + KrunContextBuilder::default() + } + + /// Boot the microVM and enter it (direct model). + /// + /// # Never returns + /// + /// On success, this function **never returns**. The libkrun VMM takes over + /// the process and calls `exit()` with the guest workload's exit code when + /// the VM shuts down. + /// + /// The only way this function returns is if libkrun encounters an error + /// before actually starting the VM. + pub fn start_enter(self) -> Result<(), GatewayError> { + // Prevent Drop from running -- krun_start_enter consumes the context + // and will exit() the process, so we must not call krun_free_ctx. + let this = ManuallyDrop::new(self); + + // Raise RLIMIT_NOFILE to the maximum allowed. virtio-fs (used by + // krun_set_root) needs a large number of file descriptors to map the + // host directory into the guest. The chroot_vm reference example does + // the same thing. 
+ raise_nofile_limit(); + + info!( + ctx_id = this.ctx_id, + "starting microVM (this process will be taken over)" + ); + + let ret = unsafe { ffi::krun_start_enter(this.ctx_id) }; + + // If we reach here, it means krun_start_enter failed. + Err(GatewayError::StartFailed(ret)) + } + + /// Boot the microVM in a forked child process. + /// + /// The parent process retains control and receives the child's PID. + /// The child process calls `krun_start_enter()`, which never returns on + /// success. + /// + /// # Returns + /// + /// - `Ok(child_pid)` in the parent process + /// - Never returns in the child (on success) + /// - `Err(...)` if the fork fails or the VM fails to start in the child + /// + /// # Safety + /// + /// After `fork()`, the child inherits all file descriptors and memory. + /// `krun_start_enter()` takes over the child process immediately, so + /// no Rust destructors run in the child. This is safe because + /// `krun_start_enter` calls `exit()` directly. + pub fn fork_start(self) -> Result { + raise_nofile_limit(); + + info!(ctx_id = self.ctx_id, "forking to start microVM in child"); + + // Prevent Drop from running in EITHER process. After fork(), the + // parent and child share kernel-level hypervisor resources (e.g., + // Hypervisor.framework VM handles on macOS). If the parent calls + // krun_free_ctx(), it destroys the VM the child is about to start. + // The child's krun_start_enter() consumes the context and calls + // exit() when the VM shuts down, so cleanup is not needed there + // either. + let this = ManuallyDrop::new(self); + + let pid = unsafe { libc::fork() }; + + if pid < 0 { + return Err(GatewayError::Fork(std::io::Error::last_os_error())); + } + + if pid == 0 { + // Child process: start the VM. This never returns on success. + let ret = unsafe { ffi::krun_start_enter(this.ctx_id) }; + // If we reach here, start failed. Exit with an error code so the + // parent can detect it. 
+ std::process::exit(ret.unsigned_abs().cast_signed()); + } + + // Parent process: return the child PID. + // We intentionally leak the KrunContext (ManuallyDrop) to avoid + // destroying the child's VM. The kernel cleans up when the child + // exits. + debug!(child_pid = pid, "microVM child process started"); + #[expect(clippy::cast_sign_loss, reason = "checked non-negative above")] + Ok(pid as u32) + } +} + +impl Drop for KrunContext { + fn drop(&mut self) { + debug!(ctx_id = self.ctx_id, "freeing libkrun context"); + unsafe { + ffi::krun_free_ctx(self.ctx_id); + } + } +} + +/// A port mapping entry for the microVM (`host_port` -> `guest_port`). +#[derive(Debug, Clone)] +pub struct PortMapping { + /// Port on the host. + pub host_port: u16, + /// Port inside the guest VM. + pub guest_port: u16, +} + +impl PortMapping { + /// Create a new port mapping. + pub fn new(host_port: u16, guest_port: u16) -> Self { + Self { + host_port, + guest_port, + } + } +} + +impl std::fmt::Display for PortMapping { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}", self.host_port, self.guest_port) + } +} + +/// A virtio-fs volume mount (`host_path` -> `guest_tag`). +#[derive(Debug, Clone)] +pub struct VirtiofsMount { + /// Tag to identify the filesystem in the guest (used in mount command). + pub tag: String, + /// Full path to the host directory to expose. + pub host_path: PathBuf, +} + +impl VirtiofsMount { + /// Create a new virtio-fs mount. + pub fn new(tag: impl Into, host_path: impl AsRef) -> Self { + Self { + tag: tag.into(), + host_path: host_path.as_ref().to_path_buf(), + } + } +} + +/// Builder for configuring and creating a [`KrunContext`]. 
+/// +/// # Example +/// +/// ```no_run +/// use navigator_gateway::KrunContext; +/// +/// let ctx = KrunContext::builder() +/// .vcpus(1) +/// .memory_mib(128) +/// .rootfs("./my-rootfs") +/// .workdir("/") +/// .exec("/bin/echo", &["Hello from microVM!"]) +/// .build() +/// .expect("failed to configure microVM"); +/// +/// // This never returns on success: +/// ctx.start_enter().expect("failed to start microVM"); +/// ``` +pub struct KrunContextBuilder { + vcpus: u8, + memory_mib: u32, + rootfs: Option<PathBuf>, + workdir: Option<String>, + exec_path: Option<String>, + args: Vec<String>, + env: Option<Vec<String>>, + log_level: u32, + port_map: Vec<PortMapping>, + virtiofs_mounts: Vec<VirtiofsMount>, + console_output: Option<PathBuf>, + disable_tsi: bool, + /// Path to a gvproxy Unix datagram socket for virtio-net networking. + /// When set, TSI is automatically disabled by libkrun and the guest + /// gets a real `eth0` interface with DHCP from gvproxy. + net_gvproxy: Option<PathBuf>, +} + +impl Default for KrunContextBuilder { + fn default() -> Self { + Self { + vcpus: 1, + memory_mib: 128, + rootfs: None, + workdir: None, + exec_path: None, + args: Vec::new(), + env: None, + log_level: ffi::KRUN_LOG_LEVEL_WARN, + port_map: Vec::new(), + virtiofs_mounts: Vec::new(), + console_output: None, + disable_tsi: false, + net_gvproxy: None, + } + } +} + +#[allow(clippy::return_self_not_must_use)] +impl KrunContextBuilder { + /// Set the number of virtual CPUs for the microVM. + pub fn vcpus(mut self, n: u8) -> Self { + self.vcpus = n; + self + } + + /// Set the amount of RAM in MiB for the microVM. + pub fn memory_mib(mut self, mib: u32) -> Self { + self.memory_mib = mib; + self + } + + /// Set the host directory to be used as the VM's root filesystem. + /// + /// This directory is mapped into the VM via virtio-fs. It must contain + /// an aarch64 Linux userspace (e.g., Alpine minirootfs).
+ pub fn rootfs(mut self, path: impl AsRef<Path>) -> Self { + self.rootfs = Some(path.as_ref().to_path_buf()); + self + } + + /// Set the working directory inside the VM (relative to rootfs). + pub fn workdir(mut self, path: impl Into<String>) -> Self { + self.workdir = Some(path.into()); + self + } + + /// Set the executable to run inside the VM and its arguments. + /// + /// The `exec_path` is relative to the rootfs. + pub fn exec(mut self, exec_path: impl Into<String>, args: &[impl AsRef<str>]) -> Self { + self.exec_path = Some(exec_path.into()); + self.args = args.iter().map(|a| a.as_ref().to_string()).collect(); + self + } + + /// Set environment variables for the guest process. + /// + /// Each entry should be in `KEY=VALUE` format. If not called (or called + /// with `None`), a minimal default environment is used. + pub fn env(mut self, vars: Option<Vec<String>>) -> Self { + self.env = vars; + self + } + + /// Set the libkrun log level (0=Off .. 5=Trace). Default is 2 (Warn). + pub fn log_level(mut self, level: u32) -> Self { + self.log_level = level; + self + } + + /// Add a TCP port mapping from host to guest. + /// + /// The port will be accessible on `host_port` from the host and will + /// forward to `guest_port` inside the VM. Note that libkrun also makes + /// the port accessible inside the guest via `host_port`. + pub fn port_map(mut self, host_port: u16, guest_port: u16) -> Self { + self.port_map.push(PortMapping::new(host_port, guest_port)); + self + } + + /// Add multiple TCP port mappings at once. + pub fn port_maps(mut self, mappings: impl IntoIterator<Item = PortMapping>) -> Self { + self.port_map.extend(mappings); + self + } + + /// Add a virtio-fs volume mount. + /// + /// The host directory at `host_path` will be available inside the guest + /// as a virtio-fs filesystem with the given `tag`. The guest must mount + /// it explicitly: `mount -t virtiofs <tag> <mountpoint>`.
+ pub fn virtiofs(mut self, tag: impl Into, host_path: impl AsRef) -> Self { + self.virtiofs_mounts + .push(VirtiofsMount::new(tag, host_path)); + self + } + + /// Redirect VM console output to a file instead of stdout. + /// + /// When set, the VM's console device ignores stdin and writes all output + /// to the specified file. Useful when the VM runs in a forked child and + /// the parent needs to capture output. + pub fn console_output(mut self, path: impl AsRef) -> Self { + self.console_output = Some(path.as_ref().to_path_buf()); + self + } + + /// Use gvproxy for virtio-net networking instead of TSI. + /// + /// When set, libkrun adds a virtio-net device backed by the gvproxy + /// Unix datagram socket at the given path. This **automatically disables + /// TSI**, so the guest gets a real `eth0` interface with DHCP from + /// gvproxy (default subnet: 192.168.127.0/24, gateway: 192.168.127.1, + /// guest IP: 192.168.127.2). + /// + /// Port forwarding is handled by gvproxy's HTTP API, not by + /// `krun_set_port_map` (which is TSI-only). + /// + /// Note: When using gvproxy, `port_map` entries are ignored by libkrun. + /// Use gvproxy's HTTP API endpoint to configure port forwarding instead. + pub fn net_gvproxy(mut self, socket_path: impl AsRef) -> Self { + self.net_gvproxy = Some(socket_path.as_ref().to_path_buf()); + self + } + + /// Disable TSI (Transparent Socket Impersonation) for the microVM. + /// + /// When enabled, libkrun's implicit vsock (which hijacks all guest + /// `connect()` syscalls on inet sockets) is replaced with a vsock + /// device that has no TSI features. This allows localhost traffic + /// inside the guest to flow through the real kernel loopback instead + /// of being tunnelled through vsock to the host. + /// + /// This is required for workloads like k3s that make many concurrent + /// internal localhost connections (API server, kine, controllers). 
+ /// TSI intercepts those connections and overwhelms the vsock muxer, + /// causing deadlocks. + /// + /// Port mapping via `krun_set_port_map` still works because it uses + /// the vsock device (with `tsi_features = 0`, only explicit port + /// mappings are forwarded). + pub fn disable_tsi(mut self, disable: bool) -> Self { + self.disable_tsi = disable; + self + } + + /// Build the [`KrunContext`] by calling the libkrun C API to create and + /// configure the microVM. + /// + /// # Errors + /// + /// Returns [`GatewayError`] if the rootfs doesn't exist, if any libkrun + /// API call fails, or if string arguments contain interior null bytes. + pub fn build(self) -> Result { + // Validate rootfs exists. + let rootfs = self + .rootfs + .as_ref() + .ok_or_else(|| GatewayError::RootfsNotFound(PathBuf::from("")))?; + + if !rootfs.is_dir() { + return Err(GatewayError::RootfsNotFound(rootfs.clone())); + } + + let exec_path = self.exec_path.as_deref().unwrap_or("/bin/sh"); + + // Set log level. + check_ret("krun_set_log_level", unsafe { + ffi::krun_set_log_level(self.log_level) + })?; + + // Create the libkrun context. + let ctx_id = unsafe { ffi::krun_create_ctx() }; + if ctx_id < 0 { + return Err(GatewayError::ContextCreation(ctx_id)); + } + #[expect(clippy::cast_sign_loss, reason = "checked non-negative above")] + let ctx_id = ctx_id as u32; + + debug!( + ctx_id, + vcpus = self.vcpus, + ram_mib = self.memory_mib, + "configuring microVM" + ); + + // From here on, if we hit an error we need to clean up the context. + // We'll create KrunContext now so Drop handles it. + let ctx = KrunContext { ctx_id }; + + // Configure VM resources. + check_ret("krun_set_vm_config", unsafe { + ffi::krun_set_vm_config(ctx_id, self.vcpus, self.memory_mib) + })?; + + // Set root filesystem. + let c_rootfs = path_to_cstring(rootfs)?; + check_ret("krun_set_root", unsafe { + ffi::krun_set_root(ctx_id, c_rootfs.as_ptr()) + })?; + + // Set working directory. 
+ if let Some(ref workdir) = self.workdir { + let c_workdir = CString::new(workdir.as_str())?; + check_ret("krun_set_workdir", unsafe { + ffi::krun_set_workdir(ctx_id, c_workdir.as_ptr()) + })?; + } + + // Configure gvproxy-based virtio-net networking. + // + // When a net device is added, libkrun automatically disables TSI. + // The guest gets a real eth0 with DHCP from gvproxy. This MUST be + // called before krun_set_port_map (per libkrun.h). + if let Some(ref gvproxy_path) = self.net_gvproxy { + let c_path = path_to_cstring(gvproxy_path)?; + // Default MAC address for the guest. + let mac: [u8; 6] = [0x02, 0x42, 0xAC, 0x11, 0x00, 0x02]; + + debug!( + path = %gvproxy_path.display(), + "adding gvproxy virtio-net device (disables TSI)" + ); + check_ret("krun_add_net_unixgram", unsafe { + ffi::krun_add_net_unixgram( + ctx_id, + c_path.as_ptr(), + -1, // no fd, use path + mac.as_ptr(), + ffi::COMPAT_NET_FEATURES, + ffi::NET_FLAG_VFKIT, + ) + })?; + } + + // Configure port mapping (TSI-only, skipped when gvproxy is used). + if !self.port_map.is_empty() { + let map_strings: Vec = self.port_map.iter().map(ToString::to_string).collect(); + let c_map_strings = to_cstring_vec(&map_strings)?; + let c_port_map = to_ptr_array(&c_map_strings); + + debug!(?map_strings, "setting port map"); + check_ret("krun_set_port_map", unsafe { + ffi::krun_set_port_map(ctx_id, c_port_map.as_ptr()) + })?; + } + + // Configure virtio-fs volume mounts. + for mount in &self.virtiofs_mounts { + let c_tag = CString::new(mount.tag.as_str())?; + let c_path = path_to_cstring(&mount.host_path)?; + + debug!(tag = mount.tag, path = %mount.host_path.display(), "adding virtiofs mount"); + check_ret("krun_add_virtiofs", unsafe { + ffi::krun_add_virtiofs(ctx_id, c_tag.as_ptr(), c_path.as_ptr()) + })?; + } + + // Configure console output redirection. 
+ if let Some(ref console_path) = self.console_output { + let c_console = path_to_cstring(console_path)?; + check_ret("krun_set_console_output", unsafe { + ffi::krun_set_console_output(ctx_id, c_console.as_ptr()) + })?; + } + + // Disable TSI (Transparent Socket Impersonation) if requested. + // + // TSI intercepts ALL guest connect() syscalls on inet sockets and + // tunnels them through vsock to the host. This breaks workloads + // that rely on internal localhost connections (e.g., k3s). + // + // We replace the implicit vsock with a bare vsock (tsi_features=0) + // so that only explicit port mappings are forwarded while localhost + // traffic stays inside the guest kernel. + if self.disable_tsi { + debug!(ctx_id, "disabling TSI (transparent socket impersonation)"); + check_ret("krun_disable_implicit_vsock", unsafe { + ffi::krun_disable_implicit_vsock(ctx_id) + })?; + check_ret("krun_add_vsock", unsafe { ffi::krun_add_vsock(ctx_id, 0) })?; + } + + // Set executable, arguments, and environment. + let c_exec = CString::new(exec_path)?; + let c_args = to_cstring_vec(&self.args)?; + let c_arg_ptrs = to_ptr_array(&c_args); + + // If no explicit env was provided, use a minimal default environment. + // We must NOT pass NULL to krun_set_exec's envp parameter because + // libkrun would then serialize the entire host environment into the + // kernel command line, which easily overflows its 4096-byte limit + // on developer machines with large PATH/etc. + let default_env = vec![ + "HOME=/root".to_string(), + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(), + "TERM=xterm".to_string(), + ]; + let env_ref = self.env.as_ref().unwrap_or(&default_env); + let c_env_strings = to_cstring_vec(env_ref)?; + let c_envp = to_ptr_array(&c_env_strings); + + check_ret("krun_set_exec", unsafe { + ffi::krun_set_exec( + ctx_id, + c_exec.as_ptr(), + c_arg_ptrs.as_ptr(), + c_envp.as_ptr(), + ) + })?; + + // Keep CStrings alive until after the FFI call. 
+        drop(c_env_strings);
+
+        info!(
+            ctx_id,
+            rootfs = %rootfs.display(),
+            exec = exec_path,
+            ports = ?self.port_map.iter().map(ToString::to_string).collect::<Vec<_>>(),
+            virtiofs = self.virtiofs_mounts.len(),
+            "microVM configured successfully"
+        );
+
+        Ok(ctx)
+    }
+}
+
+/// Check a libkrun return code; zero means success, negative means error.
+///
+/// `call` names the libkrun function that produced `ret` and is carried in
+/// the resulting [`GatewayError::Configuration`] for diagnostics.
+fn check_ret(call: &'static str, ret: i32) -> Result<(), GatewayError> {
+    if ret < 0 {
+        Err(GatewayError::Configuration { call, code: ret })
+    } else {
+        Ok(())
+    }
+}
+
+/// Convert a `Path` to a `CString`.
+///
+/// The path's raw bytes are passed through unchanged, so host paths that are
+/// not valid UTF-8 are accepted rather than rejected.
+///
+/// # Errors
+///
+/// Returns [`GatewayError::NulError`] if the path contains an interior NUL byte.
+fn path_to_cstring(path: &Path) -> Result<CString, GatewayError> {
+    // libkrun only targets Unix hosts (KVM / Hypervisor.framework), where a
+    // path is an arbitrary byte string.
+    use std::os::unix::ffi::OsStrExt;
+
+    Ok(CString::new(path.as_os_str().as_bytes())?)
+}
+
+/// Convert a slice of strings to a `Vec<CString>`.
+///
+/// # Errors
+///
+/// Returns [`GatewayError::NulError`] for the first string that contains an
+/// interior NUL byte.
+fn to_cstring_vec(strings: &[String]) -> Result<Vec<CString>, GatewayError> {
+    strings
+        .iter()
+        .map(|s| Ok(CString::new(s.as_str())?))
+        .collect()
+}
+
+/// Create a null-terminated array of C string pointers suitable for passing
+/// to libkrun functions that expect `const char *const argv[]`.
+///
+/// The returned `Vec` contains pointers into the `CString` values (which must
+/// outlive the returned `Vec`) followed by a null terminator.
+fn to_ptr_array(strings: &[CString]) -> Vec<*const c_char> {
+    let mut ptrs: Vec<*const c_char> = strings.iter().map(|s| s.as_ptr()).collect();
+    ptrs.push(ptr::null());
+    ptrs
+}
+
+/// Raise `RLIMIT_NOFILE` to the maximum allowed value.
+///
+/// virtio-fs (used by `krun_set_root` to map the rootfs directory) requires a
+/// large number of file descriptors. Without this, `krun_start_enter` can fail
+/// with internal errors. This mirrors what the upstream `chroot_vm` example does.
+fn raise_nofile_limit() { + use libc::{RLIMIT_NOFILE, getrlimit, rlimit, setrlimit}; + + let mut rlim = rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + if unsafe { getrlimit(RLIMIT_NOFILE, &raw mut rlim) } == 0 { + rlim.rlim_cur = rlim.rlim_max; + if unsafe { setrlimit(RLIMIT_NOFILE, &raw const rlim) } != 0 { + debug!("failed to raise RLIMIT_NOFILE (non-fatal)"); + } else { + debug!(limit = rlim.rlim_cur, "raised RLIMIT_NOFILE"); + } + } +} diff --git a/crates/navigator-gateway/src/error.rs b/crates/navigator-gateway/src/error.rs new file mode 100644 index 00000000..863b28c2 --- /dev/null +++ b/crates/navigator-gateway/src/error.rs @@ -0,0 +1,37 @@ +//! Error types for the gateway microVM subsystem. + +use std::path::PathBuf; + +/// Errors that can occur when configuring or starting a microVM. +#[derive(Debug, thiserror::Error)] +pub enum GatewayError { + /// libkrun failed to create a configuration context. + #[error("failed to create libkrun context (error code: {0})")] + ContextCreation(i32), + + /// The VM configuration call failed. + #[error("failed to configure VM ({call}): libkrun error code {code}")] + Configuration { + /// Which libkrun API call failed. + call: &'static str, + /// The negative error code returned by libkrun. + code: i32, + }, + + /// The rootfs path provided does not exist or is not a directory. + #[error("rootfs path does not exist or is not a directory: {0}")] + RootfsNotFound(PathBuf), + + /// `krun_start_enter` returned an error instead of booting the VM. + #[error("failed to start microVM (libkrun error code: {0})")] + StartFailed(i32), + + /// `fork()` failed when trying to start the VM in a child process. + #[error("fork failed: {0}")] + Fork(std::io::Error), + + /// A string argument contained an interior null byte and could not be + /// converted to a C string. 
+ #[error("argument contains interior null byte: {0}")] + NulError(#[from] std::ffi::NulError), +} diff --git a/crates/navigator-gateway/src/ffi.rs b/crates/navigator-gateway/src/ffi.rs new file mode 100644 index 00000000..1e7f79a3 --- /dev/null +++ b/crates/navigator-gateway/src/ffi.rs @@ -0,0 +1,186 @@ +//! Raw FFI bindings for the libkrun C API. +//! +//! These are manual declarations for the subset of `libkrun.h` functions +//! needed by the gateway. libkrun is a dynamic library providing +//! virtualization-based process isolation via KVM (Linux) or +//! Hypervisor.framework (macOS ARM64). +//! +//! See: + +use std::ffi::c_char; + +// Log level constants matching libkrun.h. +// Not all are used yet but they form the public API surface for log configuration. +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_OFF: u32 = 0; +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_ERROR: u32 = 1; +pub const KRUN_LOG_LEVEL_WARN: u32 = 2; +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_INFO: u32 = 3; +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_DEBUG: u32 = 4; +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_TRACE: u32 = 5; + +// Network backend flags from libkrun.h. +/// Send the VFKIT magic after establishing the connection, as required by +/// gvproxy in vfkit mode. +pub const NET_FLAG_VFKIT: u32 = 1 << 0; + +/// Compatible virtio-net features enabled by `krun_set_passt_fd` and +/// `krun_set_gvproxy_path`. We use the same set for `krun_add_net_unixgram`. +pub const COMPAT_NET_FEATURES: u32 = (1 << 0) // CSUM + | (1 << 1) // GUEST_CSUM + | (1 << 7) // GUEST_TSO4 + | (1 << 10) // GUEST_UFO + | (1 << 11) // HOST_TSO4 + | (1 << 14); // HOST_UFO + +// Well-known exit codes from the libkrun init process. +// +// 125 - init cannot set up the environment inside the microVM. +// 126 - init can find the executable but cannot execute it. +// 127 - init cannot find the executable to be run. + +unsafe extern "C" { + /// Sets the log level for the library. 
+ /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_set_log_level(level: u32) -> i32; + + /// Creates a configuration context. + /// + /// Returns the context ID (>= 0) on success or a negative error number on failure. + pub fn krun_create_ctx() -> i32; + + /// Frees an existing configuration context. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_free_ctx(ctx_id: u32) -> i32; + + /// Sets the basic configuration parameters for the microVM. + /// + /// - `num_vcpus`: the number of vCPUs. + /// - `ram_mib`: the amount of RAM in MiB. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_set_vm_config(ctx_id: u32, num_vcpus: u8, ram_mib: u32) -> i32; + + /// Sets the path to be used as root for the microVM. + /// + /// The path is mapped into the VM via virtio-fs. The libkrun init process + /// uses this as the root filesystem. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_set_root(ctx_id: u32, root_path: *const c_char) -> i32; + + /// Sets the working directory for the executable inside the microVM. + /// + /// The path is relative to the root configured with `krun_set_root`. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_set_workdir(ctx_id: u32, workdir_path: *const c_char) -> i32; + + /// Sets the executable path, arguments, and environment variables. + /// + /// - `exec_path`: path relative to the root configured with `krun_set_root`. + /// - `argv`: null-terminated array of argument string pointers. + /// - `envp`: null-terminated array of environment variable string pointers + /// (format: `KEY=VALUE`). If null, inherits the current environment. + /// + /// Returns zero on success or a negative error number on failure. 
+ pub fn krun_set_exec( + ctx_id: u32, + exec_path: *const c_char, + argv: *const *const c_char, + envp: *const *const c_char, + ) -> i32; + + /// Configures a map of host to guest TCP ports for the microVM. + /// + /// - `port_map`: null-terminated array of string pointers with format + /// `"host_port:guest_port"`. + /// + /// Passing NULL instructs libkrun to expose all listening ports in the + /// guest to the host. Passing an empty (null-terminated) array means no + /// ports are exposed. + /// + /// Exposed ports become accessible by their `host_port` in the guest too, + /// so for a map `"8080:80"`, guest-side applications must also use port 8080. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_set_port_map(ctx_id: u32, port_map: *const *const c_char) -> i32; + + /// Adds an independent virtio-fs device pointing to a host directory. + /// + /// - `c_tag`: tag to identify the filesystem in the guest (used for + /// mounting: `mount -t virtiofs `). + /// - `c_path`: full path to the host directory to be exposed. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_add_virtiofs(ctx_id: u32, c_tag: *const c_char, c_path: *const c_char) -> i32; + + /// Configures the console device to ignore stdin and write output to a file. + /// + /// - `c_filepath`: path to the file for console output. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_set_console_output(ctx_id: u32, c_filepath: *const c_char) -> i32; + + /// Disable the implicit vsock device (which carries TSI by default). + /// + /// Must be called before `krun_add_vsock` to add a vsock with custom + /// TSI feature flags. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_disable_implicit_vsock(ctx_id: u32) -> i32; + + /// Add a vsock device with specified TSI features. 
+ /// + /// - `tsi_features`: bitmask of `KRUN_TSI_HIJACK_INET` (1) and + /// `KRUN_TSI_HIJACK_UNIX` (2). Use 0 for no TSI hijacking. + /// + /// Only one vsock device is supported. Call after + /// `krun_disable_implicit_vsock`. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_add_vsock(ctx_id: u32, tsi_features: u32) -> i32; + + /// Adds an independent virtio-net device with a unixgram-based backend, + /// such as gvproxy or vmnet-helper. + /// + /// Adding ANY `krun_add_net_*` device **automatically disables TSI**. The + /// guest gets a real `ethN` interface instead of TSI socket interception. + /// + /// - `c_path`: path to the Unix datagram socket for the network proxy + /// (e.g., gvproxy's `--listen-vfkit` socket). Must be NULL if `fd != -1`. + /// - `fd`: open file descriptor for the socket. Must be -1 if `c_path` + /// is not NULL. + /// - `c_mac`: 6-byte MAC address array. + /// - `features`: virtio-net feature bitmask (use `COMPAT_NET_FEATURES`). + /// - `flags`: generic flags. Use `NET_FLAG_VFKIT` for gvproxy in vfkit + /// mode when using `c_path`. + /// + /// Returns zero on success or a negative error number on failure. + pub fn krun_add_net_unixgram( + ctx_id: u32, + c_path: *const c_char, + fd: i32, + c_mac: *const u8, + features: u32, + flags: u32, + ) -> i32; + + /// Starts and enters the microVM with the configured parameters. + /// + /// The VMM takes over stdin/stdout to manage them on behalf of the process + /// running inside the isolated environment. + /// + /// **This function never returns on success.** The VMM calls `exit()` with + /// the workload's exit code once the microVM shuts down. + /// + /// Returns a negative error number only if an error happens before the + /// microVM is started (e.g., `-EINVAL` for invalid configuration). 
+ pub fn krun_start_enter(ctx_id: u32) -> i32; +} diff --git a/crates/navigator-gateway/src/lib.rs b/crates/navigator-gateway/src/lib.rs new file mode 100644 index 00000000..3fa88b11 --- /dev/null +++ b/crates/navigator-gateway/src/lib.rs @@ -0,0 +1,68 @@ +#![allow(unsafe_code)] +//! Hardware-isolated microVM gateway using libkrun. +//! +//! This crate provides a safe Rust interface over the [libkrun](https://github.com/containers/libkrun) +//! C library for running processes inside lightweight microVMs. On macOS ARM64, +//! libkrun uses Apple's Hypervisor.framework (HVF); on Linux it uses KVM. +//! +//! # Architecture +//! +//! libkrun bundles a VMM (Virtual Machine Monitor) in a dynamic library with a +//! simple C API. Combined with libkrunfw (which bundles a Linux kernel), it can +//! boot a microVM in milliseconds with minimal resource overhead. +//! +//! The guest's root filesystem is mapped from a host directory via virtio-fs. +//! Networking uses TSI (Transparent Socket Impersonation) by default, allowing +//! the guest to transparently access host network endpoints without explicit +//! network configuration. +//! +//! # Usage +//! +//! ```no_run +//! use navigator_gateway::KrunContext; +//! +//! let ctx = KrunContext::builder() +//! .vcpus(1) +//! .memory_mib(128) +//! .rootfs("./my-alpine-rootfs") +//! .workdir("/") +//! .exec("/bin/echo", &["Hello from a hardware-isolated microVM!"]) +//! .build() +//! .expect("failed to configure microVM"); +//! +//! // Boots the VM and never returns on success. +//! // The process exits with the guest workload's exit code. +//! ctx.start_enter().expect("failed to start microVM"); +//! ``` +//! +//! # Prerequisites +//! +//! - **macOS ARM64**: Install via Homebrew: `brew tap slp/krun && brew install libkrun` +//! - **Linux**: Build and install libkrunfw + libkrun from source +//! - A root filesystem directory containing an aarch64 Linux userspace +//! 
(e.g., [Alpine minirootfs](https://alpinelinux.org/downloads/)) + +mod context; +mod error; +mod ffi; + +pub use context::{KrunContext, KrunContextBuilder, PortMapping, VirtiofsMount}; +pub use error::GatewayError; + +/// Wait for a child process to exit and return its exit status. +/// +/// This is a thin wrapper over `waitpid(2)` for use after [`KrunContext::fork_start`]. +pub fn wait_for_pid(pid: u32) -> Result { + let mut status: libc::c_int = 0; + let ret = unsafe { libc::waitpid(pid.cast_signed(), &raw mut status, 0) }; + if ret < 0 { + return Err(GatewayError::Fork(std::io::Error::last_os_error())); + } + if libc::WIFEXITED(status) { + Ok(libc::WEXITSTATUS(status)) + } else if libc::WIFSIGNALED(status) { + Ok(128 + libc::WTERMSIG(status)) + } else { + Ok(status) + } +} diff --git a/deploy/gateway/vm-init.sh b/deploy/gateway/vm-init.sh new file mode 100755 index 00000000..74d5404d --- /dev/null +++ b/deploy/gateway/vm-init.sh @@ -0,0 +1,215 @@ +#!/bin/sh +# vm-init.sh — Bootstrap script for running k3s inside a libkrun microVM. +# +# When using gvproxy networking (virtio-net), the guest gets a real eth0 +# interface. This script configures it via DHCP from gvproxy (which provides +# 192.168.127.0/24 with gateway 192.168.127.1). +# +# The libkrunfw kernel does not include netfilter/iptables, so kube-proxy +# and flannel must be disabled. This is handled by the k3s flags passed +# from the CLI. +# +# This script is injected into the rootfs at extraction time and used as the +# microVM entrypoint instead of running k3s directly. + +set -e + +# The k3s (rancher) base image doesn't symlink all BusyBox applets. +# Ensure essential commands are available. +BB=/bin/busybox +for cmd in mount mountpoint mkdir cat ip udhcpc; do + if ! command -v $cmd >/dev/null 2>&1; then + ln -sf $BB /bin/$cmd 2>/dev/null || true + fi +done +# Also ensure sbin commands are available for ip/route. +for cmd in ip route; do + if ! 
command -v $cmd >/dev/null 2>&1; then
+        ln -sf $BB /sbin/$cmd 2>/dev/null || true
+    fi
+done
+
+echo "[vm-init] Setting up network..."
+
+# The libkrunfw kernel auto-mounts proc, sysfs, devtmpfs, and cgroup2.
+# We only need to mount /run (tmpfs for PID files and sockets) and /tmp.
+if ! mountpoint -q /run 2>/dev/null; then
+    mkdir -p /run
+    mount -t tmpfs tmpfs /run
+fi
+if ! mountpoint -q /tmp 2>/dev/null; then
+    mkdir -p /tmp
+    mount -t tmpfs tmpfs /tmp
+fi
+
+# Enable the loopback interface.
+ip link set lo up 2>/dev/null || true
+
+# Configure eth0 via DHCP from gvproxy.
+# gvproxy provides DHCP on 192.168.127.0/24:
+#   gateway: 192.168.127.1
+#   guest:   192.168.127.3 (the address the host-side port forwards target)
+#   DNS:     192.168.127.1
+if ip link show eth0 >/dev/null 2>&1; then
+    echo "[vm-init] Configuring eth0 via DHCP..."
+    ip link set eth0 up
+
+    # BusyBox udhcpc needs a script to apply the lease. Create a minimal one.
+    mkdir -p /usr/share/udhcpc
+    cat > /usr/share/udhcpc/default.script << 'DHCP_SCRIPT'
+#!/bin/sh
+case "$1" in
+    bound|renew)
+        ip addr add "$ip/$mask" dev "$interface" 2>/dev/null || true
+        if [ -n "$router" ]; then
+            ip route add default via "$router" dev "$interface" 2>/dev/null || true
+        fi
+        if [ -n "$dns" ]; then
+            : > /etc/resolv.conf
+            for ns in $dns; do
+                echo "nameserver $ns" >> /etc/resolv.conf
+            done
+        fi
+        ;;
+esac
+DHCP_SCRIPT
+    chmod +x /usr/share/udhcpc/default.script
+
+    # Run DHCP (foreground, quit after lease obtained).
+    # The static fallback uses the same address the host forwards ports to
+    # (192.168.127.3), so the gateway and kube API stay reachable without a lease.
+    udhcpc -i eth0 -n -q -f -t 5 2>/dev/null || {
+        echo "[vm-init] DHCP failed, using static config"
+        ip addr add 192.168.127.3/24 dev eth0 2>/dev/null || true
+        ip route add default via 192.168.127.1 dev eth0 2>/dev/null || true
+        echo "nameserver 192.168.127.1" > /etc/resolv.conf
+    }
+
+    GUEST_IP=$(ip -4 addr show eth0 2>/dev/null | sed -n 's/.*inet \([0-9.]*\).*/\1/p' | head -1)
+    GUEST_IP="${GUEST_IP:-192.168.127.3}"
+    echo "[vm-init] Network configured: eth0 = ${GUEST_IP}"
+else
+    # Fallback: no eth0 (TSI-only mode).
Add dummy routing on lo so k3s + # finds a default route in /proc/net/route. + echo "[vm-init] No eth0 found, using lo-only fallback..." + ip addr add 10.0.2.100/32 dev lo 2>/dev/null || true + ip route add 10.0.2.1/32 dev lo 2>/dev/null || true + ip route add default via 10.0.2.1 dev lo 2>/dev/null || true + echo "nameserver 10.0.2.1" > /etc/resolv.conf + GUEST_IP="10.0.2.100" + echo "[vm-init] Network configured (fallback): lo = ${GUEST_IP}" +fi + +# Set up k3s-specific DNS config. +mkdir -p /etc/rancher/k3s +cp -f /etc/resolv.conf /etc/rancher/k3s/resolv.conf + +# k3s uses --data-dir=/run/k3s (tmpfs) to avoid SQLite file locking issues +# on virtio-fs. Ensure the directory exists. +mkdir -p /run/k3s + +# --------------------------------------------------------------------------- +# CNI setup +# --------------------------------------------------------------------------- +# When k3s runs with --flannel-backend=none, no CNI plugin is installed. +# Without CNI, the kubelet reports the node as NotReady and no pods can be +# scheduled. +# +# The libkrunfw kernel lacks the bridge module, so the standard bridge CNI +# plugin fails with "operation not supported". Instead, we install a minimal +# "noop" CNI plugin (a shell script) that assigns pod IPs from a static range +# using the host-local IPAM plugin but skips creating any bridge/veth devices. +# This is sufficient for a single-node microVM cluster where we only need: +# - The node to report Ready +# - Pods to start (they communicate via the API server, not directly) +# +# The k3s image ships CNI plugin binaries in /bin/. kubelet expects them +# in /opt/cni/bin/ by default. +echo "[vm-init] Setting up CNI..." +mkdir -p /opt/cni/bin + +# Symlink the standard plugins we need (loopback for pod lo, host-local for IPAM). +for plugin in loopback host-local; do + if [ -f "/bin/$plugin" ] && [ ! -f "/opt/cni/bin/$plugin" ]; then + ln -sf "/bin/$plugin" "/opt/cni/bin/$plugin" + fi +done + +# Create a minimal noop CNI plugin. 
This shell script satisfies the CNI +# contract without creating any network devices (which the libkrunfw kernel +# can't do — no bridge module). It invokes host-local IPAM to allocate an +# IP, then returns the result. For DEL, it calls IPAM to release the IP. +cat > /opt/cni/bin/noop << 'NOOP_CNI' +#!/bin/sh +# Minimal noop CNI plugin — delegates to host-local IPAM only. +# Reads the network config from stdin, extracts the IPAM section, +# and invokes the host-local plugin to allocate/release IPs. + +IPAM_BIN="/opt/cni/bin/host-local" +CONFIG=$(cat) + +case "$CNI_COMMAND" in + ADD) + # Invoke IPAM to allocate an IP. Pass the full config (host-local + # reads the ipam section from it). + IPAM_RESULT=$(echo "$CONFIG" | "$IPAM_BIN") + IPAM_RC=$? + if [ $IPAM_RC -ne 0 ]; then + echo "$IPAM_RESULT" + exit $IPAM_RC + fi + # Return the IPAM result as our result (IPs allocated, no interfaces). + echo "$IPAM_RESULT" + ;; + DEL) + # Release the IP via IPAM. + echo "$CONFIG" | "$IPAM_BIN" 2>/dev/null + echo '{}' + ;; + CHECK) + echo '{}' + ;; + VERSION) + echo '{"cniVersion":"1.0.0","supportedVersions":["0.3.0","0.3.1","0.4.0","1.0.0"]}' + ;; +esac +NOOP_CNI +chmod +x /opt/cni/bin/noop + +# Write the CNI config. The chain is: +# 1. noop — allocates an IP via host-local IPAM (no network devices) +# 2. loopback — sets up lo in each pod namespace +# host-local IPAM assigns IPs from 10.42.0.0/24. +mkdir -p /etc/cni/net.d +cat > /etc/cni/net.d/10-noop.conflist << 'CNI_CONFIG' +{ + "cniVersion": "1.0.0", + "name": "noop", + "plugins": [ + { + "type": "noop", + "ipam": { + "type": "host-local", + "ranges": [ + [{"subnet": "10.42.0.0/24"}] + ] + } + }, + { + "type": "loopback" + } + ] +} +CNI_CONFIG + +# Copy bundled manifests if they exist (same as cluster-entrypoint.sh). +K3S_MANIFESTS="/run/k3s/server/manifests" +BUNDLED_MANIFESTS="/opt/navigator/manifests" +if [ -d "$BUNDLED_MANIFESTS" ]; then + mkdir -p "$K3S_MANIFESTS" + for manifest in "$BUNDLED_MANIFESTS"/*.yaml; do + [ ! 
-f "$manifest" ] && continue + cp "$manifest" "$K3S_MANIFESTS/" + done +fi + +echo "[vm-init] Starting k3s..." +exec /bin/k3s "$@" diff --git a/scripts/bin/nav b/scripts/bin/nav index 0dae2b07..9a8d0b6e 100755 --- a/scripts/bin/nav +++ b/scripts/bin/nav @@ -8,4 +8,11 @@ BINARY="$PROJECT_ROOT/target/debug/navigator" # Build if needed (cargo handles change detection) cargo build --package navigator-cli --quiet +# macOS: codesign with hypervisor entitlement if the binary changed. +# libkrun requires com.apple.security.hypervisor to access Hypervisor.framework. +ENTITLEMENTS="$PROJECT_ROOT/crates/navigator-gateway/entitlements.plist" +if [ -f "$ENTITLEMENTS" ]; then + codesign --entitlements "$ENTITLEMENTS" --force -s - "$BINARY" 2>/dev/null || true +fi + exec "$BINARY" "$@" From 60613983e19019f4f31cedff3c16ddae2d4fb60c Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 3 Mar 2026 10:37:22 -0800 Subject: [PATCH 2/4] feat(gateway): add k3s health checking, kubeconfig extraction, and boot retry After fork_start(), the parent now: - Polls https://localhost:/readyz with 2s intervals (120s timeout) - Checks child PID is alive between polls (fast-fail on VM crash) - Reads kubeconfig from rootfs via virtio-fs (host-visible) - Rewrites server URL and cluster name, stores in standard location - Prints kubectl usage instructions when ready Always forwards kube API port (ephemeral if --kube-port not specified) to enable health checking. Adds --name flag for kubeconfig context naming. Boot reliability: vm-init.sh now retries k3s up to 3 times with cleanup between attempts, handling the transient kine SQLite race condition on tmpfs that occasionally crashes k3s on first boot. Exports rewrite_kubeconfig and store_kubeconfig from navigator-bootstrap for use by the gateway cluster command. 
--- crates/navigator-bootstrap/src/lib.rs | 6 +- crates/navigator-cli/src/main.rs | 8 +- crates/navigator-cli/src/run.rs | 141 ++++++++++++++++++++++++-- crates/navigator-gateway/src/lib.rs | 10 ++ deploy/gateway/vm-init.sh | 37 ++++++- 5 files changed, 189 insertions(+), 13 deletions(-) diff --git a/crates/navigator-bootstrap/src/lib.rs b/crates/navigator-bootstrap/src/lib.rs index f9460798..6fe41ae3 100644 --- a/crates/navigator-bootstrap/src/lib.rs +++ b/crates/navigator-bootstrap/src/lib.rs @@ -24,7 +24,7 @@ use crate::docker::{ check_existing_cluster, create_ssh_docker_client, destroy_cluster_resources, ensure_container, ensure_image, ensure_network, ensure_volume, start_container, stop_container, }; -use crate::kubeconfig::{rewrite_kubeconfig, rewrite_kubeconfig_remote, store_kubeconfig}; +use crate::kubeconfig::rewrite_kubeconfig_remote; use crate::metadata::{ create_cluster_metadata, create_cluster_metadata_with_host, extract_host_from_ssh_destination, local_gateway_host, resolve_ssh_hostname, @@ -38,8 +38,8 @@ use crate::runtime::{ pub use crate::docker::ExistingClusterInfo; pub use crate::kubeconfig::{ - default_local_kubeconfig_path, print_kubeconfig, stored_kubeconfig_path, - update_local_kubeconfig, + default_local_kubeconfig_path, print_kubeconfig, rewrite_kubeconfig, store_kubeconfig, + stored_kubeconfig_path, update_local_kubeconfig, }; pub use crate::metadata::{ ClusterMetadata, clear_active_cluster, get_cluster_metadata, list_clusters, diff --git a/crates/navigator-cli/src/main.rs b/crates/navigator-cli/src/main.rs index ffbcde19..5f4466d7 100644 --- a/crates/navigator-cli/src/main.rs +++ b/crates/navigator-cli/src/main.rs @@ -524,6 +524,10 @@ enum GatewayCommands { /// a libkrun microVM with port forwarding and persistent storage. /// The parent process stays alive to monitor the VM. Cluster { + /// Cluster name for kubeconfig context naming. 
+ #[arg(long, default_value = "gateway")] + name: String, + /// Cluster Docker image to extract rootfs from. /// /// Defaults to the same image used by `cluster admin deploy`. @@ -535,7 +539,7 @@ enum GatewayCommands { port: u16, /// Host port for the k3s API server (mapped to guest port 6443). - /// If not set, the API server is not exposed to the host. + /// If not set, an ephemeral port is used for health checking only. #[arg(long)] kube_port: Option, @@ -1373,6 +1377,7 @@ async fn main() -> Result<()> { )?; } GatewayCommands::Cluster { + name, image, port, kube_port, @@ -1382,6 +1387,7 @@ async fn main() -> Result<()> { krun_log_level, } => { run::gateway_cluster( + &name, image.as_deref(), port, kube_port, diff --git a/crates/navigator-cli/src/run.rs b/crates/navigator-cli/src/run.rs index ae9b2620..2ff3e2e0 100644 --- a/crates/navigator-cli/src/run.rs +++ b/crates/navigator-cli/src/run.rs @@ -3162,6 +3162,7 @@ pub fn gateway_run( /// 4. Waits for the child VM process to exit #[allow(clippy::too_many_arguments)] pub fn gateway_cluster( + cluster_name: &str, image: Option<&str>, gateway_port: u16, kube_port: Option, @@ -3260,6 +3261,13 @@ pub fn gateway_cluster( // The gateway itself is .1, and the first (and only) DHCP client // gets .3 (observed empirically — .2 may be reserved internally). let guest_ip = "192.168.127.3"; + + // We always need the kube API port forwarded for health checking and + // kubeconfig extraction. If the user didn't specify --kube-port, pick + // an ephemeral port. + let effective_kube_port = + kube_port.unwrap_or_else(|| navigator_bootstrap::pick_available_port().unwrap_or(6444)); + let forward_ports = |local_port: u16, remote_port: u16| -> Result<()> { let body = format!( r#"{{"local":":{}","remote":"{}:{}","protocol":"tcp"}}"#, @@ -3287,10 +3295,8 @@ pub fn gateway_cluster( // Forward the navigator gateway port. forward_ports(gateway_port, 30051)?; - // Optionally forward the kube API port. 
- if let Some(kp) = kube_port { - forward_ports(kp, 6443)?; - } + // Always forward the kube API port (for health checks + kubeconfig). + forward_ports(effective_kube_port, 6443)?; // Step 4: Build the microVM configuration. println!("Booting k3s cluster in microVM..."); @@ -3300,9 +3306,14 @@ pub fn gateway_cluster( println!(" memory: {mem} MiB"); println!(" network: gvproxy (guest IP: {guest_ip})"); println!(" gateway: localhost:{gateway_port} -> guest:30051"); - if let Some(kp) = kube_port { - println!(" kube API: localhost:{kp} -> guest:6443"); - } + println!( + " kube API: localhost:{effective_kube_port} -> guest:6443{}", + if kube_port.is_none() { + " (internal)" + } else { + "" + } + ); println!(" state: {}", k3s_state_dir.display()); println!(" console: {}", console_log.display()); println!(); @@ -3366,11 +3377,120 @@ pub fn gateway_cluster( " (tail -f {} for VM console output)", console_log.display() ); + println!(); - // Step 6: Wait for the child process. + // Step 6: Wait for k3s API server to become ready. + // + // We poll the /readyz endpoint on the forwarded kube port. k3s typically + // takes 15-45 seconds to boot, but the kine race condition on tmpfs can + // cause it to crash and restart once, so we allow up to 120 seconds. + let readyz_url = format!("https://localhost:{effective_kube_port}/readyz"); + let health_timeout = Duration::from_secs(120); + let health_interval = Duration::from_secs(2); + let start = Instant::now(); + + let mut api_ready = false; + while start.elapsed() < health_timeout { + // Check if the child process is still alive. + if !navigator_gateway::is_pid_alive(child_pid) { + // Child exited before becoming ready. + return Err(miette::miette!( + "microVM exited before k3s became ready (PID: {child_pid})\n\ + Check console log: {}", + console_log.display() + )); + } + + // Probe the readyz endpoint (skip TLS verification — self-signed cert). 
+ let probe = Command::new("curl") + .args(["-sk", "--max-time", "2", &readyz_url]) + .output(); + + if let Ok(output) = probe { + let body = String::from_utf8_lossy(&output.stdout); + if body.contains("ok") { + api_ready = true; + break; + } + } + + std::thread::sleep(health_interval); + } + + if !api_ready { + // Don't kill the VM — it may still be starting up. Just warn. + eprintln!( + "warning: k3s API server did not become ready within {}s", + health_timeout.as_secs() + ); + eprintln!( + " The VM is still running (PID: {child_pid}). Check: tail -f {}", + console_log.display() + ); + } + + // Step 7: Extract and store kubeconfig. + // + // k3s writes its kubeconfig to /etc/rancher/k3s/k3s.yaml inside the guest. + // Since the rootfs is mapped via virtio-fs, we can read it directly from + // the host filesystem. + if api_ready { + let elapsed = start.elapsed(); + println!("k3s API server ready ({:.1}s)", elapsed.as_secs_f64()); + + let kubeconfig_guest_path = rootfs_dir.join("etc/rancher/k3s/k3s.yaml"); + match std::fs::read_to_string(&kubeconfig_guest_path) { + Ok(raw_kubeconfig) if is_valid_kubeconfig(&raw_kubeconfig) => { + // Rewrite the kubeconfig: point server URL to the forwarded port + // and rename the cluster/context/user entries. + let rewritten = navigator_bootstrap::rewrite_kubeconfig( + &raw_kubeconfig, + cluster_name, + Some(effective_kube_port), + ); + + // Store in the standard navigator kubeconfig location. + let kubeconfig_path = navigator_bootstrap::stored_kubeconfig_path(cluster_name) + .map_err(|e| miette::miette!("failed to resolve kubeconfig path: {e}"))?; + navigator_bootstrap::store_kubeconfig(&kubeconfig_path, &rewritten) + .map_err(|e| miette::miette!("failed to store kubeconfig: {e}"))?; + + println!("Kubeconfig written to {}", kubeconfig_path.display()); + println!(); + println!("Cluster is ready! 
To use:"); + println!(" export KUBECONFIG={}", kubeconfig_path.display()); + println!(" kubectl get nodes"); + } + Ok(_) => { + eprintln!( + "warning: kubeconfig at {} is not valid yet", + kubeconfig_guest_path.display() + ); + } + Err(e) => { + eprintln!( + "warning: could not read kubeconfig from {}: {e}", + kubeconfig_guest_path.display() + ); + eprintln!(" k3s may not have written it yet; the VM is still running."); + } + } + } + + println!(); + println!("Press Ctrl+C to stop the cluster."); + + // Step 8: Wait for the child process (blocks until VM exits or signal). + // + // The default SIGINT handler sends SIGINT to the entire process group, + // which includes the forked VM child. So Ctrl+C will naturally propagate + // to the VM. After the child exits, we clean up gvproxy. let status = wait_for_child(child_pid)?; if status == 0 { println!("microVM exited cleanly"); + } else if status == 130 { + // 128 + SIGINT(2) = 130 — user pressed Ctrl+C. + println!("microVM stopped"); } else { eprintln!("microVM exited with status {status}"); } @@ -3384,6 +3504,11 @@ pub fn gateway_cluster( Ok(()) } +/// Check if a string looks like a valid kubeconfig. +fn is_valid_kubeconfig(contents: &str) -> bool { + contents.contains("apiVersion:") && contents.contains("clusters:") +} + /// Extract a rootfs from a Docker image by creating a temporary container /// and exporting its filesystem. fn extract_rootfs_from_docker(image_ref: &str, rootfs_dir: &Path) -> Result<()> { diff --git a/crates/navigator-gateway/src/lib.rs b/crates/navigator-gateway/src/lib.rs index 3fa88b11..7802211c 100644 --- a/crates/navigator-gateway/src/lib.rs +++ b/crates/navigator-gateway/src/lib.rs @@ -66,3 +66,13 @@ pub fn wait_for_pid(pid: u32) -> Result { Ok(status) } } + +/// Check if a child process is still alive (non-blocking). +/// +/// Returns `true` if the process exists and has not yet exited, +/// `false` if it has exited or the PID is invalid. 
+pub fn is_pid_alive(pid: u32) -> bool { + // kill(pid, 0) checks if we can signal the process without actually + // sending a signal. Returns 0 if the process exists. + unsafe { libc::kill(pid.cast_signed(), 0) == 0 } +} diff --git a/deploy/gateway/vm-init.sh b/deploy/gateway/vm-init.sh index 74d5404d..9f0ddd76 100755 --- a/deploy/gateway/vm-init.sh +++ b/deploy/gateway/vm-init.sh @@ -211,5 +211,40 @@ if [ -d "$BUNDLED_MANIFESTS" ]; then done fi +# Start k3s with a retry wrapper. On tmpfs, k3s sometimes crashes on first +# boot with "duplicate key given in txn request" or "kine.sock: address +# already in use" due to a race condition in kine's SQLite initialization. +# This is transient and always succeeds on the second attempt. +# +# We retry up to 3 times with a brief cleanup pause between attempts. +# On the final attempt, we exec to replace this process with k3s (PID 1). +MAX_RETRIES=3 +RETRY=0 + echo "[vm-init] Starting k3s..." -exec /bin/k3s "$@" +while [ "$RETRY" -lt "$MAX_RETRIES" ]; do + RETRY=$((RETRY + 1)) + + if [ "$RETRY" -eq "$MAX_RETRIES" ]; then + # Final attempt: exec into k3s so it becomes PID 1. + exec /bin/k3s "$@" + fi + + # Non-final attempt: run k3s and check exit code. + /bin/k3s "$@" & + K3S_PID=$! + wait $K3S_PID + K3S_EXIT=$? + + if [ "$K3S_EXIT" -eq 0 ]; then + exit 0 + fi + + echo "[vm-init] k3s exited with status $K3S_EXIT (attempt $RETRY/$MAX_RETRIES)" + + # Clean up stale kine socket and lock files before retrying. 
+ rm -f /run/k3s/server/kine.sock 2>/dev/null + rm -f /run/k3s/server/db/state.db-wal 2>/dev/null + rm -f /run/k3s/server/db/state.db-shm 2>/dev/null + sleep 2 +done From 5d7dfa193cf0ad9042d7a5bf362de4286202d740 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 3 Mar 2026 10:44:50 -0800 Subject: [PATCH 3/4] fix(gateway): use waitpid(WNOHANG) for child liveness check, fix docker export leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit is_pid_alive() now uses waitpid(WNOHANG) instead of kill(pid, 0). kill(0) returns success for zombie processes, so the health check loop would never detect an early child exit — it would spin for the full 120s timeout instead of failing fast. Also fixes a resource leak in extract_rootfs_from_docker(): the docker export child process was never waited on, leaving a zombie and missing its exit status check. Fixes doc comment on net_gvproxy() that incorrectly stated guest IP as .2 (actual: .3 with gvproxy v0.8.6). Removes misleading explicit drop of c_env_strings in build(). 
--- crates/navigator-cli/src/run.rs | 17 +++++++++++++++-- crates/navigator-gateway/src/context.rs | 8 +++----- crates/navigator-gateway/src/lib.rs | 21 ++++++++++++++++----- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/crates/navigator-cli/src/run.rs b/crates/navigator-cli/src/run.rs index 2ff3e2e0..fb4f848c 100644 --- a/crates/navigator-cli/src/run.rs +++ b/crates/navigator-cli/src/run.rs @@ -3549,7 +3549,7 @@ fn extract_rootfs_from_docker(image_ref: &str, rootfs_dir: &Path) -> Result<()> .map_err(|e| miette::miette!("failed to create rootfs dir: {e}"))?; println!(" Exporting container filesystem..."); - let export = Command::new("docker") + let mut export = Command::new("docker") .args(["export", &container_name]) .stdout(std::process::Stdio::piped()) .spawn() @@ -3558,10 +3558,23 @@ fn extract_rootfs_from_docker(image_ref: &str, rootfs_dir: &Path) -> Result<()> let tar_status = Command::new("tar") .args(["xf", "-", "-C"]) .arg(rootfs_dir) - .stdin(export.stdout.unwrap()) + .stdin(export.stdout.take().unwrap()) .status() .map_err(|e| miette::miette!("tar extract failed: {e}"))?; + // Wait for docker export to finish and check its exit status. + let export_status = export + .wait() + .map_err(|e| miette::miette!("failed to wait for docker export: {e}"))?; + if !export_status.success() { + let _ = Command::new("docker") + .args(["rm", "-f", &container_name]) + .status(); + return Err(miette::miette!( + "docker export failed with status {export_status}" + )); + } + if !tar_status.success() { // Clean up on failure. let _ = Command::new("docker") diff --git a/crates/navigator-gateway/src/context.rs b/crates/navigator-gateway/src/context.rs index a4e28ab3..a0b90a86 100644 --- a/crates/navigator-gateway/src/context.rs +++ b/crates/navigator-gateway/src/context.rs @@ -325,8 +325,9 @@ impl KrunContextBuilder { /// When set, libkrun adds a virtio-net device backed by the gvproxy /// Unix datagram socket at the given path. 
This **automatically disables /// TSI**, so the guest gets a real `eth0` interface with DHCP from - /// gvproxy (default subnet: 192.168.127.0/24, gateway: 192.168.127.1, - /// guest IP: 192.168.127.2). + /// gvproxy (default subnet: 192.168.127.0/24, gateway: 192.168.127.1). + /// The guest IP is assigned by DHCP — with gvproxy v0.8.6, the first + /// client gets 192.168.127.3 (not .2 as some docs suggest). /// /// Port forwarding is handled by gvproxy's HTTP API, not by /// `krun_set_port_map` (which is TSI-only). @@ -524,9 +525,6 @@ impl KrunContextBuilder { ) })?; - // Keep CStrings alive until after the FFI call. - drop(c_env_strings); - info!( ctx_id, rootfs = %rootfs.display(), diff --git a/crates/navigator-gateway/src/lib.rs b/crates/navigator-gateway/src/lib.rs index 7802211c..aa91e2e3 100644 --- a/crates/navigator-gateway/src/lib.rs +++ b/crates/navigator-gateway/src/lib.rs @@ -69,10 +69,21 @@ pub fn wait_for_pid(pid: u32) -> Result { /// Check if a child process is still alive (non-blocking). /// -/// Returns `true` if the process exists and has not yet exited, -/// `false` if it has exited or the PID is invalid. +/// Uses `waitpid(WNOHANG)` to check if the child has exited without blocking. +/// This correctly detects zombie processes (which `kill(pid, 0)` does not — +/// zombies are still signalable). If the child has exited, it is reaped. +/// +/// Returns `true` if the process is still running, `false` if it has exited +/// or the PID is invalid. +/// +/// **Warning**: If the child has exited, this function reaps it. A subsequent +/// call to [`wait_for_pid`] on the same PID will fail with ECHILD. pub fn is_pid_alive(pid: u32) -> bool { - // kill(pid, 0) checks if we can signal the process without actually - // sending a signal. Returns 0 if the process exists. 
- unsafe { libc::kill(pid.cast_signed(), 0) == 0 } + let mut status: libc::c_int = 0; + // waitpid with WNOHANG returns: + // 0 — child still running + // pid — child has exited (reaped) + // -1 — error (e.g., not our child, invalid pid) + let ret = unsafe { libc::waitpid(pid.cast_signed(), &raw mut status, libc::WNOHANG) }; + ret == 0 } From f340aa8a62a11956b570ef7520342c7b019221e1 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 3 Mar 2026 10:55:43 -0800 Subject: [PATCH 4/4] fix(gateway): use static guest IP and redirect child stderr to console log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of readyz timeout: gvproxy DHCP assigns guest IPs nondeterministically (.2 or .3 depending on timing), but port forwarding was hardcoded to .3. When the guest got .2, the health check polled the wrong IP for 120s. Fix: vm-init.sh now assigns 192.168.127.2 statically instead of using DHCP. The CLI guest_ip constant matches. This eliminates the race. Also fixes libkrun VMM warnings (virtio-fs passthrough symlink errors) appearing on the parent terminal — fork_start() now redirects the child's stderr to the console log file via dup2(). The warnings are from macOS symlink resolution limits on Kubernetes ConfigMap mounts and are non-fatal. vm-init.sh is now always updated in the cached rootfs on every boot (not just during initial extraction), so networking fixes take effect without requiring rootfs re-extraction. 
--- crates/navigator-cli/src/run.rs | 42 +++++++++++++------- crates/navigator-gateway/src/context.rs | 36 ++++++++++++++++- deploy/gateway/vm-init.sh | 52 +++++++------------------ 3 files changed, 76 insertions(+), 54 deletions(-) diff --git a/crates/navigator-cli/src/run.rs b/crates/navigator-cli/src/run.rs index fb4f848c..e55b3f6a 100644 --- a/crates/navigator-cli/src/run.rs +++ b/crates/navigator-cli/src/run.rs @@ -3211,6 +3211,11 @@ pub fn gateway_cluster( println!("Using cached rootfs: {}", rootfs_dir.display()); } + // Always update vm-init.sh in the rootfs — it's embedded at build time + // and may contain fixes (e.g., networking changes) that the cached rootfs + // doesn't have. + update_vm_init_script(&rootfs_dir)?; + // Create k3s state directory. std::fs::create_dir_all(&k3s_state_dir) .map_err(|e| miette::miette!("failed to create k3s state dir: {e}"))?; @@ -3258,9 +3263,10 @@ pub fn gateway_cluster( // Step 3: Configure port forwarding via gvproxy HTTP API. // // gvproxy's DHCP server assigns guest IPs from 192.168.127.0/24. - // The gateway itself is .1, and the first (and only) DHCP client - // gets .3 (observed empirically — .2 may be reserved internally). - let guest_ip = "192.168.127.3"; + // The gateway is .1. The guest uses a static IP (.2) configured in + // vm-init.sh — we don't use DHCP because gvproxy assigns IPs + // nondeterministically (.2 or .3 depending on timing). + let guest_ip = "192.168.127.2"; // We always need the kube API port forwarded for health checking and // kubeconfig extraction. If the user didn't specify --kube-port, pick @@ -3590,31 +3596,39 @@ fn extract_rootfs_from_docker(image_ref: &str, rootfs_dir: &Path) -> Result<()> .args(["rm", "-f", &container_name]) .status(); - // Copy the VM init script into the rootfs. This script sets up networking - // (dummy interface + default route) so k3s can find /proc/net/route, then - // execs into k3s. 
+ update_vm_init_script(rootfs_dir)?; + + println!(" Rootfs extracted to {}", rootfs_dir.display()); + Ok(()) +} + +/// Wait for a child process to exit and return its exit status. +fn wait_for_child(pid: u32) -> Result { + navigator_gateway::wait_for_pid(pid).map_err(|e| miette::miette!("waitpid failed: {e}")) +} + +/// Write the embedded vm-init.sh script into the rootfs. +/// +/// This is called both during initial rootfs extraction and on every +/// cluster boot (to pick up fixes baked into the binary without requiring +/// a full rootfs re-extract). +fn update_vm_init_script(rootfs_dir: &Path) -> Result<()> { let init_script = include_bytes!("../../../deploy/gateway/vm-init.sh"); let init_path = rootfs_dir.join("usr/local/bin/vm-init.sh"); + std::fs::create_dir_all(init_path.parent().unwrap()) + .map_err(|e| miette::miette!("failed to create vm-init.sh parent dir: {e}"))?; std::fs::write(&init_path, init_script) .map_err(|e| miette::miette!("failed to write vm-init.sh: {e}"))?; - // Make executable (0o755). #[cfg(unix)] { use std::os::unix::fs::PermissionsExt; std::fs::set_permissions(&init_path, std::fs::Permissions::from_mode(0o755)) .map_err(|e| miette::miette!("failed to chmod vm-init.sh: {e}"))?; } - - println!(" Rootfs extracted to {}", rootfs_dir.display()); Ok(()) } -/// Wait for a child process to exit and return its exit status. -fn wait_for_child(pid: u32) -> Result { - navigator_gateway::wait_for_pid(pid).map_err(|e| miette::miette!("waitpid failed: {e}")) -} - #[cfg(test)] mod tests { use super::{inferred_provider_type, parse_credential_pairs, resolve_route_protocols}; diff --git a/crates/navigator-gateway/src/context.rs b/crates/navigator-gateway/src/context.rs index a0b90a86..641b1b6c 100644 --- a/crates/navigator-gateway/src/context.rs +++ b/crates/navigator-gateway/src/context.rs @@ -23,6 +23,10 @@ use crate::ffi; /// `start_enter`, which never returns). 
pub struct KrunContext { ctx_id: u32, + /// If set, `fork_start()` redirects the child's stderr to this file + /// so that libkrun VMM warnings (e.g., virtio-fs passthrough) don't + /// leak to the parent's terminal. + console_output: Option, } impl KrunContext { @@ -102,7 +106,15 @@ impl KrunContext { } if pid == 0 { - // Child process: start the VM. This never returns on success. + // Child process: redirect stderr to the console log file (if + // configured) so libkrun VMM warnings don't leak to the parent + // terminal. The VMM's virtio-fs passthrough generates WARN-level + // logs on stderr that are confusing when mixed with CLI output. + if let Some(ref console_path) = this.console_output { + redirect_stderr_to_file(console_path); + } + + // Start the VM. This never returns on success. let ret = unsafe { ffi::krun_start_enter(this.ctx_id) }; // If we reach here, start failed. Exit with an error code so the // parent can detect it. @@ -402,7 +414,10 @@ impl KrunContextBuilder { // From here on, if we hit an error we need to clean up the context. // We'll create KrunContext now so Drop handles it. - let ctx = KrunContext { ctx_id }; + let ctx = KrunContext { + ctx_id, + console_output: self.console_output.clone(), + }; // Configure VM resources. check_ret("krun_set_vm_config", unsafe { @@ -575,6 +590,23 @@ fn to_ptr_array(strings: &[CString]) -> Vec<*const c_char> { ptrs } +/// Redirect stderr (fd 2) to a file. Used in the forked child process to +/// prevent libkrun VMM log messages from appearing on the parent's terminal. +/// +/// Best-effort: if the file can't be opened, stderr is left unchanged. +fn redirect_stderr_to_file(path: &Path) { + use std::fs::OpenOptions; + use std::os::unix::io::IntoRawFd; + + if let Ok(file) = OpenOptions::new().create(true).append(true).open(path) { + let fd = file.into_raw_fd(); + unsafe { + libc::dup2(fd, libc::STDERR_FILENO); + libc::close(fd); + } + } +} + /// Raise `RLIMIT_NOFILE` to the maximum allowed value. 
/// /// virtio-fs (used by `krun_set_root` to map the rootfs directory) requires a diff --git a/deploy/gateway/vm-init.sh b/deploy/gateway/vm-init.sh index 9f0ddd76..d0f98a70 100755 --- a/deploy/gateway/vm-init.sh +++ b/deploy/gateway/vm-init.sh @@ -45,46 +45,22 @@ fi # Enable the loopback interface. ip link set lo up 2>/dev/null || true -# Configure eth0 via DHCP from gvproxy. -# gvproxy provides DHCP on 192.168.127.0/24: -# gateway: 192.168.127.1 -# guest: 192.168.127.2 -# DNS: 192.168.127.1 +# Configure eth0 with a static IP. +# +# gvproxy provides a 192.168.127.0/24 network with gateway at .1. +# We use a static IP instead of DHCP because gvproxy's DHCP server +# assigns IPs nondeterministically (.2 or .3 depending on timing), +# which breaks port forwarding configured before the VM boots. +# Static assignment guarantees the IP matches what the host expects. +GUEST_IP="192.168.127.2" +GATEWAY_IP="192.168.127.1" + if ip link show eth0 >/dev/null 2>&1; then - echo "[vm-init] Configuring eth0 via DHCP..." + echo "[vm-init] Configuring eth0 with static IP ${GUEST_IP}..." ip link set eth0 up - - # BusyBox udhcpc needs a script to apply the lease. Create a minimal one. - mkdir -p /usr/share/udhcpc - cat > /usr/share/udhcpc/default.script << 'DHCP_SCRIPT' -#!/bin/sh -case "$1" in - bound|renew) - ip addr add "$ip/$mask" dev "$interface" 2>/dev/null || true - if [ -n "$router" ]; then - ip route add default via "$router" dev "$interface" 2>/dev/null || true - fi - if [ -n "$dns" ]; then - : > /etc/resolv.conf - for ns in $dns; do - echo "nameserver $ns" >> /etc/resolv.conf - done - fi - ;; -esac -DHCP_SCRIPT - chmod +x /usr/share/udhcpc/default.script - - # Run DHCP (foreground, quit after lease obtained). 
- udhcpc -i eth0 -n -q -f -t 5 2>/dev/null || { - echo "[vm-init] DHCP failed, using static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 dev eth0 2>/dev/null || true - echo "nameserver 192.168.127.1" > /etc/resolv.conf - } - - GUEST_IP=$(ip -4 addr show eth0 2>/dev/null | sed -n 's/.*inet \([0-9.]*\).*/\1/p' | head -1) - GUEST_IP="${GUEST_IP:-192.168.127.3}" + ip addr add "${GUEST_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "${GATEWAY_IP}" dev eth0 2>/dev/null || true + echo "nameserver ${GATEWAY_IP}" > /etc/resolv.conf echo "[vm-init] Network configured: eth0 = ${GUEST_IP}" else # Fallback: no eth0 (TSI-only mode). Add dummy routing on lo so k3s