diff --git a/Cargo.lock b/Cargo.lock index 1e45d96f..0ababf5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2687,6 +2687,7 @@ dependencies = [ "miette", "navigator-bootstrap", "navigator-core", + "navigator-gateway", "navigator-policy", "navigator-providers", "navigator-tui", @@ -2726,6 +2727,15 @@ dependencies = [ "tonic-build", ] +[[package]] +name = "navigator-gateway" +version = "0.1.0" +dependencies = [ + "libc", + "thiserror 2.0.18", + "tracing", +] + [[package]] name = "navigator-policy" version = "0.1.0" diff --git a/crates/navigator-bootstrap/src/lib.rs b/crates/navigator-bootstrap/src/lib.rs index f9460798..6fe41ae3 100644 --- a/crates/navigator-bootstrap/src/lib.rs +++ b/crates/navigator-bootstrap/src/lib.rs @@ -24,7 +24,7 @@ use crate::docker::{ check_existing_cluster, create_ssh_docker_client, destroy_cluster_resources, ensure_container, ensure_image, ensure_network, ensure_volume, start_container, stop_container, }; -use crate::kubeconfig::{rewrite_kubeconfig, rewrite_kubeconfig_remote, store_kubeconfig}; +use crate::kubeconfig::rewrite_kubeconfig_remote; use crate::metadata::{ create_cluster_metadata, create_cluster_metadata_with_host, extract_host_from_ssh_destination, local_gateway_host, resolve_ssh_hostname, @@ -38,8 +38,8 @@ use crate::runtime::{ pub use crate::docker::ExistingClusterInfo; pub use crate::kubeconfig::{ - default_local_kubeconfig_path, print_kubeconfig, stored_kubeconfig_path, - update_local_kubeconfig, + default_local_kubeconfig_path, print_kubeconfig, rewrite_kubeconfig, store_kubeconfig, + stored_kubeconfig_path, update_local_kubeconfig, }; pub use crate::metadata::{ ClusterMetadata, clear_active_cluster, get_cluster_metadata, list_clusters, diff --git a/crates/navigator-cli/Cargo.toml b/crates/navigator-cli/Cargo.toml index dd16246d..f1246e71 100644 --- a/crates/navigator-cli/Cargo.toml +++ b/crates/navigator-cli/Cargo.toml @@ -14,6 +14,7 @@ path = "src/main.rs" [dependencies] navigator-bootstrap = { path = 
"../navigator-bootstrap" } navigator-core = { path = "../navigator-core" } +navigator-gateway = { path = "../navigator-gateway" } navigator-policy = { path = "../navigator-policy" } navigator-providers = { path = "../navigator-providers" } navigator-tui = { path = "../navigator-tui" } diff --git a/crates/navigator-cli/build.rs b/crates/navigator-cli/build.rs new file mode 100644 index 00000000..182d9dc2 --- /dev/null +++ b/crates/navigator-cli/build.rs @@ -0,0 +1,41 @@ +use std::process::Command; + +fn main() { + // On macOS, embed rpath entries for libkrun and libkrunfw so the binary + // can find them at runtime without DYLD_LIBRARY_PATH. + // + // Background: navigator-gateway links against libkrun (a system cdylib + // installed via Homebrew). At runtime libkrun loads libkrunfw via dlopen. + // The gateway crate's build.rs already emits link-search paths so the + // *linker* can find the dylibs, but cargo:rustc-link-arg from a library + // crate does NOT propagate to the final binary. We must emit the rpath + // flags from the binary crate's build.rs. + #[cfg(target_os = "macos")] + { + for formula in &["libkrun", "libkrunfw"] { + if let Some(lib_dir) = brew_lib_path(formula) { + println!("cargo:rustc-link-arg=-Wl,-rpath,{lib_dir}"); + } + } + } +} + +/// Ask Homebrew for the install prefix of a formula and return its `lib/` path. 
+#[cfg(target_os = "macos")] +fn brew_lib_path(formula: &str) -> Option { + let output = Command::new("brew") + .args(["--prefix", formula]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let prefix = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if prefix.is_empty() { + return None; + } + + Some(format!("{prefix}/lib")) +} diff --git a/crates/navigator-cli/src/main.rs b/crates/navigator-cli/src/main.rs index b2b17b9f..5f4466d7 100644 --- a/crates/navigator-cli/src/main.rs +++ b/crates/navigator-cli/src/main.rs @@ -119,6 +119,12 @@ enum Commands { command: ProviderCommands, }, + /// Hardware-isolated microVM gateway. + Gateway { + #[command(subcommand)] + command: GatewayCommands, + }, + /// Launch the Gator interactive TUI. Gator, @@ -468,6 +474,94 @@ enum ClusterAdminCommands { }, } +#[derive(Subcommand, Debug)] +enum GatewayCommands { + /// Run a command inside a hardware-isolated microVM. + /// + /// Boots a lightweight microVM using libkrun (Apple Hypervisor.framework on + /// macOS ARM64, KVM on Linux) and executes the specified command inside it. + /// The rootfs directory is mapped into the VM via virtio-fs. + /// + /// NOTE: This command takes over the current process. The process will exit + /// with the guest workload's exit code when the VM shuts down. + Run { + /// Path to the root filesystem directory (aarch64 Linux userspace). + /// + /// Must contain the executable specified by EXEC_PATH. For a quick + /// start, download the Alpine minirootfs: + /// + /// curl -L https://dl-cdn.alpinelinux.org/alpine/v3.21/releases/aarch64/alpine-minirootfs-3.21.3-aarch64.tar.gz | tar xz -C ./rootfs + #[arg(long)] + rootfs: PathBuf, + + /// Number of virtual CPUs for the microVM. + #[arg(long, default_value_t = 1)] + vcpus: u8, + + /// Amount of RAM in MiB for the microVM. + #[arg(long, default_value_t = 128)] + mem: u32, + + /// Working directory inside the VM (relative to rootfs). 
+ #[arg(long, default_value = "/")] + workdir: String, + + /// libkrun log level (0=Off, 1=Error, 2=Warn, 3=Info, 4=Debug, 5=Trace). + #[arg(long, default_value_t = 2)] + krun_log_level: u32, + + /// Path to the executable inside the rootfs. + exec_path: String, + + /// Arguments passed to the executable. + #[arg(trailing_var_arg = true)] + args: Vec, + }, + + /// Boot the cluster container in a hardware-isolated microVM. + /// + /// Extracts a rootfs from the cluster Docker image, then boots k3s inside + /// a libkrun microVM with port forwarding and persistent storage. + /// The parent process stays alive to monitor the VM. + Cluster { + /// Cluster name for kubeconfig context naming. + #[arg(long, default_value = "gateway")] + name: String, + + /// Cluster Docker image to extract rootfs from. + /// + /// Defaults to the same image used by `cluster admin deploy`. + #[arg(long)] + image: Option, + + /// Host port for the navigator gateway (mapped to guest port 30051). + #[arg(long, default_value_t = 8080)] + port: u16, + + /// Host port for the k3s API server (mapped to guest port 6443). + /// If not set, an ephemeral port is used for health checking only. + #[arg(long)] + kube_port: Option, + + /// Number of virtual CPUs for the microVM. + #[arg(long, default_value_t = 2)] + vcpus: u8, + + /// Amount of RAM in MiB for the microVM. + #[arg(long, default_value_t = 2048)] + mem: u32, + + /// Directory for persistent k3s state. Created if it doesn't exist. + /// Defaults to $XDG_DATA_HOME/navigator/gateway-cluster/k3s-state. + #[arg(long)] + state_dir: Option, + + /// libkrun log level (0=Off, 1=Error, 2=Warn, 3=Info, 4=Debug, 5=Trace). + #[arg(long, default_value_t = 2)] + krun_log_level: u32, + }, +} + #[derive(Subcommand, Debug)] enum SandboxCommands { /// Create a sandbox. 
@@ -1262,6 +1356,48 @@ async fn main() -> Result<()> { } } } + Some(Commands::Gateway { command }) => match command { + GatewayCommands::Run { + rootfs, + vcpus, + mem, + workdir, + krun_log_level, + exec_path, + args, + } => { + run::gateway_run( + &rootfs, + vcpus, + mem, + &workdir, + krun_log_level, + &exec_path, + &args, + )?; + } + GatewayCommands::Cluster { + name, + image, + port, + kube_port, + vcpus, + mem, + state_dir, + krun_log_level, + } => { + run::gateway_cluster( + &name, + image.as_deref(), + port, + kube_port, + vcpus, + mem, + state_dir.as_deref(), + krun_log_level, + )?; + } + }, Some(Commands::Gator) => { let ctx = resolve_cluster(&cli.cluster)?; let tls = tls.with_cluster_name(&ctx.name); diff --git a/crates/navigator-cli/src/run.rs b/crates/navigator-cli/src/run.rs index 350a12ef..e55b3f6a 100644 --- a/crates/navigator-cli/src/run.rs +++ b/crates/navigator-cli/src/run.rs @@ -3103,6 +3103,532 @@ fn print_log_line(log: &navigator_core::proto::SandboxLogLine) { } } +// --------------------------------------------------------------------------- +// Gateway (microVM) +// --------------------------------------------------------------------------- + +/// Boot a hardware-isolated microVM and execute a command inside it. +/// +/// This function **never returns on success** -- the libkrun VMM takes over +/// the process and calls `exit()` with the guest workload's exit code. 
+pub fn gateway_run( + rootfs: &Path, + vcpus: u8, + mem: u32, + workdir: &str, + log_level: u32, + exec_path: &str, + args: &[String], +) -> Result<()> { + use navigator_gateway::KrunContext; + + println!("Booting microVM..."); + println!(" rootfs: {}", rootfs.display()); + println!(" vcpus: {vcpus}"); + println!(" memory: {mem} MiB"); + println!(" workdir: {workdir}"); + println!(" exec: {exec_path}"); + if !args.is_empty() { + println!(" args: {}", args.join(" ")); + } + println!(); + + let arg_strs: Vec<&str> = args.iter().map(String::as_str).collect(); + + let ctx = KrunContext::builder() + .vcpus(vcpus) + .memory_mib(mem) + .rootfs(rootfs) + .workdir(workdir) + .exec(exec_path, &arg_strs) + .log_level(log_level) + .build() + .map_err(|e| miette::miette!("failed to configure microVM: {e}"))?; + + // This never returns on success -- the process exits with the guest's + // exit code. If it does return, it means something went wrong. + ctx.start_enter() + .map_err(|e| miette::miette!("failed to start microVM: {e}"))?; + + Ok(()) +} + +/// Boot the cluster container in a hardware-isolated microVM. +/// +/// This function: +/// 1. Extracts a rootfs from the cluster Docker image (if not already cached) +/// 2. Creates a persistent state directory for k3s data +/// 3. Boots k3s inside a libkrun microVM with port forwarding +/// 4. Waits for the child VM process to exit +#[allow(clippy::too_many_arguments)] +pub fn gateway_cluster( + cluster_name: &str, + image: Option<&str>, + gateway_port: u16, + kube_port: Option, + vcpus: u8, + mem: u32, + state_dir: Option<&Path>, + log_level: u32, +) -> Result<()> { + use navigator_gateway::KrunContext; + + // Resolve the cluster image. Priority: + // 1. Explicit --image flag + // 2. NAVIGATOR_CLUSTER_IMAGE env var + // 3. 
Default: navigator/cluster:dev (local build) + let resolved_image; + let image_ref = if let Some(img) = image { + img + } else if let Ok(img) = std::env::var("NAVIGATOR_CLUSTER_IMAGE") { + resolved_image = img; + &resolved_image + } else { + "navigator/cluster:dev" + }; + + // Determine directories. + let data_dir = if let Some(dir) = state_dir { + dir.to_path_buf() + } else { + let base = std::env::var("XDG_DATA_HOME") + .map(PathBuf::from) + .unwrap_or_else(|_| { + let home = std::env::var("HOME").unwrap_or_else(|_| ".".to_string()); + PathBuf::from(home).join(".local/share") + }); + base.join("navigator/gateway-cluster") + }; + let rootfs_dir = data_dir.join("rootfs"); + let k3s_state_dir = data_dir.join("k3s-state"); + let console_log = data_dir.join("console.log"); + + // Step 1: Extract rootfs from Docker image if not already present. + if !rootfs_dir.join("bin").is_dir() { + println!("Extracting rootfs from Docker image: {image_ref}"); + extract_rootfs_from_docker(image_ref, &rootfs_dir)?; + } else { + println!("Using cached rootfs: {}", rootfs_dir.display()); + } + + // Always update vm-init.sh in the rootfs — it's embedded at build time + // and may contain fixes (e.g., networking changes) that the cached rootfs + // doesn't have. + update_vm_init_script(&rootfs_dir)?; + + // Create k3s state directory. + std::fs::create_dir_all(&k3s_state_dir) + .map_err(|e| miette::miette!("failed to create k3s state dir: {e}"))?; + + // Step 2: Start gvproxy for virtio-net networking. + // + // gvproxy provides a user-mode network backend that gives the guest a real + // eth0 interface with DHCP (192.168.127.0/24). This replaces TSI, which + // breaks k3s by intercepting all localhost connections. + // + // Port forwarding is handled via gvproxy's HTTP API, not krun_set_port_map. + let gvproxy_sock = data_dir.join("gvproxy.sock"); + let gvproxy_api_sock = data_dir.join("gvproxy-api.sock"); + + // Clean up stale sockets from previous runs. 
+ let _ = std::fs::remove_file(&gvproxy_sock); + let _ = std::fs::remove_file(&gvproxy_api_sock); + + println!("Starting gvproxy network backend..."); + let mut gvproxy_child = Command::new("/opt/podman/bin/gvproxy") + .arg("-listen-vfkit") + .arg(format!("unixgram://{}", gvproxy_sock.display())) + .arg("-listen") + .arg(format!("unix://{}", gvproxy_api_sock.display())) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .spawn() + .map_err(|e| miette::miette!("failed to start gvproxy: {e}"))?; + + // Wait for gvproxy to create its sockets. + for _ in 0..20 { + if gvproxy_sock.exists() && gvproxy_api_sock.exists() { + break; + } + std::thread::sleep(Duration::from_millis(100)); + } + if !gvproxy_sock.exists() { + gvproxy_child.kill().ok(); + return Err(miette::miette!( + "gvproxy failed to create socket at {}", + gvproxy_sock.display() + )); + } + + // Step 3: Configure port forwarding via gvproxy HTTP API. + // + // gvproxy's DHCP server assigns guest IPs from 192.168.127.0/24. + // The gateway is .1. The guest uses a static IP (.2) configured in + // vm-init.sh — we don't use DHCP because gvproxy assigns IPs + // nondeterministically (.2 or .3 depending on timing). + let guest_ip = "192.168.127.2"; + + // We always need the kube API port forwarded for health checking and + // kubeconfig extraction. If the user didn't specify --kube-port, pick + // an ephemeral port. 
+ let effective_kube_port = + kube_port.unwrap_or_else(|| navigator_bootstrap::pick_available_port().unwrap_or(6444)); + + let forward_ports = |local_port: u16, remote_port: u16| -> Result<()> { + let body = format!( + r#"{{"local":":{}","remote":"{}:{}","protocol":"tcp"}}"#, + local_port, guest_ip, remote_port + ); + let output = Command::new("curl") + .args([ + "--unix-socket", + &gvproxy_api_sock.to_string_lossy(), + "-s", + "-X", + "POST", + "http://localhost/services/forwarder/expose", + "-d", + &body, + ]) + .output() + .map_err(|e| miette::miette!("failed to configure port forwarding: {e}"))?; + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(miette::miette!("gvproxy port forward failed: {stderr}")); + } + Ok(()) + }; + + // Forward the navigator gateway port. + forward_ports(gateway_port, 30051)?; + // Always forward the kube API port (for health checks + kubeconfig). + forward_ports(effective_kube_port, 6443)?; + + // Step 4: Build the microVM configuration. + println!("Booting k3s cluster in microVM..."); + println!(" image: {image_ref}"); + println!(" rootfs: {}", rootfs_dir.display()); + println!(" vcpus: {vcpus}"); + println!(" memory: {mem} MiB"); + println!(" network: gvproxy (guest IP: {guest_ip})"); + println!(" gateway: localhost:{gateway_port} -> guest:30051"); + println!( + " kube API: localhost:{effective_kube_port} -> guest:6443{}", + if kube_port.is_none() { + " (internal)" + } else { + "" + } + ); + println!(" state: {}", k3s_state_dir.display()); + println!(" console: {}", console_log.display()); + println!(); + + // vm-init.sh handles network setup then execs into k3s with these args. + // The libkrunfw kernel does not include netfilter/iptables modules, so we + // must disable kube-proxy and flannel (both require iptables). This is fine + // because the microVM only needs the API server + controllers for navigator. + // + // --data-dir /run/k3s puts k3s state on tmpfs. 
SQLite (used by kine) has + // file locking issues on virtio-fs, causing RBAC bootstrap timeouts. tmpfs + // provides proper POSIX locking and much faster I/O. State is lost on VM + // restart, but this is acceptable for development clusters. + let init_args: Vec<&str> = vec![ + "server", + "--data-dir=/run/k3s", + "--disable=traefik,servicelb,metrics-server", + "--disable-kube-proxy", + "--flannel-backend=none", + "--disable-network-policy", + "--tls-san=127.0.0.1", + "--tls-san=localhost", + "--tls-san=192.168.127.2", + "--tls-san=192.168.127.3", + ]; + + // Environment variables for the VM. + let env_vars = vec![ + "HOME=/root".to_string(), + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(), + "TERM=xterm".to_string(), + ]; + + let builder = KrunContext::builder() + .vcpus(vcpus) + .memory_mib(mem) + .rootfs(&rootfs_dir) + .workdir("/") + .exec("/usr/local/bin/vm-init.sh", &init_args) + .env(Some(env_vars)) + .log_level(log_level) + .console_output(&console_log) + // Use gvproxy for networking — this disables TSI automatically. + // TSI cannot be used with k3s because it intercepts ALL guest inet + // connect() calls and proxies them to the host, which breaks the + // k3s API server's internal localhost connections. + .net_gvproxy(&gvproxy_sock); + + let ctx = builder + .build() + .map_err(|e| miette::miette!("failed to configure cluster microVM: {e}"))?; + + // Step 5: Fork and start the VM. + let child_pid = ctx + .fork_start() + .map_err(|e| miette::miette!("failed to start cluster microVM: {e}"))?; + + println!("microVM started (child PID: {child_pid})"); + println!("Waiting for k3s to become ready..."); + println!( + " (tail -f {} for VM console output)", + console_log.display() + ); + println!(); + + // Step 6: Wait for k3s API server to become ready. + // + // We poll the /readyz endpoint on the forwarded kube port. 
k3s typically + // takes 15-45 seconds to boot, but the kine race condition on tmpfs can + // cause it to crash and restart once, so we allow up to 120 seconds. + let readyz_url = format!("https://localhost:{effective_kube_port}/readyz"); + let health_timeout = Duration::from_secs(120); + let health_interval = Duration::from_secs(2); + let start = Instant::now(); + + let mut api_ready = false; + while start.elapsed() < health_timeout { + // Check if the child process is still alive. + if !navigator_gateway::is_pid_alive(child_pid) { + // Child exited before becoming ready. + return Err(miette::miette!( + "microVM exited before k3s became ready (PID: {child_pid})\n\ + Check console log: {}", + console_log.display() + )); + } + + // Probe the readyz endpoint (skip TLS verification — self-signed cert). + let probe = Command::new("curl") + .args(["-sk", "--max-time", "2", &readyz_url]) + .output(); + + if let Ok(output) = probe { + let body = String::from_utf8_lossy(&output.stdout); + if body.contains("ok") { + api_ready = true; + break; + } + } + + std::thread::sleep(health_interval); + } + + if !api_ready { + // Don't kill the VM — it may still be starting up. Just warn. + eprintln!( + "warning: k3s API server did not become ready within {}s", + health_timeout.as_secs() + ); + eprintln!( + " The VM is still running (PID: {child_pid}). Check: tail -f {}", + console_log.display() + ); + } + + // Step 7: Extract and store kubeconfig. + // + // k3s writes its kubeconfig to /etc/rancher/k3s/k3s.yaml inside the guest. + // Since the rootfs is mapped via virtio-fs, we can read it directly from + // the host filesystem. 
+ if api_ready { + let elapsed = start.elapsed(); + println!("k3s API server ready ({:.1}s)", elapsed.as_secs_f64()); + + let kubeconfig_guest_path = rootfs_dir.join("etc/rancher/k3s/k3s.yaml"); + match std::fs::read_to_string(&kubeconfig_guest_path) { + Ok(raw_kubeconfig) if is_valid_kubeconfig(&raw_kubeconfig) => { + // Rewrite the kubeconfig: point server URL to the forwarded port + // and rename the cluster/context/user entries. + let rewritten = navigator_bootstrap::rewrite_kubeconfig( + &raw_kubeconfig, + cluster_name, + Some(effective_kube_port), + ); + + // Store in the standard navigator kubeconfig location. + let kubeconfig_path = navigator_bootstrap::stored_kubeconfig_path(cluster_name) + .map_err(|e| miette::miette!("failed to resolve kubeconfig path: {e}"))?; + navigator_bootstrap::store_kubeconfig(&kubeconfig_path, &rewritten) + .map_err(|e| miette::miette!("failed to store kubeconfig: {e}"))?; + + println!("Kubeconfig written to {}", kubeconfig_path.display()); + println!(); + println!("Cluster is ready! To use:"); + println!(" export KUBECONFIG={}", kubeconfig_path.display()); + println!(" kubectl get nodes"); + } + Ok(_) => { + eprintln!( + "warning: kubeconfig at {} is not valid yet", + kubeconfig_guest_path.display() + ); + } + Err(e) => { + eprintln!( + "warning: could not read kubeconfig from {}: {e}", + kubeconfig_guest_path.display() + ); + eprintln!(" k3s may not have written it yet; the VM is still running."); + } + } + } + + println!(); + println!("Press Ctrl+C to stop the cluster."); + + // Step 8: Wait for the child process (blocks until VM exits or signal). + // + // The default SIGINT handler sends SIGINT to the entire process group, + // which includes the forked VM child. So Ctrl+C will naturally propagate + // to the VM. After the child exits, we clean up gvproxy. 
+ let status = wait_for_child(child_pid)?; + if status == 0 { + println!("microVM exited cleanly"); + } else if status == 130 { + // 128 + SIGINT(2) = 130 — user pressed Ctrl+C. + println!("microVM stopped"); + } else { + eprintln!("microVM exited with status {status}"); + } + + // Clean up gvproxy. + gvproxy_child.kill().ok(); + gvproxy_child.wait().ok(); + let _ = std::fs::remove_file(&gvproxy_sock); + let _ = std::fs::remove_file(&gvproxy_api_sock); + + Ok(()) +} + +/// Check if a string looks like a valid kubeconfig. +fn is_valid_kubeconfig(contents: &str) -> bool { + contents.contains("apiVersion:") && contents.contains("clusters:") +} + +/// Extract a rootfs from a Docker image by creating a temporary container +/// and exporting its filesystem. +fn extract_rootfs_from_docker(image_ref: &str, rootfs_dir: &Path) -> Result<()> { + use std::process::Command; + + // Ensure the image exists locally. + let pull_status = Command::new("docker") + .args(["image", "inspect", image_ref]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .map_err(|e| miette::miette!("failed to run docker: {e}"))?; + + if !pull_status.success() { + println!(" Pulling image {image_ref}..."); + let pull = Command::new("docker") + .args(["pull", image_ref]) + .status() + .map_err(|e| miette::miette!("docker pull failed: {e}"))?; + if !pull.success() { + return Err(miette::miette!("docker pull failed for {image_ref}")); + } + } + + // Create a temporary container (don't start it). + let container_name = format!("navigator-rootfs-extract-{}", std::process::id()); + let create = Command::new("docker") + .args(["create", "--name", &container_name, image_ref, "/bin/true"]) + .stdout(std::process::Stdio::null()) + .status() + .map_err(|e| miette::miette!("docker create failed: {e}"))?; + if !create.success() { + return Err(miette::miette!("docker create failed for {image_ref}")); + } + + // Export the container filesystem as a tar stream and extract it. 
+ std::fs::create_dir_all(rootfs_dir) + .map_err(|e| miette::miette!("failed to create rootfs dir: {e}"))?; + + println!(" Exporting container filesystem..."); + let mut export = Command::new("docker") + .args(["export", &container_name]) + .stdout(std::process::Stdio::piped()) + .spawn() + .map_err(|e| miette::miette!("docker export failed: {e}"))?; + + let tar_status = Command::new("tar") + .args(["xf", "-", "-C"]) + .arg(rootfs_dir) + .stdin(export.stdout.take().unwrap()) + .status() + .map_err(|e| miette::miette!("tar extract failed: {e}"))?; + + // Wait for docker export to finish and check its exit status. + let export_status = export + .wait() + .map_err(|e| miette::miette!("failed to wait for docker export: {e}"))?; + if !export_status.success() { + let _ = Command::new("docker") + .args(["rm", "-f", &container_name]) + .status(); + return Err(miette::miette!( + "docker export failed with status {export_status}" + )); + } + + if !tar_status.success() { + // Clean up on failure. + let _ = Command::new("docker") + .args(["rm", "-f", &container_name]) + .status(); + return Err(miette::miette!( + "failed to extract rootfs from Docker image" + )); + } + + // Clean up the temporary container. + let _ = Command::new("docker") + .args(["rm", "-f", &container_name]) + .status(); + + update_vm_init_script(rootfs_dir)?; + + println!(" Rootfs extracted to {}", rootfs_dir.display()); + Ok(()) +} + +/// Wait for a child process to exit and return its exit status. +fn wait_for_child(pid: u32) -> Result { + navigator_gateway::wait_for_pid(pid).map_err(|e| miette::miette!("waitpid failed: {e}")) +} + +/// Write the embedded vm-init.sh script into the rootfs. +/// +/// This is called both during initial rootfs extraction and on every +/// cluster boot (to pick up fixes baked into the binary without requiring +/// a full rootfs re-extract). 
+fn update_vm_init_script(rootfs_dir: &Path) -> Result<()> { + let init_script = include_bytes!("../../../deploy/gateway/vm-init.sh"); + let init_path = rootfs_dir.join("usr/local/bin/vm-init.sh"); + std::fs::create_dir_all(init_path.parent().unwrap()) + .map_err(|e| miette::miette!("failed to create vm-init.sh parent dir: {e}"))?; + std::fs::write(&init_path, init_script) + .map_err(|e| miette::miette!("failed to write vm-init.sh: {e}"))?; + + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&init_path, std::fs::Permissions::from_mode(0o755)) + .map_err(|e| miette::miette!("failed to chmod vm-init.sh: {e}"))?; + } + Ok(()) +} + #[cfg(test)] mod tests { use super::{inferred_provider_type, parse_credential_pairs, resolve_route_protocols}; diff --git a/crates/navigator-gateway/Cargo.toml b/crates/navigator-gateway/Cargo.toml new file mode 100644 index 00000000..668d3db3 --- /dev/null +++ b/crates/navigator-gateway/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "navigator-gateway" +description = "MicroVM gateway using libkrun for hardware-isolated process execution" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true + +[dependencies] +libc = "0.2" +thiserror = { workspace = true } +tracing = { workspace = true } + +[lints] +workspace = true diff --git a/crates/navigator-gateway/build.rs b/crates/navigator-gateway/build.rs new file mode 100644 index 00000000..b9f5d5b4 --- /dev/null +++ b/crates/navigator-gateway/build.rs @@ -0,0 +1,43 @@ +use std::process::Command; + +fn main() { + // Tell cargo to link against libkrun (the system dynamic library). + // On macOS this expects libkrun.dylib to be findable by the linker. + println!("cargo:rustc-link-lib=dylib=krun"); + + // Discover Homebrew install prefixes for libkrun and libkrunfw. 
+ // We need both: + // - link-search: so the *linker* can find the .dylib at build time + // - link-arg -rpath: so the *dynamic linker* (dyld) can find them at runtime + // + // Without the rpath entries, the binary would require DYLD_LIBRARY_PATH + // to be set, which is fragile and easy to forget. + + for formula in &["libkrun", "libkrunfw"] { + if let Some(lib_dir) = brew_lib_path(formula) { + println!("cargo:rustc-link-search=native={lib_dir}"); + // NOTE: cargo:rustc-link-arg from a *library* crate does NOT + // propagate to the final binary. The rpath is set in + // navigator-cli's build.rs instead. + } + } +} + +/// Ask Homebrew for the install prefix of a formula and return its `lib/` path. +fn brew_lib_path(formula: &str) -> Option { + let output = Command::new("brew") + .args(["--prefix", formula]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let prefix = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if prefix.is_empty() { + return None; + } + + Some(format!("{prefix}/lib")) +} diff --git a/crates/navigator-gateway/entitlements.plist b/crates/navigator-gateway/entitlements.plist new file mode 100644 index 00000000..154f3308 --- /dev/null +++ b/crates/navigator-gateway/entitlements.plist @@ -0,0 +1,8 @@ + + + + + com.apple.security.hypervisor + + + diff --git a/crates/navigator-gateway/src/context.rs b/crates/navigator-gateway/src/context.rs new file mode 100644 index 00000000..641b1b6c --- /dev/null +++ b/crates/navigator-gateway/src/context.rs @@ -0,0 +1,630 @@ +//! Safe wrapper around the libkrun configuration context and VM lifecycle. +//! +//! The main entry point is [`KrunContextBuilder`], obtained via +//! [`KrunContext::builder()`]. After configuring the VM parameters, call +//! [`.build()`](KrunContextBuilder::build) to create a [`KrunContext`], then +//! [`.start_enter()`](KrunContext::start_enter) to boot the microVM in the +//! 
current process, or [`.fork_start()`](KrunContext::fork_start) to boot it +//! in a child process while the parent retains control. + +use std::ffi::{CString, c_char}; +use std::mem::ManuallyDrop; +use std::path::{Path, PathBuf}; +use std::ptr; + +use tracing::{debug, info}; + +use crate::error::GatewayError; +use crate::ffi; + +/// A configured libkrun microVM context, ready to be started. +/// +/// Owns the libkrun context ID and frees it on drop (unless consumed by +/// `start_enter`, which never returns). +pub struct KrunContext { + ctx_id: u32, + /// If set, `fork_start()` redirects the child's stderr to this file + /// so that libkrun VMM warnings (e.g., virtio-fs passthrough) don't + /// leak to the parent's terminal. + console_output: Option, +} + +impl KrunContext { + /// Create a new builder for configuring a microVM. + pub fn builder() -> KrunContextBuilder { + KrunContextBuilder::default() + } + + /// Boot the microVM and enter it (direct model). + /// + /// # Never returns + /// + /// On success, this function **never returns**. The libkrun VMM takes over + /// the process and calls `exit()` with the guest workload's exit code when + /// the VM shuts down. + /// + /// The only way this function returns is if libkrun encounters an error + /// before actually starting the VM. + pub fn start_enter(self) -> Result<(), GatewayError> { + // Prevent Drop from running -- krun_start_enter consumes the context + // and will exit() the process, so we must not call krun_free_ctx. + let this = ManuallyDrop::new(self); + + // Raise RLIMIT_NOFILE to the maximum allowed. virtio-fs (used by + // krun_set_root) needs a large number of file descriptors to map the + // host directory into the guest. The chroot_vm reference example does + // the same thing. 
+ raise_nofile_limit(); + + info!( + ctx_id = this.ctx_id, + "starting microVM (this process will be taken over)" + ); + + let ret = unsafe { ffi::krun_start_enter(this.ctx_id) }; + + // If we reach here, it means krun_start_enter failed. + Err(GatewayError::StartFailed(ret)) + } + + /// Boot the microVM in a forked child process. + /// + /// The parent process retains control and receives the child's PID. + /// The child process calls `krun_start_enter()`, which never returns on + /// success. + /// + /// # Returns + /// + /// - `Ok(child_pid)` in the parent process + /// - Never returns in the child (on success) + /// - `Err(...)` if the fork fails or the VM fails to start in the child + /// + /// # Safety + /// + /// After `fork()`, the child inherits all file descriptors and memory. + /// `krun_start_enter()` takes over the child process immediately, so + /// no Rust destructors run in the child. This is safe because + /// `krun_start_enter` calls `exit()` directly. + pub fn fork_start(self) -> Result { + raise_nofile_limit(); + + info!(ctx_id = self.ctx_id, "forking to start microVM in child"); + + // Prevent Drop from running in EITHER process. After fork(), the + // parent and child share kernel-level hypervisor resources (e.g., + // Hypervisor.framework VM handles on macOS). If the parent calls + // krun_free_ctx(), it destroys the VM the child is about to start. + // The child's krun_start_enter() consumes the context and calls + // exit() when the VM shuts down, so cleanup is not needed there + // either. + let this = ManuallyDrop::new(self); + + let pid = unsafe { libc::fork() }; + + if pid < 0 { + return Err(GatewayError::Fork(std::io::Error::last_os_error())); + } + + if pid == 0 { + // Child process: redirect stderr to the console log file (if + // configured) so libkrun VMM warnings don't leak to the parent + // terminal. The VMM's virtio-fs passthrough generates WARN-level + // logs on stderr that are confusing when mixed with CLI output. 
+ if let Some(ref console_path) = this.console_output { + redirect_stderr_to_file(console_path); + } + + // Start the VM. This never returns on success. + let ret = unsafe { ffi::krun_start_enter(this.ctx_id) }; + // If we reach here, start failed. Exit with an error code so the + // parent can detect it. + std::process::exit(ret.unsigned_abs().cast_signed()); + } + + // Parent process: return the child PID. + // We intentionally leak the KrunContext (ManuallyDrop) to avoid + // destroying the child's VM. The kernel cleans up when the child + // exits. + debug!(child_pid = pid, "microVM child process started"); + #[expect(clippy::cast_sign_loss, reason = "checked non-negative above")] + Ok(pid as u32) + } +} + +impl Drop for KrunContext { + fn drop(&mut self) { + debug!(ctx_id = self.ctx_id, "freeing libkrun context"); + unsafe { + ffi::krun_free_ctx(self.ctx_id); + } + } +} + +/// A port mapping entry for the microVM (`host_port` -> `guest_port`). +#[derive(Debug, Clone)] +pub struct PortMapping { + /// Port on the host. + pub host_port: u16, + /// Port inside the guest VM. + pub guest_port: u16, +} + +impl PortMapping { + /// Create a new port mapping. + pub fn new(host_port: u16, guest_port: u16) -> Self { + Self { + host_port, + guest_port, + } + } +} + +impl std::fmt::Display for PortMapping { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}", self.host_port, self.guest_port) + } +} + +/// A virtio-fs volume mount (`host_path` -> `guest_tag`). +#[derive(Debug, Clone)] +pub struct VirtiofsMount { + /// Tag to identify the filesystem in the guest (used in mount command). + pub tag: String, + /// Full path to the host directory to expose. + pub host_path: PathBuf, +} + +impl VirtiofsMount { + /// Create a new virtio-fs mount. 
+ pub fn new(tag: impl Into, host_path: impl AsRef) -> Self { + Self { + tag: tag.into(), + host_path: host_path.as_ref().to_path_buf(), + } + } +} + +/// Builder for configuring and creating a [`KrunContext`]. +/// +/// # Example +/// +/// ```no_run +/// use navigator_gateway::KrunContext; +/// +/// let ctx = KrunContext::builder() +/// .vcpus(1) +/// .memory_mib(128) +/// .rootfs("./my-rootfs") +/// .workdir("/") +/// .exec("/bin/echo", &["Hello from microVM!"]) +/// .build() +/// .expect("failed to configure microVM"); +/// +/// // This never returns on success: +/// ctx.start_enter().expect("failed to start microVM"); +/// ``` +pub struct KrunContextBuilder { + vcpus: u8, + memory_mib: u32, + rootfs: Option, + workdir: Option, + exec_path: Option, + args: Vec, + env: Option>, + log_level: u32, + port_map: Vec, + virtiofs_mounts: Vec, + console_output: Option, + disable_tsi: bool, + /// Path to a gvproxy Unix datagram socket for virtio-net networking. + /// When set, TSI is automatically disabled by libkrun and the guest + /// gets a real `eth0` interface with DHCP from gvproxy. + net_gvproxy: Option, +} + +impl Default for KrunContextBuilder { + fn default() -> Self { + Self { + vcpus: 1, + memory_mib: 128, + rootfs: None, + workdir: None, + exec_path: None, + args: Vec::new(), + env: None, + log_level: ffi::KRUN_LOG_LEVEL_WARN, + port_map: Vec::new(), + virtiofs_mounts: Vec::new(), + console_output: None, + disable_tsi: false, + net_gvproxy: None, + } + } +} + +#[allow(clippy::return_self_not_must_use)] +impl KrunContextBuilder { + /// Set the number of virtual CPUs for the microVM. + pub fn vcpus(mut self, n: u8) -> Self { + self.vcpus = n; + self + } + + /// Set the amount of RAM in MiB for the microVM. + pub fn memory_mib(mut self, mib: u32) -> Self { + self.memory_mib = mib; + self + } + + /// Set the host directory to be used as the VM's root filesystem. + /// + /// This directory is mapped into the VM via virtio-fs. 
It must contain + /// an aarch64 Linux userspace (e.g., Alpine minirootfs). + pub fn rootfs(mut self, path: impl AsRef) -> Self { + self.rootfs = Some(path.as_ref().to_path_buf()); + self + } + + /// Set the working directory inside the VM (relative to rootfs). + pub fn workdir(mut self, path: impl Into) -> Self { + self.workdir = Some(path.into()); + self + } + + /// Set the executable to run inside the VM and its arguments. + /// + /// The `exec_path` is relative to the rootfs. + pub fn exec(mut self, exec_path: impl Into, args: &[impl AsRef]) -> Self { + self.exec_path = Some(exec_path.into()); + self.args = args.iter().map(|a| a.as_ref().to_string()).collect(); + self + } + + /// Set environment variables for the guest process. + /// + /// Each entry should be in `KEY=VALUE` format. If not called (or called + /// with `None`), a minimal default environment is used. + pub fn env(mut self, vars: Option>) -> Self { + self.env = vars; + self + } + + /// Set the libkrun log level (0=Off .. 5=Trace). Default is 2 (Warn). + pub fn log_level(mut self, level: u32) -> Self { + self.log_level = level; + self + } + + /// Add a TCP port mapping from host to guest. + /// + /// The port will be accessible on `host_port` from the host and will + /// forward to `guest_port` inside the VM. Note that libkrun also makes + /// the port accessible inside the guest via `host_port`. + pub fn port_map(mut self, host_port: u16, guest_port: u16) -> Self { + self.port_map.push(PortMapping::new(host_port, guest_port)); + self + } + + /// Add multiple TCP port mappings at once. + pub fn port_maps(mut self, mappings: impl IntoIterator) -> Self { + self.port_map.extend(mappings); + self + } + + /// Add a virtio-fs volume mount. + /// + /// The host directory at `host_path` will be available inside the guest + /// as a virtio-fs filesystem with the given `tag`. The guest must mount + /// it explicitly: `mount -t virtiofs `. 
+ pub fn virtiofs(mut self, tag: impl Into, host_path: impl AsRef) -> Self { + self.virtiofs_mounts + .push(VirtiofsMount::new(tag, host_path)); + self + } + + /// Redirect VM console output to a file instead of stdout. + /// + /// When set, the VM's console device ignores stdin and writes all output + /// to the specified file. Useful when the VM runs in a forked child and + /// the parent needs to capture output. + pub fn console_output(mut self, path: impl AsRef) -> Self { + self.console_output = Some(path.as_ref().to_path_buf()); + self + } + + /// Use gvproxy for virtio-net networking instead of TSI. + /// + /// When set, libkrun adds a virtio-net device backed by the gvproxy + /// Unix datagram socket at the given path. This **automatically disables + /// TSI**, so the guest gets a real `eth0` interface with DHCP from + /// gvproxy (default subnet: 192.168.127.0/24, gateway: 192.168.127.1). + /// The guest IP is assigned by DHCP — with gvproxy v0.8.6, the first + /// client gets 192.168.127.3 (not .2 as some docs suggest). + /// + /// Port forwarding is handled by gvproxy's HTTP API, not by + /// `krun_set_port_map` (which is TSI-only). + /// + /// Note: When using gvproxy, `port_map` entries are ignored by libkrun. + /// Use gvproxy's HTTP API endpoint to configure port forwarding instead. + pub fn net_gvproxy(mut self, socket_path: impl AsRef) -> Self { + self.net_gvproxy = Some(socket_path.as_ref().to_path_buf()); + self + } + + /// Disable TSI (Transparent Socket Impersonation) for the microVM. + /// + /// When enabled, libkrun's implicit vsock (which hijacks all guest + /// `connect()` syscalls on inet sockets) is replaced with a vsock + /// device that has no TSI features. This allows localhost traffic + /// inside the guest to flow through the real kernel loopback instead + /// of being tunnelled through vsock to the host. 
+ /// + /// This is required for workloads like k3s that make many concurrent + /// internal localhost connections (API server, kine, controllers). + /// TSI intercepts those connections and overwhelms the vsock muxer, + /// causing deadlocks. + /// + /// Port mapping via `krun_set_port_map` still works because it uses + /// the vsock device (with `tsi_features = 0`, only explicit port + /// mappings are forwarded). + pub fn disable_tsi(mut self, disable: bool) -> Self { + self.disable_tsi = disable; + self + } + + /// Build the [`KrunContext`] by calling the libkrun C API to create and + /// configure the microVM. + /// + /// # Errors + /// + /// Returns [`GatewayError`] if the rootfs doesn't exist, if any libkrun + /// API call fails, or if string arguments contain interior null bytes. + pub fn build(self) -> Result { + // Validate rootfs exists. + let rootfs = self + .rootfs + .as_ref() + .ok_or_else(|| GatewayError::RootfsNotFound(PathBuf::from("")))?; + + if !rootfs.is_dir() { + return Err(GatewayError::RootfsNotFound(rootfs.clone())); + } + + let exec_path = self.exec_path.as_deref().unwrap_or("/bin/sh"); + + // Set log level. + check_ret("krun_set_log_level", unsafe { + ffi::krun_set_log_level(self.log_level) + })?; + + // Create the libkrun context. + let ctx_id = unsafe { ffi::krun_create_ctx() }; + if ctx_id < 0 { + return Err(GatewayError::ContextCreation(ctx_id)); + } + #[expect(clippy::cast_sign_loss, reason = "checked non-negative above")] + let ctx_id = ctx_id as u32; + + debug!( + ctx_id, + vcpus = self.vcpus, + ram_mib = self.memory_mib, + "configuring microVM" + ); + + // From here on, if we hit an error we need to clean up the context. + // We'll create KrunContext now so Drop handles it. + let ctx = KrunContext { + ctx_id, + console_output: self.console_output.clone(), + }; + + // Configure VM resources. + check_ret("krun_set_vm_config", unsafe { + ffi::krun_set_vm_config(ctx_id, self.vcpus, self.memory_mib) + })?; + + // Set root filesystem. 
+ let c_rootfs = path_to_cstring(rootfs)?; + check_ret("krun_set_root", unsafe { + ffi::krun_set_root(ctx_id, c_rootfs.as_ptr()) + })?; + + // Set working directory. + if let Some(ref workdir) = self.workdir { + let c_workdir = CString::new(workdir.as_str())?; + check_ret("krun_set_workdir", unsafe { + ffi::krun_set_workdir(ctx_id, c_workdir.as_ptr()) + })?; + } + + // Configure gvproxy-based virtio-net networking. + // + // When a net device is added, libkrun automatically disables TSI. + // The guest gets a real eth0 with DHCP from gvproxy. This MUST be + // called before krun_set_port_map (per libkrun.h). + if let Some(ref gvproxy_path) = self.net_gvproxy { + let c_path = path_to_cstring(gvproxy_path)?; + // Default MAC address for the guest. + let mac: [u8; 6] = [0x02, 0x42, 0xAC, 0x11, 0x00, 0x02]; + + debug!( + path = %gvproxy_path.display(), + "adding gvproxy virtio-net device (disables TSI)" + ); + check_ret("krun_add_net_unixgram", unsafe { + ffi::krun_add_net_unixgram( + ctx_id, + c_path.as_ptr(), + -1, // no fd, use path + mac.as_ptr(), + ffi::COMPAT_NET_FEATURES, + ffi::NET_FLAG_VFKIT, + ) + })?; + } + + // Configure port mapping (TSI-only, skipped when gvproxy is used). + if !self.port_map.is_empty() { + let map_strings: Vec = self.port_map.iter().map(ToString::to_string).collect(); + let c_map_strings = to_cstring_vec(&map_strings)?; + let c_port_map = to_ptr_array(&c_map_strings); + + debug!(?map_strings, "setting port map"); + check_ret("krun_set_port_map", unsafe { + ffi::krun_set_port_map(ctx_id, c_port_map.as_ptr()) + })?; + } + + // Configure virtio-fs volume mounts. 
+ for mount in &self.virtiofs_mounts { + let c_tag = CString::new(mount.tag.as_str())?; + let c_path = path_to_cstring(&mount.host_path)?; + + debug!(tag = mount.tag, path = %mount.host_path.display(), "adding virtiofs mount"); + check_ret("krun_add_virtiofs", unsafe { + ffi::krun_add_virtiofs(ctx_id, c_tag.as_ptr(), c_path.as_ptr()) + })?; + } + + // Configure console output redirection. + if let Some(ref console_path) = self.console_output { + let c_console = path_to_cstring(console_path)?; + check_ret("krun_set_console_output", unsafe { + ffi::krun_set_console_output(ctx_id, c_console.as_ptr()) + })?; + } + + // Disable TSI (Transparent Socket Impersonation) if requested. + // + // TSI intercepts ALL guest connect() syscalls on inet sockets and + // tunnels them through vsock to the host. This breaks workloads + // that rely on internal localhost connections (e.g., k3s). + // + // We replace the implicit vsock with a bare vsock (tsi_features=0) + // so that only explicit port mappings are forwarded while localhost + // traffic stays inside the guest kernel. + if self.disable_tsi { + debug!(ctx_id, "disabling TSI (transparent socket impersonation)"); + check_ret("krun_disable_implicit_vsock", unsafe { + ffi::krun_disable_implicit_vsock(ctx_id) + })?; + check_ret("krun_add_vsock", unsafe { ffi::krun_add_vsock(ctx_id, 0) })?; + } + + // Set executable, arguments, and environment. + let c_exec = CString::new(exec_path)?; + let c_args = to_cstring_vec(&self.args)?; + let c_arg_ptrs = to_ptr_array(&c_args); + + // If no explicit env was provided, use a minimal default environment. + // We must NOT pass NULL to krun_set_exec's envp parameter because + // libkrun would then serialize the entire host environment into the + // kernel command line, which easily overflows its 4096-byte limit + // on developer machines with large PATH/etc. 
+ let default_env = vec![ + "HOME=/root".to_string(), + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(), + "TERM=xterm".to_string(), + ]; + let env_ref = self.env.as_ref().unwrap_or(&default_env); + let c_env_strings = to_cstring_vec(env_ref)?; + let c_envp = to_ptr_array(&c_env_strings); + + check_ret("krun_set_exec", unsafe { + ffi::krun_set_exec( + ctx_id, + c_exec.as_ptr(), + c_arg_ptrs.as_ptr(), + c_envp.as_ptr(), + ) + })?; + + info!( + ctx_id, + rootfs = %rootfs.display(), + exec = exec_path, + ports = ?self.port_map.iter().map(ToString::to_string).collect::>(), + virtiofs = self.virtiofs_mounts.len(), + "microVM configured successfully" + ); + + Ok(ctx) + } +} + +/// Check a libkrun return code; zero means success, negative means error. +fn check_ret(call: &'static str, ret: i32) -> Result<(), GatewayError> { + if ret < 0 { + Err(GatewayError::Configuration { call, code: ret }) + } else { + Ok(()) + } +} + +/// Convert a `Path` to a `CString`. +fn path_to_cstring(path: &Path) -> Result { + let s = path.to_str().ok_or(GatewayError::Configuration { + call: "path_to_cstring", + code: -1, + })?; + Ok(CString::new(s)?) +} + +/// Convert a slice of strings to a `Vec`. +fn to_cstring_vec(strings: &[String]) -> Result, GatewayError> { + strings + .iter() + .map(|s| Ok(CString::new(s.as_str())?)) + .collect() +} + +/// Create a null-terminated array of C string pointers suitable for passing +/// to libkrun functions that expect `const char *const argv[]`. +/// +/// The returned `Vec` contains pointers into the `CString` values (which must +/// outlive the returned `Vec`) followed by a null terminator. +fn to_ptr_array(strings: &[CString]) -> Vec<*const c_char> { + let mut ptrs: Vec<*const c_char> = strings.iter().map(|s| s.as_ptr()).collect(); + ptrs.push(ptr::null()); + ptrs +} + +/// Redirect stderr (fd 2) to a file. Used in the forked child process to +/// prevent libkrun VMM log messages from appearing on the parent's terminal. 
+/// +/// Best-effort: if the file can't be opened, stderr is left unchanged. +fn redirect_stderr_to_file(path: &Path) { + use std::fs::OpenOptions; + use std::os::unix::io::IntoRawFd; + + if let Ok(file) = OpenOptions::new().create(true).append(true).open(path) { + let fd = file.into_raw_fd(); + unsafe { + libc::dup2(fd, libc::STDERR_FILENO); + libc::close(fd); + } + } +} + +/// Raise `RLIMIT_NOFILE` to the maximum allowed value. +/// +/// virtio-fs (used by `krun_set_root` to map the rootfs directory) requires a +/// large number of file descriptors. Without this, `krun_start_enter` can fail +/// with internal errors. This mirrors what the upstream `chroot_vm` example does. +fn raise_nofile_limit() { + use libc::{RLIMIT_NOFILE, getrlimit, rlimit, setrlimit}; + + let mut rlim = rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + if unsafe { getrlimit(RLIMIT_NOFILE, &raw mut rlim) } == 0 { + rlim.rlim_cur = rlim.rlim_max; + if unsafe { setrlimit(RLIMIT_NOFILE, &raw const rlim) } != 0 { + debug!("failed to raise RLIMIT_NOFILE (non-fatal)"); + } else { + debug!(limit = rlim.rlim_cur, "raised RLIMIT_NOFILE"); + } + } +} diff --git a/crates/navigator-gateway/src/error.rs b/crates/navigator-gateway/src/error.rs new file mode 100644 index 00000000..863b28c2 --- /dev/null +++ b/crates/navigator-gateway/src/error.rs @@ -0,0 +1,37 @@ +//! Error types for the gateway microVM subsystem. + +use std::path::PathBuf; + +/// Errors that can occur when configuring or starting a microVM. +#[derive(Debug, thiserror::Error)] +pub enum GatewayError { + /// libkrun failed to create a configuration context. + #[error("failed to create libkrun context (error code: {0})")] + ContextCreation(i32), + + /// The VM configuration call failed. + #[error("failed to configure VM ({call}): libkrun error code {code}")] + Configuration { + /// Which libkrun API call failed. + call: &'static str, + /// The negative error code returned by libkrun. 
+ code: i32, + }, + + /// The rootfs path provided does not exist or is not a directory. + #[error("rootfs path does not exist or is not a directory: {0}")] + RootfsNotFound(PathBuf), + + /// `krun_start_enter` returned an error instead of booting the VM. + #[error("failed to start microVM (libkrun error code: {0})")] + StartFailed(i32), + + /// `fork()` failed when trying to start the VM in a child process. + #[error("fork failed: {0}")] + Fork(std::io::Error), + + /// A string argument contained an interior null byte and could not be + /// converted to a C string. + #[error("argument contains interior null byte: {0}")] + NulError(#[from] std::ffi::NulError), +} diff --git a/crates/navigator-gateway/src/ffi.rs b/crates/navigator-gateway/src/ffi.rs new file mode 100644 index 00000000..1e7f79a3 --- /dev/null +++ b/crates/navigator-gateway/src/ffi.rs @@ -0,0 +1,186 @@ +//! Raw FFI bindings for the libkrun C API. +//! +//! These are manual declarations for the subset of `libkrun.h` functions +//! needed by the gateway. libkrun is a dynamic library providing +//! virtualization-based process isolation via KVM (Linux) or +//! Hypervisor.framework (macOS ARM64). +//! +//! See: + +use std::ffi::c_char; + +// Log level constants matching libkrun.h. +// Not all are used yet but they form the public API surface for log configuration. +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_OFF: u32 = 0; +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_ERROR: u32 = 1; +pub const KRUN_LOG_LEVEL_WARN: u32 = 2; +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_INFO: u32 = 3; +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_DEBUG: u32 = 4; +#[allow(dead_code)] +pub const KRUN_LOG_LEVEL_TRACE: u32 = 5; + +// Network backend flags from libkrun.h. +/// Send the VFKIT magic after establishing the connection, as required by +/// gvproxy in vfkit mode. +pub const NET_FLAG_VFKIT: u32 = 1 << 0; + +/// Compatible virtio-net features enabled by `krun_set_passt_fd` and +/// `krun_set_gvproxy_path`. 
// Raw libkrun entry points used by the gateway.
//
// Every function reports failure through a negative return value. Pointer
// arguments are C strings (or null-terminated arrays of C strings) that must
// stay valid for the duration of the call.
// NOTE(review): presumably libkrun copies the strings before returning, as the
// callers in this crate drop their `CString`s right after each call — confirm
// against libkrun.h.
unsafe extern "C" {
    /// Sets the log level for the library.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_set_log_level(level: u32) -> i32;

    /// Creates a configuration context.
    ///
    /// Returns the context ID (>= 0) on success or a negative error number on failure.
    pub fn krun_create_ctx() -> i32;

    /// Frees an existing configuration context.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_free_ctx(ctx_id: u32) -> i32;

    /// Sets the basic configuration parameters for the microVM.
    ///
    /// - `num_vcpus`: the number of vCPUs.
    /// - `ram_mib`: the amount of RAM in MiB.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_set_vm_config(ctx_id: u32, num_vcpus: u8, ram_mib: u32) -> i32;

    /// Sets the path to be used as root for the microVM.
    ///
    /// The path is mapped into the VM via virtio-fs. The libkrun init process
    /// uses this as the root filesystem.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_set_root(ctx_id: u32, root_path: *const c_char) -> i32;

    /// Sets the working directory for the executable inside the microVM.
    ///
    /// The path is relative to the root configured with `krun_set_root`.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_set_workdir(ctx_id: u32, workdir_path: *const c_char) -> i32;

    /// Sets the executable path, arguments, and environment variables.
    ///
    /// - `exec_path`: path relative to the root configured with `krun_set_root`.
    /// - `argv`: null-terminated array of argument string pointers.
    /// - `envp`: null-terminated array of environment variable string pointers
    ///   (format: `KEY=VALUE`). If null, inherits the current environment.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_set_exec(
        ctx_id: u32,
        exec_path: *const c_char,
        argv: *const *const c_char,
        envp: *const *const c_char,
    ) -> i32;

    /// Configures a map of host to guest TCP ports for the microVM.
    ///
    /// - `port_map`: null-terminated array of string pointers with format
    ///   `"host_port:guest_port"`.
    ///
    /// Passing NULL instructs libkrun to expose all listening ports in the
    /// guest to the host. Passing an empty (null-terminated) array means no
    /// ports are exposed.
    ///
    /// Exposed ports become accessible by their `host_port` in the guest too,
    /// so for a map `"8080:80"`, guest-side applications must also use port 8080.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_set_port_map(ctx_id: u32, port_map: *const *const c_char) -> i32;

    /// Adds an independent virtio-fs device pointing to a host directory.
    ///
    /// - `c_tag`: tag to identify the filesystem in the guest (used for
    ///   mounting: `mount -t virtiofs <tag> <mountpoint>`).
    /// - `c_path`: full path to the host directory to be exposed.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_add_virtiofs(ctx_id: u32, c_tag: *const c_char, c_path: *const c_char) -> i32;

    /// Configures the console device to ignore stdin and write output to a file.
    ///
    /// - `c_filepath`: path to the file for console output.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_set_console_output(ctx_id: u32, c_filepath: *const c_char) -> i32;

    /// Disable the implicit vsock device (which carries TSI by default).
    ///
    /// Must be called before `krun_add_vsock` to add a vsock with custom
    /// TSI feature flags.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_disable_implicit_vsock(ctx_id: u32) -> i32;

    /// Add a vsock device with specified TSI features.
    ///
    /// - `tsi_features`: bitmask of `KRUN_TSI_HIJACK_INET` (1) and
    ///   `KRUN_TSI_HIJACK_UNIX` (2). Use 0 for no TSI hijacking.
    ///
    /// Only one vsock device is supported. Call after
    /// `krun_disable_implicit_vsock`.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_add_vsock(ctx_id: u32, tsi_features: u32) -> i32;

    /// Adds an independent virtio-net device with a unixgram-based backend,
    /// such as gvproxy or vmnet-helper.
    ///
    /// Adding ANY `krun_add_net_*` device **automatically disables TSI**. The
    /// guest gets a real `ethN` interface instead of TSI socket interception.
    ///
    /// - `c_path`: path to the Unix datagram socket for the network proxy
    ///   (e.g., gvproxy's `--listen-vfkit` socket). Must be NULL if `fd != -1`.
    /// - `fd`: open file descriptor for the socket. Must be -1 if `c_path`
    ///   is not NULL.
    /// - `c_mac`: 6-byte MAC address array.
    /// - `features`: virtio-net feature bitmask (use `COMPAT_NET_FEATURES`).
    /// - `flags`: generic flags. Use `NET_FLAG_VFKIT` for gvproxy in vfkit
    ///   mode when using `c_path`.
    ///
    /// Returns zero on success or a negative error number on failure.
    pub fn krun_add_net_unixgram(
        ctx_id: u32,
        c_path: *const c_char,
        fd: i32,
        c_mac: *const u8,
        features: u32,
        flags: u32,
    ) -> i32;

    /// Starts and enters the microVM with the configured parameters.
    ///
    /// The VMM takes over stdin/stdout to manage them on behalf of the process
    /// running inside the isolated environment.
    ///
    /// **This function never returns on success.** The VMM calls `exit()` with
    /// the workload's exit code once the microVM shuts down.
    ///
    /// Returns a negative error number only if an error happens before the
    /// microVM is started (e.g., `-EINVAL` for invalid configuration).
    pub fn krun_start_enter(ctx_id: u32) -> i32;
}
- **macOS ARM64**: Install via Homebrew: `brew tap slp/krun && brew install libkrun` +//! - **Linux**: Build and install libkrunfw + libkrun from source +//! - A root filesystem directory containing an aarch64 Linux userspace +//! (e.g., [Alpine minirootfs](https://alpinelinux.org/downloads/)) + +mod context; +mod error; +mod ffi; + +pub use context::{KrunContext, KrunContextBuilder, PortMapping, VirtiofsMount}; +pub use error::GatewayError; + +/// Wait for a child process to exit and return its exit status. +/// +/// This is a thin wrapper over `waitpid(2)` for use after [`KrunContext::fork_start`]. +pub fn wait_for_pid(pid: u32) -> Result { + let mut status: libc::c_int = 0; + let ret = unsafe { libc::waitpid(pid.cast_signed(), &raw mut status, 0) }; + if ret < 0 { + return Err(GatewayError::Fork(std::io::Error::last_os_error())); + } + if libc::WIFEXITED(status) { + Ok(libc::WEXITSTATUS(status)) + } else if libc::WIFSIGNALED(status) { + Ok(128 + libc::WTERMSIG(status)) + } else { + Ok(status) + } +} + +/// Check if a child process is still alive (non-blocking). +/// +/// Uses `waitpid(WNOHANG)` to check if the child has exited without blocking. +/// This correctly detects zombie processes (which `kill(pid, 0)` does not — +/// zombies are still signalable). If the child has exited, it is reaped. +/// +/// Returns `true` if the process is still running, `false` if it has exited +/// or the PID is invalid. +/// +/// **Warning**: If the child has exited, this function reaps it. A subsequent +/// call to [`wait_for_pid`] on the same PID will fail with ECHILD. 
+pub fn is_pid_alive(pid: u32) -> bool { + let mut status: libc::c_int = 0; + // waitpid with WNOHANG returns: + // 0 — child still running + // pid — child has exited (reaped) + // -1 — error (e.g., not our child, invalid pid) + let ret = unsafe { libc::waitpid(pid.cast_signed(), &raw mut status, libc::WNOHANG) }; + ret == 0 +} diff --git a/deploy/gateway/vm-init.sh b/deploy/gateway/vm-init.sh new file mode 100755 index 00000000..d0f98a70 --- /dev/null +++ b/deploy/gateway/vm-init.sh @@ -0,0 +1,226 @@ +#!/bin/sh +# vm-init.sh — Bootstrap script for running k3s inside a libkrun microVM. +# +# When using gvproxy networking (virtio-net), the guest gets a real eth0 +# interface. This script configures it via DHCP from gvproxy (which provides +# 192.168.127.0/24 with gateway 192.168.127.1). +# +# The libkrunfw kernel does not include netfilter/iptables, so kube-proxy +# and flannel must be disabled. This is handled by the k3s flags passed +# from the CLI. +# +# This script is injected into the rootfs at extraction time and used as the +# microVM entrypoint instead of running k3s directly. + +set -e + +# The k3s (rancher) base image doesn't symlink all BusyBox applets. +# Ensure essential commands are available. +BB=/bin/busybox +for cmd in mount mountpoint mkdir cat ip udhcpc; do + if ! command -v $cmd >/dev/null 2>&1; then + ln -sf $BB /bin/$cmd 2>/dev/null || true + fi +done +# Also ensure sbin commands are available for ip/route. +for cmd in ip route; do + if ! command -v $cmd >/dev/null 2>&1; then + ln -sf $BB /sbin/$cmd 2>/dev/null || true + fi +done + +echo "[vm-init] Setting up network..." + +# The libkrunfw kernel auto-mounts proc, sysfs, devtmpfs, and cgroup2. +# We only need to mount /run (tmpfs for PID files and sockets) and /tmp. +if ! mountpoint -q /run 2>/dev/null; then + mkdir -p /run + mount -t tmpfs tmpfs /run +fi +if ! mountpoint -q /tmp 2>/dev/null; then + mkdir -p /tmp + mount -t tmpfs tmpfs /tmp +fi + +# Enable the loopback interface. 
+ip link set lo up 2>/dev/null || true + +# Configure eth0 with a static IP. +# +# gvproxy provides a 192.168.127.0/24 network with gateway at .1. +# We use a static IP instead of DHCP because gvproxy's DHCP server +# assigns IPs nondeterministically (.2 or .3 depending on timing), +# which breaks port forwarding configured before the VM boots. +# Static assignment guarantees the IP matches what the host expects. +GUEST_IP="192.168.127.2" +GATEWAY_IP="192.168.127.1" + +if ip link show eth0 >/dev/null 2>&1; then + echo "[vm-init] Configuring eth0 with static IP ${GUEST_IP}..." + ip link set eth0 up + ip addr add "${GUEST_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "${GATEWAY_IP}" dev eth0 2>/dev/null || true + echo "nameserver ${GATEWAY_IP}" > /etc/resolv.conf + echo "[vm-init] Network configured: eth0 = ${GUEST_IP}" +else + # Fallback: no eth0 (TSI-only mode). Add dummy routing on lo so k3s + # finds a default route in /proc/net/route. + echo "[vm-init] No eth0 found, using lo-only fallback..." + ip addr add 10.0.2.100/32 dev lo 2>/dev/null || true + ip route add 10.0.2.1/32 dev lo 2>/dev/null || true + ip route add default via 10.0.2.1 dev lo 2>/dev/null || true + echo "nameserver 10.0.2.1" > /etc/resolv.conf + GUEST_IP="10.0.2.100" + echo "[vm-init] Network configured (fallback): lo = ${GUEST_IP}" +fi + +# Set up k3s-specific DNS config. +mkdir -p /etc/rancher/k3s +cp -f /etc/resolv.conf /etc/rancher/k3s/resolv.conf + +# k3s uses --data-dir=/run/k3s (tmpfs) to avoid SQLite file locking issues +# on virtio-fs. Ensure the directory exists. +mkdir -p /run/k3s + +# --------------------------------------------------------------------------- +# CNI setup +# --------------------------------------------------------------------------- +# When k3s runs with --flannel-backend=none, no CNI plugin is installed. +# Without CNI, the kubelet reports the node as NotReady and no pods can be +# scheduled. 
+#
+# The libkrunfw kernel lacks the bridge module, so the standard bridge CNI
+# plugin fails with "operation not supported". Instead, we install a minimal
+# "noop" CNI plugin (a shell script) that assigns pod IPs from a static range
+# using the host-local IPAM plugin but skips creating any bridge/veth devices.
+# This is sufficient for a single-node microVM cluster where we only need:
+#   - The node to report Ready
+#   - Pods to start (they communicate via the API server, not directly)
+#
+# The k3s image ships CNI plugin binaries in /bin/. kubelet expects them
+# in /opt/cni/bin/ by default.
+echo "[vm-init] Setting up CNI..."
+mkdir -p /opt/cni/bin
+
+# Symlink the standard plugins we need (loopback for pod lo, host-local for IPAM).
+for plugin in loopback host-local; do
+    if [ -f "/bin/$plugin" ] && [ ! -f "/opt/cni/bin/$plugin" ]; then
+        ln -sf "/bin/$plugin" "/opt/cni/bin/$plugin"
+    fi
+done
+
+# Create a minimal noop CNI plugin. This shell script satisfies the CNI
+# contract without creating any network devices (which the libkrunfw kernel
+# can't do — no bridge module). It invokes host-local IPAM to allocate an
+# IP, then returns the result. For DEL, it calls IPAM to release the IP.
+cat > /opt/cni/bin/noop << 'NOOP_CNI'
+#!/bin/sh
+# Minimal noop CNI plugin — delegates to host-local IPAM only.
+# Reads the network config from stdin, extracts the IPAM section,
+# and invokes the host-local plugin to allocate/release IPs.
+
+IPAM_BIN="/opt/cni/bin/host-local"
+CONFIG=$(cat)
+
+case "$CNI_COMMAND" in
+    ADD)
+        # Invoke IPAM to allocate an IP. Pass the full config (host-local
+        # reads the ipam section from it).
+        IPAM_RESULT=$(echo "$CONFIG" | "$IPAM_BIN")
+        IPAM_RC=$?
+        if [ $IPAM_RC -ne 0 ]; then
+            echo "$IPAM_RESULT"
+            exit $IPAM_RC
+        fi
+        # Return the IPAM result as our result (IPs allocated, no interfaces).
+        echo "$IPAM_RESULT"
+        ;;
+    DEL)
+        # Release the IP via IPAM.
+        echo "$CONFIG" | "$IPAM_BIN" 2>/dev/null
+        echo '{}'
+        ;;
+    CHECK)
+        echo '{}'
+        ;;
+    VERSION)
+        echo '{"cniVersion":"1.0.0","supportedVersions":["0.3.0","0.3.1","0.4.0","1.0.0"]}'
+        ;;
+esac
+NOOP_CNI
+chmod +x /opt/cni/bin/noop
+
+# Write the CNI config. The chain is:
+#   1. noop — allocates an IP via host-local IPAM (no network devices)
+#   2. loopback — sets up lo in each pod namespace
+# host-local IPAM assigns IPs from 10.42.0.0/24.
+mkdir -p /etc/cni/net.d
+cat > /etc/cni/net.d/10-noop.conflist << 'CNI_CONFIG'
+{
+  "cniVersion": "1.0.0",
+  "name": "noop",
+  "plugins": [
+    {
+      "type": "noop",
+      "ipam": {
+        "type": "host-local",
+        "ranges": [
+          [{"subnet": "10.42.0.0/24"}]
+        ]
+      }
+    },
+    {
+      "type": "loopback"
+    }
+  ]
+}
+CNI_CONFIG
+
+# Copy bundled manifests if they exist (same as cluster-entrypoint.sh).
+K3S_MANIFESTS="/run/k3s/server/manifests"
+BUNDLED_MANIFESTS="/opt/navigator/manifests"
+if [ -d "$BUNDLED_MANIFESTS" ]; then
+    mkdir -p "$K3S_MANIFESTS"
+    for manifest in "$BUNDLED_MANIFESTS"/*.yaml; do
+        [ ! -f "$manifest" ] && continue
+        cp "$manifest" "$K3S_MANIFESTS/"
+    done
+fi
+
+# Start k3s with a retry wrapper. On tmpfs, k3s sometimes crashes on first
+# boot with "duplicate key given in txn request" or "kine.sock: address
+# already in use" due to a race condition in kine's SQLite initialization.
+# This is transient and always succeeds on the second attempt.
+#
+# We retry up to 3 times with a brief cleanup pause between attempts.
+# On the final attempt, we exec to replace this process with k3s (PID 1).
+MAX_RETRIES=3
+RETRY=0
+
+echo "[vm-init] Starting k3s..."
+while [ "$RETRY" -lt "$MAX_RETRIES" ]; do
+    RETRY=$((RETRY + 1))
+
+    if [ "$RETRY" -eq "$MAX_RETRIES" ]; then
+        # Final attempt: exec into k3s so it becomes PID 1.
+        exec /bin/k3s "$@"
+    fi
+
+    # Non-final attempt: run k3s and capture its exit status. The `||` guard is
+    # required: under `set -e` a bare failing `wait` would abort before any retry.
+    /bin/k3s "$@" &
+    K3S_EXIT=0
+    wait "$!" || K3S_EXIT=$?
+
+    if [ "$K3S_EXIT" -eq 0 ]; then
+        exit 0
+    fi
+
+    echo "[vm-init] k3s exited with status $K3S_EXIT (attempt $RETRY/$MAX_RETRIES)"
+
+    # Clean up stale kine socket and lock files before retrying.
+    rm -f /run/k3s/server/kine.sock 2>/dev/null
+    rm -f /run/k3s/server/db/state.db-wal 2>/dev/null
+    rm -f /run/k3s/server/db/state.db-shm 2>/dev/null
+    sleep 2
+done
diff --git a/scripts/bin/nav b/scripts/bin/nav
index 0dae2b07..9a8d0b6e 100755
--- a/scripts/bin/nav
+++ b/scripts/bin/nav
@@ -8,4 +8,11 @@ BINARY="$PROJECT_ROOT/target/debug/navigator"
 # Build if needed (cargo handles change detection)
 cargo build --package navigator-cli --quiet
 
+# macOS: ad-hoc codesign with the hypervisor entitlement (re-signs on every run; failures are ignored).
+# libkrun requires com.apple.security.hypervisor to access Hypervisor.framework.
+ENTITLEMENTS="$PROJECT_ROOT/crates/navigator-gateway/entitlements.plist"
+if [ -f "$ENTITLEMENTS" ]; then
+    codesign --entitlements "$ENTITLEMENTS" --force -s - "$BINARY" 2>/dev/null || true
+fi
+
 exec "$BINARY" "$@"