diff --git a/bin/nixbox b/bin/nixbox index 6830a7b..b6c35fd 100755 --- a/bin/nixbox +++ b/bin/nixbox @@ -312,11 +312,13 @@ NFTEOF # Raise FD limit before launching virtiofsd and cloud-hypervisor (#18). raise_nofile 524288 + local virtiofsd_bin + virtiofsd_bin=$(ensure_virtiofsd_cap) # --- Start virtiofsd for nix-store share (required by microvm config) --- log "==> Starting virtiofsd for nix-store..." local nix_store_sock="$run_dir/nixbox-virtiofs-nix-store.sock" - virtiofsd --socket-path="$nix_store_sock" --shared-dir="/nix/store" --sandbox=none --translate-uid="map:1000:$(id -u):1" --translate-gid="map:100:$(id -g):1" --cache=auto 2>"$run_dir/virtiofsd-nix-store.log" & + "$virtiofsd_bin" --socket-path="$nix_store_sock" --shared-dir="/nix/store" --sandbox=none --translate-uid="map:1000:$(id -u):1" --translate-gid="map:100:$(id -g):1" --cache=auto --inode-file-handles=mandatory 2>"$run_dir/virtiofsd-nix-store.log" & echo "$!" > "$state_dir/virtiofsd_nix_store_pid" for _ in $(seq 1 10); do [ -S "$nix_store_sock" ] && break; sleep 0.2; done [ -S "$nix_store_sock" ] || die "virtiofsd socket for nix-store did not appear" @@ -334,7 +336,7 @@ NFTEOF [ ! -d "$src" ] && die "Mount source does not exist: $src" local virtiofs_sock="$run_dir/virtiofs-${i}.sock" - virtiofsd --socket-path="$virtiofs_sock" --shared-dir="$src" --sandbox=none --translate-uid="map:1000:$(id -u):1" --translate-gid="map:100:$(id -g):1" --cache=auto 2>"$run_dir/virtiofsd-${i}.log" & + "$virtiofsd_bin" --socket-path="$virtiofs_sock" --shared-dir="$src" --sandbox=none --translate-uid="map:1000:$(id -u):1" --translate-gid="map:100:$(id -g):1" --cache=auto --inode-file-handles=mandatory 2>"$run_dir/virtiofsd-${i}.log" & echo "$!" 
> "$state_dir/virtiofsd_${i}_pid" for _ in $(seq 1 10); do [ -S "$virtiofs_sock" ] && break; sleep 0.2; done [ -S "$virtiofs_sock" ] || die "virtiofsd socket did not appear for mount $i" @@ -543,13 +545,16 @@ do_mount() { local virtiofs_sock="$run_dir/virtiofs-${mount_idx}.sock" raise_nofile 524288 - virtiofsd \ + local virtiofsd_bin + virtiofsd_bin=$(ensure_virtiofsd_cap) + "$virtiofsd_bin" \ --socket-path="$virtiofs_sock" \ --shared-dir="$MOUNT_SOURCE" \ --sandbox=none \ --translate-uid="map:1000:$(id -u):1" \ --translate-gid="map:100:$(id -g):1" \ - --cache=auto 2>"$run_dir/virtiofsd-${mount_idx}.log" & + --cache=auto \ + --inode-file-handles=mandatory 2>"$run_dir/virtiofsd-${mount_idx}.log" & echo "$!" > "$state_dir/virtiofsd_${mount_idx}_pid" for _ in $(seq 1 10); do @@ -653,7 +658,7 @@ do_list() { # --------------------------------------------------------------------------- ensure_setup() { - for cmd in nix jq dnsmasq nft mke2fs virtiofsd; do + for cmd in nix jq dnsmasq nft mke2fs virtiofsd setcap getcap; do command -v "$cmd" &>/dev/null || die "$cmd not found. Install it first." done [ -e /dev/kvm ] || die "/dev/kvm not found. KVM is required." @@ -856,7 +861,7 @@ cmd_doctor() { local errors=0 log "==> Checking prerequisites..." - for cmd in nix jq dnsmasq nft mke2fs virtiofsd; do + for cmd in nix jq dnsmasq nft mke2fs virtiofsd setcap getcap; do if command -v "$cmd" &>/dev/null; then log_sub "$cmd: OK" else @@ -872,6 +877,13 @@ cmd_doctor() { errors=$((errors + 1)) fi + local virtiofsd_wrapper="${XDG_DATA_HOME:-$HOME/.local/share}/nixbox/bin/virtiofsd" + if [ -x "$virtiofsd_wrapper" ] && getcap "$virtiofsd_wrapper" 2>/dev/null | grep -q 'cap_dac_read_search'; then + log_sub "virtiofsd cap: OK ($virtiofsd_wrapper)" + else + log_sub "virtiofsd cap: NOT INSTALLED (will install on first 'nixbox up'; needs sudo)" + fi + echo "" log "==> Checking project config..." 
if NIXBOX_DIR="$(find_nixbox_dir 2>/dev/null)"; then diff --git a/docs/decisions/016-virtiofsd-file-handles-capability.md b/docs/decisions/016-virtiofsd-file-handles-capability.md new file mode 100644 index 0000000..d5ead72 --- /dev/null +++ b/docs/decisions/016-virtiofsd-file-handles-capability.md @@ -0,0 +1,37 @@ +# 016: virtiofsd file-handle mode via setcap'd wrapper + +**Date:** 2026-04-27 +**Status:** accepted + +## Problem + +ADR-015 keeps churning caches off virtiofs, but long-lived shares (source trees) still leak FDs under sustained access. `virtiofsd --cache=auto` retains an O_PATH FD per cached inode; with mandatory file-handle mode the cache holds opaque handles instead, freeing the FD slot until the next I/O. This is the only mitigation that addresses the underlying accumulation rather than its symptoms (#18). + +`--inode-file-handles=mandatory` makes virtiofsd store handles obtained from `name_to_handle_at(2)` and reopen them with `open_by_handle_at(2)`, which requires `CAP_DAC_READ_SEARCH`. Without it, virtiofsd refuses to start (`Refusing to use (mandatory) file handles, as they do not appear safe to use`). Today the daemon runs as the host user (UID 1000) under `--sandbox=none` (ADR-001) with no special privileges, so the call returns `EPERM`. + +Four privilege models were considered: + +1. **Run as root, `--sandbox=none`.** Rejected: with no sandbox there is no per-request `setresuid`, so guest-created files end up owned by root on the host filesystem. Regresses ADR-002. +2. **Run as root, `--sandbox=namespace`.** Rejected: requires the daemon to run as root for one syscall path, and the interaction between namespace credential switching and `--translate-uid` is unverified — no upside over option 4, and any deviation regresses ADR-002. (ADR-001 only proves the *non-root* failure mode, so it doesn't apply here.) +3. **Run as root, `--sandbox=chroot`.** Same root-EUID-on-create problem as option 1. +4. 
**Grant only `CAP_DAC_READ_SEARCH` on the binary.** Daemon stays at UID 1000; ADR-001 and ADR-002 hold; only the one capability needed to reopen file handles via `open_by_handle_at(2)` is added. + +## Decision + +Option 4. `lib/functions.bash::ensure_virtiofsd_cap` keeps a setcap'd copy of virtiofsd at `${XDG_DATA_HOME:-~/.local/share}/nixbox/bin/virtiofsd` and returns its path; `do_create` and `do_mount` invoke that path with `--inode-file-handles=mandatory`. + +`mandatory` over `prefer` because silent fallback to FD-mode would re-introduce the leak with no signal — the failure mode of #18 only surfaces after a long session, exactly the kind of degradation that hides until something OOMs. + +A copy under `$XDG_DATA_HOME` rather than `setcap` on the in-store binary, because the host is not NixOS: `/nix/store` is read-only and GC-collectable, `security.wrappers` doesn't apply, and host-package wrappers (`/usr/local/bin`) drift independently of the bundled CLI. A nixbox-managed copy is reinstalled automatically when the source binary realpath changes (e.g., after `nixbox update`), keyed by a sidecar marker file. + +`cmd_doctor` reports wrapper status; `nixbox up` triggers (re)install on demand. The first run prompts for sudo (one-time per virtiofsd version), parallel to the existing `sudo prlimit` path in `raise_nofile`. + +## Consequences + +- The FD ceiling is no longer a function of cache size. Source-tree shares are stable across long sessions. +- **Memory replaces FDs as the long-run ceiling.** Each cached inode now holds an opaque handle in virtiofsd's address space instead of an O_PATH FD. Empirically, ~8 KB per `/nix/store` inode and ~60 KB per workspace inode of virtiofsd RSS. A 524k-inode cache is no longer hitting `RLIMIT_NOFILE`, but it is GBs of resident memory. +- One additional sudo prompt on first `nixbox up` after install or after a virtiofsd version bump. 
+- **`/proc/<pid>/fd` is now root-owned.** File capabilities trigger `PR_SET_DUMPABLE=0`, so debugging tools (`lsof -p <pid>`, `ls /proc/<pid>/fd`) need sudo. They silently return empty results without it. +- ADR-001 (sandbox=none) and ADR-002 (uid/gid translate) preserved unchanged. +- The granted capability is scoped: `CAP_DAC_READ_SEARCH` bypasses DAC for read/search only, and only matters for paths the daemon already opens via `--shared-dir`. Other DAC checks remain. +- Wrapper drift: invoking nixbox under a non-canonical PATH that resolves a different virtiofsd causes a reinstall. Acceptable — the CLI is a Nix wrapper with a deterministic PATH, so the canonical path is stable. diff --git a/docs/decisions/README.md b/docs/decisions/README.md index cc66a5d..a2b144d 100644 --- a/docs/decisions/README.md +++ b/docs/decisions/README.md @@ -23,3 +23,4 @@ Each file: `NNN-short-title.md` with sections **Problem**, **Decision**, **Conse | [011](011-guest-setup-scripts.md) | Guest setup via user-provided scripts | 2026-03-24 | accepted | | [012](012-per-workspace-nixbox-directory.md) | Per-workspace `.nixbox/` directory | 2026-03-24 | accepted | | [013](013-plugin-env-transparency.md) | Plugins must not inject env vars | 2026-03-24 | accepted | +| [016](016-virtiofsd-file-handles-capability.md) | virtiofsd file-handle mode via setcap'd wrapper | 2026-04-27 | accepted | diff --git a/lib/functions.bash b/lib/functions.bash index a3f845b..6b87b7a 100644 --- a/lib/functions.bash +++ b/lib/functions.bash @@ -45,6 +45,44 @@ raise_nofile() { || die "Failed to raise NOFILE soft limit to $target after raising hard limit" } +# Resolves to a virtiofsd binary that has CAP_DAC_READ_SEARCH set, required for +# --inode-file-handles=mandatory (#18). The capability cannot live on the +# /nix/store path (read-only and GC-collectable), so a copy is kept under +# $XDG_DATA_HOME/nixbox/bin and re-installed when the source binary changes +# (e.g. after `nixbox update`). One sudo prompt per (re)install. 
Echoes the
+# wrapper path on stdout. Progress messages go to stderr; failures call die.
+ensure_virtiofsd_cap() {
+  local data_dir="${XDG_DATA_HOME:-$HOME/.local/share}/nixbox"
+  local wrapper="$data_dir/bin/virtiofsd"
+  local marker="$data_dir/bin/.virtiofsd.src"
+  local src
+
+  src=$(command -v virtiofsd 2>/dev/null) || die "virtiofsd not found in PATH"
+  src=$(realpath "$src") || die "Failed to resolve realpath of virtiofsd"
+
+  # Fast path: the wrapper was copied from this source and still has the cap.
+  if [ -x "$wrapper" ] && [ -f "$marker" ] \
+    && [ "$(cat "$marker")" = "$src" ] \
+    && getcap "$wrapper" 2>/dev/null | grep -q 'cap_dac_read_search'; then
+    printf '%s\n' "$wrapper"
+    return 0
+  fi
+
+  # stdout is the return channel, so the progress message is sent to stderr.
+  log "==> Installing setcap'd virtiofsd at $wrapper (requires sudo)..." >&2
+  mkdir -p "$data_dir/bin" || die "Failed to create $data_dir/bin"
+  chmod 700 "$data_dir/bin" || die "Failed to chmod 700 $data_dir/bin"
+  # Drop the old marker first: an interrupted install must never leave a marker
+  # that vouches for a wrapper copied from a different source binary.
+  rm -f -- "$marker" || die "Failed to remove stale marker $marker"
+  cp --remove-destination "$src" "$wrapper" \
+    || die "Failed to copy virtiofsd to $wrapper"
+  sudo setcap cap_dac_read_search=ep "$wrapper" \
+    || die "Failed to set cap_dac_read_search on $wrapper. --inode-file-handles=mandatory cannot work without it."
+  printf '%s\n' "$src" > "$marker" || die "Failed to write $marker"
+  printf '%s\n' "$wrapper"
+}
+
 # ---------------------------------------------------------------------------
 # Network derivation (pure — depends only on slot + name)
 # ---------------------------------------------------------------------------