From e64e574e5aa8c81b8bdc6e113eccc783bd5f9257 Mon Sep 17 00:00:00 2001 From: Razvan Laurus Date: Mon, 27 Apr 2026 14:48:48 +0200 Subject: [PATCH 1/5] feat(virtiofs): use setcap'd wrapper for inode-file-handles=mandatory virtiofsd --cache=auto pins one O_PATH FD per cached inode (#18). With --inode-file-handles=mandatory the cache holds opaque file handles instead, decoupling cache size from FD count. PR #20 (raised ceiling) and ADR-015 (no virtiofs for churning caches) couldn't address this for source-tree shares. The flag requires CAP_DAC_READ_SEARCH. Run-as-root alternatives regress ADR-001 (sandbox=namespace silently breaks O_CREAT|O_EXCL) or ADR-002 (sandbox=none/chroot makes guest-created files root-owned on host). Capability-only keeps both ADRs intact. ensure_virtiofsd_cap() keeps a setcap'd copy of virtiofsd under $XDG_DATA_HOME/nixbox and reinstalls on version drift; do_create and do_mount switch to it with --inode-file-handles=mandatory; cmd_doctor reports wrapper status. One sudo prompt on first nixbox up and after each nixbox update -- parallel to the existing sudo prlimit path. Rationale and rejected alternatives in ADR-016. Co-Authored-By: Claude Opus 4.7 (1M context) --- bin/nixbox | 20 ++++++++--- .../016-virtiofsd-file-handles-capability.md | 33 +++++++++++++++++ docs/decisions/README.md | 1 + lib/functions.bash | 35 +++++++++++++++++++ 4 files changed, 85 insertions(+), 4 deletions(-) create mode 100644 docs/decisions/016-virtiofsd-file-handles-capability.md diff --git a/bin/nixbox b/bin/nixbox index 6830a7b..b218ea3 100755 --- a/bin/nixbox +++ b/bin/nixbox @@ -312,11 +312,13 @@ NFTEOF # Raise FD limit before launching virtiofsd and cloud-hypervisor (#18). raise_nofile 524288 + local virtiofsd_bin + virtiofsd_bin=$(ensure_virtiofsd_cap) # --- Start virtiofsd for nix-store share (required by microvm config) --- log "==> Starting virtiofsd for nix-store..." 
local nix_store_sock="$run_dir/nixbox-virtiofs-nix-store.sock" - virtiofsd --socket-path="$nix_store_sock" --shared-dir="/nix/store" --sandbox=none --translate-uid="map:1000:$(id -u):1" --translate-gid="map:100:$(id -g):1" --cache=auto 2>"$run_dir/virtiofsd-nix-store.log" & + "$virtiofsd_bin" --socket-path="$nix_store_sock" --shared-dir="/nix/store" --sandbox=none --translate-uid="map:1000:$(id -u):1" --translate-gid="map:100:$(id -g):1" --cache=auto --inode-file-handles=mandatory 2>"$run_dir/virtiofsd-nix-store.log" & echo "$!" > "$state_dir/virtiofsd_nix_store_pid" for _ in $(seq 1 10); do [ -S "$nix_store_sock" ] && break; sleep 0.2; done [ -S "$nix_store_sock" ] || die "virtiofsd socket for nix-store did not appear" @@ -334,7 +336,7 @@ NFTEOF [ ! -d "$src" ] && die "Mount source does not exist: $src" local virtiofs_sock="$run_dir/virtiofs-${i}.sock" - virtiofsd --socket-path="$virtiofs_sock" --shared-dir="$src" --sandbox=none --translate-uid="map:1000:$(id -u):1" --translate-gid="map:100:$(id -g):1" --cache=auto 2>"$run_dir/virtiofsd-${i}.log" & + "$virtiofsd_bin" --socket-path="$virtiofs_sock" --shared-dir="$src" --sandbox=none --translate-uid="map:1000:$(id -u):1" --translate-gid="map:100:$(id -g):1" --cache=auto --inode-file-handles=mandatory 2>"$run_dir/virtiofsd-${i}.log" & echo "$!" 
> "$state_dir/virtiofsd_${i}_pid" for _ in $(seq 1 10); do [ -S "$virtiofs_sock" ] && break; sleep 0.2; done [ -S "$virtiofs_sock" ] || die "virtiofsd socket did not appear for mount $i" @@ -543,13 +545,16 @@ do_mount() { local virtiofs_sock="$run_dir/virtiofs-${mount_idx}.sock" raise_nofile 524288 - virtiofsd \ + local virtiofsd_bin + virtiofsd_bin=$(ensure_virtiofsd_cap) + "$virtiofsd_bin" \ --socket-path="$virtiofs_sock" \ --shared-dir="$MOUNT_SOURCE" \ --sandbox=none \ --translate-uid="map:1000:$(id -u):1" \ --translate-gid="map:100:$(id -g):1" \ - --cache=auto 2>"$run_dir/virtiofsd-${mount_idx}.log" & + --cache=auto \ + --inode-file-handles=mandatory 2>"$run_dir/virtiofsd-${mount_idx}.log" & echo "$!" > "$state_dir/virtiofsd_${mount_idx}_pid" for _ in $(seq 1 10); do @@ -872,6 +877,13 @@ cmd_doctor() { errors=$((errors + 1)) fi + local virtiofsd_wrapper="${XDG_DATA_HOME:-$HOME/.local/share}/nixbox/bin/virtiofsd" + if [ -x "$virtiofsd_wrapper" ] && getcap "$virtiofsd_wrapper" 2>/dev/null | grep -q 'cap_dac_read_search'; then + log_sub "virtiofsd cap: OK ($virtiofsd_wrapper)" + else + log_sub "virtiofsd cap: NOT INSTALLED (will install on first 'nixbox up'; needs sudo)" + fi + echo "" log "==> Checking project config..." if NIXBOX_DIR="$(find_nixbox_dir 2>/dev/null)"; then diff --git a/docs/decisions/016-virtiofsd-file-handles-capability.md b/docs/decisions/016-virtiofsd-file-handles-capability.md new file mode 100644 index 0000000..a66b515 --- /dev/null +++ b/docs/decisions/016-virtiofsd-file-handles-capability.md @@ -0,0 +1,33 @@ +# 016: virtiofsd file-handle mode via setcap'd wrapper + +**Date:** 2026-04-27 +**Status:** accepted + +## Problem + +ADR-015 keeps churning caches off virtiofs, but long-lived shares (source trees) still leak FDs under sustained access. `virtiofsd --cache=auto` retains an O_PATH FD per cached inode; with mandatory file-handle mode the cache holds opaque handles instead, freeing the FD slot until the next I/O. 
This is the only mitigation that addresses the underlying accumulation rather than its symptoms (#18). + +`--inode-file-handles=mandatory` calls `name_to_handle_at(2)`, which requires `CAP_DAC_READ_SEARCH`. Without it, virtiofsd refuses to start (`Refusing to use (mandatory) file handles, as they do not appear safe to use`). Today the daemon runs as the host user (UID 1000) under `--sandbox=none` (ADR-001) with no special privileges, so the call returns `EPERM`. + +Three privilege models were considered: + +1. **Run as root, `--sandbox=none`.** Rejected: with no sandbox there is no per-request `setresuid`, so guest-created files end up owned by root on the host filesystem. Regresses ADR-002. +2. **Run as root, `--sandbox=namespace`.** Rejected explicitly by ADR-001 — credential switching breaks `O_CREAT|O_EXCL` (silent `EINVAL`) inside the guest. +3. **Run as root, `--sandbox=chroot`.** Same root-EUID-on-create problem as option 1. +4. **Grant only `CAP_DAC_READ_SEARCH` on the binary.** Daemon stays at UID 1000; ADR-001 and ADR-002 hold; only the one capability needed by `name_to_handle_at(2)` is added. + +## Decision + +Option 4. `lib/functions.bash::ensure_virtiofsd_cap` keeps a setcap'd copy of virtiofsd at `${XDG_DATA_HOME:-~/.local/share}/nixbox/bin/virtiofsd` and returns its path; `do_create` and `do_mount` invoke that path with `--inode-file-handles=mandatory`. + +A copy under `$XDG_DATA_HOME` rather than `setcap` on the in-store binary, because the host is not NixOS: `/nix/store` is read-only and GC-collectable, `security.wrappers` doesn't apply, and host-package wrappers (`/usr/local/bin`) drift independently of the bundled CLI. A nixbox-managed copy is reinstalled automatically when the source binary realpath changes (e.g., after `nixbox update`), keyed by a sidecar marker file. + +`cmd_doctor` reports wrapper status; `nixbox up` triggers (re)install on demand. 
The first run prompts for sudo (one-time per virtiofsd version), parallel to the existing `sudo prlimit` path in `raise_nofile`. + +## Consequences + +- The FD ceiling is no longer a function of cache size. Source-tree shares are stable across long sessions. +- One additional sudo prompt on first `nixbox up` after install or after a virtiofsd version bump. +- ADR-001 (sandbox=none) and ADR-002 (uid/gid translate) preserved unchanged. +- The granted capability is scoped: `CAP_DAC_READ_SEARCH` bypasses DAC for read/search only, and only matters for paths the daemon already opens via `--shared-dir`. Other DAC checks remain. +- Wrapper drift: invoking nixbox under a non-canonical PATH that resolves a different virtiofsd causes a reinstall. Acceptable — the CLI is a Nix wrapper with a deterministic PATH, so the canonical path is stable. diff --git a/docs/decisions/README.md b/docs/decisions/README.md index cc66a5d..a2b144d 100644 --- a/docs/decisions/README.md +++ b/docs/decisions/README.md @@ -23,3 +23,4 @@ Each file: `NNN-short-title.md` with sections **Problem**, **Decision**, **Conse | [011](011-guest-setup-scripts.md) | Guest setup via user-provided scripts | 2026-03-24 | accepted | | [012](012-per-workspace-nixbox-directory.md) | Per-workspace `.nixbox/` directory | 2026-03-24 | accepted | | [013](013-plugin-env-transparency.md) | Plugins must not inject env vars | 2026-03-24 | accepted | +| [016](016-virtiofsd-file-handles-capability.md) | virtiofsd file-handle mode via setcap'd wrapper | 2026-04-27 | accepted | diff --git a/lib/functions.bash b/lib/functions.bash index a3f845b..75e1a16 100644 --- a/lib/functions.bash +++ b/lib/functions.bash @@ -45,6 +45,41 @@ raise_nofile() { || die "Failed to raise NOFILE soft limit to $target after raising hard limit" } +# Resolves to a virtiofsd binary that has CAP_DAC_READ_SEARCH set, required for +# --inode-file-handles=mandatory (#18). 
The capability cannot live on the +# /nix/store path (read-only and GC-collectable), so a copy is kept under +# $XDG_DATA_HOME/nixbox/bin and re-installed when the source binary changes +# (e.g. after `nixbox update`). One sudo prompt per (re)install. Echoes the +# wrapper path on stdout. +ensure_virtiofsd_cap() { + local data_dir="${XDG_DATA_HOME:-$HOME/.local/share}/nixbox" + local wrapper="$data_dir/bin/virtiofsd" + local marker="$data_dir/bin/.virtiofsd.src" + + local src + src=$(command -v virtiofsd 2>/dev/null) \ + || die "virtiofsd not found in PATH" + src=$(realpath "$src") \ + || die "Failed to resolve realpath of virtiofsd" + + if [ -x "$wrapper" ] \ + && [ -f "$marker" ] \ + && [ "$(cat "$marker")" = "$src" ] \ + && getcap "$wrapper" 2>/dev/null | grep -q 'cap_dac_read_search'; then + echo "$wrapper" + return 0 + fi + + log "==> Installing setcap'd virtiofsd at $wrapper (requires sudo)..." + mkdir -p "$data_dir/bin" + cp -f "$src" "$wrapper" \ + || die "Failed to copy virtiofsd to $wrapper" + sudo setcap cap_dac_read_search=ep "$wrapper" \ + || die "Failed to set cap_dac_read_search on $wrapper. --inode-file-handles=mandatory cannot work without it." + echo "$src" > "$marker" + echo "$wrapper" +} + # --------------------------------------------------------------------------- # Network derivation (pure — depends only on slot + name) # --------------------------------------------------------------------------- From d42d344effb46af0ebe1f2d7b0e6b945627087ae Mon Sep 17 00:00:00 2001 From: Razvan Laurus Date: Tue, 28 Apr 2026 09:15:02 +0200 Subject: [PATCH 2/5] fix(virtiofs): redirect ensure_virtiofsd_cap log to stderr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function returns the wrapper path via stdout (echo "$wrapper"), but log() also writes to stdout. 
Captured via $(ensure_virtiofsd_cap), the install-message and the wrapper path got concatenated, so the caller invoked a bogus binary path — virtiofsd never started in CI on the first 'nixbox up' (no preinstalled wrapper). --- lib/functions.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/functions.bash b/lib/functions.bash index 75e1a16..42043dd 100644 --- a/lib/functions.bash +++ b/lib/functions.bash @@ -70,7 +70,7 @@ ensure_virtiofsd_cap() { return 0 fi - log "==> Installing setcap'd virtiofsd at $wrapper (requires sudo)..." + log "==> Installing setcap'd virtiofsd at $wrapper (requires sudo)..." >&2 mkdir -p "$data_dir/bin" cp -f "$src" "$wrapper" \ || die "Failed to copy virtiofsd to $wrapper" From f7e3b76b360c87693f25cb9320e3b55851dcc8a3 Mon Sep 17 00:00:00 2001 From: Razvan Laurus Date: Tue, 28 Apr 2026 09:48:12 +0200 Subject: [PATCH 3/5] docs(adr-016): correct option 2 rejection rationale MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADR-001 documents the non-root failure mode only — it doesn't prove that sandbox=namespace breaks under root. The real reason to reject option 2 is the cost (running the daemon as root) vs. the absence of any benefit over option 4. --- docs/decisions/016-virtiofsd-file-handles-capability.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/decisions/016-virtiofsd-file-handles-capability.md b/docs/decisions/016-virtiofsd-file-handles-capability.md index a66b515..35921c3 100644 --- a/docs/decisions/016-virtiofsd-file-handles-capability.md +++ b/docs/decisions/016-virtiofsd-file-handles-capability.md @@ -12,7 +12,7 @@ ADR-015 keeps churning caches off virtiofs, but long-lived shares (source trees) Three privilege models were considered: 1. **Run as root, `--sandbox=none`.** Rejected: with no sandbox there is no per-request `setresuid`, so guest-created files end up owned by root on the host filesystem. Regresses ADR-002. -2. 
**Run as root, `--sandbox=namespace`.** Rejected explicitly by ADR-001 — credential switching breaks `O_CREAT|O_EXCL` (silent `EINVAL`) inside the guest. +2. **Run as root, `--sandbox=namespace`.** Rejected: requires the daemon to run as root for one syscall path, and the interaction between namespace credential switching and `--translate-uid` is unverified — no upside over option 4, and any deviation regresses ADR-002. (ADR-001 only proves the *non-root* failure mode, so it doesn't apply here.) 3. **Run as root, `--sandbox=chroot`.** Same root-EUID-on-create problem as option 1. 4. **Grant only `CAP_DAC_READ_SEARCH` on the binary.** Daemon stays at UID 1000; ADR-001 and ADR-002 hold; only the one capability needed by `name_to_handle_at(2)` is added. From 5e6f9c6d3790eb31f40750547b88f208d014d265 Mon Sep 17 00:00:00 2001 From: Razvan Laurus Date: Tue, 28 Apr 2026 10:35:41 +0200 Subject: [PATCH 4/5] docs(adr-016): cover prefer-vs-mandatory choice and runtime side effects Three gaps surfaced by review and empirical verification: - mandatory over prefer: silent fallback would mask #18 until OOM. - Memory now scales with cached inodes (~8 KB/store, ~60 KB/workspace). - /proc/<pid>/fd is root-owned due to PR_SET_DUMPABLE=0 from file caps. --- docs/decisions/016-virtiofsd-file-handles-capability.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/decisions/016-virtiofsd-file-handles-capability.md b/docs/decisions/016-virtiofsd-file-handles-capability.md index 35921c3..d5ead72 100644 --- a/docs/decisions/016-virtiofsd-file-handles-capability.md +++ b/docs/decisions/016-virtiofsd-file-handles-capability.md @@ -20,6 +20,8 @@ Three privilege models were considered: Option 4. `lib/functions.bash::ensure_virtiofsd_cap` keeps a setcap'd copy of virtiofsd at `${XDG_DATA_HOME:-~/.local/share}/nixbox/bin/virtiofsd` and returns its path; `do_create` and `do_mount` invoke that path with `--inode-file-handles=mandatory`. 
+`mandatory` over `prefer` because silent fallback to FD-mode would re-introduce the leak with no signal — the failure mode of #18 only surfaces after a long session, exactly the kind of degradation that hides until something OOMs. + A copy under `$XDG_DATA_HOME` rather than `setcap` on the in-store binary, because the host is not NixOS: `/nix/store` is read-only and GC-collectable, `security.wrappers` doesn't apply, and host-package wrappers (`/usr/local/bin`) drift independently of the bundled CLI. A nixbox-managed copy is reinstalled automatically when the source binary realpath changes (e.g., after `nixbox update`), keyed by a sidecar marker file. `cmd_doctor` reports wrapper status; `nixbox up` triggers (re)install on demand. The first run prompts for sudo (one-time per virtiofsd version), parallel to the existing `sudo prlimit` path in `raise_nofile`. @@ -27,7 +29,9 @@ A copy under `$XDG_DATA_HOME` rather than `setcap` on the in-store binary, becau ## Consequences - The FD ceiling is no longer a function of cache size. Source-tree shares are stable across long sessions. +- **Memory replaces FDs as the long-run ceiling.** Each cached inode now holds an opaque handle in virtiofsd's address space instead of an O_PATH FD. Empirically, ~8 KB per `/nix/store` inode and ~60 KB per workspace inode of virtiofsd RSS. A 524k-inode cache is no longer hitting `RLIMIT_NOFILE`, but it is GBs of resident memory. - One additional sudo prompt on first `nixbox up` after install or after a virtiofsd version bump. +- **`/proc/<pid>/fd` is now root-owned.** File capabilities trigger `PR_SET_DUMPABLE=0`, so debugging tools (`lsof -p <pid>`, `ls /proc/<pid>/fd`) need sudo. They silently return empty results without it. - ADR-001 (sandbox=none) and ADR-002 (uid/gid translate) preserved unchanged. - The granted capability is scoped: `CAP_DAC_READ_SEARCH` bypasses DAC for read/search only, and only matters for paths the daemon already opens via `--shared-dir`. Other DAC checks remain. 
- Wrapper drift: invoking nixbox under a non-canonical PATH that resolves a different virtiofsd causes a reinstall. Acceptable — the CLI is a Nix wrapper with a deterministic PATH, so the canonical path is stable. From cfc8941e555d0b21755213903d578fce3f140a36 Mon Sep 17 00:00:00 2001 From: Razvan Laurus Date: Tue, 28 Apr 2026 11:01:07 +0200 Subject: [PATCH 5/5] fix(virtiofs): harden ensure_virtiofsd_cap install and prereq checks - Restrict install dir to mode 700 and use cp --remove-destination so a symlink at the wrapper path can't redirect the setcap'd write target. - Fail loud when mkdir fails (consistent with the rest of the function). - Add setcap/getcap to prerequisite checks in ensure_setup and cmd_doctor. Note: kernel cap_inode_killpriv() already clears security.capability on write, so a same-user "swap-then-reuse-cap" attack is not possible. The hardening here is for symlink hygiene and clearer prereq errors. --- bin/nixbox | 4 ++-- lib/functions.bash | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/bin/nixbox b/bin/nixbox index b218ea3..b6c35fd 100755 --- a/bin/nixbox +++ b/bin/nixbox @@ -658,7 +658,7 @@ do_list() { # --------------------------------------------------------------------------- ensure_setup() { - for cmd in nix jq dnsmasq nft mke2fs virtiofsd; do + for cmd in nix jq dnsmasq nft mke2fs virtiofsd setcap getcap; do command -v "$cmd" &>/dev/null || die "$cmd not found. Install it first." done [ -e /dev/kvm ] || die "/dev/kvm not found. KVM is required." @@ -861,7 +861,7 @@ cmd_doctor() { local errors=0 log "==> Checking prerequisites..." 
- for cmd in nix jq dnsmasq nft mke2fs virtiofsd; do + for cmd in nix jq dnsmasq nft mke2fs virtiofsd setcap getcap; do if command -v "$cmd" &>/dev/null; then log_sub "$cmd: OK" else diff --git a/lib/functions.bash b/lib/functions.bash index 42043dd..6b87b7a 100644 --- a/lib/functions.bash +++ b/lib/functions.bash @@ -71,8 +71,11 @@ ensure_virtiofsd_cap() { fi log "==> Installing setcap'd virtiofsd at $wrapper (requires sudo)..." >&2 - mkdir -p "$data_dir/bin" - cp -f "$src" "$wrapper" \ + mkdir -p "$data_dir/bin" \ + || die "Failed to create $data_dir/bin" + chmod 700 "$data_dir/bin" \ + || die "Failed to chmod 700 $data_dir/bin" + cp --remove-destination "$src" "$wrapper" \ || die "Failed to copy virtiofsd to $wrapper" sudo setcap cap_dac_read_search=ep "$wrapper" \ || die "Failed to set cap_dac_read_search on $wrapper. --inode-file-handles=mandatory cannot work without it."