From 28e2ce4163490da0fae781bb4a749f18cc15d0fd Mon Sep 17 00:00:00 2001 From: lile Date: Thu, 26 Mar 2026 10:55:48 +0800 Subject: [PATCH] fix(jailer): replace hardcoded FD limits with dynamic closure strategies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FD cleanup in pre_exec used hardcoded upper bounds (1024 on Linux, 4096 on macOS). On systems with raised ulimit -n, FDs above these limits leaked into jailed processes, potentially exposing credentials, database connections, or network sockets. Replace with a 3-strategy cascade (Linux): 1. close_range(first_fd, ~0U, 0) — O(1), Linux 5.9+ 2. /proc/self/fd enumeration via raw getdents64 — no heap allocation 3. Brute-force close with dynamic limit from getrlimit(RLIMIT_NOFILE) macOS uses brute-force with dynamic getrlimit limit (replaces hardcoded 4096). All operations remain async-signal-safe for the pre_exec context. Co-Authored-By: Claude Opus 4.6 --- boxlite/src/jailer/common/fd.rs | 272 ++++++++++++++++++++++++++++++-- 1 file changed, 257 insertions(+), 15 deletions(-) diff --git a/boxlite/src/jailer/common/fd.rs b/boxlite/src/jailer/common/fd.rs index a9302512..e530e993 100644 --- a/boxlite/src/jailer/common/fd.rs +++ b/boxlite/src/jailer/common/fd.rs @@ -6,6 +6,160 @@ //! //! Only the async-signal-safe `close_inherited_fds_raw()` is used, //! called from the `pre_exec` hook before exec(). +//! +//! ## Strategy (in order of preference) +//! +//! 1. **Linux 5.9+**: `close_range(first_fd, ~0U, 0)` — O(1), kernel closes all. +//! 2. **Linux < 5.9**: Enumerate `/proc/self/fd` via raw `getdents64` syscall +//! (no memory allocation) and close only open FDs. Falls back to brute-force +//! with a dynamic upper bound from `getrlimit(RLIMIT_NOFILE)`. +//! 3. **macOS**: Brute-force close with dynamic upper bound from +//! `getrlimit(RLIMIT_NOFILE)` instead of a hardcoded limit. + +/// Query the soft limit for `RLIMIT_NOFILE`. Async-signal-safe. +/// +/// Returns the soft limit (current max open files), or a safe default +/// if the query fails. `getrlimit` is async-signal-safe per POSIX. +/// +/// Note: Cannot reuse `rlimit::get_rlimit()` because it returns `io::Error` +/// which heap-allocates (not async-signal-safe for pre_exec context). +fn get_max_fd() -> i32 { + let mut rlim = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + // SAFETY: getrlimit is async-signal-safe per POSIX. rlim is a valid stack-allocated struct. + let result = unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) }; + if result == 0 && rlim.rlim_cur > 0 { + if rlim.rlim_cur < i32::MAX as u64 { + rlim.rlim_cur as i32 + } else { + // rlim_cur is RLIM_INFINITY or very large; cap to a safe maximum. + // 1048576 (2^20) matches Linux's default /proc/sys/fs/nr_open. + 1_048_576 + } + } else { + // getrlimit failed or returned 0; safe default per POSIX OPEN_MAX + 1024 + } +} + +/// Close open FDs by enumerating `/proc/self/fd` with raw `getdents64`. +/// Async-signal-safe: uses only stack buffers and raw syscalls. +/// +/// Returns `true` if `/proc/self/fd` was successfully enumerated, +/// `false` if it's unavailable (e.g., mount namespace without /proc). +#[cfg(target_os = "linux")] +fn close_fds_via_proc(first_fd: i32) -> bool { + // Open /proc/self/fd with raw syscall (no allocation) + let proc_path = b"/proc/self/fd\0"; + // SAFETY: proc_path is a valid null-terminated string literal. open() is async-signal-safe. + let dir_fd = unsafe { + libc::open( + proc_path.as_ptr().cast::(), + libc::O_RDONLY | libc::O_DIRECTORY | libc::O_CLOEXEC, + ) + }; + if dir_fd < 0 { + return false; + } + + // Stack buffer for getdents64 — 1024 bytes handles ~40 entries per call, + // sufficient for typical processes without heap allocation. + let mut buf = [0u8; 1024]; + + loop { + let nread = unsafe { + libc::syscall( + libc::SYS_getdents64, + dir_fd, + buf.as_mut_ptr(), + buf.len() as libc::c_uint, + ) + }; + + if nread <= 0 { + break; + } + + let mut offset = 0usize; + while offset < nread as usize { + // SAFETY: getdents64 returns packed linux_dirent64 structs. + // d_reclen is at byte offset 16. Use read_unaligned because the + // buffer is a byte array and the u16 field may not be 2-byte aligned. + let d_reclen = + unsafe { buf.as_ptr().add(offset + 16).cast::().read_unaligned() } as usize; + + if d_reclen == 0 || offset + d_reclen > nread as usize { + break; + } + + // d_name starts at offset 19 in linux_dirent64 + let name_ptr = unsafe { buf.as_ptr().add(offset + 19) }; + + // Parse FD number from d_name (decimal string, null-terminated). + // Async-signal-safe: no allocation, just pointer arithmetic. + let fd = parse_fd_from_name(name_ptr); + + if let Some(fd) = fd + && fd >= first_fd + && fd != dir_fd + { + // SAFETY: close() is async-signal-safe. Closing an already-closed FD is harmless. + unsafe { libc::close(fd) }; + } + + offset += d_reclen; + } + } + + // SAFETY: close() is async-signal-safe. dir_fd was opened by us above. + unsafe { libc::close(dir_fd) }; + true +} + +/// Parse a decimal FD number from a null-terminated C string pointer. +/// Returns `None` for non-numeric entries (e.g., "." and ".."). +/// Async-signal-safe: no allocation, pure arithmetic. +/// +/// # Safety +/// +/// `ptr` must point to a valid, null-terminated byte string in readable memory. +/// Called only from `close_fds_via_proc` with pointers into the `getdents64` buffer, +/// which the kernel guarantees to contain null-terminated `d_name` fields. +#[cfg(target_os = "linux")] +fn parse_fd_from_name(ptr: *const u8) -> Option { + let mut fd: i32 = 0; + let mut i = 0usize; + loop { + // SAFETY: ptr is guaranteed by the caller to point to a null-terminated string. + // We stop reading at the null terminator (byte == 0). + let byte = unsafe { *ptr.add(i) }; + if byte == 0 { + break; + } + if !byte.is_ascii_digit() { + return None; + } + fd = fd.checked_mul(10)?.checked_add((byte - b'0') as i32)?; + i += 1; + } + if i == 0 { + return None; + } + Some(fd) +} + +/// Brute-force close all FDs from `first_fd` to `get_max_fd()`. Async-signal-safe. +/// +/// Last-resort fallback when `close_range` and `/proc/self/fd` are unavailable. +fn brute_force_close_fds(first_fd: i32) { + let max_fd = get_max_fd(); + for fd in first_fd..max_fd { + // SAFETY: close() is async-signal-safe. Closing a non-open FD returns EBADF (ignored). + unsafe { libc::close(fd) }; + } +} /// Close all FDs from `first_fd` onwards. Async-signal-safe. /// @@ -15,7 +169,8 @@ /// /// # Safety /// -/// This function only uses async-signal-safe syscalls (close, syscall). +/// This function only uses async-signal-safe syscalls (close, syscall, +/// getrlimit, open, getdents64). /// Do NOT add: /// - Logging (tracing, println) /// - Memory allocation (Box, Vec, String) @@ -29,7 +184,7 @@ pub fn close_fds_from(first_fd: i32) -> Result<(), i32> { #[cfg(target_os = "linux")] { - // Try close_range syscall (Linux 5.9+, most efficient) + // Strategy 1: close_range syscall (Linux 5.9+, most efficient — O(1)) let result = unsafe { libc::syscall( libc::SYS_close_range, @@ -42,25 +197,24 @@ pub fn close_fds_from(first_fd: i32) -> Result<(), i32> { return Ok(()); } - // Fallback: brute force close - // Note: We can't use /proc/self/fd here because: - // 1. read_dir allocates memory (not async-signal-safe) - // 2. We might be in a mount namespace where /proc isn't mounted - for fd in first_fd..1024 { - // Ignore errors - FD might not be open - unsafe { libc::close(fd) }; + // Strategy 2: Enumerate /proc/self/fd via raw getdents64 (no allocation). + // Closes only open FDs — efficient even with high ulimit -n. + if close_fds_via_proc(first_fd) { + return Ok(()); } + + // Strategy 3: Brute-force close with dynamic limit from getrlimit. + // Used when both close_range and /proc are unavailable (e.g., old kernel + // in a mount namespace without /proc). + brute_force_close_fds(first_fd); Ok(()) } #[cfg(target_os = "macos")] { - // macOS: brute force close (no close_range syscall) - // 4096 is a reasonable upper bound for most processes - for fd in first_fd..4096 { - // Ignore errors - FD might not be open - unsafe { libc::close(fd) }; - } + // macOS: brute-force close with dynamic limit from getrlimit. + // Handles systems where ulimit -n exceeds the previous hardcoded 4096. + brute_force_close_fds(first_fd); Ok(()) } @@ -230,4 +384,92 @@ mod tests { child_close_fds_from_closes_target_and_above, ); } + + /// Create a high-numbered FD (above the old hardcoded 1024/4096 limits) + /// and verify that `close_inherited_fds_raw` closes it. + fn child_close_high_numbered_fd() -> i32 { + // Raise the soft FD limit so we can dup2 to a high FD number. + // Some systems default to ulimit -n 1024, which would prevent dup2 to FD 2000. + let new_limit = libc::rlimit { + rlim_cur: 4096, + rlim_max: 4096, + }; + // SAFETY: setrlimit is async-signal-safe. Raising soft limit within hard limit is allowed. + if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &new_limit) } != 0 { + // Can't raise limit (hard limit too low) — skip test gracefully + return 0; + } + + // dup2 stdout to a high FD number that exceeds the old hardcoded limits. + // Use 2000 on all platforms — above old Linux limit (1024) and reasonable. + let high_fd: i32 = 2000; + let result = unsafe { libc::dup2(STDOUT_FD, high_fd) }; + if result != high_fd { + // dup2 failed — unexpected after raising ulimit + return 1; + } + + // Verify the high FD is open + if unsafe { libc::fcntl(high_fd, libc::F_GETFD) } < 0 { + return 2; + } + + // Close all inherited FDs + if close_inherited_fds_raw().is_err() { + return 3; + } + + // The high FD should now be closed + if unsafe { libc::fcntl(high_fd, libc::F_GETFD) } != -1 { + return 4; + } + 0 + } + + #[test] + fn test_close_high_numbered_fd() { + run_in_child("test_close_high_numbered_fd", child_close_high_numbered_fd); + } + + #[test] + fn test_get_max_fd_returns_positive() { + let max = get_max_fd(); + assert!( + max > 0, + "get_max_fd should return positive value, got {}", + max + ); + assert!( + max >= 256, + "get_max_fd should return at least 256, got {}", + max + ); + } + + #[cfg(target_os = "linux")] + #[test] + fn test_parse_fd_from_name() { + // Valid numeric names + assert_eq!(parse_fd_from_name(b"0\0".as_ptr()), Some(0)); + assert_eq!(parse_fd_from_name(b"3\0".as_ptr()), Some(3)); + assert_eq!(parse_fd_from_name(b"42\0".as_ptr()), Some(42)); + assert_eq!(parse_fd_from_name(b"1024\0".as_ptr()), Some(1024)); + assert_eq!(parse_fd_from_name(b"65535\0".as_ptr()), Some(65535)); + + // Non-numeric names (. and ..) + assert_eq!(parse_fd_from_name(b".\0".as_ptr()), None); + assert_eq!(parse_fd_from_name(b"..\0".as_ptr()), None); + + // Empty name + assert_eq!(parse_fd_from_name(b"\0".as_ptr()), None); + + // Overflow: i32::MAX (2147483647) should succeed + assert_eq!(parse_fd_from_name(b"2147483647\0".as_ptr()), Some(i32::MAX)); + + // Overflow: i32::MAX + 1 (2147483648) should return None (checked_add overflow) + assert_eq!(parse_fd_from_name(b"2147483648\0".as_ptr()), None); + + // Overflow: very large number should return None + assert_eq!(parse_fd_from_name(b"99999999999\0".as_ptr()), None); + } }