Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
272 changes: 257 additions & 15 deletions boxlite/src/jailer/common/fd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,160 @@
//!
//! Only the async-signal-safe `close_inherited_fds_raw()` is used,
//! called from the `pre_exec` hook before exec().
//!
//! ## Strategy (in order of preference)
//!
//! 1. **Linux 5.9+**: `close_range(first_fd, ~0U, 0)` — O(1), kernel closes all.
//! 2. **Linux < 5.9**: Enumerate `/proc/self/fd` via raw `getdents64` syscall
//! (no memory allocation) and close only open FDs. Falls back to brute-force
//! with a dynamic upper bound from `getrlimit(RLIMIT_NOFILE)`.
//! 3. **macOS**: Brute-force close with dynamic upper bound from
//! `getrlimit(RLIMIT_NOFILE)` instead of a hardcoded limit.

/// Query the soft limit for `RLIMIT_NOFILE`. Async-signal-safe.
///
/// Returns the soft limit (current max open files), or a safe default
/// if the query fails. `getrlimit` is async-signal-safe per POSIX.
///
/// Note: Cannot reuse `rlimit::get_rlimit()` because it returns `io::Error`
/// which heap-allocates (not async-signal-safe for pre_exec context).
fn get_max_fd() -> i32 {
let mut rlim = libc::rlimit {
rlim_cur: 0,
rlim_max: 0,
};
// SAFETY: getrlimit is async-signal-safe per POSIX. rlim is a valid stack-allocated struct.
let result = unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) };
if result == 0 && rlim.rlim_cur > 0 {
if rlim.rlim_cur < i32::MAX as u64 {
rlim.rlim_cur as i32
} else {
// rlim_cur is RLIM_INFINITY or very large; cap to a safe maximum.
// 1048576 (2^20) matches Linux's default /proc/sys/fs/nr_open.
1_048_576
}
Comment on lines +33 to +40
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_max_fd() uses if rlim.rlim_cur < i32::MAX as u64 which incorrectly treats an exact i32::MAX soft limit as “too large” and caps it to 1,048,576. This looks like an off-by-one comparison; use <= if the intent is “fits in i32”.

Copilot uses AI. Check for mistakes.
} else {
// getrlimit failed or returned 0; safe default per POSIX OPEN_MAX
1024
Comment on lines +42 to +43
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If getrlimit(RLIMIT_NOFILE) fails (or returns 0), get_max_fd() falls back to 1024, which can reintroduce the original FD-leak problem on hosts with a higher limit (since brute-force will stop at 1024). Consider a fallback that still errs on the side of closing more FDs (e.g., a higher cap like the existing 1,048,576) to preserve the security invariant even when getrlimit is blocked or fails.

Suggested change
// getrlimit failed or returned 0; safe default per POSIX OPEN_MAX
1024
// getrlimit failed or returned 0; use the same conservative cap to
// ensure we still attempt to close a wide range of FDs.
1_048_576

Copilot uses AI. Check for mistakes.
}
}

/// Close open FDs by enumerating `/proc/self/fd` with raw `getdents64`.
/// Async-signal-safe: uses only stack buffers and raw syscalls.
///
/// Returns `true` if `/proc/self/fd` was successfully enumerated,
/// `false` if it's unavailable (e.g., mount namespace without /proc).
#[cfg(target_os = "linux")]
fn close_fds_via_proc(first_fd: i32) -> bool {
// Open /proc/self/fd with raw syscall (no allocation)
let proc_path = b"/proc/self/fd\0";
// SAFETY: proc_path is a valid null-terminated string literal. open() is async-signal-safe.
let dir_fd = unsafe {
libc::open(
proc_path.as_ptr().cast::<libc::c_char>(),
libc::O_RDONLY | libc::O_DIRECTORY | libc::O_CLOEXEC,
)
};
if dir_fd < 0 {
return false;
}

// Stack buffer for getdents64 — 1024 bytes handles ~40 entries per call,
// sufficient for typical processes without heap allocation.
let mut buf = [0u8; 1024];

loop {
let nread = unsafe {
libc::syscall(
libc::SYS_getdents64,
dir_fd,
buf.as_mut_ptr(),
buf.len() as libc::c_uint,
)
};

if nread <= 0 {
break;
}

let mut offset = 0usize;
while offset < nread as usize {
// SAFETY: getdents64 returns packed linux_dirent64 structs.
// d_reclen is at byte offset 16. Use read_unaligned because the
// buffer is a byte array and the u16 field may not be 2-byte aligned.
let d_reclen =
unsafe { buf.as_ptr().add(offset + 16).cast::<u16>().read_unaligned() } as usize;

if d_reclen == 0 || offset + d_reclen > nread as usize {
break;
}
Comment on lines +71 to +95
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

close_fds_via_proc currently returns true even if getdents64 fails (e.g., syscall returns -1 for EINTR/EBADF). That would skip the brute-force fallback and can leave FDs open. Consider treating nread < 0 (and also malformed d_reclen conditions) as a failure: close dir_fd and return false so brute_force_close_fds() runs.

Copilot uses AI. Check for mistakes.

// d_name starts at offset 19 in linux_dirent64
let name_ptr = unsafe { buf.as_ptr().add(offset + 19) };

// Parse FD number from d_name (decimal string, null-terminated).
// Async-signal-safe: no allocation, just pointer arithmetic.
let fd = parse_fd_from_name(name_ptr);

if let Some(fd) = fd
&& fd >= first_fd
&& fd != dir_fd
{
// SAFETY: close() is async-signal-safe. Closing an already-closed FD is harmless.
unsafe { libc::close(fd) };
}

offset += d_reclen;
}
}

// SAFETY: close() is async-signal-safe. dir_fd was opened by us above.
unsafe { libc::close(dir_fd) };
true
}

/// Parse a decimal FD number from a null-terminated C string pointer.
/// Returns `None` for non-numeric entries (e.g., "." and "..").
/// Async-signal-safe: no allocation, pure arithmetic.
///
/// # Safety
///
/// `ptr` must point to a valid, null-terminated byte string in readable memory.
/// Called only from `close_fds_via_proc` with pointers into the `getdents64` buffer,
/// which the kernel guarantees to contain null-terminated `d_name` fields.
#[cfg(target_os = "linux")]
fn parse_fd_from_name(ptr: *const u8) -> Option<i32> {
let mut fd: i32 = 0;
let mut i = 0usize;
loop {
// SAFETY: ptr is guaranteed by the caller to point to a null-terminated string.
// We stop reading at the null terminator (byte == 0).
let byte = unsafe { *ptr.add(i) };
if byte == 0 {
break;
}
if !byte.is_ascii_digit() {
return None;
}
fd = fd.checked_mul(10)?.checked_add((byte - b'0') as i32)?;
i += 1;
}
if i == 0 {
return None;
}
Some(fd)
}

/// Brute-force close all FDs from `first_fd` to `get_max_fd()`. Async-signal-safe.
///
/// Last-resort fallback when `close_range` and `/proc/self/fd` are unavailable.
fn brute_force_close_fds(first_fd: i32) {
let max_fd = get_max_fd();
for fd in first_fd..max_fd {
// SAFETY: close() is async-signal-safe. Closing a non-open FD returns EBADF (ignored).
unsafe { libc::close(fd) };
}
}

/// Close all FDs from `first_fd` onwards. Async-signal-safe.
///
Expand All @@ -15,7 +169,8 @@
///
/// # Safety
///
/// This function only uses async-signal-safe syscalls (close, syscall).
/// This function only uses async-signal-safe syscalls (close, syscall,
/// getrlimit, open, getdents64).
/// Do NOT add:
/// - Logging (tracing, println)
/// - Memory allocation (Box, Vec, String)
Expand All @@ -29,7 +184,7 @@
pub fn close_fds_from(first_fd: i32) -> Result<(), i32> {
#[cfg(target_os = "linux")]
{
// Try close_range syscall (Linux 5.9+, most efficient)
// Strategy 1: close_range syscall (Linux 5.9+, most efficient — O(1))
let result = unsafe {
libc::syscall(
libc::SYS_close_range,
Expand All @@ -42,25 +197,24 @@ pub fn close_fds_from(first_fd: i32) -> Result<(), i32> {
return Ok(());
}

// Fallback: brute force close
// Note: We can't use /proc/self/fd here because:
// 1. read_dir allocates memory (not async-signal-safe)
// 2. We might be in a mount namespace where /proc isn't mounted
for fd in first_fd..1024 {
// Ignore errors - FD might not be open
unsafe { libc::close(fd) };
// Strategy 2: Enumerate /proc/self/fd via raw getdents64 (no allocation).
// Closes only open FDs — efficient even with high ulimit -n.
if close_fds_via_proc(first_fd) {
return Ok(());
}

// Strategy 3: Brute-force close with dynamic limit from getrlimit.
// Used when both close_range and /proc are unavailable (e.g., old kernel
// in a mount namespace without /proc).
brute_force_close_fds(first_fd);
Ok(())
}

#[cfg(target_os = "macos")]
{
// macOS: brute force close (no close_range syscall)
// 4096 is a reasonable upper bound for most processes
for fd in first_fd..4096 {
// Ignore errors - FD might not be open
unsafe { libc::close(fd) };
}
// macOS: brute-force close with dynamic limit from getrlimit.
// Handles systems where ulimit -n exceeds the previous hardcoded 4096.
brute_force_close_fds(first_fd);
Ok(())
}

Expand Down Expand Up @@ -230,4 +384,92 @@ mod tests {
child_close_fds_from_closes_target_and_above,
);
}

/// Create a high-numbered FD (above the old hardcoded 1024/4096 limits)
/// and verify that `close_inherited_fds_raw` closes it.
fn child_close_high_numbered_fd() -> i32 {
// Raise the soft FD limit so we can dup2 to a high FD number.
// Some systems default to ulimit -n 1024, which would prevent dup2 to FD 2000.
let new_limit = libc::rlimit {
rlim_cur: 4096,
rlim_max: 4096,
};
// SAFETY: setrlimit is async-signal-safe. Raising soft limit within hard limit is allowed.
if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &new_limit) } != 0 {
// Can't raise limit (hard limit too low) — skip test gracefully
return 0;
}

// dup2 stdout to a high FD number that exceeds the old hardcoded limits.
// Use 2000 on all platforms — above old Linux limit (1024) and reasonable.
let high_fd: i32 = 2000;
let result = unsafe { libc::dup2(STDOUT_FD, high_fd) };
if result != high_fd {
// dup2 failed — unexpected after raising ulimit
return 1;
}

// Verify the high FD is open
if unsafe { libc::fcntl(high_fd, libc::F_GETFD) } < 0 {
return 2;
}

// Close all inherited FDs
if close_inherited_fds_raw().is_err() {
return 3;
}

// The high FD should now be closed
if unsafe { libc::fcntl(high_fd, libc::F_GETFD) } != -1 {
return 4;
}
0
}

#[test]
fn test_close_high_numbered_fd() {
run_in_child("test_close_high_numbered_fd", child_close_high_numbered_fd);
}

#[test]
fn test_get_max_fd_returns_positive() {
let max = get_max_fd();
assert!(
max > 0,
"get_max_fd should return positive value, got {}",
max
);
assert!(
max >= 256,
"get_max_fd should return at least 256, got {}",
max
);
}

#[cfg(target_os = "linux")]
#[test]
fn test_parse_fd_from_name() {
// Valid numeric names
assert_eq!(parse_fd_from_name(b"0\0".as_ptr()), Some(0));
assert_eq!(parse_fd_from_name(b"3\0".as_ptr()), Some(3));
assert_eq!(parse_fd_from_name(b"42\0".as_ptr()), Some(42));
assert_eq!(parse_fd_from_name(b"1024\0".as_ptr()), Some(1024));
assert_eq!(parse_fd_from_name(b"65535\0".as_ptr()), Some(65535));

// Non-numeric names (. and ..)
assert_eq!(parse_fd_from_name(b".\0".as_ptr()), None);
assert_eq!(parse_fd_from_name(b"..\0".as_ptr()), None);

// Empty name
assert_eq!(parse_fd_from_name(b"\0".as_ptr()), None);

// Overflow: i32::MAX (2147483647) should succeed
assert_eq!(parse_fd_from_name(b"2147483647\0".as_ptr()), Some(i32::MAX));

// Overflow: i32::MAX + 1 (2147483648) should return None (checked_add overflow)
assert_eq!(parse_fd_from_name(b"2147483648\0".as_ptr()), None);

// Overflow: very large number should return None
assert_eq!(parse_fd_from_name(b"99999999999\0".as_ptr()), None);
}
}
Loading