From 5070e786ca67d2a91f54f35ac7c7124c0a6ab1c8 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 24 Feb 2026 22:02:23 +0100 Subject: [PATCH] nspawn: Stop overmounting /sys and /proc when a user namespace is used When the container runs in a user namespace, we don't need to protect /proc and /sys by overmounting things. In fact this is actively harmful, as it prevents nested systemd-nspawn from working: for procfs and sysfs to be mounted inside a container, the existing instances must not be overmounted, or the kernel will refuse the mount. To make nesting possible, let's stop overmounting parts of /proc and /sys when user namespaces are in use. --- src/nspawn/nspawn-mount.c | 25 ++++++++++++++++------ src/nspawn/nspawn.c | 44 +++++++++++++++++++++++++++++++++------ 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 282a29c359f70..53e2544e5491a 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -440,6 +440,7 @@ int tmpfs_patch_options( int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { _cleanup_free_ char *top = NULL, *full = NULL;; unsigned long extra_flags = 0; + bool is_mount_point; int r; top = path_join(dest, "/sys"); @@ -449,12 +450,9 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { r = path_is_mount_point(top); if (r < 0) return log_error_errno(r, "Failed to determine if '%s' is a mountpoint: %m", top); - if (r == 0) { - /* If this is not a mount point yet, then mount a tmpfs there */ - r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS); - if (r < 0) - return r; - } else { + is_mount_point = r > 0; + + if (is_mount_point) { r = path_is_fs_type(top, SYSFS_MAGIC); if (r < 0) return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top); @@ -467,6 +465,21 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { return 0; } + /* When running in a user
namespace, to enable mounting sysfs in nested containers, we cannot + * overmount it, so we mount it as is. While the user namespace won't be able to write to sysfs, we + * still have to mount it read-only as that's part of the container interface and various units + * conditionalize themselves based on whether /sys is mounted read-only or not. */ + if (!FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO)) + return mount_nofollow_verbose(LOG_ERR, "sysfs", top, "sysfs", + MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); + + if (!is_mount_point) { + /* If this is not a mount point yet, then mount a tmpfs there */ + r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS); + if (r < 0) + return r; + } + full = path_join(top, "/full"); if (!full) return log_oom(); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 1b3aa7d1ad50a..3745f83b71d9b 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1642,6 +1642,19 @@ static int parse_argv(int argc, char *argv[]) { return 1; } +static int container_in_userns(void) { + int r; + + if (arg_userns_mode != USER_NAMESPACE_NO) + return true; + + r = namespace_is_init(NAMESPACE_USER); + if (r < 0 && !IN_SET(r, -EBADR, -ENOSYS)) + return log_error_errno(r, "Failed to check if in initial user namespace: %m"); + + return r == 0; +} + static int verify_arguments(void) { int r; @@ -1654,6 +1667,15 @@ static int verify_arguments(void) { SET_FLAG(arg_mount_settings, MOUNT_USE_USERNS, arg_userns_mode != USER_NAMESPACE_NO); + /* When running in a user namespace the kernel will protect procfs/sysfs for us, so there's no need + * to mount them read-only or mask individual files. This applies both when we allocate a user + * namespace ourselves, and when nspawn is invoked from within an existing user namespace. 
*/ + r = container_in_userns(); + if (r < 0) + return r; + if (r > 0) + arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO; + if (arg_private_network) SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, arg_private_network); @@ -1735,9 +1757,6 @@ static int verify_arguments(void) { if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network."); - if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write API VFS mounts."); - if (arg_expose_ports && !arg_private_network) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking."); @@ -2130,6 +2149,10 @@ static int setup_boot_id(void) { const char *to; int r; + r = container_in_userns(); + if (r != 0) + return r; + /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */ r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path); @@ -2539,6 +2562,10 @@ static int setup_kmsg(int fd_inner_socket) { assert(fd_inner_socket >= 0); + r = container_in_userns(); + if (r != 0) + return r; + BLOCK_WITH_UMASK(0000); /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to @@ -5793,9 +5820,14 @@ static int run_container( (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, pid); /* Retrieve the kmsg fifo allocated by inner child */ - fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0); - if (fd_kmsg_fifo < 0) - return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m"); + r = container_in_userns(); + if (r < 0) + return r; + if (r == 0) { + fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0); + if (fd_kmsg_fifo < 0) + 
return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m"); + } if (arg_expose_ports) { r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);