From 5070e786ca67d2a91f54f35ac7c7124c0a6ab1c8 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan@amutable.com>
Date: Tue, 24 Feb 2026 22:02:23 +0100
Subject: [PATCH] nspawn: Stop overmounting /sys and /proc when a user
 namespace is used

When the container runs in a user namespace, we don't need to protect
/proc and /sys by overmounting parts of them. In fact this is actively
harmful, as it prevents nested systemd-nspawn from working: to mount
procfs and sysfs in a container, the existing instances must not be
overmounted, or the kernel will refuse the mount.

To make nesting possible, let's stop overmounting parts of /proc and /sys
when user namespaces are in use.
---
 src/nspawn/nspawn-mount.c | 25 ++++++++++++++++------
 src/nspawn/nspawn.c       | 44 +++++++++++++++++++++++++++++++++------
 2 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 282a29c359f70..53e2544e5491a 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -440,6 +440,7 @@ int tmpfs_patch_options(
 int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
         _cleanup_free_ char *top = NULL, *full = NULL;;
         unsigned long extra_flags = 0;
+        bool is_mount_point;
         int r;
 
         top = path_join(dest, "/sys");
@@ -449,12 +450,9 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
         r = path_is_mount_point(top);
         if (r < 0)
                 return log_error_errno(r, "Failed to determine if '%s' is a mountpoint: %m", top);
-        if (r == 0) {
-                /* If this is not a mount point yet, then mount a tmpfs there */
-                r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS);
-                if (r < 0)
-                        return r;
-        } else {
+        is_mount_point = r > 0;
+
+        if (is_mount_point) {
                 r = path_is_fs_type(top, SYSFS_MAGIC);
                 if (r < 0)
                         return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
@@ -467,6 +465,21 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
                         return 0;
         }
 
+        /* When running in a user namespace, to enable mounting sysfs in nested containers, we cannot
+         * overmount it, so we mount it as is. While the user namespace won't be able to write to sysfs, we
+         * still have to mount it read-only as that's part of the container interface and various units
+         * conditionalize themselves based on whether /sys is mounted read-only or not. */
+        if (!FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO))
+                return mount_nofollow_verbose(LOG_ERR, "sysfs", top, "sysfs",
+                                              MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+
+        if (!is_mount_point) {
+                /* If this is not a mount point yet, then mount a tmpfs there */
+                r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS);
+                if (r < 0)
+                        return r;
+        }
+
         full = path_join(top, "/full");
         if (!full)
                 return log_oom();
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 1b3aa7d1ad50a..3745f83b71d9b 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -1642,6 +1642,19 @@ static int parse_argv(int argc, char *argv[]) {
         return 1;
 }
 
+static int container_in_userns(void) {
+        int r;
+        /* Returns > 0 if the container runs in a user namespace, 0 if not, negative error (logged) on failure. */
+        if (arg_userns_mode != USER_NAMESPACE_NO)
+                return true; /* nspawn allocates a user namespace for the container itself */
+        /* Otherwise, check whether nspawn was invoked from within an existing user namespace. */
+        r = namespace_is_init(NAMESPACE_USER);
+        if (r < 0 && !IN_SET(r, -EBADR, -ENOSYS)) /* tolerated errors fall through and yield 0 below */
+                return log_error_errno(r, "Failed to check if in initial user namespace: %m");
+
+        return r == 0; /* a zero result is taken to mean "not the initial user namespace" */
+}
+
 static int verify_arguments(void) {
         int r;
 
@@ -1654,6 +1667,15 @@ static int verify_arguments(void) {
 
         SET_FLAG(arg_mount_settings, MOUNT_USE_USERNS, arg_userns_mode != USER_NAMESPACE_NO);
 
+        /* When running in a user namespace the kernel will protect procfs/sysfs for us, so there's no need
+         * to mount them read-only or mask individual files. This applies both when we allocate a user
+         * namespace ourselves, and when nspawn is invoked from within an existing user namespace. */
+        r = container_in_userns();
+        if (r < 0)
+                return r;
+        if (r > 0)
+                arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
+
         if (arg_private_network)
                 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, arg_private_network);
 
@@ -1735,9 +1757,6 @@ static int verify_arguments(void) {
         if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
 
-        if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
-                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write API VFS mounts.");
-
         if (arg_expose_ports && !arg_private_network)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
 
@@ -2130,6 +2149,10 @@ static int setup_boot_id(void) {
         const char *to;
         int r;
 
+        r = container_in_userns();
+        if (r != 0)
+                return r;
+
         /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
 
         r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
@@ -2539,6 +2562,10 @@ static int setup_kmsg(int fd_inner_socket) {
 
         assert(fd_inner_socket >= 0);
 
+        r = container_in_userns();
+        if (r != 0)
+                return r;
+
         BLOCK_WITH_UMASK(0000);
 
         /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
@@ -5793,9 +5820,14 @@ static int run_container(
         (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, pid);
 
         /* Retrieve the kmsg fifo allocated by inner child */
-        fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
-        if (fd_kmsg_fifo < 0)
-                return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
+        r = container_in_userns();
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
+                if (fd_kmsg_fifo < 0)
+                        return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
+        }
 
         if (arg_expose_ports) {
                 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);