From 2671c4670ce4ef2b808169df346596245cc7993f Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Sat, 7 Mar 2026 21:13:06 +0100 Subject: [PATCH 1/7] test-mempress: Migrate to new assertion macros --- src/test/test-mempress.c | 164 +++++++++++++++++++-------------------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/src/test/test-mempress.c b/src/test/test-mempress.c index 817eaa421f1ad..6d6bc0b481a95 100644 --- a/src/test/test-mempress.c +++ b/src/test/test-mempress.c @@ -39,16 +39,16 @@ static void *fake_pressure_thread(void *p) { usleep_safe(150); - assert_se(write(c->fifo_fd, &(const char) { 'x' }, 1) == 1); + ASSERT_EQ(write(c->fifo_fd, &(const char) { 'x' }, 1), 1); usleep_safe(150); cfd = accept4(c->socket_fd, NULL, NULL, SOCK_CLOEXEC); - assert_se(cfd >= 0); + ASSERT_OK_ERRNO(cfd); char buf[STRLEN("hello")+1] = {}; - assert_se(read(cfd, buf, sizeof(buf)-1) == sizeof(buf)-1); + ASSERT_EQ(read(cfd, buf, sizeof(buf)-1), (ssize_t) (sizeof(buf)-1)); ASSERT_STREQ(buf, "hello"); - assert_se(write(cfd, &(const char) { 'z' }, 1) == 1); + ASSERT_EQ(write(cfd, &(const char) { 'z' }, 1), 1); return NULL; } @@ -57,15 +57,15 @@ static int fake_pressure_callback(sd_event_source *s, void *userdata) { int *value = userdata; const char *d; - assert_se(s); - assert_se(sd_event_source_get_description(s, &d) >= 0); + ASSERT_NOT_NULL(s); + ASSERT_OK(sd_event_source_get_description(s, &d)); *value *= d[0]; log_notice("memory pressure event: %s", d); if (*value == 7 * 'f' * 's') - assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0); + ASSERT_OK(sd_event_exit(sd_event_source_get_event(s), 0)); return 0; } @@ -80,50 +80,50 @@ TEST(fake_pressure) { pthread_t th; int value = 7; - assert_se(sd_event_default(&e) >= 0); + ASSERT_OK(sd_event_default(&e)); - assert_se(mkdtemp_malloc(NULL, &tmp) >= 0); + ASSERT_OK(mkdtemp_malloc(NULL, &tmp)); - assert_se(j = path_join(tmp, "fifo")); - assert_se(mkfifo(j, 0600) >= 0); + ASSERT_NOT_NULL(j = path_join(tmp, "fifo")); 
+ ASSERT_OK_ERRNO(mkfifo(j, 0600)); fifo_fd = open(j, O_CLOEXEC|O_RDWR|O_NONBLOCK); - assert_se(fifo_fd >= 0); + ASSERT_OK_ERRNO(fifo_fd); - assert_se(k = path_join(tmp, "sock")); + ASSERT_NOT_NULL(k = path_join(tmp, "sock")); socket_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); - assert_se(socket_fd >= 0); - assert_se(sockaddr_un_set_path(&sa.un, k) >= 0); - assert_se(bind(socket_fd, &sa.sa, sockaddr_un_len(&sa.un)) >= 0); - assert_se(listen(socket_fd, 1) >= 0); + ASSERT_OK_ERRNO(socket_fd); + ASSERT_OK(sockaddr_un_set_path(&sa.un, k)); + ASSERT_OK_ERRNO(bind(socket_fd, &sa.sa, sockaddr_un_len(&sa.un))); + ASSERT_OK_ERRNO(listen(socket_fd, 1)); /* Ideally we'd just allocate this on the stack, but AddressSanitizer doesn't like it if threads * access each other's stack */ struct fake_pressure_context *fp = new(struct fake_pressure_context, 1); - assert_se(fp); + ASSERT_NOT_NULL(fp); *fp = (struct fake_pressure_context) { .fifo_fd = fifo_fd, .socket_fd = socket_fd, }; - assert_se(pthread_create(&th, NULL, fake_pressure_thread, TAKE_PTR(fp)) == 0); + ASSERT_EQ(pthread_create(&th, NULL, fake_pressure_thread, TAKE_PTR(fp)), 0); - assert_se(setenv("MEMORY_PRESSURE_WATCH", j, /* override= */ true) >= 0); - assert_se(unsetenv("MEMORY_PRESSURE_WRITE") >= 0); + ASSERT_OK_ERRNO(setenv("MEMORY_PRESSURE_WATCH", j, /* override= */ true)); + ASSERT_OK_ERRNO(unsetenv("MEMORY_PRESSURE_WRITE")); - assert_se(sd_event_add_memory_pressure(e, &es, fake_pressure_callback, &value) >= 0); - assert_se(sd_event_source_set_description(es, "fifo event source") >= 0); + ASSERT_OK(sd_event_add_memory_pressure(e, &es, fake_pressure_callback, &value)); + ASSERT_OK(sd_event_source_set_description(es, "fifo event source")); - assert_se(setenv("MEMORY_PRESSURE_WATCH", k, /* override= */ true) >= 0); - assert_se(setenv("MEMORY_PRESSURE_WRITE", "aGVsbG8K", /* override= */ true) >= 0); + ASSERT_OK_ERRNO(setenv("MEMORY_PRESSURE_WATCH", k, /* override= */ true)); + 
ASSERT_OK_ERRNO(setenv("MEMORY_PRESSURE_WRITE", "aGVsbG8K", /* override= */ true)); - assert_se(sd_event_add_memory_pressure(e, &ef, fake_pressure_callback, &value) >= 0); - assert_se(sd_event_source_set_description(ef, "socket event source") >= 0); + ASSERT_OK(sd_event_add_memory_pressure(e, &ef, fake_pressure_callback, &value)); + ASSERT_OK(sd_event_source_set_description(ef, "socket event source")); - assert_se(sd_event_loop(e) >= 0); + ASSERT_OK(sd_event_loop(e)); - assert_se(value == 7 * 'f' * 's'); + ASSERT_EQ(value, 7 * 'f' * 's'); - assert_se(pthread_join(th, NULL) == 0); + ASSERT_EQ(pthread_join(th, NULL), 0); } struct real_pressure_context { @@ -134,15 +134,15 @@ static int real_pressure_callback(sd_event_source *s, void *userdata) { struct real_pressure_context *c = ASSERT_PTR(userdata); const char *d; - assert_se(s); - assert_se(sd_event_source_get_description(s, &d) >= 0); + ASSERT_NOT_NULL(s); + ASSERT_OK(sd_event_source_get_description(s, &d)); log_notice("real_memory pressure event: %s", d); sd_event_trim_memory(); - assert_se(c->pid); - assert_se(sd_event_source_send_child_signal(c->pid, SIGKILL, NULL, 0) >= 0); + ASSERT_NOT_NULL(c->pid); + ASSERT_OK(sd_event_source_send_child_signal(c->pid, SIGKILL, NULL, 0)); c->pid = NULL; return 0; @@ -156,13 +156,13 @@ _noreturn_ static void real_pressure_eat_memory(int pipe_fd) { /* Allocates and touches 10M at a time, until runs out of memory */ char x; - assert_se(read(pipe_fd, &x, 1) == 1); /* Wait for the GO! */ + ASSERT_EQ(read(pipe_fd, &x, 1), 1); /* Wait for the GO! 
*/ for (;;) { void *p; p = mmap(NULL, MMAP_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - assert_se(p != MAP_FAILED); + ASSERT_TRUE(p != MAP_FAILED); log_info("Eating another %s.", FORMAT_BYTES(MMAP_SIZE)); @@ -176,16 +176,16 @@ _noreturn_ static void real_pressure_eat_memory(int pipe_fd) { } static int real_pressure_child_callback(sd_event_source *s, const siginfo_t *si, void *userdata) { - assert_se(s); - assert_se(si); + ASSERT_NOT_NULL(s); + ASSERT_NOT_NULL(si); log_notice("child dead"); - assert_se(si->si_signo == SIGCHLD); - assert_se(si->si_status == SIGKILL); - assert_se(si->si_code == CLD_KILLED); + ASSERT_EQ(si->si_signo, SIGCHLD); + ASSERT_EQ(si->si_status, SIGKILL); + ASSERT_EQ(si->si_code, CLD_KILLED); - assert_se(sd_event_exit(sd_event_source_get_event(s), 31) >= 0); + ASSERT_OK(sd_event_exit(sd_event_source_get_event(s), 31)); return 0; } @@ -205,42 +205,42 @@ TEST(real_pressure) { if (r < 0) return (void) log_tests_skipped_errno(r, "can't connect to system bus"); - assert_se(bus_wait_for_jobs_new(bus, &w) >= 0); + ASSERT_OK(bus_wait_for_jobs_new(bus, &w)); - assert_se(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit") >= 0); - assert_se(asprintf(&scope, "test-%" PRIu64 ".scope", random_u64()) >= 0); - assert_se(sd_bus_message_append(m, "ss", scope, "fail") >= 0); - assert_se(sd_bus_message_open_container(m, 'a', "(sv)") >= 0); - assert_se(sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, 0) >= 0); - assert_se(sd_bus_message_append(m, "(sv)", "MemoryAccounting", "b", true) >= 0); - assert_se(sd_bus_message_close_container(m) >= 0); - assert_se(sd_bus_message_append(m, "a(sa(sv))", 0) >= 0); + ASSERT_OK(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit")); + ASSERT_OK(asprintf(&scope, "test-%" PRIu64 ".scope", random_u64())); + ASSERT_OK(sd_bus_message_append(m, "ss", scope, "fail")); + ASSERT_OK(sd_bus_message_open_container(m, 'a', "(sv)")); + ASSERT_OK(sd_bus_message_append(m, 
"(sv)", "PIDs", "au", 1, 0)); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "MemoryAccounting", "b", true)); + ASSERT_OK(sd_bus_message_close_container(m)); + ASSERT_OK(sd_bus_message_append(m, "a(sa(sv))", 0)); r = sd_bus_call(bus, m, 0, &error, &reply); if (r < 0) return (void) log_tests_skipped_errno(r, "can't issue transient unit call"); - assert_se(sd_bus_message_read(reply, "o", &object) >= 0); + ASSERT_OK(sd_bus_message_read(reply, "o", &object)); - assert_se(bus_wait_for_jobs_one(w, object, /* flags= */ BUS_WAIT_JOBS_LOG_ERROR, /* extra_args= */ NULL) >= 0); + ASSERT_OK(bus_wait_for_jobs_one(w, object, /* flags= */ BUS_WAIT_JOBS_LOG_ERROR, /* extra_args= */ NULL)); - assert_se(sd_event_default(&e) >= 0); + ASSERT_OK(sd_event_default(&e)); - assert_se(pipe2(pipe_fd, O_CLOEXEC) >= 0); + ASSERT_OK_ERRNO(pipe2(pipe_fd, O_CLOEXEC)); _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; r = pidref_safe_fork("(eat-memory)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM, &pidref); - assert_se(r >= 0); + ASSERT_OK(r); if (r == 0) { real_pressure_eat_memory(pipe_fd[0]); _exit(EXIT_SUCCESS); } - assert_se(event_add_child_pidref(e, &cs, &pidref, WEXITED, real_pressure_child_callback, NULL) >= 0); - assert_se(sd_event_source_set_child_process_own(cs, true) >= 0); + ASSERT_OK(event_add_child_pidref(e, &cs, &pidref, WEXITED, real_pressure_child_callback, NULL)); + ASSERT_OK(sd_event_source_set_child_process_own(cs, true)); - assert_se(unsetenv("MEMORY_PRESSURE_WATCH") >= 0); - assert_se(unsetenv("MEMORY_PRESSURE_WRITE") >= 0); + ASSERT_OK_ERRNO(unsetenv("MEMORY_PRESSURE_WATCH")); + ASSERT_OK_ERRNO(unsetenv("MEMORY_PRESSURE_WRITE")); struct real_pressure_context context = { .pid = cs, @@ -250,21 +250,21 @@ TEST(real_pressure) { if (r < 0) return (void) log_tests_skipped_errno(r, "can't allocate memory pressure fd"); - assert_se(sd_event_source_set_description(es, "real pressure event source") >= 0); - assert_se(sd_event_source_set_memory_pressure_type(es, "some") == 0); - 
assert_se(sd_event_source_set_memory_pressure_type(es, "full") > 0); - assert_se(sd_event_source_set_memory_pressure_type(es, "full") == 0); - assert_se(sd_event_source_set_memory_pressure_type(es, "some") > 0); - assert_se(sd_event_source_set_memory_pressure_type(es, "some") == 0); - assert_se(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC) > 0); - assert_se(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC) == 0); - assert_se(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT) >= 0); + ASSERT_OK(sd_event_source_set_description(es, "real pressure event source")); + ASSERT_OK_ZERO(sd_event_source_set_memory_pressure_type(es, "some")); + ASSERT_OK_POSITIVE(sd_event_source_set_memory_pressure_type(es, "full")); + ASSERT_OK_ZERO(sd_event_source_set_memory_pressure_type(es, "full")); + ASSERT_OK_POSITIVE(sd_event_source_set_memory_pressure_type(es, "some")); + ASSERT_OK_ZERO(sd_event_source_set_memory_pressure_type(es, "some")); + ASSERT_OK_POSITIVE(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC)); + ASSERT_OK_ZERO(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC)); + ASSERT_OK(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT)); _cleanup_free_ char *uo = NULL; - assert_se(uo = unit_dbus_path_from_name(scope)); + ASSERT_NOT_NULL(uo = unit_dbus_path_from_name(scope)); uint64_t mcurrent = UINT64_MAX; - assert_se(sd_bus_get_property_trivial(bus, "org.freedesktop.systemd1", uo, "org.freedesktop.systemd1.Scope", "MemoryCurrent", &error, 't', &mcurrent) >= 0); + ASSERT_OK(sd_bus_get_property_trivial(bus, "org.freedesktop.systemd1", uo, "org.freedesktop.systemd1.Scope", "MemoryCurrent", &error, 't', &mcurrent)); printf("current: %" PRIu64 "\n", mcurrent); if (mcurrent == UINT64_MAX) @@ -272,14 +272,14 @@ TEST(real_pressure) { m = sd_bus_message_unref(m); - assert_se(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetUnitProperties") >= 0); - 
assert_se(sd_bus_message_append(m, "sb", scope, true) >= 0); - assert_se(sd_bus_message_open_container(m, 'a', "(sv)") >= 0); - assert_se(sd_bus_message_append(m, "(sv)", "MemoryHigh", "t", mcurrent + (15 * 1024 * 1024)) >= 0); - assert_se(sd_bus_message_append(m, "(sv)", "MemoryMax", "t", mcurrent + (50 * 1024 * 1024)) >= 0); - assert_se(sd_bus_message_close_container(m) >= 0); + ASSERT_OK(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetUnitProperties")); + ASSERT_OK(sd_bus_message_append(m, "sb", scope, true)); + ASSERT_OK(sd_bus_message_open_container(m, 'a', "(sv)")); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "MemoryHigh", "t", mcurrent + (15 * 1024 * 1024))); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "MemoryMax", "t", mcurrent + (50 * 1024 * 1024))); + ASSERT_OK(sd_bus_message_close_container(m)); - assert_se(sd_bus_call(bus, m, 0, NULL, NULL) >= 0); + ASSERT_OK(sd_bus_call(bus, m, 0, NULL, NULL)); /* Generate some memory allocations via mempool */ #define NN (1024) @@ -291,12 +291,12 @@ TEST(real_pressure) { free(h); /* Now start eating memory */ - assert_se(write(pipe_fd[1], &(const char) { 'x' }, 1) == 1); + ASSERT_EQ(write(pipe_fd[1], &(const char) { 'x' }, 1), 1); - assert_se(sd_event_loop(e) >= 0); + ASSERT_OK(sd_event_loop(e)); int ex = 0; - assert_se(sd_event_get_exit_code(e, &ex) >= 0); - assert_se(ex == 31); + ASSERT_OK(sd_event_get_exit_code(e, &ex)); + ASSERT_EQ(ex, 31); } static int outro(void) { From 83c857bb9ba96ed36477e838e71ea4deadad277c Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Sat, 7 Mar 2026 21:25:09 +0100 Subject: [PATCH 2/7] test-mempress: Support unprivileged operation --- src/test/test-mempress.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/test/test-mempress.c b/src/test/test-mempress.c index 6d6bc0b481a95..5ec372591e0f3 100644 --- a/src/test/test-mempress.c +++ b/src/test/test-mempress.c @@ -201,9 +201,12 @@ TEST(real_pressure) { const char *object; int r; - r = 
sd_bus_open_system(&bus); + if (getuid() == 0) + r = sd_bus_open_system(&bus); + else + r = sd_bus_open_user(&bus); if (r < 0) - return (void) log_tests_skipped_errno(r, "can't connect to system bus"); + return (void) log_tests_skipped_errno(r, "can't connect to bus"); ASSERT_OK(bus_wait_for_jobs_new(bus, &w)); @@ -256,8 +259,9 @@ TEST(real_pressure) { ASSERT_OK_ZERO(sd_event_source_set_memory_pressure_type(es, "full")); ASSERT_OK_POSITIVE(sd_event_source_set_memory_pressure_type(es, "some")); ASSERT_OK_ZERO(sd_event_source_set_memory_pressure_type(es, "some")); - ASSERT_OK_POSITIVE(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC)); - ASSERT_OK_ZERO(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC)); + /* Unprivileged writes require a minimum of 2s otherwise the kernel will refuse the write. */ + ASSERT_OK_POSITIVE(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC)); + ASSERT_OK_ZERO(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC)); ASSERT_OK(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT)); _cleanup_free_ char *uo = NULL; From 3faad3d5165b45bbf6298fd6bae21452d903d5d9 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Sat, 7 Mar 2026 23:19:34 +0100 Subject: [PATCH 3/7] sd-event: Add support for cpu pressure notifications --- man/rules/meson.build | 5 +- man/sd_event_add_memory_pressure.xml | 86 ++++- src/basic/psi-util.c | 21 ++ src/basic/psi-util.h | 26 +- src/core/exec-invoke.c | 8 +- src/libsystemd/libsystemd.sym | 7 + src/libsystemd/sd-event/event-source.h | 3 +- src/libsystemd/sd-event/sd-event.c | 334 +++++++++++------- src/systemd/sd-event.h | 3 + src/test/meson.build | 2 +- src/test/{test-mempress.c => test-pressure.c} | 208 +++++++++-- 11 files changed, 524 insertions(+), 179 deletions(-) rename src/test/{test-mempress.c => test-pressure.c} (59%) diff --git a/man/rules/meson.build b/man/rules/meson.build index 
d2d26abe5da31..ee20ace35f3e2 100644 --- a/man/rules/meson.build +++ b/man/rules/meson.build @@ -608,7 +608,10 @@ manpages = [ ''], ['sd_event_add_memory_pressure', '3', - ['sd_event_source_set_memory_pressure_period', + ['sd_event_add_cpu_pressure', + 'sd_event_source_set_cpu_pressure_period', + 'sd_event_source_set_cpu_pressure_type', + 'sd_event_source_set_memory_pressure_period', 'sd_event_source_set_memory_pressure_type', 'sd_event_trim_memory'], ''], diff --git a/man/sd_event_add_memory_pressure.xml b/man/sd_event_add_memory_pressure.xml index b112855f061b0..1e6b734738f6c 100644 --- a/man/sd_event_add_memory_pressure.xml +++ b/man/sd_event_add_memory_pressure.xml @@ -21,7 +21,11 @@ sd_event_source_set_memory_pressure_period sd_event_trim_memory - Add and configure an event source run as result of memory pressure + sd_event_add_cpu_pressure + sd_event_source_set_cpu_pressure_type + sd_event_source_set_cpu_pressure_period + + Add and configure an event source run as result of memory or CPU pressure @@ -51,6 +55,27 @@ uint64_t window_usec + + int sd_event_add_cpu_pressure + sd_event *event + sd_event_source **ret_source + sd_event_handler_t handler + void *userdata + + + + int sd_event_source_set_cpu_pressure_type + sd_event_source *source + const char *type + + + + int sd_event_source_set_cpu_pressure_period + sd_event_source *source + uint64_t threshold_usec + uint64_t window_usec + + int sd_event_trim_memory void @@ -62,12 +87,14 @@ Description sd_event_add_memory_pressure() adds a new event source that is triggered - whenever memory pressure is seen. This functionality is built around the Linux kernel's sd_event_add_cpu_pressure() adds a new event source that is triggered whenever CPU + pressure is seen. This functionality is built around the Linux kernel's Pressure Stall Information (PSI) logic. - Expects an event loop object as first parameter, and returns the allocated event source object in - the second parameter, on success. 
The handler parameter is a function to call when - memory pressure is seen, or NULL. The handler function will be passed the + Both functions expect an event loop object as first parameter, and return the allocated event source + object in the second parameter, on success. The handler parameter is a function to + call when pressure is seen, or NULL. The handler function will be passed the userdata pointer, which may be chosen freely by the caller. The handler may return negative to signal an error (see below), other return values are ignored. If handler is NULL, a default handler that compacts allocation @@ -83,12 +110,13 @@ sd_event_source_set_enabled3 with SD_EVENT_OFF. - If the second parameter of sd_event_add_memory_pressure() is + If the second parameter of sd_event_add_memory_pressure() or + sd_event_add_cpu_pressure() is NULL no reference to the event source object is returned. In this case, the event source is considered "floating", and will be destroyed implicitly when the event loop itself is destroyed. - The event source will fire according to the following logic: + The memory pressure event source will fire according to the following logic: If the @@ -111,6 +139,13 @@ /proc/pressure/memory is watched instead. + The CPU pressure event source follows the same logic, but uses the + $CPU_PRESSURE_WATCH/$CPU_PRESSURE_WRITE environment variables, + the cpu.pressure cgroup file, and the system-wide PSI interface file + /proc/pressure/cpu instead. Note that /proc/pressure/cpu only + provides the some line, not the full line, so only + some is valid when watching at the system level. + Or in other words: preferably any explicit configuration passed in by an invoking service manager (or similar) is used as notification source, before falling back to local notifications of the service, and finally to global notifications of the system. 
@@ -143,7 +178,7 @@ The sd_event_source_set_memory_pressure_type() and sd_event_source_set_memory_pressure_period() functions can be used to fine-tune the - PSI parameters for pressure notifications. The former takes either some, + PSI parameters for memory pressure notifications. The former takes either some, full as second parameter, the latter takes threshold and period times in microseconds as parameters. For details about these three parameters see the PSI documentation. Note that these two calls must be invoked immediately after allocating the event source, as they must be configured before @@ -152,6 +187,16 @@ environment variables (or in other words: configuration supplied by a service manager wins over internal settings). + Similarly, sd_event_source_set_cpu_pressure_type() and + sd_event_source_set_cpu_pressure_period() can be used to fine-tune the PSI + parameters for CPU pressure notifications. They work identically to their memory pressure counterparts. + The type parameter takes either some or full, and the period + function takes threshold and period times in microseconds. The same constraints apply: these calls must + be invoked immediately after allocating the event source, and will fail if CPU pressure parameterization + has been passed in via the + $CPU_PRESSURE_WATCH/$CPU_PRESSURE_WRITE environment + variables. + The sd_event_trim_memory() function releases various internal allocation caches maintained by libsystemd and then invokes glibc's malloc_trim3. This @@ -197,8 +242,9 @@ -EHOSTDOWN - The $MEMORY_PRESSURE_WATCH variable has been set to the literal - string /dev/null, in order to explicitly disable memory pressure + The $MEMORY_PRESSURE_WATCH or + $CPU_PRESSURE_WATCH variable has been set to the literal + string /dev/null, in order to explicitly disable pressure handling. 
@@ -207,7 +253,8 @@ -EBADMSG - The $MEMORY_PRESSURE_WATCH variable has been set to an invalid + The $MEMORY_PRESSURE_WATCH or + $CPU_PRESSURE_WATCH variable has been set to an invalid string, for example a relative rather than an absolute path. @@ -216,7 +263,8 @@ -ENOTTY - The $MEMORY_PRESSURE_WATCH variable points to a regular file + The $MEMORY_PRESSURE_WATCH or + $CPU_PRESSURE_WATCH variable points to a regular file outside of the procfs or cgroupfs file systems. @@ -225,8 +273,9 @@ -EOPNOTSUPP - No configuration via $MEMORY_PRESSURE_WATCH has been specified - and the local kernel does not support the PSI interface. + No configuration via $MEMORY_PRESSURE_WATCH or + $CPU_PRESSURE_WATCH has been specified and the local kernel does not support the + PSI interface. @@ -234,8 +283,10 @@ -EBUSY - This is returned by sd_event_source_set_memory_pressure_type() - and sd_event_source_set_memory_pressure_period() if invoked on event sources + This is returned by sd_event_source_set_memory_pressure_type(), + sd_event_source_set_memory_pressure_period(), + sd_event_source_set_cpu_pressure_type(), + and sd_event_source_set_cpu_pressure_period() if invoked on event sources at a time later than immediately after allocating them. @@ -277,6 +328,9 @@ sd_event_source_set_memory_pressure_type(), sd_event_source_set_memory_pressure_period(), and sd_event_trim_memory() were added in version 254. + sd_event_add_cpu_pressure(), + sd_event_source_set_cpu_pressure_type(), and + sd_event_source_set_cpu_pressure_period() were added in version 261. 
diff --git a/src/basic/psi-util.c b/src/basic/psi-util.c index df1ccbc1b20fb..cf05485dc7b67 100644 --- a/src/basic/psi-util.c +++ b/src/basic/psi-util.c @@ -10,6 +10,7 @@ #include "fileio.h" #include "parse-util.h" #include "psi-util.h" +#include "string-table.h" #include "string-util.h" #include "strv.h" @@ -104,6 +105,26 @@ int read_resource_pressure(const char *path, PressureType type, ResourcePressure return 0; } +const PressureResourceInfo pressure_resource_info[_PRESSURE_RESOURCE_MAX] = { + [PRESSURE_MEMORY] = { + .name = "memory", + .env_watch = "MEMORY_PRESSURE_WATCH", + .env_write = "MEMORY_PRESSURE_WRITE", + }, + [PRESSURE_CPU] = { + .name = "cpu", + .env_watch = "CPU_PRESSURE_WATCH", + .env_write = "CPU_PRESSURE_WRITE", + }, +}; + +static const char* const pressure_resource_table[_PRESSURE_RESOURCE_MAX] = { + [PRESSURE_MEMORY] = "memory", + [PRESSURE_CPU] = "cpu", +}; + +DEFINE_STRING_TABLE_LOOKUP(pressure_resource, PressureResource); + int is_pressure_supported(void) { static thread_local int cached = -1; int r; diff --git a/src/basic/psi-util.h b/src/basic/psi-util.h index f5e79960a8159..aed74ef742d5a 100644 --- a/src/basic/psi-util.h +++ b/src/basic/psi-util.h @@ -9,6 +9,13 @@ typedef enum PressureType { PRESSURE_TYPE_FULL, } PressureType; +typedef enum PressureResource { + PRESSURE_MEMORY, + PRESSURE_CPU, + _PRESSURE_RESOURCE_MAX, + _PRESSURE_RESOURCE_INVALID = -EINVAL, +} PressureResource; + /* Averages are stored in fixed-point with 11 bit fractions */ typedef struct ResourcePressure { loadavg_t avg10; @@ -27,7 +34,18 @@ int read_resource_pressure(const char *path, PressureType type, ResourcePressure /* Was the kernel compiled with CONFIG_PSI=y? 1 if yes, 0 if not, negative on error. 
*/ int is_pressure_supported(void); -/* Default parameters for memory pressure watch logic in sd-event and PID 1 */ -#define MEMORY_PRESSURE_DEFAULT_TYPE "some" -#define MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC (200 * USEC_PER_MSEC) -#define MEMORY_PRESSURE_DEFAULT_WINDOW_USEC (2 * USEC_PER_SEC) +/* Metadata for each pressure resource type, for use in sd-event and PID 1 */ +typedef struct PressureResourceInfo { + const char *name; /* "memory", "cpu", "io" */ + const char *env_watch; /* "MEMORY_PRESSURE_WATCH", etc. */ + const char *env_write; /* "MEMORY_PRESSURE_WRITE", etc. */ +} PressureResourceInfo; + +extern const PressureResourceInfo pressure_resource_info[_PRESSURE_RESOURCE_MAX]; + +DECLARE_STRING_TABLE_LOOKUP(pressure_resource, PressureResource); + +/* Default parameters for pressure watch logic in sd-event and PID 1 */ +#define PRESSURE_DEFAULT_TYPE "some" +#define PRESSURE_DEFAULT_THRESHOLD_USEC (200 * USEC_PER_MSEC) +#define PRESSURE_DEFAULT_WINDOW_USEC (2 * USEC_PER_SEC) diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index b91a964cdd6ab..7500888c414a3 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2224,10 +2224,10 @@ static int build_environment( _cleanup_free_ char *b = NULL, *x = NULL; if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT, - MEMORY_PRESSURE_DEFAULT_TYPE, - cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC : - CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC), - MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0) + PRESSURE_DEFAULT_TYPE, + cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? 
PRESSURE_DEFAULT_THRESHOLD_USEC : + CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, PRESSURE_DEFAULT_WINDOW_USEC), + PRESSURE_DEFAULT_WINDOW_USEC) < 0) return -ENOMEM; if (base64mem(b, strlen(b) + 1, &x) < 0) diff --git a/src/libsystemd/libsystemd.sym b/src/libsystemd/libsystemd.sym index aa270a483a40e..244040c77e086 100644 --- a/src/libsystemd/libsystemd.sym +++ b/src/libsystemd/libsystemd.sym @@ -1090,3 +1090,10 @@ LIBSYSTEMD_260 { global: sd_session_get_extra_device_access; } LIBSYSTEMD_259; + +LIBSYSTEMD_261 { +global: + sd_event_add_cpu_pressure; + sd_event_source_set_cpu_pressure_type; + sd_event_source_set_cpu_pressure_period; +} LIBSYSTEMD_260; diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h index e4dc456fae8ea..c7d5ba166da31 100644 --- a/src/libsystemd/sd-event/event-source.h +++ b/src/libsystemd/sd-event/event-source.h @@ -26,6 +26,7 @@ typedef enum EventSourceType { SOURCE_WATCHDOG, SOURCE_INOTIFY, SOURCE_MEMORY_PRESSURE, + SOURCE_CPU_PRESSURE, _SOURCE_EVENT_SOURCE_TYPE_MAX, _SOURCE_EVENT_SOURCE_TYPE_INVALID = -EINVAL, } EventSourceType; @@ -144,7 +145,7 @@ struct sd_event_source { size_t write_buffer_size; uint32_t events, revents; LIST_FIELDS(sd_event_source, write_list); - } memory_pressure; + } pressure; }; }; diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index b78cfe86fa40e..c43f0f9e716e8 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -76,6 +76,7 @@ static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] [SOURCE_WATCHDOG] = "watchdog", [SOURCE_INOTIFY] = "inotify", [SOURCE_MEMORY_PRESSURE] = "memory-pressure", + [SOURCE_CPU_PRESSURE] = "cpu-pressure", }; DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int); @@ -99,7 +100,8 @@ DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int); SOURCE_SIGNAL, \ SOURCE_DEFER, \ SOURCE_INOTIFY, \ - SOURCE_MEMORY_PRESSURE) + 
SOURCE_MEMORY_PRESSURE, \ + SOURCE_CPU_PRESSURE) /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put(). * Time sources and ratelimited sources can be passed, so effectively this is the same as the @@ -144,8 +146,8 @@ struct sd_event { /* A list of inotify objects that already have events buffered which aren't processed yet */ LIST_HEAD(InotifyData, buffered_inotify_data_list); - /* A list of memory pressure event sources that still need their subscription string written */ - LIST_HEAD(sd_event_source, memory_pressure_write_list); + /* A list of pressure event sources that still need their subscription string written */ + LIST_HEAD(sd_event_source, pressure_write_list); uint64_t origin_id; @@ -564,63 +566,65 @@ static int source_child_pidfd_register(sd_event_source *s, int enabled) { return 0; } -static void source_memory_pressure_unregister(sd_event_source *s) { +#define EVENT_SOURCE_IS_PRESSURE(s) IN_SET((s)->type, SOURCE_MEMORY_PRESSURE, SOURCE_CPU_PRESSURE) + +static void source_pressure_unregister(sd_event_source *s) { assert(s); - assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(EVENT_SOURCE_IS_PRESSURE(s)); if (event_origin_changed(s->event)) return; - if (!s->memory_pressure.registered) + if (!s->pressure.registered) return; - if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0) + if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->pressure.fd, NULL) < 0) log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m", strna(s->description), event_source_type_to_string(s->type)); - s->memory_pressure.registered = false; + s->pressure.registered = false; } -static int source_memory_pressure_register(sd_event_source *s, int enabled) { +static int source_pressure_register(sd_event_source *s, int enabled) { assert(s); - assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(EVENT_SOURCE_IS_PRESSURE(s)); assert(enabled != SD_EVENT_OFF); struct epoll_event ev = 
{ - .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT : - (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)), + .events = s->pressure.write_buffer_size > 0 ? EPOLLOUT : + (s->pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)), .data.ptr = s, }; if (epoll_ctl(s->event->epoll_fd, - s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, - s->memory_pressure.fd, &ev) < 0) + s->pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, + s->pressure.fd, &ev) < 0) return -errno; - s->memory_pressure.registered = true; + s->pressure.registered = true; return 0; } -static void source_memory_pressure_add_to_write_list(sd_event_source *s) { +static void source_pressure_add_to_write_list(sd_event_source *s) { assert(s); - assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(EVENT_SOURCE_IS_PRESSURE(s)); - if (s->memory_pressure.in_write_list) + if (s->pressure.in_write_list) return; - LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s); - s->memory_pressure.in_write_list = true; + LIST_PREPEND(pressure.write_list, s->event->pressure_write_list, s); + s->pressure.in_write_list = true; } -static void source_memory_pressure_remove_from_write_list(sd_event_source *s) { +static void source_pressure_remove_from_write_list(sd_event_source *s) { assert(s); - assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(EVENT_SOURCE_IS_PRESSURE(s)); - if (!s->memory_pressure.in_write_list) + if (!s->pressure.in_write_list) return; - LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s); - s->memory_pressure.in_write_list = false; + LIST_REMOVE(pressure.write_list, s->event->pressure_write_list, s); + s->pressure.in_write_list = false; } static clockid_t event_source_type_to_clock(EventSourceType t) { @@ -1047,8 +1051,9 @@ static void source_disconnect(sd_event_source *s) { } case SOURCE_MEMORY_PRESSURE: - source_memory_pressure_remove_from_write_list(s); - 
source_memory_pressure_unregister(s); + case SOURCE_CPU_PRESSURE: + source_pressure_remove_from_write_list(s); + source_pressure_unregister(s); break; default: @@ -1111,9 +1116,9 @@ static sd_event_source* source_free(sd_event_source *s) { s->child.pidfd = safe_close(s->child.pidfd); } - if (s->type == SOURCE_MEMORY_PRESSURE) { - s->memory_pressure.fd = safe_close(s->memory_pressure.fd); - s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer); + if (EVENT_SOURCE_IS_PRESSURE(s)) { + s->pressure.fd = safe_close(s->pressure.fd); + s->pressure.write_buffer = mfree(s->pressure.write_buffer); } if (s->destroy_callback) @@ -1191,7 +1196,8 @@ static sd_event_source* source_new(sd_event *e, bool floating, EventSourceType t [SOURCE_POST] = endoffsetof_field(sd_event_source, post), [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit), [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify), - [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure), + [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, pressure), + [SOURCE_CPU_PRESSURE] = endoffsetof_field(sd_event_source, pressure), }; sd_event_source *s; @@ -1917,17 +1923,21 @@ static int memory_pressure_callback(sd_event_source *s, void *userdata) { return 0; } -_public_ int sd_event_add_memory_pressure( +static int event_add_pressure( sd_event *e, sd_event_source **ret, sd_event_handler_t callback, - void *userdata) { + void *userdata, + EventSourceType type, + sd_event_handler_t default_callback, + PressureResource resource) { _cleanup_free_ char *w = NULL; _cleanup_(source_freep) sd_event_source *s = NULL; _cleanup_close_ int path_fd = -EBADF, fd = -EBADF; _cleanup_free_ void *write_buffer = NULL; - const char *watch, *watch_fallback = NULL, *env; + _cleanup_free_ char *watch_fallback = NULL; + const char *watch, *env; size_t write_buffer_size = 0; struct stat st; uint32_t events; @@ -1939,32 +1949,35 @@ _public_ int sd_event_add_memory_pressure( 
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); assert_return(!event_origin_changed(e), -ECHILD); + assert(resource >= 0 && resource < _PRESSURE_RESOURCE_MAX); + const PressureResourceInfo *info = &pressure_resource_info[resource]; + if (!callback) - callback = memory_pressure_callback; + callback = default_callback; - s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE); + s = source_new(e, !ret, type); if (!s) return -ENOMEM; s->wakeup = WAKEUP_EVENT_SOURCE; - s->memory_pressure.callback = callback; + s->pressure.callback = callback; s->userdata = userdata; s->enabled = SD_EVENT_ON; - s->memory_pressure.fd = -EBADF; + s->pressure.fd = -EBADF; - env = secure_getenv("MEMORY_PRESSURE_WATCH"); + env = secure_getenv(info->env_watch); if (env) { if (isempty(env) || path_equal(env, "/dev/null")) return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), - "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH."); + "Pressure logic is explicitly disabled via $%s.", info->env_watch); if (!path_is_absolute(env) || !path_is_normalized(env)) return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), - "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env); + "$%s set to invalid path: %s", info->env_watch, env); watch = env; - env = secure_getenv("MEMORY_PRESSURE_WRITE"); + env = secure_getenv(info->env_write); if (env) { r = unbase64mem(env, &write_buffer, &write_buffer_size); if (r < 0) @@ -1980,8 +1993,8 @@ _public_ int sd_event_add_memory_pressure( if (r == 0) return -EOPNOTSUPP; - /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on - * the system wide pressure if for some reason we cannot (which could be: memory controller + /* By default we want to watch pressure on the local cgroup, but we'll fall back on + * the system wide pressure if for some reason we cannot (which could be: controller * not delegated to us, or PSI simply not available in the kernel). 
*/ _cleanup_free_ char *cg = NULL; @@ -1989,12 +2002,19 @@ _public_ int sd_event_add_memory_pressure( if (r < 0) return r; - w = path_join("/sys/fs/cgroup", cg, "memory.pressure"); + _cleanup_free_ char *cgroup_file = strjoin(info->name, ".pressure"); + if (!cgroup_file) + return -ENOMEM; + + w = path_join("/sys/fs/cgroup", cg, cgroup_file); if (!w) return -ENOMEM; watch = w; - watch_fallback = "/proc/pressure/memory"; + + watch_fallback = strjoin("/proc/pressure/", info->name); + if (!watch_fallback) + return -ENOMEM; /* Android uses three levels in its userspace low memory killer logic: * some 70000 1000000 @@ -2011,9 +2031,9 @@ _public_ int sd_event_add_memory_pressure( * kernel will allow us to do unprivileged, also in the future. */ if (asprintf((char**) &write_buffer, "%s " USEC_FMT " " USEC_FMT, - MEMORY_PRESSURE_DEFAULT_TYPE, - MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC, - MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0) + PRESSURE_DEFAULT_TYPE, + PRESSURE_DEFAULT_THRESHOLD_USEC, + PRESSURE_DEFAULT_WINDOW_USEC) < 0) return -ENOMEM; write_buffer_size = strlen(write_buffer) + 1; @@ -2080,24 +2100,24 @@ _public_ int sd_event_add_memory_pressure( else return -EBADF; - s->memory_pressure.fd = TAKE_FD(fd); - s->memory_pressure.write_buffer = TAKE_PTR(write_buffer); - s->memory_pressure.write_buffer_size = write_buffer_size; - s->memory_pressure.events = events; - s->memory_pressure.locked = locked; + s->pressure.fd = TAKE_FD(fd); + s->pressure.write_buffer = TAKE_PTR(write_buffer); + s->pressure.write_buffer_size = write_buffer_size; + s->pressure.events = events; + s->pressure.locked = locked; /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the - * fd with the epoll right-away. 
Instead, we just add the event source to a list of memory pressure - * event sources on which writes must be executed before the first event loop iteration is - * executed. (We could also write the data here, right away, but we want to give the caller the - * freedom to call sd_event_source_set_memory_pressure_type() and - * sd_event_source_set_memory_pressure_rate() before we write it. */ - - if (s->memory_pressure.write_buffer_size > 0) - source_memory_pressure_add_to_write_list(s); + * fd with the epoll right-away. Instead, we just add the event source to a list of pressure event + * sources on which writes must be executed before the first event loop iteration is executed. (We + * could also write the data here, right away, but we want to give the caller the freedom to call + * sd_event_source_set_{memory,cpu}_pressure_type() and + * sd_event_source_set_{memory,cpu}_pressure_period() before we write it. */ + + if (s->pressure.write_buffer_size > 0) + source_pressure_add_to_write_list(s); else { - r = source_memory_pressure_register(s, s->enabled); + r = source_pressure_register(s, s->enabled); if (r < 0) return r; } @@ -2109,6 +2129,38 @@ _public_ int sd_event_add_memory_pressure( return 0; } +_public_ int sd_event_add_memory_pressure( + sd_event *e, + sd_event_source **ret, + sd_event_handler_t callback, + void *userdata) { + + return event_add_pressure( + e, ret, callback, userdata, + SOURCE_MEMORY_PRESSURE, + memory_pressure_callback, + PRESSURE_MEMORY); +} + +static int cpu_pressure_callback(sd_event_source *s, void *userdata) { + assert(s); + + return 0; +} + +_public_ int sd_event_add_cpu_pressure( + sd_event *e, + sd_event_source **ret, + sd_event_handler_t callback, + void *userdata) { + + return event_add_pressure( + e, ret, callback, userdata, + SOURCE_CPU_PRESSURE, + cpu_pressure_callback, + PRESSURE_CPU); +} + static void event_free_inotify_data(sd_event *e, InotifyData *d) { assert(e); @@ -2910,7 +2962,8 @@ static int event_source_offline( break; 
case SOURCE_MEMORY_PRESSURE: - source_memory_pressure_unregister(s); + case SOURCE_CPU_PRESSURE: + source_pressure_unregister(s); break; case SOURCE_TIME_REALTIME: @@ -3001,10 +3054,11 @@ static int event_source_online( break; case SOURCE_MEMORY_PRESSURE: - /* As documented in sd_event_add_memory_pressure(), we can only register the PSI fd with - * epoll after writing the watch string. */ - if (s->memory_pressure.write_buffer_size == 0) { - r = source_memory_pressure_register(s, enabled); + case SOURCE_CPU_PRESSURE: + /* As documented in sd_event_add_{memory,cpu,io}_pressure(), we can only register the PSI fd + * with epoll after writing the watch string. */ + if (s->pressure.write_buffer_size == 0) { + r = source_pressure_register(s, enabled); if (r < 0) return r; } @@ -3983,30 +4037,30 @@ static int process_inotify(sd_event *e) { return done; } -static int process_memory_pressure(sd_event_source *s, uint32_t revents) { +static int process_pressure(sd_event_source *s, uint32_t revents) { assert(s); - assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(EVENT_SOURCE_IS_PRESSURE(s)); if (s->pending) - s->memory_pressure.revents |= revents; + s->pressure.revents |= revents; else - s->memory_pressure.revents = revents; + s->pressure.revents = revents; return source_set_pending(s, true); } -static int source_memory_pressure_write(sd_event_source *s) { +static int source_pressure_write(sd_event_source *s) { ssize_t n; int r; assert(s); - assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(EVENT_SOURCE_IS_PRESSURE(s)); /* once we start writing, the buffer is locked, we allow no further changes. 
*/ - s->memory_pressure.locked = true; + s->pressure.locked = true; - if (s->memory_pressure.write_buffer_size > 0) { - n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size); + if (s->pressure.write_buffer_size > 0) { + n = write(s->pressure.fd, s->pressure.write_buffer, s->pressure.write_buffer_size); if (n < 0) { if (!ERRNO_IS_TRANSIENT(errno)) { /* If kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI @@ -4015,7 +4069,7 @@ static int source_memory_pressure_write(sd_event_source *s) { * so late. Let's make the best of it, and turn off the event source like we * do for failed event source handlers. */ - log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m"); + log_debug_errno(errno, "Writing pressure settings to kernel failed, disabling pressure event source: %m"); assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0); return 0; } @@ -4027,41 +4081,41 @@ static int source_memory_pressure_write(sd_event_source *s) { assert(n >= 0); - if ((size_t) n == s->memory_pressure.write_buffer_size) { - s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer); + if ((size_t) n == s->pressure.write_buffer_size) { + s->pressure.write_buffer = mfree(s->pressure.write_buffer); if (n > 0) { - s->memory_pressure.write_buffer_size = 0; + s->pressure.write_buffer_size = 0; /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */ - r = source_memory_pressure_register(s, s->enabled); + r = source_pressure_register(s, s->enabled); if (r < 0) return r; } } else if (n > 0) { _cleanup_free_ void *c = NULL; - assert((size_t) n < s->memory_pressure.write_buffer_size); + assert((size_t) n < s->pressure.write_buffer_size); - c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n); + c = memdup((uint8_t*) s->pressure.write_buffer + n, 
s->pressure.write_buffer_size - n); if (!c) return -ENOMEM; - free_and_replace(s->memory_pressure.write_buffer, c); - s->memory_pressure.write_buffer_size -= n; + free_and_replace(s->pressure.write_buffer, c); + s->pressure.write_buffer_size -= n; return 1; } return 0; } -static int source_memory_pressure_initiate_dispatch(sd_event_source *s) { +static int source_pressure_initiate_dispatch(sd_event_source *s) { int r; assert(s); - assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(EVENT_SOURCE_IS_PRESSURE(s)); - r = source_memory_pressure_write(s); + r = source_pressure_write(s); if (r < 0) return r; if (r > 0) @@ -4069,22 +4123,22 @@ static int source_memory_pressure_initiate_dispatch(sd_event_source *s) { * function. Instead, shortcut it so that we wait for next EPOLLOUT immediately. */ /* No pending incoming IO? Then let's not continue further */ - if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) { + if ((s->pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) { /* Treat IO errors on the notifier the same ways errors returned from a callback */ - if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0) + if ((s->pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0) return -EIO; return 1; /* leave dispatch, we already processed everything */ } - if (s->memory_pressure.revents & EPOLLIN) { + if (s->pressure.revents & EPOLLIN) { uint8_t pipe_buf[PIPE_BUF]; ssize_t n; /* If the fd is readable, then flush out anything that might be queued */ - n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf)); + n = read(s->pressure.fd, pipe_buf, sizeof(pipe_buf)); if (n < 0 && !ERRNO_IS_TRANSIENT(errno)) return -errno; } @@ -4155,8 +4209,8 @@ static int source_dispatch(sd_event_source *s) { if (r < 0) return r; - if (s->type == SOURCE_MEMORY_PRESSURE) { - r = source_memory_pressure_initiate_dispatch(s); + if (EVENT_SOURCE_IS_PRESSURE(s)) { + r = source_pressure_initiate_dispatch(s); if (r == -EIO) /* handle EIO errors similar to callback errors */ 
goto finish; if (r < 0) @@ -4251,7 +4305,8 @@ static int source_dispatch(sd_event_source *s) { } case SOURCE_MEMORY_PRESSURE: - r = s->memory_pressure.callback(s, s->userdata); + case SOURCE_CPU_PRESSURE: + r = s->pressure.callback(s, s->userdata); break; case SOURCE_WATCHDOG: @@ -4419,7 +4474,7 @@ static void event_close_inode_data_fds(sd_event *e) { } } -static int event_memory_pressure_write_list(sd_event *e) { +static int event_pressure_write_list(sd_event *e) { int r; assert(e); @@ -4427,15 +4482,15 @@ static int event_memory_pressure_write_list(sd_event *e) { for (;;) { sd_event_source *s; - s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list); + s = LIST_POP(pressure.write_list, e->pressure_write_list); if (!s) break; - assert(s->type == SOURCE_MEMORY_PRESSURE); - assert(s->memory_pressure.write_buffer_size > 0); - s->memory_pressure.in_write_list = false; + assert(EVENT_SOURCE_IS_PRESSURE(s)); + assert(s->pressure.write_buffer_size > 0); + s->pressure.in_write_list = false; - r = source_memory_pressure_write(s); + r = source_pressure_write(s); if (r < 0) return r; } @@ -4496,7 +4551,7 @@ _public_ int sd_event_prepare(sd_event *e) { if (r < 0) return r; - r = event_memory_pressure_write_list(e); + r = event_pressure_write_list(e); if (r < 0) return r; @@ -4665,7 +4720,8 @@ static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t break; case SOURCE_MEMORY_PRESSURE: - r = process_memory_pressure(s, i->events); + case SOURCE_CPU_PRESSURE: + r = process_pressure(s, i->events); break; default: @@ -5302,27 +5358,27 @@ _public_ int sd_event_get_exit_on_idle(sd_event *e) { return e->exit_on_idle; } -_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) { +static int event_source_set_pressure_type(sd_event_source *s, const char *ty) { _cleanup_free_ char *b = NULL; _cleanup_free_ void *w = NULL; assert_return(s, -EINVAL); - assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM); + 
assert_return(EVENT_SOURCE_IS_PRESSURE(s), -EDOM); assert_return(ty, -EINVAL); assert_return(!event_origin_changed(s->event), -ECHILD); if (!STR_IN_SET(ty, "some", "full")) return -EINVAL; - if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */ + if (s->pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */ return -EBUSY; - char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size); + char* space = memchr(s->pressure.write_buffer, ' ', s->pressure.write_buffer_size); if (!space) return -EINVAL; - size_t l = space - (char*) s->memory_pressure.write_buffer; - b = memdup_suffix0(s->memory_pressure.write_buffer, l); + size_t l = space - (char*) s->pressure.write_buffer; + b = memdup_suffix0(s->pressure.write_buffer, l); if (!b) return -ENOMEM; if (!STR_IN_SET(b, "some", "full")) @@ -5331,26 +5387,40 @@ _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const if (streq(b, ty)) return 0; - size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l); + size_t nl = strlen(ty) + (s->pressure.write_buffer_size - l); w = new(char, nl); if (!w) return -ENOMEM; - memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l)); + memcpy(stpcpy(w, ty), space, (s->pressure.write_buffer_size - l)); - free_and_replace(s->memory_pressure.write_buffer, w); - s->memory_pressure.write_buffer_size = nl; - s->memory_pressure.locked = false; + free_and_replace(s->pressure.write_buffer, w); + s->pressure.write_buffer_size = nl; + s->pressure.locked = false; return 1; } -_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { +_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM); + + return event_source_set_pressure_type(s, ty); +} 
+ +_public_ int sd_event_source_set_cpu_pressure_type(sd_event_source *s, const char *ty) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CPU_PRESSURE, -EDOM); + + return event_source_set_pressure_type(s, ty); +} + +static int event_source_set_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { _cleanup_free_ char *b = NULL; _cleanup_free_ void *w = NULL; assert_return(s, -EINVAL); - assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM); + assert_return(EVENT_SOURCE_IS_PRESSURE(s), -EDOM); assert_return(!event_origin_changed(s->event), -ECHILD); if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX) @@ -5360,15 +5430,15 @@ _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint if (threshold_usec > window_usec) return -EINVAL; - if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */ + if (s->pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */ return -EBUSY; - char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size); + char* space = memchr(s->pressure.write_buffer, ' ', s->pressure.write_buffer_size); if (!space) return -EINVAL; - size_t l = space - (char*) s->memory_pressure.write_buffer; - b = memdup_suffix0(s->memory_pressure.write_buffer, l); + size_t l = space - (char*) s->pressure.write_buffer; + b = memdup_suffix0(s->pressure.write_buffer, l); if (!b) return -ENOMEM; if (!STR_IN_SET(b, "some", "full")) @@ -5382,12 +5452,26 @@ _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint return -EINVAL; l = strlen(w) + 1; - if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0) + if (memcmp_nn(s->pressure.write_buffer, s->pressure.write_buffer_size, w, l) == 0) return 0; - free_and_replace(s->memory_pressure.write_buffer, w); - s->memory_pressure.write_buffer_size = l; - 
s->memory_pressure.locked = false; + free_and_replace(s->pressure.write_buffer, w); + s->pressure.write_buffer_size = l; + s->pressure.locked = false; return 1; } + +_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM); + + return event_source_set_pressure_period(s, threshold_usec, window_usec); +} + +_public_ int sd_event_source_set_cpu_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CPU_PRESSURE, -EDOM); + + return event_source_set_pressure_period(s, threshold_usec, window_usec); +} diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h index f5c79acdfdabc..71fc9504889e6 100644 --- a/src/systemd/sd-event.h +++ b/src/systemd/sd-event.h @@ -97,6 +97,7 @@ int sd_event_add_defer(sd_event *e, sd_event_source **ret, sd_event_handler_t ca int sd_event_add_post(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); int sd_event_add_exit(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); int sd_event_add_memory_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); +int sd_event_add_cpu_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); int sd_event_prepare(sd_event *e); int sd_event_wait(sd_event *e, uint64_t timeout); @@ -162,6 +163,8 @@ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret); int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret); int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty); int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec); +int sd_event_source_set_cpu_pressure_type(sd_event_source *s, const char *ty); +int 
sd_event_source_set_cpu_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec); int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback); int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret); int sd_event_source_get_floating(sd_event_source *s); diff --git a/src/test/meson.build b/src/test/meson.build index adbcd3c0d4dcf..c0da81fbf44da 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -367,7 +367,7 @@ executables += [ 'dependencies' : libm, }, test_template + { - 'sources' : files('test-mempress.c'), + 'sources' : files('test-pressure.c'), 'dependencies' : threads, }, test_template + { diff --git a/src/test/test-mempress.c b/src/test/test-pressure.c similarity index 59% rename from src/test/test-mempress.c rename to src/test/test-pressure.c index 5ec372591e0f3..86fc070618155 100644 --- a/src/test/test-mempress.c +++ b/src/test/test-pressure.c @@ -28,6 +28,8 @@ #include "tmpfile-util.h" #include "unit-def.h" +/* Shared infrastructure for fake pressure tests */ + struct fake_pressure_context { int fifo_fd; int socket_fd; @@ -62,7 +64,7 @@ static int fake_pressure_callback(sd_event_source *s, void *userdata) { *value *= d[0]; - log_notice("memory pressure event: %s", d); + log_notice("pressure event: %s", d); if (*value == 7 * 'f' * 's') ASSERT_OK(sd_event_exit(sd_event_source_get_event(s), 0)); @@ -70,16 +72,31 @@ static int fake_pressure_callback(sd_event_source *s, void *userdata) { return 0; } -TEST(fake_pressure) { +typedef int (*event_add_pressure_t)(sd_event *, sd_event_source **, sd_event_handler_t, void *); + +static void test_fake_pressure( + const char *resource, + event_add_pressure_t add_pressure) { + _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *ef = NULL; _cleanup_(sd_event_unrefp) sd_event *e = NULL; - _cleanup_free_ char *j = NULL, *k = NULL; + _cleanup_free_ char *j = NULL, *k = NULL, + *env_watch = NULL, *env_write = NULL, 
*resource_upper = NULL; _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; _cleanup_close_ int fifo_fd = -EBADF, socket_fd = -EBADF; union sockaddr_union sa; pthread_t th; int value = 7; + resource_upper = strdup(resource); + ASSERT_NOT_NULL(resource_upper); + ascii_strupper(resource_upper); + + env_watch = strjoin(resource_upper, "_PRESSURE_WATCH"); + ASSERT_NOT_NULL(env_watch); + env_write = strjoin(resource_upper, "_PRESSURE_WRITE"); + ASSERT_NOT_NULL(env_write); + ASSERT_OK(sd_event_default(&e)); ASSERT_OK(mkdtemp_malloc(NULL, &tmp)); @@ -107,16 +124,16 @@ TEST(fake_pressure) { ASSERT_EQ(pthread_create(&th, NULL, fake_pressure_thread, TAKE_PTR(fp)), 0); - ASSERT_OK_ERRNO(setenv("MEMORY_PRESSURE_WATCH", j, /* override= */ true)); - ASSERT_OK_ERRNO(unsetenv("MEMORY_PRESSURE_WRITE")); + ASSERT_OK_ERRNO(setenv(env_watch, j, /* override= */ true)); + ASSERT_OK_ERRNO(unsetenv(env_write)); - ASSERT_OK(sd_event_add_memory_pressure(e, &es, fake_pressure_callback, &value)); + ASSERT_OK(add_pressure(e, &es, fake_pressure_callback, &value)); ASSERT_OK(sd_event_source_set_description(es, "fifo event source")); - ASSERT_OK_ERRNO(setenv("MEMORY_PRESSURE_WATCH", k, /* override= */ true)); - ASSERT_OK_ERRNO(setenv("MEMORY_PRESSURE_WRITE", "aGVsbG8K", /* override= */ true)); + ASSERT_OK_ERRNO(setenv(env_watch, k, /* override= */ true)); + ASSERT_OK_ERRNO(setenv(env_write, "aGVsbG8K", /* override= */ true)); - ASSERT_OK(sd_event_add_memory_pressure(e, &ef, fake_pressure_callback, &value)); + ASSERT_OK(add_pressure(e, &ef, fake_pressure_callback, &value)); ASSERT_OK(sd_event_source_set_description(ef, "socket event source")); ASSERT_OK(sd_event_loop(e)); @@ -126,18 +143,52 @@ TEST(fake_pressure) { ASSERT_EQ(pthread_join(th, NULL), 0); } +static int fake_pressure_wrapper(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata) { + return sd_event_add_memory_pressure(e, ret, callback, userdata); +} + +TEST(fake_memory_pressure) { + 
test_fake_pressure("memory", fake_pressure_wrapper); +} + +static int fake_cpu_pressure_wrapper(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata) { + return sd_event_add_cpu_pressure(e, ret, callback, userdata); +} + +TEST(fake_cpu_pressure) { + test_fake_pressure("cpu", fake_cpu_pressure_wrapper); +} + +/* Shared infrastructure for real pressure tests */ + struct real_pressure_context { sd_event_source *pid; }; -static int real_pressure_callback(sd_event_source *s, void *userdata) { +static int real_pressure_child_callback(sd_event_source *s, const siginfo_t *si, void *userdata) { + ASSERT_NOT_NULL(s); + ASSERT_NOT_NULL(si); + + log_notice("child dead"); + + ASSERT_EQ(si->si_signo, SIGCHLD); + ASSERT_EQ(si->si_status, SIGKILL); + ASSERT_EQ(si->si_code, CLD_KILLED); + + ASSERT_OK(sd_event_exit(sd_event_source_get_event(s), 31)); + return 0; +} + +/* Memory pressure real test */ + +static int real_memory_pressure_callback(sd_event_source *s, void *userdata) { struct real_pressure_context *c = ASSERT_PTR(userdata); const char *d; ASSERT_NOT_NULL(s); ASSERT_OK(sd_event_source_get_description(s, &d)); - log_notice("real_memory pressure event: %s", d); + log_notice("real memory pressure event: %s", d); sd_event_trim_memory(); @@ -175,21 +226,7 @@ _noreturn_ static void real_pressure_eat_memory(int pipe_fd) { } } -static int real_pressure_child_callback(sd_event_source *s, const siginfo_t *si, void *userdata) { - ASSERT_NOT_NULL(s); - ASSERT_NOT_NULL(si); - - log_notice("child dead"); - - ASSERT_EQ(si->si_signo, SIGCHLD); - ASSERT_EQ(si->si_status, SIGKILL); - ASSERT_EQ(si->si_code, CLD_KILLED); - - ASSERT_OK(sd_event_exit(sd_event_source_get_event(s), 31)); - return 0; -} - -TEST(real_pressure) { +TEST(real_memory_pressure) { _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *cs = NULL; 
@@ -249,7 +286,7 @@ TEST(real_pressure) { .pid = cs, }; - r = sd_event_add_memory_pressure(e, &es, real_pressure_callback, &context); + r = sd_event_add_memory_pressure(e, &es, real_memory_pressure_callback, &context); if (r < 0) return (void) log_tests_skipped_errno(r, "can't allocate memory pressure fd"); @@ -303,6 +340,123 @@ TEST(real_pressure) { ASSERT_EQ(ex, 31); } +/* CPU pressure real test */ + +static int real_cpu_pressure_callback(sd_event_source *s, void *userdata) { + struct real_pressure_context *c = ASSERT_PTR(userdata); + const char *d; + + ASSERT_NOT_NULL(s); + ASSERT_OK(sd_event_source_get_description(s, &d)); + + log_notice("real cpu pressure event: %s", d); + + ASSERT_NOT_NULL(c->pid); + ASSERT_OK(sd_event_source_send_child_signal(c->pid, SIGKILL, NULL, 0)); + c->pid = NULL; + + return 0; +} + +_noreturn_ static void real_pressure_eat_cpu(int pipe_fd) { + char x; + ASSERT_EQ(read(pipe_fd, &x, 1), 1); /* Wait for the GO! */ + + /* Busy-loop to generate CPU pressure */ + for (;;) + __asm__ volatile("" ::: "memory"); /* Prevent optimization */ +} + +TEST(real_cpu_pressure) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *cs = NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_pair_ int pipe_fd[2] = EBADF_PAIR; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_free_ char *scope = NULL; + const char *object; + int r; + + if (getuid() == 0) + r = sd_bus_open_system(&bus); + else + r = sd_bus_open_user(&bus); + if (r < 0) + return (void) log_tests_skipped_errno(r, "can't connect to bus"); + + ASSERT_OK(bus_wait_for_jobs_new(bus, &w)); + + ASSERT_OK(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit")); + ASSERT_OK(asprintf(&scope, "test-%" PRIu64 ".scope", 
random_u64())); + ASSERT_OK(sd_bus_message_append(m, "ss", scope, "fail")); + ASSERT_OK(sd_bus_message_open_container(m, 'a', "(sv)")); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, 0)); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "CPUAccounting", "b", true)); + ASSERT_OK(sd_bus_message_close_container(m)); + ASSERT_OK(sd_bus_message_append(m, "a(sa(sv))", 0)); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return (void) log_tests_skipped_errno(r, "can't issue transient unit call"); + + ASSERT_OK(sd_bus_message_read(reply, "o", &object)); + + ASSERT_OK(bus_wait_for_jobs_one(w, object, /* flags= */ BUS_WAIT_JOBS_LOG_ERROR, /* extra_args= */ NULL)); + + ASSERT_OK(sd_event_default(&e)); + + ASSERT_OK_ERRNO(pipe2(pipe_fd, O_CLOEXEC)); + + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + r = pidref_safe_fork("(eat-cpu)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM, &pidref); + ASSERT_OK(r); + if (r == 0) { + real_pressure_eat_cpu(pipe_fd[0]); + _exit(EXIT_SUCCESS); + } + + ASSERT_OK(event_add_child_pidref(e, &cs, &pidref, WEXITED, real_pressure_child_callback, NULL)); + ASSERT_OK(sd_event_source_set_child_process_own(cs, true)); + + ASSERT_OK_ERRNO(unsetenv("CPU_PRESSURE_WATCH")); + ASSERT_OK_ERRNO(unsetenv("CPU_PRESSURE_WRITE")); + + struct real_pressure_context context = { + .pid = cs, + }; + + r = sd_event_add_cpu_pressure(e, &es, real_cpu_pressure_callback, &context); + if (r < 0) + return (void) log_tests_skipped_errno(r, "can't allocate cpu pressure fd"); + + ASSERT_OK(sd_event_source_set_description(es, "real pressure event source")); + ASSERT_OK_ZERO(sd_event_source_set_cpu_pressure_type(es, "some")); + /* Unprivileged writes require a minimum of 2s otherwise the kernel will refuse the write. 
*/ + ASSERT_OK_POSITIVE(sd_event_source_set_cpu_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC)); + ASSERT_OK_ZERO(sd_event_source_set_cpu_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC)); + ASSERT_OK(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT)); + + m = sd_bus_message_unref(m); + + ASSERT_OK(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetUnitProperties")); + ASSERT_OK(sd_bus_message_append(m, "sb", scope, true)); + ASSERT_OK(sd_bus_message_open_container(m, 'a', "(sv)")); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "CPUQuotaPerSecUSec", "t", (uint64_t) 1000)); /* 0.1% CPU */ + ASSERT_OK(sd_bus_message_close_container(m)); + + ASSERT_OK(sd_bus_call(bus, m, 0, NULL, NULL)); + + /* Now start eating CPU */ + ASSERT_EQ(write(pipe_fd[1], &(const char) { 'x' }, 1), 1); + + ASSERT_OK(sd_event_loop(e)); + int ex = 0; + ASSERT_OK(sd_event_get_exit_code(e, &ex)); + ASSERT_EQ(ex, 31); +} + static int outro(void) { hashmap_trim_pools(); return 0; From 84d7150a486861b3ca1e01a44aeacfe6177c837e Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Sat, 7 Mar 2026 23:20:19 +0100 Subject: [PATCH 4/7] core: Add support for CPU pressure notifications Works the same way as memory pressure notifications. Code is refactored to work on enum arrays to reduce duplication. 
--- man/org.freedesktop.systemd1.xml | 98 ++++++++++++++ man/systemd-system.conf.xml | 14 ++ man/systemd.exec.xml | 12 ++ man/systemd.resource-control.xml | 49 +++++++ src/core/cgroup.c | 27 ++-- src/core/cgroup.h | 40 ++++-- src/core/dbus-cgroup.c | 24 ++-- src/core/dbus-manager.c | 6 +- src/core/exec-invoke.c | 128 ++++++++++++------ src/core/execute-serialize.c | 30 +++- src/core/load-fragment-gperf.gperf.in | 6 +- src/core/load-fragment.c | 2 +- src/core/load-fragment.h | 2 +- src/core/main.c | 6 +- src/core/manager.c | 56 +++++--- src/core/manager.h | 8 +- src/core/system.conf.in | 2 + src/core/unit.c | 4 +- src/core/user.conf.in | 2 + src/core/varlink-cgroup.c | 6 +- src/core/varlink-manager.c | 6 +- src/shared/bus-unit-util.c | 2 + src/shared/varlink-io.systemd.Manager.c | 4 + src/shared/varlink-io.systemd.Unit.c | 4 + .../meson.build | 0 test/integration-tests/meson.build | 2 +- ...EST-79-MEMPRESS.sh => TEST-79-PRESSURE.sh} | 55 +++++++- 27 files changed, 474 insertions(+), 121 deletions(-) rename test/integration-tests/{TEST-79-MEMPRESS => TEST-79-PRESSURE}/meson.build (100%) rename test/units/{TEST-79-MEMPRESS.sh => TEST-79-PRESSURE.sh} (56%) diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index f4a06901b0368..2c590183dfb9b 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -552,6 +552,10 @@ node /org/freedesktop/systemd1 { readonly t DefaultMemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s DefaultMemoryPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t DefaultCPUPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DefaultCPUPressureWatch = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t TimerSlackNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -791,6 +795,10 @@ node 
/org/freedesktop/systemd1 { + + + + @@ -1239,6 +1247,10 @@ node /org/freedesktop/systemd1 { + + + + @@ -3060,6 +3072,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s CPUPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t CPUPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -3729,6 +3745,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -4421,6 +4441,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -5320,6 +5344,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s CPUPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t CPUPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -6005,6 +6033,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -6671,6 +6703,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -7393,6 +7429,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s CPUPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t 
CPUPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -8002,6 +8042,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -8576,6 +8620,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -9431,6 +9479,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s CPUPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t CPUPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -10022,6 +10074,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + @@ -10578,6 +10634,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + @@ -11286,6 +11346,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s CPUPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t CPUPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -11459,6 +11523,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -11647,6 +11715,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -11858,6 +11930,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { 
@org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s CPUPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t CPUPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -12045,6 +12121,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + @@ -12257,6 +12337,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + @@ -12469,6 +12553,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ RemoveSubgroupFromUnit(), and KillUnitSubgroup() were added in version 258. TransactionsWithOrderingCycle was added in version 259. + DefaultCPUPressureThresholdUSec and + DefaultCPUPressureWatch were added in version 261. Unit Objects @@ -12560,6 +12646,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ExecReloadPostEx were added in version 259. BindNetworkInterface, MemoryTHP, RefreshOnReload, and RootMStack were added in version 260. + CPUPressureThresholdUSec and + CPUPressureWatch were added in version 261. Socket Unit Objects @@ -12630,6 +12718,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills were added in 259. BindNetworkInterface MemoryTHP, and RootMStack were added in version 260. + CPUPressureThresholdUSec and + CPUPressureWatch were added in version 261. Mount Unit Objects @@ -12695,6 +12785,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills were added in 259. BindNetworkInterface MemoryTHP, and RootMStack were added in version 260. + CPUPressureThresholdUSec and + CPUPressureWatch were added in version 261. 
Swap Unit Objects @@ -12758,6 +12850,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills were added in 259. BindNetworkInterface, MemoryTHP, and RootMStack were added in version 260. + CPUPressureThresholdUSec and + CPUPressureWatch were added in version 261. Slice Unit Objects @@ -12791,6 +12885,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ OOMKills, and ManagedOOMKills were added in 259. BindNetworkInterface was added in version 260. + CPUPressureThresholdUSec and + CPUPressureWatch were added in version 261. Scope Unit Objects @@ -12822,6 +12918,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ OOMKills, and ManagedOOMKills were added in 259. BindNetworkInterface was added in version 260. + CPUPressureThresholdUSec and + CPUPressureWatch were added in version 261. Job Objects diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index b7fe53dc9cf38..eca8a0f0bd7a0 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -326,6 +326,20 @@ + + + DefaultCPUPressureWatch= + DefaultCPUPressureThresholdSec= + + Configures the default settings for the per-unit + CPUPressureWatch= and CPUPressureThresholdSec= + settings. See + systemd.resource-control5 + for details. Defaults to auto and 200ms, respectively. This + also sets the CPU pressure monitoring threshold for the service manager itself. + + + diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 48bec7361bde1..1048fcadfc376 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -4705,6 +4705,18 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX + + $CPU_PRESSURE_WATCH + $CPU_PRESSURE_WRITE + + If CPU pressure monitoring is enabled for this service unit, the path to watch + and the data to write into it. See Resource Pressure + Handling for details about these variables and the service protocol data they + convey. 
+ + + + $FDSTORE diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 12a3c0e644eba..20584c0b7373b 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1654,6 +1654,55 @@ DeviceAllow=/dev/loop-control + + + CPUPressureWatch= + + Controls CPU pressure monitoring for invoked processes. Takes a boolean or one of + auto and skip. If no, tells the service not + to watch for CPU pressure events, by setting the $CPU_PRESSURE_WATCH + environment variable to the literal string /dev/null. If yes, + tells the service to watch for CPU pressure events. This enables CPU accounting for the + service, and ensures the cpu.pressure cgroup attribute file is accessible for + reading and writing by the service's user. It then sets the $CPU_PRESSURE_WATCH + environment variable for processes invoked by the unit to the file system path to this file. The + threshold information configured with CPUPressureThresholdSec= is encoded in + the $CPU_PRESSURE_WRITE environment variable. If the auto + value is set, the protocol is enabled if CPU accounting is enabled anyway for the unit (e.g. because + CPUWeight= or CPUQuota= is set), and + disabled otherwise. If set to skip, the logic is neither enabled nor disabled, and + the two environment variables are not set. + + Note that services are free to use the two environment variables, but it is unproblematic if + they ignore them. CPU pressure handling must be implemented individually in each service, and + usually means different things for different software. + + Services implemented using + sd-event3 may use + sd_event_add_cpu_pressure3 + to watch for and handle CPU pressure events. + + If not explicitly set, defaults to the DefaultCPUPressureWatch= setting in + systemd-system.conf5. + + + + + + CPUPressureThresholdSec= + + Sets the CPU pressure threshold time for the CPU pressure monitoring configured via + CPUPressureWatch=.
Specifies the maximum CPU stall time before a CPU + pressure event is signalled to the service, per 2s window. If not specified, defaults to the + DefaultCPUPressureThresholdSec= setting in + systemd-system.conf5 + (which in turn defaults to 200ms). The specified value expects a time unit such as + ms or μs, see + systemd.time7 for + details on the permitted syntax. + + + Coredump Control diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 514dabf371b7f..79c1c36b1da2f 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -185,8 +185,8 @@ void cgroup_context_init(CGroupContext *c) { * moom_mem_pressure_duration_usec is set to infinity. */ .moom_mem_pressure_duration_usec = USEC_INFINITY, - .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID, - .memory_pressure_threshold_usec = USEC_INFINITY, + .pressure_watch = { _CGROUP_PRESSURE_WATCH_INVALID, _CGROUP_PRESSURE_WATCH_INVALID }, + .pressure_threshold_usec = { USEC_INFINITY, USEC_INFINITY }, }; } @@ -526,6 +526,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n" "%sManagedOOMPreference: %s\n" "%sMemoryPressureWatch: %s\n" + "%sCPUPressureWatch: %s\n" "%sCoredumpReceive: %s\n", prefix, yes_no(c->io_accounting), prefix, yes_no(c->memory_accounting), @@ -561,7 +562,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { prefix, managed_oom_mode_to_string(c->moom_mem_pressure), prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)), prefix, managed_oom_preference_to_string(c->moom_preference), - prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch), + prefix, cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_MEMORY]), + prefix, cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_CPU]), prefix, yes_no(c->coredump_receive)); if (c->delegate_subgroup) @@ -572,9 +574,13 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { 
fprintf(f, "%sBindNetworkInterface: %s\n", prefix, c->bind_network_interface); - if (c->memory_pressure_threshold_usec != USEC_INFINITY) + if (c->pressure_threshold_usec[PRESSURE_MEMORY] != USEC_INFINITY) fprintf(f, "%sMemoryPressureThresholdSec: %s\n", - prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1)); + prefix, FORMAT_TIMESPAN(c->pressure_threshold_usec[PRESSURE_MEMORY], 1)); + + if (c->pressure_threshold_usec[PRESSURE_CPU] != USEC_INFINITY) + fprintf(f, "%sCPUPressureThresholdSec: %s\n", + prefix, FORMAT_TIMESPAN(c->pressure_threshold_usec[PRESSURE_CPU], 1)); if (c->moom_mem_pressure_duration_usec != USEC_INFINITY) fprintf(f, "%sManagedOOMMemoryPressureDurationSec: %s\n", @@ -2105,12 +2111,13 @@ static int unit_update_cgroup( cgroup_context_apply(u, target_mask, state); cgroup_xattr_apply(u); - /* For most units we expect that memory monitoring is set up before the unit is started and we won't - * touch it after. For PID 1 this is different though, because we couldn't possibly do that given - * that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, let's - * try to open the memory pressure interface anew. */ + /* For most units we expect that pressure monitoring is set up before the unit is started and we + * won't touch it after. For PID 1 this is different though, because we couldn't possibly do that + * given that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, + * let's try to open the pressure interfaces anew. 
*/ if (unit_has_name(u, SPECIAL_INIT_SCOPE)) - (void) manager_setup_memory_pressure_event_source(u->manager); + for (PressureResource t = 0; t < _PRESSURE_RESOURCE_MAX; t++) + (void) manager_setup_pressure_event_source(u->manager, t); return 0; } diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 0cd290e92f25d..69086467f1f92 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -6,6 +6,7 @@ #include "cpu-set-util.h" #include "firewall-util.h" #include "list.h" +#include "psi-util.h" typedef struct CGroupTasksMax { /* If scale == 0, just use value; otherwise, value / scale. @@ -95,10 +96,10 @@ typedef struct CGroupSocketBindItem { } CGroupSocketBindItem; typedef enum CGroupPressureWatch { - CGROUP_PRESSURE_WATCH_NO, /* → tells the service payload explicitly not to watch for memory pressure */ + CGROUP_PRESSURE_WATCH_NO, /* → tells the service payload explicitly not to watch for pressure */ CGROUP_PRESSURE_WATCH_YES, - CGROUP_PRESSURE_WATCH_AUTO, /* → on if memory account is on anyway for the unit, otherwise off */ - CGROUP_PRESSURE_WATCH_SKIP, /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */ + CGROUP_PRESSURE_WATCH_AUTO, /* → on if relevant accounting is on anyway for the unit, otherwise off */ + CGROUP_PRESSURE_WATCH_SKIP, /* → doesn't set up pressure watch, but also doesn't explicitly tell payload to avoid it */ _CGROUP_PRESSURE_WATCH_MAX, _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL, } CGroupPressureWatch; @@ -189,11 +190,9 @@ typedef struct CGroupContext { usec_t moom_mem_pressure_duration_usec; ManagedOOMPreference moom_preference; - /* Memory pressure logic */ - CGroupPressureWatch memory_pressure_watch; - usec_t memory_pressure_threshold_usec; - /* NB: For now we don't make the period configurable, not the type, nor do we allow multiple - * triggers, nor triggers for non-memory pressure. We might add that later. 
*/ + /* Pressure logic */ + CGroupPressureWatch pressure_watch[_PRESSURE_RESOURCE_MAX]; + usec_t pressure_threshold_usec[_PRESSURE_RESOURCE_MAX]; NFTSetContext nft_set_context; @@ -353,11 +352,29 @@ void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLaten void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p); void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head); -static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) { +static inline bool cgroup_context_want_pressure(const CGroupContext *c, PressureResource t) { assert(c); + assert(t >= 0 && t < _PRESSURE_RESOURCE_MAX); - return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_YES || - (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO && c->memory_accounting); + if (c->pressure_watch[t] == CGROUP_PRESSURE_WATCH_YES) + return true; + + if (c->pressure_watch[t] != CGROUP_PRESSURE_WATCH_AUTO) + return false; + + switch (t) { + + case PRESSURE_MEMORY: + return c->memory_accounting; + + case PRESSURE_CPU: + return c->cpu_weight != CGROUP_WEIGHT_INVALID || + c->startup_cpu_weight != CGROUP_WEIGHT_INVALID || + c->cpu_quota_per_sec_usec != USEC_INFINITY; + + default: + assert_not_reached(); + } } static inline bool cgroup_context_has_device_policy(const CGroupContext *c) { @@ -457,6 +474,7 @@ int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, DECLARE_STRING_TABLE_LOOKUP(cgroup_pressure_watch, CGroupPressureWatch); + DECLARE_STRING_TABLE_LOOKUP(cgroup_device_permissions, CGroupDevicePermissions); DECLARE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index a7508c96daa11..3c84baf415c37 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -427,8 +427,10 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("SocketBindDeny", "a(iiqq)", property_get_socket_bind, 
offsetof(CGroupContext, socket_bind_deny), 0), SD_BUS_PROPERTY("RestrictNetworkInterfaces", "(bas)", property_get_restrict_network_interfaces, 0, 0), SD_BUS_PROPERTY("BindNetworkInterface", "s", NULL, offsetof(CGroupContext, bind_network_interface), 0), - SD_BUS_PROPERTY("MemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, memory_pressure_watch), 0), - SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, memory_pressure_threshold_usec), 0), + SD_BUS_PROPERTY("MemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, pressure_watch[PRESSURE_MEMORY]), 0), + SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure_threshold_usec[PRESSURE_MEMORY]), 0), + SD_BUS_PROPERTY("CPUPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, pressure_watch[PRESSURE_CPU]), 0), + SD_BUS_PROPERTY("CPUPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure_threshold_usec[PRESSURE_CPU]), 0), SD_BUS_PROPERTY("NFTSet", "a(iiss)", property_get_cgroup_nft_set, 0, 0), SD_BUS_PROPERTY("CoredumpReceive", "b", bus_property_get_bool, offsetof(CGroupContext, coredump_receive), 0), @@ -712,10 +714,12 @@ static int bus_cgroup_set_transient_property( return 1; - } else if (streq(name, "MemoryPressureWatch")) { + } else if (STR_IN_SET(name, "MemoryPressureWatch", "CPUPressureWatch")) { CGroupPressureWatch p; const char *t; + PressureResource pt = streq(name, "MemoryPressureWatch") ? 
PRESSURE_MEMORY : PRESSURE_CPU; + r = sd_bus_message_read(message, "s", &t); if (r < 0) return r; @@ -729,26 +733,28 @@ static int bus_cgroup_set_transient_property( } if (!UNIT_WRITE_FLAGS_NOOP(flags)) { - c->memory_pressure_watch = p; - unit_write_settingf(u, flags, name, "MemoryPressureWatch=%s", strempty(cgroup_pressure_watch_to_string(p))); + c->pressure_watch[pt] = p; + unit_write_settingf(u, flags, name, "%s=%s", name, strempty(cgroup_pressure_watch_to_string(p))); } return 1; - } else if (streq(name, "MemoryPressureThresholdUSec")) { + } else if (STR_IN_SET(name, "MemoryPressureThresholdUSec", "CPUPressureThresholdUSec")) { uint64_t t; + PressureResource pt = streq(name, "MemoryPressureThresholdUSec") ? PRESSURE_MEMORY : PRESSURE_CPU; + r = sd_bus_message_read(message, "t", &t); if (r < 0) return r; if (!UNIT_WRITE_FLAGS_NOOP(flags)) { - c->memory_pressure_threshold_usec = t; + c->pressure_threshold_usec[pt] = t; if (t == UINT64_MAX) - unit_write_setting(u, flags, name, "MemoryPressureThresholdUSec="); + unit_write_settingf(u, flags, name, "%s=", name); else - unit_write_settingf(u, flags, name, "MemoryPressureThresholdUSec=%" PRIu64, t); + unit_write_settingf(u, flags, name, "%s=%" PRIu64, name, t); } return 1; diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index fec53341caecf..6347bd7b5389d 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -2977,8 +2977,10 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultTasksMax", "t", bus_property_get_tasks_max, offsetof(Manager, defaults.tasks_max), 0), - SD_BUS_PROPERTY("DefaultMemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, 
defaults.memory_pressure_threshold_usec), 0), - SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.memory_pressure_watch), 0), + SD_BUS_PROPERTY("DefaultMemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.pressure_threshold_usec[PRESSURE_MEMORY]), 0), + SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure_watch[PRESSURE_MEMORY]), 0), + SD_BUS_PROPERTY("DefaultCPUPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.pressure_threshold_usec[PRESSURE_CPU]), 0), + SD_BUS_PROPERTY("DefaultCPUPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure_watch[PRESSURE_CPU]), 0), SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, defaults.oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 7500888c414a3..8650c7ae556a7 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2034,7 +2034,7 @@ static int build_environment( const char *shell, dev_t journal_stream_dev, ino_t journal_stream_ino, - const char *memory_pressure_path, + char *const *pressure_path, bool needs_sandboxing, char ***ret) { @@ -2215,25 +2215,38 @@ static int build_environment( if (r < 0) return r; - if (memory_pressure_path) { - r = strv_extend_joined_with_size(&e, &n, "MEMORY_PRESSURE_WATCH=", memory_pressure_path); + for (PressureResource t = 0; t < _PRESSURE_RESOURCE_MAX; t++) { + if (!pressure_path[t]) + continue; + + const PressureResourceInfo *info = &pressure_resource_info[t]; + + _cleanup_free_ char *env_watch = strjoin(info->env_watch, "="); + if 
(!env_watch) + return -ENOMEM; + + r = strv_extend_joined_with_size(&e, &n, env_watch, pressure_path[t]); if (r < 0) return r; - if (!path_equal(memory_pressure_path, "/dev/null")) { + if (!path_equal(pressure_path[t], "/dev/null")) { _cleanup_free_ char *b = NULL, *x = NULL; if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT, PRESSURE_DEFAULT_TYPE, - cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? PRESSURE_DEFAULT_THRESHOLD_USEC : - CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, PRESSURE_DEFAULT_WINDOW_USEC), + cgroup_context->pressure_threshold_usec[t] == USEC_INFINITY ? PRESSURE_DEFAULT_THRESHOLD_USEC : + CLAMP(cgroup_context->pressure_threshold_usec[t], 1U, PRESSURE_DEFAULT_WINDOW_USEC), PRESSURE_DEFAULT_WINDOW_USEC) < 0) return -ENOMEM; if (base64mem(b, strlen(b) + 1, &x) < 0) return -ENOMEM; - r = strv_extend_joined_with_size(&e, &n, "MEMORY_PRESSURE_WRITE=", x); + _cleanup_free_ char *env_write = strjoin(info->env_write, "="); + if (!env_write) + return -ENOMEM; + + r = strv_extend_joined_with_size(&e, &n, env_write, x); if (r < 0) return r; } @@ -3855,7 +3868,7 @@ static int apply_mount_namespace( const ExecParameters *params, const ExecRuntime *runtime, const PinnedResource *rootfs, - const char *memory_pressure_path, + char *const *pressure_path, bool needs_sandboxing, uid_t exec_directory_uid, gid_t exec_directory_gid, @@ -3887,16 +3900,28 @@ static int apply_mount_namespace( if (r < 0) return r; - /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the - * service will need to write to it in order to start the notifications. */ - if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) { + /* We need to make the pressure paths writable even if /sys/fs/cgroup is made read-only, as the + * service will need to write to them in order to start the notifications.
*/ + bool need_pressure_rw = false; + for (PressureResource t = 0; t < _PRESSURE_RESOURCE_MAX; t++) + if (pressure_path[t] && !streq(pressure_path[t], "/dev/null")) { + need_pressure_rw = true; + break; + } + + if (exec_is_cgroup_mount_read_only(context) && need_pressure_rw) { read_write_paths_cleanup = strv_copy(context->read_write_paths); if (!read_write_paths_cleanup) return -ENOMEM; - r = strv_extend(&read_write_paths_cleanup, memory_pressure_path); - if (r < 0) - return r; + for (PressureResource t = 0; t < _PRESSURE_RESOURCE_MAX; t++) { + if (!pressure_path[t] || streq(pressure_path[t], "/dev/null")) + continue; + + r = strv_extend(&read_write_paths_cleanup, pressure_path[t]); + if (r < 0) + return r; + } read_write_paths = read_write_paths_cleanup; } else @@ -4689,7 +4714,7 @@ static int setup_delegated_namespaces( const ExecRuntime *runtime, const PinnedResource *rootfs, bool delegate, - const char *memory_pressure_path, + char *const *pressure_path, uid_t uid, gid_t gid, const ExecCommand *command, @@ -4820,7 +4845,7 @@ static int setup_delegated_namespaces( params, runtime, rootfs, - memory_pressure_path, + pressure_path, needs_sandboxing, uid, gid, @@ -5146,6 +5171,10 @@ static int setup_term_environment(const ExecContext *context, char ***env) { return strv_env_replace_strdup(env, "TERM=" FALLBACK_TERM); } +static inline void free_pressure_paths(char *(*p)[_PRESSURE_RESOURCE_MAX]) { + free_many_charp(*p, _PRESSURE_RESOURCE_MAX); +} + int exec_invoke( const ExecCommand *command, const ExecContext *context, @@ -5157,7 +5186,8 @@ int exec_invoke( _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL; int r; const char *username = NULL, *groupname = NULL; - _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL, *own_user = NULL; + _cleanup_free_ char *home_buffer = NULL, *own_user = NULL; + _cleanup_(free_pressure_paths) char *pressure_path[_PRESSURE_RESOURCE_MAX] = {}; const char 
*pwent_home = NULL, *shell = NULL; dev_t journal_stream_dev = 0; ino_t journal_stream_ino = 0; @@ -5753,36 +5783,44 @@ int exec_invoke( } if (is_pressure_supported() > 0) { - if (cgroup_context_want_memory_pressure(cgroup_context)) { - r = cg_get_path(params->cgroup_path, "memory.pressure", &memory_pressure_path); - if (r < 0) { - *exit_status = EXIT_MEMORY; - return log_oom(); - } + for (PressureResource t = 0; t < _PRESSURE_RESOURCE_MAX; t++) { + if (cgroup_context_want_pressure(cgroup_context, t)) { + _cleanup_free_ char *pressure_file = strjoin(pressure_resource_to_string(t), ".pressure"); + if (!pressure_file) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } - r = chmod_and_chown(memory_pressure_path, 0644, uid, gid); - if (r < 0) { - log_full_errno(r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path); - memory_pressure_path = mfree(memory_pressure_path); - } - /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative - * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory - * pressure path environment variable or read-write mount to the unit. This is why we check if - * memory_pressure_path != NULL in the conditional below. 
*/ - if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) { - memory_pressure_path = mfree(memory_pressure_path); - r = cg_get_path("/", "memory.pressure", &memory_pressure_path); + r = cg_get_path(params->cgroup_path, pressure_file, &pressure_path[t]); if (r < 0) { *exit_status = EXIT_MEMORY; return log_oom(); } - } - } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) { - memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */ - if (!memory_pressure_path) { - *exit_status = EXIT_MEMORY; - return log_oom(); + + r = chmod_and_chown(pressure_path[t], 0644, uid, gid); + if (r < 0) { + log_full_errno(r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to adjust ownership of '%s', ignoring: %m", pressure_path[t]); + pressure_path[t] = mfree(pressure_path[t]); + } + /* First we use the current cgroup path to chmod and chown the pressure path, then pass the + * path relative to the cgroup namespace to environment variables and mounts. If chown/chmod + * fails, we should not pass pressure path environment variable or read-write mount to the + * unit. This is why we check if pressure_path[t] != NULL in the conditional below. 
*/ + if (pressure_path[t] && needs_sandboxing && exec_needs_cgroup_namespace(context)) { + pressure_path[t] = mfree(pressure_path[t]); + r = cg_get_path("/", pressure_file, &pressure_path[t]); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + } + } else if (cgroup_context->pressure_watch[t] == CGROUP_PRESSURE_WATCH_NO) { + pressure_path[t] = strdup("/dev/null"); /* /dev/null is explicit indicator for turning off pressure watch */ + if (!pressure_path[t]) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } } } } @@ -5829,7 +5867,7 @@ int exec_invoke( shell, journal_stream_dev, journal_stream_ino, - memory_pressure_path, + pressure_path, needs_sandboxing, &our_env); if (r < 0) { @@ -6047,7 +6085,7 @@ int exec_invoke( runtime, &rootfs, /* delegate= */ false, - memory_pressure_path, + pressure_path, uid, gid, command, @@ -6144,7 +6182,7 @@ int exec_invoke( runtime, &rootfs, /* delegate= */ true, - memory_pressure_path, + pressure_path, uid, gid, command, diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index c8f802687ee5b..7aa0a3a6c003d 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -279,7 +279,11 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { if (r < 0) return r; - r = serialize_item(f, "exec-cgroup-context-memory-pressure-watch", cgroup_pressure_watch_to_string(c->memory_pressure_watch)); + r = serialize_item(f, "exec-cgroup-context-memory-pressure-watch", cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_MEMORY])); + if (r < 0) + return r; + + r = serialize_item(f, "exec-cgroup-context-cpu-pressure-watch", cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_CPU])); if (r < 0) return r; @@ -287,8 +291,14 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { if (r < 0) return r; - if (c->memory_pressure_threshold_usec != USEC_INFINITY) { - r = serialize_usec(f, "exec-cgroup-context-memory-pressure-threshold-usec", 
c->memory_pressure_threshold_usec); + if (c->pressure_threshold_usec[PRESSURE_MEMORY] != USEC_INFINITY) { + r = serialize_usec(f, "exec-cgroup-context-memory-pressure-threshold-usec", c->pressure_threshold_usec[PRESSURE_MEMORY]); + if (r < 0) + return r; + } + + if (c->pressure_threshold_usec[PRESSURE_CPU] != USEC_INFINITY) { + r = serialize_usec(f, "exec-cgroup-context-cpu-pressure-threshold-usec", c->pressure_threshold_usec[PRESSURE_CPU]); if (r < 0) return r; } @@ -621,15 +631,23 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { if (r < 0) return r; } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-watch="))) { - c->memory_pressure_watch = cgroup_pressure_watch_from_string(val); - if (c->memory_pressure_watch < 0) + c->pressure_watch[PRESSURE_MEMORY] = cgroup_pressure_watch_from_string(val); + if (c->pressure_watch[PRESSURE_MEMORY] < 0) + return -EINVAL; + } else if ((val = startswith(l, "exec-cgroup-context-cpu-pressure-watch="))) { + c->pressure_watch[PRESSURE_CPU] = cgroup_pressure_watch_from_string(val); + if (c->pressure_watch[PRESSURE_CPU] < 0) return -EINVAL; } else if ((val = startswith(l, "exec-cgroup-context-delegate-subgroup="))) { r = free_and_strdup(&c->delegate_subgroup, val); if (r < 0) return r; } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-threshold-usec="))) { - r = deserialize_usec(val, &c->memory_pressure_threshold_usec); + r = deserialize_usec(val, &c->pressure_threshold_usec[PRESSURE_MEMORY]); + if (r < 0) + return r; + } else if ((val = startswith(l, "exec-cgroup-context-cpu-pressure-threshold-usec="))) { + r = deserialize_usec(val, &c->pressure_threshold_usec[PRESSURE_CPU]); if (r < 0) return r; } else if ((val = startswith(l, "exec-cgroup-context-device-allow="))) { diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index bf808d220bb73..1f5b85838255c 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ 
b/src/core/load-fragment-gperf.gperf.in @@ -276,8 +276,10 @@ {{type}}.SocketBindAllow, config_parse_cgroup_socket_bind, 0, offsetof({{type}}, cgroup_context.socket_bind_allow) {{type}}.SocketBindDeny, config_parse_cgroup_socket_bind, 0, offsetof({{type}}, cgroup_context.socket_bind_deny) {{type}}.RestrictNetworkInterfaces, config_parse_restrict_network_interfaces, 0, offsetof({{type}}, cgroup_context) -{{type}}.MemoryPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, cgroup_context.memory_pressure_threshold_usec) -{{type}}.MemoryPressureWatch, config_parse_memory_pressure_watch, 0, offsetof({{type}}, cgroup_context.memory_pressure_watch) +{{type}}.MemoryPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, cgroup_context.pressure_threshold_usec[PRESSURE_MEMORY]) +{{type}}.MemoryPressureWatch, config_parse_pressure_watch, 0, offsetof({{type}}, cgroup_context.pressure_watch[PRESSURE_MEMORY]) +{{type}}.CPUPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, cgroup_context.pressure_threshold_usec[PRESSURE_CPU]) +{{type}}.CPUPressureWatch, config_parse_pressure_watch, 0, offsetof({{type}}, cgroup_context.pressure_watch[PRESSURE_CPU]) {{type}}.NFTSet, config_parse_cgroup_nft_set, NFT_SET_PARSE_CGROUP, offsetof({{type}}, cgroup_context) {{type}}.CoredumpReceive, config_parse_bool, 0, offsetof({{type}}, cgroup_context.coredump_receive) {{type}}.BindNetworkInterface, config_parse_bind_network_interface, 0, offsetof({{type}}, cgroup_context) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index cef01ab776365..f4480719a886f 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -150,7 +150,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_time DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only); DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy); 
DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference); -DEFINE_CONFIG_PARSE_ENUM(config_parse_memory_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch); +DEFINE_CONFIG_PARSE_ENUM(config_parse_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch); DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1); DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t); DEFINE_CONFIG_PARSE_PTR(config_parse_cg_cpu_weight, cg_cpu_weight_parse, uint64_t); diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 4677564904c52..a5b7595dbfdb9 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -164,7 +164,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_watchdog_sec); CONFIG_PARSER_PROTOTYPE(config_parse_tty_size); CONFIG_PARSER_PROTOTYPE(config_parse_log_filter_patterns); CONFIG_PARSER_PROTOTYPE(config_parse_open_file); -CONFIG_PARSER_PROTOTYPE(config_parse_memory_pressure_watch); +CONFIG_PARSER_PROTOTYPE(config_parse_pressure_watch); CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_nft_set); CONFIG_PARSER_PROTOTYPE(config_parse_mount_node); CONFIG_PARSER_PROTOTYPE(config_parse_concurrency_max); diff --git a/src/core/main.c b/src/core/main.c index 3a6284b456bc3..5db6ae7402b9b 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -806,8 +806,10 @@ static int parse_config_file(void) { { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_defaults.memory_accounting }, { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_defaults.tasks_accounting }, { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_defaults.tasks_max }, - { "Manager", "DefaultMemoryPressureThresholdSec", config_parse_sec, 0, &arg_defaults.memory_pressure_threshold_usec }, - { "Manager", "DefaultMemoryPressureWatch", config_parse_memory_pressure_watch, 0, &arg_defaults.memory_pressure_watch }, + { "Manager", "DefaultMemoryPressureThresholdSec", 
config_parse_sec, 0, &arg_defaults.pressure_threshold_usec[PRESSURE_MEMORY] }, + { "Manager", "DefaultMemoryPressureWatch", config_parse_pressure_watch, 0, &arg_defaults.pressure_watch[PRESSURE_MEMORY] }, + { "Manager", "DefaultCPUPressureThresholdSec", config_parse_sec, 0, &arg_defaults.pressure_threshold_usec[PRESSURE_CPU] }, + { "Manager", "DefaultCPUPressureWatch", config_parse_pressure_watch, 0, &arg_defaults.pressure_watch[PRESSURE_CPU] }, { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, arg_runtime_scope, &arg_cad_burst_action }, { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_defaults.oom_policy }, { "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL }, diff --git a/src/core/manager.c b/src/core/manager.c index e8c5f00895847..0b158fc285aae 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -616,6 +616,8 @@ static char** sanitize_environment(char **l) { l, "CACHE_DIRECTORY", "CONFIGURATION_DIRECTORY", + "CPU_PRESSURE_WATCH", + "CPU_PRESSURE_WRITE", "CREDENTIALS_DIRECTORY", "EXIT_CODE", "EXIT_STATUS", @@ -796,26 +798,37 @@ static int manager_setup_sigchld_event_source(Manager *m) { return 0; } -int manager_setup_memory_pressure_event_source(Manager *m) { +typedef int (*pressure_add_t)(sd_event *, sd_event_source **, sd_event_handler_t, void *); +typedef int (*pressure_set_period_t)(sd_event_source *, usec_t, usec_t); + +static const struct { + pressure_add_t add; + pressure_set_period_t set_period; +} pressure_dispatch_table[_PRESSURE_RESOURCE_MAX] = { + [PRESSURE_MEMORY] = { sd_event_add_memory_pressure, sd_event_source_set_memory_pressure_period }, + [PRESSURE_CPU] = { sd_event_add_cpu_pressure, sd_event_source_set_cpu_pressure_period }, +}; + +int manager_setup_pressure_event_source(Manager *m, PressureResource t) { int r; assert(m); + assert(t >= 0 && t < _PRESSURE_RESOURCE_MAX); - m->memory_pressure_event_source = sd_event_source_disable_unref(m->memory_pressure_event_source); + 
m->pressure_event_source[t] = sd_event_source_disable_unref(m->pressure_event_source[t]); - r = sd_event_add_memory_pressure(m->event, &m->memory_pressure_event_source, NULL, NULL); + r = pressure_dispatch_table[t].add(m->event, &m->pressure_event_source[t], NULL, NULL); if (r < 0) log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN) ? LOG_DEBUG : LOG_NOTICE, r, - "Failed to establish memory pressure event source, ignoring: %m"); - else if (m->defaults.memory_pressure_threshold_usec != USEC_INFINITY) { - - /* If there's a default memory pressure threshold set, also apply it to the service manager itself */ - r = sd_event_source_set_memory_pressure_period( - m->memory_pressure_event_source, - m->defaults.memory_pressure_threshold_usec, - MEMORY_PRESSURE_DEFAULT_WINDOW_USEC); + "Failed to establish %s pressure event source, ignoring: %m", pressure_resource_to_string(t)); + else if (m->defaults.pressure_threshold_usec[t] != USEC_INFINITY) { + + r = pressure_dispatch_table[t].set_period( + m->pressure_event_source[t], + m->defaults.pressure_threshold_usec[t], + PRESSURE_DEFAULT_WINDOW_USEC); if (r < 0) - log_warning_errno(r, "Failed to adjust memory pressure threshold, ignoring: %m"); + log_warning_errno(r, "Failed to adjust %s pressure threshold, ignoring: %m", pressure_resource_to_string(t)); } return 0; @@ -1001,9 +1014,11 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, if (r < 0) return r; - r = manager_setup_memory_pressure_event_source(m); - if (r < 0) - return r; + for (PressureResource t = 0; t < _PRESSURE_RESOURCE_MAX; t++) { + r = manager_setup_pressure_event_source(m, t); + if (r < 0) + return r; + } #if HAVE_LIBBPF if (MANAGER_IS_SYSTEM(m) && bpf_restrict_fs_supported(/* initialize= */ true)) { @@ -1711,7 +1726,8 @@ Manager* manager_free(Manager *m) { sd_event_source_unref(m->user_lookup_event_source); sd_event_source_unref(m->handoff_timestamp_event_source); 
sd_event_source_unref(m->pidref_event_source); - sd_event_source_unref(m->memory_pressure_event_source); + for (PressureResource t = 0; t < _PRESSURE_RESOURCE_MAX; t++) + sd_event_source_unref(m->pressure_event_source[t]); safe_close(m->signal_fd); safe_close(m->notify_fd); @@ -4298,8 +4314,8 @@ int manager_set_unit_defaults(Manager *m, const UnitDefaults *defaults) { m->defaults.oom_score_adjust = defaults->oom_score_adjust; m->defaults.oom_score_adjust_set = defaults->oom_score_adjust_set; - m->defaults.memory_pressure_watch = defaults->memory_pressure_watch; - m->defaults.memory_pressure_threshold_usec = defaults->memory_pressure_threshold_usec; + memcpy(m->defaults.pressure_watch, defaults->pressure_watch, sizeof(m->defaults.pressure_watch)); + memcpy(m->defaults.pressure_threshold_usec, defaults->pressure_threshold_usec, sizeof(m->defaults.pressure_threshold_usec)); free_and_replace(m->defaults.smack_process_label, label); rlimit_free_all(m->defaults.rlimit); @@ -5191,8 +5207,8 @@ void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope) { .tasks_max = DEFAULT_TASKS_MAX, .timer_accuracy_usec = 1 * USEC_PER_MINUTE, - .memory_pressure_watch = CGROUP_PRESSURE_WATCH_AUTO, - .memory_pressure_threshold_usec = MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC, + .pressure_watch = { CGROUP_PRESSURE_WATCH_AUTO, CGROUP_PRESSURE_WATCH_AUTO }, + .pressure_threshold_usec = { PRESSURE_DEFAULT_THRESHOLD_USEC, PRESSURE_DEFAULT_THRESHOLD_USEC }, .oom_policy = OOM_STOP, .oom_score_adjust_set = false, diff --git a/src/core/manager.h b/src/core/manager.h index 2df606005dbb1..3e461f22528e8 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -147,8 +147,8 @@ typedef struct UnitDefaults { int oom_score_adjust; bool oom_score_adjust_set; - CGroupPressureWatch memory_pressure_watch; - usec_t memory_pressure_threshold_usec; + CGroupPressureWatch pressure_watch[_PRESSURE_RESOURCE_MAX]; + usec_t pressure_threshold_usec[_PRESSURE_RESOURCE_MAX]; char *smack_process_label; @@ -479,7 
+479,7 @@ typedef struct Manager { /* Dump*() are slow, so always rate limit them to 10 per 10 minutes */ RateLimit dump_ratelimit; - sd_event_source *memory_pressure_event_source; + sd_event_source *pressure_event_source[_PRESSURE_RESOURCE_MAX]; /* For NFTSet= */ sd_netlink *nfnl; @@ -560,7 +560,7 @@ void manager_unwatch_pidref(Manager *m, const PidRef *pid); unsigned manager_dispatch_load_queue(Manager *m); -int manager_setup_memory_pressure_event_source(Manager *m); +int manager_setup_pressure_event_source(Manager *m, PressureResource t); int manager_default_environment(Manager *m); int manager_transient_environment_add(Manager *m, char **plus); diff --git a/src/core/system.conf.in b/src/core/system.conf.in index 54196e84894df..4507d02905cb0 100644 --- a/src/core/system.conf.in +++ b/src/core/system.conf.in @@ -77,6 +77,8 @@ #DefaultLimitRTTIME= #DefaultMemoryPressureThresholdSec=200ms #DefaultMemoryPressureWatch=auto +#DefaultCPUPressureThresholdSec=200ms +#DefaultCPUPressureWatch=auto #DefaultOOMPolicy=stop #DefaultSmackProcessLabel= #DefaultRestrictSUIDSGID= diff --git a/src/core/unit.c b/src/core/unit.c index 9af7fb51405e0..54c8d1bd35069 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -178,8 +178,8 @@ static void unit_init(Unit *u) { if (u->type != UNIT_SLICE) cc->tasks_max = u->manager->defaults.tasks_max; - cc->memory_pressure_watch = u->manager->defaults.memory_pressure_watch; - cc->memory_pressure_threshold_usec = u->manager->defaults.memory_pressure_threshold_usec; + memcpy(cc->pressure_watch, u->manager->defaults.pressure_watch, sizeof(cc->pressure_watch)); + memcpy(cc->pressure_threshold_usec, u->manager->defaults.pressure_threshold_usec, sizeof(cc->pressure_threshold_usec)); } ec = unit_get_exec_context(u); diff --git a/src/core/user.conf.in b/src/core/user.conf.in index 9c37f4b54e9bd..fe45c00b74e4c 100644 --- a/src/core/user.conf.in +++ b/src/core/user.conf.in @@ -54,6 +54,8 @@ #DefaultLimitRTTIME= #DefaultMemoryPressureThresholdSec=200ms 
#DefaultMemoryPressureWatch=auto +#DefaultCPUPressureThresholdSec=200ms +#DefaultCPUPressureWatch=auto #DefaultSmackProcessLabel= #DefaultRestrictSUIDSGID= #ReloadLimitIntervalSec= diff --git a/src/core/varlink-cgroup.c b/src/core/varlink-cgroup.c index 65c07ecfad477..1e4a8f8682dd2 100644 --- a/src/core/varlink-cgroup.c +++ b/src/core/varlink-cgroup.c @@ -323,8 +323,10 @@ int unit_cgroup_context_build_json(sd_json_variant **ret, const char *name, void JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("ManagedOOMMemoryPressureLimit", c->moom_mem_pressure_limit), JSON_BUILD_PAIR_FINITE_USEC("ManagedOOMMemoryPressureDurationUSec", c->moom_mem_pressure_duration_usec), SD_JSON_BUILD_PAIR_STRING("ManagedOOMPreference", managed_oom_preference_to_string(c->moom_preference)), - SD_JSON_BUILD_PAIR_STRING("MemoryPressureWatch", cgroup_pressure_watch_to_string(c->memory_pressure_watch)), - JSON_BUILD_PAIR_FINITE_USEC("MemoryPressureThresholdUSec", c->memory_pressure_threshold_usec), + SD_JSON_BUILD_PAIR_STRING("MemoryPressureWatch", cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_MEMORY])), + JSON_BUILD_PAIR_FINITE_USEC("MemoryPressureThresholdUSec", c->pressure_threshold_usec[PRESSURE_MEMORY]), + SD_JSON_BUILD_PAIR_STRING("CPUPressureWatch", cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_CPU])), + JSON_BUILD_PAIR_FINITE_USEC("CPUPressureThresholdUSec", c->pressure_threshold_usec[PRESSURE_CPU]), /* Others */ SD_JSON_BUILD_PAIR_BOOLEAN("CoredumpReceive", c->coredump_receive)); diff --git a/src/core/varlink-manager.c b/src/core/varlink-manager.c index bad37206328dd..63d61a56fc8c3 100644 --- a/src/core/varlink-manager.c +++ b/src/core/varlink-manager.c @@ -106,8 +106,10 @@ static int manager_context_build_json(sd_json_variant **ret, const char *name, v SD_JSON_BUILD_PAIR_BOOLEAN("DefaultTasksAccounting", m->defaults.tasks_accounting), SD_JSON_BUILD_PAIR_CALLBACK("DefaultLimits", rlimit_table_build_json, m->defaults.rlimit), SD_JSON_BUILD_PAIR_UNSIGNED("DefaultTasksMax", 
cgroup_tasks_max_resolve(&m->defaults.tasks_max)), - JSON_BUILD_PAIR_FINITE_USEC("DefaultMemoryPressureThresholdUSec", m->defaults.memory_pressure_threshold_usec), - SD_JSON_BUILD_PAIR_STRING("DefaultMemoryPressureWatch", cgroup_pressure_watch_to_string(m->defaults.memory_pressure_watch)), + JSON_BUILD_PAIR_FINITE_USEC("DefaultMemoryPressureThresholdUSec", m->defaults.pressure_threshold_usec[PRESSURE_MEMORY]), + SD_JSON_BUILD_PAIR_STRING("DefaultMemoryPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure_watch[PRESSURE_MEMORY])), + JSON_BUILD_PAIR_FINITE_USEC("DefaultCPUPressureThresholdUSec", m->defaults.pressure_threshold_usec[PRESSURE_CPU]), + SD_JSON_BUILD_PAIR_STRING("DefaultCPUPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure_watch[PRESSURE_CPU])), JSON_BUILD_PAIR_FINITE_USEC("RuntimeWatchdogUSec", manager_get_watchdog(m, WATCHDOG_RUNTIME)), JSON_BUILD_PAIR_FINITE_USEC("RebootWatchdogUSec", manager_get_watchdog(m, WATCHDOG_REBOOT)), JSON_BUILD_PAIR_FINITE_USEC("KExecWatchdogUSec", manager_get_watchdog(m, WATCHDOG_KEXEC)), diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 84de3478a7f9e..9c732543fac7d 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -2383,6 +2383,7 @@ static const BusProperty cgroup_properties[] = { { "ManagedOOMMemoryPressure", bus_append_string }, { "ManagedOOMPreference", bus_append_string }, { "MemoryPressureWatch", bus_append_string }, + { "CPUPressureWatch", bus_append_string }, { "DelegateSubgroup", bus_append_string }, { "ManagedOOMMemoryPressureLimit", bus_append_parse_permyriad }, { "MemoryAccounting", bus_append_parse_boolean }, @@ -2421,6 +2422,7 @@ static const BusProperty cgroup_properties[] = { { "SocketBindAllow", bus_append_socket_filter }, { "SocketBindDeny", bus_append_socket_filter }, { "MemoryPressureThresholdSec", bus_append_parse_sec_rename }, + { "CPUPressureThresholdSec", bus_append_parse_sec_rename }, { "NFTSet", bus_append_nft_set }, 
{ "BindNetworkInterface", bus_append_string }, diff --git a/src/shared/varlink-io.systemd.Manager.c b/src/shared/varlink-io.systemd.Manager.c index f33cab34b3de9..dc49788fd9067 100644 --- a/src/shared/varlink-io.systemd.Manager.c +++ b/src/shared/varlink-io.systemd.Manager.c @@ -64,6 +64,10 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(DefaultMemoryPressureThresholdUSec, SD_VARLINK_INT, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultMemoryPressureWatch="), SD_VARLINK_DEFINE_FIELD(DefaultMemoryPressureWatch, SD_VARLINK_STRING, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultCPUPressureThresholdSec="), + SD_VARLINK_DEFINE_FIELD(DefaultCPUPressureThresholdUSec, SD_VARLINK_INT, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultCPUPressureWatch="), + SD_VARLINK_DEFINE_FIELD(DefaultCPUPressureWatch, SD_VARLINK_STRING, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#RuntimeWatchdogSec="), SD_VARLINK_DEFINE_FIELD(RuntimeWatchdogUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#RebootWatchdogSec="), diff --git a/src/shared/varlink-io.systemd.Unit.c b/src/shared/varlink-io.systemd.Unit.c index 05676210ef2a4..a230f29daba8b 100644 --- a/src/shared/varlink-io.systemd.Unit.c +++ b/src/shared/varlink-io.systemd.Unit.c @@ -228,6 +228,10 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(MemoryPressureWatch, SD_VARLINK_STRING, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#MemoryPressureThresholdSec="),
SD_VARLINK_DEFINE_FIELD(MemoryPressureThresholdUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#CPUPressureWatch="), + SD_VARLINK_DEFINE_FIELD(CPUPressureWatch, SD_VARLINK_STRING, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#CPUPressureThresholdSec="), + SD_VARLINK_DEFINE_FIELD(CPUPressureThresholdUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE), /* Others */ SD_VARLINK_FIELD_COMMENT("Reflects whether to forward coredumps for processes that crash within this cgroup"), diff --git a/test/integration-tests/TEST-79-MEMPRESS/meson.build b/test/integration-tests/TEST-79-PRESSURE/meson.build similarity index 100% rename from test/integration-tests/TEST-79-MEMPRESS/meson.build rename to test/integration-tests/TEST-79-PRESSURE/meson.build diff --git a/test/integration-tests/meson.build b/test/integration-tests/meson.build index 5d71e87c79cbc..198371ccae49a 100644 --- a/test/integration-tests/meson.build +++ b/test/integration-tests/meson.build @@ -91,7 +91,7 @@ foreach dirname : [ 'TEST-74-AUX-UTILS', 'TEST-75-RESOLVED', 'TEST-78-SIGQUEUE', - 'TEST-79-MEMPRESS', + 'TEST-79-PRESSURE', 'TEST-80-NOTIFYACCESS', 'TEST-81-GENERATORS', 'TEST-82-SOFTREBOOT', diff --git a/test/units/TEST-79-MEMPRESS.sh b/test/units/TEST-79-PRESSURE.sh similarity index 56% rename from test/units/TEST-79-MEMPRESS.sh rename to test/units/TEST-79-PRESSURE.sh index 065916096682e..d4e4a9e06b5b4 100755 --- a/test/units/TEST-79-MEMPRESS.sh +++ b/test/units/TEST-79-PRESSURE.sh @@ -13,7 +13,7 @@ if ! cat /proc/pressure/memory >/dev/null ; then exit 0 fi -CGROUP=/sys/fs/cgroup/"$(systemctl show TEST-79-MEMPRESS.service -P ControlGroup)" +CGROUP=/sys/fs/cgroup/"$(systemctl show TEST-79-PRESSURE.service -P ControlGroup)" test -d "$CGROUP" if ! 
test -f "$CGROUP"/memory.pressure ; then @@ -61,4 +61,57 @@ systemd-run \ rm "$SCRIPT" +# Now test CPU pressure + +if ! cat /proc/pressure/cpu >/dev/null ; then + echo "kernel has no CPU PSI support." >&2 + echo OK >/testok + exit 0 +fi + +if ! test -f "$CGROUP"/cpu.pressure ; then + echo "No CPU accounting/PSI delegated via cgroup, can't test." >&2 + echo OK >/testok + exit 0 +fi + +UNIT="test-cpupress-$RANDOM.service" +SCRIPT="/tmp/cpupress-$RANDOM.sh" + +cat >"$SCRIPT" <<'EOF' +#!/usr/bin/env bash + +set -ex + +export +id + +test -n "$CPU_PRESSURE_WATCH" +test "$CPU_PRESSURE_WATCH" != /dev/null +test -w "$CPU_PRESSURE_WATCH" + +ls -al "$CPU_PRESSURE_WATCH" + +EXPECTED="$(echo -n -e "some 123000 2000000\x00" | base64)" + +test "$EXPECTED" = "$CPU_PRESSURE_WRITE" + +EOF + +chmod +x "$SCRIPT" + +systemd-run \ + -u "$UNIT" \ + -p Type=exec \ + -p ProtectControlGroups=1 \ + -p DynamicUser=1 \ + -p CPUPressureWatch=on \ + -p CPUPressureThresholdSec=123ms \ + -p BindPaths=$SCRIPT \ + `# Make sanitizers happy when DynamicUser=1 pulls in instrumented systemd NSS modules` \ + -p EnvironmentFile=-/usr/lib/systemd/systemd-asan-env \ + --wait "$SCRIPT" + +rm "$SCRIPT" + touch /testok From 73b1284fd0060b6db8c567e27840652b41d52ec4 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Sat, 7 Mar 2026 23:37:55 +0100 Subject: [PATCH 5/7] core: Add I/O pressure support --- man/org.freedesktop.systemd1.xml | 126 ++++++++++++++++++++--- man/rules/meson.build | 3 + man/sd_event_add_memory_pressure.xml | 101 +++++++++++++------ man/systemd-system.conf.xml | 14 +++ man/systemd.exec.xml | 12 +++ man/systemd.resource-control.xml | 49 +++++++++ src/basic/psi-util.c | 6 ++ src/basic/psi-util.h | 1 + src/core/cgroup.c | 10 +- src/core/cgroup.h | 9 +- src/core/dbus-cgroup.c | 12 ++- src/core/dbus-manager.c | 2 + src/core/execute-serialize.c | 18 ++++ src/core/load-fragment-gperf.gperf.in | 2 + src/core/main.c | 2 + src/core/manager.c | 7 +- src/core/system.conf.in | 2 + src/core/user.conf.in | 
2 + src/core/varlink-cgroup.c | 2 + src/core/varlink-manager.c | 2 + src/libsystemd/libsystemd.sym | 3 + src/libsystemd/sd-event/event-source.h | 1 + src/libsystemd/sd-event/sd-event.c | 49 ++++++++- src/shared/bus-unit-util.c | 2 + src/shared/varlink-io.systemd.Manager.c | 4 + src/shared/varlink-io.systemd.Unit.c | 4 + src/systemd/sd-event.h | 3 + src/test/test-pressure.c | 128 ++++++++++++++++++++++++ test/units/TEST-79-PRESSURE.sh | 53 ++++++++++ 29 files changed, 573 insertions(+), 56 deletions(-) diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 2c590183dfb9b..e51f385122c2a 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -556,6 +556,10 @@ node /org/freedesktop/systemd1 { readonly t DefaultCPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s DefaultCPUPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t DefaultIOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DefaultIOPressureWatch = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t TimerSlackNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -799,6 +803,10 @@ node /org/freedesktop/systemd1 { + + + + @@ -1251,6 +1259,10 @@ node /org/freedesktop/systemd1 { + + + + @@ -3076,6 +3088,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -3749,6 
+3765,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -4445,6 +4465,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -5348,6 +5372,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -6037,6 +6065,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -6707,6 +6739,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -7433,6 +7469,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -8046,6 +8086,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -8624,6 +8668,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -9483,6 +9531,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + 
@org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -10078,6 +10130,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + @@ -10638,6 +10694,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + @@ -11350,6 +11410,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -11527,6 +11591,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -11719,6 +11787,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -11934,6 +12006,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -12125,6 +12201,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + @@ -12341,6 +12421,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + @@ -12553,8 +12637,10 @@ $ gdbus 
introspect --system --dest org.freedesktop.systemd1 \ RemoveSubgroupFromUnit(), and KillUnitSubgroup() were added in version 258. TransactionsWithOrderingCycle was added in version 259. - DefaultCPUPressureThresholdUSec and - DefaultCPUPressureWatch were added in version 261. + DefaultCPUPressureThresholdUSec, + DefaultCPUPressureWatch, + DefaultIOPressureThresholdUSec, and + DefaultIOPressureWatch were added in version 261. Unit Objects @@ -12646,8 +12732,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ExecReloadPostEx were added in version 259. BindNetworkInterface, MemoryTHP, RefreshOnReload, and RootMStack were added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Socket Unit Objects @@ -12718,8 +12806,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills were added in 259. BindNetworkInterface MemoryTHP, and RootMStack were added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Mount Unit Objects @@ -12785,8 +12875,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills were added in 259. BindNetworkInterface MemoryTHP, and RootMStack were added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Swap Unit Objects @@ -12850,8 +12942,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills were added in 259. BindNetworkInterface, MemoryTHP, and RootMStack were added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. 
+ CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Slice Unit Objects @@ -12885,8 +12979,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ OOMKills, and ManagedOOMKills were added in 259. BindNetworkInterface was added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Scope Unit Objects @@ -12918,8 +13014,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ OOMKills, and ManagedOOMKills were added in 259. BindNetworkInterface was added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Job Objects diff --git a/man/rules/meson.build b/man/rules/meson.build index ee20ace35f3e2..11cc66331dbe8 100644 --- a/man/rules/meson.build +++ b/man/rules/meson.build @@ -609,8 +609,11 @@ manpages = [ ['sd_event_add_memory_pressure', '3', ['sd_event_add_cpu_pressure', + 'sd_event_add_io_pressure', 'sd_event_source_set_cpu_pressure_period', 'sd_event_source_set_cpu_pressure_type', + 'sd_event_source_set_io_pressure_period', + 'sd_event_source_set_io_pressure_type', 'sd_event_source_set_memory_pressure_period', 'sd_event_source_set_memory_pressure_type', 'sd_event_trim_memory'], diff --git a/man/sd_event_add_memory_pressure.xml b/man/sd_event_add_memory_pressure.xml index 1e6b734738f6c..05f2ff2b74528 100644 --- a/man/sd_event_add_memory_pressure.xml +++ b/man/sd_event_add_memory_pressure.xml @@ -25,7 +25,11 @@ sd_event_source_set_cpu_pressure_type sd_event_source_set_cpu_pressure_period - Add and configure an event source run as result of memory or CPU pressure + sd_event_add_io_pressure + sd_event_source_set_io_pressure_type + 
sd_event_source_set_io_pressure_period + + Add and configure an event source for memory, CPU, or IO pressure notifications @@ -76,6 +80,27 @@ uint64_t window_usec + + int sd_event_add_io_pressure + sd_event *event + sd_event_source **ret_source + sd_event_handler_t handler + void *userdata + + + + int sd_event_source_set_io_pressure_type + sd_event_source *source + const char *type + + + + int sd_event_source_set_io_pressure_period + sd_event_source *source + uint64_t threshold_usec + uint64_t window_usec + + int sd_event_trim_memory void @@ -88,19 +113,24 @@ sd_event_add_memory_pressure() adds a new event source that is triggered whenever memory pressure is seen. Similarly, - sd_event_add_cpu_pressure() adds a new event source that is triggered whenever CPU - pressure is seen. This functionality is built around the Linux kernel's sd_event_add_cpu_pressure() and sd_event_add_io_pressure() add + new event sources that are triggered whenever CPU or IO pressure is seen, respectively. This functionality + is built around the Linux kernel's Pressure Stall Information (PSI) logic. - Both functions expect an event loop object as first parameter, and return the allocated event source + These functions expect an event loop object as first parameter, and return the allocated event source object in the second parameter, on success. The handler parameter is a function to call when pressure is seen, or NULL. The handler function will be passed the userdata pointer, which may be chosen freely by the caller. The handler may return negative to signal an error (see below), other return values are ignored. If - handler is NULL, a default handler that compacts allocation - caches maintained by libsystemd as well as glibc (via malloc_trim3) - will be used. + handler is NULL, a default handler is used. For + sd_event_add_memory_pressure(), the default handler compacts allocation caches + maintained by libsystemd as well as glibc (via malloc_trim3). 
+ For sd_event_add_cpu_pressure() and + sd_event_add_io_pressure(), the default handler is a no-op. It is recommended to + pass a custom handler for CPU and IO pressure to take meaningful action when pressure is + detected. To destroy an event source object use sd_event_source_unref3, @@ -110,8 +140,8 @@ sd_event_source_set_enabled3 with SD_EVENT_OFF. - If the second parameter of sd_event_add_memory_pressure() or - sd_event_add_cpu_pressure() is + If the second parameter of sd_event_add_memory_pressure(), + sd_event_add_cpu_pressure(), or sd_event_add_io_pressure() is NULL no reference to the event source object is returned. In this case, the event source is considered "floating", and will be destroyed implicitly when the event loop itself is destroyed. @@ -146,6 +176,11 @@ provides the some line, not the full line, so only some is valid when watching at the system level. + The IO pressure event source follows the same logic, but uses the + $IO_PRESSURE_WATCH/$IO_PRESSURE_WRITE environment variables, + the io.pressure cgroup file, and the system-wide PSI interface file + /proc/pressure/io instead. + Or in other words: preferably any explicit configuration passed in by an invoking service manager (or similar) is used as notification source, before falling back to local notifications of the service, and finally to global notifications of the system. @@ -189,12 +224,15 @@ Similarly, sd_event_source_set_cpu_pressure_type() and sd_event_source_set_cpu_pressure_period() can be used to fine-tune the PSI - parameters for CPU pressure notifications. They work identically to their memory pressure counterparts. + parameters for CPU pressure notifications, and + sd_event_source_set_io_pressure_type() and + sd_event_source_set_io_pressure_period() can be used to fine-tune the PSI + parameters for IO pressure notifications. They work identically to their memory pressure counterparts. 
The type parameter takes either some or full, and the period function takes threshold and period times in microseconds. The same constraints apply: these calls must - be invoked immediately after allocating the event source, and will fail if CPU pressure parameterization - has been passed in via the - $CPU_PRESSURE_WATCH/$CPU_PRESSURE_WRITE environment + be invoked immediately after allocating the event source, and will fail if pressure parameterization + has been passed in via the corresponding + $*_PRESSURE_WATCH/$*_PRESSURE_WRITE environment variables. The sd_event_trim_memory() function releases various internal allocation @@ -242,9 +280,9 @@ -EHOSTDOWN - The $MEMORY_PRESSURE_WATCH or - $CPU_PRESSURE_WATCH variable has been set to the literal - string /dev/null, in order to explicitly disable pressure + The $MEMORY_PRESSURE_WATCH, + $CPU_PRESSURE_WATCH, or $IO_PRESSURE_WATCH variable has been + set to the literal string /dev/null, in order to explicitly disable pressure handling. @@ -253,9 +291,9 @@ -EBADMSG - The $MEMORY_PRESSURE_WATCH or - $CPU_PRESSURE_WATCH variable has been set to an invalid - string, for example a relative rather than an absolute path. + The $MEMORY_PRESSURE_WATCH, + $CPU_PRESSURE_WATCH, or $IO_PRESSURE_WATCH variable has been + set to an invalid string, for example a relative rather than an absolute path. @@ -263,9 +301,9 @@ -ENOTTY - The $MEMORY_PRESSURE_WATCH or - $CPU_PRESSURE_WATCH variable points to a regular file - outside of the procfs or cgroupfs file systems. + The $MEMORY_PRESSURE_WATCH, + $CPU_PRESSURE_WATCH, or $IO_PRESSURE_WATCH variable points + to a regular file outside of the procfs or cgroupfs file systems. @@ -273,9 +311,9 @@ -EOPNOTSUPP - No configuration via $MEMORY_PRESSURE_WATCH or - $CPU_PRESSURE_WATCH has been specified and the local kernel does not support the - PSI interface. 
+ No configuration via $MEMORY_PRESSURE_WATCH, + $CPU_PRESSURE_WATCH, or $IO_PRESSURE_WATCH has been specified + and the local kernel does not support the PSI interface. @@ -286,7 +324,9 @@ This is returned by sd_event_source_set_memory_pressure_type(), sd_event_source_set_memory_pressure_period(), sd_event_source_set_cpu_pressure_type(), - and sd_event_source_set_cpu_pressure_period() if invoked on event sources + sd_event_source_set_cpu_pressure_period(), + sd_event_source_set_io_pressure_type(), + and sd_event_source_set_io_pressure_period() if invoked on event sources at a time later than immediately after allocating them. @@ -329,8 +369,11 @@ sd_event_source_set_memory_pressure_period(), and sd_event_trim_memory() were added in version 254. sd_event_add_cpu_pressure(), - sd_event_source_set_cpu_pressure_type(), and - sd_event_source_set_cpu_pressure_period() were added in version 261. + sd_event_source_set_cpu_pressure_type(), + sd_event_source_set_cpu_pressure_period(), + sd_event_add_io_pressure(), + sd_event_source_set_io_pressure_type(), and + sd_event_source_set_io_pressure_period() were added in version 261. diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index eca8a0f0bd7a0..b93c31162b499 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -340,6 +340,20 @@ + + + DefaultIOPressureWatch= + DefaultIOPressureThresholdSec= + + Configures the default settings for the per-unit + IOPressureWatch= and IOPressureThresholdSec= + settings. See + systemd.resource-control5 + for details. Defaults to auto and 200ms, respectively. This + also sets the IO pressure monitoring threshold for the service manager itself. 
+ + + diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 1048fcadfc376..455f666374f99 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -4717,6 +4717,18 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX + + $IO_PRESSURE_WATCH + $IO_PRESSURE_WRITE + + If IO pressure monitoring is enabled for this service unit, the path to watch + and the data to write into it. See Resource Pressure + Handling for details about these variables and the service protocol data they + convey. + + + + $FDSTORE diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 20584c0b7373b..ed8b0e11c8698 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1703,6 +1703,55 @@ DeviceAllow=/dev/loop-control + + + IOPressureWatch= + + Controls IO pressure monitoring for invoked processes. Takes a boolean or one of + auto and skip. If no, tells the service not + to watch for IO pressure events, by setting the $IO_PRESSURE_WATCH + environment variable to the literal string /dev/null. If yes, + tells the service to watch for IO pressure events. This enables IO accounting for the + service, and ensures the io.pressure cgroup attribute file is accessible for + reading and writing by the service's user. It then sets the $IO_PRESSURE_WATCH + environment variable for processes invoked by the unit to the file system path to this file. The + threshold information configured with IOPressureThresholdSec= is encoded in + the $IO_PRESSURE_WRITE environment variable. If the auto + value is set, the protocol is enabled if IO accounting is enabled anyway for the unit (e.g. because + IOWeight= or IODeviceWeight= is set), and + disabled otherwise. If set to skip, the logic is neither enabled nor disabled, and + the two environment variables are not set. + + Note that services are free to use the two environment variables, but it is unproblematic if + they ignore them.
IO pressure handling must be implemented individually in each service, and + usually means different things for different software. + + Services implemented using + sd-event3 may use + sd_event_add_io_pressure3 + to watch for and handle IO pressure events. + + If not explicitly set, defaults to the DefaultIOPressureWatch= setting in + systemd-system.conf5. + + + + + + IOPressureThresholdSec= + + Sets the IO pressure threshold time for the IO pressure monitor as configured via + IOPressureWatch=. Specifies the maximum IO stall time before an IO + pressure event is signalled to the service, per 2s window. If not specified, defaults to the + DefaultIOPressureThresholdSec= setting in + systemd-system.conf5 + (which in turn defaults to 200ms). The specified value expects a time unit such as + ms or μs; see + systemd.time7 for + details on the permitted syntax. + + + Coredump Control diff --git a/src/basic/psi-util.c b/src/basic/psi-util.c index cf05485dc7b67..f2a93e674f0d9 100644 --- a/src/basic/psi-util.c +++ b/src/basic/psi-util.c @@ -116,11 +116,17 @@ const PressureResourceInfo pressure_resource_info[_PRESSURE_RESOURCE_MAX] = { .env_watch = "CPU_PRESSURE_WATCH", .env_write = "CPU_PRESSURE_WRITE", }, + [PRESSURE_IO] = { + .name = "io", + .env_watch = "IO_PRESSURE_WATCH", + .env_write = "IO_PRESSURE_WRITE", + }, }; static const char* const pressure_resource_table[_PRESSURE_RESOURCE_MAX] = { [PRESSURE_MEMORY] = "memory", [PRESSURE_CPU] = "cpu", + [PRESSURE_IO] = "io", }; DEFINE_STRING_TABLE_LOOKUP(pressure_resource, PressureResource); diff --git a/src/basic/psi-util.h b/src/basic/psi-util.h index aed74ef742d5a..88b79739463c7 100644 --- a/src/basic/psi-util.h +++ b/src/basic/psi-util.h @@ -12,6 +12,7 @@ typedef enum PressureType { typedef enum PressureResource { PRESSURE_MEMORY, PRESSURE_CPU, + PRESSURE_IO, _PRESSURE_RESOURCE_MAX, _PRESSURE_RESOURCE_INVALID = -EINVAL, } PressureResource; diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 79c1c36b1da2f..d175c972c541e
100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -185,8 +185,8 @@ void cgroup_context_init(CGroupContext *c) { * moom_mem_pressure_duration_usec is set to infinity. */ .moom_mem_pressure_duration_usec = USEC_INFINITY, - .pressure_watch = { _CGROUP_PRESSURE_WATCH_INVALID, _CGROUP_PRESSURE_WATCH_INVALID }, - .pressure_threshold_usec = { USEC_INFINITY, USEC_INFINITY }, + .pressure_watch = { _CGROUP_PRESSURE_WATCH_INVALID, _CGROUP_PRESSURE_WATCH_INVALID, _CGROUP_PRESSURE_WATCH_INVALID }, + .pressure_threshold_usec = { USEC_INFINITY, USEC_INFINITY, USEC_INFINITY }, }; } @@ -527,6 +527,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { "%sManagedOOMPreference: %s\n" "%sMemoryPressureWatch: %s\n" "%sCPUPressureWatch: %s\n" + "%sIOPressureWatch: %s\n" "%sCoredumpReceive: %s\n", prefix, yes_no(c->io_accounting), prefix, yes_no(c->memory_accounting), @@ -564,6 +565,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { prefix, managed_oom_preference_to_string(c->moom_preference), prefix, cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_MEMORY]), prefix, cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_CPU]), + prefix, cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_IO]), prefix, yes_no(c->coredump_receive)); if (c->delegate_subgroup) @@ -582,6 +584,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { fprintf(f, "%sCPUPressureThresholdSec: %s\n", prefix, FORMAT_TIMESPAN(c->pressure_threshold_usec[PRESSURE_CPU], 1)); + if (c->pressure_threshold_usec[PRESSURE_IO] != USEC_INFINITY) + fprintf(f, "%sIOPressureThresholdSec: %s\n", + prefix, FORMAT_TIMESPAN(c->pressure_threshold_usec[PRESSURE_IO], 1)); + if (c->moom_mem_pressure_duration_usec != USEC_INFINITY) fprintf(f, "%sManagedOOMMemoryPressureDurationSec: %s\n", prefix, FORMAT_TIMESPAN(c->moom_mem_pressure_duration_usec, 1)); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 69086467f1f92..26bf513bd80bd 100644 --- 
a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -372,6 +372,14 @@ static inline bool cgroup_context_want_pressure(const CGroupContext *c, Pressure c->startup_cpu_weight != CGROUP_WEIGHT_INVALID || c->cpu_quota_per_sec_usec != USEC_INFINITY; + case PRESSURE_IO: + return c->io_accounting || + c->io_weight != CGROUP_WEIGHT_INVALID || + c->startup_io_weight != CGROUP_WEIGHT_INVALID || + c->io_device_weights || + c->io_device_latencies || + c->io_device_limits; + default: assert_not_reached(); } @@ -474,7 +482,6 @@ int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, DECLARE_STRING_TABLE_LOOKUP(cgroup_pressure_watch, CGroupPressureWatch); - DECLARE_STRING_TABLE_LOOKUP(cgroup_device_permissions, CGroupDevicePermissions); DECLARE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 3c84baf415c37..ccf4304027ab4 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -431,6 +431,8 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure_threshold_usec[PRESSURE_MEMORY]), 0), SD_BUS_PROPERTY("CPUPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, pressure_watch[PRESSURE_CPU]), 0), SD_BUS_PROPERTY("CPUPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure_threshold_usec[PRESSURE_CPU]), 0), + SD_BUS_PROPERTY("IOPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, pressure_watch[PRESSURE_IO]), 0), + SD_BUS_PROPERTY("IOPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure_threshold_usec[PRESSURE_IO]), 0), SD_BUS_PROPERTY("NFTSet", "a(iiss)", property_get_cgroup_nft_set, 0, 0), SD_BUS_PROPERTY("CoredumpReceive", "b", bus_property_get_bool, offsetof(CGroupContext, coredump_receive), 0), @@ -714,11 +716,12 @@ static int 
bus_cgroup_set_transient_property( return 1; - } else if (STR_IN_SET(name, "MemoryPressureWatch", "CPUPressureWatch")) { + } else if (STR_IN_SET(name, "MemoryPressureWatch", "CPUPressureWatch", "IOPressureWatch")) { CGroupPressureWatch p; const char *t; - PressureResource pt = streq(name, "MemoryPressureWatch") ? PRESSURE_MEMORY : PRESSURE_CPU; + PressureResource pt = streq(name, "MemoryPressureWatch") ? PRESSURE_MEMORY : + streq(name, "CPUPressureWatch") ? PRESSURE_CPU : PRESSURE_IO; r = sd_bus_message_read(message, "s", &t); if (r < 0) @@ -739,10 +742,11 @@ static int bus_cgroup_set_transient_property( return 1; - } else if (STR_IN_SET(name, "MemoryPressureThresholdUSec", "CPUPressureThresholdUSec")) { + } else if (STR_IN_SET(name, "MemoryPressureThresholdUSec", "CPUPressureThresholdUSec", "IOPressureThresholdUSec")) { uint64_t t; - PressureResource pt = streq(name, "MemoryPressureThresholdUSec") ? PRESSURE_MEMORY : PRESSURE_CPU; + PressureResource pt = streq(name, "MemoryPressureThresholdUSec") ? PRESSURE_MEMORY : + streq(name, "CPUPressureThresholdUSec") ? 
PRESSURE_CPU : PRESSURE_IO; r = sd_bus_message_read(message, "t", &t); if (r < 0) diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index 6347bd7b5389d..3b7b639e9f372 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -2981,6 +2981,8 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure_watch[PRESSURE_MEMORY]), 0), SD_BUS_PROPERTY("DefaultCPUPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.pressure_threshold_usec[PRESSURE_CPU]), 0), SD_BUS_PROPERTY("DefaultCPUPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure_watch[PRESSURE_CPU]), 0), + SD_BUS_PROPERTY("DefaultIOPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.pressure_threshold_usec[PRESSURE_IO]), 0), + SD_BUS_PROPERTY("DefaultIOPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure_watch[PRESSURE_IO]), 0), SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, defaults.oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 7aa0a3a6c003d..d9312e0534ec2 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -287,6 +287,10 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { if (r < 0) return r; + r = serialize_item(f, "exec-cgroup-context-io-pressure-watch", cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_IO])); + if (r < 0) + return r; + r = serialize_item(f, "exec-cgroup-context-delegate-subgroup", c->delegate_subgroup); if (r < 0) return r; @@ 
-303,6 +307,12 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { return r; } + if (c->pressure_threshold_usec[PRESSURE_IO] != USEC_INFINITY) { + r = serialize_usec(f, "exec-cgroup-context-io-pressure-threshold-usec", c->pressure_threshold_usec[PRESSURE_IO]); + if (r < 0) + return r; + } + LIST_FOREACH(device_allow, a, c->device_allow) { r = serialize_item_format(f, "exec-cgroup-context-device-allow", "%s %s", a->path, @@ -638,6 +648,10 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { c->pressure_watch[PRESSURE_CPU] = cgroup_pressure_watch_from_string(val); if (c->pressure_watch[PRESSURE_CPU] < 0) return -EINVAL; + } else if ((val = startswith(l, "exec-cgroup-context-io-pressure-watch="))) { + c->pressure_watch[PRESSURE_IO] = cgroup_pressure_watch_from_string(val); + if (c->pressure_watch[PRESSURE_IO] < 0) + return -EINVAL; } else if ((val = startswith(l, "exec-cgroup-context-delegate-subgroup="))) { r = free_and_strdup(&c->delegate_subgroup, val); if (r < 0) @@ -650,6 +664,10 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { r = deserialize_usec(val, &c->pressure_threshold_usec[PRESSURE_CPU]); if (r < 0) return r; + } else if ((val = startswith(l, "exec-cgroup-context-io-pressure-threshold-usec="))) { + r = deserialize_usec(val, &c->pressure_threshold_usec[PRESSURE_IO]); + if (r < 0) + return r; } else if ((val = startswith(l, "exec-cgroup-context-device-allow="))) { _cleanup_free_ char *path = NULL, *rwm = NULL; CGroupDevicePermissions p; diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 1f5b85838255c..a6554a149901c 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -280,6 +280,8 @@ {{type}}.MemoryPressureWatch, config_parse_pressure_watch, 0, offsetof({{type}}, cgroup_context.pressure_watch[PRESSURE_MEMORY]) {{type}}.CPUPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, 
cgroup_context.pressure_threshold_usec[PRESSURE_CPU]) {{type}}.CPUPressureWatch, config_parse_pressure_watch, 0, offsetof({{type}}, cgroup_context.pressure_watch[PRESSURE_CPU]) +{{type}}.IOPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, cgroup_context.pressure_threshold_usec[PRESSURE_IO]) +{{type}}.IOPressureWatch, config_parse_pressure_watch, 0, offsetof({{type}}, cgroup_context.pressure_watch[PRESSURE_IO]) {{type}}.NFTSet, config_parse_cgroup_nft_set, NFT_SET_PARSE_CGROUP, offsetof({{type}}, cgroup_context) {{type}}.CoredumpReceive, config_parse_bool, 0, offsetof({{type}}, cgroup_context.coredump_receive) {{type}}.BindNetworkInterface, config_parse_bind_network_interface, 0, offsetof({{type}}, cgroup_context) diff --git a/src/core/main.c b/src/core/main.c index 5db6ae7402b9b..ce663d3376872 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -810,6 +810,8 @@ static int parse_config_file(void) { { "Manager", "DefaultMemoryPressureWatch", config_parse_pressure_watch, 0, &arg_defaults.pressure_watch[PRESSURE_MEMORY] }, { "Manager", "DefaultCPUPressureThresholdSec", config_parse_sec, 0, &arg_defaults.pressure_threshold_usec[PRESSURE_CPU] }, { "Manager", "DefaultCPUPressureWatch", config_parse_pressure_watch, 0, &arg_defaults.pressure_watch[PRESSURE_CPU] }, + { "Manager", "DefaultIOPressureThresholdSec", config_parse_sec, 0, &arg_defaults.pressure_threshold_usec[PRESSURE_IO] }, + { "Manager", "DefaultIOPressureWatch", config_parse_pressure_watch, 0, &arg_defaults.pressure_watch[PRESSURE_IO] }, { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, arg_runtime_scope, &arg_cad_burst_action }, { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_defaults.oom_policy }, { "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL }, diff --git a/src/core/manager.c b/src/core/manager.c index 0b158fc285aae..e7f807ebb3280 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -621,6 +621,8 @@ static char** 
sanitize_environment(char **l) { "CREDENTIALS_DIRECTORY", "EXIT_CODE", "EXIT_STATUS", + "IO_PRESSURE_WATCH", + "IO_PRESSURE_WRITE", "INVOCATION_ID", "JOURNAL_STREAM", "LISTEN_FDNAMES", @@ -807,6 +809,7 @@ static const struct { } pressure_dispatch_table[_PRESSURE_RESOURCE_MAX] = { [PRESSURE_MEMORY] = { sd_event_add_memory_pressure, sd_event_source_set_memory_pressure_period }, [PRESSURE_CPU] = { sd_event_add_cpu_pressure, sd_event_source_set_cpu_pressure_period }, + [PRESSURE_IO] = { sd_event_add_io_pressure, sd_event_source_set_io_pressure_period }, }; int manager_setup_pressure_event_source(Manager *m, PressureResource t) { @@ -5207,8 +5210,8 @@ void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope) { .tasks_max = DEFAULT_TASKS_MAX, .timer_accuracy_usec = 1 * USEC_PER_MINUTE, - .pressure_watch = { CGROUP_PRESSURE_WATCH_AUTO, CGROUP_PRESSURE_WATCH_AUTO }, - .pressure_threshold_usec = { PRESSURE_DEFAULT_THRESHOLD_USEC, PRESSURE_DEFAULT_THRESHOLD_USEC }, + .pressure_watch = { CGROUP_PRESSURE_WATCH_AUTO, CGROUP_PRESSURE_WATCH_AUTO, CGROUP_PRESSURE_WATCH_AUTO }, + .pressure_threshold_usec = { PRESSURE_DEFAULT_THRESHOLD_USEC, PRESSURE_DEFAULT_THRESHOLD_USEC, PRESSURE_DEFAULT_THRESHOLD_USEC }, .oom_policy = OOM_STOP, .oom_score_adjust_set = false, diff --git a/src/core/system.conf.in b/src/core/system.conf.in index 4507d02905cb0..b8a8510fb4741 100644 --- a/src/core/system.conf.in +++ b/src/core/system.conf.in @@ -79,6 +79,8 @@ #DefaultMemoryPressureWatch=auto #DefaultCPUPressureThresholdSec=200ms #DefaultCPUPressureWatch=auto +#DefaultIOPressureThresholdSec=200ms +#DefaultIOPressureWatch=auto #DefaultOOMPolicy=stop #DefaultSmackProcessLabel= #DefaultRestrictSUIDSGID= diff --git a/src/core/user.conf.in b/src/core/user.conf.in index fe45c00b74e4c..33c6733268c08 100644 --- a/src/core/user.conf.in +++ b/src/core/user.conf.in @@ -56,6 +56,8 @@ #DefaultMemoryPressureWatch=auto #DefaultCPUPressureThresholdSec=200ms #DefaultCPUPressureWatch=auto 
+#DefaultIOPressureThresholdSec=200ms +#DefaultIOPressureWatch=auto #DefaultSmackProcessLabel= #DefaultRestrictSUIDSGID= #ReloadLimitIntervalSec= diff --git a/src/core/varlink-cgroup.c b/src/core/varlink-cgroup.c index 1e4a8f8682dd2..048772ab38cae 100644 --- a/src/core/varlink-cgroup.c +++ b/src/core/varlink-cgroup.c @@ -327,6 +327,8 @@ int unit_cgroup_context_build_json(sd_json_variant **ret, const char *name, void JSON_BUILD_PAIR_FINITE_USEC("MemoryPressureThresholdUSec", c->pressure_threshold_usec[PRESSURE_MEMORY]), SD_JSON_BUILD_PAIR_STRING("CPUPressureWatch", cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_CPU])), JSON_BUILD_PAIR_FINITE_USEC("CPUPressureThresholdUSec", c->pressure_threshold_usec[PRESSURE_CPU]), + SD_JSON_BUILD_PAIR_STRING("IOPressureWatch", cgroup_pressure_watch_to_string(c->pressure_watch[PRESSURE_IO])), + JSON_BUILD_PAIR_FINITE_USEC("IOPressureThresholdUSec", c->pressure_threshold_usec[PRESSURE_IO]), /* Others */ SD_JSON_BUILD_PAIR_BOOLEAN("CoredumpReceive", c->coredump_receive)); diff --git a/src/core/varlink-manager.c b/src/core/varlink-manager.c index 63d61a56fc8c3..b56cd9018d0fc 100644 --- a/src/core/varlink-manager.c +++ b/src/core/varlink-manager.c @@ -110,6 +110,8 @@ static int manager_context_build_json(sd_json_variant **ret, const char *name, v SD_JSON_BUILD_PAIR_STRING("DefaultMemoryPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure_watch[PRESSURE_MEMORY])), JSON_BUILD_PAIR_FINITE_USEC("DefaultCPUPressureThresholdUSec", m->defaults.pressure_threshold_usec[PRESSURE_CPU]), SD_JSON_BUILD_PAIR_STRING("DefaultCPUPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure_watch[PRESSURE_CPU])), + JSON_BUILD_PAIR_FINITE_USEC("DefaultIOPressureThresholdUSec", m->defaults.pressure_threshold_usec[PRESSURE_IO]), + SD_JSON_BUILD_PAIR_STRING("DefaultIOPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure_watch[PRESSURE_IO])), JSON_BUILD_PAIR_FINITE_USEC("RuntimeWatchdogUSec", 
manager_get_watchdog(m, WATCHDOG_RUNTIME)), JSON_BUILD_PAIR_FINITE_USEC("RebootWatchdogUSec", manager_get_watchdog(m, WATCHDOG_REBOOT)), JSON_BUILD_PAIR_FINITE_USEC("KExecWatchdogUSec", manager_get_watchdog(m, WATCHDOG_KEXEC)), diff --git a/src/libsystemd/libsystemd.sym b/src/libsystemd/libsystemd.sym index 244040c77e086..1071072a5b65a 100644 --- a/src/libsystemd/libsystemd.sym +++ b/src/libsystemd/libsystemd.sym @@ -1096,4 +1096,7 @@ global: sd_event_add_cpu_pressure; sd_event_source_set_cpu_pressure_type; sd_event_source_set_cpu_pressure_period; + sd_event_add_io_pressure; + sd_event_source_set_io_pressure_type; + sd_event_source_set_io_pressure_period; } LIBSYSTEMD_260; diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h index c7d5ba166da31..8487c966ab409 100644 --- a/src/libsystemd/sd-event/event-source.h +++ b/src/libsystemd/sd-event/event-source.h @@ -27,6 +27,7 @@ typedef enum EventSourceType { SOURCE_INOTIFY, SOURCE_MEMORY_PRESSURE, SOURCE_CPU_PRESSURE, + SOURCE_IO_PRESSURE, _SOURCE_EVENT_SOURCE_TYPE_MAX, _SOURCE_EVENT_SOURCE_TYPE_INVALID = -EINVAL, } EventSourceType; diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index c43f0f9e716e8..8ed49797e3a86 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -77,6 +77,7 @@ static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] [SOURCE_INOTIFY] = "inotify", [SOURCE_MEMORY_PRESSURE] = "memory-pressure", [SOURCE_CPU_PRESSURE] = "cpu-pressure", + [SOURCE_IO_PRESSURE] = "io-pressure", }; DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int); @@ -101,7 +102,8 @@ DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int); SOURCE_DEFER, \ SOURCE_INOTIFY, \ SOURCE_MEMORY_PRESSURE, \ - SOURCE_CPU_PRESSURE) + SOURCE_CPU_PRESSURE, \ + SOURCE_IO_PRESSURE) /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put(). 
* Time sources and ratelimited sources can be passed, so effectively this is the same as the @@ -566,7 +568,7 @@ static int source_child_pidfd_register(sd_event_source *s, int enabled) { return 0; } -#define EVENT_SOURCE_IS_PRESSURE(s) IN_SET((s)->type, SOURCE_MEMORY_PRESSURE, SOURCE_CPU_PRESSURE) +#define EVENT_SOURCE_IS_PRESSURE(s) IN_SET((s)->type, SOURCE_MEMORY_PRESSURE, SOURCE_CPU_PRESSURE, SOURCE_IO_PRESSURE) static void source_pressure_unregister(sd_event_source *s) { assert(s); @@ -1052,6 +1054,7 @@ static void source_disconnect(sd_event_source *s) { case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: source_pressure_remove_from_write_list(s); source_pressure_unregister(s); break; @@ -1198,6 +1201,7 @@ static sd_event_source* source_new(sd_event *e, bool floating, EventSourceType t [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify), [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, pressure), [SOURCE_CPU_PRESSURE] = endoffsetof_field(sd_event_source, pressure), + [SOURCE_IO_PRESSURE] = endoffsetof_field(sd_event_source, pressure), }; sd_event_source *s; @@ -2111,8 +2115,8 @@ static int event_add_pressure( * fd with the epoll right-away. Instead, we just add the event source to a list of pressure event * sources on which writes must be executed before the first event loop iteration is executed. (We * could also write the data here, right away, but we want to give the caller the freedom to call - * sd_event_source_set_{memory,cpu}_pressure_type() and - * sd_event_source_set_{memory,cpu}_pressure_period() before we write it. */ + * sd_event_source_set_{memory,cpu,io}_pressure_type() and + * sd_event_source_set_{memory,cpu,io}_pressure_period() before we write it. 
*/ if (s->pressure.write_buffer_size > 0) source_pressure_add_to_write_list(s); @@ -2161,6 +2165,25 @@ _public_ int sd_event_add_cpu_pressure( PRESSURE_CPU); } +static int io_pressure_callback(sd_event_source *s, void *userdata) { + assert(s); + + return 0; +} + +_public_ int sd_event_add_io_pressure( + sd_event *e, + sd_event_source **ret, + sd_event_handler_t callback, + void *userdata) { + + return event_add_pressure( + e, ret, callback, userdata, + SOURCE_IO_PRESSURE, + io_pressure_callback, + PRESSURE_IO); +} + static void event_free_inotify_data(sd_event *e, InotifyData *d) { assert(e); @@ -2963,6 +2986,7 @@ static int event_source_offline( case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: source_pressure_unregister(s); break; @@ -3055,6 +3079,7 @@ static int event_source_online( case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: /* As documented in sd_event_add_{memory,cpu,io}_pressure(), we can only register the PSI fd * with epoll after writing the watch string. 
*/ if (s->pressure.write_buffer_size == 0) { @@ -4306,6 +4331,7 @@ static int source_dispatch(sd_event_source *s) { case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: r = s->pressure.callback(s, s->userdata); break; @@ -4721,6 +4747,7 @@ static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: r = process_pressure(s, i->events); break; @@ -5415,6 +5442,13 @@ _public_ int sd_event_source_set_cpu_pressure_type(sd_event_source *s, const cha return event_source_set_pressure_type(s, ty); } +_public_ int sd_event_source_set_io_pressure_type(sd_event_source *s, const char *ty) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO_PRESSURE, -EDOM); + + return event_source_set_pressure_type(s, ty); +} + static int event_source_set_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { _cleanup_free_ char *b = NULL; _cleanup_free_ void *w = NULL; @@ -5475,3 +5509,10 @@ _public_ int sd_event_source_set_cpu_pressure_period(sd_event_source *s, uint64_ return event_source_set_pressure_period(s, threshold_usec, window_usec); } + +_public_ int sd_event_source_set_io_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO_PRESSURE, -EDOM); + + return event_source_set_pressure_period(s, threshold_usec, window_usec); +} diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 9c732543fac7d..1a6bc7370f81e 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -2384,6 +2384,7 @@ static const BusProperty cgroup_properties[] = { { "ManagedOOMPreference", bus_append_string }, { "MemoryPressureWatch", bus_append_string }, { "CPUPressureWatch", bus_append_string }, + { "IOPressureWatch", bus_append_string }, { "DelegateSubgroup", bus_append_string }, { 
"ManagedOOMMemoryPressureLimit", bus_append_parse_permyriad }, { "MemoryAccounting", bus_append_parse_boolean }, @@ -2423,6 +2424,7 @@ static const BusProperty cgroup_properties[] = { { "SocketBindDeny", bus_append_socket_filter }, { "MemoryPressureThresholdSec", bus_append_parse_sec_rename }, { "CPUPressureThresholdSec", bus_append_parse_sec_rename }, + { "IOPressureThresholdSec", bus_append_parse_sec_rename }, { "NFTSet", bus_append_nft_set }, { "BindNetworkInterface", bus_append_string }, diff --git a/src/shared/varlink-io.systemd.Manager.c b/src/shared/varlink-io.systemd.Manager.c index dc49788fd9067..ff79f446a059c 100644 --- a/src/shared/varlink-io.systemd.Manager.c +++ b/src/shared/varlink-io.systemd.Manager.c @@ -68,6 +68,10 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(DefaultCPUPressureThresholdUSec, SD_VARLINK_INT, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultCPUPressureWatch="), SD_VARLINK_DEFINE_FIELD(DefaultCPUPressureWatch, SD_VARLINK_STRING, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultIOPressureThresholdUSec="), + SD_VARLINK_DEFINE_FIELD(DefaultIOPressureThresholdUSec, SD_VARLINK_INT, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultIOPressureWatch="), + SD_VARLINK_DEFINE_FIELD(DefaultIOPressureWatch, SD_VARLINK_STRING, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#RuntimeWatchdogSec="), SD_VARLINK_DEFINE_FIELD(RuntimeWatchdogUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#RebootWatchdogSec="), diff --git a/src/shared/varlink-io.systemd.Unit.c 
b/src/shared/varlink-io.systemd.Unit.c index a230f29daba8b..c1ff4ebc5a76c 100644 --- a/src/shared/varlink-io.systemd.Unit.c +++ b/src/shared/varlink-io.systemd.Unit.c @@ -232,6 +232,10 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(CPUPressureWatch, SD_VARLINK_STRING, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#CPUPressureThresholdSec="), SD_VARLINK_DEFINE_FIELD(CPUPressureThresholdUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#IOPressureWatch="), + SD_VARLINK_DEFINE_FIELD(IOPressureWatch, SD_VARLINK_STRING, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#IOPressureThresholdSec="), + SD_VARLINK_DEFINE_FIELD(IOPressureThresholdUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE), /* Others */ SD_VARLINK_FIELD_COMMENT("Reflects whether to forward coredumps for processes that crash within this cgroup"), diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h index 71fc9504889e6..34bd396080dc3 100644 --- a/src/systemd/sd-event.h +++ b/src/systemd/sd-event.h @@ -98,6 +98,7 @@ int sd_event_add_post(sd_event *e, sd_event_source **ret, sd_event_handler_t cal int sd_event_add_exit(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); int sd_event_add_memory_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); int sd_event_add_cpu_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); +int sd_event_add_io_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); int sd_event_prepare(sd_event *e); int sd_event_wait(sd_event *e, uint64_t timeout); @@ -165,6 +166,8 @@ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const 
char *ty) int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec); int sd_event_source_set_cpu_pressure_type(sd_event_source *s, const char *ty); int sd_event_source_set_cpu_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec); +int sd_event_source_set_io_pressure_type(sd_event_source *s, const char *ty); +int sd_event_source_set_io_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec); int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback); int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret); int sd_event_source_get_floating(sd_event_source *s); diff --git a/src/test/test-pressure.c b/src/test/test-pressure.c index 86fc070618155..ec967a0e83e56 100644 --- a/src/test/test-pressure.c +++ b/src/test/test-pressure.c @@ -159,6 +159,14 @@ TEST(fake_cpu_pressure) { test_fake_pressure("cpu", fake_cpu_pressure_wrapper); } +static int fake_io_pressure_wrapper(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata) { + return sd_event_add_io_pressure(e, ret, callback, userdata); +} + +TEST(fake_io_pressure) { + test_fake_pressure("io", fake_io_pressure_wrapper); +} + /* Shared infrastructure for real pressure tests */ struct real_pressure_context { @@ -457,7 +465,127 @@ TEST(real_cpu_pressure) { ASSERT_EQ(ex, 31); } +/* IO pressure real test */ + +static int real_io_pressure_callback(sd_event_source *s, void *userdata) { + struct real_pressure_context *c = ASSERT_PTR(userdata); + const char *d; + + ASSERT_NOT_NULL(s); + ASSERT_OK(sd_event_source_get_description(s, &d)); + + log_notice("real io pressure event: %s", d); + + ASSERT_NOT_NULL(c->pid); + ASSERT_OK(sd_event_source_send_child_signal(c->pid, SIGKILL, NULL, 0)); + c->pid = NULL; + + return 0; +} + +_noreturn_ static void real_pressure_eat_io(int pipe_fd) { + char x; + ASSERT_EQ(read(pipe_fd, &x, 1), 1); /* Wait 
for the GO! */ + + /* Write and fsync in a loop to generate IO pressure */ + for (;;) { + _cleanup_close_ int fd = -EBADF; + + fd = open("/var/tmp/.io-pressure-test", O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, 0600); + if (fd < 0) + continue; + + char buf[4096]; + memset(buf, 'x', sizeof(buf)); + for (int i = 0; i < 256; i++) + if (write(fd, buf, sizeof(buf)) < 0) + break; + (void) fsync(fd); + } +} + +TEST(real_io_pressure) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *cs = NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_pair_ int pipe_fd[2] = EBADF_PAIR; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_free_ char *scope = NULL; + const char *object; + int r; + + if (getuid() == 0) + r = sd_bus_open_system(&bus); + else + r = sd_bus_open_user(&bus); + if (r < 0) + return (void) log_tests_skipped_errno(r, "can't connect to bus"); + + ASSERT_OK(bus_wait_for_jobs_new(bus, &w)); + + ASSERT_OK(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit")); + ASSERT_OK(asprintf(&scope, "test-%" PRIu64 ".scope", random_u64())); + ASSERT_OK(sd_bus_message_append(m, "ss", scope, "fail")); + ASSERT_OK(sd_bus_message_open_container(m, 'a', "(sv)")); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, 0)); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "IOAccounting", "b", true)); + ASSERT_OK(sd_bus_message_close_container(m)); + ASSERT_OK(sd_bus_message_append(m, "a(sa(sv))", 0)); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return (void) log_tests_skipped_errno(r, "can't issue transient unit call"); + + ASSERT_OK(sd_bus_message_read(reply, "o", &object)); + + ASSERT_OK(bus_wait_for_jobs_one(w, object, /* flags= */ BUS_WAIT_JOBS_LOG_ERROR, /* extra_args= */ NULL)); + + 
ASSERT_OK(sd_event_default(&e)); + + ASSERT_OK_ERRNO(pipe2(pipe_fd, O_CLOEXEC)); + + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + r = pidref_safe_fork("(eat-io)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM, &pidref); + ASSERT_OK(r); + if (r == 0) { + real_pressure_eat_io(pipe_fd[0]); + _exit(EXIT_SUCCESS); + } + + ASSERT_OK(event_add_child_pidref(e, &cs, &pidref, WEXITED, real_pressure_child_callback, NULL)); + ASSERT_OK(sd_event_source_set_child_process_own(cs, true)); + + ASSERT_OK_ERRNO(unsetenv("IO_PRESSURE_WATCH")); + ASSERT_OK_ERRNO(unsetenv("IO_PRESSURE_WRITE")); + + struct real_pressure_context context = { + .pid = cs, + }; + + r = sd_event_add_io_pressure(e, &es, real_io_pressure_callback, &context); + if (r < 0) + return (void) log_tests_skipped_errno(r, "can't allocate io pressure fd"); + + ASSERT_OK(sd_event_source_set_description(es, "real pressure event source")); + ASSERT_OK_ZERO(sd_event_source_set_io_pressure_type(es, "some")); + /* Unprivileged writes require a minimum of 2s otherwise the kernel will refuse the write. */ + ASSERT_OK_POSITIVE(sd_event_source_set_io_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC)); + ASSERT_OK_ZERO(sd_event_source_set_io_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC)); + ASSERT_OK(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT)); + + /* Now start eating IO */ + ASSERT_EQ(write(pipe_fd[1], &(const char) { 'x' }, 1), 1); + + ASSERT_OK(sd_event_loop(e)); + int ex = 0; + ASSERT_OK(sd_event_get_exit_code(e, &ex)); + ASSERT_EQ(ex, 31); +} + static int outro(void) { + (void) unlink("/var/tmp/.io-pressure-test"); hashmap_trim_pools(); return 0; } diff --git a/test/units/TEST-79-PRESSURE.sh b/test/units/TEST-79-PRESSURE.sh index d4e4a9e06b5b4..72de8a1d9d189 100755 --- a/test/units/TEST-79-PRESSURE.sh +++ b/test/units/TEST-79-PRESSURE.sh @@ -114,4 +114,57 @@ systemd-run \ rm "$SCRIPT" +# Now test IO pressure + +if ! cat /proc/pressure/io >/dev/null ; then + echo "kernel has no IO PSI support." 
>&2 + echo OK >/testok + exit 0 +fi + +if ! test -f "$CGROUP"/io.pressure ; then + echo "No IO accounting/PSI delegated via cgroup, can't test." >&2 + echo OK >/testok + exit 0 +fi + +UNIT="test-iopress-$RANDOM.service" +SCRIPT="/tmp/iopress-$RANDOM.sh" + +cat >"$SCRIPT" <<'EOF' +#!/usr/bin/env bash + +set -ex + +export +id + +test -n "$IO_PRESSURE_WATCH" +test "$IO_PRESSURE_WATCH" != /dev/null +test -w "$IO_PRESSURE_WATCH" + +ls -al "$IO_PRESSURE_WATCH" + +EXPECTED="$(echo -n -e "some 123000 2000000\x00" | base64)" + +test "$EXPECTED" = "$IO_PRESSURE_WRITE" + +EOF + +chmod +x "$SCRIPT" + +systemd-run \ + -u "$UNIT" \ + -p Type=exec \ + -p ProtectControlGroups=1 \ + -p DynamicUser=1 \ + -p IOPressureWatch=on \ + -p IOPressureThresholdSec=123ms \ + -p BindPaths=$SCRIPT \ + `# Make sanitizers happy when DynamicUser=1 pulls in instrumented systemd NSS modules` \ + -p EnvironmentFile=-/usr/lib/systemd/systemd-asan-env \ + --wait "$SCRIPT" + +rm "$SCRIPT" + touch /testok From 7ec45e5e3b2cf468278bcaccee2c67cc0723c369 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Sun, 8 Mar 2026 00:33:09 +0100 Subject: [PATCH 6/7] docs: Update MEMORY_PRESSURE.md => PRESSURE.md Make the doc more generic and mention all pressure types, not just memory. --- docs/MEMORY_PRESSURE.md | 235 +----------------------- docs/PRESSURE.md | 255 +++++++++++++++++++++++++++ man/oomd.conf.xml | 2 +- man/sd_event_add_memory_pressure.xml | 2 +- man/systemd.exec.xml | 2 +- man/systemd.resource-control.xml | 2 +- 6 files changed, 260 insertions(+), 238 deletions(-) create mode 100644 docs/PRESSURE.md diff --git a/docs/MEMORY_PRESSURE.md b/docs/MEMORY_PRESSURE.md index 3d3832cac7ea2..1cbd6f18fcbf8 100644 --- a/docs/MEMORY_PRESSURE.md +++ b/docs/MEMORY_PRESSURE.md @@ -5,237 +5,4 @@ layout: default SPDX-License-Identifier: LGPL-2.1-or-later --- -# Memory Pressure Handling in systemd - -When the system is under memory pressure (i.e. 
some component of the OS -requires memory allocation but there is only very little or none available), -it can attempt various things to make more memory available again ("reclaim"): - -* The kernel can flush out memory pages backed by files on disk, under the - knowledge that it can reread them from disk when needed again. Candidate - pages are the many memory mapped executable files and shared libraries on - disk, among others. - -* The kernel can flush out memory pages not backed by files on disk - ("anonymous" memory, i.e. memory allocated via `malloc()` and similar calls, - or `tmpfs` file system contents) if there's swap to write it to. - -* Userspace can proactively release memory it allocated but doesn't immediately - require back to the kernel. This includes allocation caches, and other forms - of caches that are not required for normal operation to continue. - -The latter is what we want to focus on in this document: how to ensure -userspace process can detect mounting memory pressure early and release memory -back to the kernel as it happens, relieving the memory pressure before it -becomes too critical. - -The effects of memory pressure during runtime generally are growing latencies -during operation: when a program requires memory but the system is busy writing -out memory to (relatively slow) disks in order make some available, this -generally surfaces in scheduling latencies, and applications and services will -slow down until memory pressure is relieved. Hence, to ensure stable service -latencies it is essential to release unneeded memory back to the kernel early -on. - -On Linux the [Pressure Stall Information -(PSI)](https://docs.kernel.org/accounting/psi.html) Linux kernel interface is -the primary way to determine the system or a part of it is under memory -pressure. 
PSI makes available to userspace a `poll()`-able file descriptor that -gets notifications whenever memory pressure latencies for the system or a -control group grow beyond some level. - -`systemd` itself makes use of PSI, and helps applications to do so too. -Specifically: - -* Most of systemd's long running components watch for PSI memory pressure - events, and release allocation caches and other resources once seen. - -* systemd's service manager provides a protocol for asking services to monitor - PSI events and configure the appropriate pressure thresholds. - -* systemd's `sd-event` event loop API provides a high-level call - `sd_event_add_memory_pressure()` enabling programs using it to efficiently - hook into the PSI memory pressure protocol provided by the service manager, - with very few lines of code. - -## Memory Pressure Service Protocol - -If memory pressure handling for a specific service is enabled via -`MemoryPressureWatch=` the memory pressure service protocol is used to tell the -service code about this. Specifically two environment variables are set by the -service manager, and typically consumed by the service: - -* The `$MEMORY_PRESSURE_WATCH` environment variable will contain an absolute - path in the file system to the file to watch for memory pressure events. This - will usually point to a PSI file such as the `memory.pressure` file of the - service's cgroup. In order to make debugging easier, and allow later - extension it is recommended for applications to also allow this path to refer - to an `AF_UNIX` stream socket in the file system or a FIFO inode in the file - system. Regardless of which of the three types of inodes this absolute path - refers to, all three are `poll()`-able for memory pressure events. The - variable can also be set to the literal string `/dev/null`. If so the service - code should take this as indication that memory pressure monitoring is not - desired and should be turned off. 
- -* The `$MEMORY_PRESSURE_WRITE` environment variable is optional. If set by the - service manager it contains Base64 encoded data (that may contain arbitrary - binary values, including NUL bytes) that should be written into the path - provided via `$MEMORY_PRESSURE_WATCH` right after opening it. Typically, if - talking directly to a PSI kernel file this will contain information about the - threshold settings configurable in the service manager. - -When a service initializes it hence should look for -`$MEMORY_PRESSURE_WATCH`. If set, it should try to open the specified path. If -it detects the path to refer to a regular file it should assume it refers to a -PSI kernel file. If so, it should write the data from `$MEMORY_PRESSURE_WRITE` -into the file descriptor (after Base64-decoding it, and only if the variable is -set) and then watch for `POLLPRI` events on it. If it detects the paths refers -to a FIFO inode, it should open it, write the `$MEMORY_PRESSURE_WRITE` data -into it (as above) and then watch for `POLLIN` events on it. Whenever `POLLIN` -is seen it should read and discard any data queued in the FIFO. If the path -refers to an `AF_UNIX` socket in the file system, the application should -`connect()` a stream socket to it, write `$MEMORY_PRESSURE_WRITE` into it (as -above) and watch for `POLLIN`, discarding any data it might receive. - -To summarize: - -* If `$MEMORY_PRESSURE_WATCH` points to a regular file: open and watch for - `POLLPRI`, never read from the file descriptor. - -* If `$MEMORY_PRESSURE_WATCH` points to a FIFO: open and watch for `POLLIN`, - read/discard any incoming data. - -* If `$MEMORY_PRESSURE_WATCH` points to an `AF_UNIX` socket: connect and watch - for `POLLIN`, read/discard any incoming data. - -* If `$MEMORY_PRESSURE_WATCH` contains the literal string `/dev/null`, turn off - memory pressure handling. - -(And in each case, immediately after opening/connecting to the path, write the -decoded `$MEMORY_PRESSURE_WRITE` data into it.) 
- -Whenever a `POLLPRI`/`POLLIN` event is seen the service is under memory -pressure. It should use this as hint to release suitable redundant resources, -for example: - -* glibc's memory allocation cache, via - [`malloc_trim()`](https://man7.org/linux/man-pages/man3/malloc_trim.3.html). Similar, - allocation caches implemented in the service itself. - -* Any other local caches, such DNS caches, or web caches (in particular if - service is a web browser). - -* Terminate any idle worker threads or processes. - -* Run a garbage collection (GC) cycle, if the runtime environment supports it. - -* Terminate the process if idle, and can be automatically started when - needed next. - -Which actions precisely to take depends on the service in question. Note that -the notifications are delivered when memory allocation latency already degraded -beyond some point. Hence when discussing which resources to keep and which to -discard, keep in mind it's typically acceptable that latencies incurred -recovering discarded resources at a later point are acceptable, given that -latencies *already* are affected negatively. - -In case the path supplied via `$MEMORY_PRESSURE_WATCH` points to a PSI kernel -API file, or to an `AF_UNIX` opening it multiple times is safe and reliable, -and should deliver notifications to each of the opened file descriptors. This -is specifically useful for services that consist of multiple processes, and -where each of them shall be able to release resources on memory pressure. - -The `POLLPRI`/`POLLIN` conditions will be triggered every time memory pressure -is detected, but not continuously. It is thus safe to keep `poll()`-ing on the -same file descriptor continuously, and executing resource release operations -whenever the file descriptor triggers without having to expect overloading the -process. 
- -(Currently, the protocol defined here only allows configuration of a single -"degree" of memory pressure, there's no distinction made on how strong the -pressure is. In future, if it becomes apparent that there's clear need to -extend this we might eventually add different degrees, most likely by adding -additional environment variables such as `$MEMORY_PRESSURE_WRITE_LOW` and -`$MEMORY_PRESSURE_WRITE_HIGH` or similar, which may contain different settings -for lower or higher memory pressure thresholds.) - -## Service Manager Settings - -The service manager provides two per-service settings that control the memory -pressure handling: - -* The - [`MemoryPressureWatch=`](https://www.freedesktop.org/software/systemd/man/latest/systemd.resource-control.html#MemoryPressureWatch=) - setting controls whether to enable the memory pressure protocol for the - service in question. - -* The `MemoryPressureThresholdSec=` setting allows configuring the threshold - when to signal memory pressure to the services. It takes a time value - (usually in the millisecond range) that defines a threshold per 1s time - window: if memory allocation latencies grow beyond this threshold - notifications are generated towards the service, requesting it to release - resources. - -The `/etc/systemd/system.conf` file provides two settings that may be used to -select the default values for the above settings. If the threshold isn't -configured via the per-service nor system-wide option, it defaults to 100ms. - -When memory pressure monitoring is enabled for a service via -`MemoryPressureWatch=` this primarily does three things: - -* It enables cgroup memory accounting for the service (this is a requirement - for per-cgroup PSI) - -* It sets the aforementioned two environment variables for processes invoked - for the service, based on the control group of the service and provided - settings. 
- -* The `memory.pressure` PSI control group file associated with the service's - cgroup is delegated to the service (i.e. permissions are relaxed so that - unprivileged service payload code can open the file for writing). - -## Memory Pressure Events in `sd-event` - -The -[`sd-event`](https://www.freedesktop.org/software/systemd/man/latest/sd-event.html) -event loop library provides two API calls that encapsulate the -functionality described above: - -* The - [`sd_event_add_memory_pressure()`](https://www.freedesktop.org/software/systemd/man/latest/sd_event_add_memory_pressure.html) - call implements the service-side of the memory pressure protocol and - integrates it with an `sd-event` event loop. It reads the two environment - variables, connects/opens the specified file, writes the specified data to it, - then watches it for events. - -* The `sd_event_trim_memory()` call may be called to trim the calling - processes' memory. It's a wrapper around glibc's `malloc_trim()`, but first - releases allocation caches maintained by libsystemd internally. This function - serves as the default when a NULL callback is supplied to - `sd_event_add_memory_pressure()`. - -When implementing a service using `sd-event`, for automatic memory pressure -handling, it's typically sufficient to add a line such as: - -```c -(void) sd_event_add_memory_pressure(event, NULL, NULL, NULL); -``` - -– right after allocating the event loop object `event`. - -## Other APIs - -Other programming environments might have native APIs to watch memory -pressure/low memory events. Most notable is probably GLib's -[GMemoryMonitor](https://docs.gtk.org/gio/iface.MemoryMonitor.html). As of GLib -2.86.0, it uses the per-cgroup PSI kernel file to monitor for memory pressure, -but does not yet read the environment variables recommended above. 
- -In older versions, it used the per-system Linux PSI interface as the backend, but operated -differently than the above: memory pressure events were picked up by a system -service, which then propagated this through D-Bus to the applications. This was -typically less than ideal, since this means each notification event had to -traverse three processes before being handled. This traversal created -additional latencies at a time where the system is already experiencing adverse -latencies. Moreover, it focused on system-wide PSI events, even though -service-local ones are generally the better approach. +This page has been renamed to [Resource Pressure Handling](PRESSURE). diff --git a/docs/PRESSURE.md b/docs/PRESSURE.md new file mode 100644 index 0000000000000..d0b7ce8a5ed9b --- /dev/null +++ b/docs/PRESSURE.md @@ -0,0 +1,255 @@ +--- +title: Resource Pressure Handling +category: Interfaces +layout: default +SPDX-License-Identifier: LGPL-2.1-or-later +--- + +# Resource Pressure Handling in systemd + +On Linux the [Pressure Stall Information +(PSI)](https://docs.kernel.org/accounting/psi.html) Linux kernel interface +provides a way to monitor resource pressure — situations where tasks are +stalled waiting for a resource to become available. PSI covers three types of +resources: + +* **Memory pressure**: tasks are stalled because the system is low on memory + and the kernel is busy reclaiming it (e.g. writing out pages to swap or + flushing file-backed pages). + +* **CPU pressure**: tasks are stalled waiting for CPU time because the CPU is + oversubscribed. + +* **IO pressure**: tasks are stalled waiting for IO operations to complete + because the IO subsystem is saturated. + +PSI makes available to userspace a `poll()`-able file descriptor that gets +notifications whenever pressure latencies for the system or a control group +grow beyond some configured level. 
+ +When the system is under memory pressure, userspace can proactively release +memory it allocated but doesn't immediately require back to the kernel. This +includes allocation caches, and other forms of caches that are not required for +normal operation to continue. Similarly, when CPU or IO pressure is detected, +services can take appropriate action such as reducing parallelism, deferring +background work, or shedding load. + +The effects of resource pressure during runtime generally are growing latencies +during operation: applications and services slow down until pressure is +relieved. Hence, to ensure stable service latencies it is essential to detect +pressure early and respond appropriately. + +`systemd` itself makes use of PSI, and helps applications to do so too. +Specifically: + +* Most of systemd's long running components watch for PSI memory pressure + events, and release allocation caches and other resources once seen. + +* systemd's service manager provides a protocol for asking services to monitor + PSI events and configure the appropriate pressure thresholds, for memory, CPU, + and IO pressure independently. + +* systemd's `sd-event` event loop API provides high-level calls + `sd_event_add_memory_pressure()`, `sd_event_add_cpu_pressure()`, and + `sd_event_add_io_pressure()` enabling programs using it to efficiently hook + into the PSI pressure protocol provided by the service manager, with very few + lines of code. + +## Pressure Service Protocol + +For each resource type, if pressure handling for a specific service is enabled +via the corresponding `*PressureWatch=` setting (i.e. `MemoryPressureWatch=`, +`CPUPressureWatch=`, or `IOPressureWatch=`), two environment variables are set +by the service manager: + +* `$MEMORY_PRESSURE_WATCH` / `$CPU_PRESSURE_WATCH` / `$IO_PRESSURE_WATCH` — + contains an absolute path in the file system to the file to watch for + pressure events. 
This will usually point to a PSI file such as the + `memory.pressure`, `cpu.pressure`, or `io.pressure` file of the service's + cgroup. In order to make debugging easier, and allow later extension, it is + recommended for applications to also allow this path to refer to an `AF_UNIX` + stream socket in the file system or a FIFO inode in the file system. + Regardless of which of the three types of inodes this absolute path refers + to, all three are `poll()`-able for pressure events. The variable can also be + set to the literal string `/dev/null`. If so the service code should take this + as an indication that pressure monitoring for this resource is not desired and + should be turned off. + +* `$MEMORY_PRESSURE_WRITE` / `$CPU_PRESSURE_WRITE` / `$IO_PRESSURE_WRITE` — + optional. If set by the service manager it contains Base64-encoded data (that + may contain arbitrary binary values, including NUL bytes) that should be + written into the path provided via the corresponding `*_PRESSURE_WATCH` + variable right after opening it. Typically, if talking directly to a PSI + kernel file this will contain information about the threshold settings + configurable in the service manager. + +The protocol works the same for all three resource types. The remainder of this +section uses memory pressure as the example, but the same logic applies to CPU +and IO pressure with the corresponding environment variable names. + +When a service initializes it hence should look for +`$MEMORY_PRESSURE_WATCH`. If set, it should try to open the specified path. If +it detects the path to refer to a regular file it should assume it refers to a +PSI kernel file. If so, it should write the data from `$MEMORY_PRESSURE_WRITE` +into the file descriptor (after Base64-decoding it, and only if the variable is +set) and then watch for `POLLPRI` events on it. 
If it detects the path refers +to a FIFO inode, it should open it, write the `$MEMORY_PRESSURE_WRITE` data +into it (as above) and then watch for `POLLIN` events on it. Whenever `POLLIN` +is seen it should read and discard any data queued in the FIFO. If the path +refers to an `AF_UNIX` socket in the file system, the application should +`connect()` a stream socket to it, write `$MEMORY_PRESSURE_WRITE` into it (as +above) and watch for `POLLIN`, discarding any data it might receive. + +To summarize: + +* If `$MEMORY_PRESSURE_WATCH` points to a regular file: open and watch for + `POLLPRI`, never read from the file descriptor. + +* If `$MEMORY_PRESSURE_WATCH` points to a FIFO: open and watch for `POLLIN`, + read/discard any incoming data. + +* If `$MEMORY_PRESSURE_WATCH` points to an `AF_UNIX` socket: connect and watch + for `POLLIN`, read/discard any incoming data. + +* If `$MEMORY_PRESSURE_WATCH` contains the literal string `/dev/null`, turn off + memory pressure handling. + +(And in each case, immediately after opening/connecting to the path, write the +decoded `$MEMORY_PRESSURE_WRITE` data into it.) + +Whenever a `POLLPRI`/`POLLIN` event is seen the service is under pressure. It +should use this as a hint to release suitable redundant resources, for example: + +* glibc's memory allocation cache, via + [`malloc_trim()`](https://man7.org/linux/man-pages/man3/malloc_trim.3.html). Similarly, + allocation caches implemented in the service itself. + +* Any other local caches, such as DNS caches, or web caches (in particular if + the service is a web browser). + +* Terminate any idle worker threads or processes. + +* Run a garbage collection (GC) cycle, if the runtime environment supports it. + +* Terminate the process if it is idle and can be automatically started when + needed next. + +Which actions precisely to take depends on the service in question and the type +of pressure. Note that the notifications are delivered when resource latency +has already degraded beyond some point. 
Hence when discussing which resources to +keep and which to discard, keep in mind that latencies incurred recovering +discarded resources at a later point are typically acceptable, given +that latencies *already* are affected negatively. + +In case the path supplied via `$MEMORY_PRESSURE_WATCH` points to a PSI kernel +API file, or to an `AF_UNIX` socket, opening it multiple times is safe and reliable, +and should deliver notifications to each of the opened file descriptors. This +is specifically useful for services that consist of multiple processes, and +where each of them shall be able to release resources on memory pressure. + +The `POLLPRI`/`POLLIN` conditions will be triggered every time pressure is +detected, but not continuously. It is thus safe to keep `poll()`-ing on the +same file descriptor continuously, and to execute resource release operations +whenever the file descriptor triggers without having to expect overloading the +process. + +(Currently, the protocol defined here only allows configuration of a single +"degree" of pressure per resource type; there's no distinction made on how +strong the pressure is. In future, if it becomes apparent that there's a clear +need to extend this, we might eventually add different degrees, most likely by +adding additional environment variables such as `$MEMORY_PRESSURE_WRITE_LOW` +and `$MEMORY_PRESSURE_WRITE_HIGH` or similar, which may contain different +settings for lower or higher pressure thresholds.) + +## Service Manager Settings + +The service manager provides two per-service settings for each resource type +that control pressure handling: + +* `MemoryPressureWatch=` / `CPUPressureWatch=` / `IOPressureWatch=` controls + whether to enable the pressure protocol for the respective resource type for + the service in question. See + [`systemd.resource-control(5)`](https://www.freedesktop.org/software/systemd/man/latest/systemd.resource-control.html#MemoryPressureWatch=) + for details. 
+ +* `MemoryPressureThresholdSec=` / `CPUPressureThresholdSec=` / + `IOPressureThresholdSec=` allows configuring the threshold when to signal + pressure to the services. It takes a time value (usually in the millisecond + range) that defines a threshold per 1s time window: if resource latencies grow + beyond this threshold notifications are generated towards the service, + requesting it to release resources. + +The `/etc/systemd/system.conf` file provides two settings for each resource +type that may be used to select the default values for the above settings. If +the threshold isn't configured via the per-service nor system-wide option, it +defaults to 100ms. + +When pressure monitoring is enabled for a service this primarily does three +things: + +* It enables the corresponding cgroup accounting for the service (this is a + requirement for per-cgroup PSI). + +* It sets the aforementioned two environment variables for processes invoked + for the service, based on the control group of the service and provided + settings. + +* The corresponding PSI control group file (`memory.pressure`, `cpu.pressure`, + or `io.pressure`) associated with the service's cgroup is delegated to the + service (i.e. permissions are relaxed so that unprivileged service payload + code can open the file for writing). + +## Pressure Events in `sd-event` + +The +[`sd-event`](https://www.freedesktop.org/software/systemd/man/latest/sd-event.html) +event loop library provides API calls that encapsulate the functionality +described above: + +* [`sd_event_add_memory_pressure()`](https://www.freedesktop.org/software/systemd/man/latest/sd_event_add_memory_pressure.html), + `sd_event_add_cpu_pressure()`, and `sd_event_add_io_pressure()` implement the + service-side of the pressure protocol for each resource type and integrate it + with an `sd-event` event loop. 
Each reads the corresponding two environment + variables, connects/opens the specified file, writes the specified data to it, + then watches it for events. + +* The `sd_event_trim_memory()` function may be called to trim the calling + process's memory. It's a wrapper around glibc's `malloc_trim()`, but first + releases allocation caches maintained by libsystemd internally. This function + serves as the default when a NULL callback is supplied to + `sd_event_add_memory_pressure()`. Note that the default handler for + `sd_event_add_cpu_pressure()` and `sd_event_add_io_pressure()` is a no-op; + a custom callback should be provided for CPU and IO pressure to take + meaningful action. + +When implementing a service using `sd-event`, for automatic memory pressure +handling, it's typically sufficient to add a line such as: + +```c +(void) sd_event_add_memory_pressure(event, NULL, NULL, NULL); +``` + +– right after allocating the event loop object `event`. For CPU and IO pressure, +a custom handler should be provided to take appropriate action: + +```c +(void) sd_event_add_cpu_pressure(event, NULL, my_cpu_pressure_handler, userdata); +(void) sd_event_add_io_pressure(event, NULL, my_io_pressure_handler, userdata); +``` + +## Other APIs + +Other programming environments might have native APIs to watch memory +pressure/low memory events. Most notable is probably GLib's +[GMemoryMonitor](https://docs.gtk.org/gio/iface.MemoryMonitor.html). As of GLib +2.86.0, it uses the per-cgroup PSI kernel file to monitor for memory pressure, +but does not yet read the environment variables recommended above. + +In older versions, it used the per-system Linux PSI interface as the backend, but operated +differently than the above: memory pressure events were picked up by a system +service, which then propagated them through D-Bus to the applications. This was +typically less than ideal, since this meant each notification event had to +traverse three processes before being handled. 
This traversal created +additional latencies at a time where the system is already experiencing adverse +latencies. Moreover, it focused on system-wide PSI events, even though +service-local ones are generally the better approach. diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml index a4be5e1274ff9..f8c3c0a173e15 100644 --- a/man/oomd.conf.xml +++ b/man/oomd.conf.xml @@ -62,7 +62,7 @@ Note that this is a privileged option as, even if it has a timeout, is synchronous and delays the kill, so use with care. The typically preferable mechanism to process memory pressure is to do what - MEMORY_PRESSURE describes which is unprivileged, + Resource Pressure Handling describes which is unprivileged, asynchronous and does not delay the kill. diff --git a/man/sd_event_add_memory_pressure.xml b/man/sd_event_add_memory_pressure.xml index 05f2ff2b74528..e472e620439c9 100644 --- a/man/sd_event_add_memory_pressure.xml +++ b/man/sd_event_add_memory_pressure.xml @@ -244,7 +244,7 @@ LOG_DEBUG level (with message ID f9b0be465ad540d0850ad32172d57c21) about the memory pressure operation. - For further details see Memory Pressure Handling in + For further details see Resource Pressure Handling in systemd. diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 455f666374f99..809cc285fdce1 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -4698,7 +4698,7 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX $MEMORY_PRESSURE_WRITE If memory pressure monitoring is enabled for this service unit, the path to watch - and the data to write into it. See Memory Pressure + and the data to write into it. See Resource Pressure Handling for details about these variables and the service protocol data they convey. 
diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index ed8b0e11c8698..a3a1342c6ec03 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1625,7 +1625,7 @@ DeviceAllow=/dev/loop-control Note that services are free to use the two environment variables, but it is unproblematic if they ignore them. Memory pressure handling must be implemented individually in each service, and usually means different things for different software. For further details on memory pressure - handling see Memory Pressure Handling in + handling see Resource Pressure Handling in systemd. Services implemented using From acb2b963c06a8cc30801b5657a29ae927c4eb799 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 16 Mar 2026 14:20:12 +0100 Subject: [PATCH 7/7] docs: Fix window in PRESSURE.md --- docs/PRESSURE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/PRESSURE.md b/docs/PRESSURE.md index d0b7ce8a5ed9b..a970dec34e450 100644 --- a/docs/PRESSURE.md +++ b/docs/PRESSURE.md @@ -175,14 +175,14 @@ that control pressure handling: * `MemoryPressureThresholdSec=` / `CPUPressureThresholdSec=` / `IOPressureThresholdSec=` allows configuring the threshold when to signal pressure to the services. It takes a time value (usually in the millisecond - range) that defines a threshold per 1s time window: if resource latencies grow + range) that defines a threshold per 2s time window: if resource latencies grow beyond this threshold notifications are generated towards the service, requesting it to release resources. The `/etc/systemd/system.conf` file provides two settings for each resource type that may be used to select the default values for the above settings. If the threshold isn't configured via the per-service nor system-wide option, it -defaults to 100ms. +defaults to 200ms. When pressure monitoring is enabled for a service this primarily does three things: