scx_chaos: use peek operation to optimise for empty delay dsq #2894
@@ -63,6 +63,8 @@ const volatile u64 kprobe_delays_max_ns = 2;
 #define MIN(x, y) ((x) < (y) ? (x) : (y))
 #define MAX(x, y) ((x) > (y) ? (x) : (y))
 
+#define U64_MAX ((u64)~0ULL)
+
 enum chaos_timer_callbacks {
 	CHAOS_TIMER_CHECK_QUEUES,
 	CHAOS_MAX_TIMERS,
@@ -143,6 +145,33 @@ static __always_inline void chaos_stat_inc(enum chaos_stat_idx stat)
 	(*cnt_p)++;
 }
 
+/*
+ * Get the next time a delay DSQ needs processing.
+ *
+ * Safe for delay DSQs which use monotonic time (vtimes won't wrap to U64_MAX).
+ * Must be called with RCU read lock held.
+ */
+static __always_inline u64 delay_dsq_next_time(u64 dsq_id)
+{
+	struct task_struct *first_p;
+	u64 vtime;
+
+	// If we don't have native peek, fall back to always iterating
+	if (!bpf_ksym_exists(scx_bpf_dsq_peek)) {
+		chaos_stat_inc(CHAOS_STAT_PEEK_NEEDS_PROCESSING);
+		return 0;
+	}
+
+	first_p = scx_bpf_dsq_peek(dsq_id);
+	if (!first_p) {
+		chaos_stat_inc(CHAOS_STAT_PEEK_EMPTY_DSQ);
+		return U64_MAX;
+	}
+
+	vtime = first_p->scx.dsq_vtime;
+	return vtime;
+}
+
 static __always_inline enum chaos_trait_kind
 choose_chaos(struct chaos_task_ctx *taskc)
 {
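For the bpf_ksym_exists() guard above to verify on kernels that lack the peek kfunc, the kfunc has to be declared as a weak ksym. A sketch of the declaration this relies on -- the exact spelling in compat.bpf.h or the upstream headers may differ:

	// Assumed declaration, not shown in this diff: marked weak so the
	// program still loads when the kernel does not export the kfunc.
	struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;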
@@ -362,9 +391,25 @@ __weak u64 check_dsq_times(int cpu_idx)
 	u64 next_trigger_time = 0;
 	u64 now = bpf_ktime_get_ns();
 	bool has_kicked = false;
+	u64 dsq_id = get_cpu_delay_dsq(cpu_idx);
 
 	bpf_rcu_read_lock();
-	bpf_for_each(scx_dsq, p, get_cpu_delay_dsq(cpu_idx), 0) {
+
+	next_trigger_time = delay_dsq_next_time(dsq_id);
+	if (next_trigger_time > now + chaos_timer_check_queues_slack_ns) {
+		chaos_stat_inc(CHAOS_STAT_PEEK_NOT_READY);
+		// DSQ empty (U64_MAX) or first task beyond slack window
+		bpf_rcu_read_unlock();
+		return next_trigger_time == U64_MAX ? 0 : next_trigger_time;
+	}
+
+	chaos_stat_inc(CHAOS_STAT_PEEK_NEEDS_PROCESSING);
+
+	// Need to iterate: no peek support (0), task ready, or task within slack window
+	next_trigger_time = 0;
+
+	// Need to iterate to handle ready tasks
+	bpf_for_each(scx_dsq, p, dsq_id, 0) {
 		p = bpf_task_from_pid(p->pid);
 		if (!p)
 			break;
@@ -387,8 +432,8 @@ __weak u64 check_dsq_times(int cpu_idx)
 		if (next_trigger_time > now + chaos_timer_check_queues_slack_ns)
 			break;
 	}
-	bpf_rcu_read_unlock();
 
+	bpf_rcu_read_unlock();
 	return next_trigger_time;
 }
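check_dsq_times() now returns 0 when there is nothing to schedule (or the no-peek fallback ran) and an absolute wakeup time otherwise. A hedged sketch of how a timer callback could fold the per-CPU results into a single re-arm time; the helper name and the nr_cpu_ids rodata are assumptions for illustration, not code from this file:

	const volatile u32 nr_cpu_ids;	/* assumed rodata, set at load time */

	// Illustrative only: pick the earliest non-zero wakeup across CPUs.
	static __always_inline u64 earliest_delay_dsq_time(void)
	{
		u64 next = 0, t;
		s32 cpu;

		bpf_for(cpu, 0, nr_cpu_ids) {
			t = check_dsq_times(cpu);
			if (t && (!next || t < next))
				next = t;	// earliest pending task wins
		}
		return next;	// 0: nothing queued, keep the default period
	}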
@@ -531,9 +576,17 @@ void BPF_STRUCT_OPS(chaos_dispatch, s32 cpu, struct task_struct *prev)
 	struct enqueue_promise promise;
 	struct chaos_task_ctx *taskc;
 	struct task_struct *p;
-	u64 now = bpf_ktime_get_ns();
+	u64 now = bpf_ktime_get_ns();
+	u64 dsq_id = get_cpu_delay_dsq(-1);
+
+	// Check if we need to process the delay DSQ
+	if (delay_dsq_next_time(dsq_id) > now) {
+		chaos_stat_inc(CHAOS_STAT_PEEK_NOT_READY);
+		goto p2dq;
+	}
+	chaos_stat_inc(CHAOS_STAT_PEEK_NEEDS_PROCESSING);
 
-	bpf_for_each(scx_dsq, p, get_cpu_delay_dsq(-1), 0) {
+	bpf_for_each(scx_dsq, p, dsq_id, 0) {
 		p = bpf_task_from_pid(p->pid);
 		if (!p)
 			continue;

Review comment on the short-circuit above:

This makes sense as the short-circuit. I want to quickly make sure that the races are benign. Normally we would expect a peek/vtime reading to be a lower bound: other cores could race and asynchronously pop the task and increase the vtime after the observation we made (but not decrease it, because of monotonicity). But here, because we only peek our own delay DSQ, maybe we don't even need to worry about that, since no one else will dispatch from it. Do we need to rely on non-interference with the delay DSQ here? I think not: if the head of the delay DSQ were popped and its head-delayed time moved even further into the future, it would be even more ahead of our fixed snapshot of now. Nevertheless, reasoning about the concurrency safety here seems a bit scary because …

Reply from the author:

Thanks for this. The two parts make a lot of sense: information loss between DSQ insertions and the check in chaos_dispatch, and the check in the timer being out of date.

The timer is a much more interesting case. I initially did this work on an Intel machine with a unified cache and didn't see many stalls after I fixed the logic. However, on my multi-CCX EPYC Rome machine I'm seeing fairly consistent stalls. I'll play around with this, but I believe it's the timer not triggering dispatch when it should, as I don't see how this would be a problem between enqueue/dispatch in isolation.
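The monotonicity argument in the thread can be sketched concretely. This is an illustrative restatement under the thread's assumptions (vtime-ordered delay DSQs, each CPU dispatching only from its own), not code from the PR:

	// t0: our snapshot of the head task's deadline, taken under RCU.
	bpf_rcu_read_lock();
	u64 t0 = delay_dsq_next_time(dsq_id);
	bpf_rcu_read_unlock();

	// Even if another CPU popped the head in between, a re-peek can only
	// observe a later deadline (or U64_MAX for empty), never an earlier
	// one, because tasks sit in the DSQ in monotonic vtime order:
	bpf_rcu_read_lock();
	u64 t1 = delay_dsq_next_time(dsq_id);
	bpf_rcu_read_unlock();

	// Invariant the short-circuit relies on: t1 >= t0, so a stale decision
	// to skip (t0 > now) implies the true state also says skip (t1 > now).
	// The dangerous direction, t1 < t0, is ruled out by monotonicity.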
@@ -557,6 +610,7 @@ void BPF_STRUCT_OPS(chaos_dispatch, s32 cpu, struct task_struct *prev)
 		bpf_task_release(p);
 	}
 
+p2dq:
 	return p2dq_dispatch_impl(cpu, prev);
 }
Review comment on the peek compatibility handling:

This is different from the compat macro (actually an inline function) in the V5 that we ended up with and that TJ moved to his branch. It probably doesn't matter, but it does make me curious what the rules are for updating this file. Shouldn't it be pulled automatically from the kernel somewhere? I don't see any other scripts in the repo that mention compat.bpf.h -- I was expecting some update script that copies it from the kernel.

Reply from the author:

Ah, this has landed upstream now. I grabbed this from #2675 and plan to merge after that goes in; I'll take whatever's there (I don't actually need the compat macro).
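For reference, a compat-style wrapper of the kind the thread describes might look like the sketch below. The name __chaos_dsq_peek is hypothetical; the helper that actually landed upstream or in #2675 may be shaped differently:

	// Hypothetical compat wrapper (name and shape assumed, see above):
	// centralises the kfunc-availability check at one call site.
	static inline struct task_struct *__chaos_dsq_peek(u64 dsq_id)
	{
		if (bpf_ksym_exists(scx_bpf_dsq_peek))
			return scx_bpf_dsq_peek(dsq_id);
		return NULL;	// caller falls back to full DSQ iteration
	}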