1 change: 1 addition & 0 deletions scheds/include/scx/common.bpf.h
@@ -75,6 +75,7 @@ u32 scx_bpf_reenqueue_local(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
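
Note: like the DSQ iterator kfuncs above, scx_bpf_dsq_peek() is declared __weak, so on kernels that don't export the kfunc the symbol resolves to NULL and its availability can be checked at runtime with bpf_ksym_exists().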
7 changes: 7 additions & 0 deletions scheds/include/scx/compat.bpf.h
@@ -230,6 +230,13 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags)
scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) : \
scx_bpf_pick_any_cpu(cpus_allowed, flags))

#define __COMPAT_scx_bpf_dsq_peek(dsq_id) \
(bpf_ksym_exists(scx_bpf_dsq_peek) ? scx_bpf_dsq_peek(dsq_id) : ({ \
struct task_struct *p = NULL; \
bpf_for_each(scx_dsq, p, dsq_id, 0) { break; } \
p; \
}))

Contributor:
This is different from the compat macro (actually an inline function) in the V5 that we ended up with and that TJ moved to his branch. It probably doesn't matter, but it does make me curious what the rules are for updating this file.

Shouldn't it be pulled from the kernel automatically somewhere? I don't see any other scripts in the repo that mention compat.bpf.h -- I was expecting an update script that copies it from the kernel.

Contributor Author:
Ah, this has landed upstream now. I grabbed this from #2675 and plan to merge after that goes in, and will take whatever's there (I don't actually need the compat macro).
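
For context, a minimal usage sketch of the compat wrapper (not part of this diff): example_dispatch and MY_DSQ_ID are made-up names, it assumes the scx_bpf_dsq_move_to_local() kfunc is available, and, as with the delay DSQs below, that dsq_vtime holds an absolute monotonic timestamp.

/* Illustrative only: peek the head of a vtime-ordered DSQ without consuming it. */
#define MY_DSQ_ID 1024

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	/* Native kfunc when the kernel exports it, one-step DSQ iteration otherwise. */
	struct task_struct *head = __COMPAT_scx_bpf_dsq_peek(MY_DSQ_ID);

	/* Only pay for a dispatch pass when the head task is actually due. */
	if (head && head->scx.dsq_vtime <= bpf_ktime_get_ns())
		scx_bpf_dsq_move_to_local(MY_DSQ_ID);
}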

/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
Expand Down
3 changes: 3 additions & 0 deletions scheds/rust/scx_chaos/src/bpf/intf.h
@@ -54,6 +54,9 @@ enum chaos_stat_idx {
CHAOS_STAT_CHAOS_SKIPPED,
CHAOS_STAT_KPROBE_RANDOM_DELAYS,
CHAOS_STAT_TIMER_KICKS,
CHAOS_STAT_PEEK_EMPTY_DSQ,
CHAOS_STAT_PEEK_NOT_READY,
CHAOS_STAT_PEEK_NEEDS_PROCESSING,
CHAOS_NR_STATS,
};

62 changes: 58 additions & 4 deletions scheds/rust/scx_chaos/src/bpf/main.bpf.c
@@ -63,6 +63,8 @@ const volatile u64 kprobe_delays_max_ns = 2;
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))

#define U64_MAX ((u64)~0ULL)

enum chaos_timer_callbacks {
CHAOS_TIMER_CHECK_QUEUES,
CHAOS_MAX_TIMERS,
@@ -143,6 +145,33 @@ static __always_inline void chaos_stat_inc(enum chaos_stat_idx stat)
(*cnt_p)++;
}

/*
 * Get the next time a delay DSQ needs processing.
 *
 * Returns 0 when the caller must iterate (no native peek support),
 * U64_MAX when the DSQ is empty, and the head task's vtime otherwise.
 *
 * Safe for delay DSQs, which use monotonic time (vtimes won't wrap to
 * U64_MAX). Must be called with the RCU read lock held.
 */
static __always_inline u64 delay_dsq_next_time(u64 dsq_id)
{
struct task_struct *first_p;
u64 vtime;

// If we don't have native peek, fall back to always iterating
if (!bpf_ksym_exists(scx_bpf_dsq_peek)) {
chaos_stat_inc(CHAOS_STAT_PEEK_NEEDS_PROCESSING);
return 0;
}

first_p = scx_bpf_dsq_peek(dsq_id);
if (!first_p) {
chaos_stat_inc(CHAOS_STAT_PEEK_EMPTY_DSQ);
return U64_MAX;
}

vtime = first_p->scx.dsq_vtime;
return vtime;
}

static __always_inline enum chaos_trait_kind
choose_chaos(struct chaos_task_ctx *taskc)
{
@@ -362,9 +391,25 @@ __weak u64 check_dsq_times(int cpu_idx)
u64 next_trigger_time = 0;
u64 now = bpf_ktime_get_ns();
bool has_kicked = false;
u64 dsq_id = get_cpu_delay_dsq(cpu_idx);

bpf_rcu_read_lock();

next_trigger_time = delay_dsq_next_time(dsq_id);
if (next_trigger_time > now + chaos_timer_check_queues_slack_ns) {
// DSQ empty (U64_MAX) or first task beyond the slack window
chaos_stat_inc(CHAOS_STAT_PEEK_NOT_READY);
bpf_rcu_read_unlock();
return next_trigger_time == U64_MAX ? 0 : next_trigger_time;
}

// Need to iterate: no peek support (0), or the head task is ready or
// within the slack window
chaos_stat_inc(CHAOS_STAT_PEEK_NEEDS_PROCESSING);
next_trigger_time = 0;

bpf_for_each(scx_dsq, p, dsq_id, 0) {
p = bpf_task_from_pid(p->pid);
if (!p)
break;
@@ -387,8 +432,8 @@ __weak u64 check_dsq_times(int cpu_idx)
if (next_trigger_time > now + chaos_timer_check_queues_slack_ns)
break;
}

bpf_rcu_read_unlock();
return next_trigger_time;
}
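
As an illustrative walk-through of check_dsq_times() (numbers assumed): with chaos_timer_check_queues_slack_ns = 100us, a peek returning now + 5ms re-arms the timer for that vtime without touching the queue; a peek returning U64_MAX (empty DSQ) returns 0, leaving the timer on its default period; and a return of 0 (no peek support) or a head vtime inside the slack window falls through to the full iteration.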

@@ -531,9 +576,17 @@ void BPF_STRUCT_OPS(chaos_dispatch, s32 cpu, struct task_struct *prev)
struct enqueue_promise promise;
struct chaos_task_ctx *taskc;
struct task_struct *p;
u64 now = bpf_ktime_get_ns();
u64 dsq_id = get_cpu_delay_dsq(-1);

// Fast path: skip iterating the delay DSQ when its head task isn't due yet
if (delay_dsq_next_time(dsq_id) > now) {
chaos_stat_inc(CHAOS_STAT_PEEK_NOT_READY);
goto p2dq;

Contributor:
This makes sense as the short-circuit. I want to quickly make sure that the races are benign. Normally we would expect a peek/vtime reading to be a lower bound -- other cores could race and asynchronously pop the task and increase the head vtime after the observation we made (but not decrease it, because of monotonicity).

But here, because we only peek our own delay DSQ... maybe we don't even need to worry about that, because no one else will dispatch from it?

Do we need to rely on non-interference with the delay DSQ here? I think not, because if the head of the delay DSQ was popped and its head delayed-time moved even FURTHER into the future, then it is even more ahead of our fixed snapshot of now, and we remain justified in taking the fast path goto p2dq.

Nevertheless, reasoning about the concurrency safety here seems a bit scary, because bpf_ktime_get_ns() is real time that is constantly changing (monotonically forward, and our "now" snapshots are immediately stale; we don't get to control the forward march of time like in a simulation). If "now" and "earliest vtime in the delay queue" are both variables that change asynchronously in the background, then an atomic snapshot of just one of them can never guarantee that a moment in time exists where they have a particular ordering relationship.

Contributor Author:
Thanks for this. The two parts make a lot of sense: information loss between DSQ insertions and the check in chaos_dispatch, and the check in the timer being out of date.

chaos_enqueue and chaos_dispatch both only enqueue/dequeue from their local DSQ. There should be no possible interference there, and every enqueue should show up in the related dispatch. If we ever place a task onto a delay DSQ other than that of the CPU that triggered the enqueue this would need a rethink, but I don't see any reason we'd need to do that.

The timer is a much more interesting case. I initially did this work on an Intel machine with a unified cache and didn't see many stalls after I fixed the logic. However, on my multi-CCX EPYC Rome machine I'm seeing fairly consistent stalls. I'll play around with this, but I believe it's the timer not triggering dispatch when it should, as I don't see how this could be a problem between enqueue and dispatch in isolation.

}
chaos_stat_inc(CHAOS_STAT_PEEK_NEEDS_PROCESSING);

bpf_for_each(scx_dsq, p, dsq_id, 0) {
p = bpf_task_from_pid(p->pid);
if (!p)
continue;
Expand All @@ -557,6 +610,7 @@ void BPF_STRUCT_OPS(chaos_dispatch, s32 cpu, struct task_struct *prev)
bpf_task_release(p);
}

p2dq:
return p2dq_dispatch_impl(cpu, prev);
}
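
To make the fast-path reasoning above concrete (illustrative numbers, not from the patch): if the peek observes a head vtime of now + 10ms, a racing pop can only replace the head with a task of equal or later vtime, so the observed value remains a valid lower bound and the goto p2dq short-circuit stays justified; the concern discussed above is a dispatch being deferred until the timer fires, never a premature one.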

4 changes: 4 additions & 0 deletions scheds/rust/scx_chaos/src/lib.rs
@@ -200,6 +200,10 @@ impl Scheduler {
kprobe_random_delays: stats
[bpf_intf::chaos_stat_idx_CHAOS_STAT_KPROBE_RANDOM_DELAYS as usize],
timer_kicks: stats[bpf_intf::chaos_stat_idx_CHAOS_STAT_TIMER_KICKS as usize],
peek_empty_dsq: stats[bpf_intf::chaos_stat_idx_CHAOS_STAT_PEEK_EMPTY_DSQ as usize],
peek_not_ready: stats[bpf_intf::chaos_stat_idx_CHAOS_STAT_PEEK_NOT_READY as usize],
peek_needs_processing: stats
[bpf_intf::chaos_stat_idx_CHAOS_STAT_PEEK_NEEDS_PROCESSING as usize],
}
}

14 changes: 14 additions & 0 deletions scheds/rust/scx_chaos/src/stats.rs
@@ -29,6 +29,12 @@ pub struct Metrics {
pub timer_kicks: u64,
#[stat(desc = "Number of times a kprobe caused a random delay to be applied")]
pub kprobe_random_delays: u64,
#[stat(desc = "Peek found empty DSQ")]
pub peek_empty_dsq: u64,
#[stat(desc = "Peek found task not ready")]
pub peek_not_ready: u64,
#[stat(desc = "Peek determined DSQ needs processing")]
pub peek_needs_processing: u64,
}

impl Metrics {
@@ -44,6 +50,11 @@ impl Metrics {
self.kprobe_random_delays,
self.timer_kicks,
)?;
writeln!(
w,
"peek: empty/not_ready/needs_proc {}/{}/{}",
self.peek_empty_dsq, self.peek_not_ready, self.peek_needs_processing,
)?;
Ok(())
}

@@ -56,6 +67,9 @@ impl Metrics {
chaos_skipped: self.chaos_skipped - rhs.chaos_skipped,
kprobe_random_delays: self.kprobe_random_delays - rhs.kprobe_random_delays,
timer_kicks: self.timer_kicks - rhs.timer_kicks,
peek_empty_dsq: self.peek_empty_dsq - rhs.peek_empty_dsq,
peek_not_ready: self.peek_not_ready - rhs.peek_not_ready,
peek_needs_processing: self.peek_needs_processing - rhs.peek_needs_processing,
}
}
}
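
With these additions, an interval report gains a line like (illustrative numbers): peek: empty/not_ready/needs_proc 812/47/103.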