Skip to content

Commit 3406cfe

Browse files
committed
chaos: add futex delays trait
Add futex delays to chaos. To best reproduce deadlocks and other futex issues we need to affect locking. The approach here: - Delays a waiter for up to futex_uncontended_delay_ns while no other waiter shows up on the same futex. - Swaps out the existing delayed waiter when another waiter comes along. - Delays the previous waiter by a random delay between futex_contended_delay_min_ns and futex_contended_delay_max_ns. This approach is chosen over random delays to flip futex conditions with minimal performance impact on a machine/process. If we had a futex and a pair of threads that have many idle seconds after a short period of contention, we would need huge random delays, applied to every task that touches the futex, to affect their ordering at all. Instead we can limit the delays to a solo waiter at any point, and use a much smaller delay when we know the mutex is already under contention. We'll see how this works in practice. This is by far the most complicated chaos trait in terms of data structures. Currently we use a BPF hash map and a built-in DSQ to maintain the data. The hash map maps a specific futex (well, close: a tgid/uaddr pair) to an entry in a CPU's delay DSQ. The delay DSQ holds the task until its timeout, and the map stores how to find that entry in the DSQ so it can be re-queued with the contended timeout. As commented in the code, the complexity of a search in a native DSQ is hideous - it's O(n). We can change the implementation in the future while keeping the logic the same. Test plan: - Lightly tested. The futex tracepoint attaches and sees many entries. Slow futex waiters are delayed. The hand-off between an old delayed waiter and a new delayed waiter is not reliable and likely has a bug. - This change is a no-op unless you provide the new command line flags.
1 parent ebf1a26 commit 3406cfe

File tree

4 files changed

+332
-8
lines changed

4 files changed

+332
-8
lines changed

scheds/rust/scx_chaos/src/bpf/intf.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ enum chaos_trait_kind {
3030
CHAOS_TRAIT_RANDOM_DELAYS,
3131
CHAOS_TRAIT_CPU_FREQ,
3232
CHAOS_TRAIT_DEGRADATION,
33+
CHAOS_TRAIT_FUTEX_DELAYS,
3334
CHAOS_TRAIT_MAX,
3435
};
3536

@@ -41,12 +42,17 @@ struct chaos_task_ctx {
4142
enum chaos_trait_kind pending_trait;
4243
u64 enq_flags;
4344
u64 p2dq_vtime;
45+
46+
// Futex delay state
47+
u64 futex_uaddr;
4448
};
4549

4650
enum chaos_stat_idx {
4751
CHAOS_STAT_TRAIT_RANDOM_DELAYS,
4852
CHAOS_STAT_TRAIT_CPU_FREQ,
4953
CHAOS_STAT_TRAIT_DEGRADATION,
54+
CHAOS_STAT_TRAIT_FUTEX_DELAYS,
55+
CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED,
5056
CHAOS_STAT_CHAOS_EXCLUDED,
5157
CHAOS_STAT_CHAOS_SKIPPED,
5258
CHAOS_STAT_TIMER_KICKS,

scheds/rust/scx_chaos/src/bpf/main.bpf.c

Lines changed: 248 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,39 @@
3232
scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \
3333
scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)))
3434

35+
/*
 * Futex operation codes and option bits, mirrored from
 * 'linux/include/uapi/linux/futex.h'.
 */
#define FUTEX_WAIT		0
#define FUTEX_WAKE		1
#define FUTEX_FD		2
#define FUTEX_REQUEUE		3
#define FUTEX_CMP_REQUEUE	4
#define FUTEX_WAKE_OP		5
#define FUTEX_LOCK_PI		6
#define FUTEX_UNLOCK_PI		7
#define FUTEX_TRYLOCK_PI	8
#define FUTEX_WAIT_BITSET	9
#define FUTEX_WAKE_BITSET	10
#define FUTEX_WAIT_REQUEUE_PI	11
#define FUTEX_CMP_REQUEUE_PI	12
#define FUTEX_LOCK_PI2		13

/* Option bits carried in the op value; FUTEX_CMD_MASK clears them. */
#define FUTEX_PRIVATE_FLAG	128
#define FUTEX_CLOCK_REALTIME	256
#define FUTEX_CMD_MASK		(~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME))
56+
57+
struct tp_syscall_enter_futex {
58+
struct trace_entry ent;
59+
int __syscall_nr;
60+
u32 __attribute__((btf_type_tag("user"))) * uaddr;
61+
int op;
62+
u32 val;
63+
struct __kernel_timespec __attribute__((btf_type_tag("user"))) * utime;
64+
u32 __attribute__((btf_type_tag("user"))) * uaddr2;
65+
u32 val3;
66+
};
67+
3568
const volatile int ppid_targeting_ppid = 1;
3669
const volatile bool ppid_targeting_inclusive = false; /* include ppid_targeting_ppid in chaos */
3770

@@ -52,6 +85,11 @@ const volatile u64 degradation_frac7 = 0;
5285

5386
const volatile u32 kprobe_delays_freq_frac32 = 1;
5487

88+
/*
 * Futex delay tunables, in nanoseconds.
 *
 * futex_uncontended_delay_ns
 *	added to the current time to form the vtime a waiter is queued with in
 *	enqueue_futex_delay().
 * futex_contended_delay_min_ns / futex_contended_delay_max_ns
 *	bounds of the random delay used when the previously recorded waiter is
 *	re-queued because another waiter arrived on the same futex.
 *
 * NOTE(review): presumably overridden by the loader before the program is
 * loaded - confirm against the userspace side.
 */
const volatile u64 futex_uncontended_delay_ns   = 1;
const volatile u64 futex_contended_delay_min_ns = 1;
const volatile u64 futex_contended_delay_max_ns = 1;
91+
92+
5593
#define MIN(x, y) ((x) < (y) ? (x) : (y))
5694
#define MAX(x, y) ((x) > (y) ? (x) : (y))
5795

@@ -81,6 +119,30 @@ struct {
81119
__type(value, u64);
82120
} chaos_stats SEC(".maps");
83121

122+
struct chaos_futex_key {
123+
u32 tgid;
124+
u64 uaddr;
125+
};
126+
127+
struct chaos_futex_waiter {
128+
struct bpf_spin_lock lock;
129+
u64 timeout_key;
130+
u32 pid;
131+
s32 delay_dsq_cpu_idx;
132+
};
133+
134+
struct {
135+
__uint(type, BPF_MAP_TYPE_HASH);
136+
__uint(max_entries, 1024*1024);
137+
__type(key, struct chaos_futex_key);
138+
__type(value, struct chaos_futex_waiter);
139+
} chaos_futex_waiters SEC(".maps");
140+
141+
/*
 * Build a 64-bit pseudo-random value out of two 32-bit draws from
 * bpf_get_prandom_u32(): the first draw becomes the upper 32 bits, the
 * second the lower 32 bits.
 *
 * The parameter list is spelled (void) so the definition is a real prototype;
 * an empty () in C leaves the parameters unspecified.
 */
static __always_inline u64 chaos_get_prandom_u64(void)
{
	u64 hi = bpf_get_prandom_u32();
	u64 lo = bpf_get_prandom_u32();

	return (hi << 32) | lo;
}
145+
84146
struct chaos_task_ctx *lookup_create_chaos_task_ctx(struct task_struct *p)
85147
{
86148
return bpf_task_storage_get(&chaos_task_ctxs, p, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE);
@@ -114,9 +176,13 @@ static __always_inline enum chaos_trait_kind choose_chaos(struct chaos_task_ctx
114176

115177
/*
 * Tell whether the task's next chaos trait is one of the delay traits.
 * Returns true for CHAOS_TRAIT_RANDOM_DELAYS and CHAOS_TRAIT_FUTEX_DELAYS,
 * false for every other trait.
 */
static __always_inline bool chaos_trait_skips_select_cpu(struct chaos_task_ctx *taskc)
{
	return taskc->next_trait == CHAOS_TRAIT_RANDOM_DELAYS ||
	       taskc->next_trait == CHAOS_TRAIT_FUTEX_DELAYS;
}
121187

122188
static __always_inline u64 get_cpu_delay_dsq(int cpu_idx)
@@ -252,14 +318,141 @@ static __always_inline s32 calculate_chaos_match(struct task_struct *p)
252318
return ret;
253319
}
254320

321+
// Traverse a DSQ to find the first element with a key with hideous complexity.
322+
// This is O(n) in DSQ members.
323+
//
324+
// To improve:
325+
// - Add this as a kfunc to the kernel where it can be O(log n)
326+
// - Use arena DSQs where we can get this behaviour in O(log n)
327+
static __always_inline
328+
void bpf_iter_scx_dsq_search(struct bpf_iter_scx_dsq *it,
329+
struct task_struct **p,
330+
u64 dsq_id,
331+
u64 flags,
332+
u64 key)
333+
{
334+
bpf_iter_scx_dsq_new(it, dsq_id, flags);
335+
336+
while((*p = bpf_iter_scx_dsq_next(it))) {
337+
if ((*p)->scx.dsq_vtime == key)
338+
return;
339+
340+
if ((*p)->scx.dsq_vtime > key)
341+
break;
342+
}
343+
344+
*p = NULL;
345+
}
346+
347+
// Re-queue a delayed task inside its delay DSQ with a new vtime.
//
// cpu_idx:   CPU index whose delay DSQ is searched (see get_cpu_delay_dsq()).
// key:       dsq_vtime the task was queued with.
// pid:       pid of the task to move; tells apart tasks that share `key`.
// new_vtime: dsq_vtime to re-insert the task with.
//
// Returns the result of scx_bpf_dsq_move_vtime() when a task with both the
// given key and pid was found, false otherwise. The iterator is always
// destroyed before returning.
static __always_inline bool update_delayed_task_vtime(s32 cpu_idx, u64 key,
						      u64 pid, u64 new_vtime)
{
	u64 dsq_id = get_cpu_delay_dsq(cpu_idx);
	struct bpf_iter_scx_dsq it;
	struct task_struct *p;
	bool ret = false;

	bpf_iter_scx_dsq_search(&it, &p, dsq_id, 0, key);

	// Several tasks may share the same vtime. Step through that run until
	// the pid matches; leaving the run means the task is not queued under
	// `key` any more, so it must not be touched even if a later task
	// happens to carry the same pid.
	while (p && p->pid != pid) {
		p = bpf_iter_scx_dsq_next(&it);
		if (p && p->scx.dsq_vtime != key)
			p = NULL;
	}
	if (!p)
		goto out;

	scx_bpf_dsq_move_set_vtime(&it, new_vtime);
	ret = scx_bpf_dsq_move_vtime(&it, p, dsq_id, 0);

out:
	bpf_iter_scx_dsq_destroy(&it);
	return ret;
}
371+
372+
// Queue `p` on the current CPU's delay DSQ as the newest delayed waiter of the
// futex stored in `taskc->futex_uaddr`, and take over from whichever task was
// recorded as the waiter before.
//
// Steps:
//  1. Make sure chaos_futex_waiters holds an entry for (p->tgid, futex_uaddr).
//  2. Insert `p` into the delay DSQ with vtime
//     now + futex_uncontended_delay_ns.
//  3. Under the entry's spin lock, read out the previously recorded waiter and
//     record `p` in its place.
//  4. If a waiter was recorded, re-queue it with a vtime drawn from
//     [now + futex_contended_delay_min_ns, now + futex_contended_delay_max_ns).
//
// Returns true once `p` has been queued, or false after a map failure has been
// reported through scx_bpf_error() (in which case `p` was not queued).
__weak s32 enqueue_futex_delay(struct task_struct *p __arg_trusted,
			       u64 enq_flags,
			       struct chaos_task_ctx *taskc __arg_nonnull)
{
	const u32 no_waiter = (u32)-1;
	struct chaos_futex_key key;
	struct chaos_futex_waiter *entry;
	struct chaos_futex_waiter val;
	u64 vtime, now;
	s64 ret;
	s32 cpu;

	// The key struct has padding between tgid and uaddr and is used as a
	// hash map key, so zero every byte first. Otherwise stack garbage in
	// the padding can make the same futex resolve to different entries.
	__builtin_memset(&key, 0, sizeof(key));
	key.tgid = p->tgid;
	key.uaddr = taskc->futex_uaddr;

	// First ensure an entry exists but in a largely empty state. We need the
	// spinlock to correctly interlock with the delay DSQ. Zero the template
	// so no uninitialised stack bytes end up in the map.
	__builtin_memset(&val, 0, sizeof(val));
	val.pid = no_waiter;

	ret = bpf_map_update_elem(&chaos_futex_waiters, &key, &val, BPF_NOEXIST);
	if (ret && ret != -EEXIST) {
		scx_bpf_error("failed to create chaos_futex_waiter in enqueue_futex_delay");
		return false;
	}

	// Get the real element. This might be an empty element that we inserted
	// or it might be an element filled with another PID. It doesn't matter
	// whether we inserted the element or somebody else did, this races.
	entry = (struct chaos_futex_waiter *)bpf_map_lookup_elem(&chaos_futex_waiters, &key);
	if (!entry) {
		scx_bpf_error("failed to lookup chaos_futex_waiter in enqueue_futex_delay");
		return false;
	}

	// enqueue ourselves before entering the spinlock. critical sections
	// can't call kfuncs.
	now = bpf_ktime_get_ns();
	cpu = bpf_get_smp_processor_id();

	chaos_stat_inc(CHAOS_STAT_TRAIT_FUTEX_DELAYS);
	scx_bpf_dsq_insert_vtime(p, get_cpu_delay_dsq(cpu), 0, now + futex_uncontended_delay_ns, enq_flags);

	// critical sections can't call kfuncs which makes this very complicated.
	// we must have already enqueued ourselves, and we must then insert
	// ourselves in the hashmap. when we take a task out of the lock we
	// should attempt to re-queue it after. the task will not hit this path
	// again until it has been re-queued, thus this isn't racy - either we
	// will re-queue it, or it will run naturally when its delay expires.
	// This might mean it doesn't get quite enough delay, but no invariants
	// are broken.
	bpf_spin_lock(&entry->lock);

	// take a snapshot of the previously recorded waiter
	val.pid = entry->pid;
	val.timeout_key = entry->timeout_key;
	val.delay_dsq_cpu_idx = entry->delay_dsq_cpu_idx;

	// record ourselves and prepare the metadata for the next one to come along
	entry->pid = p->pid;
	entry->timeout_key = now + futex_uncontended_delay_ns;
	entry->delay_dsq_cpu_idx = cpu;

	bpf_spin_unlock(&entry->lock);

	// re-queue task that has a contender behind it
	if (val.pid != no_waiter) {
		vtime = now + futex_contended_delay_min_ns;
		// Only randomise over a non-empty range. With max < min the
		// unsigned subtraction would wrap and produce an enormous delay.
		if (futex_contended_delay_max_ns > futex_contended_delay_min_ns) {
			vtime += chaos_get_prandom_u64()
				 % (futex_contended_delay_max_ns - futex_contended_delay_min_ns);
		}

		if (update_delayed_task_vtime(val.delay_dsq_cpu_idx, val.timeout_key, val.pid, vtime))
			chaos_stat_inc(CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED);
	}

	return true;
}
448+
255449
__weak s32 enqueue_random_delay(struct task_struct *p __arg_trusted, u64 enq_flags,
256450
struct chaos_task_ctx *taskc __arg_nonnull)
257451
{
258-
u64 rand64 = ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32();
259-
260452
u64 vtime = bpf_ktime_get_ns() + random_delays_min_ns;
261453
if (random_delays_min_ns != random_delays_max_ns) {
262-
vtime += rand64 % (random_delays_max_ns - random_delays_min_ns);
454+
vtime += chaos_get_prandom_u64()
455+
% (random_delays_max_ns - random_delays_min_ns);
263456
}
264457

265458
scx_bpf_dsq_insert_vtime(p, get_cpu_delay_dsq(-1), 0, vtime, enq_flags);
@@ -278,6 +471,10 @@ __weak s32 enqueue_chaotic(struct task_struct *p __arg_trusted, u64 enq_flags,
278471
out = enqueue_random_delay(p, enq_flags, taskc);
279472
break;
280473

474+
case CHAOS_TRAIT_FUTEX_DELAYS:
475+
out = enqueue_futex_delay(p, enq_flags, taskc);
476+
break;
477+
281478
case CHAOS_TRAIT_NONE:
282479
chaos_stat_inc(CHAOS_STAT_CHAOS_SKIPPED);
283480
out = false;
@@ -479,8 +676,7 @@ void BPF_STRUCT_OPS(chaos_enqueue, struct task_struct *p __arg_trusted, u64 enq_
479676
if (promise.kind == P2DQ_ENQUEUE_PROMISE_COMPLETE)
480677
return;
481678

482-
if (taskc->next_trait == CHAOS_TRAIT_RANDOM_DELAYS &&
483-
enqueue_chaotic(p, enq_flags, taskc))
679+
if (enqueue_chaotic(p, enq_flags, taskc))
484680
return;
485681

486682
// NOTE: this may not work for affinitized tasks because p2dq does
@@ -582,6 +778,51 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(chaos_init_task, struct task_struct *p,
582778
return 0;
583779
}
584780

781+
// Tracepoint program for futex syscall entry. When the current task is not
// excluded from chaos and the requested operation is one of the wait
// operations, it records a pending CHAOS_TRAIT_FUTEX_DELAYS trait and the
// futex address in the task's chaos context. Always returns 0.
SEC("?tracepoint/syscalls/sys_enter_futex")
int rtp_sys_enter_futex(struct tp_syscall_enter_futex *ctx)
{
	struct chaos_task_ctx *taskc;
	struct task_struct *p;

	// should be detached from userspace but if it is attached then no-op
	if (!(futex_uncontended_delay_ns || futex_contended_delay_min_ns ||
	      futex_contended_delay_max_ns))
		return 0;

	p = (struct task_struct *)bpf_get_current_task_btf();
	taskc = lookup_create_chaos_task_ctx(p);
	if (!taskc)
		return 0;

	// Compute the chaos match first if it has not been completed yet.
	if (!(taskc->match & CHAOS_MATCH_COMPLETE) && calculate_chaos_match(p)) {
		scx_bpf_error("failed to match task");
		return 0;
	}

	if (taskc->match & CHAOS_MATCH_EXCLUDED)
		return 0;

	// Only the wait operations matter here; the option bits are masked off.
	switch (ctx->op & FUTEX_CMD_MASK) {
	case FUTEX_WAIT:
	case FUTEX_WAIT_BITSET:
	case FUTEX_WAIT_REQUEUE_PI:
		break;
	default:
		return 0;
	}

	// The task is either about to wait because it hit FUTEX_WAIT on the slow
	// path or hit the fast path. The fast path is irrelevant for our purposes
	// as we have no scheduler input there, so it's safe to delay our work
	// until a struct_ops .runnable callback comes along.
	taskc->pending_trait = CHAOS_TRAIT_FUTEX_DELAYS;
	taskc->futex_uaddr = (u64)ctx->uaddr;

	return 0;
}
825+
585826
SCX_OPS_DEFINE(chaos,
586827
.dispatch = (void *)chaos_dispatch,
587828
.enqueue = (void *)chaos_enqueue,

0 commit comments

Comments
 (0)