		scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \
		scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)))

+/*
+ * The following defines are from 'linux/include/uapi/linux/futex.h'
+ */
+#define FUTEX_WAIT		0
+#define FUTEX_WAKE		1
+#define FUTEX_FD		2
+#define FUTEX_REQUEUE		3
+#define FUTEX_CMP_REQUEUE	4
+#define FUTEX_WAKE_OP		5
+#define FUTEX_LOCK_PI		6
+#define FUTEX_UNLOCK_PI		7
+#define FUTEX_TRYLOCK_PI	8
+#define FUTEX_WAIT_BITSET	9
+#define FUTEX_WAKE_BITSET	10
+#define FUTEX_WAIT_REQUEUE_PI	11
+#define FUTEX_CMP_REQUEUE_PI	12
+#define FUTEX_LOCK_PI2		13
+
+#define FUTEX_PRIVATE_FLAG	128
+#define FUTEX_CLOCK_REALTIME	256
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+
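+/*
+ * Userspace typically sets FUTEX_PRIVATE_FLAG (glibc does by default), so
+ * handlers mask before comparing: e.g. (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) &
+ * FUTEX_CMD_MASK yields FUTEX_WAIT.
+ */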
+struct tp_syscall_enter_futex {
+	struct trace_entry ent;
+	int __syscall_nr;
+	u32 __attribute__((btf_type_tag("user"))) *uaddr;
+	int op;
+	u32 val;
+	struct __kernel_timespec __attribute__((btf_type_tag("user"))) *utime;
+	u32 __attribute__((btf_type_tag("user"))) *uaddr2;
+	u32 val3;
+};
+
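+/*
+ * The layout above mirrors the tracepoint's event format (see
+ * /sys/kernel/tracing/events/syscalls/sys_enter_futex/format); the
+ * btf_type_tag("user") annotations mark the pointers as userspace addresses
+ * for the verifier.
+ */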
const volatile int ppid_targeting_ppid = 1;
const volatile bool ppid_targeting_inclusive = false;	/* include ppid_targeting_ppid in chaos */
@@ -52,6 +85,11 @@ const volatile u64 degradation_frac7 = 0;

const volatile u32 kprobe_delays_freq_frac32 = 1;

+const volatile u64 futex_uncontended_delay_ns = 1;
+const volatile u64 futex_contended_delay_min_ns = 1;
+const volatile u64 futex_contended_delay_max_ns = 1;
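+/*
+ * Populated from userspace via the skeleton's rodata before load; if all
+ * three are left at zero, the sys_enter_futex tracepoint below no-ops.
+ */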
+
+
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))

@@ -81,6 +119,30 @@ struct {
	__type(value, u64);
} chaos_stats SEC(".maps");

+struct chaos_futex_key {
+	u32 tgid;
+	u64 uaddr;
+};
+
+struct chaos_futex_waiter {
+	struct bpf_spin_lock lock;
+	u64 timeout_key;
+	u32 pid;
+	s32 delay_dsq_cpu_idx;
+};
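+/*
+ * One slot per (tgid, uaddr). pid == (u32)-1 marks an empty slot; the
+ * embedded bpf_spin_lock serializes the hand-off between consecutive
+ * waiters on the same futex word.
+ */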
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 1024 * 1024);
+	__type(key, struct chaos_futex_key);
+	__type(value, struct chaos_futex_waiter);
+} chaos_futex_waiters SEC(".maps");
+
+static __always_inline u64 chaos_get_prandom_u64()
+{
+	return ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32();
+}
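+// Note: pairing this with '%' to bound a range (as done below) carries a
+// slight modulo bias; for chaos injection that inaccuracy is harmless.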
+
struct chaos_task_ctx *lookup_create_chaos_task_ctx(struct task_struct *p)
{
	return bpf_task_storage_get(&chaos_task_ctxs, p, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE);
@@ -114,9 +176,13 @@ static __always_inline enum chaos_trait_kind choose_chaos(struct chaos_task_ctx

static __always_inline bool chaos_trait_skips_select_cpu(struct chaos_task_ctx *taskc)
{
-	if (taskc->next_trait == CHAOS_TRAIT_RANDOM_DELAYS)
+	switch (taskc->next_trait) {
+	case CHAOS_TRAIT_RANDOM_DELAYS:
+	case CHAOS_TRAIT_FUTEX_DELAYS:
		return true;
-	return false;
+	default:
+		return false;
+	}
}

static __always_inline u64 get_cpu_delay_dsq(int cpu_idx)
@@ -252,14 +318,141 @@ static __always_inline s32 calculate_chaos_match(struct task_struct *p)
	return ret;
}

+// Traverse a DSQ to find the first element with the given key. This has
+// hideous complexity: O(n) in DSQ members.
+//
+// To improve:
+// - Add this as a kfunc to the kernel where it can be O(log n)
+// - Use arena DSQs where we can get this behaviour in O(log n)
+static __always_inline
+void bpf_iter_scx_dsq_search(struct bpf_iter_scx_dsq *it,
+			     struct task_struct **p,
+			     u64 dsq_id,
+			     u64 flags,
+			     u64 key)
+{
+	bpf_iter_scx_dsq_new(it, dsq_id, flags);
+
+	while ((*p = bpf_iter_scx_dsq_next(it))) {
+		if ((*p)->scx.dsq_vtime == key)
+			return;
+
+		if ((*p)->scx.dsq_vtime > key)
+			break;
+	}
+
+	*p = NULL;
+}
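+// The early exit above works because delay DSQs are vtime-ordered: tasks are
+// inserted with scx_bpf_dsq_insert_vtime() using their wake-up deadline as
+// dsq_vtime, so the scan can stop as soon as it passes the key.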
+
+static __always_inline bool update_delayed_task_vtime(s32 cpu_idx, u64 key,
+						       u64 pid, u64 new_vtime)
+{
+	u64 dsq_id = get_cpu_delay_dsq(cpu_idx);
+	struct bpf_iter_scx_dsq it;
+	struct task_struct *p;
+	bool ret = false;
+
+	bpf_iter_scx_dsq_search(&it, &p, dsq_id, 0, key);
+	if (!p)
+		goto out;
+
+	while (p->pid != pid && (p = bpf_iter_scx_dsq_next(&it)) && p->scx.dsq_vtime == key) {}
+	if (!p || p->pid != pid)
+		goto out;
+
+	scx_bpf_dsq_move_set_vtime(&it, new_vtime);
+	ret = scx_bpf_dsq_move_vtime(&it, p, dsq_id, 0);
+
+out:
+	bpf_iter_scx_dsq_destroy(&it);
+	return ret;
+}
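+// Note: scx_bpf_dsq_move_set_vtime() only stages the vtime for the next move
+// from this iterator; the requeue itself happens in scx_bpf_dsq_move_vtime(),
+// which returns false if the task has already left the DSQ.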
+
+__weak s32 enqueue_futex_delay(struct task_struct *p __arg_trusted,
+			       u64 enq_flags,
+			       struct chaos_task_ctx *taskc __arg_nonnull)
+{
+	s64 ret;
+	struct chaos_futex_key key;
+	struct chaos_futex_waiter *entry;
+	struct chaos_futex_waiter val;
+	u64 vtime, now;
+	s32 cpu;
+
+	key.tgid = p->tgid;
+	key.uaddr = taskc->futex_uaddr;
+
+	// First ensure an entry exists, but in a largely empty state. We need
+	// the spinlock to correctly interlock with the delay DSQ.
+	val.pid = -1;
+
+	ret = bpf_map_update_elem(&chaos_futex_waiters, &key, &val, BPF_NOEXIST);
+	if (ret && ret != -EEXIST) {
+		scx_bpf_error("failed to create chaos_futex_waiter in enqueue_futex_delay");
+		return false;
+	}
+
+	// Get the real element. This might be the empty element we inserted, or
+	// one already filled with another PID. It doesn't matter who inserted
+	// it; this races either way.
+	entry = (struct chaos_futex_waiter *)bpf_map_lookup_elem(&chaos_futex_waiters, &key);
+	if (!entry) {
+		scx_bpf_error("failed to lookup chaos_futex_waiter in enqueue_futex_delay");
+		return false;
+	}
+
+	// Enqueue ourselves before entering the spinlock: critical sections
+	// can't call kfuncs.
+	now = bpf_ktime_get_ns();
+	cpu = bpf_get_smp_processor_id();
+
+	chaos_stat_inc(CHAOS_STAT_TRAIT_FUTEX_DELAYS);
+	scx_bpf_dsq_insert_vtime(p, get_cpu_delay_dsq(cpu), 0, now + futex_uncontended_delay_ns, enq_flags);
+
+	// Critical sections can't call kfuncs, which makes this very
+	// complicated. We must have already enqueued ourselves, and we must
+	// then insert ourselves in the hashmap. When we take a task out of the
+	// lock we should attempt to re-queue it afterwards. The task will not
+	// hit this path again until it has been re-queued, so this isn't racy:
+	// either we re-queue it, or it runs naturally when its delay expires.
+	// That might mean it doesn't get quite enough delay, but no invariants
+	// are broken.
+	bpf_spin_lock(&entry->lock);
+
+	val.pid = entry->pid;
+	val.timeout_key = entry->timeout_key;
+	val.delay_dsq_cpu_idx = entry->delay_dsq_cpu_idx;
+
+	// Enqueue ourselves and prepare the metadata for the next waiter to
+	// come along.
+	entry->pid = p->pid;
+	entry->timeout_key = now + futex_uncontended_delay_ns;
+	entry->delay_dsq_cpu_idx = cpu;
+
+	bpf_spin_unlock(&entry->lock);
+
+	// Re-queue the task that now has a contender behind it.
+	if (val.pid != -1) {
+		vtime = now + futex_contended_delay_min_ns;
+		if (futex_contended_delay_min_ns != futex_contended_delay_max_ns) {
+			vtime += chaos_get_prandom_u64()
+				 % (futex_contended_delay_max_ns - futex_contended_delay_min_ns);
+		}

+		if (update_delayed_task_vtime(val.delay_dsq_cpu_idx, val.timeout_key, val.pid, vtime))
+			chaos_stat_inc(CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED);
+	}
+
+	return true;
+}
+
__weak s32 enqueue_random_delay(struct task_struct *p __arg_trusted, u64 enq_flags,
				struct chaos_task_ctx *taskc __arg_nonnull)
{
-	u64 rand64 = ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32();
-
	u64 vtime = bpf_ktime_get_ns() + random_delays_min_ns;
	if (random_delays_min_ns != random_delays_max_ns) {
-		vtime += rand64 % (random_delays_max_ns - random_delays_min_ns);
+		vtime += chaos_get_prandom_u64()
+			 % (random_delays_max_ns - random_delays_min_ns);
	}

	scx_bpf_dsq_insert_vtime(p, get_cpu_delay_dsq(-1), 0, vtime, enq_flags);
@@ -278,6 +471,10 @@ __weak s32 enqueue_chaotic(struct task_struct *p __arg_trusted, u64 enq_flags,
		out = enqueue_random_delay(p, enq_flags, taskc);
		break;

+	case CHAOS_TRAIT_FUTEX_DELAYS:
+		out = enqueue_futex_delay(p, enq_flags, taskc);
+		break;
+
	case CHAOS_TRAIT_NONE:
		chaos_stat_inc(CHAOS_STAT_CHAOS_SKIPPED);
		out = false;
@@ -479,8 +676,7 @@ void BPF_STRUCT_OPS(chaos_enqueue, struct task_struct *p __arg_trusted, u64 enq_
	if (promise.kind == P2DQ_ENQUEUE_PROMISE_COMPLETE)
		return;

-	if (taskc->next_trait == CHAOS_TRAIT_RANDOM_DELAYS &&
-	    enqueue_chaotic(p, enq_flags, taskc))
+	if (enqueue_chaotic(p, enq_flags, taskc))
		return;

	// NOTE: this may not work for affinitized tasks because p2dq does
@@ -582,6 +778,51 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(chaos_init_task, struct task_struct *p,
	return 0;
}

+SEC("?tracepoint/syscalls/sys_enter_futex")
+int rtp_sys_enter_futex(struct tp_syscall_enter_futex *ctx)
+{
+	struct task_struct *p;
+	struct chaos_task_ctx *taskc;
+	int futex_op;
+	s32 ret;
+
+	// Userspace should leave this program detached when futex delays are
+	// disabled, but if it is attached anyway then no-op.
+	if (!futex_uncontended_delay_ns && !futex_contended_delay_min_ns &&
+	    !futex_contended_delay_max_ns)
+		return 0;
+
+	p = (struct task_struct *)bpf_get_current_task_btf();
+	taskc = lookup_create_chaos_task_ctx(p);
+	if (!taskc)
+		return 0;
+
+	if (!(taskc->match & CHAOS_MATCH_COMPLETE)) {
+		ret = calculate_chaos_match(p);
+		if (ret) {
+			scx_bpf_error("failed to match task");
+			return 0;
+		}
+	}
+
+	if (taskc->match & CHAOS_MATCH_EXCLUDED)
+		return 0;
+
+	futex_op = ctx->op & FUTEX_CMD_MASK;
+
+	if (futex_op != FUTEX_WAIT && futex_op != FUTEX_WAIT_BITSET &&
+	    futex_op != FUTEX_WAIT_REQUEUE_PI)
+		return 0;
+
+	// The task is either about to wait because it hit FUTEX_WAIT on the
+	// slow path, or it hit the fast path. The fast path is irrelevant for
+	// our purposes as we have no scheduler input there, so it's safe to
+	// delay our work until a struct_ops .runnable callback comes along.
+	taskc->pending_trait = CHAOS_TRAIT_FUTEX_DELAYS;
+	taskc->futex_uaddr = (u64)ctx->uaddr;
+
+	return 0;
+}
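+// The "?" prefix in SEC() marks this program as not loaded automatically. A
+// minimal sketch of how userspace could opt in via libbpf ('skel' and
+// 'futex_delays_enabled' are hypothetical names):
+//
+//	bpf_program__set_autoload(skel->progs.rtp_sys_enter_futex,
+//				  futex_delays_enabled);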
+
SCX_OPS_DEFINE(chaos,
	       .dispatch		= (void *)chaos_dispatch,
	       .enqueue			= (void *)chaos_enqueue,