@@ -325,9 +325,6 @@ union bpf_iter_link_info {
325325 * **BPF_PROG_TYPE_SK_LOOKUP**
326326 * *data_in* and *data_out* must be NULL.
327327 *
328- * **BPF_PROG_TYPE_XDP**
329- * *ctx_in* and *ctx_out* must be NULL.
330- *
331328 * **BPF_PROG_TYPE_RAW_TRACEPOINT**,
332329 * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE**
333330 *
@@ -528,6 +525,15 @@ union bpf_iter_link_info {
528525 * Look up an element with the given *key* in the map referred to
529526 * by the file descriptor *fd*, and if found, delete the element.
530527 *
528+ * For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map
529+ * types, the *flags* argument needs to be set to 0, but for other
530+ * map types, it may be specified as:
531+ *
532+ * **BPF_F_LOCK**
533+ * Look up and delete the value of a spin-locked map
534+ * without returning the lock. This must be specified if
535+ * the elements contain a spinlock.
536+ *
531537 * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
532538 * implement this command as a "pop" operation, deleting the top
533539 * element rather than one corresponding to *key*.
@@ -537,6 +543,10 @@ union bpf_iter_link_info {
537543 * This command is only valid for the following map types:
538544 * * **BPF_MAP_TYPE_QUEUE**
539545 * * **BPF_MAP_TYPE_STACK**
546+ * * **BPF_MAP_TYPE_HASH**
547+ * * **BPF_MAP_TYPE_PERCPU_HASH**
548+ * * **BPF_MAP_TYPE_LRU_HASH**
549+ * * **BPF_MAP_TYPE_LRU_PERCPU_HASH**
540550 *
541551 * Return
542552 * Returns zero on success. On error, -1 is returned and *errno*
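A minimal user-space sketch of how this extended command might be driven with
BPF_F_LOCK against a hash map whose values embed a struct bpf_spin_lock; the
raw-syscall wrapper, map fd and key/value layout are assumptions, not part of
this patch.

#include <linux/bpf.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
        return syscall(__NR_bpf, cmd, attr, size);
}

/* Look up, copy out and delete the element for *key*; the value buffer must
 * match the map's value_size, and the element's spin lock is taken while the
 * value is copied out.
 */
static long lookup_and_delete_locked(int map_fd, const void *key, void *value)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.key = (__u64)(unsigned long)key;
        attr.value = (__u64)(unsigned long)value;
        attr.flags = BPF_F_LOCK; /* elements contain a bpf_spin_lock */

        return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr));
}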
@@ -838,6 +848,7 @@ enum bpf_cmd {
838848 BPF_PROG_ATTACH ,
839849 BPF_PROG_DETACH ,
840850 BPF_PROG_TEST_RUN ,
851+ BPF_PROG_RUN = BPF_PROG_TEST_RUN ,
841852 BPF_PROG_GET_NEXT_ID ,
842853 BPF_MAP_GET_NEXT_ID ,
843854 BPF_PROG_GET_FD_BY_ID ,
@@ -938,6 +949,7 @@ enum bpf_prog_type {
938949 BPF_PROG_TYPE_EXT ,
939950 BPF_PROG_TYPE_LSM ,
940951 BPF_PROG_TYPE_SK_LOOKUP ,
952+ BPF_PROG_TYPE_SYSCALL , /* a program that can execute syscalls */
941953};
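A hedged sketch of what a program of the new BPF_PROG_TYPE_SYSCALL type might
look like, creating a map from inside BPF via the bpf_sys_bpf() helper
documented further below; the section name, context type and map parameters
are assumptions, not code from this patch.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("syscall")
int create_map(void *ctx)
{
        /* attr lives in the program's global data section */
        static union bpf_attr map_attr = {
                .map_type = BPF_MAP_TYPE_HASH,
                .key_size = 4,
                .value_size = 8,
                .max_entries = 16,
        };

        /* Returns the new map fd on success or a negative errno. */
        return bpf_sys_bpf(BPF_MAP_CREATE, &map_attr, sizeof(map_attr));
}

char _license[] SEC("license") = "GPL";

Such a program is executed from user space through BPF_PROG_RUN, the alias for
BPF_PROG_TEST_RUN added above.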
942954
943955enum bpf_attach_type {
@@ -980,6 +992,8 @@ enum bpf_attach_type {
980992 BPF_SK_LOOKUP ,
981993 BPF_XDP ,
982994 BPF_SK_SKB_VERDICT ,
995+ BPF_SK_REUSEPORT_SELECT ,
996+ BPF_SK_REUSEPORT_SELECT_OR_MIGRATE ,
983997 __MAX_BPF_ATTACH_TYPE
984998};
985999
@@ -1098,24 +1112,28 @@ enum bpf_link_type {
10981112/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
10991113 * the following extensions:
11001114 *
1101- * insn[0].src_reg: BPF_PSEUDO_MAP_FD
1102- * insn[0].imm: map fd
1115+ * insn[0].src_reg: BPF_PSEUDO_MAP_[FD|IDX]
1116+ * insn[0].imm: map fd or fd_idx
11031117 * insn[1].imm: 0
11041118 * insn[0].off: 0
11051119 * insn[1].off: 0
11061120 * ldimm64 rewrite: address of map
11071121 * verifier type: CONST_PTR_TO_MAP
11081122 */
11091123#define BPF_PSEUDO_MAP_FD 1
1110- /* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE
1111- * insn[0].imm: map fd
1124+ #define BPF_PSEUDO_MAP_IDX 5
1125+
1126+ /* insn[0].src_reg: BPF_PSEUDO_MAP_[IDX_]VALUE
1127+ * insn[0].imm: map fd or fd_idx
11121128 * insn[1].imm: offset into value
11131129 * insn[0].off: 0
11141130 * insn[1].off: 0
11151131 * ldimm64 rewrite: address of map[0]+offset
11161132 * verifier type: PTR_TO_MAP_VALUE
11171133 */
1118- #define BPF_PSEUDO_MAP_VALUE 2
1134+ #define BPF_PSEUDO_MAP_VALUE 2
1135+ #define BPF_PSEUDO_MAP_IDX_VALUE 6
1136+
11191137/* insn[0].src_reg: BPF_PSEUDO_BTF_ID
11201138 * insn[0].imm: kernel btd id of VAR
11211139 * insn[1].imm: 0
@@ -1315,6 +1333,8 @@ union bpf_attr {
13151333 /* or valid module BTF object fd or 0 to attach to vmlinux */
13161334 __u32 attach_btf_obj_fd ;
13171335 };
1336+ __u32 :32 ; /* pad */
1337+ __aligned_u64 fd_array ; /* array of FDs */
13181338 };
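To illustrate the new fd_idx/fd_array plumbing, a hedged user-space sketch of
a loader emitting the two-instruction ldimm64 described above with
BPF_PSEUDO_MAP_IDX; the helper name is an assumption, not part of this patch.

#include <linux/bpf.h>
#include <string.h>

/* Emit "dst_reg = address of the map whose fd is fd_array[fd_idx]"
 * as a 16-byte ldimm64 instruction pair.
 */
static void emit_ld_map_idx(struct bpf_insn insn[2], __u8 dst_reg, __u32 fd_idx)
{
        memset(insn, 0, 2 * sizeof(*insn));
        insn[0].code = BPF_LD | BPF_DW | BPF_IMM;
        insn[0].dst_reg = dst_reg;
        insn[0].src_reg = BPF_PSEUDO_MAP_IDX; /* imm holds an index, not an fd */
        insn[0].imm = fd_idx;                 /* index into attr.fd_array */
        /* insn[1].imm and both off fields must remain 0, as documented above */
}

At BPF_PROG_LOAD time the fds of the referenced maps are then supplied as an
array pointed to by the new attr.fd_array field.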
13191339
13201340 struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -2535,8 +2555,12 @@ union bpf_attr {
25352555 * The lower two bits of *flags* are used as the return code if
25362556 * the map lookup fails. This is so that the return value can be
25372557 * one of the XDP program return codes up to **XDP_TX**, as chosen
2538- * by the caller. Any higher bits in the *flags* argument must be
2539- * unset.
2558+ * by the caller. The higher bits of *flags* can be set to
2559+ * **BPF_F_BROADCAST** or **BPF_F_EXCLUDE_INGRESS** as defined below.
2560+ *
2561+ * With **BPF_F_BROADCAST** the packet will be broadcast to all the
2562+ * interfaces in the map, and with **BPF_F_EXCLUDE_INGRESS** the
2563+ * ingress interface will be excluded from the broadcast.
25402564 *
25412565 * See also **bpf_redirect**\ (), which only supports redirecting
25422566 * to an ifindex, but doesn't require a map to do so.
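A hedged sketch of an XDP program exercising the new flags with a devmap; the
map name, sizes and fallback action are assumptions, not code from this patch.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP);
        __uint(max_entries, 32);
        __type(key, __u32);
        __type(value, __u32);
} tx_ports SEC(".maps");

SEC("xdp")
int xdp_broadcast(struct xdp_md *ctx)
{
        /* Clone the frame to every interface in tx_ports except the one it
         * arrived on; the key (0) is ignored when broadcasting, and the lower
         * two bits (XDP_PASS) are the return code used on lookup failure.
         */
        return bpf_redirect_map(&tx_ports, 0,
                                BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS | XDP_PASS);
}

char _license[] SEC("license") = "GPL";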
@@ -3223,7 +3247,7 @@ union bpf_attr {
32233247 * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
32243248 * Description
32253249 * Select a **SO_REUSEPORT** socket from a
3226- * **BPF_MAP_TYPE_REUSEPORT_ARRAY ** *map*.
3250+ * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY ** *map*.
32273251 * It checks the selected socket is matching the incoming
32283252 * request in the socket buffer.
32293253 * Return
@@ -4736,6 +4760,94 @@ union bpf_attr {
47364760 * be zero-terminated except when **str_size** is 0.
47374761 *
47384762 * Or **-EBUSY** if the per-CPU memory copy buffer is busy.
4763+ *
4764+ * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size)
4765+ * Description
4766+ * Execute the bpf syscall with the given arguments.
4767+ * Return
4768+ * A syscall result.
4769+ *
4770+ * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags)
4771+ * Description
4772+ * Find a BTF type with the given name and kind in vmlinux BTF or in a module's BTF.
4773+ * Return
4774+ * Returns the btf_id and btf_obj_fd in the lower and upper 32 bits, respectively.
4775+ *
4776+ * long bpf_sys_close(u32 fd)
4777+ * Description
4778+ * Execute the close syscall for the given FD.
4779+ * Return
4780+ * A syscall result.
4781+ *
4782+ * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags)
4783+ * Description
4784+ * Initialize the timer.
4785+ * The first 4 bits of *flags* specify the clockid.
4786+ * Only **CLOCK_MONOTONIC**, **CLOCK_REALTIME** and **CLOCK_BOOTTIME** are allowed.
4787+ * All other bits of *flags* are reserved.
4788+ * The verifier will reject the program if *timer* is not from
4789+ * the same *map*.
4790+ * Return
4791+ * 0 on success.
4792+ * **-EBUSY** if *timer* is already initialized.
4793+ * **-EINVAL** if invalid *flags* are passed.
4794+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
4795+ * User space should either hold a file descriptor to a map with timers
4796+ * or pin such a map in bpffs. When the map is unpinned or the file
4797+ * descriptor is closed, all timers in the map will be cancelled and freed.
4798+ *
4799+ * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn)
4800+ * Description
4801+ * Configure the timer to call the *callback_fn* static function.
4802+ * Return
4803+ * 0 on success.
4804+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
4805+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
4806+ * User space should either hold a file descriptor to a map with timers
4807+ * or pin such a map in bpffs. When the map is unpinned or the file
4808+ * descriptor is closed, all timers in the map will be cancelled and freed.
4809+ *
4810+ * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags)
4811+ * Description
4812+ * Set the timer expiration *nsecs* nanoseconds from the current
4813+ * time. The configured callback will be invoked in soft irq context
4814+ * on some cpu and will not repeat unless another bpf_timer_start()
4815+ * call is made, in which case the next invocation can migrate to a
4816+ * different cpu. Since *struct bpf_timer* is a field inside a map
4817+ * element, the map owns the timer. bpf_timer_set_callback() will
4818+ * increment the refcnt of the BPF program to make sure that the
4819+ * callback_fn code stays valid. When the user space reference to a
4820+ * map reaches zero, all timers in the map are cancelled and the
4821+ * corresponding programs' refcnts are decremented. This is done to
4822+ * make sure that a Ctrl-C of a user process doesn't leave any
4823+ * timers running. If the map is pinned in bpffs, callback_fn can
4824+ * re-arm itself indefinitely. The bpf_map_update/delete_elem()
4825+ * helpers and user space sys_bpf commands cancel and free the
4826+ * timer in the given map element. The map can contain timers that
4827+ * invoke callback_fn-s from different programs. The same callback_fn
4828+ * can serve different timers from different maps if the key/value
4829+ * layout matches across maps. Every bpf_timer_set_callback() call
4830+ * can have a different callback_fn.
4831+ * Return
4832+ * 0 on success.
4833+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
4834+ * or invalid *flags* are passed.
4835+ *
4836+ * long bpf_timer_cancel(struct bpf_timer *timer)
4837+ * Description
4838+ * Cancel the timer and wait for callback_fn to finish if it was running.
4839+ * Return
4840+ * 0 if the timer was not active.
4841+ * 1 if the timer was active.
4842+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
4843+ * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
4844+ * own timer which would have led to a deadlock otherwise.
4845+ *
4846+ * u64 bpf_get_func_ip(void *ctx)
4847+ * Description
4848+ * Get address of the traced function (for tracing and kprobe programs).
4849+ * Return
4850+ * Address of the traced function.
47394851 */
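To make the timer lifecycle described above concrete, here is a hedged sketch
of the usual pattern; the map layout, section, names and timeout are
illustrative assumptions, not code from this patch.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct elem {
        struct bpf_timer t;
};

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, struct elem);
} timer_map SEC(".maps");

/* Invoked later in soft irq context; may re-arm val->t from here. */
static int timer_cb(void *map, __u32 *key, struct elem *val)
{
        return 0;
}

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(arm_timer, int a)
{
        __u32 key = 0;
        struct elem *val;

        val = bpf_map_lookup_elem(&timer_map, &key);
        if (!val)
                return 0;

        bpf_timer_init(&val->t, &timer_map, 1 /* CLOCK_MONOTONIC */);
        bpf_timer_set_callback(&val->t, timer_cb);
        bpf_timer_start(&val->t, 1000000000 /* 1 second */, 0);
        return 0;
}

char _license[] SEC("license") = "GPL";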
47404852#define __BPF_FUNC_MAPPER (FN ) \
47414853 FN(unspec), \
@@ -4904,6 +5016,14 @@ union bpf_attr {
49045016 FN(check_mtu), \
49055017 FN(for_each_map_elem), \
49065018 FN(snprintf), \
5019+ FN(sys_bpf), \
5020+ FN(btf_find_by_name_kind), \
5021+ FN(sys_close), \
5022+ FN(timer_init), \
5023+ FN(timer_set_callback), \
5024+ FN(timer_start), \
5025+ FN(timer_cancel), \
5026+ FN(get_func_ip), \
49075027 /* */
49085028
49095029/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5081,6 +5201,12 @@ enum {
50815201 BPF_F_BPRM_SECUREEXEC = (1ULL << 0 ),
50825202};
50835203
5204+ /* Flags for bpf_redirect_map helper */
5205+ enum {
5206+ BPF_F_BROADCAST = (1ULL << 3 ),
5207+ BPF_F_EXCLUDE_INGRESS = (1ULL << 4 ),
5208+ };
5209+
50845210#define __bpf_md_ptr (type , name ) \
50855211union { \
50865212 type name; \
@@ -5365,6 +5491,20 @@ struct sk_reuseport_md {
53655491 __u32 ip_protocol ; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
53665492 __u32 bind_inany ; /* Is sock bound to an INANY address? */
53675493 __u32 hash ; /* A hash of the packet 4 tuples */
5494+ /* When reuse->migrating_sk is NULL, the bpf prog is selecting a sk
5495+ * for the new incoming connection request (e.g. selecting a listen sk
5496+ * for the received SYN in the TCP case). reuse->sk is one of the sks
5497+ * in the reuseport group. The bpf prog can use reuse->sk to learn
5498+ * the local listening ip/port without looking into the skb.
5499+ *
5500+ * When reuse->migrating_sk is not NULL, reuse->sk is closed and
5501+ * reuse->migrating_sk is the socket that needs to be migrated
5502+ * to another listening socket. migrating_sk could be a fullsock
5503+ * sk that is fully established or a reqsk that is in the middle
5504+ * of the 3-way handshake.
5505+ */
5506+ __bpf_md_ptr (struct bpf_sock * , sk );
5507+ __bpf_md_ptr (struct bpf_sock * , migrating_sk );
53685508};
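A hedged sketch of a BPF_PROG_TYPE_SK_REUSEPORT program attached as
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE that uses the new migrating_sk field
together with bpf_sk_select_reuseport(); the map, section name and migration
policy are assumptions, not code from this patch.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, __u64);
} target_map SEC(".maps");

SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *reuse)
{
        __u32 zero = 0;

        /* migrating_sk == NULL: ordinary listener selection for a new
         * request; fall back to the kernel's default choice here.
         */
        if (!reuse->migrating_sk)
                return SK_PASS;

        /* The listener (reuse->sk) is being closed: steer the child or
         * request socket to the listener stored in target_map instead of
         * letting it be dropped.
         */
        if (!bpf_sk_select_reuseport(reuse, &target_map, &zero, 0))
                return SK_PASS;

        return SK_DROP;
}

char _license[] SEC("license") = "GPL";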
53695509
53705510#define BPF_TAG_SIZE 8
@@ -6010,6 +6150,11 @@ struct bpf_spin_lock {
60106150 __u32 val ;
60116151};
60126152
6153+ struct bpf_timer {
6154+ __u64 :64 ;
6155+ __u64 :64 ;
6156+ } __attribute__((aligned (8 )));
6157+
60136158struct bpf_sysctl {
60146159 __u32 write ; /* Sysctl is being read (= 0) or written (= 1).
60156160 * Allows 1,2,4-byte read, but no write.