From 563b0d4619df45ea81d60545949da012bafca6cf Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Thu, 19 Jun 2025 23:11:17 -0400
Subject: [PATCH 1/2] bpf/arena: add bpf_arena_guard_pages kfunc

Add a new BPF arena kfunc for protecting a range of pages. These pages
cannot be allocated, either explicitly through bpf_arena_alloc_pages()
or implicitly through userspace page faults.
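
A rough usage sketch from a BPF program, mirroring the selftests added
in the next patch (the "arena" map name and the NUMA_NO_NODE and
__PAGE_SIZE definitions come from the selftest headers and are
assumptions of this snippet):

        char __arena *page;
        int ret;

        page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
        if (!page)
                return 1;

        /* Guard the page right after the one just allocated. */
        ret = bpf_arena_guard_pages(&arena, page + __PAGE_SIZE, 1);
        if (ret)
                return 2;

        /* Explicitly allocating the guarded page now fails... */
        page = bpf_arena_alloc_pages(&arena, page + __PAGE_SIZE, 1, NUMA_NO_NODE, 0);
        /* ...and a user space fault on it delivers SIGSEGV instead of mapping it. */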

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
---
 kernel/bpf/arena.c | 95 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 92 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 0d56cea716022..2f9293eb71517 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -48,6 +48,7 @@ struct bpf_arena {
         u64 user_vm_end;
         struct vm_struct *kern_vm;
         struct range_tree rt;
+        struct range_tree rt_guard;
         struct list_head vma_list;
         struct mutex lock;
 };
@@ -143,6 +144,20 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
                 bpf_map_area_free(arena);
                 goto err;
         }
+
+        /*
+         * Use the same semantics as the main range tree to reuse
+         * its methods: Present ranges are all unguarded, while
+         * absent ones are guarded.
+         */
+        range_tree_init(&arena->rt_guard);
+        err = range_tree_set(&arena->rt_guard, 0, attr->max_entries);
+        if (err) {
+                range_tree_destroy(&arena->rt);
+                bpf_map_area_free(arena);
+                goto err;
+        }
+
         mutex_init(&arena->lock);

         return &arena->map;
@@ -193,6 +208,7 @@ static void arena_map_free(struct bpf_map *map)
         apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
                                      KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
         free_vm_area(arena->kern_vm);
+        range_tree_destroy(&arena->rt_guard);
         range_tree_destroy(&arena->rt);
         bpf_map_area_free(arena);
 }
@@ -282,6 +298,11 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
                 /* User space requested to segfault when page is not allocated by bpf prog */
                 return VM_FAULT_SIGSEGV;

+        /* Make sure the page is not guarded. */
+        ret = is_range_tree_set(&arena->rt_guard, vmf->pgoff, 1);
+        if (ret)
+                return VM_FAULT_SIGSEGV;
+
         ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
         if (ret)
                 return VM_FAULT_SIGSEGV;
@@ -456,12 +477,17 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
                 ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
                 if (ret)
                         goto out_free_pages;
-                ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
         } else {
                 ret = pgoff = range_tree_find(&arena->rt, page_cnt);
-                if (pgoff >= 0)
-                        ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+                if (pgoff < 0)
+                        goto out_free_pages;
         }
+
+        ret = is_range_tree_set(&arena->rt_guard, pgoff, page_cnt);
+        if (ret)
+                goto out_free_pages;
+
+        ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
         if (ret)
                 goto out_free_pages;

@@ -512,6 +538,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
         u64 full_uaddr, uaddr_end;
         long kaddr, pgoff, i;
         struct page *page;
+        int ret;

         /* only aligned lower 32-bit are relevant */
         uaddr = (u32)uaddr;
@@ -525,7 +552,14 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)

         guard(mutex)(&arena->lock);

+        pgoff = compute_pgoff(arena, uaddr);
+
+        /* Do not free regions that include guarded pages. */
+        ret = is_range_tree_set(&arena->rt_guard, pgoff, page_cnt);
+        if (ret)
+                return;
+
         /* clear range */
         range_tree_set(&arena->rt, pgoff, page_cnt);

@@ -550,6 +584,46 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
         }
 }

+static int arena_guard_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
+{
+        long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+        long pgoff;
+        int ret;
+
+        if (uaddr & ~PAGE_MASK)
+                return 0;
+
+        pgoff = compute_pgoff(arena, uaddr);
+        if (pgoff + page_cnt > page_cnt_max)
+                return -EINVAL;
+
+        guard(mutex)(&arena->lock);
+
+        /* Make sure we have not already guarded the pages. */
+        ret = is_range_tree_set(&arena->rt_guard, pgoff, page_cnt);
+        if (ret)
+                return -EALREADY;
+
+        /* Cannot guard already allocated pages. */
+        ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
+        if (ret)
+                return -EINVAL;
+
+        /* Reserve the region. */
+        ret = range_tree_clear(&arena->rt_guard, pgoff, page_cnt);
+        if (ret)
+                return ret;
+
+        /* Also "allocate" the region to prevent it from being allocated. */
+        ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+        if (ret) {
+                range_tree_set(&arena->rt_guard, pgoff, page_cnt);
+                return ret;
+        }
+
+        return 0;
+}
+
 __bpf_kfunc_start_defs();

 __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
@@ -573,11 +647,26 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt
                 return;
         arena_free_pages(arena, (long)ptr__ign, page_cnt);
 }
+
+__bpf_kfunc int bpf_arena_guard_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+        struct bpf_map *map = p__map;
+        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+        if (map->map_type != BPF_MAP_TYPE_ARENA)
+                return -EINVAL;
+
+        if (!page_cnt)
+                return 0;
+
+        return arena_guard_pages(arena, (long)ptr__ign, page_cnt);
+}
 __bpf_kfunc_end_defs();

 BTF_KFUNCS_START(arena_kfuncs)
 BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
 BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_guard_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
 BTF_KFUNCS_END(arena_kfuncs)

 static const struct btf_kfunc_id_set common_kfunc_set = {

From c538963b3b59d4ba83e9cde6411437cb9367814e Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Thu, 19 Jun 2025 23:11:18 -0400
Subject: [PATCH 2/2] selftests/bpf: add selftests for bpf_arena_guard_pages

Add selftests for the new bpf_arena_guard_pages kfunc.
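
The new programs should be picked up by the existing verifier_arena and
verifier_arena_large entries in test_progs, so they can be run with,
e.g. (command assumed from the usual selftest workflow, after building
in tools/testing/selftests/bpf):

        ./test_progs -t verifier_arena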

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
---
 .../testing/selftests/bpf/bpf_arena_common.h  |   3 +
 .../selftests/bpf/progs/verifier_arena.c      | 106 ++++++++++++++++++
 .../bpf/progs/verifier_arena_large.c          |  93 +++++++++++++++
 3 files changed, 202 insertions(+)

diff --git a/tools/testing/selftests/bpf/bpf_arena_common.h b/tools/testing/selftests/bpf/bpf_arena_common.h
index 68a51dcc06692..339de1719bc7d 100644
--- a/tools/testing/selftests/bpf/bpf_arena_common.h
+++ b/tools/testing/selftests/bpf/bpf_arena_common.h
@@ -46,8 +46,11 @@
 void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
                                     int node_id, __u64 flags) __ksym __weak;
+int bpf_arena_guard_pages(void *map, void __arena *addr, __u32 page_cnt) __ksym __weak;
 void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;

+#define arena_base(map) ((void __arena *)((struct bpf_arena *)(map))->user_vm_start)
+
 #else /* when compiled as user space code */

 #define __arena
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c
index 67509c5d3982a..af175dc89a8c9 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena.c
@@ -114,6 +114,112 @@ int basic_alloc3(void *ctx)
         return 0;
 }

+SEC("syscall")
+__success __retval(0)
+int basic_guard1(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+        char __arena *page;
+        int ret;
+
+        page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+        if (!page)
+                return 1;
+
+        page += __PAGE_SIZE;
+
+        /* Guard the second page */
+        ret = bpf_arena_guard_pages(&arena, page, 1);
+        if (ret)
+                return 2;
+
+        /* Try to explicitly allocate the guarded page. */
+        page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0);
+        if (page)
+                return 3;
+
+        /* Try to implicitly allocate the page (since there's only 2 of them). */
+        page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+        if (page)
+                return 4;
+#endif
+        return 0;
+}
+
+SEC("syscall")
+__success __retval(0)
+int basic_guard2(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+        char __arena *page;
+        int ret;
+
+        page = arena_base(&arena);
+        ret = bpf_arena_guard_pages(&arena, page, 1);
+        if (ret)
+                return 1;
+
+        page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0);
+        if ((u64)page)
+                return 2;
+#endif
+        return 0;
+}
+
+/* Guard the same page twice, should return -EALREADY. */
+SEC("syscall")
+__success __retval(0)
+int guard_twice(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+        char __arena *page;
+        int ret;
+
+        page = arena_base(&arena);
+
+        ret = bpf_arena_guard_pages(&arena, page, 1);
+        if (ret)
+                return 1;
+
+        /* Should be -EALREADY. */
+        ret = bpf_arena_guard_pages(&arena, page, 1);
+        if (ret != -114)
+                return 2;
+#endif
+        return 0;
+}
+
+/* Try to add a guard past the end of the arena. */
+SEC("syscall")
+__success __retval(0)
+int guard_invalid_region(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+        char __arena *page;
+        int ret;
+
+        /* Try a NULL pointer. */
+        ret = bpf_arena_guard_pages(&arena, NULL, 3);
+        if (ret != -22)
+                return 1;
+
+        page = arena_base(&arena);
+
+        ret = bpf_arena_guard_pages(&arena, page, 3);
+        if (ret != -22)
+                return 2;
+
+        ret = bpf_arena_guard_pages(&arena, page, 4096);
+        if (ret != -22)
+                return 3;
+
+        ret = bpf_arena_guard_pages(&arena, page, (1ULL << 32) - 1);
+        if (ret != -22)
+                return 4;
+#endif
+        return 0;
+}
+
 SEC("iter.s/bpf_map")
 __success __log_level(2)
 int iter_maps1(struct bpf_iter__bpf_map *ctx)
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
index f94f30cf1bb80..cf76acd72ed1b 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -67,6 +67,99 @@ int big_alloc1(void *ctx)
         return 0;
 }

+/* Try to access a guarded page. Behavior should be identical with accessing unallocated pages. */
+SEC("syscall")
+__success __retval(0)
+int access_guarded(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+        volatile char __arena *page;
+        char __arena *base;
+        const size_t len = 4;
+        int ret, i;
+
+        /* Get a separate region of the arena. */
+        page = base = arena_base(&arena) + 16384 * PAGE_SIZE;
+
+        ret = bpf_arena_guard_pages(&arena, base, len);
+        if (ret)
+                return 1;
+
+        /* Try to dirty guarded memory. */
+        for (i = 0; i < len && can_loop; i++)
+                *page = 0x5a;
+
+        for (i = 0; i < len && can_loop; i++) {
+                page = (volatile char __arena *)(base + i * PAGE_SIZE);
+
+                /*
+                 * Error out in case either the write went through,
+                 * or the address has random garbage.
+                 */
+                if (*page == 0x5a)
+                        return 2 + 2 * i;
+
+                if (*page)
+                        return 2 + 2 * i + 1;
+        }
+#endif
+        return 0;
+}
+
+/* Try to allocate a region overlapping with a guard. */
+SEC("syscall")
+__success __retval(0)
+int request_partially_guarded(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+        volatile char __arena *page;
+        char __arena *base;
+        int ret;
+
+        /* Add an arbitrary page offset. */
+        page = base = arena_base(&arena) + 4096 * __PAGE_SIZE;
+
+        ret = bpf_arena_guard_pages(&arena, base + 3 * __PAGE_SIZE, 4);
+        if (ret)
+                return 1;
+
+        page = bpf_arena_alloc_pages(&arena, base, 5, NUMA_NO_NODE, 0);
+        if ((u64)page != 0ULL)
+                return 2;
+#endif
+        return 0;
+}
+
+SEC("syscall")
+__success __retval(0)
+int free_guarded(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+        char __arena *addr;
+        char __arena *page;
+        int ret;
+
+        /* Add an arbitrary page offset. */
+        addr = arena_base(&arena) + 32768 * __PAGE_SIZE;
+
+        page = bpf_arena_alloc_pages(&arena, addr, 4, NUMA_NO_NODE, 0);
+        if (!page)
+                return 1;
+
+        ret = bpf_arena_guard_pages(&arena, addr + 4 * __PAGE_SIZE, 4);
+        if (ret)
+                return 2;
+
+        bpf_arena_free_pages(&arena, addr + 3 * __PAGE_SIZE, 2);
+
+        /* The free pages call above should have failed, so this allocation should fail too. */
+        page = bpf_arena_alloc_pages(&arena, addr + 3 * __PAGE_SIZE, 1, NUMA_NO_NODE, 0);
+        if (page)
+                return 3;
+#endif
+        return 0;
+}
+
 #if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
 #define PAGE_CNT 100
 __u8 __arena * __arena page[PAGE_CNT]; /* occupies the first page */