bpf/arena: Add kfunc for reserving arena guard memory regions #5497

Closed
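
This change adds a bpf_arena_guard_pages() kfunc that reserves a range of arena pages as a guard region: the pages can no longer be handed out by bpf_arena_alloc_pages(), a bpf_arena_free_pages() call covering them is rejected, and a user-space fault on them raises SIGSEGV. Guarded ranges are tracked in a second range tree (rt_guard) alongside the existing free-page tree.

For orientation, a minimal usage sketch from a BPF program follows. It is not part of the patch and assumes the selftest scaffolding shown further down (an arena map named "arena" plus bpf_arena_common.h, which provides the kfunc declarations, __PAGE_SIZE and NUMA_NO_NODE):

SEC("syscall")
int guard_usage_sketch(void *ctx)
{
	char __arena *page;
	int ret;

	/* Allocate one page anywhere in the arena. */
	page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
	if (!page)
		return 1;

	/*
	 * Guard the page right after it (assuming the arena is large
	 * enough): later allocations overlapping it fail, and faulting
	 * it from user space segfaults.
	 */
	ret = bpf_arena_guard_pages(&arena, page + __PAGE_SIZE, 1);
	if (ret)
		return 2;

	return 0;
}
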
95 changes: 92 additions & 3 deletions kernel/bpf/arena.c
@@ -48,6 +48,7 @@ struct bpf_arena {
u64 user_vm_end;
struct vm_struct *kern_vm;
struct range_tree rt;
struct range_tree rt_guard;
struct list_head vma_list;
struct mutex lock;
};
@@ -143,6 +144,20 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
bpf_map_area_free(arena);
goto err;
}

/*
* Use the same semantics as the main range tree to reuse
* its methods: Present ranges are all unguarded, while
* absent ones are guarded.
*/
range_tree_init(&arena->rt_guard);
err = range_tree_set(&arena->rt_guard, 0, attr->max_entries);
if (err) {
range_tree_destroy(&arena->rt);
bpf_map_area_free(arena);
goto err;
}

mutex_init(&arena->lock);

return &arena->map;
@@ -193,6 +208,7 @@ static void arena_map_free(struct bpf_map *map)
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt_guard);
range_tree_destroy(&arena->rt);
bpf_map_area_free(arena);
}
@@ -282,6 +298,11 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
/* User space requested to segfault when page is not allocated by bpf prog */
return VM_FAULT_SIGSEGV;

/* Make sure the page is not guarded. */
ret = is_range_tree_set(&arena->rt_guard, vmf->pgoff, 1);
if (ret)
return VM_FAULT_SIGSEGV;

ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
return VM_FAULT_SIGSEGV;
@@ -456,12 +477,17 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
if (ret)
goto out_free_pages;
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
} else {
ret = pgoff = range_tree_find(&arena->rt, page_cnt);
if (pgoff >= 0)
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
if (pgoff < 0)
goto out_free_pages;
}

ret = is_range_tree_set(&arena->rt_guard, pgoff, page_cnt);
if (ret)
goto out_free_pages;

ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
if (ret)
goto out_free_pages;

@@ -512,6 +538,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
u64 full_uaddr, uaddr_end;
long kaddr, pgoff, i;
struct page *page;
int ret;

/* only aligned lower 32-bit are relevant */
uaddr = (u32)uaddr;
@@ -525,7 +552,14 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)

guard(mutex)(&arena->lock);


pgoff = compute_pgoff(arena, uaddr);

/* Do not free regions that include guarded pages. */
ret = is_range_tree_set(&arena->rt_guard, pgoff, page_cnt);
if (ret)
return;

/* clear range */
range_tree_set(&arena->rt, pgoff, page_cnt);

@@ -550,6 +584,46 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
}
}

static int arena_guard_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
{
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
long pgoff;
int ret;

if (uaddr & ~PAGE_MASK)
return 0;

pgoff = compute_pgoff(arena, uaddr);
if (pgoff + page_cnt > page_cnt_max)
return -EINVAL;

guard(mutex)(&arena->lock);

/* Make sure we have not already guarded the pages. */
ret = is_range_tree_set(&arena->rt_guard, pgoff, page_cnt);
if (ret)
return -EALREADY;

/* Cannot guard already allocated pages. */
ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
if (ret)
return -EINVAL;

/* Reserve the region. */
ret = range_tree_clear(&arena->rt_guard, pgoff, page_cnt);
if (ret)
return ret;

/* Also "allocate" the region to prevent it from being allocated. */
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
if (ret) {
range_tree_set(&arena->rt_guard, pgoff, page_cnt);
return ret;
}

return 0;
}

__bpf_kfunc_start_defs();

__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
@@ -573,11 +647,26 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt
return;
arena_free_pages(arena, (long)ptr__ign, page_cnt);
}

__bpf_kfunc int bpf_arena_guard_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
struct bpf_map *map = p__map;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

if (map->map_type != BPF_MAP_TYPE_ARENA)
return -EINVAL;

if (!page_cnt)
return 0;

return arena_guard_pages(arena, (long)ptr__ign, page_cnt);
}
__bpf_kfunc_end_defs();

BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_guard_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)

static const struct btf_kfunc_id_set common_kfunc_set = {
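
For intuition only, here is a tiny self-contained model (not kernel code; names, sizes and the lack of bounds checking are all simplifications) of the two range trees above: rt holds pages that are still free, rt_guard holds pages that are not guarded, and guarding a range requires it to be present in both trees before clearing it from both. The error values are the ones the selftests below check against (-EALREADY is -114, -EINVAL is -22).

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NPAGES 8

static bool rt[NPAGES];       /* true = page is still free (unallocated) */
static bool rt_guard[NPAGES]; /* true = page is not guarded */

static int guard_pages(long pgoff, long cnt)
{
	long i;

	/* Any page in the range already guarded? Mirrors -EALREADY. */
	for (i = pgoff; i < pgoff + cnt; i++)
		if (!rt_guard[i])
			return -114;

	/* Any page in the range already allocated? Mirrors -EINVAL. */
	for (i = pgoff; i < pgoff + cnt; i++)
		if (!rt[i])
			return -22;

	/* Guard the range and take it out of the free pool. */
	for (i = pgoff; i < pgoff + cnt; i++) {
		rt_guard[i] = false;
		rt[i] = false;
	}
	return 0;
}

int main(void)
{
	memset(rt, 1, sizeof(rt));             /* all pages start out free ... */
	memset(rt_guard, 1, sizeof(rt_guard)); /* ... and unguarded */

	printf("%d\n", guard_pages(2, 2));     /* 0: range is now guarded */
	printf("%d\n", guard_pages(2, 2));     /* -114: already guarded */
	printf("%d\n", guard_pages(4, 1));     /* 0: a disjoint range still works */
	return 0;
}
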
3 changes: 3 additions & 0 deletions tools/testing/selftests/bpf/bpf_arena_common.h
@@ -46,8 +46,11 @@

void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
int node_id, __u64 flags) __ksym __weak;
int bpf_arena_guard_pages(void *map, void __arena *addr, __u32 page_cnt) __ksym __weak;
void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;

#define arena_base(map) ((void __arena *)((struct bpf_arena *)(map))->user_vm_start)

#else /* when compiled as user space code */

#define __arena
106 changes: 106 additions & 0 deletions tools/testing/selftests/bpf/progs/verifier_arena.c
@@ -114,6 +114,112 @@ int basic_alloc3(void *ctx)
return 0;
}

SEC("syscall")
__success __retval(0)
int basic_guard1(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
char __arena *page;
int ret;

page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
if (!page)
return 1;

page += __PAGE_SIZE;

/* Guard the second page. */
ret = bpf_arena_guard_pages(&arena, page, 1);
if (ret)
return 2;

/* Try to explicitly allocate the guarded page. */
page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0);
if (page)
return 3;

/* Try to implicitly allocate the page (since there are only two of them). */
page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
if (page)
return 4;
#endif
return 0;
}

SEC("syscall")
__success __retval(0)
int basic_guard2(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
char __arena *page;
int ret;

page = arena_base(&arena);
ret = bpf_arena_guard_pages(&arena, page, 1);
if (ret)
return 1;

page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0);
if ((u64)page)
return 2;
#endif
return 0;
}

/* Guard the same page twice; the second attempt should return -EALREADY. */
SEC("syscall")
__success __retval(0)
int guard_twice(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
char __arena *page;
int ret;

page = arena_base(&arena);

ret = bpf_arena_guard_pages(&arena, page, 1);
if (ret)
return 1;

/* Should be -EALREADY. */
ret = bpf_arena_guard_pages(&arena, page, 1);
if (ret != -114)
return 2;
#endif
return 0;
}

/* Try to add a guard past the end of the arena. */
SEC("syscall")
__success __retval(0)
int guard_invalid_region(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
char __arena *page;
int ret;

/* Try a NULL pointer. */
ret = bpf_arena_guard_pages(&arena, NULL, 3);
if (ret != -22)
return 1;

page = arena_base(&arena);

ret = bpf_arena_guard_pages(&arena, page, 3);
if (ret != -22)
return 2;

ret = bpf_arena_guard_pages(&arena, page, 4096);
if (ret != -22)
return 3;

ret = bpf_arena_guard_pages(&arena, page, (1ULL << 32) - 1);
if (ret != -22)
return 4;
#endif
return 0;
}

SEC("iter.s/bpf_map")
__success __log_level(2)
int iter_maps1(struct bpf_iter__bpf_map *ctx)
93 changes: 93 additions & 0 deletions tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -67,6 +67,99 @@ int big_alloc1(void *ctx)
return 0;
}

/* Try to access a guarded page. Behavior should be identical to accessing unallocated pages. */
SEC("syscall")
__success __retval(0)
int access_guarded(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
volatile char __arena *page;
char __arena *base;
const size_t len = 4;
int ret, i;

/* Get a separate region of the arena. */
page = base = arena_base(&arena) + 16384 * PAGE_SIZE;

ret = bpf_arena_guard_pages(&arena, base, len);
if (ret)
return 1;

/* Try to dirty guarded memory. */
for (i = 0; i < len && can_loop; i++)
*page = 0x5a;

for (i = 0; i < len && can_loop; i++) {
page = (volatile char __arena *)(base + i * PAGE_SIZE);

/*
* Error out in case either the write went through,
* or the address has random garbage.
*/
if (*page == 0x5a)
return 2 + 2 * i;

if (*page)
return 2 + 2 * i + 1;
}
#endif
return 0;
}

/* Try to allocate a region that overlaps a guarded range. */
SEC("syscall")
__success __retval(0)
int request_partially_guarded(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
volatile char __arena *page;
char __arena *base;
int ret;

/* Add an arbitrary page offset. */
page = base = arena_base(&arena) + 4096 * __PAGE_SIZE;

ret = bpf_arena_guard_pages(&arena, base + 3 * __PAGE_SIZE, 4);
if (ret)
return 1;

page = bpf_arena_alloc_pages(&arena, base, 5, NUMA_NO_NODE, 0);
if ((u64)page != 0ULL)
return 2;
#endif
return 0;
}

SEC("syscall")
__success __retval(0)
int free_guarded(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
char __arena *addr;
char __arena *page;
int ret;

/* Add an arbitrary page offset. */
addr = arena_base(&arena) + 32768 * __PAGE_SIZE;

page = bpf_arena_alloc_pages(&arena, addr, 4, NUMA_NO_NODE, 0);
if (!page)
return 1;

ret = bpf_arena_guard_pages(&arena, addr + 4 * __PAGE_SIZE, 4);
if (ret)
return 2;

bpf_arena_free_pages(&arena, addr + 3 * __PAGE_SIZE, 2);

/* The free call above should have been rejected, so the page is still allocated and this allocation should fail too. */
page = bpf_arena_alloc_pages(&arena, addr + 3 * __PAGE_SIZE, 1, NUMA_NO_NODE, 0);
if (page)
return 3;
#endif
return 0;
}

#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
#define PAGE_CNT 100
__u8 __arena * __arena page[PAGE_CNT]; /* occupies the first page */