From 26c7794b24d689d1d2bbbf4b50f8e48142bc4606 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko
Date: Mon, 9 Jun 2025 18:19:44 +0100
Subject: [PATCH] BPF task work WIP
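
Introduce a bpf_task_work map value field plus two kfuncs,
bpf_task_work_schedule_signal() and bpf_task_work_schedule_resume(),
that let a BPF program schedule a BPF callback to run in the context of
a chosen task via the kernel task_work infrastructure (TWA_SIGNAL vs.
TWA_RESUME/TWA_NMI_CURRENT notification). The field is accepted in
hash, LRU hash and array maps; a small atomic state machine
(EMPTY/BUSY/SCHEDULED/FREED) serializes scheduling against map element
teardown so pending work can be cancelled and freed safely. The
verifier learns about the new field and verifies the async callback the
same way it does for bpf_wq/bpf_timer. Selftests exercise scheduling
from a perf_event program into both map types.

Below is a minimal usage sketch from the BPF side, condensed from the
selftest added by this patch (the element layout, map definition and
process_work() callback are illustrative, not part of the UAPI):

	struct elem {
		struct bpf_task_work tw;
		/* program-specific payload */
	};

	struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__uint(max_entries, 8);
		__type(key, int);
		__type(value, struct elem);
	} hmap SEC(".maps");

	static __u64 process_work(struct bpf_map *map, void *key, void *value)
	{
		/* runs in the target task's context once the task_work fires */
		return 0;
	}

	/* in a program, with a trusted task pointer and a map element: */
	bpf_task_work_schedule_resume(task, &work->tw, (struct bpf_map *)&hmap,
				      (bpf_callback_t)process_work, NULL);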

Signed-off-by: Mykyta Yatsenko
---
 include/linux/bpf.h                            |  11 ++
 include/uapi/linux/bpf.h                       |   4 +
 kernel/bpf/arraymap.c                          |   8 +-
 kernel/bpf/btf.c                               |  15 ++
 kernel/bpf/hashtab.c                           |  22 ++-
 kernel/bpf/helpers.c                           | 173 +++++++++++++++++-
 kernel/bpf/syscall.c                           |  22 ++-
 kernel/bpf/verifier.c                          | 112 +++++++++++-
 tools/include/uapi/linux/bpf.h                 |   4 +
 .../selftests/bpf/prog_tests/test_task_work.c  | 128 +++++++++++++
 tools/testing/selftests/bpf/progs/task_work.c  | 101 ++++++++++
 11 files changed, 582 insertions(+), 18 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_task_work.c
 create mode 100644 tools/testing/selftests/bpf/progs/task_work.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bc887831eaa52..321b1229aca37 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -206,6 +206,7 @@ enum btf_field_type {
 	BPF_WORKQUEUE = (1 << 10),
 	BPF_UPTR = (1 << 11),
 	BPF_RES_SPIN_LOCK = (1 << 12),
+	BPF_TASK_WORK = (1 << 13),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -245,6 +246,7 @@ struct btf_record {
 	int timer_off;
 	int wq_off;
 	int refcount_off;
+	int task_work_off;
 	struct btf_field fields[];
 };
 
@@ -340,6 +342,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "bpf_rb_node";
 	case BPF_REFCOUNT:
 		return "bpf_refcount";
+	case BPF_TASK_WORK:
+		return "bpf_task_work";
 	default:
 		WARN_ON_ONCE(1);
 		return "unknown";
@@ -378,6 +382,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_rb_node);
 	case BPF_REFCOUNT:
 		return sizeof(struct bpf_refcount);
+	case BPF_TASK_WORK:
+		return sizeof(struct bpf_task_work);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -410,6 +416,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_rb_node);
 	case BPF_REFCOUNT:
 		return __alignof__(struct bpf_refcount);
+	case BPF_TASK_WORK:
+		return __alignof__(struct bpf_task_work);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -441,6 +449,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
 	case BPF_UPTR:
+	case BPF_TASK_WORK:
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -577,6 +586,7 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
 			   bool lock_src);
 void bpf_timer_cancel_and_free(void *timer);
 void bpf_wq_cancel_and_free(void *timer);
+void bpf_task_work_cancel_and_free(void *timer);
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock);
 void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
@@ -2390,6 +2400,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj);
+void bpf_obj_free_task_work(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0670e15a6100a..333458d696dfb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7410,6 +7410,10 @@ struct bpf_timer {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
 
+struct bpf_task_work {
+	__u64 __opaque[8];
+} __attribute__((aligned(8)));
+
 struct bpf_wq {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3d080916faf97..4130d8e76dff7 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -431,7 +431,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
 	return (void *)round_down((unsigned long)array, PAGE_SIZE);
 }
 
-static void array_map_free_timers_wq(struct bpf_map *map)
+static void array_map_free_internal_structs(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
@@ -439,12 +439,14 @@ static void array_map_free_timers_wq(struct bpf_map *map)
 	/* We don't reset or free fields other than timer and workqueue
 	 * on uref dropping to zero.
 	 */
-	if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) {
+	if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
 		for (i = 0; i < array->map.max_entries; i++) {
 			if (btf_record_has_field(map->record, BPF_TIMER))
 				bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
 			if (btf_record_has_field(map->record, BPF_WORKQUEUE))
 				bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
+			if (btf_record_has_field(map->record, BPF_TASK_WORK))
+				bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
 		}
 	}
 }
@@ -783,7 +785,7 @@ const struct bpf_map_ops array_map_ops = {
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
-	.map_release_uref = array_map_free_timers_wq,
+	.map_release_uref = array_map_free_internal_structs,
 	.map_lookup_elem = array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2dd13eea7b0ea..cb593ac7223b8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3527,6 +3527,15 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
 			goto end;
 		}
 	}
+	if (field_mask & BPF_TASK_WORK) {
+		if (!strcmp(name, "bpf_task_work")) {
+			if (*seen_mask & BPF_TASK_WORK)
+				return -E2BIG;
+			*seen_mask |= BPF_TASK_WORK;
+			type = BPF_TASK_WORK;
+			goto end;
+		}
+	}
 	field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
 	field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
 	field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
@@ -3693,6 +3702,7 @@ static int btf_find_field_one(const struct btf *btf,
 	case BPF_LIST_NODE:
 	case BPF_RB_NODE:
 	case BPF_REFCOUNT:
+	case BPF_TASK_WORK:
 		ret = btf_find_struct(btf, var_type, off, sz, field_type,
 				      info_cnt ? &info[0] : &tmp);
 		if (ret < 0)
@@ -3985,6 +3995,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 	rec->timer_off = -EINVAL;
 	rec->wq_off = -EINVAL;
 	rec->refcount_off = -EINVAL;
+	rec->task_work_off = -EINVAL;
 	for (i = 0; i < cnt; i++) {
 		field_type_size = btf_field_type_size(info_arr[i].type);
 		if (info_arr[i].off + field_type_size > value_size) {
@@ -4050,6 +4061,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
 			break;
+		case BPF_TASK_WORK:
+			WARN_ON_ONCE(rec->task_work_off >= 0);
+			rec->task_work_off = rec->fields[i].offset;
+			break;
 		default:
 			ret = -EFAULT;
 			goto end;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 71f9931ac64cd..207ad4823b5b0 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -215,7 +215,7 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
 	return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
 }
 
-static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
 {
 	u32 num_entries = htab->map.max_entries;
 	int i;
@@ -233,6 +233,9 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
 		if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
 			bpf_obj_free_workqueue(htab->map.record,
 					       htab_elem_value(elem, htab->map.key_size));
+		if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
+			bpf_obj_free_task_work(htab->map.record,
+					       htab_elem_value(elem, htab->map.key_size));
 		cond_resched();
 	}
 }
@@ -1490,7 +1493,7 @@ static void delete_all_elements(struct bpf_htab *htab)
 	}
 }
 
-static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
 {
 	int i;
 
@@ -1508,22 +1511,25 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
 			if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
 				bpf_obj_free_workqueue(htab->map.record,
 						       htab_elem_value(l, htab->map.key_size));
+			if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
+				bpf_obj_free_task_work(htab->map.record,
+						       htab_elem_value(l, htab->map.key_size));
 		}
 		cond_resched_rcu();
 	}
 	rcu_read_unlock();
 }
 
-static void htab_map_free_timers_and_wq(struct bpf_map *map)
+static void htab_map_free_internal_structs(struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 
 	/* We only free timer and workqueue on uref dropping to zero */
-	if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) {
+	if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
 		if (!htab_is_prealloc(htab))
-			htab_free_malloced_timers_and_wq(htab);
+			htab_free_malloced_internal_structs(htab);
 		else
-			htab_free_prealloced_timers_and_wq(htab);
+			htab_free_prealloced_internal_structs(htab);
 	}
 }
 
@@ -2255,7 +2261,7 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
-	.map_release_uref = htab_map_free_timers_and_wq,
+	.map_release_uref = htab_map_free_internal_structs,
 	.map_lookup_elem = htab_map_lookup_elem,
 	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
 	.map_update_elem = htab_map_update_elem,
@@ -2276,7 +2282,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
-	.map_release_uref = htab_map_free_timers_and_wq,
+	.map_release_uref = htab_map_free_internal_structs,
 	.map_lookup_elem = htab_lru_map_lookup_elem,
 	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
 	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d33181d5e677..54940bdf55435 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -25,7 +25,7 @@
 #include
 #include
 #include
-
+#include <linux/task_work.h>
 #include "../../lib/kstrtox.h"
 
 /* If kernel subsystem is allowing eBPF programs to call this function,
@@ -1138,6 +1138,155 @@ enum bpf_async_type {
 	BPF_ASYNC_TYPE_WQ,
 };
 
+enum bpf_task_work_state {
+	BPF_TW_EMPTY = 0,
+	BPF_TW_BUSY,
+	BPF_TW_SCHEDULED,
+	BPF_TW_FREED,
+};
+
+struct bpf_task_work_context {
+	struct bpf_map *map;
+	atomic_t state;
+	struct bpf_prog *prog;
+	struct task_struct *task;
+	bpf_callback_t callback_fn;
+	struct callback_head work;
+} __attribute__((aligned(8)));
+
+static void bpf_task_work_kv_ptr(struct bpf_map *map, struct bpf_task_work_context *tw,
+				 void **k, void **v)
+{
+	*v = (void *)tw - map->record->task_work_off;
+	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+		u32 *idx = *k;
+
+		*idx = ((char *)*v - array->value) / array->elem_size;
+	} else {
+		*k = *v - round_up(map->key_size, 8);
+	}
+}
+
+static bool task_work_match(struct callback_head *head, void *data)
+{
+	struct bpf_task_work_context *ctx = container_of(head, struct bpf_task_work_context, work);
+
+	return ctx == data;
+}
+
+static void bpf_reset_task_work_context(struct bpf_task_work_context *ctx)
+{
+	bpf_prog_put(ctx->prog);
+	put_task_struct(ctx->task);
+	rcu_assign_pointer(ctx->map, NULL);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+	struct bpf_task_work_context *ctx;
+	struct bpf_map *map;
+	void *key;
+	void *value;
+	u32 idx;
+
+	rcu_read_lock_trace();
+	ctx = container_of(cb, struct bpf_task_work_context, work);
+
+	if (atomic_read(&ctx->state) != BPF_TW_SCHEDULED)
+		goto out; /* work cancelled */
+
+	map = rcu_dereference(ctx->map);
+	if (!map)
+		goto out;
+
+	key = &idx;
+	bpf_task_work_kv_ptr(map, ctx, &key, &value);
+
+	migrate_disable();
+	ctx->callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+	migrate_enable();
+
+	if (atomic_cmpxchg_relaxed(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_BUSY) == BPF_TW_FREED)
+		goto out;
+
+	/* reset to empty state if map element is not freed */
+	bpf_reset_task_work_context(ctx);
+	atomic_cmpxchg_relaxed(&ctx->state, BPF_TW_BUSY, BPF_TW_EMPTY);
+
+out:
+	rcu_read_unlock_trace();
+}
+
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work_context *ctx,
+				  struct bpf_map *map, bpf_callback_t callback_fn,
+				  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+	struct bpf_prog *prog;
+	int err;
+
+	BTF_TYPE_EMIT(struct bpf_task_work);
+
+	prog = bpf_prog_inc_not_zero(aux->prog);
+	if (IS_ERR(prog))
+		return -EPERM;
+	if (!atomic64_read(&map->usercnt)) {
+		bpf_prog_put(prog);
+		return -EPERM;
+	}
+
+	if (atomic_cmpxchg_relaxed(&ctx->state, BPF_TW_EMPTY, BPF_TW_BUSY) != BPF_TW_EMPTY) {
+		bpf_prog_put(prog);
+		return -EBUSY;
+	}
+
+	ctx->work.func = bpf_task_work_callback;
+	ctx->work.next = NULL;
+	ctx->task = get_task_struct(task);
+	ctx->callback_fn = callback_fn;
+	ctx->prog = prog;
+	rcu_assign_pointer(ctx->map, map);
+
+	if (atomic_cmpxchg_relaxed(&ctx->state, BPF_TW_BUSY, BPF_TW_SCHEDULED) != BPF_TW_BUSY) {
+		bpf_reset_task_work_context(ctx);
+		return -EBUSY;
+	}
+
+	err = task_work_add(task, &ctx->work, mode);
+	if (err) {
+		if (atomic_cmpxchg_relaxed(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_EMPTY) == BPF_TW_SCHEDULED)
+			bpf_reset_task_work_context(ctx);
+		return err;
+	}
+
+	if (atomic_read(&ctx->state) == BPF_TW_FREED) {
+		task_work_cancel_match(task, task_work_match, ctx);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+void bpf_task_work_cancel_and_free(void *val)
+{
+	struct bpf_task_work_context *ctx = val;
+	enum bpf_task_work_state state;
+
+	state = atomic_xchg(&ctx->state, BPF_TW_FREED);
+	switch (state) {
+	/* work is not initialized, mark as freed and exit */
+	case BPF_TW_EMPTY:
+		break;
+	/*
+	 * work is being initialized by bpf_task_work_schedule concurrently.
+	 * It's marked freed before getting to BPF_TW_SCHEDULED, so expect
+	 * bpf_task_work_schedule to clean up.
+	 */
+	case BPF_TW_BUSY:
+		break;
+	/* work might have been scheduled, try cancelling and clean up */
+	case BPF_TW_SCHEDULED:
+		task_work_cancel_match(ctx->task, task_work_match, ctx);
+		bpf_reset_task_work_context(ctx);
+		break;
+	default:
+		break;
+	}
+}
+
 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
 
 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
@@ -3698,6 +3847,25 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
 	return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
 }
 
+__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
+					      struct bpf_map *map, bpf_callback_t callback_fn,
+					      void *aux__prog)
+{
+	return bpf_task_work_schedule(task, (struct bpf_task_work_context *)tw, map,
+				      callback_fn, aux__prog, TWA_SIGNAL);
+}
+
+__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
+					      struct bpf_map *map, bpf_callback_t callback_fn,
+					      void *aux__prog)
+{
+	enum task_work_notify_mode mode;
+
+	mode = task == current && in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
+	return bpf_task_work_schedule(task, (struct bpf_task_work_context *)tw, map,
+				      callback_fn, aux__prog, mode);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3723,7 +3891,8 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
-
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
 #ifdef CONFIG_CGROUPS
 BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1a26d17536bef..c912ec5f38cf9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -670,6 +670,7 @@ void btf_record_free(struct btf_record *rec)
 		case BPF_TIMER:
 		case BPF_REFCOUNT:
 		case BPF_WORKQUEUE:
+		case BPF_TASK_WORK:
 			/* Nothing to release */
 			break;
 		default:
@@ -723,6 +724,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 		case BPF_TIMER:
 		case BPF_REFCOUNT:
 		case BPF_WORKQUEUE:
+		case BPF_TASK_WORK:
 			/* Nothing to acquire */
 			break;
 		default:
@@ -781,6 +783,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
 	bpf_wq_cancel_and_free(obj + rec->wq_off);
 }
 
+void bpf_obj_free_task_work(const struct btf_record *rec, void *obj)
+{
+	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK)))
+		return;
+	bpf_task_work_cancel_and_free(obj + rec->task_work_off);
+}
+
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
 	const struct btf_field *fields;
@@ -838,6 +847,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 				continue;
 			bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
 			break;
+		case BPF_TASK_WORK:
+			bpf_task_work_cancel_and_free(field_ptr);
+			break;
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
 		case BPF_REFCOUNT:
@@ -1234,7 +1246,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 
 	map->record = btf_parse_fields(btf, value_type,
 				       BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
-				       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
+				       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | BPF_TASK_WORK,
 				       map->value_size);
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
@@ -1306,6 +1318,14 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 					goto free_map_tab;
 				}
 				break;
+			case BPF_TASK_WORK:
+				if (map->map_type != BPF_MAP_TYPE_HASH &&
+				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+				    map->map_type != BPF_MAP_TYPE_ARRAY) {
+					ret = -EOPNOTSUPP;
+					goto free_map_tab;
+				}
+				break;
 			default:
 				/* Fail if map_type checks are missing for a field type */
 				ret = -EOPNOTSUPP;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e2fcea860755c..fc74d5cc4b4a8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -524,9 +524,11 @@ static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
 	       func_id == BPF_FUNC_user_ringbuf_drain;
 }
 
-static bool is_async_callback_calling_function(enum bpf_func_id func_id)
+static bool is_task_work_add_kfunc(u32 func_id);
+
+static bool is_async_callback_calling_function(u32 func_id)
 {
-	return func_id == BPF_FUNC_timer_set_callback;
+	return func_id == BPF_FUNC_timer_set_callback || is_task_work_add_kfunc(func_id);
 }
 
 static bool is_callback_calling_function(enum bpf_func_id func_id)
@@ -2236,6 +2238,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
 			reg->map_uid = reg->id;
 		if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
 			reg->map_uid = reg->id;
+		if (btf_record_has_field(map->inner_map_meta->record, BPF_TASK_WORK))
+			reg->map_uid = reg->id;
 	} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
 		reg->type = PTR_TO_XDP_SOCK;
 	} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -8530,6 +8534,23 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
+static int process_task_work_func(struct bpf_verifier_env *env, int regno,
+				  struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (map->record->task_work_off != val + reg->off) {
+		verbose(env, "off %lld doesn't point to 'struct bpf_task_work' that is at %d\n",
+			val + reg->off, map->record->task_work_off);
+		return -EINVAL;
+	}
+	meta->map.uid = reg->map_uid;
+	meta->map.ptr = map;
+	return 0;
+}
+
 static int process_kptr_func(struct bpf_verifier_env *env, int regno,
 			     struct bpf_call_arg_meta *meta)
 {
@@ -10577,7 +10598,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
 			env->subprog_info[subprog].is_async_cb = true;
 			async_cb = push_async_cb(env, env->subprog_info[subprog].start,
 						 insn_idx, subprog,
-						 is_bpf_wq_set_callback_impl_kfunc(insn->imm));
+						 is_bpf_wq_set_callback_impl_kfunc(insn->imm) ||
+						 is_task_work_add_kfunc(insn->imm));
 			if (!async_cb)
 				return -EFAULT;
 			callee = async_cb->frame[0];
@@ -10890,6 +10912,37 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env,
+						 struct bpf_func_state *caller,
+						 struct bpf_func_state *callee,
+						 int insn_idx)
+{
+	struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr;
+
+	/*
+	 * callback_fn(struct bpf_map *map, void *key, void *value);
+	 */
+	callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+	callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+	callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+	callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+	callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+	callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	callee->in_callback_fn = true;
+	callee->callback_ret_range = retval_range(0, 1);
+	return 0;
+}
+
 static bool is_rbtree_lock_required_kfunc(u32 btf_id);
 
 /* Are we currently verifying the callback for a rbtree helper that must
@@ -12020,6 +12073,7 @@ enum {
 	KF_ARG_RB_NODE_ID,
 	KF_ARG_WORKQUEUE_ID,
 	KF_ARG_RES_SPIN_LOCK_ID,
+	KF_ARG_TASK_WORK_ID,
 };
 
 BTF_ID_LIST(kf_arg_btf_ids)
@@ -12030,6 +12084,7 @@ BTF_ID(struct, bpf_rb_root)
 BTF_ID(struct, bpf_rb_node)
 BTF_ID(struct, bpf_wq)
 BTF_ID(struct, bpf_res_spin_lock)
+BTF_ID(struct, bpf_task_work)
 
 static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
 				    const struct btf_param *arg, int type)
@@ -12078,6 +12133,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
 	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
 }
 
+static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg)
+{
+	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID);
+}
+
 static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg)
 {
 	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID);
 }
@@ -12165,6 +12225,7 @@ enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_WORKQUEUE,
 	KF_ARG_PTR_TO_IRQ_FLAG,
 	KF_ARG_PTR_TO_RES_SPIN_LOCK,
+	KF_ARG_PTR_TO_TASK_WORK,
 };
 
 enum special_kfunc_type {
@@ -12213,6 +12274,8 @@ enum special_kfunc_type {
 	KF_bpf_res_spin_lock_irqsave,
 	KF_bpf_res_spin_unlock_irqrestore,
 	KF___bpf_trap,
+	KF_bpf_task_work_schedule_signal,
+	KF_bpf_task_work_schedule_resume,
 };
 
 BTF_ID_LIST(special_kfunc_list)
@@ -12279,6 +12342,14 @@ BTF_ID(func, bpf_res_spin_unlock)
 BTF_ID(func, bpf_res_spin_lock_irqsave)
 BTF_ID(func, bpf_res_spin_unlock_irqrestore)
 BTF_ID(func, __bpf_trap)
+BTF_ID(func, bpf_task_work_schedule_signal)
+BTF_ID(func, bpf_task_work_schedule_resume)
+
+static bool is_task_work_add_kfunc(u32 func_id)
+{
+	return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
+	       func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume];
+}
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -12369,6 +12440,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_wq(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_WORKQUEUE;
 
+	if (is_kfunc_arg_task_work(meta->btf, &args[argno]))
+		return KF_ARG_PTR_TO_TASK_WORK;
+
 	if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_IRQ_FLAG;
 
@@ -12712,7 +12786,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id)
 
 static bool is_async_callback_calling_kfunc(u32 btf_id)
 {
-	return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+	return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] ||
+	       is_task_work_add_kfunc(btf_id);
 }
 
 static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
@@ -13114,6 +13189,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 					return -EINVAL;
 				}
 			}
+			if (meta->map.ptr && reg->map_ptr->record->task_work_off >= 0) {
+				if (meta->map.ptr != reg->map_ptr ||
+				    meta->map.uid != reg->map_uid) {
+					verbose(env,
+						"bpf_task_work pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+						meta->map.uid, reg->map_uid);
+					return -EINVAL;
+				}
+			}
 			meta->map.ptr = reg->map_ptr;
 			meta->map.uid = reg->map_uid;
 			fallthrough;
@@ -13146,6 +13230,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
 		case KF_ARG_PTR_TO_CONST_STR:
 		case KF_ARG_PTR_TO_WORKQUEUE:
+		case KF_ARG_PTR_TO_TASK_WORK:
 		case KF_ARG_PTR_TO_IRQ_FLAG:
 		case KF_ARG_PTR_TO_RES_SPIN_LOCK:
 			break;
@@ -13437,6 +13522,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (ret < 0)
 				return ret;
 			break;
+		case KF_ARG_PTR_TO_TASK_WORK:
+			if (reg->type != PTR_TO_MAP_VALUE) {
+				verbose(env, "arg#%d doesn't point to a map value\n", i);
+				return -EINVAL;
+			}
+			ret = process_task_work_func(env, regno, meta);
+			if (ret < 0)
+				return ret;
+			break;
 		case KF_ARG_PTR_TO_IRQ_FLAG:
 			if (reg->type != PTR_TO_STACK) {
 				verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
@@ -13803,6 +13897,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
+	if (is_task_work_add_kfunc(meta.func_id)) {
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_task_work_schedule_callback_state);
+		if (err) {
+			verbose(env, "kfunc %s#%d failed callback verification\n",
+				func_name, meta.func_id);
+			return err;
+		}
+	}
+
 	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
 	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0670e15a6100a..333458d696dfb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7410,6 +7410,10 @@ struct bpf_timer {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
 
+struct bpf_task_work {
+	__u64 __opaque[8];
+} __attribute__((aligned(8)));
+
 struct bpf_wq {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_work.c b/tools/testing/selftests/bpf/prog_tests/test_task_work.c
new file mode 100644
index 0000000000000..45e8d3e5af0a7
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_task_work.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include "task_work.skel.h"
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <time.h>
+
+static int perf_event_open(__u32 type, __u64 config, int pid)
+{
+	struct perf_event_attr attr = {
+		.type = type,
+		.config = config,
+		.size = sizeof(struct perf_event_attr),
+		.sample_period = 100000,
+	};
+
+	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
+}
+
+struct elem {
+	__s32 src_pid;
+	const void *src_data;
+	char data[128];
+	struct bpf_task_work tw;
+};
+
+static int verify_map(struct bpf_map *map, const char *expected_data)
+{
+	int err;
+	struct elem value;
+	int processed_values = 0;
+	int k, sz;
+
+	sz = bpf_map__max_entries(map);
+	for (k = 0; k < sz; ++k) {
+		err = bpf_map__lookup_elem(map, &k, sizeof(int), &value,
+					   sizeof(struct elem), 0);
+		if (err)
+			continue;
+		if (!value.src_data || !value.data[0])
+			continue;
+		err = strcmp(expected_data, value.data);
+		if (err)
+			return err;
+		processed_values++;
+	}
+
+	return processed_values == 0;
+}
+
+static void test_task_work_run(void)
+{
+	struct task_work *skel;
+	int err, pe_fd = -1, pid;
+	char user_string1[] = "hello world";
+	char user_string2[] = "foo bar baz";
+	int status;
+	int pipefd[2];
+
+	if (!ASSERT_NEQ(pipe(pipefd), -1, "pipe"))
+		return;
+
+	pid = fork();
+	if (pid == 0) {
+		__u64 num = 1;
+		int i;
+		char buf;
+
+		close(pipefd[1]);
+		read(pipefd[0], &buf, sizeof(buf));
+		close(pipefd[0]);
+
+		for (i = 0; i < 10000; ++i)
+			num *= time(0) % 7;
+		(void)num;
+		exit(0);
+	}
+	skel = task_work__open();
+	if (!ASSERT_OK_PTR(skel, "task_work__open"))
+		return;
+
+	bpf_program__set_type(skel->progs.oncpu, BPF_PROG_TYPE_PERF_EVENT);
+	skel->rodata->pid = pid;
+	skel->rodata->data_pid = getpid();
+	skel->bss->user_ptr1 = (char *)user_string1;
+	skel->bss->user_ptr2 = (char *)user_string2;
+
+	err = task_work__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+	pe_fd = perf_event_open(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, pid);
+	if (!ASSERT_NEQ(pe_fd, -1, "pe_fd")) {
+		fprintf(stderr, "perf_event_open errno: %d\n", errno);
+		goto cleanup;
+	}
+
+	skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, pe_fd);
+	if (!ASSERT_OK_PTR(skel->links.oncpu, "attach_perf_event"))
+		goto cleanup;
+
+	close(pipefd[0]);
+	write(pipefd[1], user_string1, 1);
+	close(pipefd[1]);
+	/* Wait to collect some samples */
+	waitpid(pid, &status, 0);
+	pid = 0;
+	if (!ASSERT_OK(verify_map(skel->maps.hmap, user_string1), "hmap_data"))
+		goto cleanup;
+	if (!ASSERT_OK(verify_map(skel->maps.arrmap, user_string2), "arrmap_data"))
+		goto cleanup;
+
+cleanup:
+	if (pe_fd >= 0)
+		close(pe_fd);
+	task_work__destroy(skel);
+	if (pid)
+		waitpid(pid, &status, 0);
+}
+
+void test_task_work(void)
+{
+	if (test__start_subtest("test_task_work_run"))
+		test_task_work_run();
+}
diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c
new file mode 100644
index 0000000000000..9b04fc87925c2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/task_work.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Facebook */
+
+#include <vmlinux.h>
+#include <string.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "errno.h"
+
+char _license[] SEC("license") = "GPL";
+
+const volatile int pid = -1;
+const volatile int data_pid = -1;
+void *user_ptr1 = NULL;
+void *user_ptr2 = NULL;
+
+struct elem {
+	__s32 src_pid;
+	const void *src_data;
+	char data[128];
+	struct bpf_task_work tw;
+};
+
+#define MAX_ENTRIES 5
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__uint(max_entries, MAX_ENTRIES);
+	__type(key, int);
+	__type(value, struct elem);
+} hmap SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, MAX_ENTRIES);
+	__type(key, int);
+	__type(value, struct elem);
+} arrmap SEC(".maps");
+
+static __u64 process_work(struct bpf_map *map, void *key, struct elem *work)
+{
+	int *k = key;
+	u64 timestamp;
+	struct task_struct *ptr_task;
+
+	ptr_task = bpf_task_from_pid(work->src_pid);
+	if (!ptr_task)
+		return 0;
+	timestamp = bpf_ktime_get_ns();
+	bpf_copy_from_user_task_str(work->data, sizeof(work->data), work->src_data, ptr_task, 0);
+	bpf_task_release(ptr_task);
+	bpf_printk("Callback key: %d value: %s, copy time %llu\n", *k, work->data,
+		   (bpf_ktime_get_ns() - timestamp) / 1000);
+	return 0;
+}
+
+int hkey = 0;
+int arrkey = 0;
+
+SEC("perf_event")
+int oncpu(struct pt_regs *args)
+{
+	struct elem empty_work = {.data = {0}, .src_data = NULL};
+	struct elem *work;
+	struct task_struct *task;
+	int err;
+
+	if ((bpf_get_current_pid_tgid() >> 32) != pid)
+		return 0;
+
+	task = bpf_get_current_task_btf();
+
+	work = bpf_map_lookup_elem(&arrmap, &arrkey);
+	arrkey++;
+	if (!work || arrkey >= MAX_ENTRIES)
+		goto hash_map;
+	work->src_data = (const void *)user_ptr2;
+	work->src_pid = data_pid;
+	bpf_task_work_schedule_signal(task, &work->tw, (struct bpf_map *)&arrmap,
+				      (bpf_callback_t)process_work, NULL);
+
+hash_map:
+	err = bpf_map_update_elem(&hmap, &hkey, &empty_work, BPF_NOEXIST);
+	if (err)
+		return 0;
+	work = bpf_map_lookup_elem(&hmap, &hkey);
+	++hkey;
+	if (!work)
+		return 0;
+
+	work->src_data = (const void *)user_ptr1;
+	work->src_pid = data_pid;
+	bpf_task_work_schedule_resume(task, &work->tw, (struct bpf_map *)&hmap,
+				      (bpf_callback_t)process_work, NULL);
+	return 0;
+}