On 08/06/2023 10:42, Zachary Leaf wrote:
The bpf syscall does not require a compat handler for 32-bit compat. This is achieved by using u64 instead of pointer types in the bpf_attr union used to pass arguments to the syscall. This means that in a system where pointers are 32-bit, the struct/union layouts and offsets are the same as in a 64-bit arch, since the u64 field is split into two u32 fields/registers.
This greatly simplifies 32-bit compat at the small cost of requiring casting pointers passed in through the uAPI to u64 (generally via ptr_to_u64() helper functions).
This poses a problem in architectures where user pointers are longer than 64b such as Morello/PCuABI where pointers are represented as 129b capabilities. In order to extend the bpf syscall interface to accept capabilities and still retain compatibility with the existing 64/32b ABI, a 64-bit compat layer and appropriate conversions must be added to handle the different union/struct sizes caused by this pointer size mismatch.
Before extending the number of bits in union bpf_attr to accept capabilitities, set the groundwork with a compat64 handler and
"capabilitities" sounds rather wrong, better with one less "ti" :D
conversion function to take a compat64 sized bpf_attr and convert it to what will be the new native offsets.
Inbound conversion is handled upfront to minimise impact on existing code and reduce overall diff size. After dispatch_bpf the majority of code can remain unchanged. The cases where conversion back out to userspace is required are handled in subsequent commits.
Signed-off-by: Zachary Leaf <zachary.leaf@arm.com>
[...] +#ifdef CONFIG_COMPAT64 +static void convert_compat_bpf_attr(union bpf_attr *dest, const union compat_bpf_attr *cattr, int cmd) +{
- struct bpf_prog *prog;
- switch (cmd) {
- case BPF_MAP_CREATE:
dest->map_type = cattr->map_type;dest->key_size = cattr->key_size;dest->value_size = cattr->value_size;dest->max_entries = cattr->max_entries;dest->map_flags = cattr->map_flags;dest->inner_map_fd = cattr->inner_map_fd;dest->numa_node = cattr->numa_node;strncpy(dest->map_name, cattr->map_name, BPF_OBJ_NAME_LEN);dest->map_ifindex = cattr->map_ifindex;dest->btf_fd = cattr->btf_fd;dest->btf_key_type_id = cattr->btf_key_type_id;dest->btf_value_type_id = cattr->btf_value_type_id;dest->btf_vmlinux_value_type_id = cattr->btf_vmlinux_value_type_id;dest->map_extra = cattr->map_extra;break;- case BPF_MAP_LOOKUP_ELEM:
- case BPF_MAP_UPDATE_ELEM:
- case BPF_MAP_DELETE_ELEM:
- case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
dest->map_fd = cattr->map_fd;dest->key = cattr->key;dest->value = cattr->value;/* u64 next_key is in a union with u64 value */dest->flags = cattr->flags;break;- case BPF_MAP_LOOKUP_BATCH:
- case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
- case BPF_MAP_UPDATE_BATCH:
- case BPF_MAP_DELETE_BATCH:
dest->batch.in_batch = cattr->batch.in_batch;dest->batch.out_batch = cattr->batch.out_batch;dest->batch.keys = cattr->batch.keys;dest->batch.values = cattr->batch.values;dest->batch.count = cattr->batch.count;dest->batch.map_fd = cattr->batch.map_fd;dest->batch.elem_flags = cattr->batch.elem_flags;dest->batch.flags = cattr->batch.flags;break;- case BPF_PROG_LOAD:
dest->prog_type = cattr->prog_type;dest->insn_cnt = cattr->insn_cnt;dest->insns = cattr->insns;dest->license = cattr->license;dest->log_level = cattr->log_level;dest->log_size = cattr->log_size;dest->log_buf = cattr->log_buf;dest->kern_version = cattr->kern_version;dest->prog_flags = cattr->prog_flags;strncpy(dest->prog_name, cattr->prog_name, BPF_OBJ_NAME_LEN);dest->prog_ifindex = cattr->prog_ifindex;dest->expected_attach_type = cattr->expected_attach_type;dest->prog_btf_fd = cattr->prog_btf_fd;dest->func_info_rec_size = cattr->func_info_rec_size;dest->func_info = cattr->func_info;dest->func_info_cnt = cattr->func_info_cnt;dest->line_info_rec_size = cattr->line_info_rec_size;dest->line_info = cattr->line_info;dest->line_info_cnt = cattr->line_info_cnt;dest->attach_btf_id = cattr->attach_btf_id;dest->attach_prog_fd = cattr->attach_prog_fd;/* u32 attach_btf_obj_fd is in a union with u32 attach_prog_fd */dest->core_relo_cnt = cattr->core_relo_cnt;dest->fd_array = cattr->fd_array;dest->core_relos = cattr->core_relos;dest->core_relo_rec_size = cattr->core_relo_rec_size;break;- case BPF_OBJ_PIN:
- case BPF_OBJ_GET:
dest->pathname = cattr->pathname;dest->bpf_fd = cattr->bpf_fd;dest->file_flags = cattr->file_flags;break;- case BPF_PROG_ATTACH:
- case BPF_PROG_DETACH:
dest->target_fd = cattr->target_fd;dest->attach_bpf_fd = cattr->attach_bpf_fd;dest->attach_type = cattr->attach_type;dest->attach_flags = cattr->attach_flags;dest->replace_bpf_fd = cattr->replace_bpf_fd;break;- case BPF_PROG_RUN: /* same as BPF_PROG_TEST_RUN */
dest->test.prog_fd = cattr->test.prog_fd;dest->test.retval = cattr->test.retval;dest->test.data_size_in = cattr->test.data_size_in;dest->test.data_size_out = cattr->test.data_size_out;dest->test.data_in = cattr->test.data_in;dest->test.data_out = cattr->test.data_out;dest->test.repeat = cattr->test.repeat;dest->test.duration = cattr->test.duration;dest->test.ctx_size_in = cattr->test.ctx_size_in;dest->test.ctx_size_out = cattr->test.ctx_size_out;dest->test.ctx_in = cattr->test.ctx_in;dest->test.ctx_out = cattr->test.ctx_out;dest->test.flags = cattr->test.flags;dest->test.cpu = cattr->test.cpu;dest->test.batch_size = cattr->test.batch_size;break;- case BPF_PROG_GET_NEXT_ID:
- case BPF_MAP_GET_NEXT_ID:
- case BPF_PROG_GET_FD_BY_ID:
- case BPF_MAP_GET_FD_BY_ID:
- case BPF_BTF_GET_FD_BY_ID:
- case BPF_BTF_GET_NEXT_ID:
- case BPF_LINK_GET_FD_BY_ID:
- case BPF_LINK_GET_NEXT_ID:
/* u32 prog_id, map_id, btf_id + link_id are in a union with* u32 start_id */dest->start_id = cattr->start_id;dest->next_id = cattr->next_id;dest->open_flags = cattr->open_flags;break;- case BPF_OBJ_GET_INFO_BY_FD:
dest->info.bpf_fd = cattr->info.bpf_fd;dest->info.info_len = cattr->info.info_len;dest->info.info = cattr->info.info;break;- case BPF_PROG_QUERY:
dest->query.target_fd = cattr->query.target_fd;dest->query.attach_type = cattr->query.attach_type;dest->query.query_flags = cattr->query.query_flags;dest->query.attach_flags = cattr->query.attach_flags;dest->query.prog_ids = cattr->query.prog_ids;dest->query.prog_cnt = cattr->query.prog_cnt;dest->query.prog_attach_flags = cattr->query.prog_attach_flags;break;- case BPF_RAW_TRACEPOINT_OPEN:
dest->raw_tracepoint.name = cattr->raw_tracepoint.name;dest->raw_tracepoint.prog_fd = cattr->raw_tracepoint.prog_fd;break;- case BPF_BTF_LOAD:
dest->btf = cattr->btf;dest->btf_log_buf = cattr->btf_log_buf;dest->btf_size = cattr->btf_size;dest->btf_log_size = cattr->btf_log_size;dest->btf_log_level = cattr->btf_log_level;break;- case BPF_TASK_FD_QUERY:
dest->task_fd_query.pid = cattr->task_fd_query.pid;dest->task_fd_query.fd = cattr->task_fd_query.fd;dest->task_fd_query.flags = cattr->task_fd_query.flags;dest->task_fd_query.buf_len = cattr->task_fd_query.buf_len;dest->task_fd_query.buf = cattr->task_fd_query.buf;dest->task_fd_query.prog_id = cattr->task_fd_query.prog_id;dest->task_fd_query.fd_type = cattr->task_fd_query.fd_type;dest->task_fd_query.probe_offset = cattr->task_fd_query.probe_offset;dest->task_fd_query.probe_addr = cattr->task_fd_query.probe_addr;break;- case BPF_LINK_CREATE:
dest->link_create.prog_fd = cattr->link_create.prog_fd;dest->link_create.target_fd = cattr->link_create.target_fd;/* u32 target_ifindex is in a union with u32 target_fd */dest->link_create.attach_type = cattr->link_create.attach_type;dest->link_create.flags = cattr->link_create.flags;prog = bpf_prog_get(cattr->link_create.prog_fd);if (prog->type == BPF_PROG_TYPE_CGROUP_SKB ||prog->type == BPF_PROG_TYPE_CGROUP_SOCK ||prog->type == BPF_PROG_TYPE_CGROUP_SOCK_ADDR ||prog->type == BPF_PROG_TYPE_SOCK_OPS ||prog->type == BPF_PROG_TYPE_CGROUP_DEVICE ||prog->type == BPF_PROG_TYPE_CGROUP_SYSCTL ||prog->type == BPF_PROG_TYPE_CGROUP_SOCKOPT)break;if (prog->type == BPF_PROG_TYPE_EXT) {dest->link_create.tracing.target_btf_id =cattr->link_create.tracing.target_btf_id;dest->link_create.tracing.cookie =cattr->link_create.tracing.cookie;break;}if (prog->type == BPF_PROG_TYPE_LSM ||prog->type == BPF_PROG_TYPE_TRACING) {if (prog->expected_attach_type == BPF_TRACE_ITER) {/* iter_info is a user pointer to union* bpf_iter_link_info however since this union* contains no pointers, the size/offsets are* the same for compat64/purecap; hence no* conversion needed */dest->link_create.iter_info =cattr->link_create.iter_info;dest->link_create.iter_info_len =cattr->link_create.iter_info_len;break;} else if (prog->expected_attach_type == BPF_TRACE_RAW_TP|| prog->expected_attach_type == BPF_LSM_CGROUP) {/* only uses common fields above */break;} else {dest->link_create.target_btf_id =cattr->link_create.target_btf_id;dest->link_create.tracing.cookie =cattr->link_create.tracing.cookie;break;}}if (prog->type == BPF_PROG_TYPE_FLOW_DISSECTOR ||prog->type == BPF_PROG_TYPE_SK_LOOKUP ||prog->type == BPF_PROG_TYPE_XDP)break;/* bpf_cookie is used in bpf_perf_link_attach() */if (prog->type == BPF_PROG_TYPE_PERF_EVENT ||prog->type == BPF_PROG_TYPE_TRACEPOINT ||(prog->type == BPF_PROG_TYPE_KPROBE &&cattr->link_create.attach_type == BPF_PERF_EVENT)) {dest->link_create.perf_event.bpf_cookie 
=cattr->link_create.perf_event.bpf_cookie;break;}/* kprobe_multi is used in bpf_kprobe_multi_link_attach() */if (prog->type == BPF_PROG_TYPE_KPROBE &&cattr->link_create.attach_type != BPF_PERF_EVENT) {dest->link_create.kprobe_multi.flags =cattr->link_create.kprobe_multi.flags;dest->link_create.kprobe_multi.cnt =cattr->link_create.kprobe_multi.cnt;dest->link_create.kprobe_multi.syms =cattr->link_create.kprobe_multi.syms;dest->link_create.kprobe_multi.addrs =cattr->link_create.kprobe_multi.addrs;dest->link_create.kprobe_multi.cookies =cattr->link_create.kprobe_multi.cookies;break;}break;- case BPF_LINK_UPDATE:
dest->link_update.link_fd = cattr->link_update.link_fd;dest->link_update.new_prog_fd = cattr->link_update.new_prog_fd;dest->link_update.flags = cattr->link_update.flags;dest->link_update.old_prog_fd = cattr->link_update.old_prog_fd;break;- case BPF_LINK_DETACH:
dest->link_detach.link_fd = cattr->link_detach.link_fd;break;- case BPF_ENABLE_STATS:
dest->enable_stats.type = cattr->enable_stats.type;break;- case BPF_ITER_CREATE:
dest->iter_create.link_fd = cattr->iter_create.link_fd;dest->iter_create.flags = cattr->iter_create.flags;break;- case BPF_PROG_BIND_MAP:
dest->prog_bind_map.prog_fd = cattr->prog_bind_map.prog_fd;dest->prog_bind_map.map_fd = cattr->prog_bind_map.map_fd;dest->prog_bind_map.flags = cattr->prog_bind_map.flags;break;- };
Considering the really large number of assignments, I think it would be worth introducing a macro that takes dest, cattr and the name of the member to copy. That would shorten things a bit for nested structs, but most importantly it would prevent typos slipping in. A similar macro for pointer conversion could be introduced in the last patch (which would also hide the very noisy cast to __kernel_aligned_uintptr_t).
+} +#endif /* CONFIG_COMPAT64 */
+static int bpf_check_perms(int cmd) +{
- bool capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
- /* Intent here is for unprivileged_bpf_disabled to block key object
* creation commands for unprivileged users; other actions depend* of fd availability and access to bpffs, so are dependent on* object creation success. Capabilities are later verified for* operations such as load and map create, so even with unprivileged* BPF disabled, capability checks are still carried out for these* and other operations.*/- if (!capable &&
(cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))return -EPERM;- return 0;
+}
+static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) +{
- union bpf_attr attr;
- int err;
- err = bpf_check_perms(cmd);
- if (err)
return err;- err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
- if (err)
return err;- size = min_t(u32, size, sizeof(attr));
- /* copy attributes from user space, may be less than sizeof(bpf_attr) */
- memset(&attr, 0, sizeof(attr));
- if (copy_from_bpfptr_with_ptr(&attr, uattr, size) != 0)
I would prefer if the switch to the *_with_ptr() variants happened in the final patch, like in the io_uring series. That's because it is more logical this way: at this stage there is no actual pointer in bpf_attr, the need for *_with_ptr() arises when changing the types in the final patch. This also applies to patch 5-8.
return -EFAULT;- return dispatch_bpf(cmd, &attr, uattr, size);
+}
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { return __sys_bpf(cmd, USER_BPFPTR(uattr), size); } +#ifdef CONFIG_COMPAT64 +static int __sys_compat_bpf(int cmd, bpfptr_t uattr, unsigned int size) +{
- union bpf_attr attr;
- union compat_bpf_attr cattr;
- int err;
- err = bpf_check_perms(cmd);
- if (err)
return err;- err = bpf_check_uarg_tail_zero(uattr, sizeof(cattr), size);
- if (err)
return err;- size = min_t(u32, size, sizeof(cattr));
- /* copy attributes from user space, may be less than sizeof(bpf_attr) */
- memset(&cattr, 0, sizeof(cattr));
- if (copy_from_bpfptr_with_ptr(&cattr, uattr, size) != 0)
compat structs/unions do not hold actual pointers so we shouldn't use *_with_ptr() in that case.
return -EFAULT;- convert_compat_bpf_attr(&attr, &cattr, cmd);
- return dispatch_bpf(cmd, &attr, uattr, sizeof(attr));
+}
+COMPAT_SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{
- return __sys_compat_bpf(cmd, USER_BPFPTR(uattr), size);
+}
Adding a compat handler is the "normal" way to handle this situation, so the approach in this patch is sound. However, it seems preferable to keep a native handler only and use in_compat_syscall() in __sys_bpf() for a couple of reasons:
- AFAICT the diff in this file would be a lot smaller - just a few lines changed in __sys_bpf(), and of course the new conversion function.
- It would avoid overriding the handler in sys_compat64.c, which is not ideal as it makes it hard to figure out what handler is actually used. The situation would be different if we were adding a "generic" compat handler, as in that case we could just change include/uapi/asm-generic/unistd.h (the normal way to do this).
With that approach, a further small improvement can be made. On the same model as Tudor's "io_uring: Implement compat versions of uAPI structs and handle them", __sys_bpf() could call a generic function, say copy_bpf_attr_from_user(). This function would then call the conversion function in compat64. Besides a better encapsulation, the main advantage is that the union compat_bpf_attr variable is allocated in the conversion function, and it is therefore live only for the duration of the conversion. Conversely, allocating it in __sys_compat_bpf() means it lives for the whole duration of the syscall handling. In other words, it slightly improves the stack usage. Not essential but nice to have. I think that approach could also be used in patch 5-8.
Finally to reduce the amount of #ifdef, we could have a macro like io_in_compat64() that only returns true in compat64. This allows keeping most of the code out of any #ifdef, assuming struct definitions and such are not #ifdef'd either (which is not particularly useful anyway).
Kevin