The bpf syscall does not require a compat handler for 32-bit compat. This is achieved by using u64 instead of pointer types in the bpf_attr union used to pass arguments to the syscall. This means that in a system where pointers are 32-bit, the struct/union layouts and offsets are the same as in a 64-bit arch, since the u64 field is split into two u32 fields/registers.
This greatly simplifies 32-bit compat at the small cost of requiring casting pointers passed in through the uAPI to u64 (generally via ptr_to_u64() helper functions).
This poses a problem on architectures where user pointers are longer than 64b, such as Morello/PCuABI, where pointers are represented as 129b capabilities. In order to extend the bpf syscall interface to accept capabilities and still retain compatibility with the existing 64/32b ABI, 64-bit compat handling and appropriate conversions must be added to handle the different union/struct sizes caused by this pointer size mismatch.
Before extending the number of bits in union bpf_attr to accept capabilities, lay the groundwork for handling compat64. When in_compat64_syscall(), take the compat64-sized bpf_attr and convert it to what will be the new native offsets.
Inbound conversion is handled upfront to minimise the impact on existing code and reduce the overall diff size. After dispatch_bpf, the majority of the code can remain unchanged. The cases where conversion back out to userspace is required are handled in subsequent commits.
Signed-off-by: Zachary Leaf zachary.leaf@arm.com --- include/linux/bpf.h | 1 + include/linux/bpf_compat.h | 300 +++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 261 +++++++++++++++++++++++++++++++- 3 files changed, 555 insertions(+), 7 deletions(-) create mode 100644 include/linux/bpf_compat.h
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e53ceee1df37..6c8594a0f883 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -29,6 +29,7 @@ #include <linux/rcupdate_trace.h> #include <linux/static_call.h> #include <linux/memcontrol.h> +#include <linux/bpf_compat.h>
struct bpf_verifier_env; struct bpf_verifier_log; diff --git a/include/linux/bpf_compat.h b/include/linux/bpf_compat.h new file mode 100644 index 000000000000..85e0198bede7 --- /dev/null +++ b/include/linux/bpf_compat.h @@ -0,0 +1,300 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2023 Arm Ltd */ + +union compat_bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ + __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). + */ + char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- + * struct stored as the + * map value + */ + /* Any per-map-type extra fields + * + * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the + * number of hash functions (if 0, the bloom filter will default + * to using 5 hash functions). 
+ */ + __u64 map_extra; + }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + __u64 flags; + }; + + struct { /* struct used by BPF_MAP_*_BATCH commands */ + __aligned_u64 in_batch; /* start batch, + * NULL to start from beginning + */ + __aligned_u64 out_batch; /* output: next start batch */ + __aligned_u64 keys; + __aligned_u64 values; + __u32 count; /* input/output: + * input: # of key/value + * elements + * output: # of filled elements + */ + __u32 map_fd; + __u64 elem_flags; + __u64 flags; + } batch; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* not used */ + __u32 prog_flags; + char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). 
+ */ + __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; + __u32 core_relo_cnt; /* number of bpf_core_relo */ + __aligned_u64 fd_array; /* array of FDs */ + __aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; + }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __aligned_u64 pathname; + __u32 bpf_fd; + __u32 file_flags; + }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __u32 target_fd; /* container object to attach to */ + __u32 attach_bpf_fd; /* eBPF program to attach */ + __u32 attach_type; + __u32 attach_flags; + __u32 replace_bpf_fd; /* previously attached eBPF + * program to replace if + * BPF_F_REPLACE is used + */ + }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; /* input: len of data_in */ + __u32 data_size_out; /* input/output: len of data_out + * returns ENOSPC if data_out + * is too small. 
+ */ + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; + __u32 batch_size; + } test; + + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + __u32 map_id; + __u32 btf_id; + __u32 link_id; + }; + __u32 next_id; + __u32 open_flags; + }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; + } query; + + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ + __u64 name; + __u32 prog_fd; + } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). 
+ */ + __u32 btf_log_true_size; + }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + union { + __u32 prog_fd; /* eBPF program to attach */ + __u32 map_fd; /* struct_ops to attach */ + }; + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + __aligned_u64 cookies; + } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. 
*/ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + }; + } link_create; + + struct { /* struct used by BPF_LINK_UPDATE command */ + __u32 link_fd; /* link fd */ + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; + __u32 flags; /* extra flags */ + union { + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags. + */ + __u32 old_prog_fd; + /* expected link's map fd; is specified only + * if BPF_F_REPLACE flag is set. + */ + __u32 old_map_fd; + }; + } link_update; + + struct { + __u32 link_fd; + } link_detach; + + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + +} __attribute__((aligned(8))); + diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f1c8733f76b8..26d3cdd2ad4a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5015,6 +5015,259 @@ static int bpf_prog_bind_map(union bpf_attr *attr) return ret; }
+static void convert_compat_bpf_attr(union bpf_attr *dest, + const union compat_bpf_attr *cattr, int cmd) +{ + switch (cmd) { + case BPF_MAP_CREATE: + copy_field(dest, cattr, map_type); + copy_field(dest, cattr, key_size); + copy_field(dest, cattr, value_size); + copy_field(dest, cattr, max_entries); + copy_field(dest, cattr, map_flags); + copy_field(dest, cattr, inner_map_fd); + copy_field(dest, cattr, numa_node); + strncpy(dest->map_name, cattr->map_name, BPF_OBJ_NAME_LEN); + copy_field(dest, cattr, map_ifindex); + copy_field(dest, cattr, btf_fd); + copy_field(dest, cattr, btf_key_type_id); + copy_field(dest, cattr, btf_value_type_id); + copy_field(dest, cattr, btf_vmlinux_value_type_id); + copy_field(dest, cattr, map_extra); + break; + case BPF_MAP_LOOKUP_ELEM: + case BPF_MAP_UPDATE_ELEM: + case BPF_MAP_DELETE_ELEM: + case BPF_MAP_LOOKUP_AND_DELETE_ELEM: + copy_field(dest, cattr, map_fd); + copy_field(dest, cattr, key); + copy_field(dest, cattr, value); + /* u64 next_key is in a union with u64 value */ + copy_field(dest, cattr, flags); + break; + case BPF_MAP_LOOKUP_BATCH: + case BPF_MAP_LOOKUP_AND_DELETE_BATCH: + case BPF_MAP_UPDATE_BATCH: + case BPF_MAP_DELETE_BATCH: + copy_field(dest, cattr, batch.in_batch); + copy_field(dest, cattr, batch.out_batch); + copy_field(dest, cattr, batch.keys); + copy_field(dest, cattr, batch.values); + copy_field(dest, cattr, batch.count); + copy_field(dest, cattr, batch.map_fd); + copy_field(dest, cattr, batch.elem_flags); + copy_field(dest, cattr, batch.flags); + break; + case BPF_PROG_LOAD: + copy_field(dest, cattr, prog_type); + copy_field(dest, cattr, insn_cnt); + copy_field(dest, cattr, insns); + copy_field(dest, cattr, license); + copy_field(dest, cattr, log_level); + copy_field(dest, cattr, log_size); + copy_field(dest, cattr, log_buf); + copy_field(dest, cattr, kern_version); + copy_field(dest, cattr, prog_flags); + strncpy(dest->prog_name, cattr->prog_name, BPF_OBJ_NAME_LEN); + copy_field(dest, cattr, prog_ifindex); + 
copy_field(dest, cattr, expected_attach_type); + copy_field(dest, cattr, prog_btf_fd); + copy_field(dest, cattr, func_info_rec_size); + copy_field(dest, cattr, func_info); + copy_field(dest, cattr, func_info_cnt); + copy_field(dest, cattr, line_info_rec_size); + copy_field(dest, cattr, line_info); + copy_field(dest, cattr, line_info_cnt); + copy_field(dest, cattr, attach_btf_id); + copy_field(dest, cattr, attach_prog_fd); + /* u32 attach_btf_obj_fd is in a union with u32 attach_prog_fd */ + copy_field(dest, cattr, core_relo_cnt); + copy_field(dest, cattr, fd_array); + copy_field(dest, cattr, core_relos); + copy_field(dest, cattr, core_relo_rec_size); + copy_field(dest, cattr, log_true_size); + break; + case BPF_OBJ_PIN: + case BPF_OBJ_GET: + copy_field(dest, cattr, pathname); + copy_field(dest, cattr, bpf_fd); + copy_field(dest, cattr, file_flags); + break; + case BPF_PROG_ATTACH: + case BPF_PROG_DETACH: + copy_field(dest, cattr, target_fd); + copy_field(dest, cattr, attach_bpf_fd); + copy_field(dest, cattr, attach_type); + copy_field(dest, cattr, attach_flags); + copy_field(dest, cattr, replace_bpf_fd); + break; + case BPF_PROG_RUN: /* same as BPF_PROG_TEST_RUN */ + copy_field(dest, cattr, test.prog_fd); + copy_field(dest, cattr, test.retval); + copy_field(dest, cattr, test.data_size_in); + copy_field(dest, cattr, test.data_size_out); + copy_field(dest, cattr, test.data_in); + copy_field(dest, cattr, test.data_out); + copy_field(dest, cattr, test.repeat); + copy_field(dest, cattr, test.duration); + copy_field(dest, cattr, test.ctx_size_in); + copy_field(dest, cattr, test.ctx_size_out); + copy_field(dest, cattr, test.ctx_in); + copy_field(dest, cattr, test.ctx_out); + copy_field(dest, cattr, test.flags); + copy_field(dest, cattr, test.cpu); + copy_field(dest, cattr, test.batch_size); + break; + case BPF_PROG_GET_NEXT_ID: + case BPF_MAP_GET_NEXT_ID: + case BPF_PROG_GET_FD_BY_ID: + case BPF_MAP_GET_FD_BY_ID: + case BPF_BTF_GET_FD_BY_ID: + case BPF_BTF_GET_NEXT_ID: + 
case BPF_LINK_GET_FD_BY_ID: + case BPF_LINK_GET_NEXT_ID: + /* + * u32 prog_id, map_id, btf_id + link_id are in a union with + * u32 start_id + */ + copy_field(dest, cattr, start_id); + copy_field(dest, cattr, next_id); + copy_field(dest, cattr, open_flags); + break; + case BPF_OBJ_GET_INFO_BY_FD: + copy_field(dest, cattr, info.bpf_fd); + copy_field(dest, cattr, info.info_len); + copy_field(dest, cattr, info.info); + break; + case BPF_PROG_QUERY: + copy_field(dest, cattr, query.target_fd); + copy_field(dest, cattr, query.attach_type); + copy_field(dest, cattr, query.query_flags); + copy_field(dest, cattr, query.attach_flags); + copy_field(dest, cattr, query.prog_ids); + copy_field(dest, cattr, query.prog_cnt); + copy_field(dest, cattr, query.prog_attach_flags); + break; + case BPF_RAW_TRACEPOINT_OPEN: + copy_field(dest, cattr, raw_tracepoint.name); + copy_field(dest, cattr, raw_tracepoint.prog_fd); + break; + case BPF_BTF_LOAD: + copy_field(dest, cattr, btf); + copy_field(dest, cattr, btf_log_buf); + copy_field(dest, cattr, btf_size); + copy_field(dest, cattr, btf_log_size); + copy_field(dest, cattr, btf_log_level); + copy_field(dest, cattr, btf_log_true_size); + break; + case BPF_TASK_FD_QUERY: + copy_field(dest, cattr, task_fd_query.pid); + copy_field(dest, cattr, task_fd_query.fd); + copy_field(dest, cattr, task_fd_query.flags); + copy_field(dest, cattr, task_fd_query.buf_len); + copy_field(dest, cattr, task_fd_query.buf); + copy_field(dest, cattr, task_fd_query.prog_id); + copy_field(dest, cattr, task_fd_query.fd_type); + copy_field(dest, cattr, task_fd_query.probe_offset); + copy_field(dest, cattr, task_fd_query.probe_addr); + break; + case BPF_LINK_CREATE: + copy_field(dest, cattr, link_create.prog_fd); + copy_field(dest, cattr, link_create.target_fd); + /* u32 target_ifindex is in a union with u32 target_fd */ + copy_field(dest, cattr, link_create.attach_type); + copy_field(dest, cattr, link_create.flags); + + /* + * identify the union members that require 
conversion (i.e. with + * pointers) by attach_type, otherwise just memcpy the lot + */ + switch (cattr->link_create.attach_type) { + /* + * iter_info is a user pointer to union bpf_iter_link_info, + * however since this union contains no pointers the + * size/offsets are the same regardless of the ABI; hence no + * conversion needed + */ + case BPF_TRACE_ITER: + copy_field(dest, cattr, link_create.iter_info); + copy_field(dest, cattr, link_create.iter_info_len); + break; + /* kprobe_multi is used in bpf_kprobe_multi_link_attach() */ + case BPF_TRACE_KPROBE_MULTI: + copy_field(dest, cattr, link_create.kprobe_multi.flags); + copy_field(dest, cattr, link_create.kprobe_multi.cnt); + copy_field(dest, cattr, link_create.kprobe_multi.syms); + copy_field(dest, cattr, link_create.kprobe_multi.addrs); + copy_field(dest, cattr, link_create.kprobe_multi.cookies); + break; + /* + * remaining union members only contain fixed size integers + * so offsets are the same across all ABIs - + * calculate the size of the whole union + copy that + */ + default: + memcpy((u8 *)dest+offsetof(union bpf_attr, + link_create.target_btf_id), + (u8 *)cattr+offsetof(union compat_bpf_attr, + link_create.target_btf_id), + offsetofend(union compat_bpf_attr, link_create) - + offsetofend(union compat_bpf_attr, link_create.flags)); + } + + break; + case BPF_LINK_UPDATE: + copy_field(dest, cattr, link_update.link_fd); + copy_field(dest, cattr, link_update.new_prog_fd); + copy_field(dest, cattr, link_update.flags); + copy_field(dest, cattr, link_update.old_prog_fd); + break; + case BPF_LINK_DETACH: + copy_field(dest, cattr, link_detach.link_fd); + break; + case BPF_ENABLE_STATS: + copy_field(dest, cattr, enable_stats.type); + break; + case BPF_ITER_CREATE: + copy_field(dest, cattr, iter_create.link_fd); + copy_field(dest, cattr, iter_create.flags); + break; + case BPF_PROG_BIND_MAP: + copy_field(dest, cattr, prog_bind_map.prog_fd); + copy_field(dest, cattr, prog_bind_map.map_fd); + copy_field(dest, 
cattr, prog_bind_map.flags); + break; + }; +} + +static int copy_bpf_attr_from_user(union bpf_attr *attr, int cmd, + bpfptr_t uattr, unsigned int *size) +{ + union compat_bpf_attr cattr; + size_t attr_size = in_compat64_syscall() ? sizeof(union compat_bpf_attr) + : sizeof(union bpf_attr); + int err; + + err = bpf_check_uarg_tail_zero(uattr, attr_size, *size); + if (err) + return err; + *size = min_t(u32, *size, attr_size); + + /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + memset(attr, 0, sizeof(*attr)); + if (in_compat64_syscall()) { + memset(&cattr, 0, sizeof(cattr)); + if (copy_from_bpfptr(&cattr, uattr, *size) != 0) + return -EFAULT; + convert_compat_bpf_attr(attr, &cattr, cmd); + } else { + if (copy_from_bpfptr(attr, uattr, *size) != 0) + return -EFAULT; + } + + return 0; +} + static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -5035,15 +5288,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) return -EPERM;
- err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); + err = copy_bpf_attr_from_user(&attr, cmd, uattr, &size); if (err) return err; - size = min_t(u32, size, sizeof(attr)); - - /* copy attributes from user space, may be less than sizeof(bpf_attr) */ - memset(&attr, 0, sizeof(attr)); - if (copy_from_bpfptr(&attr, uattr, size) != 0) - return -EFAULT;
err = security_bpf(cmd, &attr, size); if (err < 0)