Next version of Harry's tcp_zerocopy_receive series. It aims to enable purecap applications to utilise tcp_zerocopy_receive to map packets directly from a network interface card into a shared space with the process and kernel.
v3..v4: - Remove function tcp_zerocopy_receive_size() as it's used in only one spot - Change form compat to compat64 in the commit messages. - Make initialisation of compat_zc more compact in set_compat64_tcp_zerocopy_receive() - Split copy_{from,to}_sockptr_with_ptr() into a separate commit - Leave address tcp_zerocopy_vm_insert_batch() as just an address, not a pointer - Change the check_user_ptr_read() to a check for CHERI_PERM_SW_VMEM
v2..v3: - Fix the split between compat enablement and capability support - Reorganise comments in struct tcp_zerocopy_receive - Fix the order of arguments for the *_to_sockptr functions - Change a few variables to user_uintptr_t as they were originally unsigned long - Remove unnecessary checks - Fix cast warnings - Add compat handling for offsetofend - Move a check_user_ptr closer to where its metadata is dropped - Format misc nits
Gitlab Issue: https://git.morello-project.org/morello/kernel/linux/-/issues/46
Review branch: https://git.morello-project.org/tudcre01/linux/-/commits/morello/tcp_v4
Cc: Harry Ramsey harry.ramsey@arm.com
Harry Ramsey (3): sockptr: Preserve capability tags with copy_{from,to}_sockptr_with_ptr tcp: Implement compat64 handling for struct tcp_zerocopy_receive tcp: Support capabilities in tcp_zerocopy_receive
include/linux/sockptr.h | 28 ++++++++ include/uapi/linux/tcp.h | 10 +-- net/ipv4/tcp.c | 148 +++++++++++++++++++++++++++++++-------- 3 files changed, 151 insertions(+), 35 deletions(-)
From: Harry Ramsey harry.ramsey@arm.com
Introduce variants of copy_{from,to}_sockptr that will make use of copy_{from,to}_user_with_ptr as its actual copying routine, in order to preserve capability tags throughout the process.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com --- include/linux/sockptr.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+)
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f..fd270378e8c8 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -41,6 +41,20 @@ static inline bool sockptr_is_null(sockptr_t sockptr) return !sockptr.user; }
+static inline int copy_from_sockptr_offset_with_ptr(void *dst, sockptr_t src, + size_t offset, size_t size) +{ + if (!sockptr_is_kernel(src)) + return copy_from_user_with_ptr(dst, src.user + offset, size); + memcpy(dst, src.kernel + offset, size); + return 0; +} + +static inline int copy_from_sockptr_with_ptr(void *dst, sockptr_t src, size_t size) +{ + return copy_from_sockptr_offset_with_ptr(dst, src, 0, size); +} + static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) { @@ -55,6 +69,20 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) return copy_from_sockptr_offset(dst, src, 0, size); }
+static inline int copy_to_sockptr_offset_with_ptr(sockptr_t dst, size_t offset, + const void *src, size_t size) +{ + if (!sockptr_is_kernel(dst)) + return copy_to_user_with_ptr(dst.user + offset, src, size); + memcpy(dst.kernel + offset, src, size); + return 0; +} + +static inline int copy_to_sockptr_with_ptr(sockptr_t dst, const void *src, size_t size) +{ + return copy_to_sockptr_offset_with_ptr(dst, 0, src, size); +} + static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) {
On 27/09/2023 14:27, Tudor Cretu wrote:
From: Harry Ramsey harry.ramsey@arm.com
Introduce variants of copy_{from,to}_sockptr that will make use of copy_{from,to}_user_with_ptr as its actual copying routine, in order to preserve capability tags throughout the process.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com
include/linux/sockptr.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+)
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f..fd270378e8c8 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -41,6 +41,20 @@ static inline bool sockptr_is_null(sockptr_t sockptr) return !sockptr.user; } +static inline int copy_from_sockptr_offset_with_ptr(void *dst, sockptr_t src,
size_t offset, size_t size)
+{
- if (!sockptr_is_kernel(src))
return copy_from_user_with_ptr(dst, src.user + offset, size);
- memcpy(dst, src.kernel + offset, size);
- return 0;
+}
+static inline int copy_from_sockptr_with_ptr(void *dst, sockptr_t src, size_t size) +{
- return copy_from_sockptr_offset_with_ptr(dst, src, 0, size);
+}
Nit: would make more sense to add the new variants after the original ones, rather than before.
Kevin
static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) { @@ -55,6 +69,20 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) return copy_from_sockptr_offset(dst, src, 0, size); } +static inline int copy_to_sockptr_offset_with_ptr(sockptr_t dst, size_t offset,
const void *src, size_t size)
+{
- if (!sockptr_is_kernel(dst))
return copy_to_user_with_ptr(dst.user + offset, src, size);
- memcpy(dst.kernel + offset, src, size);
- return 0;
+}
+static inline int copy_to_sockptr_with_ptr(sockptr_t dst, const void *src, size_t size) +{
- return copy_to_sockptr_offset_with_ptr(dst, 0, src, size);
+}
static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) {
From: Harry Ramsey harry.ramsey@arm.com
Introduce a compat64 version of the struct tcp_zerocopy_receive structure. Also, implement helper functions that convert between the compat64 version and the native version of the struct.
A subsequent patch is going to change the struct tcp_zerocopy_receive to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat64 tasks.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com --- net/ipv4/tcp.c | 124 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 103 insertions(+), 21 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90526625671b..1459ebf810d8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1940,6 +1940,92 @@ static int find_next_mappable_frag(const skb_frag_t *frag, return offset; }
+struct compat_tcp_zerocopy_receive { + __u64 address; + __u32 length; + __u32 recv_skip_hint; + __u32 inq; + __s32 err; + __u64 copybuf_address; + __s32 copybuf_len; + __u32 flags; + __u64 msg_control; + __u64 msg_controllen; + __u32 msg_flags; + __u32 reserved; +}; + +static inline bool in_compat64(void) +{ + return IS_ENABLED(CONFIG_COMPAT64) && in_compat_syscall(); +} + +static int get_compat64_tcp_zerocopy_receive(struct tcp_zerocopy_receive *zc, + sockptr_t src, size_t size) +{ + struct compat_tcp_zerocopy_receive compat_zc = {}; + + if (copy_from_sockptr(&compat_zc, src, size)) + return -EFAULT; + + zc->address = compat_zc.address; + zc->length = compat_zc.length; + zc->recv_skip_hint = compat_zc.recv_skip_hint; + zc->inq = compat_zc.inq; + zc->err = compat_zc.err; + zc->copybuf_address = compat_zc.copybuf_address; + zc->copybuf_len = compat_zc.copybuf_len; + zc->flags = compat_zc.flags; + zc->msg_control = compat_zc.msg_control; + zc->msg_controllen = compat_zc.msg_controllen; + zc->msg_flags = compat_zc.msg_flags; + zc->reserved = compat_zc.reserved; + return 0; +} + +static int copy_tcp_zerocopy_receive_from_sockptr(struct tcp_zerocopy_receive *zc, + sockptr_t src, size_t size) +{ + if (in_compat64()) + return get_compat64_tcp_zerocopy_receive(zc, src, size); + if (copy_from_sockptr(zc, src, size)) + return -EFAULT; + return 0; +} + +static int set_compat64_tcp_zerocopy_receive(sockptr_t dst, + struct tcp_zerocopy_receive *zc, + size_t size) +{ + struct compat_tcp_zerocopy_receive compat_zc = { + .address = zc->address, + .length = zc->length, + .recv_skip_hint = zc->recv_skip_hint, + .inq = zc->inq, + .err = zc->err, + .copybuf_address = zc->copybuf_address, + .copybuf_len = zc->copybuf_len, + .flags = zc->flags, + .msg_control = zc->msg_control, + .msg_controllen = zc->msg_controllen, + .msg_flags = zc->msg_flags, + .reserved = zc->reserved, + }; + + return copy_to_sockptr(dst, &compat_zc, size); +} + +static int copy_tcp_zerocopy_receive_to_sockptr(sockptr_t dst, + struct tcp_zerocopy_receive *zc, + size_t size) +{ + if (in_compat64()) + return set_compat64_tcp_zerocopy_receive(dst, zc, size); + if (copy_to_sockptr(dst, zc, size)) + return -EFAULT; + return 0; +} + static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset) @@ -4313,26 +4399,31 @@ int do_tcp_getsockopt(struct sock *sk, int level, return 0; } #ifdef CONFIG_MMU +#define offsetofend_tcp_zerocopy_receive(member) \ + (in_compat64() ? offsetofend(struct compat_tcp_zerocopy_receive, member) \ + : offsetofend(struct tcp_zerocopy_receive, member)) case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; + size_t zc_size = in_compat64() ? sizeof(struct compat_tcp_zerocopy_receive) + : sizeof(struct tcp_zerocopy_receive);
if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || - len < offsetofend(struct tcp_zerocopy_receive, length)) + len < offsetofend_tcp_zerocopy_receive(length)) return -EINVAL; - if (unlikely(len > sizeof(zc))) { - err = check_zeroed_sockptr(optval, sizeof(zc), - len - sizeof(zc)); + if (unlikely(len > zc_size)) { + err = check_zeroed_sockptr(optval, zc_size, + len - zc_size); if (err < 1) return err == 0 ? -EINVAL : err; - len = sizeof(zc); + len = zc_size; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } - if (copy_from_sockptr(&zc, optval, len)) + if (copy_tcp_zerocopy_receive_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; @@ -4343,24 +4434,14 @@ int do_tcp_getsockopt(struct sock *sk, int level, err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); sockopt_release_sock(sk); - if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) + if (len >= offsetofend_tcp_zerocopy_receive(msg_flags)) goto zerocopy_rcv_cmsg; - switch (len) { - case offsetofend(struct tcp_zerocopy_receive, msg_flags): - goto zerocopy_rcv_cmsg; - case offsetofend(struct tcp_zerocopy_receive, msg_controllen): - case offsetofend(struct tcp_zerocopy_receive, msg_control): - case offsetofend(struct tcp_zerocopy_receive, flags): - case offsetofend(struct tcp_zerocopy_receive, copybuf_len): - case offsetofend(struct tcp_zerocopy_receive, copybuf_address): - case offsetofend(struct tcp_zerocopy_receive, err): + else if (len >= offsetofend_tcp_zerocopy_receive(err)) goto zerocopy_rcv_sk_err; - case offsetofend(struct tcp_zerocopy_receive, inq): + else if (len >= offsetofend_tcp_zerocopy_receive(inq)) goto zerocopy_rcv_inq; - case offsetofend(struct tcp_zerocopy_receive, length): - default: + else goto zerocopy_rcv_out; - } zerocopy_rcv_cmsg: if (zc.msg_flags & TCP_CMSG_TS) tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); @@ -4372,10 +4453,11 @@ int do_tcp_getsockopt(struct sock *sk, int level, zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: - if (!err && copy_to_sockptr(optval, &zc, len)) + if (!err && copy_tcp_zerocopy_receive_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } +#undef offsetofend_tcp_zerocopy_receive #endif default: return -ENOPROTOOPT;
From: Harry Ramsey harry.ramsey@arm.com
The tcp_zerocopy_receive struct is used to create a shared memory region between userspace and the kernel in which received network packets are directly copied from the network interface card.
In the PCuABI, a user pointer is a 129-bit capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others. Additionally, use the special copy routine when copying user pointers from and to userspace.
An additional check has been added to ensure that the capability has ownership on the mapping backed by the TCP socket fd to receive network packets.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com --- include/uapi/linux/tcp.h | 10 +++++----- net/ipv4/tcp.c | 40 +++++++++++++++++++++++----------------- 2 files changed, 28 insertions(+), 22 deletions(-)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 879eeb0a084b..4da689910e0e 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -352,15 +352,15 @@ struct tcp_diag_md5sig {
#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 struct tcp_zerocopy_receive { - __u64 address; /* in: address of mapping */ - __u32 length; /* in/out: number of bytes to map/mapped */ - __u32 recv_skip_hint; /* out: amount of bytes to skip */ + __kernel_uintptr_t address; /* in: address of mapping */ + __u32 length; /* in/out: number of bytes to map/mapped */ + __u32 recv_skip_hint; /* out: amount of bytes to skip */ __u32 inq; /* out: amount of bytes in read queue */ __s32 err; /* out: socket error */ - __u64 copybuf_address; /* in: copybuf address (small reads) */ + __kernel_uintptr_t copybuf_address; /* in: copybuf address (small reads) */ __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ __u32 flags; /* in: flags */ - __u64 msg_control; /* ancillary data */ + __kernel_uintptr_t msg_control; /* ancillary data */ __u64 msg_controllen; __u32 msg_flags; __u32 reserved; /* set to 0 for now */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1459ebf810d8..d63efa85198b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1968,15 +1968,15 @@ static int get_compat64_tcp_zerocopy_receive(struct tcp_zerocopy_receive *zc, if (copy_from_sockptr(&compat_zc, src, size)) return -EFAULT;
- zc->address = compat_zc.address; + zc->address = (__kernel_uintptr_t)compat_ptr(compat_zc.address); zc->length = compat_zc.length; zc->recv_skip_hint = compat_zc.recv_skip_hint; zc->inq = compat_zc.inq; zc->err = compat_zc.err; - zc->copybuf_address = compat_zc.copybuf_address; + zc->copybuf_address = (__kernel_uintptr_t)compat_ptr(compat_zc.copybuf_address); zc->copybuf_len = compat_zc.copybuf_len; zc->flags = compat_zc.flags; - zc->msg_control = compat_zc.msg_control; + zc->msg_control = (__kernel_uintptr_t)compat_ptr(compat_zc.msg_control); zc->msg_controllen = compat_zc.msg_controllen; zc->msg_flags = compat_zc.msg_flags; zc->reserved = compat_zc.reserved; @@ -1988,7 +1988,7 @@ static int copy_tcp_zerocopy_receive_from_sockptr(struct tcp_zerocopy_receive *z { if (in_compat64()) return get_compat64_tcp_zerocopy_receive(zc, src, size); - if (copy_from_sockptr(zc, src, size)) + if (copy_from_sockptr_with_ptr(zc, src, size)) return -EFAULT; return 0; } @@ -1998,15 +1998,15 @@ static int set_compat64_tcp_zerocopy_receive(sockptr_t dst, size_t size) { struct compat_tcp_zerocopy_receive compat_zc = { - .address = zc->address, + .address = (__u64)zc->address, .length = zc->length, .recv_skip_hint = zc->recv_skip_hint, .inq = zc->inq, .err = zc->err, - .copybuf_address = zc->copybuf_address, + .copybuf_address = (__u64)zc->copybuf_address, .copybuf_len = zc->copybuf_len, .flags = zc->flags, - .msg_control = zc->msg_control, + .msg_control = (__u64)zc->msg_control, .msg_controllen = zc->msg_controllen, .msg_flags = zc->msg_flags, .reserved = zc->reserved, @@ -2021,7 +2021,7 @@ static int copy_tcp_zerocopy_receive_to_sockptr(sockptr_t dst, { if (in_compat64()) return set_compat64_tcp_zerocopy_receive(dst, zc, size); - if (copy_to_sockptr(dst, zc, size)) + if (copy_to_sockptr_with_ptr(dst, zc, size)) return -EFAULT; return 0; } @@ -2070,7 +2070,7 @@ static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) { - unsigned long copy_address = (unsigned long)zc->copybuf_address; + user_uintptr_t copy_address = (user_uintptr_t)zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; @@ -2081,7 +2081,7 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL;
- err = import_single_range(ITER_DEST, uaddr_to_user_ptr(copy_address), + err = import_single_range(ITER_DEST, (void __user *)copy_address, inq, &iov, &msg.msg_iter); if (err) return err; @@ -2107,7 +2107,7 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) { - unsigned long copy_address = (unsigned long)zc->copybuf_address; + user_uintptr_t copy_address = (user_uintptr_t)zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; @@ -2115,7 +2115,7 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, if (copy_address != zc->copybuf_address) return -EINVAL;
- err = import_single_range(ITER_DEST, uaddr_to_user_ptr(copy_address), + err = import_single_range(ITER_DEST, (void __user *)copy_address, copylen, &iov, &msg.msg_iter); if (err) return err; @@ -2240,11 +2240,11 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { - unsigned long msg_control_addr; + user_uintptr_t msg_control_addr; struct msghdr cmsg_dummy;
- msg_control_addr = (unsigned long)zc->msg_control; - cmsg_dummy.msg_control_user = uaddr_to_user_ptr(msg_control_addr); + msg_control_addr = zc->msg_control; + cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall() @@ -2254,8 +2254,8 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss); - zc->msg_control = (__u64) - user_ptr_addr(cmsg_dummy.msg_control_user); + zc->msg_control = + (__kernel_uintptr_t)cmsg_dummy.msg_control_user; zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags; @@ -2303,6 +2303,12 @@ static int tcp_zerocopy_receive(struct sock *sk, return 0; }
+#ifdef CONFIG_CHERI_PURECAP_UABI + /* Check that the pointer has ownership of the mapping */ + if (!cheri_check_cap((void __user *)zc->address, zc->length, CHERI_PERM_SW_VMEM)) + return -EFAULT; +#endif + mmap_read_lock(current->mm);
vma = vma_lookup(current->mm, address);
On 27/09/2023 14:27, Tudor Cretu wrote:
+#ifdef CONFIG_CHERI_PURECAP_UABI
- /* Check that the pointer has ownership of the mapping */
- if (!cheri_check_cap((void __user *)zc->address, zc->length, CHERI_PERM_SW_VMEM))
return -EFAULT;
It's really a borderline situation, but I would say let's make it -EINVAL. That would be more consistent with other syscalls that require VMem, and maybe more importantly it feels "less surprising" than failing with -EFAULT on a permission that is unrelated to memory accesses.
Kevin
linux-morello@op-lists.linaro.org