Next version of Harry's tcp_zerocopy_receive series. It aims to enable purecap applications to utilise tcp_zerocopy_receive to map packets directly from a network interface card into a shared space with the process and kernel.
v2..v3: - Fix the split between compat enablement and capability support - Reorganise comments in struct tcp_zerocopy_receive - Fix the order of arguments for the *_to_sockptr functions - Change a few variables to user_uintptr_t as they were originally unsigned long - Remove unnecessary checks - Fix cast warnings - Add compat handling for offsetofend - Move a check_user_ptr closer to where its metadata is dropped - Format misc nits
Gitlab Issue: https://git.morello-project.org/morello/kernel/linux/-/issues/46
Review branch: https://git.morello-project.org/tudcre01/linux/-/commits/morello/tcp_v3
Cc: Harry Ramsey harry.ramsey@arm.com
Harry Ramsey (2): tcp: Implement compat handling for struct tcp_zerocopy_receive tcp: Support capabilities in tcp_zerocopy_receive
include/linux/sockptr.h | 28 +++++++ include/uapi/linux/tcp.h | 10 +-- net/ipv4/tcp.c | 157 +++++++++++++++++++++++++++++++-------- 3 files changed, 157 insertions(+), 38 deletions(-)
From: Harry Ramsey harry.ramsey@arm.com
Introduce a compat version of the struct tcp_zerocopy_receive structure. Also, implement helper functions that convert between the compat version and the native version of the struct.
A subsequent patch is going to change the struct tcp_zerocopy_receive to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com --- net/ipv4/tcp.c | 130 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 109 insertions(+), 21 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90526625671b..8690ce837cfb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1940,6 +1940,100 @@ static int find_next_mappable_frag(const skb_frag_t *frag, return offset; }
+struct compat_tcp_zerocopy_receive { + __u64 address; + __u32 length; + __u32 recv_skip_hint; + __u32 inq; + __s32 err; + __u64 copybuf_address; + __s32 copybuf_len; + __u32 flags; + __u64 msg_control; + __u64 msg_controllen; + __u32 msg_flags; + __u32 reserved; +}; + +static inline bool in_compat64(void) +{ + return IS_ENABLED(CONFIG_COMPAT64) && in_compat_syscall(); +} + +static inline size_t tcp_zerocopy_receive_size(void) +{ + if (in_compat64()) + return sizeof(struct compat_tcp_zerocopy_receive); + + return sizeof(struct tcp_zerocopy_receive); +} + +static int get_compat64_tcp_zerocopy_receive(struct tcp_zerocopy_receive *zc, + sockptr_t src, size_t size) +{ + struct compat_tcp_zerocopy_receive compat_zc = {}; + + if (copy_from_sockptr(&compat_zc, src, size)) + return -EFAULT; + + zc->address = compat_zc.address; + zc->length = compat_zc.length; + zc->recv_skip_hint = compat_zc.recv_skip_hint; + zc->inq = compat_zc.inq; + zc->err = compat_zc.err; + zc->copybuf_address = compat_zc.copybuf_address; + zc->copybuf_len = compat_zc.copybuf_len; + zc->flags = compat_zc.flags; + zc->msg_control = compat_zc.msg_control; + zc->msg_controllen = compat_zc.msg_controllen; + zc->msg_flags = compat_zc.msg_flags; + zc->reserved = compat_zc.reserved; + return 0; +} + +static int copy_tcp_zerocopy_receive_from_sockptr(struct tcp_zerocopy_receive *zc, + sockptr_t src, size_t size) +{ + if (in_compat64()) + return get_compat64_tcp_zerocopy_receive(zc, src, size); + if (copy_from_sockptr(zc, src, size)) + return -EFAULT; + return 0; +} + +static int set_compat64_tcp_zerocopy_receive(sockptr_t dst, + struct tcp_zerocopy_receive *zc, + size_t size) +{ + struct compat_tcp_zerocopy_receive compat_zc = {}; + + compat_zc.address = zc->address; + compat_zc.length = zc->length; + compat_zc.recv_skip_hint = zc->recv_skip_hint; + compat_zc.inq = zc->inq; + compat_zc.err = zc->err; + compat_zc.copybuf_address = zc->copybuf_address; + compat_zc.copybuf_len = zc->copybuf_len; + compat_zc.flags = zc->flags; + compat_zc.msg_control = zc->msg_control; + compat_zc.msg_controllen = zc->msg_controllen; + compat_zc.msg_flags = zc->msg_flags; + compat_zc.reserved = zc->reserved; + + return copy_to_sockptr(dst, &compat_zc, size); +} + +static int copy_tcp_zerocopy_receive_to_sockptr(sockptr_t dst, + struct tcp_zerocopy_receive *zc, + size_t size) +{ + if (in_compat64()) + return set_compat64_tcp_zerocopy_receive(dst, zc, size); + if (copy_to_sockptr(dst, zc, size)) + return -EFAULT; + return 0; +} + static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset) @@ -4313,6 +4407,9 @@ int do_tcp_getsockopt(struct sock *sk, int level, return 0; } #ifdef CONFIG_MMU +#define offsetofend_tcp_zerocopy_receive(member) \ + (in_compat64() ? offsetofend(struct compat_tcp_zerocopy_receive, member) \ + : offsetofend(struct tcp_zerocopy_receive, member)) case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; @@ -4321,18 +4418,18 @@ int do_tcp_getsockopt(struct sock *sk, int level, if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || - len < offsetofend(struct tcp_zerocopy_receive, length)) + len < offsetofend_tcp_zerocopy_receive(length)) return -EINVAL; - if (unlikely(len > sizeof(zc))) { - err = check_zeroed_sockptr(optval, sizeof(zc), - len - sizeof(zc)); + if (unlikely(len > tcp_zerocopy_receive_size())) { + err = check_zeroed_sockptr(optval, tcp_zerocopy_receive_size(), + len - tcp_zerocopy_receive_size()); if (err < 1) return err == 0 ? -EINVAL : err; - len = sizeof(zc); + len = tcp_zerocopy_receive_size(); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } - if (copy_from_sockptr(&zc, optval, len)) + if (copy_tcp_zerocopy_receive_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; @@ -4343,24 +4440,14 @@ int do_tcp_getsockopt(struct sock *sk, int level, err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); sockopt_release_sock(sk); - if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) - goto zerocopy_rcv_cmsg; - switch (len) { - case offsetofend(struct tcp_zerocopy_receive, msg_flags): + if (len >= offsetofend_tcp_zerocopy_receive(msg_flags)) goto zerocopy_rcv_cmsg; - case offsetofend(struct tcp_zerocopy_receive, msg_controllen): - case offsetofend(struct tcp_zerocopy_receive, msg_control): - case offsetofend(struct tcp_zerocopy_receive, flags): - case offsetofend(struct tcp_zerocopy_receive, copybuf_len): - case offsetofend(struct tcp_zerocopy_receive, copybuf_address): - case offsetofend(struct tcp_zerocopy_receive, err): + else if (len >= offsetofend_tcp_zerocopy_receive(err)) goto zerocopy_rcv_sk_err; - case offsetofend(struct tcp_zerocopy_receive, inq): + else if (len >= offsetofend_tcp_zerocopy_receive(inq)) goto zerocopy_rcv_inq; - case offsetofend(struct tcp_zerocopy_receive, length): - default: + else goto zerocopy_rcv_out; - } zerocopy_rcv_cmsg: if (zc.msg_flags & TCP_CMSG_TS) tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); @@ -4372,10 +4459,11 @@ int do_tcp_getsockopt(struct sock *sk, int level, zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: - if (!err && copy_to_sockptr(optval, &zc, len)) + if (!err && copy_tcp_zerocopy_receive_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } +#undef offsetofend_tcp_zerocopy_receive #endif default: return -ENOPROTOOPT;
On 22/09/2023 14:07, Tudor Cretu wrote:
From: Harry Ramsey harry.ramsey@arm.com
Introduce a compat version of the struct tcp_zerocopy_receive structure. Also, implement helper functions that convert between the compat version and the native version of the struct.
A subsequent patch is going to change the struct tcp_zerocopy_receive to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
I would replace "compat" with "compat64" everywhere to avoid any confusion.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com
net/ipv4/tcp.c | 130 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 109 insertions(+), 21 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90526625671b..8690ce837cfb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1940,6 +1940,100 @@ static int find_next_mappable_frag(const skb_frag_t *frag, return offset; } +struct compat_tcp_zerocopy_receive {
- __u64 address;
- __u32 length;
- __u32 recv_skip_hint;
- __u32 inq;
- __s32 err;
- __u64 copybuf_address;
- __s32 copybuf_len;
- __u32 flags;
- __u64 msg_control;
- __u64 msg_controllen;
- __u32 msg_flags;
- __u32 reserved;
+};
+static inline bool in_compat64(void) +{
- return IS_ENABLED(CONFIG_COMPAT64) && in_compat_syscall();
+}
+static inline size_t tcp_zerocopy_receive_size(void) +{
- if (in_compat64())
return sizeof(struct compat_tcp_zerocopy_receive);
- return sizeof(struct tcp_zerocopy_receive);
+}
Considering this is used in only one spot, we don't really need a function - just set a local variable to that value there. Bonus point is that it improves readability (avoids calling 4 times a mysterious function that takes no argument).
+static int get_compat64_tcp_zerocopy_receive(struct tcp_zerocopy_receive *zc,
sockptr_t src, size_t size)
+{
- struct compat_tcp_zerocopy_receive compat_zc = {};
- if (copy_from_sockptr(&compat_zc, src, size))
return -EFAULT;
- zc->address = compat_zc.address;
- zc->length = compat_zc.length;
- zc->recv_skip_hint = compat_zc.recv_skip_hint;
- zc->inq = compat_zc.inq;
- zc->err = compat_zc.err;
- zc->copybuf_address = compat_zc.copybuf_address;
- zc->copybuf_len = compat_zc.copybuf_len;
- zc->flags = compat_zc.flags;
- zc->msg_control = compat_zc.msg_control;
- zc->msg_controllen = compat_zc.msg_controllen;
- zc->msg_flags = compat_zc.msg_flags;
- zc->reserved = compat_zc.reserved;
- return 0;
+}
+static int copy_tcp_zerocopy_receive_from_sockptr(struct tcp_zerocopy_receive *zc,
sockptr_t src, size_t size)
+{
- if (in_compat64())
return get_compat64_tcp_zerocopy_receive(zc, src, size);
- if (copy_from_sockptr(zc, src, size))
return -EFAULT;
- return 0;
+}
+static int set_compat64_tcp_zerocopy_receive(sockptr_t dst,
struct tcp_zerocopy_receive *zc,
size_t size)
+{
- struct compat_tcp_zerocopy_receive compat_zc = {};
- compat_zc.address = zc->address;
- compat_zc.length = zc->length;
- compat_zc.recv_skip_hint = zc->recv_skip_hint;
- compat_zc.inq = zc->inq;
- compat_zc.err = zc->err;
- compat_zc.copybuf_address = zc->copybuf_address;
- compat_zc.copybuf_len = zc->copybuf_len;
- compat_zc.flags = zc->flags;
- compat_zc.msg_control = zc->msg_control;
- compat_zc.msg_controllen = zc->msg_controllen;
- compat_zc.msg_flags = zc->msg_flags;
- compat_zc.reserved = zc->reserved;
In this specific case we can make this more compact by directly initialising the struct:
struct ... compat_zc = { .address = zc->address, ... };
A small bonus is that this way we don't zero-init first, though in practice I expect the compiler to optimise out.
Kevin
From: Harry Ramsey harry.ramsey@arm.com
The tcp_zerocopy_receive struct is used to create a shared memory region between userspace and the kernel in which received network packets are directly copied from the network interface card.
In the PCuABI, a user pointer is a 129-bit capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others. Additionally, use the special copy routine when copying user pointers from and to userspace.
An additional check has been added to ensure that the owning capability has read access in order to read the received network packets from the shared memory region.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com --- include/linux/sockptr.h | 28 ++++++++++++++++++++++++++ include/uapi/linux/tcp.h | 10 +++++----- net/ipv4/tcp.c | 43 +++++++++++++++++++++------------------- 3 files changed, 56 insertions(+), 25 deletions(-)
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f..fd270378e8c8 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -41,6 +41,20 @@ static inline bool sockptr_is_null(sockptr_t sockptr) return !sockptr.user; }
+static inline int copy_from_sockptr_offset_with_ptr(void *dst, sockptr_t src, + size_t offset, size_t size) +{ + if (!sockptr_is_kernel(src)) + return copy_from_user_with_ptr(dst, src.user + offset, size); + memcpy(dst, src.kernel + offset, size); + return 0; +} + +static inline int copy_from_sockptr_with_ptr(void *dst, sockptr_t src, size_t size) +{ + return copy_from_sockptr_offset_with_ptr(dst, src, 0, size); +} + static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) { @@ -55,6 +69,20 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) return copy_from_sockptr_offset(dst, src, 0, size); }
+static inline int copy_to_sockptr_offset_with_ptr(sockptr_t dst, size_t offset, + const void *src, size_t size) +{ + if (!sockptr_is_kernel(dst)) + return copy_to_user_with_ptr(dst.user + offset, src, size); + memcpy(dst.kernel + offset, src, size); + return 0; +} + +static inline int copy_to_sockptr_with_ptr(sockptr_t dst, const void *src, size_t size) +{ + return copy_to_sockptr_offset_with_ptr(dst, 0, src, size); +} + static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 879eeb0a084b..4da689910e0e 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -352,15 +352,15 @@ struct tcp_diag_md5sig {
#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 struct tcp_zerocopy_receive { - __u64 address; /* in: address of mapping */ - __u32 length; /* in/out: number of bytes to map/mapped */ - __u32 recv_skip_hint; /* out: amount of bytes to skip */ + __kernel_uintptr_t address; /* in: address of mapping */ + __u32 length; /* in/out: number of bytes to map/mapped */ + __u32 recv_skip_hint; /* out: amount of bytes to skip */ __u32 inq; /* out: amount of bytes in read queue */ __s32 err; /* out: socket error */ - __u64 copybuf_address; /* in: copybuf address (small reads) */ + __kernel_uintptr_t copybuf_address; /* in: copybuf address (small reads) */ __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ __u32 flags; /* in: flags */ - __u64 msg_control; /* ancillary data */ + __kernel_uintptr_t msg_control; /* ancillary data */ __u64 msg_controllen; __u32 msg_flags; __u32 reserved; /* set to 0 for now */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8690ce837cfb..48a5ba9ebec4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1976,15 +1976,15 @@ static int get_compat64_tcp_zerocopy_receive(struct tcp_zerocopy_receive *zc, if (copy_from_sockptr(&compat_zc, src, size)) return -EFAULT;
- zc->address = compat_zc.address; + zc->address = (__kernel_uintptr_t)compat_ptr(compat_zc.address); zc->length = compat_zc.length; zc->recv_skip_hint = compat_zc.recv_skip_hint; zc->inq = compat_zc.inq; zc->err = compat_zc.err; - zc->copybuf_address = compat_zc.copybuf_address; + zc->copybuf_address = (__kernel_uintptr_t)compat_ptr(compat_zc.copybuf_address); zc->copybuf_len = compat_zc.copybuf_len; zc->flags = compat_zc.flags; - zc->msg_control = compat_zc.msg_control; + zc->msg_control = (__kernel_uintptr_t)compat_ptr(compat_zc.msg_control); zc->msg_controllen = compat_zc.msg_controllen; zc->msg_flags = compat_zc.msg_flags; zc->reserved = compat_zc.reserved; @@ -1996,7 +1996,7 @@ static int copy_tcp_zerocopy_receive_from_sockptr(struct tcp_zerocopy_receive *z { if (in_compat64()) return get_compat64_tcp_zerocopy_receive(zc, src, size); - if (copy_from_sockptr(zc, src, size)) + if (copy_from_sockptr_with_ptr(zc, src, size)) return -EFAULT; return 0; } @@ -2007,15 +2007,15 @@ static int set_compat64_tcp_zerocopy_receive(sockptr_t dst, { struct compat_tcp_zerocopy_receive compat_zc = {};
- compat_zc.address = zc->address; + compat_zc.address = (__u64)zc->address; compat_zc.length = zc->length; compat_zc.recv_skip_hint = zc->recv_skip_hint; compat_zc.inq = zc->inq; compat_zc.err = zc->err; - compat_zc.copybuf_address = zc->copybuf_address; + compat_zc.copybuf_address = (__u64)zc->copybuf_address; compat_zc.copybuf_len = zc->copybuf_len; compat_zc.flags = zc->flags; - compat_zc.msg_control = zc->msg_control; + compat_zc.msg_control = (__u64)zc->msg_control; compat_zc.msg_controllen = zc->msg_controllen; compat_zc.msg_flags = zc->msg_flags; compat_zc.reserved = zc->reserved; @@ -2029,7 +2029,7 @@ static int copy_tcp_zerocopy_receive_to_sockptr(sockptr_t dst, { if (in_compat64()) return set_compat64_tcp_zerocopy_receive(dst, zc, size); - if (copy_to_sockptr(dst, zc, size)) + if (copy_to_sockptr_with_ptr(dst, zc, size)) return -EFAULT; return 0; } @@ -2078,7 +2078,7 @@ static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) { - unsigned long copy_address = (unsigned long)zc->copybuf_address; + user_uintptr_t copy_address = (user_uintptr_t)zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; @@ -2089,7 +2089,7 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL;
- err = import_single_range(ITER_DEST, uaddr_to_user_ptr(copy_address), + err = import_single_range(ITER_DEST, (void __user *)copy_address, inq, &iov, &msg.msg_iter); if (err) return err; @@ -2115,7 +2115,7 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) { - unsigned long copy_address = (unsigned long)zc->copybuf_address; + user_uintptr_t copy_address = (user_uintptr_t)zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; @@ -2123,7 +2123,7 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, if (copy_address != zc->copybuf_address) return -EINVAL;
- err = import_single_range(ITER_DEST, uaddr_to_user_ptr(copy_address), + err = import_single_range(ITER_DEST, (void __user *)copy_address, copylen, &iov, &msg.msg_iter); if (err) return err; @@ -2166,7 +2166,7 @@ static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, unsigned long pages_remaining, - unsigned long *address, + __kernel_uintptr_t *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, @@ -2214,7 +2214,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, unsigned int pages_to_map, - unsigned long *address, + __kernel_uintptr_t *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, @@ -2248,11 +2248,11 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { - unsigned long msg_control_addr; + __kernel_uintptr_t msg_control_addr; struct msghdr cmsg_dummy;
- msg_control_addr = (unsigned long)zc->msg_control; - cmsg_dummy.msg_control_user = uaddr_to_user_ptr(msg_control_addr); + msg_control_addr = zc->msg_control; + cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall() @@ -2262,8 +2262,8 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss); - zc->msg_control = (__u64) - user_ptr_addr(cmsg_dummy.msg_control_user); + zc->msg_control = + (__kernel_uintptr_t)cmsg_dummy.msg_control_user; zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags; @@ -2276,7 +2276,7 @@ static int tcp_zerocopy_receive(struct sock *sk, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; - unsigned long address = (unsigned long)zc->address; + user_uintptr_t address = (user_uintptr_t)zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); @@ -2303,6 +2303,9 @@ static int tcp_zerocopy_receive(struct sock *sk, if (inq && inq <= copybuf_len) return receive_fallback_to_copy(sk, zc, inq, tss);
+ if (!check_user_ptr_read((void __user *)zc->address, zc->length)) + return -EFAULT; + if (inq < PAGE_SIZE) { zc->length = 0; zc->recv_skip_hint = inq;
On 22/09/2023 14:07, Tudor Cretu wrote:
From: Harry Ramsey harry.ramsey@arm.com
The tcp_zerocopy_receive struct is used to create a shared memory region between userspace and the kernel in which received network packets are directly copied from the network interface card.
In the PCuABI, a user pointer is a 129-bit capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others. Additionally, use the special copy routine when copying user pointers from and to userspace.
An additional check has been added to ensure that the owning capability has read access in order to read the received network packets from the shared memory region.
To be updated, see my last comment: this is actually a write from a userspace perspective, and we do not require an owning capability.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com
include/linux/sockptr.h | 28 ++++++++++++++++++++++++++ include/uapi/linux/tcp.h | 10 +++++----- net/ipv4/tcp.c | 43 +++++++++++++++++++++------------------- 3 files changed, 56 insertions(+), 25 deletions(-)
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f..fd270378e8c8 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -41,6 +41,20 @@ static inline bool sockptr_is_null(sockptr_t sockptr) return !sockptr.user; } +static inline int copy_from_sockptr_offset_with_ptr(void *dst, sockptr_t src,
size_t offset, size_t size)
+{
- if (!sockptr_is_kernel(src))
return copy_from_user_with_ptr(dst, src.user + offset, size);
- memcpy(dst, src.kernel + offset, size);
- return 0;
+}
+static inline int copy_from_sockptr_with_ptr(void *dst, sockptr_t src, size_t size) +{
- return copy_from_sockptr_offset_with_ptr(dst, src, 0, size);
+}
static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) { @@ -55,6 +69,20 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) return copy_from_sockptr_offset(dst, src, 0, size); } +static inline int copy_to_sockptr_offset_with_ptr(sockptr_t dst, size_t offset,
const void *src, size_t size)
+{
- if (!sockptr_is_kernel(dst))
return copy_to_user_with_ptr(dst.user + offset, src, size);
- memcpy(dst.kernel + offset, src, size);
- return 0;
+}
+static inline int copy_to_sockptr_with_ptr(sockptr_t dst, const void *src, size_t size) +{
- return copy_to_sockptr_offset_with_ptr(dst, 0, src, size);
+}
I would split this out, especially since it's not specific to TCP.
[...]
@@ -2248,11 +2248,11 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) {
- unsigned long msg_control_addr;
- __kernel_uintptr_t msg_control_addr;tcp_zerocopy_vm_insert_batch
It would make sense to use user_uintptr_t here too.
struct msghdr cmsg_dummy;
- msg_control_addr = (unsigned long)zc->msg_control;
- cmsg_dummy.msg_control_user = uaddr_to_user_ptr(msg_control_addr);
- msg_control_addr = zc->msg_control;
- cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall()
@@ -2262,8 +2262,8 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss);
zc->msg_control = (__u64)
user_ptr_addr(cmsg_dummy.msg_control_user);
zc->msg_control =
zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags;(__kernel_uintptr_t)cmsg_dummy.msg_control_user;
@@ -2276,7 +2276,7 @@ static int tcp_zerocopy_receive(struct sock *sk, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0;
- unsigned long address = (unsigned long)zc->address;
- user_uintptr_t address = (user_uintptr_t)zc->address;
There's no need to change this or tcp_zerocopy_vm_insert_batch*. Everything operates on addresses here, the check below that uses zc->address directly is sufficient.
struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); @@ -2303,6 +2303,9 @@ static int tcp_zerocopy_receive(struct sock *sk, if (inq && inq <= copybuf_len) return receive_fallback_to_copy(sk, zc, inq, tss);
- if (!check_user_ptr_read((void __user *)zc->address, zc->length))
This is a bit of a special situation as the kernel is essentially asked to throw away all pages in that range and replace them with new pages. I would argue this looks more like a write than a read - from a userspace perspective, data appears in the specified range, which is what a write from a kernel looks like regardless of the specific mechanism.
I would also tend to put that check just after the address alignment check, so that we fail reliably regardless of the state of the socket.
Kevin
return -EFAULT;
- if (inq < PAGE_SIZE) { zc->length = 0; zc->recv_skip_hint = inq;
On 26-09-2023 10:02, Kevin Brodsky wrote:
On 22/09/2023 14:07, Tudor Cretu wrote:
From: Harry Ramsey harry.ramsey@arm.com
The tcp_zerocopy_receive struct is used to create a shared memory region between userspace and the kernel in which received network packets are directly copied from the network interface card.
In the PCuABI, a user pointer is a 129-bit capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others. Additionally, use the special copy routine when copying user pointers from and to userspace.
An additional check has been added to ensure that the owning capability has read access in order to read the received network packets from the shared memory region.
To be updated, see my last comment: this is actually a write from a userspace perspective, and we do not require an owning capability.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com
include/linux/sockptr.h | 28 ++++++++++++++++++++++++++ include/uapi/linux/tcp.h | 10 +++++----- net/ipv4/tcp.c | 43 +++++++++++++++++++++------------------- 3 files changed, 56 insertions(+), 25 deletions(-)
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f..fd270378e8c8 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -41,6 +41,20 @@ static inline bool sockptr_is_null(sockptr_t sockptr) return !sockptr.user; } +static inline int copy_from_sockptr_offset_with_ptr(void *dst, sockptr_t src,
size_t offset, size_t size)
+{
- if (!sockptr_is_kernel(src))
return copy_from_user_with_ptr(dst, src.user + offset, size);
- memcpy(dst, src.kernel + offset, size);
- return 0;
+}
+static inline int copy_from_sockptr_with_ptr(void *dst, sockptr_t src, size_t size) +{
- return copy_from_sockptr_offset_with_ptr(dst, src, 0, size);
+}
- static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) {
@@ -55,6 +69,20 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) return copy_from_sockptr_offset(dst, src, 0, size); } +static inline int copy_to_sockptr_offset_with_ptr(sockptr_t dst, size_t offset,
const void *src, size_t size)
+{
- if (!sockptr_is_kernel(dst))
return copy_to_user_with_ptr(dst.user + offset, src, size);
- memcpy(dst.kernel + offset, src, size);
- return 0;
+}
+static inline int copy_to_sockptr_with_ptr(sockptr_t dst, const void *src, size_t size) +{
- return copy_to_sockptr_offset_with_ptr(dst, 0, src, size);
+}
I would split this out, especially since it's not specific to TCP.
[...]
@@ -2248,11 +2248,11 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) {
- unsigned long msg_control_addr;
- __kernel_uintptr_t msg_control_addr;tcp_zerocopy_vm_insert_batch
It would make sense to use user_uintptr_t here too.
struct msghdr cmsg_dummy;
- msg_control_addr = (unsigned long)zc->msg_control;
- cmsg_dummy.msg_control_user = uaddr_to_user_ptr(msg_control_addr);
- msg_control_addr = zc->msg_control;
- cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall()
@@ -2262,8 +2262,8 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss);
zc->msg_control = (__u64)
user_ptr_addr(cmsg_dummy.msg_control_user);
zc->msg_control =
zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags;(__kernel_uintptr_t)cmsg_dummy.msg_control_user;
@@ -2276,7 +2276,7 @@ static int tcp_zerocopy_receive(struct sock *sk, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0;
- unsigned long address = (unsigned long)zc->address;
- user_uintptr_t address = (user_uintptr_t)zc->address;
There's no need to change this or tcp_zerocopy_vm_insert_batch*. Everything operates on addresses here, the check below that uses zc->address directly is sufficient.
struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); @@ -2303,6 +2303,9 @@ static int tcp_zerocopy_receive(struct sock *sk, if (inq && inq <= copybuf_len) return receive_fallback_to_copy(sk, zc, inq, tss);
- if (!check_user_ptr_read((void __user *)zc->address, zc->length))
This is a bit of a special situation as the kernel is essentially asked to throw away all pages in that range and replace them with new pages. I would argue this looks more like a write than a read - from a userspace perspective, data appears in the specified range, which is what a write from a kernel looks like regardless of the specific mechanism.
Hi Kevin,
Thank you for the detailed review! Indeed it looked to me as well that a write happens here. AFAIK. Harry chose a `check_user_ptr_read` because the user passes a capability with only read permissions. The capability is one returned from an mmap with PROT_READ only, as also found in this kselftest [1]:
If you try to call getsockopt with zc.address equal to an mmap(..., PROT_READ | PROT_WRITE, ...), you actually get an EINVAL. So, I'm not sure what we should do, as there's no capability that would suggest that the userspace is allowed to write there.
[1] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/net/tc...
Thanks, Tudor
I would also tend to put that check just after the address alignment check, so that we fail reliably regardless of the state of the socket.
Kevin
return -EFAULT;
- if (inq < PAGE_SIZE) { zc->length = 0; zc->recv_skip_hint = inq;
On 26/09/2023 16:45, Tudor Cretu wrote:
+ if (!check_user_ptr_read((void __user *)zc->address, zc->length))
This is a bit of a special situation as the kernel is essentially asked to throw away all pages in that range and replace them with new pages. I would argue this looks more like a write than a read - from a userspace perspective, data appears in the specified range, which is what a write from a kernel looks like regardless of the specific mechanism.
Hi Kevin,
Thank you for the detailed review! Indeed it looked to me as well that a write happens here. AFAIK. Harry chose a `check_user_ptr_read` because the user passes a capability with only read permissions. The capability is one returned from an mmap with PROT_READ only, as also found in this kselftest [1]:
If you try to call getsockopt with zc.address equal to an mmap(..., PROT_READ | PROT_WRITE, ...), you actually get an EINVAL. So, I'm not sure what we should do, as there's no capability that would suggest that the userspace is allowed to write there.
Ah, thanks for pointing this out, I missed it completely. I hadn't realised that in fact we are not dealing with some arbitrary anonymous mapping, but rather one backed by a (TCP) socket fd, and I see that tcp_mmap() fully prevents mapping it as writeable. That makes sense, because otherwise userspace could corrupt these pages (which are owned by the kernel).
With this considered, I understand why a read check was chosen - there isn't any other option with our current generic interface. However, I wonder if this is sufficient. The problem is that with that approach, any pointer that allows reading (at least a page of) the zerocopy region also allows changing its contents via the appropriate getsockopt() call. As a result, such a pointer cannot be passed around (say to a non-privileged context) while retaining the guarantee that the underlying memory will be left unmodified - which one would expect from a read-only pointer.
Thus, as a very special case, I think we should consider requiring VMem here, i.e. that the capability owns the range (as Harry originally wrote in the commit message). This is not quite the semantics of VMem, but it is close enough - that getsockopt() call is more or less the same as an mmap(MAP_FIXED) overwriting (part of) the mapping. Does that make sense? We do not have a generic helper for checking this, but this may well be the only generic case where we need it so I think it would be fine to have some PCuABI-specific code there (using cheri_check_cap()).
Kevin
Hi Tudor,
On 9/22/23 13:07, Tudor Cretu wrote:
Next version of Harry's tcp_zerocopy_receive series. It aims to enable purecap applications to utilise tcp_zerocopy_receive to map packets directly from a network interface card into a shared space with the process and kernel.
Something worth noticing in this case is that you should maintain the original author and Sign-off (as per v2) and then add "Co-developed-by:" with your tag and "Signed-off-by:". This helps keeping the history of the patches and it is the way in which it is handled upstream.
e.g.:
--->8---
Author: Harry Ramsey harry.ramsey@arm.com
[...]
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com
--->8---
Please refer to [1] for further details.
The only case in which you become author of a patch in a pre-existing series is if you add the patch ex novo.
[1] https://docs.kernel.org/process/submitting-patches.html#:~:text=Co%2Ddevelop....
v2..v3:
- Fix the split between compat enablement and capability support
- Reorganise comments in struct tcp_zerocopy_receive
- Fix the order of arguments for the *_to_sockptr functions
- Change a few variables to user_uintptr_t as they were originally unsigned long
- Remove unnecessary checks
- Fix cast warnings
- Add compat handling for offsetofend
- Move a check_user_ptr closer to where its metadata is dropped
- Format misc nits
Gitlab Issue: https://git.morello-project.org/morello/kernel/linux/-/issues/46
Review branch: https://git.morello-project.org/tudcre01/linux/-/commits/morello/tcp_v3
Cc: Harry Ramsey harry.ramsey@arm.com
Harry Ramsey (2): tcp: Implement compat handling for struct tcp_zerocopy_receive tcp: Support capabilities in tcp_zerocopy_receive
include/linux/sockptr.h | 28 +++++++ include/uapi/linux/tcp.h | 10 +-- net/ipv4/tcp.c | 157 +++++++++++++++++++++++++++++++-------- 3 files changed, 157 insertions(+), 38 deletions(-)
On 22-09-2023 14:16, Vincenzo Frascino wrote:
Hi Tudor,
Hi Vincenzo,
On 9/22/23 13:07, Tudor Cretu wrote:
Next version of Harry's tcp_zerocopy_receive series. It aims to enable purecap applications to utilise tcp_zerocopy_receive to map packets directly from a network interface card into a shared space with the process and kernel.
Something worth noticing in this case is that you should maintain the original author and Sign-off (as per v2) and then add "Co-developed-by:" with your tag and "Signed-off-by:". This helps keeping the history of the patches and it is the way in which it is handled upstream.
e.g.:
--->8---
Author: Harry Ramsey harry.ramsey@arm.com
[...]
Signed-off-by: Harry Ramsey harry.ramsey@arm.com Co-developed-by: Tudor Cretu tudor.cretu@arm.com Signed-off-by: Tudor Cretu tudor.cretu@arm.com
--->8---
Please refer to [1] for further details.
The individual patches do have the format referred in [1]. If you're referring about the cover letter, this cover letter is from me, so it didn't make sense to me to use the tags here. Furthermore, the series diff at the end of this cover letter credits Harry as the author as expected.
The only case in which you become author of a patch in a pre-existing series is if you add the patch ex novo.
[1] https://docs.kernel.org/process/submitting-patches.html#:~:text=Co%2Ddevelop....
v2..v3:
- Fix the split between compat enablement and capability support
- Reorganise comments in struct tcp_zerocopy_receive
- Fix the order of arguments for the *_to_sockptr functions
- Change a few variables to user_uintptr_t as they were originally unsigned long
- Remove unnecessary checks
- Fix cast warnings
- Add compat handling for offsetofend
- Move a check_user_ptr closer to where its metadata is dropped
- Format misc nits
Gitlab Issue: https://git.morello-project.org/morello/kernel/linux/-/issues/46
Review branch: https://git.morello-project.org/tudcre01/linux/-/commits/morello/tcp_v3
Cc: Harry Ramsey harry.ramsey@arm.com
Harry Ramsey (2): tcp: Implement compat handling for struct tcp_zerocopy_receive tcp: Support capabilities in tcp_zerocopy_receive
include/linux/sockptr.h | 28 +++++++ include/uapi/linux/tcp.h | 10 +-- net/ipv4/tcp.c | 157 +++++++++++++++++++++++++++++++-------- 3 files changed, 157 insertions(+), 38 deletions(-)
Hi Tudor,
On 9/22/23 14:26, Tudor Cretu wrote:
The individual patches do have the format referred in [1]. If you're referring about the cover letter, this cover letter is from me, so it didn't make sense to me to use the tags here. Furthermore, the series diff at the end of this cover letter credits Harry as the author as expected.
You are right, apologize. For some reasons I missed that it was already there.
Thanks!
On 22-09-2023 14:36, Vincenzo Frascino wrote:
Hi Tudor,
On 9/22/23 14:26, Tudor Cretu wrote:
The individual patches do have the format referred in [1]. If you're referring about the cover letter, this cover letter is from me, so it didn't make sense to me to use the tags here. Furthermore, the series diff at the end of this cover letter credits Harry as the author as expected.
You are right, apologize. For some reasons I missed that it was already there.
No worries! I was just confused for a second.
Thanks!
Have a great weekend, take care!
linux-morello@op-lists.linaro.org