Hello,
This patch series enables purecap applications to utilise tcp_zerocopy_receive to map packets directly from a network interface card into a shared space with the process and kernel.
I have tested these changes against musl and there still exists an issue in this implementation with copybuf and potentially msg_control generating an EFAULT error.
Gitlab Issue: - https://git.morello-project.org/morello/kernel/linux/-/issues/46
Review branch: - https://git.morello-project.org/harryramsey/linux/-/commits/tcp_zerocopy
Thanks, Harry
Harry Ramsey (2): tcp: Support userspace capabilities for tcp_zerocopy_receive tcp: Implement compat version of tcp_zerocopy_receive
include/linux/sockptr.h | 28 ++++++++++ include/uapi/linux/tcp.h | 27 +++++++--- net/ipv4/tcp.c | 111 +++++++++++++++++++++++++++++++++------ 3 files changed, 145 insertions(+), 21 deletions(-)
The tcp_zerocopy_receive struct is used to create a shared memory region between userspace and the kernel in which received network packets are directly copied from the network interface card.
In PCuABI, a user pointer is a 129 bit capability, the __u64 type is not sufficient enough to store it. Instead __kernel_uinptr_t is used instead which is sufficient enough to store a capability whilst remaining 64-bits on other architectures.
Additional checks have been added to ensure that the owning capability has read access in order to read the received network packets from the shared memory region.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com --- include/linux/sockptr.h | 14 ++++++++++++++ include/uapi/linux/tcp.h | 20 ++++++++++---------- net/ipv4/tcp.c | 26 ++++++++++++++++++-------- 3 files changed, 42 insertions(+), 18 deletions(-)
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f..c81355d61d07 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -41,6 +41,20 @@ static inline bool sockptr_is_null(sockptr_t sockptr) return !sockptr.user; }
+static inline int copy_from_sockptr_offset_with_ptr(void *dst, sockptr_t src, + size_t offset, size_t size) +{ + if (!sockptr_is_kernel(src)) + return copy_from_user_with_ptr(dst, src.user + offset, size); + memcpy(dst, src.kernel + offset, size); + return 0; +} + +static inline int copy_from_sockptr_with_ptr(void *dst, sockptr_t src, size_t size) +{ + return copy_from_sockptr_offset_with_ptr(dst, src, 0, size); +} + static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 879eeb0a084b..3854b7ace73e 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -352,17 +352,17 @@ struct tcp_diag_md5sig {
#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 struct tcp_zerocopy_receive { - __u64 address; /* in: address of mapping */ - __u32 length; /* in/out: number of bytes to map/mapped */ - __u32 recv_skip_hint; /* out: amount of bytes to skip */ - __u32 inq; /* out: amount of bytes in read queue */ - __s32 err; /* out: socket error */ - __u64 copybuf_address; /* in: copybuf address (small reads) */ - __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ - __u32 flags; /* in: flags */ - __u64 msg_control; /* ancillary data */ + __kernel_uintptr_t address; /* in: address of mapping */ + __u32 length; /* in/out: number of bytes to map/mapped */ + __u32 recv_skip_hint; /* out: amount of bytes to skip */ + __u32 inq; /* out: amount of bytes in read queue */ + __s32 err; /* out: socket error */ + __kernel_uintptr_t copybuf_address; /* in: copybuf address (small reads) */ + __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ + __u32 flags; /* in: flags */ + __kernel_uintptr_t msg_control; /* ancillary data */ __u64 msg_controllen; __u32 msg_flags; - __u32 reserved; /* set to 0 for now */ + __u32 reserved; /* set to 0 for now */ }; #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90526625671b..440b00e763e6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -267,6 +267,7 @@ #include <linux/errqueue.h> #include <linux/static_key.h> #include <linux/btf.h> +#include <linux/user_ptr.h>
#include <net/icmp.h> #include <net/inet_common.h> @@ -1984,7 +1985,7 @@ static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) { - unsigned long copy_address = (unsigned long)zc->copybuf_address; + __kernel_uintptr_t copy_address = zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; @@ -1995,6 +1996,9 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL;
+ if (check_user_ptr_read((void __user *)zc->copybuf_address, zc->copybuf_len)) + return -EFAULT; + err = import_single_range(ITER_DEST, uaddr_to_user_ptr(copy_address), inq, &iov, &msg.msg_iter); if (err) @@ -2021,7 +2025,7 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) { - unsigned long copy_address = (unsigned long)zc->copybuf_address; + __kernel_uintptr_t copy_address = zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; @@ -2029,6 +2033,9 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, if (copy_address != zc->copybuf_address) return -EINVAL;
+ if (check_user_ptr_read((void __user *)zc->copybuf_address, zc->copybuf_len)) + return -EFAULT; + err = import_single_range(ITER_DEST, uaddr_to_user_ptr(copy_address), copylen, &iov, &msg.msg_iter); if (err) @@ -2072,7 +2079,7 @@ static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, unsigned long pages_remaining, - unsigned long *address, + __kernel_uintptr_t *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, @@ -2120,7 +2127,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, unsigned int pages_to_map, - unsigned long *address, + __kernel_uintptr_t *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, @@ -2154,10 +2161,10 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { - unsigned long msg_control_addr; + __kernel_uintptr_t msg_control_addr; struct msghdr cmsg_dummy;
- msg_control_addr = (unsigned long)zc->msg_control; + msg_control_addr = zc->msg_control; cmsg_dummy.msg_control_user = uaddr_to_user_ptr(msg_control_addr); cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; @@ -2182,7 +2189,7 @@ static int tcp_zerocopy_receive(struct sock *sk, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; - unsigned long address = (unsigned long)zc->address; + __kernel_uintptr_t address = zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); @@ -2198,6 +2205,9 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->copybuf_len = 0; zc->msg_flags = 0;
+ if (!check_user_ptr_read((void __user *)zc->address, zc->length)) + return -EFAULT; + if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL;
@@ -4332,7 +4342,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } - if (copy_from_sockptr(&zc, optval, len)) + if (copy_from_sockptr_with_ptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL;
On 15-09-2023 08:56, Harry Ramsey wrote:
The tcp_zerocopy_receive struct is used to create a shared memory region between userspace and the kernel in which received network packets are directly copied from the network interface card.
In PCuABI, a user pointer is a 129 bit capability, the __u64 type is not sufficient enough to store it. Instead __kernel_uinptr_t is used instead which is sufficient enough to store a capability whilst remaining 64-bits on other architectures.
Additional checks have been added to ensure that the owning capability has read access in order to read the received network packets from the shared memory region.
Well done for adding the checks as well!
Signed-off-by: Harry Ramsey harry.ramsey@arm.com
include/linux/sockptr.h | 14 ++++++++++++++ include/uapi/linux/tcp.h | 20 ++++++++++---------- net/ipv4/tcp.c | 26 ++++++++++++++++++-------- 3 files changed, 42 insertions(+), 18 deletions(-)
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f..c81355d61d07 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -41,6 +41,20 @@ static inline bool sockptr_is_null(sockptr_t sockptr) return !sockptr.user; } +static inline int copy_from_sockptr_offset_with_ptr(void *dst, sockptr_t src,
size_t offset, size_t size)
+{
- if (!sockptr_is_kernel(src))
return copy_from_user_with_ptr(dst, src.user + offset, size);
- memcpy(dst, src.kernel + offset, size);
- return 0;
+}
+static inline int copy_from_sockptr_with_ptr(void *dst, sockptr_t src, size_t size) +{
- return copy_from_sockptr_offset_with_ptr(dst, src, 0, size);
+}
- static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) {
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 879eeb0a084b..3854b7ace73e 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -352,17 +352,17 @@ struct tcp_diag_md5sig { #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 struct tcp_zerocopy_receive {
- __u64 address; /* in: address of mapping */
- __u32 length; /* in/out: number of bytes to map/mapped */
- __u32 recv_skip_hint; /* out: amount of bytes to skip */
- __u32 inq; /* out: amount of bytes in read queue */
- __s32 err; /* out: socket error */
- __u64 copybuf_address; /* in: copybuf address (small reads) */
- __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
- __u32 flags; /* in: flags */
- __u64 msg_control; /* ancillary data */
- __kernel_uintptr_t address; /* in: address of mapping */
- __u32 length; /* in/out: number of bytes to map/mapped */
- __u32 recv_skip_hint; /* out: amount of bytes to skip */
- __u32 inq; /* out: amount of bytes in read queue */
- __s32 err; /* out: socket error */
- __kernel_uintptr_t copybuf_address; /* in: copybuf address (small reads) */
- __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
- __u32 flags; /* in: flags */
- __kernel_uintptr_t msg_control; /* ancillary data */ __u64 msg_controllen; __u32 msg_flags;
- __u32 reserved; /* set to 0 for now */
- __u32 reserved; /* set to 0 for now */
nit: the situation with the comments is interesting... I am not sure if it's worth aligning them. I would especially leave the comment for `reserved` where it was so that it doesn't hinder future rebases.
}; #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90526625671b..440b00e763e6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -267,6 +267,7 @@ #include <linux/errqueue.h> #include <linux/static_key.h> #include <linux/btf.h> +#include <linux/user_ptr.h> #include <net/icmp.h> #include <net/inet_common.h> @@ -1984,7 +1985,7 @@ static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) {
- unsigned long copy_address = (unsigned long)zc->copybuf_address;
- __kernel_uintptr_t copy_address = zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err;
@@ -1995,6 +1996,9 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL;
- if (check_user_ptr_read((void __user *)zc->copybuf_address, zc->copybuf_len))
return -EFAULT;
- err = import_single_range(ITER_DEST, uaddr_to_user_ptr(copy_address), inq, &iov, &msg.msg_iter);
uaddr_to_user_ptr weakens the enforcement of the capability model as it creates a capability from an arbitrary user address. You already have a valid capability here, so don't need to to us uaddr_to_user_ptr. Ideally we get remove most uaddr_to_user_ptr instances. In this case, you already propagated the capability, so don't need to use it at all. There are 2 more instances in this file that you can just remove them. Well done!
Thanks, Tudor
if (err) @@ -2021,7 +2025,7 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) {
- unsigned long copy_address = (unsigned long)zc->copybuf_address;
- __kernel_uintptr_t copy_address = zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err;
@@ -2029,6 +2033,9 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, if (copy_address != zc->copybuf_address) return -EINVAL;
- if (check_user_ptr_read((void __user *)zc->copybuf_address, zc->copybuf_len))
return -EFAULT;
- err = import_single_range(ITER_DEST, uaddr_to_user_ptr(copy_address), copylen, &iov, &msg.msg_iter); if (err)
@@ -2072,7 +2079,7 @@ static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, unsigned long pages_remaining,
unsigned long *address,
__kernel_uintptr_t *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc,
@@ -2120,7 +2127,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, unsigned int pages_to_map,
unsigned long *address,
__kernel_uintptr_t *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc,
@@ -2154,10 +2161,10 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) {
- unsigned long msg_control_addr;
- __kernel_uintptr_t msg_control_addr; struct msghdr cmsg_dummy;
- msg_control_addr = (unsigned long)zc->msg_control;
- msg_control_addr = zc->msg_control; cmsg_dummy.msg_control_user = uaddr_to_user_ptr(msg_control_addr); cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen;
@@ -2182,7 +2189,7 @@ static int tcp_zerocopy_receive(struct sock *sk, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0;
- unsigned long address = (unsigned long)zc->address;
- __kernel_uintptr_t address = zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk);
@@ -2198,6 +2205,9 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->copybuf_len = 0; zc->msg_flags = 0;
- if (!check_user_ptr_read((void __user *)zc->address, zc->length))
return -EFAULT;
- if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL;
@@ -4332,7 +4342,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; }
if (copy_from_sockptr(&zc, optval, len))
if (zc.reserved) return -EINVAL;if (copy_from_sockptr_with_ptr(&zc, optval, len)) return -EFAULT;
This patch introduces a compat version of the tcp_zerocopy_receive structure. Subsequently additional helper functions have been added to convert between compat and purecap strucutres.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com --- include/linux/sockptr.h | 14 +++++++ include/uapi/linux/tcp.h | 15 +++++++ net/ipv4/tcp.c | 87 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 108 insertions(+), 8 deletions(-)
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index c81355d61d07..fd270378e8c8 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -69,6 +69,20 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) return copy_from_sockptr_offset(dst, src, 0, size); }
+static inline int copy_to_sockptr_offset_with_ptr(sockptr_t dst, size_t offset, + const void *src, size_t size) +{ + if (!sockptr_is_kernel(dst)) + return copy_to_user_with_ptr(dst.user + offset, src, size); + memcpy(dst.kernel + offset, src, size); + return 0; +} + +static inline int copy_to_sockptr_with_ptr(sockptr_t dst, const void *src, size_t size) +{ + return copy_to_sockptr_offset_with_ptr(dst, 0, src, size); +} + static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 3854b7ace73e..631d5f3c11fd 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -365,4 +365,19 @@ struct tcp_zerocopy_receive { __u32 msg_flags; __u32 reserved; /* set to 0 for now */ }; + +struct tcp_zerocopy_receive_compat { + __u64 address; /* in: address of mapping */ + __u32 length; /* in/out: number of bytes to map/mapped */ + __u32 recv_skip_hint; /* out: amount of bytes to skip */ + __u32 inq; /* out: amount of bytes in read queue */ + __s32 err; /* out: socket error */ + __u64 copybuf_address; /* in: copybuf address (small reads) */ + __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ + __u32 flags; /* in: flags */ + __u64 msg_control; /* ancillary data */ + __u64 msg_controllen; + __u32 msg_flags; + __u32 reserved; /* set to 0 for now */ +}; #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 440b00e763e6..f4e7d2d3922b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1941,6 +1941,59 @@ static int find_next_mappable_frag(const skb_frag_t *frag, return offset; }
+static inline bool in_compat64(void) +{ + return IS_ENABLED(CONFIG_COMPAT64) && in_compat_syscall(); +} + +static int tcp_zerocopy_receive_len(void) +{ + if (in_compat64()) + return sizeof(struct tcp_zerocopy_receive_compat); + + return sizeof(struct tcp_zerocopy_receive); +} + +static void copy_tcp_zerocopy_from_user(struct tcp_zerocopy_receive *zc, + struct tcp_zerocopy_receive_compat *zcc) +{ + if (!in_compat64()) + return; + + zc->address = (__kernel_uintptr_t)compat_ptr(zcc->address); + zc->length = zcc->length; + zc->recv_skip_hint = zcc->recv_skip_hint; + zc->inq = zcc->inq; + zc->err = zcc->err; + zc->copybuf_address = (__kernel_uintptr_t)compat_ptr(zcc->copybuf_address); + zc->copybuf_len = zcc->copybuf_len; + zc->flags = zcc->flags; + zc->msg_control = (__kernel_uintptr_t)compat_ptr(zcc->msg_control); + zc->msg_controllen = zcc->msg_controllen; + zc->msg_flags = zcc->msg_flags; + zc->reserved = zcc->reserved; +} + +static void copy_tcp_zerocopy_to_user(struct tcp_zerocopy_receive *zc, + struct tcp_zerocopy_receive_compat *zcc) +{ + if (!in_compat64()) + return; + + zcc->address = zc->address; + zcc->length = zc->length; + zcc->recv_skip_hint = zc->recv_skip_hint; + zcc->inq = zc->inq; + zcc->err = zc->err; + zcc->copybuf_address = zc->copybuf_address; + zcc->copybuf_len = zc->copybuf_len; + zcc->flags = zc->flags; + zcc->msg_control = zc->msg_control; + zcc->msg_controllen = zc->msg_controllen; + zcc->msg_flags = zc->msg_flags; + zcc->reserved = zc->reserved; +} + static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset) @@ -2234,6 +2287,7 @@ static int tcp_zerocopy_receive(struct sock *sk, mmap_read_unlock(current->mm); return -EINVAL; } + vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); @@ -4326,6 +4380,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; + struct tcp_zerocopy_receive_compat zcc = {}; int err;
if (copy_from_sockptr(&len, optlen, sizeof(int))) @@ -4333,17 +4388,24 @@ int do_tcp_getsockopt(struct sock *sk, int level, if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; - if (unlikely(len > sizeof(zc))) { - err = check_zeroed_sockptr(optval, sizeof(zc), - len - sizeof(zc)); + if (unlikely(len > tcp_zerocopy_receive_len())) { + err = check_zeroed_sockptr(optval, + tcp_zerocopy_receive_len(), + len - tcp_zerocopy_receive_len()); if (err < 1) return err == 0 ? -EINVAL : err; - len = sizeof(zc); + len = tcp_zerocopy_receive_len(); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } - if (copy_from_sockptr_with_ptr(&zc, optval, len)) - return -EFAULT; + if (in_compat64()) { + if (copy_from_sockptr(&zcc, optval, len)) + return -EFAULT; + copy_tcp_zerocopy_from_user(&zc, &zcc); + } else { + if (copy_from_sockptr_with_ptr(&zc, optval, len)) + return -EFAULT; + } if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) @@ -4382,8 +4444,17 @@ int do_tcp_getsockopt(struct sock *sk, int level, zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: - if (!err && copy_to_sockptr(optval, &zc, len)) - err = -EFAULT; + if (!err) { + if (in_compat64()) { + copy_tcp_zerocopy_to_user(&zc, &zcc); + if (copy_to_sockptr(optval, &zcc, len)) + return -EFAULT; + } else { + // convert to capability copy. + if (copy_to_sockptr_with_ptr(optval, &zc, len)) + return -EFAULT; + } + } return err; } #endif
On 15-09-2023 08:56, Harry Ramsey wrote:
This patch introduces a compat version of the tcp_zerocopy_receive structure. Subsequently additional helper functions have been added to convert between compat and purecap strucutres.
Signed-off-by: Harry Ramsey harry.ramsey@arm.com
include/linux/sockptr.h | 14 +++++++ include/uapi/linux/tcp.h | 15 +++++++ net/ipv4/tcp.c | 87 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 108 insertions(+), 8 deletions(-)
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index c81355d61d07..fd270378e8c8 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -69,6 +69,20 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) return copy_from_sockptr_offset(dst, src, 0, size); } +static inline int copy_to_sockptr_offset_with_ptr(sockptr_t dst, size_t offset,
const void *src, size_t size)
+{
- if (!sockptr_is_kernel(dst))
return copy_to_user_with_ptr(dst.user + offset, src, size);
- memcpy(dst.kernel + offset, src, size);
- return 0;
+}
+static inline int copy_to_sockptr_with_ptr(sockptr_t dst, const void *src, size_t size) +{
- return copy_to_sockptr_offset_with_ptr(dst, 0, src, size);
+}
These 2 functions belong to the commit I think.
static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 3854b7ace73e..631d5f3c11fd 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -365,4 +365,19 @@ struct tcp_zerocopy_receive { __u32 msg_flags; __u32 reserved; /* set to 0 for now */ };
+struct tcp_zerocopy_receive_compat {
Generally, compat types start with compat, so rename it to compat_tcp_zerocopy_receive.
- __u64 address; /* in: address of mapping */
- __u32 length; /* in/out: number of bytes to map/mapped */
- __u32 recv_skip_hint; /* out: amount of bytes to skip */
- __u32 inq; /* out: amount of bytes in read queue */
- __s32 err; /* out: socket error */
- __u64 copybuf_address; /* in: copybuf address (small reads) */
- __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
- __u32 flags; /* in: flags */
- __u64 msg_control; /* ancillary data */
- __u64 msg_controllen;
- __u32 msg_flags;
- __u32 reserved; /* set to 0 for now */
+};
Don't add compat structs in the uAPI header. This struct is only used in the tcp.c, so just add it locally. No need for it to be exposed to other files, especially not the userspace. Also, don't need to keep the comments for this.
#endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 440b00e763e6..f4e7d2d3922b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1941,6 +1941,59 @@ static int find_next_mappable_frag(const skb_frag_t *frag, return offset; } +static inline bool in_compat64(void) +{
- return IS_ENABLED(CONFIG_COMPAT64) && in_compat_syscall();
+}
+static int tcp_zerocopy_receive_len(void)
size_t instead of int. Also, what keeping sizeof instead of len, to not be confused with the member length of the struct tcp_zerocopy_receive. So, maybe rename it to: tcp_zerocopy_receive_size() or maybe sizeof_tcp_zerocopy_receive().
also, make it inline as well.
+{
- if (in_compat64())
return sizeof(struct tcp_zerocopy_receive_compat);
- return sizeof(struct tcp_zerocopy_receive);
+}
+static void copy_tcp_zerocopy_from_user(struct tcp_zerocopy_receive *zc,
struct tcp_zerocopy_receive_compat *zcc)
This is just converting from a compat struct to a native struct, not copying to user. I would say, rename this to something with "convert" and add another function that actually does the copy from user. Same comment applies to the "to_user" function.
+{
- if (!in_compat64())
return;
You're already checking in_compat64 before calling the function.
- zc->address = (__kernel_uintptr_t)compat_ptr(zcc->address);
- zc->length = zcc->length;
- zc->recv_skip_hint = zcc->recv_skip_hint;
- zc->inq = zcc->inq;
- zc->err = zcc->err;
- zc->copybuf_address = (__kernel_uintptr_t)compat_ptr(zcc->copybuf_address);
- zc->copybuf_len = zcc->copybuf_len;
- zc->flags = zcc->flags;
- zc->msg_control = (__kernel_uintptr_t)compat_ptr(zcc->msg_control);
- zc->msg_controllen = zcc->msg_controllen;
- zc->msg_flags = zcc->msg_flags;
- zc->reserved = zcc->reserved;
+}
+static void copy_tcp_zerocopy_to_user(struct tcp_zerocopy_receive *zc,
struct tcp_zerocopy_receive_compat *zcc)
+{
- if (!in_compat64())
return;
- zcc->address = zc->address;
- zcc->length = zc->length;
- zcc->recv_skip_hint = zc->recv_skip_hint;
- zcc->inq = zc->inq;
- zcc->err = zc->err;
- zcc->copybuf_address = zc->copybuf_address;
- zcc->copybuf_len = zc->copybuf_len;
- zcc->flags = zc->flags;
- zcc->msg_control = zc->msg_control;
- zcc->msg_controllen = zc->msg_controllen;
- zcc->msg_flags = zc->msg_flags;
- zcc->reserved = zc->reserved;
+}
- static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset)
@@ -2234,6 +2287,7 @@ static int tcp_zerocopy_receive(struct sock *sk, mmap_read_unlock(current->mm); return -EINVAL; }
Spurious newline.
vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); @@ -4326,6 +4380,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {};
struct tcp_zerocopy_receive_compat zcc = {};
You don't need to define this out here, you can define it only in the corresponding if block. This will probably be inside the function that copies tcp_zerocopy_receive from user. Also, zcc is a bit obfuscated. maybe compat_zc instead.
int err;
if (copy_from_sockptr(&len, optlen, sizeof(int))) @@ -4333,17 +4388,24 @@ int do_tcp_getsockopt(struct sock *sk, int level, if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length))
You need to take into account that it might be in_compat64() and you should return -EINVAL if len < offsetofend(struct tcp_zerocopy_receive_compat, length)
return -EINVAL;
if (unlikely(len > sizeof(zc))) {
err = check_zeroed_sockptr(optval, sizeof(zc),
len - sizeof(zc));
if (unlikely(len > tcp_zerocopy_receive_len())) {
err = check_zeroed_sockptr(optval,
tcp_zerocopy_receive_len(),
len - tcp_zerocopy_receive_len()); if (err < 1) return err == 0 ? -EINVAL : err;
len = sizeof(zc);
}len = tcp_zerocopy_receive_len(); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT;
if (copy_from_sockptr_with_ptr(&zc, optval, len))
return -EFAULT;
if (in_compat64()) {
if (copy_from_sockptr(&zcc, optval, len))
return -EFAULT;
copy_tcp_zerocopy_from_user(&zc, &zcc);
} else {
if (copy_from_sockptr_with_ptr(&zc, optval, len))
return -EFAULT;
if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))}
@@ -4382,8 +4444,17 @@ int do_tcp_getsockopt(struct sock *sk, int level, zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out:
if (!err && copy_to_sockptr(optval, &zc, len))
err = -EFAULT;
if (!err) {
if (in_compat64()) {
copy_tcp_zerocopy_to_user(&zc, &zcc);
if (copy_to_sockptr(optval, &zcc, len))
return -EFAULT;
} else {
// convert to capability copy.
Don't need this comment.
Looks good overall, just a few format nits! Well done!
Thanks, Tudor
if (copy_to_sockptr_with_ptr(optval, &zc, len))
return -EFAULT;
}
return err; } #endif}
On 15-09-2023 08:56, Harry Ramsey wrote:
Hello,
This patch series enables purecap applications to utilise tcp_zerocopy_receive to map packets directly from a network interface card into a shared space with the process and kernel.
I have tested these changes against musl and there still exists an issue in this implementation with copybuf and potentially msg_control generating an EFAULT error.
Hi Harry,
Thanks for sending this out! Let us know if you intend to continue debugging it or if you want us to pick it up.
Gitlab Issue:
Review branch:
Thanks, Harry
Harry Ramsey (2): tcp: Support userspace capabilities for tcp_zerocopy_receive tcp: Implement compat version of tcp_zerocopy_receive
The order of the commits should be reversed, so that the plain AArch64 apps are still working after every commit.
A great series, well done! Just a few nits and comments.
Many thanks, Tudor
include/linux/sockptr.h | 28 ++++++++++ include/uapi/linux/tcp.h | 27 +++++++--- net/ipv4/tcp.c | 111 +++++++++++++++++++++++++++++++++------ 3 files changed, 145 insertions(+), 21 deletions(-)
linux-morello@op-lists.linaro.org