On 17/10/2022 19:09, Tudor Cretu wrote:
Some members of the io_uring uAPI structs may contain user pointers. On some architectures (for example CHERI) a user pointer is a 129-bit
CHERI is not really an architecture per se, and in general it's better to refer to PCuABI as this is beyond a particular architecture.
capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others.
In addition, special copy routines need to be used when copying user pointers from/to userspace. Use these for the io_uring structs containing user pointers.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/io_uring.c | 268 +++++++++++++++++++++------------- include/uapi/linux/io_uring.h | 32 ++-- 2 files changed, 179 insertions(+), 121 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 2db37c68e715..9216617c1edf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -351,7 +351,7 @@ struct io_buffer_list { struct io_buffer { struct list_head list;
- __u64 addr;
- __kernel_uintptr_t addr; __u32 len; __u16 bid; __u16 bgid;
@@ -652,7 +652,7 @@ struct io_sync { struct io_cancel { struct file *file;
- u64 addr;
- __kernel_uintptr_t addr; };
struct io_timeout { @@ -668,7 +668,7 @@ struct io_timeout { struct io_timeout_rem { struct file *file;
- u64 addr;
- __kernel_uintptr_t addr;
/* timeout update */ struct timespec64 ts; @@ -679,7 +679,7 @@ struct io_timeout_rem { struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb;
- u64 addr;
- __kernel_uintptr_t addr; u32 len; u32 flags; };
@@ -752,7 +752,7 @@ struct io_splice { struct io_provide_buf { struct file *file;
- __u64 addr;
- __kernel_uintptr_t addr; __u32 len; __u32 bgid; __u16 nbufs;
@@ -814,7 +814,7 @@ struct io_hardlink { struct io_msg { struct file *file;
- u64 user_data;
- __kernel_uintptr_t user_data; u32 len;
In that specific case I'd say might as well align these two members like in all the other structs, you're already touching one line so touching one more is not a big deal.
}; @@ -1001,7 +1001,7 @@ struct io_kiocb { u16 buf_index; unsigned int flags;
- u64 user_data;
- __kernel_uintptr_t user_data; u32 result; /* fd initially, then cflags for completion */ union {
@@ -1921,7 +1921,7 @@ static inline void *io_get_cqe(struct io_ring_ctx *ctx) static inline void convert_compat_io_uring_cqe(struct io_uring_cqe *cqe, const struct compat_io_uring_cqe *compat_cqe) {
- cqe->user_data = READ_ONCE(compat_cqe->user_data);
- cqe->user_data = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_cqe->user_data));
This is entirely user-defined data so we shouldn't use compat_ptr(), we just need to store the 64-bit value and return it later as-is.
cqe->res = READ_ONCE(compat_cqe->res); cqe->flags = READ_ONCE(compat_cqe->flags); } @@ -2096,7 +2096,7 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, +static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data, s32 res, u32 cflags) { struct io_overflow_cqe *ocqe; @@ -2133,7 +2133,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, +static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data, s32 res, u32 cflags) { void *cqe; @@ -2178,7 +2178,7 @@ static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) __io_fill_cqe_req(req, res, cflags); } -static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, +static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data, s32 res, u32 cflags) { ctx->cq_extra++; @@ -3509,7 +3509,7 @@ static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, kbuf = io_buffer_select(req, len, bgid, issue_flags); if (IS_ERR(kbuf)) return ERR_USER_PTR(PTR_ERR(kbuf));
- return u64_to_user_ptr(kbuf->addr);
- return (void __user *)kbuf->addr; }
#ifdef CONFIG_COMPAT @@ -3521,7 +3521,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, void __user *buf; ssize_t len;
- uiov = u64_to_user_ptr(req->rw.addr);
- uiov = (struct compat_iovec __user *)req->rw.addr; if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(clen, &uiov->iov_len))
@@ -3542,11 +3542,11 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) {
- struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
- struct iovec __user *uiov = (struct iovec __user *)req->rw.addr; void __user *buf; ssize_t len;
- if (copy_from_user(iov, uiov, sizeof(*uiov)))
- if (copy_from_user_with_ptr(iov, uiov, sizeof(*uiov)))
I'm rather confused even by the original code: do we even use iov[0].iov_base? In other words, don't we only use the iov_len provided by the user, meaning that we don't need to preserve pointers (and that the original code copies iov_base for no reason)?
return -EFAULT;
len = iov[0].iov_len; @@ -3566,7 +3566,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, if (req->flags & REQ_F_BUFFER_SELECTED) { struct io_buffer *kbuf = req->kbuf;
iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
iov[0].iov_len = kbuf->len; return 0; }iov[0].iov_base = (void __user *)kbuf->addr;
@@ -3603,7 +3603,7 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) return ERR_PTR(-EINVAL);
- buf = u64_to_user_ptr(req->rw.addr);
- buf = (void __user *)req->rw.addr; sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { @@ -3684,7 +3684,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (!iov_iter_is_bvec(iter)) { iovec = iov_iter_iovec(iter); } else {
iovec.iov_base = u64_to_user_ptr(req->rw.addr);
}iovec.iov_base = (void __user *)req->rw.addr; iovec.iov_len = req->rw.len;
@@ -4185,8 +4185,8 @@ static int io_renameat_prep(struct io_kiocb *req, return -EBADF; ren->old_dfd = READ_ONCE(sqe->fd);
- oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
- newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- oldf = (const char __user *)READ_ONCE(sqe->addr);
- newf = (const char __user *)READ_ONCE(sqe->addr2); ren->new_dfd = READ_ONCE(sqe->len); ren->flags = READ_ONCE(sqe->rename_flags);
@@ -4242,7 +4242,7 @@ static int io_unlinkat_prep(struct io_kiocb *req, if (un->flags & ~AT_REMOVEDIR) return -EINVAL;
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- fname = (const char __user *)READ_ONCE(sqe->addr); un->filename = getname(fname); if (IS_ERR(un->filename)) return PTR_ERR(un->filename);
@@ -4288,7 +4288,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, mkd->dfd = READ_ONCE(sqe->fd); mkd->mode = READ_ONCE(sqe->len);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- fname = (const char __user *)READ_ONCE(sqe->addr); mkd->filename = getname(fname); if (IS_ERR(mkd->filename)) return PTR_ERR(mkd->filename);
@@ -4329,8 +4329,8 @@ static int io_symlinkat_prep(struct io_kiocb *req, return -EBADF; sl->new_dfd = READ_ONCE(sqe->fd);
- oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
- newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- oldpath = (const char __user *)READ_ONCE(sqe->addr);
- newpath = (const char __user *)READ_ONCE(sqe->addr2);
sl->oldpath = getname(oldpath); if (IS_ERR(sl->oldpath)) @@ -4378,8 +4378,8 @@ static int io_linkat_prep(struct io_kiocb *req, lnk->old_dfd = READ_ONCE(sqe->fd); lnk->new_dfd = READ_ONCE(sqe->len);
- oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
- newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- oldf = (const char __user *)READ_ONCE(sqe->addr);
- newf = (const char __user *)READ_ONCE(sqe->addr2); lnk->flags = READ_ONCE(sqe->hardlink_flags);
lnk->oldpath = getname(oldf); @@ -4702,7 +4702,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe req->open.how.flags |= O_LARGEFILE; req->open.dfd = READ_ONCE(sqe->fd);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- fname = (const char __user *)READ_ONCE(sqe->addr); req->open.filename = getname(fname); if (IS_ERR(req->open.filename)) { ret = PTR_ERR(req->open.filename);
@@ -4734,7 +4734,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret;
- how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- how = (struct open_how __user *)READ_ONCE(sqe->addr2); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) return -EINVAL;
@@ -4911,7 +4911,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, return -EOVERFLOW; size = (unsigned long)p->len * p->nbufs;
- if (!access_ok(u64_to_user_ptr(p->addr), size))
- if (!access_ok(p->addr, size)) return -EFAULT;
p->bgid = READ_ONCE(sqe->buf_group); @@ -5039,8 +5039,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, if (ep_op_has_event(req->epoll.op)) { struct epoll_event __user *ev;
ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
ev = (struct epoll_event __user *)READ_ONCE(sqe->addr);
}if (copy_from_user_with_ptr(&req->epoll.event, ev, sizeof(*ev))) return -EFAULT;
@@ -5155,8 +5155,8 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->statx.dfd = READ_ONCE(sqe->fd); req->statx.mask = READ_ONCE(sqe->len);
- path = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- path = (const char __user *)READ_ONCE(sqe->addr);
- req->statx.buffer = (struct statx __user *)READ_ONCE(sqe->addr2); req->statx.flags = READ_ONCE(sqe->statx_flags);
req->statx.filename = getname_flags(path, @@ -5343,7 +5343,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL;
- sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
- sr->umsg = (struct user_msghdr __user *)READ_ONCE(sqe->addr); sr->len = READ_ONCE(sqe->len); sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT)
@@ -5458,7 +5458,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { if (iov_len > 1) return -EINVAL;
if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
sr->len = iomsg->fast_iov[0].iov_len; iomsg->free_iov = NULL;if (copy_from_user_with_ptr(iomsg->fast_iov, uiov, sizeof(*uiov))) return -EFAULT;
@@ -5556,7 +5556,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL;
- sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
- sr->umsg = (struct user_msghdr __user *)READ_ONCE(sqe->addr); sr->len = READ_ONCE(sqe->len); sr->bgid = READ_ONCE(sqe->buf_group); sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
@@ -5605,7 +5605,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf);
kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
kmsg->fast_iov[0].iov_len = req->sr_msg.len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, req->sr_msg.len);kmsg->fast_iov[0].iov_base = (void __user *)kbuf->addr;
@@ -5666,7 +5666,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf);
buf = u64_to_user_ptr(kbuf->addr);
}buf = (void __user *)kbuf->addr;
ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); @@ -5722,8 +5722,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL;
- accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
- accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- accept->addr = (struct sockaddr __user *)READ_ONCE(sqe->addr);
- accept->addr_len = (int __user *)READ_ONCE(sqe->addr2); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE);
@@ -5791,8 +5791,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sqe->splice_fd_in) return -EINVAL;
- conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
- conn->addr_len = READ_ONCE(sqe->addr2);
- conn->addr = (struct sockaddr __user *)READ_ONCE(sqe->addr);
- conn->addr_len = READ_ONCE(sqe->addr2); return 0; }
@@ -6564,7 +6564,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
__u64 user_data)
__must_hold(&ctx->timeout_lock) { struct io_timeout_data *io;__kernel_uintptr_t user_data)
@@ -6586,7 +6586,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, return req; } -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +static int io_timeout_cancel(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data) __must_hold(&ctx->completion_lock) __must_hold(&ctx->timeout_lock) { @@ -6614,7 +6614,7 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data) } } -static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { @@ -6639,7 +6639,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, return 0; } -static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, +static int io_timeout_update(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { @@ -6680,7 +6680,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req, tr->ltimeout = true; if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL;
if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) return -EINVAL;if (get_timespec64(&tr->ts, (const struct __kernel_timespec __user *)sqe->addr2)) return -EFAULT;
@@ -6766,7 +6766,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->req = req; data->flags = flags;
- if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
- if (get_timespec64(&data->ts, (const struct __kernel_timespec __user *)sqe->addr)) return -EFAULT;
if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) @@ -6841,7 +6841,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct io_cancel_data { struct io_ring_ctx *ctx;
- u64 user_data;
- __kernel_uintptr_t user_data; };
static bool io_cancel_cb(struct io_wq_work *work, void *data) @@ -6852,7 +6852,7 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) return req->ctx == cd->ctx && req->user_data == cd->user_data; } -static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, +static int io_async_cancel_one(struct io_uring_task *tctx, __kernel_uintptr_t user_data, struct io_ring_ctx *ctx) { struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; @@ -7984,13 +7984,13 @@ static void convert_compat_io_uring_sqe(struct io_uring_sqe *sqe, sqe->ioprio = READ_ONCE(compat_sqe->ioprio); sqe->fd = READ_ONCE(compat_sqe->fd); BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr2, addr);
- sqe->addr2 = READ_ONCE(compat_sqe->addr2);
- sqe->addr2 = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_sqe->addr2)); BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr, len);
- sqe->addr = READ_ONCE(compat_sqe->addr);
- sqe->addr = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_sqe->addr)); sqe->len = READ_ONCE(compat_sqe->len); BUILD_BUG_COMPAT_SQE_UNION_ELEM(rw_flags, user_data); sqe->rw_flags = READ_ONCE(compat_sqe->rw_flags);
- sqe->user_data = READ_ONCE(compat_sqe->user_data);
- sqe->user_data = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_sqe->user_data)); BUILD_BUG_COMPAT_SQE_UNION_ELEM(buf_index, personality); sqe->buf_index = READ_ONCE(compat_sqe->buf_index); sqe->personality = READ_ONCE(compat_sqe->personality);
@@ -9237,8 +9237,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) {
- u64 __user *tags = u64_to_user_ptr(up->tags);
- __s32 __user *fds = u64_to_user_ptr(up->data);
- u64 __user *tags = (u64 __user *)up->tags;
- __s32 __user *fds = (__s32 __user *)up->data; struct io_rsrc_data *data = ctx->file_data; struct io_fixed_file *file_slot; struct file *file;
@@ -9674,7 +9674,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
I've noticed there is an invalid use of u64_to_user_ptr() in this function, could you take care of replacing it with compat_ptr() while you're at it? You could potentially do this in patch 3, not sure it really requires a separate patch.
Well done, with this change that'll be all the u64_to_user_ptr() gone from this file, a good chunk of all uses in the kernel in fact, down 40!
} #endif src = (struct iovec __user *) arg;
- if (copy_from_user(dst, &src[index], sizeof(*dst)))
- if (copy_from_user_with_ptr(dst, &src[index], sizeof(*dst))) return -EFAULT; return 0; }
@@ -9935,9 +9935,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) {
- u64 __user *tags = u64_to_user_ptr(up->tags);
- u64 __user *tags = (u64 __user *)up->tags; struct iovec iov;
- struct iovec __user *iovs = u64_to_user_ptr(up->data);
- struct iovec __user *iovs = (struct iovec __user *)up->data; struct page *last_hpage = NULL; bool needs_switch = false; __u32 done;
@@ -10717,7 +10717,20 @@ static int get_compat_io_uring_rsrc_update(struct io_uring_rsrc_update *up, return -EFAULT; up->offset = compat_up.offset; up->resv = compat_up.resv;
- up->data = compat_up.data;
- up->data = (__kernel_uintptr_t)compat_ptr(compat_up.data);
- return 0;
+}
+static int set_compat_io_uring_rsrc_update(void __user *user_up,
const struct io_uring_rsrc_update *up)
+{
- struct compat_io_uring_rsrc_update compat_up;
- compat_up.offset = up->offset;
- compat_up.resv = up->resv;
- compat_up.data = user_ptr_addr((void __user *)up->data);
- if (unlikely(copy_to_user(user_up, &compat_up, sizeof(compat_up))))
return 0; } #endifreturn -EFAULT;
@@ -10759,7 +10772,7 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
Upon closer inspection, this function really does not use struct io_uring_rsrc_update in the way one might expect. I'm not even sure why it uses this struct, as it expects a fd to be stored in ->data (not a pointer!), an index in ->offset, and nothing in ->resv... In any case, this means we don't need any pointer-preserving operation on the native side here, and on the compat side we shouldn't create a pointer either. When setting ->data in the compat struct, a direct assignment from up->data would also be just fine (only the bottom 32 bits are expected to be set, so letting the compiler change the integer representation itself is appropriate).
The same applies to io_ringfd_unregister(), where in fact ->data is always supposed to be zero!
} } else
#endif
if (copy_from_user(®, &arg[i], sizeof(reg))) {
if (copy_from_user_with_ptr(®, &arg[i], sizeof(reg))) { ret = -EFAULT; break; }
@@ -10786,7 +10799,14 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, break; reg.offset = ret;
if (copy_to_user(&arg[i], ®, sizeof(reg))) {
+#ifdef CONFIG_COMPAT
if (ctx->compat)
ret = set_compat_io_uring_rsrc_update(&arg[i], ®);
else
+#endif
ret = copy_to_user_with_ptr(&arg[i], ®, sizeof(reg));
Not sure why this diff is here and not in patch 3?
if (ret) { fput(tctx->registered_rings[reg.offset]); tctx->registered_rings[reg.offset] = NULL; ret = -EFAULT;
@@ -10819,7 +10839,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, } } else #endif
if (copy_from_user(®, &arg[i], sizeof(reg))) {
if (copy_from_user_with_ptr(®, &arg[i], sizeof(reg))) { ret = -EFAULT; break; }
@@ -10879,6 +10899,10 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) return PTR_ERR(ptr); pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
- vma->vm_flags |= VM_READ_CAPS | VM_WRITE_CAPS;
- vma_set_page_prot(vma);
These VM flags are arch-specific, so an an arch hook needs to be introduced here (in a separate patch) to set them.
- return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); }
@@ -10935,10 +10959,10 @@ static int get_compat_io_uring_getevents_arg(struct io_uring_getevents_arg *arg, if (unlikely(copy_from_user(&compat_arg, user_arg, sizeof(compat_arg)))) return -EFAULT;
- arg->sigmask = compat_arg.sigmask;
- arg->sigmask = (__kernel_uintptr_t)compat_ptr(compat_arg.sigmask); arg->sigmask_sz = compat_arg.sigmask_sz; arg->pad = compat_arg.pad;
- arg->ts = compat_arg.ts;
- arg->ts = (__kernel_uintptr_t)compat_ptr(compat_arg.ts); return 0; } #endif
@@ -10980,15 +11004,15 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags, const voi { if (*argsz != sizeof(arg)) return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
}if (copy_from_user_with_ptr(&arg, argp, sizeof(arg))) return -EFAULT;
if (arg.pad) return -EINVAL;
- *sig = u64_to_user_ptr(arg.sigmask);
- *sig = (const sigset_t __user *)arg.sigmask; *argsz = arg.sigmask_sz;
- *ts = u64_to_user_ptr(arg.ts);
- *ts = (struct __kernel_timespec __user *)arg.ts; return 0; }
@@ -11755,7 +11779,7 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, return -EFAULT; } else #endif
if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
if (copy_from_user_with_ptr(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT;
if (up.resv || up.resv2) @@ -11773,8 +11797,8 @@ static int get_compat_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2, return -EFAULT; up2->offset = compat_up2.offset; up2->resv = compat_up2.resv;
- up2->data = compat_up2.data;
- up2->tags = compat_up2.tags;
- up2->data = (__kernel_uintptr_t)compat_ptr(compat_up2.data);
- up2->tags = (__kernel_uintptr_t)compat_ptr(compat_up2.tags); up2->nr = compat_up2.nr; up2->resv2 = compat_up2.resv2; return 0;
@@ -11797,7 +11821,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, { if (size != sizeof(up)) return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
}if (copy_from_user_with_ptr(&up, arg, sizeof(up))) return -EFAULT;
@@ -11818,8 +11842,8 @@ static int get_compat_io_uring_rsrc_register(struct io_uring_rsrc_register *rr, rr->nr = compat_rr.nr; rr->resv = compat_rr.resv; rr->resv2 = compat_rr.resv2;
- rr->data = compat_rr.data;
- rr->tags = compat_rr.tags;
- rr->data = (__kernel_uintptr_t)compat_ptr(compat_rr.data);
- rr->tags = (__kernel_uintptr_t)compat_ptr(compat_rr.tags); return 0; } #endif
@@ -11841,7 +11865,7 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, /* keep it extendible */ if (size != sizeof(rr)) return -EINVAL;
if (copy_from_user(&rr, arg, size))
}if (copy_from_user_with_ptr(&rr, arg, size)) return -EFAULT;
@@ -11850,11 +11874,11 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, switch (type) { case IORING_RSRC_FILE:
return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
return io_sqe_files_register(ctx, (void __user *)rr.data,
case IORING_RSRC_BUFFER:rr.nr, (u64 __user *)rr.tags);
return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
return io_sqe_buffers_register(ctx, (void __user *)rr.data,
} return -EINVAL; }rr.nr, (u64 __user *)rr.tags);
@@ -12158,46 +12182,80 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ } while (0)
- BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
+#define BUILD_BUG_COMPAT_SQE_ELEM(eoffset, etype, ename) \
- __BUILD_BUG_VERIFY_ELEMENT(struct compat_io_uring_sqe, eoffset, etype, ename)
- BUILD_BUG_ON(sizeof(struct compat_io_uring_sqe) != 64);
- BUILD_BUG_COMPAT_SQE_ELEM(0, __u8, opcode);
- BUILD_BUG_COMPAT_SQE_ELEM(1, __u8, flags);
- BUILD_BUG_COMPAT_SQE_ELEM(2, __u16, ioprio);
- BUILD_BUG_COMPAT_SQE_ELEM(4, __s32, fd);
- BUILD_BUG_COMPAT_SQE_ELEM(8, __u64, off);
- BUILD_BUG_COMPAT_SQE_ELEM(8, __u64, addr2);
- BUILD_BUG_COMPAT_SQE_ELEM(16, __u64, addr);
- BUILD_BUG_COMPAT_SQE_ELEM(16, __u64, splice_off_in);
- BUILD_BUG_COMPAT_SQE_ELEM(24, __u32, len);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
Don't think you need to preserve these extra spaces :)
- BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ int, rw_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ __u32, rw_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, fsync_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ __u16, poll_events);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, poll32_events);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, sync_range_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, msg_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, timeout_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, accept_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, cancel_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, open_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, statx_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, fadvise_advice);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, splice_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(32, __u64, user_data);
- BUILD_BUG_COMPAT_SQE_ELEM(40, __u16, buf_index);
- BUILD_BUG_COMPAT_SQE_ELEM(40, __u16, buf_group);
- BUILD_BUG_COMPAT_SQE_ELEM(42, __u16, personality);
- BUILD_BUG_COMPAT_SQE_ELEM(44, __s32, splice_fd_in);
- BUILD_BUG_COMPAT_SQE_ELEM(44, __u32, file_index);
BUILD_BUG_ON(sizeof(struct compat_io_uring_files_update) != sizeof(struct compat_io_uring_rsrc_update)); BUILD_BUG_ON(sizeof(struct compat_io_uring_rsrc_update) > sizeof(struct compat_io_uring_rsrc_update2)); +#ifdef __CHERI_PURE_CAPABILITY__
You might have been spending too much time in userspace lately, you must mean CONFIG_CHERI_PURECAP_UABI :) Which explains why you didn't notice the inverted condition in the #else part, as indeed the structs are different in PCuABI!
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
- BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
- BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 112);
Indeed not an optimal size, but it's hard to make the right decision at this point. It does make me think though that the __pad2 at the end of the struct is dubious as it stands: adding 16 bytes only makes things worse in purecap. OTOH making it 32 bytes would make the whole struct 128 bytes, potentially better. That said without padding we get a size of 96 bytes, that is 1.5 cache lines, which is really not too bad and I doubt that 32 bytes of padding would be worth it.
BUILD_BUG_SQE_ELEM(0, __u8, opcode); BUILD_BUG_SQE_ELEM(1, __u8, flags); BUILD_BUG_SQE_ELEM(2, __u16, ioprio); BUILD_BUG_SQE_ELEM(4, __s32, fd); BUILD_BUG_SQE_ELEM(8, __u64, off);
- BUILD_BUG_SQE_ELEM(8, __u64, addr2);
- BUILD_BUG_SQE_ELEM(16, __u64, addr);
- BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
- BUILD_BUG_SQE_ELEM(24, __u32, len);
- BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
- BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
- BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
- BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
- BUILD_BUG_SQE_ELEM(32, __u64, user_data);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
- BUILD_BUG_SQE_ELEM(42, __u16, personality);
- BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
- BUILD_BUG_SQE_ELEM(44, __u32, file_index);
- BUILD_BUG_SQE_ELEM(16, __uintcap_t, addr2);
- BUILD_BUG_SQE_ELEM(32, __uintcap_t, addr);
- BUILD_BUG_SQE_ELEM(32, __u64, splice_off_in);
- BUILD_BUG_SQE_ELEM(48, __u32, len);
- BUILD_BUG_SQE_ELEM(52, __kernel_rwf_t, rw_flags);
- BUILD_BUG_SQE_ELEM(52, __u32, fsync_flags);
- BUILD_BUG_SQE_ELEM(52, __u16, poll_events);
- BUILD_BUG_SQE_ELEM(52, __u32, poll32_events);
- BUILD_BUG_SQE_ELEM(52, __u32, sync_range_flags);
- BUILD_BUG_SQE_ELEM(52, __u32, msg_flags);
- BUILD_BUG_SQE_ELEM(52, __u32, timeout_flags);
- BUILD_BUG_SQE_ELEM(52, __u32, accept_flags);
- BUILD_BUG_SQE_ELEM(52, __u32, cancel_flags);
- BUILD_BUG_SQE_ELEM(52, __u32, open_flags);
- BUILD_BUG_SQE_ELEM(52, __u32, statx_flags);
- BUILD_BUG_SQE_ELEM(52, __u32, fadvise_advice);
- BUILD_BUG_SQE_ELEM(52, __u32, splice_flags);
- BUILD_BUG_SQE_ELEM(64, __uintcap_t, user_data);
- BUILD_BUG_SQE_ELEM(80, __u16, buf_index);
- BUILD_BUG_SQE_ELEM(80, __u16, buf_group);
- BUILD_BUG_SQE_ELEM(82, __u16, personality);
- BUILD_BUG_SQE_ELEM(84, __s32, splice_fd_in);
- BUILD_BUG_SQE_ELEM(88, __u32, file_index);
+#else
- BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
+#endif BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1845cf7c80ba..22fb128badc9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -20,12 +20,12 @@ struct io_uring_sqe { __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ union {
__u64 off; /* offset into file */
__u64 addr2;
__u64 off; /* offset into file */
}; union {__kernel_uintptr_t addr2;
__u64 addr; /* pointer to buffer or iovecs */
__u64 splice_off_in;
__kernel_uintptr_t addr; /* pointer to buffer or iovecs */
}; __u32 len; /* buffer size or number of iovecs */ union {__u64 splice_off_in;
@@ -46,7 +46,7 @@ struct io_uring_sqe { __u32 unlink_flags; __u32 hardlink_flags; };
- __u64 user_data; /* data to be passed back at completion time */
- __kernel_uintptr_t user_data; /* data to be passed back at completion time */
Just reflecting some offline discussions for the benefit of everyone, it is of course impossible to tell from kernel code but user_data does need to be expanded to a __kernel_uintptr_t as it is standard practice to store a pointer in user_data. Probably a good idea to add something on that in the commit message.
/* pack this to avoid bogus arm OABI complaints */ union { /* index into fixed buffers, if used */ @@ -191,9 +191,9 @@ enum {
- IO completion data structure (Completion Queue Entry)
*/ struct io_uring_cqe {
- __u64 user_data; /* sqe->data submission passed back */
- __s32 res; /* result code for this event */
- __u32 flags;
- __kernel_uintptr_t user_data; /* sqe->data submission passed back */
- __s32 res; /* result code for this event */
- __u32 flags; };
/* @@ -347,28 +347,28 @@ enum { struct io_uring_files_update { __u32 offset; __u32 resv;
- __aligned_u64 /* __s32 * */ fds;
- __kernel_aligned_uintptr_t /* __s32 * */ fds; };
struct io_uring_rsrc_register { __u32 nr; __u32 resv; __u64 resv2;
- __aligned_u64 data;
- __aligned_u64 tags;
- __kernel_aligned_uintptr_t data;
- __kernel_aligned_uintptr_t tags; };
struct io_uring_rsrc_update { __u32 offset; __u32 resv;
- __aligned_u64 data;
- __kernel_aligned_uintptr_t data; };
struct io_uring_rsrc_update2 { __u32 offset; __u32 resv;
- __aligned_u64 data;
- __aligned_u64 tags;
- __kernel_aligned_uintptr_t data;
- __kernel_aligned_uintptr_t tags; __u32 nr; __u32 resv2; };
@@ -424,10 +424,10 @@ enum { }; struct io_uring_getevents_arg {
- __u64 sigmask;
- __kernel_uintptr_t sigmask; __u32 sigmask_sz; __u32 pad;
- __u64 ts;
- __kernel_uintptr_t ts; };
#endif
A final note reflecting on offline discussions, I don't think it is worth trying to split this patch into native and compat parts. Changing the fields to hold user pointers immediately requires the use of compat_ptr() when converting from compat, so both sides are tightly coupled.
Kevin