On 28-03-2023 09:48, Kevin Brodsky wrote:
On 16/03/2023 14:40, Tudor Cretu wrote:
Introduce compat versions of the structs exposed in the uAPI headers that might contain pointers as a member. Also, implement functions that convert the compat versions to the native versions of the struct.
A subsequent patch is going to change the io_uring structs to enable them to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
include/linux/io_uring_types.h | 135 ++++++++++++++++++- io_uring/cancel.c | 26 +++- io_uring/epoll.c | 2 +- io_uring/fdinfo.c | 80 ++++++----- io_uring/io_uring.c | 234 +++++++++++++++++++++++---------- io_uring/io_uring.h | 111 +++++++++++++--- io_uring/kbuf.c | 96 ++++++++++++-- io_uring/kbuf.h | 6 +- io_uring/net.c | 5 +- io_uring/rsrc.c | 108 +++++++++++++-- io_uring/tctx.c | 56 +++++++- io_uring/uring_cmd.h | 5 + 12 files changed, 704 insertions(+), 160 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 440179029a8f0..f0eb34ad8b709 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -7,6 +7,127 @@ #include <linux/llist.h> #include <uapi/linux/io_uring.h> +struct compat_io_uring_sqe {
- __u8 opcode;
- __u8 flags;
- __u16 ioprio;
- __s32 fd;
- union {
__u64 off;
__u64 addr2;
struct {
__u32 cmd_op;
__u32 __pad1;
};
- };
- union {
__u64 addr;
__u64 splice_off_in;
- };
- __u32 len;
- /* This member is actually a union in the native struct */
- __kernel_rwf_t rw_flags;
- __u64 user_data;
- union {
__u16 buf_index;
__u16 buf_group;
- } __packed;
- __u16 personality;
- union {
__s32 splice_fd_in;
__u32 file_index;
struct {
__u16 addr_len;
__u16 __pad3[1];
};
- };
- union {
struct {
__u64 addr3;
__u64 __pad2[1];
};
__u8 cmd[0];
- };
+};
+struct compat_io_uring_cqe {
- __u64 user_data;
- __s32 res;
- __u32 flags;
- __u64 big_cqe[];
+};
+struct compat_io_uring_files_update {
- __u32 offset;
- __u32 resv;
- __aligned_u64 fds;
+};
+struct compat_io_uring_rsrc_register {
- __u32 nr;
- __u32 flags;
- __u64 resv2;
- __aligned_u64 data;
- __aligned_u64 tags;
+};
+struct compat_io_uring_rsrc_update {
- __u32 offset;
- __u32 resv;
- __aligned_u64 data;
+};
+struct compat_io_uring_rsrc_update2 {
- __u32 offset;
- __u32 resv;
- __aligned_u64 data;
- __aligned_u64 tags;
- __u32 nr;
- __u32 resv2;
+};
+struct compat_io_uring_buf {
- __u64 addr;
- __u32 len;
- __u16 bid;
- __u16 resv;
+};
+struct compat_io_uring_buf_ring {
- union {
struct {
__u64 resv1;
__u32 resv2;
__u16 resv3;
__u16 tail;
};
struct compat_io_uring_buf bufs[0];
- };
+};
+struct compat_io_uring_buf_reg {
- __u64 ring_addr;
- __u32 ring_entries;
- __u16 bgid;
- __u16 pad;
- __u64 resv[3];
+};
+struct compat_io_uring_getevents_arg {
- __u64 sigmask;
- __u32 sigmask_sz;
- __u32 pad;
- __u64 ts;
+};
+struct compat_io_uring_sync_cancel_reg {
- __u64 addr;
- __s32 fd;
- __u32 flags;
- struct __kernel_timespec timeout;
- __u64 pad[4];
+};
Just thinking about this now, might make more sense to have these structs at the end of the file, rather than before the (common) native ones? Or maybe it would be even better to move them to a new file, say <linux/io_uring_compat.h>, that can be included from this file. That would make conflicts less likely and avoid adding a lot of structs to io_uring_types.h
Indeed much better, done!
[...]
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index bc8c9d764bc13..c724e6c544809 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -88,45 +88,64 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, sq_entries = min(sq_tail - sq_head, ctx->sq_entries); for (i = 0; i < sq_entries; i++) { unsigned int entry = i + sq_head;
struct io_uring_sqe *sqe;
unsigned int sq_idx;
unsigned int sq_idx, sq_off;
sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); if (sq_idx > sq_mask) continue;
sqe = &ctx->sq_sqes[sq_idx << sq_shift];
seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
"addr:0x%llx, rw_flags:0x%x, buf_index:%d "
"user_data:%llu",
sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
sqe->flags, (unsigned long long) sqe->off,
(unsigned long long) sqe->addr, sqe->rw_flags,
sqe->buf_index, sqe->user_data);
if (sq_shift) {
u64 *sqeb = (void *) (sqe + 1);
int size = sizeof(struct io_uring_sqe) / sizeof(u64);
int j;
for (j = 0; j < size; j++) {
seq_printf(m, ", e%d:0x%llx", j,
(unsigned long long) *sqeb);
sqeb++;
}
}
sq_off = sq_idx << sq_shift;
+#define print_sqe(sqe) \
It's unusual to define macros in the middle of a function, I can see why it could make sense here but I think it's still more readable to define them normally, before the function. The number of arguments remains reasonable if you do it this way (AFAICT just need to add m, sq_idx, sq_shift).
Also minor thing, some of the \ are not aligned.
Sure!
do { \
seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " \
"addr:0x%llx, rw_flags:0x%x, buf_index:%d " \
"user_data:%llu", \
sq_idx, io_uring_get_opcode((sqe)->opcode), (sqe)->fd, \
(sqe)->flags, (unsigned long long) (sqe)->off, \
(unsigned long long) (sqe)->addr, (sqe)->rw_flags, \
(sqe)->buf_index, (sqe)->user_data); \
if (sq_shift) { \
u64 *sqeb = (void *) ((sqe) + 1); \
int size = sizeof(*(sqe)) / sizeof(u64); \
int j; \
\
for (j = 0; j < size; j++) { \
seq_printf(m, ", e%d:0x%llx", j, \
(unsigned long long) *sqeb); \
sqeb++; \
} \
} \
} while (0)
if (is_compat64_io_ring_ctx(ctx))
print_sqe(&ctx->sq_sqes_compat[sq_off]);
else
print_sqe(&ctx->sq_sqes[sq_off]);
+#undef print_sqe
- seq_printf(m, "\n"); } seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head;
struct io_uring_cqe *cqe = &ctx->cqes[(entry & cq_mask) << cq_shift];
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags);
if (cq_shift)
seq_printf(m, ", extra1:%llu, extra2:%llu\n",
cqe->big_cqe[0], cqe->big_cqe[1]);
unsigned int cq_off = (entry & cq_mask) << cq_shift;
+#define print_cqe(cqe) \
do { \
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", \
entry & cq_mask, (cqe)->user_data, (cqe)->res, \
(cqe)->flags); \
if (cq_shift) \
seq_printf(m, ", extra1:%llu, extra2:%llu\n", \
(cqe)->big_cqe[0], (cqe)->big_cqe[1]); \
} while (0)
if (is_compat64_io_ring_ctx(ctx))
print_cqe((struct compat_io_uring_cqe *)&ctx->cqes_compat[cq_off]);
I don't think you need the casts.
Yes, I've removed them from print_sqe() and I forgot them here...
else
print_cqe((struct io_uring_cqe *)&ctx->cqes[cq_off]);
+#undef print_cqe
- seq_printf(m, "\n"); }
@@ -191,8 +210,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct io_uring_cqe *cqe = &ocqe->cqe; seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
cqe->user_data, cqe->res, cqe->flags);
(cqe)->user_data, (cqe)->res, (cqe)->flags);
Looks like a leftover, I guess you don't need to change this any more as struct io_overflow_cqe is now always native?
That's right, thank you!
} spin_unlock(&ctx->completion_lock); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 707229ae04dc8..3f0e005481f3f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -152,6 +152,35 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; +static int get_compat64_io_uring_getevents_arg(struct io_uring_getevents_arg *arg,
const void __user *user_arg)
+{
- struct compat_io_uring_getevents_arg compat_arg;
- if (copy_from_user(&compat_arg, user_arg, sizeof(compat_arg)))
return -EFAULT;
- arg->sigmask = compat_arg.sigmask;
- arg->sigmask_sz = compat_arg.sigmask_sz;
- arg->pad = compat_arg.pad;
- arg->ts = compat_arg.ts;
- return 0;
+}
+static int copy_io_uring_getevents_arg_from_user(struct io_ring_ctx *ctx,
struct io_uring_getevents_arg *arg,
const void __user *argp,
size_t size)
+{
- if (is_compat64_io_ring_ctx(ctx)) {
if (size != sizeof(struct compat_io_uring_getevents_arg))
return -EINVAL;
return get_compat64_io_uring_getevents_arg(arg, argp);
- }
- if (size != sizeof(*arg))
return -EINVAL;
- return copy_from_user(arg, argp, sizeof(*arg));
+}
- struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX)
@@ -604,14 +633,10 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx) static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool all_flushed;
- size_t cqe_size = sizeof(struct io_uring_cqe);
if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false;
- if (ctx->flags & IORING_SETUP_CQE32)
cqe_size <<= 1;
- io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
@@ -621,9 +646,18 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list);
if (cqe)
memcpy(cqe, &ocqe->cqe, cqe_size);
else
if (cqe) {
u64 extra1 = 0;
u64 extra2 = 0;
if (ctx->flags & IORING_SETUP_CQE32) {
extra1 = ocqe->cqe.big_cqe[0];
extra2 = ocqe->cqe.big_cqe[1];
}
__io_fill_cqe(ctx, cqe, ocqe->cqe.user_data, ocqe->cqe.res,
ocqe->cqe.flags, extra1, extra2);
} else io_account_cq_overflow(ctx);
list_del(&ocqe->list); @@ -745,6 +779,10 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) { unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); unsigned int free, queued, len;
- size_t cqe_size = ctx->compat ?
It would be better to use is_compat64_io_ring_ctx() everywhere, now that we have a helper. This should leave the code entirely unmodified unless CONFIG_COMPAT64 is selected.
Indeed, I missed quite a few of them...
sizeof(struct compat_io_uring_cqe) :
sizeof(struct io_uring_cqe);
- struct io_uring_cqe *cqe;
/* * Posting into the CQ when there are pending overflowed CQEs may break @@ -767,14 +805,15 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) len <<= 1; }
- ctx->cqe_cached = &ctx->cqes[off];
- ctx->cqe_sentinel = ctx->cqe_cached + len;
- cqe = ctx->compat ? (struct io_uring_cqe *)&ctx->cqes_compat[off] : &ctx->cqes[off];
- ctx->cqe_cached = cqe;
- ctx->cqe_sentinel = ctx->cqe_cached + len * cqe_size;
The cqe_cached / cqe_sentinel changes look fine, but I wonder if we wouldn't be better off changing things a little more in that case. Manipulating these as void * pointers is a little awkward and can be error-prone, for instance on this line I thought "why not use cqe instead of ctx->cqe_cached", but that is definitely not equivalent as they are pointers to different types.
An alternative would be to make cqe_cached / cqe_sentinel indices into cqes / cqes_compat. This way we don't need to compute cqe_size to update them, and the logic remains otherwise unaffected. It could be a separate patch to make it easier to understand. That's just an idea though, leaving it as-is is also fine by me.
I've thought about that as well. Now it seems it's the obvious choice when you put it like that. Added the changes in a new patch.
ctx->cached_cq_tail++;
- ctx->cqe_cached++;
- ctx->cqe_cached += cqe_size; if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
- return &ctx->cqes[off];
ctx->cqe_cached += cqe_size;
- return cqe; }
[...]
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 50bc3af449534..b44ad558137be 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,6 +5,7 @@ #include <linux/lockdep.h> #include <linux/io_uring_types.h> #include "io-wq.h" +#include "uring_cmd.h" #include "slist.h" #include "filetable.h" @@ -93,16 +94,69 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) void io_cq_unlock_post(struct io_ring_ctx *ctx); +static inline bool is_compat64_io_ring_ctx(struct io_ring_ctx *ctx)
I think it makes sense to follow the convention of having the full struct name in conversion helpers, but I'm less convinced here. It could be better to use the io_ prefix like most functions in this header, and pass struct io_ring_ctx without reflecting it in the function name. Maybe simply io_in_compat64(), reminiscent of in_compat_syscall()? That's also shorter, handy as we're using it a lot.
Indeed, done!
+{
- return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
+}
[...]
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 41e192de9e8a7..c65b99fb9264f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -23,6 +23,89 @@ struct io_rsrc_update { u32 offset; }; +static int get_compat64_io_uring_rsrc_update(struct io_uring_rsrc_update2 *up2,
const void __user *user_up)
+{
- struct compat_io_uring_rsrc_update compat_up;
- if (copy_from_user(&compat_up, user_up, sizeof(compat_up)))
return -EFAULT;
- up2->offset = compat_up.offset;
- up2->resv = compat_up.resv;
- up2->data = compat_up.data;
- return 0;
+}
+static int get_compat64_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2,
const void __user *user_up2)
+{
- struct compat_io_uring_rsrc_update2 compat_up2;
- if (copy_from_user(&compat_up2, user_up2, sizeof(compat_up2)))
return -EFAULT;
- up2->offset = compat_up2.offset;
- up2->resv = compat_up2.resv;
- up2->data = compat_up2.data;
- up2->tags = compat_up2.tags;
- up2->nr = compat_up2.nr;
- up2->resv2 = compat_up2.resv2;
- return 0;
+}
+static int get_compat64_io_uring_rsrc_register(struct io_uring_rsrc_register *rr,
const void __user *user_rr)
+{
- struct compat_io_uring_rsrc_register compat_rr;
- if (copy_from_user(&compat_rr, user_rr, sizeof(compat_rr)))
return -EFAULT;
- rr->nr = compat_rr.nr;
- rr->flags = compat_rr.flags;
- rr->resv2 = compat_rr.resv2;
- rr->data = compat_rr.data;
- rr->tags = compat_rr.tags;
- return 0;
+}
+static int copy_io_uring_rsrc_update_from_user(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up2,
const void __user *arg)
+{
- if (is_compat64_io_ring_ctx(ctx))
return get_compat64_io_uring_rsrc_update(up2, arg);
- return copy_from_user(up2, arg, sizeof(struct io_uring_rsrc_update));
I've just realised that this can be problematic. Since io_register_files_update() returns whatever this function returns if not 0, it effectively returns what copy_from_user() returns (and eventually that value will be returned by the io_uring_register handler). However, copy_from_user() doesn't always return 0 or an error. It can also return a positive value, the number of bytes left to copy, in case a fault happened before reaching the requested size. We would therefore end up returning that positive value to userspace, instead of the intended -EFAULT.
Ah, good catch!
In most other contexts where a copy_io_uring_* helper is called, this doesn't matter as -EFAULT will be forced as return value at some point, but we should handle this consistently to avoid accidentally returning the value from copy_from_user() unmodified. Either all the copy_io_uring_* helpers should return -EFAULT explicitly, or all their (direct) callers should return -EFAULT explicitly. The former might be a little safer.
Alright, great suggestion! Thanks!
Kevin
Many thanks for the detailed review!
Tudor