On 28-10-2022 12:57, Kevin Brodsky wrote:
On 17/10/2022 19:09, Tudor Cretu wrote:
Introduce compat versions of the structs exposed in the uAPI headers that might contain pointers as a member. Also, implement functions that convert the compat versions to the native versions of the struct.
Pull cqes member out from rings struct so that we are able to have a union between cqes and cqes_compat. This is done in a similar way to commit 75b28affdd6a ("io_uring: allocate the two rings together"), where sq_array was pulled out from the rings struct.
As discussed offline, better to split this out as this is logically separate (though related) and it would make it easier to see the implications of each change.
Ack
A subsequent patch is going to change the io_uring structs to enable them to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/io_uring.c | 502 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 434 insertions(+), 68 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index d901be8da630..2db37c68e715 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -115,6 +115,93 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) +struct compat_io_uring_sqe { + __u8 opcode; + __u8 flags; + __u16 ioprio; + __s32 fd; + union { + __u64 off; + __u64 addr2; + }; + union { + __u64 addr; + __u64 splice_off_in; + }; + __u32 len; + union { + __kernel_rwf_t rw_flags; + __u32 fsync_flags; + __u16 poll_events; + __u32 poll32_events; + __u32 sync_range_flags; + __u32 msg_flags; + __u32 timeout_flags; + __u32 accept_flags; + __u32 cancel_flags; + __u32 open_flags; + __u32 statx_flags; + __u32 fadvise_advice; + __u32 splice_flags; + __u32 rename_flags; + __u32 unlink_flags; + __u32 hardlink_flags; + }; + __u64 user_data; + union { + __u16 buf_index; + __u16 buf_group; + } __packed; + __u16 personality; + union { + __s32 splice_fd_in; + __u32 file_index; + }; + __u64 __pad2[2]; +};
+struct compat_io_uring_cqe { + __u64 user_data; + __s32 res; + __u32 flags; +};
+struct compat_io_uring_files_update { + __u32 offset; + __u32 resv; + __aligned_u64 fds; +};
+struct compat_io_uring_rsrc_register { + __u32 nr; + __u32 resv; + __u64 resv2; + __aligned_u64 data; + __aligned_u64 tags; +};
+struct compat_io_uring_rsrc_update { + __u32 offset; + __u32 resv; + __aligned_u64 data; +};
+struct compat_io_uring_rsrc_update2 { + __u32 offset; + __u32 resv; + __aligned_u64 data; + __aligned_u64 tags; + __u32 nr; + __u32 resv2; +};
+struct compat_io_uring_getevents_arg { + __u64 sigmask; + __u32 sigmask_sz; + __u32 pad; + __u64 ts; +};
struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; @@ -188,14 +275,6 @@ struct io_rings { * ordered with any other data. */ u32 cq_overflow; - /* - * Ring buffer of completion events. - * - * The kernel writes completion events fresh every time they are - * produced, so the application is allowed to modify pending - * entries. - */ - struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; enum io_uring_cmd_flags { @@ -216,7 +295,10 @@ struct io_mapped_ubuf { struct io_ring_ctx; struct io_overflow_cqe { - struct io_uring_cqe cqe; + union { + struct compat_io_uring_cqe compat_cqe; + struct io_uring_cqe cqe; + }; struct list_head list; }; @@ -345,6 +427,17 @@ struct io_ring_ctx { struct percpu_ref refs; struct io_rings *rings; + /* + * Ring buffer of completion events. + * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. + */ + union { + struct compat_io_uring_cqe *cqes_compat; + struct io_uring_cqe *cqes; + };
Just curious, is there a particular reason to add this in that specific position in the struct?
No, just an artefact of one of my previous iterations. I moved it before sq_array now.
unsigned int flags; unsigned int compat: 1; unsigned int drain_next: 1; @@ -371,7 +464,10 @@ struct io_ring_ctx { * array. */ u32 *sq_array; - struct io_uring_sqe *sq_sqes; + union { + struct compat_io_uring_sqe *sq_sqes_compat; + struct io_uring_sqe *sq_sqes; + }; unsigned cached_sq_head; unsigned sq_entries; struct list_head defer_list; @@ -1804,10 +1900,9 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +static inline void *io_get_cqe(struct io_ring_ctx *ctx) { - struct io_rings *rings = ctx->rings; - unsigned tail, mask = ctx->cq_entries - 1; + unsigned int cq_idx, tail, mask = ctx->cq_entries - 1; /* * writes to the cq entry need to come after reading head; the @@ -1818,8 +1913,19 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) return NULL; tail = ctx->cached_cq_tail++; - return &rings->cqes[tail & mask]; + cq_idx = tail & mask; + return ctx->compat ? (void *)&ctx->cqes_compat[cq_idx] : (void *)&ctx->cqes[cq_idx];
Better to wrap at 80 char if reasonably possible, like here. Be careful, it looks like your editor is configured to wrap at 100 char (looking at the big comment you added further below).
checkpatch.pl was telling me to wrap it at 100 char, but yes, I should've done it at 80. Ack
+}
+#ifdef CONFIG_COMPAT +static inline void convert_compat_io_uring_cqe(struct io_uring_cqe *cqe, + const struct compat_io_uring_cqe *compat_cqe) +{ + cqe->user_data = READ_ONCE(compat_cqe->user_data); + cqe->res = READ_ONCE(compat_cqe->res); + cqe->flags = READ_ONCE(compat_cqe->flags); } +#endif
I don't see any obvious rationale for the position of these new conversion helpers in the file, especially this one. I wonder if it wouldn't be better to group them all somewhere towards the top, not only to avoid difficult choices but also to reduce the risk of conflicts when rebasing.
They were each before the first function that uses them, so yes, not obvious. I moved them to the top. Thanks!
static void io_eventfd_signal(struct io_ring_ctx *ctx) { @@ -1896,7 +2002,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) posted = false; spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { - struct io_uring_cqe *cqe = io_get_cqe(ctx); + void *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; if (!cqe && !force) @@ -1904,7 +2010,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); if (cqe) - memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); + memcpy(cqe, ctx->compat ? (void *)&ocqe->compat_cqe : (void *)&ocqe->cqe, + ctx->compat ? sizeof(ocqe->compat_cqe) : sizeof(ocqe->cqe));
I think it's OK to leave the second argument unchanged: you know that the address of ->cqe and ->compat_cqe is the same (by design). You just need to choose the right size.
Ack
else io_account_cq_overflow(ctx); @@ -2010,9 +2117,18 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; +#ifdef CONFIG_COMPAT + if (ctx->compat) { + ocqe->compat_cqe.user_data = user_data; + ocqe->compat_cqe.res = res; + ocqe->compat_cqe.flags = cflags; + } else +#endif + { + ocqe->cqe.user_data = user_data; + ocqe->cqe.res = res; + ocqe->cqe.flags = cflags; + }
For all such cases where there is a copy path for compat and another for native, I think it would be nice to create helpers, along the lines of copy_siginfo_from_user_any(). One advantage is that you can use an early return in the helper, avoiding the awkward if { ... } else followed by #endif.
Ack
list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } @@ -2020,7 +2136,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { - struct io_uring_cqe *cqe; + void *cqe; /* * If we can't get a cq entry, userspace overflowed the @@ -2029,9 +2145,22 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, */ cqe = io_get_cqe(ctx); if (likely(cqe)) { - WRITE_ONCE(cqe->user_data, user_data); - WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, cflags); +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct compat_io_uring_cqe *compat_cqe = (struct compat_io_uring_cqe *)cqe;
+ WRITE_ONCE(compat_cqe->user_data, user_data); + WRITE_ONCE(compat_cqe->res, res); + WRITE_ONCE(compat_cqe->flags, cflags); + } else +#endif + { + struct io_uring_cqe *native_cqe = (struct io_uring_cqe *)cqe;
+ WRITE_ONCE(native_cqe->user_data, user_data); + WRITE_ONCE(native_cqe->res, res); + WRITE_ONCE(native_cqe->flags, cflags); + } return true; } return io_cqring_event_overflow(ctx, user_data, res, cflags); @@ -7811,7 +7940,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) +static const void *io_get_sqe(struct io_ring_ctx *ctx) { unsigned head, mask = ctx->sq_entries - 1; unsigned sq_idx = ctx->cached_sq_head++ & mask; @@ -7826,7 +7955,8 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) */ head = READ_ONCE(ctx->sq_array[sq_idx]); if (likely(head < ctx->sq_entries)) - return &ctx->sq_sqes[head]; + return ctx->compat ? (void *)&ctx->sq_sqes_compat[head] + : (void *)&ctx->sq_sqes[head]; /* drop invalid entries */ ctx->cq_extra--; @@ -7835,6 +7965,40 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) return NULL; } +#ifdef CONFIG_COMPAT +static void convert_compat_io_uring_sqe(struct io_uring_sqe *sqe, + const struct compat_io_uring_sqe *compat_sqe) +{ +/*
- The struct io_uring_sqe contains anonymous unions and there is no
field keeping track of which
- union's member is active. Because in all the cases, the unions are
between integral types and
- the types are compatible, use the largest member of each union to
perform the copy. Use this
- compile-time check to ensure that the union's members are not
truncated during the conversion.
- */
+#define BUILD_BUG_COMPAT_SQE_UNION_ELEM(elem1, elem2) \ + BUILD_BUG_ON(sizeof_field(struct compat_io_uring_sqe, elem1) != \ + (offsetof(struct compat_io_uring_sqe, elem2) - \ + offsetof(struct compat_io_uring_sqe, elem1)))
An empty line here would help. Also, standard practice when declaring a macro inside a function is to #undef it at the end of the function. I see that io_uring_init() doesn't do that, but that's somewhat acceptable as it's the last function in the file.
Ack
+ sqe->opcode = READ_ONCE(compat_sqe->opcode); + sqe->flags = READ_ONCE(compat_sqe->flags); + sqe->ioprio = READ_ONCE(compat_sqe->ioprio); + sqe->fd = READ_ONCE(compat_sqe->fd); + BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr2, addr); + sqe->addr2 = READ_ONCE(compat_sqe->addr2); + BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr, len); + sqe->addr = READ_ONCE(compat_sqe->addr); + sqe->len = READ_ONCE(compat_sqe->len); + BUILD_BUG_COMPAT_SQE_UNION_ELEM(rw_flags, user_data); + sqe->rw_flags = READ_ONCE(compat_sqe->rw_flags); + sqe->user_data = READ_ONCE(compat_sqe->user_data); + BUILD_BUG_COMPAT_SQE_UNION_ELEM(buf_index, personality); + sqe->buf_index = READ_ONCE(compat_sqe->buf_index); + sqe->personality = READ_ONCE(compat_sqe->personality); + BUILD_BUG_COMPAT_SQE_UNION_ELEM(splice_fd_in, __pad2); + sqe->splice_fd_in = READ_ONCE(compat_sqe->splice_fd_in); +} +#endif
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { @@ -7849,7 +8013,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) io_submit_state_start(&ctx->submit_state, nr); do { - const struct io_uring_sqe *sqe; + const void *sqe; struct io_kiocb *req; if (unlikely(!io_alloc_req_refill(ctx))) { @@ -7863,6 +8027,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); break; } +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct io_uring_sqe native_sqe;
+ convert_compat_io_uring_sqe(&native_sqe, sqe); + sqe = &native_sqe;
That is a textbook example of use-after-free, native_sqe goes out of scope on the next line and sqe becomes a dangling pointer :) To my disappointment Clang doesn't seem to warn even about such a straightforward case, but interestingly GCC 12 does (looks like a new warning, -Wdangling-pointer).
Same issue with the conversions in __io_uring_show_fdinfo().
Oups! Thanks for spotting it out!
+ } +#endif /* will complete beyond this point, count as submitted */ submitted++; if (io_submit_sqe(ctx, req, sqe)) { @@ -9389,13 +9561,29 @@ static void *io_mem_alloc(size_t size) return (void *) __get_free_pages(gfp, get_order(size)); } -static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, - size_t *sq_offset) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, + unsigned int cq_entries, size_t *cq_offset, size_t *sq_offset) { struct io_rings *rings; - size_t off, sq_array_size; + size_t off, cq_array_size, sq_array_size;
+ off = sizeof(*rings);
+#ifdef CONFIG_SMP + off = ALIGN(off, SMP_CACHE_BYTES); + if (off == 0) + return SIZE_MAX; +#endif
+ if (cq_offset) + *cq_offset = off;
+ cq_array_size = array_size(ctx->compat ? sizeof(struct compat_io_uring_cqe) + : sizeof(struct io_uring_cqe), cq_entries); + if (cq_array_size == SIZE_MAX) + return SIZE_MAX; - off = struct_size(rings, cqes, cq_entries); + off = size_add(off, cq_array_size); if (off == SIZE_MAX) return SIZE_MAX; @@ -9412,7 +9600,8 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, if (sq_array_size == SIZE_MAX) return SIZE_MAX; - if (check_add_overflow(off, sq_array_size, &off)) + off = size_add(off, sq_array_size); + if (off == SIZE_MAX)
Could you elaborate on why we need this change?
It's not needed, just wanted to align it with the rest of the checks in the function and I find it more readable. I can make it again check_add_overflow, as well with the check at "size_add(off, cq_array_size)" if you prefer.
return SIZE_MAX; return off; @@ -9972,7 +10161,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_mem_free(ctx->compat ? (void *)ctx->sq_sqes_compat : (void *)ctx->sq_sqes);
I find this sort of diff slightly frustrating as sq_sqes and sq_sqes_compat are both pointers, simply to a different type. If I got this correctly, you are making use of the type in two cases, while it is irrelevant in three (like here). However it is technically UB to refer to the inactive member of the union, so just getting rid of this diff wouldn't be recommended.
Fortunately there is an alternative: instead of having a union of pointers, make it a pointer to a union, e.g.:
union { struct io_uring_cqe native; struct compat_io_uring_cqe compat; } *sq_sqes;
This way sq_sqes can still be viewed as a void * without change, and the cases where the type matters have to be changed anyway.
Might as well do the same with cqes/cqes_compat to be consistent.
I think this change would be misleading. sq_sqes is not a pointer to an element which is either native or compat, but rather is an array of {compat_}io_uring_sqe structs, so it's not really the same thing. The compat member of the union would never be used. In the compat case, I would need to cast sq_sqes to an array of struct compat_io_uring_sqe and then get the element by index. Is this what intended to suggest?
percpu_ref_exit(&ctx->refs); free_uid(ctx->user); @@ -10518,6 +10707,21 @@ static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, return -EBUSY; } +#ifdef CONFIG_COMPAT +static int get_compat_io_uring_rsrc_update(struct io_uring_rsrc_update *up, + const void __user *user_up) +{ + struct compat_io_uring_rsrc_update compat_up;
+ if (unlikely(copy_from_user(&compat_up, user_up, sizeof(compat_up)))) + return -EFAULT; + up->offset = compat_up.offset; + up->resv = compat_up.resv; + up->data = compat_up.data; + return 0; +} +#endif
/* * Register a ring fd to avoid fdget/fdput for each io_uring_enter() * invocation. User passes in an array of struct io_uring_rsrc_update @@ -10547,10 +10751,18 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, for (i = 0; i < nr_args; i++) { int start, end; - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } +#ifdef CONFIG_COMPAT + if (ctx->compat) { + if (get_compat_io_uring_rsrc_update(®, &arg[i])) { + ret = -EFAULT; + break; + } + } else +#endif + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } if (reg.resv) { ret = -EINVAL; @@ -10599,10 +10811,19 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, return 0; for (i = 0; i < nr_args; i++) { - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } +#ifdef CONFIG_COMPAT + if (ctx->compat) { + if (get_compat_io_uring_rsrc_update(®, &arg[i])) { + ret = -EFAULT; + break; + } + } else +#endif + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + }
if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break; @@ -10632,7 +10853,7 @@ static void *io_uring_validate_mmap_request(struct file *file, ptr = ctx->rings; break; case IORING_OFF_SQES: - ptr = ctx->sq_sqes; + ptr = ctx->compat ? (void *)ctx->sq_sqes_compat : (void *)ctx->sq_sqes; break; default: return ERR_PTR(-EINVAL); @@ -10706,7 +10927,24 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, +#ifdef CONFIG_COMPAT +static int get_compat_io_uring_getevents_arg(struct io_uring_getevents_arg *arg, + const void __user *user_arg) +{ + struct compat_io_uring_getevents_arg compat_arg;
+ if (unlikely(copy_from_user(&compat_arg, user_arg, sizeof(compat_arg)))) + return -EFAULT; + arg->sigmask = compat_arg.sigmask; + arg->sigmask_sz = compat_arg.sigmask_sz; + arg->pad = compat_arg.pad; + arg->ts = compat_arg.ts; + return 0; +} +#endif
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags, const void __user *argp, + size_t *argsz, #ifdef CONFIG_CHERI_PURECAP_UABI struct __kernel_timespec * __capability *ts, const sigset_t * __capability *sig) @@ -10731,10 +10969,21 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */ - if (*argsz != sizeof(arg)) - return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; +#ifdef CONFIG_COMPAT + if (ctx->compat) { + if (*argsz != sizeof(struct compat_io_uring_getevents_arg)) + return -EINVAL; + if (get_compat_io_uring_getevents_arg(&arg, argp)) + return -EFAULT; + } else +#endif + { + if (*argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + }
if (arg.pad) return -EINVAL; *sig = u64_to_user_ptr(arg.sigmask); @@ -10827,7 +11076,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, const sigset_t __user *sig; struct __kernel_timespec __user *ts; - ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + ret = io_get_ext_arg(ctx, flags, argp, &argsz, &ts, &sig); if (unlikely(ret)) goto out; @@ -10926,7 +11175,16 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, if (sq_idx > sq_mask) continue; - sqe = &ctx->sq_sqes[sq_idx]; +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct io_uring_sqe native_sqe; + struct compat_io_uring_sqe *compat_sqe = &ctx->sq_sqes_compat[sq_idx];
+ convert_compat_io_uring_sqe(&native_sqe, compat_sqe); + sqe = &native_sqe; + } else +#endif + sqe = &ctx->sq_sqes[sq_idx]; seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", sq_idx, sqe->opcode, sqe->fd, sqe->flags, sqe->user_data); @@ -10935,7 +11193,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; + unsigned int cq_idx = entry & cq_mask; + struct io_uring_cqe *cqe; +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct io_uring_cqe native_cqe; + struct compat_io_uring_cqe *compat_cqe = &ctx->cqes_compat[cq_idx];
+ convert_compat_io_uring_cqe(&native_cqe, compat_cqe); + cqe = &native_cqe; + } else +#endif + cqe = &ctx->cqes[cq_idx]; seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", entry & cq_mask, cqe->user_data, cqe->res, @@ -10998,7 +11267,17 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, seq_puts(m, "CqOverflowList:\n"); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { - struct io_uring_cqe *cqe = &ocqe->cqe; + struct io_uring_cqe *cqe; +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct io_uring_cqe native_cqe; + struct compat_io_uring_cqe *compat_cqe = &ocqe->compat_cqe;
+ convert_compat_io_uring_cqe(&native_cqe, compat_cqe); + cqe = &native_cqe; + } else +#endif + cqe = &ocqe->cqe; seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", cqe->user_data, cqe->res, cqe->flags); @@ -11036,13 +11315,14 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { struct io_rings *rings; - size_t size, sq_array_offset; + size_t size, cqes_offset, sq_array_offset; + void *cqes, *sqes; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); + size = rings_size(ctx, p->sq_entries, p->cq_entries, &cqes_offset, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -11051,25 +11331,40 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -ENOMEM; ctx->rings = rings; + cqes = (char *)rings + cqes_offset; +#ifdef CONFIG_COMPAT + if (ctx->compat) + ctx->cqes_compat = (struct compat_io_uring_cqe *)cqes; + else +#endif + ctx->cqes = (struct io_uring_cqe *)cqes;
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; rings->cq_ring_mask = p->cq_entries - 1; rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + size = array_size(ctx->compat ? sizeof(struct compat_io_uring_sqe) + : sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { io_mem_free(ctx->rings); ctx->rings = NULL; return -EOVERFLOW; } - ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { + sqes = io_mem_alloc(size); + if (!sqes) { io_mem_free(ctx->rings); ctx->rings = NULL; return -ENOMEM; } +#ifdef CONFIG_COMPAT + if (ctx->compat) + ctx->sq_sqes_compat = (struct compat_io_uring_sqe *)sqes; + else +#endif + ctx->sq_sqes = (struct io_uring_sqe *)sqes; return 0; } @@ -11210,7 +11505,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); - p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.cqes = (char *)ctx->cqes - (char *)ctx->rings; p->cq_off.flags = offsetof(struct io_rings, cq_flags); p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | @@ -11453,39 +11748,103 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, if (!nr_args) return -EINVAL; memset(&up, 0, sizeof(up)); - if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) - return -EFAULT;
+#ifdef CONFIG_COMPAT + if (ctx->compat) { + if (get_compat_io_uring_rsrc_update((struct io_uring_rsrc_update *)&up, arg))
This is rather dubious from a strict aliasing perspective. Mind you, it's really the ABI that's scary here (struct io_uring_rsrc_update is supposed to represent the bottom part of struct io_uring_rsrc_update2, which is apparently used internally to avoid duplication). When doing a copy_from_user(), there's no aliasing concern as copy_from_user() doesn't care about types, but get_compat_io_uring_rsrc_update() does access up as a struct io_uring_rsrc_update and that's not really OK.
The best is probably to have an (admittedly somewhat strange) helper that takes a struct io_uring_rsrc_update2 * instead (without touching the extra fields). This is not too bad though as there should be two versions of get_compat_io_uring_rsrc_update() anyway, one for this function and one for the two others above, see my comments in patch 4 as to why that is.
Ack, good suggestion, more clear this way.
+ return -EFAULT; + } else +#endif + if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) + return -EFAULT;
if (up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } +#ifdef CONFIG_COMPAT +static int get_compat_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2, + const void __user *user_up2) +{ + struct compat_io_uring_rsrc_update2 compat_up2;
+ if (unlikely(copy_from_user(&compat_up2, user_up2, sizeof(compat_up2)))) + return -EFAULT; + up2->offset = compat_up2.offset; + up2->resv = compat_up2.resv; + up2->data = compat_up2.data; + up2->tags = compat_up2.tags; + up2->nr = compat_up2.nr; + up2->resv2 = compat_up2.resv2; + return 0; +} +#endif
static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type) { struct io_uring_rsrc_update2 up; - if (size != sizeof(up)) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; +#ifdef CONFIG_COMPAT + if (ctx->compat) { + if (size != sizeof(struct compat_io_uring_rsrc_update2)) + return -EINVAL; + if (get_compat_io_uring_rsrc_update2(&up, arg)) + return -EFAULT; + } else +#endif + { + if (size != sizeof(up)) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + }
if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); }
+#ifdef CONFIG_COMPAT +static int get_compat_io_uring_rsrc_register(struct io_uring_rsrc_register *rr, + const void __user *user_rr) +{ + struct compat_io_uring_rsrc_register compat_rr;
+ if (unlikely(copy_from_user(&compat_rr, user_rr, sizeof(compat_rr)))) + return -EFAULT; + rr->nr = compat_rr.nr; + rr->resv = compat_rr.resv; + rr->resv2 = compat_rr.resv2; + rr->data = compat_rr.data; + rr->tags = compat_rr.tags; + return 0; +} +#endif
static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; - /* keep it extendible */ - if (size != sizeof(rr)) - return -EINVAL; +#ifdef CONFIG_COMPAT + if (ctx->compat) { + if (size != sizeof(struct compat_io_uring_rsrc_register)) + return -EINVAL; + if (get_compat_io_uring_rsrc_register(&rr, arg)) + return -EFAULT; + } else +#endif + { + /* keep it extendible */ + if (size != sizeof(rr)) + return -EINVAL; + if (copy_from_user(&rr, arg, size)) + return -EFAULT; + } - memset(&rr, 0, sizeof(rr)); - if (copy_from_user(&rr, arg, size)) - return -EFAULT; if (!rr.nr || rr.resv || rr.resv2) return -EINVAL; @@ -11799,6 +12158,13 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ } while (0) + BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
The condition is reversed :)
Actually, this __builtin_types_compatible_p returns 0 😅, so it doesn't do what I expected. Copying all the layout checks for the struct compat_io_uring_sqe doesn't feel right, so I am unsure what to do here. Any suggestion?
+ BUILD_BUG_ON(sizeof(struct compat_io_uring_files_update) != + sizeof(struct compat_io_uring_rsrc_update)); + BUILD_BUG_ON(sizeof(struct compat_io_uring_rsrc_update) > + sizeof(struct compat_io_uring_rsrc_update2));
Nit: I'd rather have the compat checks after the native ones.
Ack
Kevin
Many thanks for the in-depth review and suggestions!
Thanks, Tudor
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);