This series enables purecap apps to use the io_uring subsystem.
With these patches, all io_uring LTP tests pass in both Purecap and compat modes. Note that the LTP tests only exercise the basic functionality of io_uring; a significant portion of the multiplexed functionality remains untested.
Review branch: https://git.morello-project.org/tudcre01/linux/-/commits/io_uring_v1
Tudor Cretu (4):
  compiler_types: Add (u)intcap_t to native_words
  arm64: mm: Add VM_READ_CAPS and VM_WRITE_CAPS flags
  io_uring: Implement compat versions of uAPI structs and handle them
  io_uring: Use user pointer type in the uAPI structs
 arch/arm64/Kconfig             |   1 +
 arch/arm64/include/asm/mman.h  |   6 +
 fs/io_uring.c                  | 734 ++++++++++++++++++++++++++-------
 fs/proc/task_mmu.c             |   4 +
 include/linux/compiler_types.h |   7 +
 include/linux/mm.h             |  10 +
 include/uapi/linux/io_uring.h  |  32 +-
 7 files changed, 623 insertions(+), 171 deletions(-)
On CHERI architectures, the stores/loads of capabilities can be atomic. Add (u)intcap_t types to the native_words check in order to allow the stores/loads of capabilities to pass the checks for atomic operations.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 include/linux/compiler_types.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 36d8b41f7ccf..755b0b42f7b9 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -323,9 +323,16 @@ struct ftrace_likely_data {
 		default: (x)))
 
 /* Is this type a native word size -- useful for atomic operations */
+#ifdef __CHERI__
+#define __native_word(t) \
+	(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
+	 sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long) || \
+	 __same_type(t, __intcap_t) || __same_type(t, __uintcap_t))
+#else
 #define __native_word(t) \
 	(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
 	 sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
+#endif
 
 #ifdef __OPTIMIZE__
 # define __compiletime_assert(condition, msg, prefix, suffix) \
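For context, this is the sort of pattern the change unblocks. smp_load_acquire()/smp_store_release() assert __native_word() on their operand via compiletime_assert_atomic_type(), so a capability-sized field such as a __kernel_uintptr_t would otherwise fail to build. A minimal sketch (the struct and helpers below are invented for illustration):

	/* Hypothetical example, not part of the series */
	struct shared_slot {
		__kernel_uintptr_t val;	/* uintcap_t in the purecap kernel */
	};

	static inline __kernel_uintptr_t slot_read(struct shared_slot *s)
	{
		/* compiletime_assert_atomic_type() -> __native_word(t) */
		return smp_load_acquire(&s->val);
	}

	static inline void slot_write(struct shared_slot *s, __kernel_uintptr_t v)
	{
		smp_store_release(&s->val, v);
	}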
On 17/10/2022 19:09, Tudor Cretu wrote:
On CHERI architectures, the stores/loads of capabilities can be atomic.
Maybe "should be atomic", as otherwise it's a bit unclear what justifies this change.
Add (u)intcap_t types to the native_words check in order to allow the stores/loads of capabilities to pass the checks for atomic operations.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>

 include/linux/compiler_types.h | 7 +++++++
 1 file changed, 7 insertions(+)
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 36d8b41f7ccf..755b0b42f7b9 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -323,9 +323,16 @@ struct ftrace_likely_data {
 		default: (x)))
 
 /* Is this type a native word size -- useful for atomic operations */
+#ifdef __CHERI__
+#define __native_word(t) \
+	(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
+	 sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long) || \
+	 __same_type(t, __intcap_t) || __same_type(t, __uintcap_t))
I think you have made the best compromise here, but I just wanted to note that it won't work with capability pointers, notably user pointers. However we already know painfully well that we cannot tell whether a type is a pointer, so the only alternative to what you chose is == sizeof(__intcap_t) or similar, which is clearly less safe. It seems that this macro only needs to deal with capabilities in the form of __kernel_uintptr_t, so I'm in favour of your approach at least for the time being.
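For reference, the size-based alternative would look roughly like this; it would also cover capability pointers, but any 16-byte non-capability type would then pass the check, which is what makes it less safe:

	#define __native_word(t) \
		(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
		 sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long) || \
		 sizeof(t) == sizeof(__intcap_t))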
Kevin
+#else
 #define __native_word(t) \
 	(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
 	 sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
+#endif
 
 #ifdef __OPTIMIZE__
 # define __compiletime_assert(condition, msg, prefix, suffix) \
Some systems (e.g. io_uring) need to load/store capabilities in buffers shared with userspace. Shared mappings don't have load/store capability permissions by default, so add two new VM flags that allow such mappings to be set up.
Note: this doesn't allow userspace to create arbitrary shared mappings with tag access, as the flags are not exposed; the new VM flags are for internal use only.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 arch/arm64/Kconfig            |  1 +
 arch/arm64/include/asm/mman.h |  6 ++++++
 fs/proc/task_mmu.c            |  4 ++++
 include/linux/mm.h            | 10 ++++++++++
 4 files changed, 21 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c784d8664a40..e8e6b0f21a91 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1971,6 +1971,7 @@ config ARM64_MORELLO
 	depends on CC_HAS_MORELLO
 	select ARCH_NO_SWAP
 	select ARCH_HAS_CHERI_CAPABILITIES
+	select ARCH_USES_HIGH_VMA_FLAGS
 	help
 	  The Morello architecture is an experimental extension to Armv8.2-A,
 	  which extends the AArch64 state with the principles proposed in
diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index e3e28f7daf62..eb0b862121a2 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -55,6 +55,12 @@ static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 	if (vm_flags & VM_MTE)
 		prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED);
 
+	if (vm_flags & VM_READ_CAPS)
+		prot |= PTE_LOAD_CAPS;
+
+	if (vm_flags & VM_WRITE_CAPS)
+		prot |= PTE_STORE_CAPS;
+
 	return __pgprot(prot);
 }
 #define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f46060eb91b5..4f56772da016 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -697,6 +697,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
 		[ilog2(VM_UFFD_MINOR)]	= "ui",
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#ifdef CONFIG_ARM64_MORELLO
+		[ilog2(VM_READ_CAPS)]	= "rc",
+		[ilog2(VM_WRITE_CAPS)]	= "wc",
+#endif
 	};
 	size_t i;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9b7b730db4e9..67247211aefa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -368,6 +368,16 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_UFFD_MINOR		VM_NONE
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
 
+#ifdef CONFIG_ARM64_MORELLO
+# define VM_READ_CAPS_BIT	38
+# define VM_WRITE_CAPS_BIT	39
+# define VM_READ_CAPS		BIT(VM_READ_CAPS_BIT)
+# define VM_WRITE_CAPS		BIT(VM_WRITE_CAPS_BIT)
+#else
+# define VM_READ_CAPS		VM_NONE
+# define VM_WRITE_CAPS		VM_NONE
+#endif
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
On 17/10/2022 19:09, Tudor Cretu wrote:
Some systems (e.g. io_uring) need to load/store capabilities in buffers shared with userspace. Shared mappings don't have load/store capability permissions by default, so add two new VM flags that allow such mappings to be set up.
Note: this doesn't allow userspace to create arbitrary shared mappings with tag access, as the flags are not exposed; the new VM flags are for internal use only.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>

 arch/arm64/Kconfig            |  1 +
 arch/arm64/include/asm/mman.h |  6 ++++++
 fs/proc/task_mmu.c            |  4 ++++
 include/linux/mm.h            | 10 ++++++++++
 4 files changed, 21 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c784d8664a40..e8e6b0f21a91 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1971,6 +1971,7 @@ config ARM64_MORELLO
 	depends on CC_HAS_MORELLO
 	select ARCH_NO_SWAP
 	select ARCH_HAS_CHERI_CAPABILITIES
+	select ARCH_USES_HIGH_VMA_FLAGS
 	help
 	  The Morello architecture is an experimental extension to Armv8.2-A,
 	  which extends the AArch64 state with the principles proposed in
diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index e3e28f7daf62..eb0b862121a2 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -55,6 +55,12 @@ static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 	if (vm_flags & VM_MTE)
 		prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED);
 
+	if (vm_flags & VM_READ_CAPS)
+		prot |= PTE_LOAD_CAPS;
+
+	if (vm_flags & VM_WRITE_CAPS)
+		prot |= PTE_STORE_CAPS;
+
 	return __pgprot(prot);
 }
 #define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f46060eb91b5..4f56772da016 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -697,6 +697,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
 		[ilog2(VM_UFFD_MINOR)]	= "ui",
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#ifdef CONFIG_ARM64_MORELLO
+		[ilog2(VM_READ_CAPS)]	= "rc",
+		[ilog2(VM_WRITE_CAPS)]	= "wc",
+#endif
This is nice, people have been asking for exactly this before :)
 	};
 	size_t i;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9b7b730db4e9..67247211aefa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -368,6 +368,16 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_UFFD_MINOR		VM_NONE
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
 
+#ifdef CONFIG_ARM64_MORELLO
+# define VM_READ_CAPS_BIT	38
+# define VM_WRITE_CAPS_BIT	39
These are arch-specific (at least in the current form) so you should use VM_HIGH_ARCH_*, in fact I see you have already selected ARCH_USES_HIGH_VMA_FLAGS above. MTE uses 0/1 but 2/3 seem to be available still (anyway ARM64_MTE and ARM64_MORELLO are mutually exclusive options).
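Concretely, that could look something like this (a sketch only; whether bits 2/3 are actually free needs checking against the Morello tree):

	#ifdef CONFIG_ARM64_MORELLO
	# define VM_READ_CAPS	VM_HIGH_ARCH_2	/* MTE uses VM_HIGH_ARCH_0/1 */
	# define VM_WRITE_CAPS	VM_HIGH_ARCH_3
	#else
	# define VM_READ_CAPS	VM_NONE
	# define VM_WRITE_CAPS	VM_NONE
	#endif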
+# define VM_READ_CAPS		BIT(VM_READ_CAPS_BIT)
+# define VM_WRITE_CAPS		BIT(VM_WRITE_CAPS_BIT)
+#else
+# define VM_READ_CAPS		VM_NONE
+# define VM_WRITE_CAPS		VM_NONE
+#endif
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
This patch is a good start, but I would rather we didn't stop halfway through. With just this patch, VM_{READ,WRITE}_CAPS is only present when explicitly requested when creating the mapping. This means that private mappings don't have them, although they all have capability access enabled by virtue of d054e88a4994 ("arm64: morello: Enable access to capabilities in memory").

It would be much nicer if we also removed this magic addition of PTE_*_CAPS to user mappings from pgtable-prot.h and instead made sure that VM_{READ,WRITE}_CAPS are automatically added to all private user mappings (not only because this is cleaner, but more importantly because the new rc/wc smaps flags you're adding won't be set otherwise). This clearly belongs to at least one new commit; I'm not sure exactly how many places need to be changed (this needs to apply to all user mappings, whether they are created by userspace through mmap() or directly by the kernel itself, typically during execve()).
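To illustrate the direction (very much a sketch and untested; arch_calc_vm_flag_bits() exists upstream, but the Morello condition below is assumed):

	/* arch/arm64/include/asm/mman.h */
	static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
	{
		/*
		 * Give private mappings capability access by default,
		 * replacing the unconditional PTE_*_CAPS in pgtable-prot.h.
		 */
		if (IS_ENABLED(CONFIG_ARM64_MORELLO) && !(flags & MAP_SHARED))
			return VM_READ_CAPS | VM_WRITE_CAPS;
		return 0;
	}
	#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)

Mappings created directly by the kernel (e.g. during execve()) would still need separate handling, as noted above.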
Depending on the amount of changes required, it might make sense to split all this into a new series. If not, make sure to move all the commits to the beginning of the series as they are fundamental groundwork for the rest of the series.
Kevin
Introduce compat versions of the structs exposed in the uAPI headers that might contain pointers as members. Also, implement functions that convert the compat versions to the native versions of the structs.
Pull cqes member out from rings struct so that we are able to have a union between cqes and cqes_compat. This is done in a similar way to commit 75b28affdd6a ("io_uring: allocate the two rings together"), where sq_array was pulled out from the rings struct.
A subsequent patch is going to change the io_uring structs to enable them to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 fs/io_uring.c | 502 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 434 insertions(+), 68 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d901be8da630..2db37c68e715 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -115,6 +115,93 @@
 
 #define IO_TCTX_REFS_CACHE_NR	(1U << 10)
 
+struct compat_io_uring_sqe {
+	__u8	opcode;
+	__u8	flags;
+	__u16	ioprio;
+	__s32	fd;
+	union {
+		__u64	off;
+		__u64	addr2;
+	};
+	union {
+		__u64	addr;
+		__u64	splice_off_in;
+	};
+	__u32	len;
+	union {
+		__kernel_rwf_t	rw_flags;
+		__u32		fsync_flags;
+		__u16		poll_events;
+		__u32		poll32_events;
+		__u32		sync_range_flags;
+		__u32		msg_flags;
+		__u32		timeout_flags;
+		__u32		accept_flags;
+		__u32		cancel_flags;
+		__u32		open_flags;
+		__u32		statx_flags;
+		__u32		fadvise_advice;
+		__u32		splice_flags;
+		__u32		rename_flags;
+		__u32		unlink_flags;
+		__u32		hardlink_flags;
+	};
+	__u64	user_data;
+	union {
+		__u16	buf_index;
+		__u16	buf_group;
+	} __packed;
+	__u16	personality;
+	union {
+		__s32	splice_fd_in;
+		__u32	file_index;
+	};
+	__u64	__pad2[2];
+};
+
+struct compat_io_uring_cqe {
+	__u64	user_data;
+	__s32	res;
+	__u32	flags;
+};
+
+struct compat_io_uring_files_update {
+	__u32	offset;
+	__u32	resv;
+	__aligned_u64	fds;
+};
+
+struct compat_io_uring_rsrc_register {
+	__u32	nr;
+	__u32	resv;
+	__u64	resv2;
+	__aligned_u64	data;
+	__aligned_u64	tags;
+};
+
+struct compat_io_uring_rsrc_update {
+	__u32	offset;
+	__u32	resv;
+	__aligned_u64	data;
+};
+
+struct compat_io_uring_rsrc_update2 {
+	__u32	offset;
+	__u32	resv;
+	__aligned_u64	data;
+	__aligned_u64	tags;
+	__u32	nr;
+	__u32	resv2;
+};
+
+struct compat_io_uring_getevents_arg {
+	__u64	sigmask;
+	__u32	sigmask_sz;
+	__u32	pad;
+	__u64	ts;
+};
+
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
 	u32 tail ____cacheline_aligned_in_smp;
@@ -188,14 +275,6 @@ struct io_rings {
 	 * ordered with any other data.
 	 */
 	u32			cq_overflow;
-	/*
-	 * Ring buffer of completion events.
-	 *
-	 * The kernel writes completion events fresh every time they are
-	 * produced, so the application is allowed to modify pending
-	 * entries.
-	 */
-	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
 enum io_uring_cmd_flags {
@@ -216,7 +295,10 @@ struct io_mapped_ubuf {
 
 struct io_ring_ctx;
 
 struct io_overflow_cqe {
-	struct io_uring_cqe cqe;
+	union {
+		struct compat_io_uring_cqe compat_cqe;
+		struct io_uring_cqe cqe;
+	};
 	struct list_head list;
 };
 
@@ -345,6 +427,17 @@ struct io_ring_ctx {
 	struct percpu_ref	refs;
 
 	struct io_rings		*rings;
+	/*
+	 * Ring buffer of completion events.
+	 *
+	 * The kernel writes completion events fresh every time they are
+	 * produced, so the application is allowed to modify pending
+	 * entries.
+	 */
+	union {
+		struct compat_io_uring_cqe	*cqes_compat;
+		struct io_uring_cqe		*cqes;
+	};
 	unsigned int		flags;
 	unsigned int		compat: 1;
 	unsigned int		drain_next: 1;
@@ -371,7 +464,10 @@ struct io_ring_ctx {
 	 * array.
 	 */
 	u32			*sq_array;
-	struct io_uring_sqe	*sq_sqes;
+	union {
+		struct compat_io_uring_sqe	*sq_sqes_compat;
+		struct io_uring_sqe		*sq_sqes;
+	};
 	unsigned		cached_sq_head;
 	unsigned		sq_entries;
 	struct list_head	defer_list;
@@ -1804,10 +1900,9 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
 }
 
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+static inline void *io_get_cqe(struct io_ring_ctx *ctx)
 {
-	struct io_rings *rings = ctx->rings;
-	unsigned tail, mask = ctx->cq_entries - 1;
+	unsigned int cq_idx, tail, mask = ctx->cq_entries - 1;
 
 	/*
 	 * writes to the cq entry need to come after reading head; the
@@ -1818,8 +1913,19 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 		return NULL;
 
 	tail = ctx->cached_cq_tail++;
-	return &rings->cqes[tail & mask];
+	cq_idx = tail & mask;
+	return ctx->compat ? (void *)&ctx->cqes_compat[cq_idx] : (void *)&ctx->cqes[cq_idx];
+}
+
+#ifdef CONFIG_COMPAT
+static inline void convert_compat_io_uring_cqe(struct io_uring_cqe *cqe,
+					       const struct compat_io_uring_cqe *compat_cqe)
+{
+	cqe->user_data = READ_ONCE(compat_cqe->user_data);
+	cqe->res = READ_ONCE(compat_cqe->res);
+	cqe->flags = READ_ONCE(compat_cqe->flags);
 }
+#endif
 
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
@@ -1896,7 +2002,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	posted = false;
 	spin_lock(&ctx->completion_lock);
 	while (!list_empty(&ctx->cq_overflow_list)) {
-		struct io_uring_cqe *cqe = io_get_cqe(ctx);
+		void *cqe = io_get_cqe(ctx);
 		struct io_overflow_cqe *ocqe;
 
 		if (!cqe && !force)
@@ -1904,7 +2010,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
 					struct io_overflow_cqe, list);
 		if (cqe)
-			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+			memcpy(cqe, ctx->compat ? (void *)&ocqe->compat_cqe : (void *)&ocqe->cqe,
+			       ctx->compat ? sizeof(ocqe->compat_cqe) : sizeof(ocqe->cqe));
 		else
 			io_account_cq_overflow(ctx);
 
@@ -2010,9 +2117,18 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 				ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
 	}
-	ocqe->cqe.user_data = user_data;
-	ocqe->cqe.res = res;
-	ocqe->cqe.flags = cflags;
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		ocqe->compat_cqe.user_data = user_data;
+		ocqe->compat_cqe.res = res;
+		ocqe->compat_cqe.flags = cflags;
+	} else
+#endif
+	{
+		ocqe->cqe.user_data = user_data;
+		ocqe->cqe.res = res;
+		ocqe->cqe.flags = cflags;
+	}
 	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
 	return true;
 }
@@ -2020,7 +2136,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 				 s32 res, u32 cflags)
 {
-	struct io_uring_cqe *cqe;
+	void *cqe;
 
 	/*
 	 * If we can't get a cq entry, userspace overflowed the
@@ -2029,9 +2145,22 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 	 */
 	cqe = io_get_cqe(ctx);
 	if (likely(cqe)) {
-		WRITE_ONCE(cqe->user_data, user_data);
-		WRITE_ONCE(cqe->res, res);
-		WRITE_ONCE(cqe->flags, cflags);
+#ifdef CONFIG_COMPAT
+		if (ctx->compat) {
+			struct compat_io_uring_cqe *compat_cqe = (struct compat_io_uring_cqe *)cqe;
+
+			WRITE_ONCE(compat_cqe->user_data, user_data);
+			WRITE_ONCE(compat_cqe->res, res);
+			WRITE_ONCE(compat_cqe->flags, cflags);
+		} else
+#endif
+		{
+			struct io_uring_cqe *native_cqe = (struct io_uring_cqe *)cqe;
+
+			WRITE_ONCE(native_cqe->user_data, user_data);
+			WRITE_ONCE(native_cqe->res, res);
+			WRITE_ONCE(native_cqe->flags, cflags);
+		}
 		return true;
 	}
 	return io_cqring_event_overflow(ctx, user_data, res, cflags);
@@ -7811,7 +7940,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
  * used, it's important that those reads are done through READ_ONCE() to
  * prevent a re-load down the line.
  */
-static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
+static const void *io_get_sqe(struct io_ring_ctx *ctx)
 {
 	unsigned head, mask = ctx->sq_entries - 1;
 	unsigned sq_idx = ctx->cached_sq_head++ & mask;
@@ -7826,7 +7955,8 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	 */
 	head = READ_ONCE(ctx->sq_array[sq_idx]);
 	if (likely(head < ctx->sq_entries))
-		return &ctx->sq_sqes[head];
+		return ctx->compat ? (void *)&ctx->sq_sqes_compat[head]
+				   : (void *)&ctx->sq_sqes[head];
 
 	/* drop invalid entries */
 	ctx->cq_extra--;
@@ -7835,6 +7965,40 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
+#ifdef CONFIG_COMPAT
+static void convert_compat_io_uring_sqe(struct io_uring_sqe *sqe,
+					const struct compat_io_uring_sqe *compat_sqe)
+{
+/*
+ * The struct io_uring_sqe contains anonymous unions and there is no field keeping track of which
+ * union's member is active. Because in all the cases, the unions are between integral types and
+ * the types are compatible, use the largest member of each union to perform the copy. Use this
+ * compile-time check to ensure that the union's members are not truncated during the conversion.
+ */
+#define BUILD_BUG_COMPAT_SQE_UNION_ELEM(elem1, elem2) \
+	BUILD_BUG_ON(sizeof_field(struct compat_io_uring_sqe, elem1) != \
+		     (offsetof(struct compat_io_uring_sqe, elem2) - \
+		      offsetof(struct compat_io_uring_sqe, elem1)))
+	sqe->opcode = READ_ONCE(compat_sqe->opcode);
+	sqe->flags = READ_ONCE(compat_sqe->flags);
+	sqe->ioprio = READ_ONCE(compat_sqe->ioprio);
+	sqe->fd = READ_ONCE(compat_sqe->fd);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr2, addr);
+	sqe->addr2 = READ_ONCE(compat_sqe->addr2);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr, len);
+	sqe->addr = READ_ONCE(compat_sqe->addr);
+	sqe->len = READ_ONCE(compat_sqe->len);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(rw_flags, user_data);
+	sqe->rw_flags = READ_ONCE(compat_sqe->rw_flags);
+	sqe->user_data = READ_ONCE(compat_sqe->user_data);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(buf_index, personality);
+	sqe->buf_index = READ_ONCE(compat_sqe->buf_index);
+	sqe->personality = READ_ONCE(compat_sqe->personality);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(splice_fd_in, __pad2);
+	sqe->splice_fd_in = READ_ONCE(compat_sqe->splice_fd_in);
+}
+#endif
+
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	__must_hold(&ctx->uring_lock)
 {
@@ -7849,7 +8013,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
 	io_submit_state_start(&ctx->submit_state, nr);
 	do {
-		const struct io_uring_sqe *sqe;
+		const void *sqe;
 		struct io_kiocb *req;
 
 		if (unlikely(!io_alloc_req_refill(ctx))) {
@@ -7863,6 +8027,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 			wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
 			break;
 		}
+#ifdef CONFIG_COMPAT
+		if (ctx->compat) {
+			struct io_uring_sqe native_sqe;
+
+			convert_compat_io_uring_sqe(&native_sqe, sqe);
+			sqe = &native_sqe;
+		}
+#endif
 		/* will complete beyond this point, count as submitted */
 		submitted++;
 		if (io_submit_sqe(ctx, req, sqe)) {
@@ -9389,13 +9561,29 @@ static void *io_mem_alloc(size_t size)
 	return (void *) __get_free_pages(gfp, get_order(size));
 }
 
-static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
-				size_t *sq_offset)
+static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
+				unsigned int cq_entries, size_t *cq_offset, size_t *sq_offset)
 {
 	struct io_rings *rings;
-	size_t off, sq_array_size;
+	size_t off, cq_array_size, sq_array_size;
+
+	off = sizeof(*rings);
+
+#ifdef CONFIG_SMP
+	off = ALIGN(off, SMP_CACHE_BYTES);
+	if (off == 0)
+		return SIZE_MAX;
+#endif
+
+	if (cq_offset)
+		*cq_offset = off;
+
+	cq_array_size = array_size(ctx->compat ? sizeof(struct compat_io_uring_cqe)
+					       : sizeof(struct io_uring_cqe), cq_entries);
+	if (cq_array_size == SIZE_MAX)
+		return SIZE_MAX;
 
-	off = struct_size(rings, cqes, cq_entries);
+	off = size_add(off, cq_array_size);
 	if (off == SIZE_MAX)
 		return SIZE_MAX;
 
@@ -9412,7 +9600,8 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 	if (sq_array_size == SIZE_MAX)
 		return SIZE_MAX;
 
-	if (check_add_overflow(off, sq_array_size, &off))
+	off = size_add(off, sq_array_size);
+	if (off == SIZE_MAX)
 		return SIZE_MAX;
 
 	return off;
@@ -9972,7 +10161,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
 	io_mem_free(ctx->rings);
-	io_mem_free(ctx->sq_sqes);
+	io_mem_free(ctx->compat ? (void *)ctx->sq_sqes_compat : (void *)ctx->sq_sqes);
 
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
@@ -10518,6 +10707,21 @@ static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
 	return -EBUSY;
 }
 
+#ifdef CONFIG_COMPAT
+static int get_compat_io_uring_rsrc_update(struct io_uring_rsrc_update *up,
+					   const void __user *user_up)
+{
+	struct compat_io_uring_rsrc_update compat_up;
+
+	if (unlikely(copy_from_user(&compat_up, user_up, sizeof(compat_up))))
+		return -EFAULT;
+	up->offset = compat_up.offset;
+	up->resv = compat_up.resv;
+	up->data = compat_up.data;
+	return 0;
+}
+#endif
+
 /*
  * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
  * invocation. User passes in an array of struct io_uring_rsrc_update
@@ -10547,10 +10751,18 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
 	for (i = 0; i < nr_args; i++) {
 		int start, end;
 
-		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
-			ret = -EFAULT;
-			break;
-		}
+#ifdef CONFIG_COMPAT
+		if (ctx->compat) {
+			if (get_compat_io_uring_rsrc_update(&reg, &arg[i])) {
+				ret = -EFAULT;
+				break;
+			}
+		} else
+#endif
+		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+			ret = -EFAULT;
+			break;
+		}
 
 		if (reg.resv) {
 			ret = -EINVAL;
@@ -10599,10 +10811,19 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
 		return 0;
 
 	for (i = 0; i < nr_args; i++) {
-		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
-			ret = -EFAULT;
-			break;
-		}
+#ifdef CONFIG_COMPAT
+		if (ctx->compat) {
+			if (get_compat_io_uring_rsrc_update(&reg, &arg[i])) {
+				ret = -EFAULT;
+				break;
+			}
+		} else
+#endif
+		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+			ret = -EFAULT;
+			break;
+		}
+
 		if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
 			ret = -EINVAL;
 			break;
@@ -10632,7 +10853,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
 		ptr = ctx->rings;
 		break;
 	case IORING_OFF_SQES:
-		ptr = ctx->sq_sqes;
+		ptr = ctx->compat ? (void *)ctx->sq_sqes_compat : (void *)ctx->sq_sqes;
 		break;
 	default:
 		return ERR_PTR(-EINVAL);
@@ -10706,7 +10927,24 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 	return 0;
 }
 
-static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
+#ifdef CONFIG_COMPAT
+static int get_compat_io_uring_getevents_arg(struct io_uring_getevents_arg *arg,
+					     const void __user *user_arg)
+{
+	struct compat_io_uring_getevents_arg compat_arg;
+
+	if (unlikely(copy_from_user(&compat_arg, user_arg, sizeof(compat_arg))))
+		return -EFAULT;
+	arg->sigmask = compat_arg.sigmask;
+	arg->sigmask_sz = compat_arg.sigmask_sz;
+	arg->pad = compat_arg.pad;
+	arg->ts = compat_arg.ts;
+	return 0;
+}
+#endif
+
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags, const void __user *argp,
+			  size_t *argsz,
 #ifdef CONFIG_CHERI_PURECAP_UABI
 			  struct __kernel_timespec * __capability *ts,
 			  const sigset_t * __capability *sig)
@@ -10731,10 +10969,21 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
 	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
 	 * timespec and sigset_t pointers if good.
 	 */
-	if (*argsz != sizeof(arg))
-		return -EINVAL;
-	if (copy_from_user(&arg, argp, sizeof(arg)))
-		return -EFAULT;
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		if (*argsz != sizeof(struct compat_io_uring_getevents_arg))
+			return -EINVAL;
+		if (get_compat_io_uring_getevents_arg(&arg, argp))
+			return -EFAULT;
+	} else
+#endif
+	{
+		if (*argsz != sizeof(arg))
+			return -EINVAL;
+		if (copy_from_user(&arg, argp, sizeof(arg)))
+			return -EFAULT;
+	}
+
 	if (arg.pad)
 		return -EINVAL;
 	*sig = u64_to_user_ptr(arg.sigmask);
@@ -10827,7 +11076,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		const sigset_t __user *sig;
 		struct __kernel_timespec __user *ts;
 
-		ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+		ret = io_get_ext_arg(ctx, flags, argp, &argsz, &ts, &sig);
 		if (unlikely(ret))
 			goto out;
 
@@ -10926,7 +11175,16 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 
 		if (sq_idx > sq_mask)
 			continue;
-		sqe = &ctx->sq_sqes[sq_idx];
+#ifdef CONFIG_COMPAT
+		if (ctx->compat) {
+			struct io_uring_sqe native_sqe;
+			struct compat_io_uring_sqe *compat_sqe = &ctx->sq_sqes_compat[sq_idx];
+
+			convert_compat_io_uring_sqe(&native_sqe, compat_sqe);
+			sqe = &native_sqe;
+		} else
+#endif
+			sqe = &ctx->sq_sqes[sq_idx];
 		seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
 			   sq_idx, sqe->opcode, sqe->fd, sqe->flags,
 			   sqe->user_data);
@@ -10935,7 +11193,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 	cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
 	for (i = 0; i < cq_entries; i++) {
 		unsigned int entry = i + cq_head;
-		struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+		unsigned int cq_idx = entry & cq_mask;
+		struct io_uring_cqe *cqe;
+#ifdef CONFIG_COMPAT
+		if (ctx->compat) {
+			struct io_uring_cqe native_cqe;
+			struct compat_io_uring_cqe *compat_cqe = &ctx->cqes_compat[cq_idx];
+
+			convert_compat_io_uring_cqe(&native_cqe, compat_cqe);
+			cqe = &native_cqe;
+		} else
+#endif
+			cqe = &ctx->cqes[cq_idx];
 
 		seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
 			   entry & cq_mask, cqe->user_data, cqe->res,
@@ -10998,7 +11267,17 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 
 	seq_puts(m, "CqOverflowList:\n");
 	list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
-		struct io_uring_cqe *cqe = &ocqe->cqe;
+		struct io_uring_cqe *cqe;
+#ifdef CONFIG_COMPAT
+		if (ctx->compat) {
+			struct io_uring_cqe native_cqe;
+			struct compat_io_uring_cqe *compat_cqe = &ocqe->compat_cqe;
+
+			convert_compat_io_uring_cqe(&native_cqe, compat_cqe);
+			cqe = &native_cqe;
+		} else
+#endif
+			cqe = &ocqe->cqe;
 
 		seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
 			   cqe->user_data, cqe->res, cqe->flags);
@@ -11036,13 +11315,14 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 					 struct io_uring_params *p)
 {
 	struct io_rings *rings;
-	size_t size, sq_array_offset;
+	size_t size, cqes_offset, sq_array_offset;
+	void *cqes, *sqes;
 
 	/* make sure these are sane, as we already accounted them */
 	ctx->sq_entries = p->sq_entries;
 	ctx->cq_entries = p->cq_entries;
 
-	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
+	size = rings_size(ctx, p->sq_entries, p->cq_entries, &cqes_offset, &sq_array_offset);
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 
@@ -11051,25 +11331,40 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 		return -ENOMEM;
 
 	ctx->rings = rings;
+	cqes = (char *)rings + cqes_offset;
+#ifdef CONFIG_COMPAT
+	if (ctx->compat)
+		ctx->cqes_compat = (struct compat_io_uring_cqe *)cqes;
+	else
+#endif
+		ctx->cqes = (struct io_uring_cqe *)cqes;
+
 	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
 	rings->sq_ring_mask = p->sq_entries - 1;
 	rings->cq_ring_mask = p->cq_entries - 1;
 	rings->sq_ring_entries = p->sq_entries;
 	rings->cq_ring_entries = p->cq_entries;
 
-	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+	size = array_size(ctx->compat ? sizeof(struct compat_io_uring_sqe)
				      : sizeof(struct io_uring_sqe), p->sq_entries);
 	if (size == SIZE_MAX) {
 		io_mem_free(ctx->rings);
 		ctx->rings = NULL;
 		return -EOVERFLOW;
 	}
 
-	ctx->sq_sqes = io_mem_alloc(size);
-	if (!ctx->sq_sqes) {
+	sqes = io_mem_alloc(size);
+	if (!sqes) {
 		io_mem_free(ctx->rings);
 		ctx->rings = NULL;
 		return -ENOMEM;
 	}
+#ifdef CONFIG_COMPAT
+	if (ctx->compat)
+		ctx->sq_sqes_compat = (struct compat_io_uring_sqe *)sqes;
+	else
+#endif
+		ctx->sq_sqes = (struct io_uring_sqe *)sqes;
 
 	return 0;
 }
@@ -11210,7 +11505,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
 	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
 	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
-	p->cq_off.cqes = offsetof(struct io_rings, cqes);
+	p->cq_off.cqes = (char *)ctx->cqes - (char *)ctx->rings;
 	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
 
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
@@ -11453,39 +11748,103 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
 	if (!nr_args)
 		return -EINVAL;
 	memset(&up, 0, sizeof(up));
-	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
-		return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		if (get_compat_io_uring_rsrc_update((struct io_uring_rsrc_update *)&up, arg))
+			return -EFAULT;
+	} else
+#endif
+	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
+		return -EFAULT;
+
 	if (up.resv || up.resv2)
 		return -EINVAL;
 	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
 }
 
+#ifdef CONFIG_COMPAT
+static int get_compat_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2,
+					    const void __user *user_up2)
+{
+	struct compat_io_uring_rsrc_update2 compat_up2;
+
+	if (unlikely(copy_from_user(&compat_up2, user_up2, sizeof(compat_up2))))
+		return -EFAULT;
+	up2->offset = compat_up2.offset;
+	up2->resv = compat_up2.resv;
+	up2->data = compat_up2.data;
+	up2->tags = compat_up2.tags;
+	up2->nr = compat_up2.nr;
+	up2->resv2 = compat_up2.resv2;
+	return 0;
+}
+#endif
+
 static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
 				   unsigned size, unsigned type)
 {
 	struct io_uring_rsrc_update2 up;
 
-	if (size != sizeof(up))
-		return -EINVAL;
-	if (copy_from_user(&up, arg, sizeof(up)))
-		return -EFAULT;
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		if (size != sizeof(struct compat_io_uring_rsrc_update2))
+			return -EINVAL;
+		if (get_compat_io_uring_rsrc_update2(&up, arg))
+			return -EFAULT;
+	} else
+#endif
+	{
+		if (size != sizeof(up))
+			return -EINVAL;
+		if (copy_from_user(&up, arg, sizeof(up)))
+			return -EFAULT;
+	}
+
 	if (!up.nr || up.resv || up.resv2)
 		return -EINVAL;
 	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
 
+#ifdef CONFIG_COMPAT
+static int get_compat_io_uring_rsrc_register(struct io_uring_rsrc_register *rr,
+					     const void __user *user_rr)
+{
+	struct compat_io_uring_rsrc_register compat_rr;
+
+	if (unlikely(copy_from_user(&compat_rr, user_rr, sizeof(compat_rr))))
+		return -EFAULT;
+	rr->nr = compat_rr.nr;
+	rr->resv = compat_rr.resv;
+	rr->resv2 = compat_rr.resv2;
+	rr->data = compat_rr.data;
+	rr->tags = compat_rr.tags;
+	return 0;
+}
+#endif
+
 static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 				   unsigned int size, unsigned int type)
 {
 	struct io_uring_rsrc_register rr;
 
-	/* keep it extendible */
-	if (size != sizeof(rr))
-		return -EINVAL;
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		if (size != sizeof(struct compat_io_uring_rsrc_register))
+			return -EINVAL;
+		if (get_compat_io_uring_rsrc_register(&rr, arg))
+			return -EFAULT;
+	} else
+#endif
+	{
+		/* keep it extendible */
+		if (size != sizeof(rr))
+			return -EINVAL;
+		if (copy_from_user(&rr, arg, size))
+			return -EFAULT;
	}
 
-	memset(&rr, 0, sizeof(rr));
-	if (copy_from_user(&rr, arg, size))
-		return -EFAULT;
 	if (!rr.nr || rr.resv || rr.resv2)
 		return -EINVAL;
 
@@ -11799,6 +12158,13 @@ static int __init io_uring_init(void)
 	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
 } while (0)
 
+	BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
+
+	BUILD_BUG_ON(sizeof(struct compat_io_uring_files_update) !=
+		     sizeof(struct compat_io_uring_rsrc_update));
+	BUILD_BUG_ON(sizeof(struct compat_io_uring_rsrc_update) >
+		     sizeof(struct compat_io_uring_rsrc_update2));
+
 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
 	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
 	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
On 17/10/2022 19:09, Tudor Cretu wrote:
Introduce compat versions of the structs exposed in the uAPI headers that might contain pointers as members. Also, implement functions that convert the compat versions to the native versions of the structs.
Pull cqes member out from rings struct so that we are able to have a union between cqes and cqes_compat. This is done in a similar way to commit 75b28affdd6a ("io_uring: allocate the two rings together"), where sq_array was pulled out from the rings struct.
As discussed offline, better to split this out as this is logically separate (though related) and it would make it easier to see the implications of each change.
A subsequent patch is going to change the io_uring structs to enable them to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>

 fs/io_uring.c | 502 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 434 insertions(+), 68 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d901be8da630..2db37c68e715 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -115,6 +115,93 @@
 
 #define IO_TCTX_REFS_CACHE_NR	(1U << 10)
 
+struct compat_io_uring_sqe {
+	__u8	opcode;
+	__u8	flags;
+	__u16	ioprio;
+	__s32	fd;
+	union {
+		__u64	off;
+		__u64	addr2;
+	};
+	union {
+		__u64	addr;
+		__u64	splice_off_in;
+	};
+	__u32	len;
+	union {
+		__kernel_rwf_t	rw_flags;
+		__u32		fsync_flags;
+		__u16		poll_events;
+		__u32		poll32_events;
+		__u32		sync_range_flags;
+		__u32		msg_flags;
+		__u32		timeout_flags;
+		__u32		accept_flags;
+		__u32		cancel_flags;
+		__u32		open_flags;
+		__u32		statx_flags;
+		__u32		fadvise_advice;
+		__u32		splice_flags;
+		__u32		rename_flags;
+		__u32		unlink_flags;
+		__u32		hardlink_flags;
+	};
+	__u64	user_data;
+	union {
+		__u16	buf_index;
+		__u16	buf_group;
+	} __packed;
+	__u16	personality;
+	union {
+		__s32	splice_fd_in;
+		__u32	file_index;
+	};
+	__u64	__pad2[2];
+};
+
+struct compat_io_uring_cqe {
+	__u64	user_data;
+	__s32	res;
+	__u32	flags;
+};
+
+struct compat_io_uring_files_update {
+	__u32	offset;
+	__u32	resv;
+	__aligned_u64	fds;
+};
+
+struct compat_io_uring_rsrc_register {
+	__u32	nr;
+	__u32	resv;
+	__u64	resv2;
+	__aligned_u64	data;
+	__aligned_u64	tags;
+};
+
+struct compat_io_uring_rsrc_update {
+	__u32	offset;
+	__u32	resv;
+	__aligned_u64	data;
+};
+
+struct compat_io_uring_rsrc_update2 {
+	__u32	offset;
+	__u32	resv;
+	__aligned_u64	data;
+	__aligned_u64	tags;
+	__u32	nr;
+	__u32	resv2;
+};
+
+struct compat_io_uring_getevents_arg {
+	__u64	sigmask;
+	__u32	sigmask_sz;
+	__u32	pad;
+	__u64	ts;
+};
+
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
 	u32 tail ____cacheline_aligned_in_smp;
@@ -188,14 +275,6 @@ struct io_rings {
 	 * ordered with any other data.
 	 */
 	u32			cq_overflow;
-	/*
-	 * Ring buffer of completion events.
-	 *
-	 * The kernel writes completion events fresh every time they are
-	 * produced, so the application is allowed to modify pending
-	 * entries.
-	 */
-	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
 enum io_uring_cmd_flags {
@@ -216,7 +295,10 @@ struct io_mapped_ubuf {
 
 struct io_ring_ctx;
 
 struct io_overflow_cqe {
-	struct io_uring_cqe cqe;
+	union {
+		struct compat_io_uring_cqe compat_cqe;
+		struct io_uring_cqe cqe;
+	};
 	struct list_head list;
 };
 
@@ -345,6 +427,17 @@ struct io_ring_ctx {
 	struct percpu_ref	refs;
 
 	struct io_rings		*rings;
+	/*
+	 * Ring buffer of completion events.
+	 *
+	 * The kernel writes completion events fresh every time they are
+	 * produced, so the application is allowed to modify pending
+	 * entries.
+	 */
+	union {
+		struct compat_io_uring_cqe	*cqes_compat;
+		struct io_uring_cqe		*cqes;
+	};
Just curious, is there a particular reason to add this in that specific position in the struct?
 	unsigned int		flags;
 	unsigned int		compat: 1;
 	unsigned int		drain_next: 1;
@@ -371,7 +464,10 @@ struct io_ring_ctx {
 	 * array.
 	 */
 	u32			*sq_array;
-	struct io_uring_sqe	*sq_sqes;
+	union {
+		struct compat_io_uring_sqe	*sq_sqes_compat;
+		struct io_uring_sqe		*sq_sqes;
+	};
 	unsigned		cached_sq_head;
 	unsigned		sq_entries;
 	struct list_head	defer_list;
@@ -1804,10 +1900,9 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
 }
 
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+static inline void *io_get_cqe(struct io_ring_ctx *ctx)
 {
-	struct io_rings *rings = ctx->rings;
-	unsigned tail, mask = ctx->cq_entries - 1;
+	unsigned int cq_idx, tail, mask = ctx->cq_entries - 1;
 
 	/*
 	 * writes to the cq entry need to come after reading head; the
@@ -1818,8 +1913,19 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 		return NULL;
 
 	tail = ctx->cached_cq_tail++;
-	return &rings->cqes[tail & mask];
+	cq_idx = tail & mask;
+	return ctx->compat ? (void *)&ctx->cqes_compat[cq_idx] : (void *)&ctx->cqes[cq_idx];
Better to wrap at 80 char if reasonably possible, like here. Be careful, it looks like your editor is configured to wrap at 100 char (looking at the big comment you added further below).
+}
+
+#ifdef CONFIG_COMPAT
+static inline void convert_compat_io_uring_cqe(struct io_uring_cqe *cqe,
+					       const struct compat_io_uring_cqe *compat_cqe)
+{
+	cqe->user_data = READ_ONCE(compat_cqe->user_data);
+	cqe->res = READ_ONCE(compat_cqe->res);
+	cqe->flags = READ_ONCE(compat_cqe->flags);
 }
+#endif
I don't see any obvious rationale for the position of these new conversion helpers in the file, especially this one. I wonder if it wouldn't be better to group them all somewhere towards the top, not only to avoid difficult choices but also to reduce the risk of conflicts when rebasing.
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
@@ -1896,7 +2002,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	posted = false;
 	spin_lock(&ctx->completion_lock);
 	while (!list_empty(&ctx->cq_overflow_list)) {
-		struct io_uring_cqe *cqe = io_get_cqe(ctx);
+		void *cqe = io_get_cqe(ctx);
 		struct io_overflow_cqe *ocqe;
 
 		if (!cqe && !force)
@@ -1904,7 +2010,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
 					struct io_overflow_cqe, list);
 		if (cqe)
-			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+			memcpy(cqe, ctx->compat ? (void *)&ocqe->compat_cqe : (void *)&ocqe->cqe,
+			       ctx->compat ? sizeof(ocqe->compat_cqe) : sizeof(ocqe->cqe));
I think it's OK to leave the second argument unchanged: you know that the address of ->cqe and ->compat_cqe is the same (by design). You just need to choose the right size.
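i.e. something like:

	if (cqe)
		memcpy(cqe, &ocqe->cqe, ctx->compat ? sizeof(ocqe->compat_cqe)
						    : sizeof(ocqe->cqe));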
 		else
 			io_account_cq_overflow(ctx);
 
@@ -2010,9 +2117,18 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 				ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
 	}
-	ocqe->cqe.user_data = user_data;
-	ocqe->cqe.res = res;
-	ocqe->cqe.flags = cflags;
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		ocqe->compat_cqe.user_data = user_data;
+		ocqe->compat_cqe.res = res;
+		ocqe->compat_cqe.flags = cflags;
+	} else
+#endif
+	{
+		ocqe->cqe.user_data = user_data;
+		ocqe->cqe.res = res;
+		ocqe->cqe.flags = cflags;
+	}
For all such cases where there is a copy path for compat and another for native, I think it would be nice to create helpers, along the lines of copy_siginfo_from_user_any(). One advantage is that you can use an early return in the helper, avoiding the awkward if { ... } else followed by #endif.
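For instance, the overflow-CQE store above could be factored out like this (a sketch, helper name invented):

	static void io_fill_overflow_cqe(struct io_ring_ctx *ctx,
					 struct io_overflow_cqe *ocqe,
					 u64 user_data, s32 res, u32 cflags)
	{
	#ifdef CONFIG_COMPAT
		if (ctx->compat) {
			ocqe->compat_cqe.user_data = user_data;
			ocqe->compat_cqe.res = res;
			ocqe->compat_cqe.flags = cflags;
			return;
		}
	#endif
		ocqe->cqe.user_data = user_data;
		ocqe->cqe.res = res;
		ocqe->cqe.flags = cflags;
	}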
 	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
 	return true;
 }
@@ -2020,7 +2136,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 				 s32 res, u32 cflags)
 {
-	struct io_uring_cqe *cqe;
+	void *cqe;
 
 	/*
 	 * If we can't get a cq entry, userspace overflowed the
@@ -2029,9 +2145,22 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 	 */
 	cqe = io_get_cqe(ctx);
 	if (likely(cqe)) {
-		WRITE_ONCE(cqe->user_data, user_data);
-		WRITE_ONCE(cqe->res, res);
-		WRITE_ONCE(cqe->flags, cflags);
+#ifdef CONFIG_COMPAT
+		if (ctx->compat) {
+			struct compat_io_uring_cqe *compat_cqe = (struct compat_io_uring_cqe *)cqe;
+
+			WRITE_ONCE(compat_cqe->user_data, user_data);
+			WRITE_ONCE(compat_cqe->res, res);
+			WRITE_ONCE(compat_cqe->flags, cflags);
+		} else
+#endif
+		{
+			struct io_uring_cqe *native_cqe = (struct io_uring_cqe *)cqe;
+
+			WRITE_ONCE(native_cqe->user_data, user_data);
+			WRITE_ONCE(native_cqe->res, res);
+			WRITE_ONCE(native_cqe->flags, cflags);
+		}
 		return true;
 	}
 	return io_cqring_event_overflow(ctx, user_data, res, cflags);
@@ -7811,7 +7940,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
  * used, it's important that those reads are done through READ_ONCE() to
  * prevent a re-load down the line.
  */
-static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
+static const void *io_get_sqe(struct io_ring_ctx *ctx)
 {
 	unsigned head, mask = ctx->sq_entries - 1;
 	unsigned sq_idx = ctx->cached_sq_head++ & mask;
@@ -7826,7 +7955,8 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	 */
 	head = READ_ONCE(ctx->sq_array[sq_idx]);
 	if (likely(head < ctx->sq_entries))
-		return &ctx->sq_sqes[head];
+		return ctx->compat ? (void *)&ctx->sq_sqes_compat[head]
+				   : (void *)&ctx->sq_sqes[head];
 
 	/* drop invalid entries */
 	ctx->cq_extra--;
@@ -7835,6 +7965,40 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
+#ifdef CONFIG_COMPAT
+static void convert_compat_io_uring_sqe(struct io_uring_sqe *sqe,
+					const struct compat_io_uring_sqe *compat_sqe)
+{
+/*
+ * The struct io_uring_sqe contains anonymous unions and there is no field keeping track of which
+ * union's member is active. Because in all the cases, the unions are between integral types and
+ * the types are compatible, use the largest member of each union to perform the copy. Use this
+ * compile-time check to ensure that the union's members are not truncated during the conversion.
+ */
+#define BUILD_BUG_COMPAT_SQE_UNION_ELEM(elem1, elem2) \
+	BUILD_BUG_ON(sizeof_field(struct compat_io_uring_sqe, elem1) != \
+		     (offsetof(struct compat_io_uring_sqe, elem2) - \
+		      offsetof(struct compat_io_uring_sqe, elem1)))
An empty line here would help. Also, standard practice when declaring a macro inside a function is to #undef it at the end of the function. I see that io_uring_init() doesn't do that, but that's somewhat acceptable as it's the last function in the file.
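That is:

	#define BUILD_BUG_COMPAT_SQE_UNION_ELEM(elem1, elem2) \
		BUILD_BUG_ON(sizeof_field(struct compat_io_uring_sqe, elem1) != \
			     (offsetof(struct compat_io_uring_sqe, elem2) - \
			      offsetof(struct compat_io_uring_sqe, elem1)))

		sqe->opcode = READ_ONCE(compat_sqe->opcode);

with a matching

	#undef BUILD_BUG_COMPAT_SQE_UNION_ELEM

just before the closing brace of convert_compat_io_uring_sqe().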
+	sqe->opcode = READ_ONCE(compat_sqe->opcode);
+	sqe->flags = READ_ONCE(compat_sqe->flags);
+	sqe->ioprio = READ_ONCE(compat_sqe->ioprio);
+	sqe->fd = READ_ONCE(compat_sqe->fd);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr2, addr);
+	sqe->addr2 = READ_ONCE(compat_sqe->addr2);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr, len);
+	sqe->addr = READ_ONCE(compat_sqe->addr);
+	sqe->len = READ_ONCE(compat_sqe->len);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(rw_flags, user_data);
+	sqe->rw_flags = READ_ONCE(compat_sqe->rw_flags);
+	sqe->user_data = READ_ONCE(compat_sqe->user_data);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(buf_index, personality);
+	sqe->buf_index = READ_ONCE(compat_sqe->buf_index);
+	sqe->personality = READ_ONCE(compat_sqe->personality);
+	BUILD_BUG_COMPAT_SQE_UNION_ELEM(splice_fd_in, __pad2);
+	sqe->splice_fd_in = READ_ONCE(compat_sqe->splice_fd_in);
+}
+#endif
+
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	__must_hold(&ctx->uring_lock)
 {
@@ -7849,7 +8013,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
 	io_submit_state_start(&ctx->submit_state, nr);
 	do {
-		const struct io_uring_sqe *sqe;
+		const void *sqe;
 		struct io_kiocb *req;
 
 		if (unlikely(!io_alloc_req_refill(ctx))) {
@@ -7863,6 +8027,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 			wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
 			break;
 		}
+#ifdef CONFIG_COMPAT
+		if (ctx->compat) {
+			struct io_uring_sqe native_sqe;
+
+			convert_compat_io_uring_sqe(&native_sqe, sqe);
+			sqe = &native_sqe;
That is a textbook example of use-after-free, native_sqe goes out of scope on the next line and sqe becomes a dangling pointer :) To my disappointment Clang doesn't seem to warn even about such a straightforward case, but interestingly GCC 12 does (looks like a new warning, -Wdangling-pointer).
Same issue with the conversions in __io_uring_show_fdinfo().
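One way out (a sketch): give the converted SQE storage whose lifetime covers every use of sqe, e.g. by declaring it in the scope that dereferences it:

	do {
		const void *sqe = io_get_sqe(ctx);
	#ifdef CONFIG_COMPAT
		/* same scope as the later uses of sqe */
		struct io_uring_sqe native_sqe;

		if (ctx->compat && sqe) {
			convert_compat_io_uring_sqe(&native_sqe, sqe);
			sqe = &native_sqe;
		}
	#endif
		/* ... sqe stays valid for the rest of the iteration ... */
	} while (submitted < nr);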
+		}
+#endif
 		/* will complete beyond this point, count as submitted */
 		submitted++;
 		if (io_submit_sqe(ctx, req, sqe)) {
@@ -9389,13 +9561,29 @@ static void *io_mem_alloc(size_t size)
 	return (void *) __get_free_pages(gfp, get_order(size));
 }
 
-static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
-				size_t *sq_offset)
+static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
+				unsigned int cq_entries, size_t *cq_offset, size_t *sq_offset)
 {
 	struct io_rings *rings;
-	size_t off, sq_array_size;
+	size_t off, cq_array_size, sq_array_size;
+
+	off = sizeof(*rings);
+
+#ifdef CONFIG_SMP
+	off = ALIGN(off, SMP_CACHE_BYTES);
+	if (off == 0)
+		return SIZE_MAX;
+#endif
+
+	if (cq_offset)
+		*cq_offset = off;
+
+	cq_array_size = array_size(ctx->compat ? sizeof(struct compat_io_uring_cqe)
+					       : sizeof(struct io_uring_cqe), cq_entries);
+	if (cq_array_size == SIZE_MAX)
+		return SIZE_MAX;
 
-	off = struct_size(rings, cqes, cq_entries);
+	off = size_add(off, cq_array_size);
 	if (off == SIZE_MAX)
 		return SIZE_MAX;
 
@@ -9412,7 +9600,8 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 	if (sq_array_size == SIZE_MAX)
 		return SIZE_MAX;
 
-	if (check_add_overflow(off, sq_array_size, &off))
+	off = size_add(off, sq_array_size);
+	if (off == SIZE_MAX)
Could you elaborate on why we need this change?
 		return SIZE_MAX;
 
 	return off;
@@ -9972,7 +10161,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
 	io_mem_free(ctx->rings);
-	io_mem_free(ctx->sq_sqes);
+	io_mem_free(ctx->compat ? (void *)ctx->sq_sqes_compat : (void *)ctx->sq_sqes);
I find this sort of diff slightly frustrating as sq_sqes and sq_sqes_compat are both pointers, simply to a different type. If I got this correctly, you are making use of the type in two cases, while it is irrelevant in three (like here). However it is technically UB to refer to the inactive member of the union, so just getting rid of this diff wouldn't be recommended.
Fortunately there is an alternative: instead of having a union of pointers, make it a pointer to a union, e.g.:
	union {
		struct io_uring_sqe native;
		struct compat_io_uring_sqe compat;
	} *sq_sqes;
This way sq_sqes can still be viewed as a void * without change, and the cases where the type matters have to be changed anyway.
Might as well do the same with cqes/cqes_compat to be consistent.
percpu_ref_exit(&ctx->refs); free_uid(ctx->user); @@ -10518,6 +10707,21 @@ static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, return -EBUSY; } +#ifdef CONFIG_COMPAT +static int get_compat_io_uring_rsrc_update(struct io_uring_rsrc_update *up,
const void __user *user_up)
+{
- struct compat_io_uring_rsrc_update compat_up;
- if (unlikely(copy_from_user(&compat_up, user_up, sizeof(compat_up))))
return -EFAULT;
- up->offset = compat_up.offset;
- up->resv = compat_up.resv;
- up->data = compat_up.data;
- return 0;
+} +#endif
- /*
- Register a ring fd to avoid fdget/fdput for each io_uring_enter()
- invocation. User passes in an array of struct io_uring_rsrc_update
@@ -10547,10 +10751,18 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, for (i = 0; i < nr_args; i++) { int start, end;
if (copy_from_user(®, &arg[i], sizeof(reg))) {
ret = -EFAULT;
break;
}
+#ifdef CONFIG_COMPAT
if (ctx->compat) {
if (get_compat_io_uring_rsrc_update(®, &arg[i])) {
ret = -EFAULT;
break;
}
} else
+#endif
if (copy_from_user(®, &arg[i], sizeof(reg))) {
ret = -EFAULT;
break;
}
if (reg.resv) { ret = -EINVAL; @@ -10599,10 +10811,19 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, return 0; for (i = 0; i < nr_args; i++) {
if (copy_from_user(®, &arg[i], sizeof(reg))) {
ret = -EFAULT;
break;
}
+#ifdef CONFIG_COMPAT
if (ctx->compat) {
if (get_compat_io_uring_rsrc_update(®, &arg[i])) {
ret = -EFAULT;
break;
}
} else
+#endif
if (copy_from_user(®, &arg[i], sizeof(reg))) {
ret = -EFAULT;
break;
}
- if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break;
@@ -10632,7 +10853,7 @@ static void *io_uring_validate_mmap_request(struct file *file, ptr = ctx->rings; break; case IORING_OFF_SQES:
ptr = ctx->sq_sqes;
break; default: return ERR_PTR(-EINVAL);ptr = ctx->compat ? (void *)ctx->sq_sqes_compat : (void *)ctx->sq_sqes;
@@ -10706,7 +10927,24 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, +#ifdef CONFIG_COMPAT +static int get_compat_io_uring_getevents_arg(struct io_uring_getevents_arg *arg,
const void __user *user_arg)
+{
- struct compat_io_uring_getevents_arg compat_arg;
- if (unlikely(copy_from_user(&compat_arg, user_arg, sizeof(compat_arg))))
return -EFAULT;
- arg->sigmask = compat_arg.sigmask;
- arg->sigmask_sz = compat_arg.sigmask_sz;
- arg->pad = compat_arg.pad;
- arg->ts = compat_arg.ts;
- return 0;
+} +#endif
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags, const void __user *argp,
#ifdef CONFIG_CHERI_PURECAP_UABI struct __kernel_timespec * __capability *ts, const sigset_t * __capability *sig)size_t *argsz,
@@ -10731,10 +10969,21 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */
- if (*argsz != sizeof(arg))
return -EINVAL;
- if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
+#ifdef CONFIG_COMPAT
- if (ctx->compat) {
if (*argsz != sizeof(struct compat_io_uring_getevents_arg))
return -EINVAL;
if (get_compat_io_uring_getevents_arg(&arg, argp))
return -EFAULT;
- } else
+#endif
- {
if (*argsz != sizeof(arg))
return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
- }
- if (arg.pad) return -EINVAL; *sig = u64_to_user_ptr(arg.sigmask);
@@ -10827,7 +11076,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, const sigset_t __user *sig; struct __kernel_timespec __user *ts;
ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
if (unlikely(ret)) goto out;ret = io_get_ext_arg(ctx, flags, argp, &argsz, &ts, &sig);
@@ -10926,7 +11175,16 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, if (sq_idx > sq_mask) continue;
sqe = &ctx->sq_sqes[sq_idx];
+#ifdef CONFIG_COMPAT
if (ctx->compat) {
struct io_uring_sqe native_sqe;
struct compat_io_uring_sqe *compat_sqe = &ctx->sq_sqes_compat[sq_idx];
convert_compat_io_uring_sqe(&native_sqe, compat_sqe);
sqe = &native_sqe;
} else
+#endif
seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", sq_idx, sqe->opcode, sqe->fd, sqe->flags, sqe->user_data);sqe = &ctx->sq_sqes[sq_idx];
@@ -10935,7 +11193,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head;
struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
unsigned int cq_idx = entry & cq_mask;
struct io_uring_cqe *cqe;
+#ifdef CONFIG_COMPAT
if (ctx->compat) {
struct io_uring_cqe native_cqe;
struct compat_io_uring_cqe *compat_cqe = &ctx->cqes_compat[cq_idx];
convert_compat_io_uring_cqe(&native_cqe, compat_cqe);
cqe = &native_cqe;
} else
+#endif
cqe = &ctx->cqes[cq_idx];
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", entry & cq_mask, cqe->user_data, cqe->res, @@ -10998,7 +11267,17 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, seq_puts(m, "CqOverflowList:\n"); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
struct io_uring_cqe *cqe = &ocqe->cqe;
struct io_uring_cqe *cqe;
+#ifdef CONFIG_COMPAT
if (ctx->compat) {
struct io_uring_cqe native_cqe;
struct compat_io_uring_cqe *compat_cqe = &ocqe->compat_cqe;
convert_compat_io_uring_cqe(&native_cqe, compat_cqe);
cqe = &native_cqe;
} else
+#endif
cqe = &ocqe->cqe;
seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", cqe->user_data, cqe->res, cqe->flags); @@ -11036,13 +11315,14 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { struct io_rings *rings;
- size_t size, sq_array_offset;
- size_t size, cqes_offset, sq_array_offset;
- void *cqes, *sqes;
/* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries;
- size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
- size = rings_size(ctx, p->sq_entries, p->cq_entries, &cqes_offset, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW;
@@ -11051,25 +11331,40 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -ENOMEM; ctx->rings = rings;
- cqes = (char *)rings + cqes_offset;
+#ifdef CONFIG_COMPAT
- if (ctx->compat)
ctx->cqes_compat = (struct compat_io_uring_cqe *)cqes;
- else
+#endif
ctx->cqes = (struct io_uring_cqe *)cqes;
- ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; rings->cq_ring_mask = p->cq_entries - 1; rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries;
- size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
- size = array_size(ctx->compat ? sizeof(struct compat_io_uring_sqe)
if (size == SIZE_MAX) { io_mem_free(ctx->rings); ctx->rings = NULL; return -EOVERFLOW; }: sizeof(struct io_uring_sqe), p->sq_entries);
- ctx->sq_sqes = io_mem_alloc(size);
- if (!ctx->sq_sqes) {
- sqes = io_mem_alloc(size);
- if (!sqes) { io_mem_free(ctx->rings); ctx->rings = NULL; return -ENOMEM; }
+#ifdef CONFIG_COMPAT
- if (ctx->compat)
ctx->sq_sqes_compat = (struct compat_io_uring_sqe *)sqes;
- else
+#endif
ctx->sq_sqes = (struct io_uring_sqe *)sqes;
return 0; } @@ -11210,7 +11505,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
- p->cq_off.cqes = offsetof(struct io_rings, cqes);
- p->cq_off.cqes = (char *)ctx->cqes - (char *)ctx->rings; p->cq_off.flags = offsetof(struct io_rings, cq_flags);
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | @@ -11453,39 +11748,103 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, if (!nr_args) return -EINVAL; memset(&up, 0, sizeof(up));
-       if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
-               return -EFAULT;
+#ifdef CONFIG_COMPAT
+       if (ctx->compat) {
+               if (get_compat_io_uring_rsrc_update((struct io_uring_rsrc_update *)&up, arg))
This is rather dubious from a strict aliasing perspective. Mind you, it's really the ABI that's scary here (struct io_uring_rsrc_update is supposed to represent the bottom part of struct io_uring_rsrc_update2, which is apparently used internally to avoid duplication). When doing a copy_from_user(), there's no aliasing concern as copy_from_user() doesn't care about types, but get_compat_io_uring_rsrc_update() does access up as a struct io_uring_rsrc_update and that's not really OK.
The best is probably to have an (admittedly somewhat strange) helper that takes a struct io_uring_rsrc_update2 * instead (without touching the extra fields). This is not too bad though as there should be two versions of get_compat_io_uring_rsrc_update() anyway, one for this function and one for the two others above, see my comments in patch 4 as to why that is.
+                       return -EFAULT;
+       } else
+#endif
+               if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
+                       return -EFAULT;
        if (up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); }
+#ifdef CONFIG_COMPAT
+static int get_compat_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2,
+                                           const void __user *user_up2)
+{
+       struct compat_io_uring_rsrc_update2 compat_up2;
+
+       if (unlikely(copy_from_user(&compat_up2, user_up2, sizeof(compat_up2))))
+               return -EFAULT;
+       up2->offset = compat_up2.offset;
+       up2->resv = compat_up2.resv;
+       up2->data = compat_up2.data;
+       up2->tags = compat_up2.tags;
+       up2->nr = compat_up2.nr;
+       up2->resv2 = compat_up2.resv2;
+       return 0;
+}
+#endif
static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type) { struct io_uring_rsrc_update2 up;
-       if (size != sizeof(up))
-               return -EINVAL;
-       if (copy_from_user(&up, arg, sizeof(up)))
-               return -EFAULT;
+#ifdef CONFIG_COMPAT
+       if (ctx->compat) {
+               if (size != sizeof(struct compat_io_uring_rsrc_update2))
+                       return -EINVAL;
+               if (get_compat_io_uring_rsrc_update2(&up, arg))
+                       return -EFAULT;
+       } else
+#endif
+       {
+               if (size != sizeof(up))
+                       return -EINVAL;
+               if (copy_from_user(&up, arg, sizeof(up)))
+                       return -EFAULT;
+       }
        if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); }
+#ifdef CONFIG_COMPAT
+static int get_compat_io_uring_rsrc_register(struct io_uring_rsrc_register *rr,
+                                            const void __user *user_rr)
+{
+       struct compat_io_uring_rsrc_register compat_rr;
+
+       if (unlikely(copy_from_user(&compat_rr, user_rr, sizeof(compat_rr))))
+               return -EFAULT;
+       rr->nr = compat_rr.nr;
+       rr->resv = compat_rr.resv;
+       rr->resv2 = compat_rr.resv2;
+       rr->data = compat_rr.data;
+       rr->tags = compat_rr.tags;
+       return 0;
+}
+#endif
static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr;
-       /* keep it extendible */
-       if (size != sizeof(rr))
-               return -EINVAL;
+#ifdef CONFIG_COMPAT
+       if (ctx->compat) {
+               if (size != sizeof(struct compat_io_uring_rsrc_register))
+                       return -EINVAL;
+               if (get_compat_io_uring_rsrc_register(&rr, arg))
+                       return -EFAULT;
+       } else
+#endif
+       {
+               /* keep it extendible */
+               if (size != sizeof(rr))
+                       return -EINVAL;
+               if (copy_from_user(&rr, arg, size))
+                       return -EFAULT;
+       }
-       memset(&rr, 0, sizeof(rr));
-       if (copy_from_user(&rr, arg, size))
-               return -EFAULT;
        if (!rr.nr || rr.resv || rr.resv2) return -EINVAL;
@@ -11799,6 +12158,13 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ } while (0)
+       BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
The condition is reversed :)
+       BUILD_BUG_ON(sizeof(struct compat_io_uring_files_update) !=
+                    sizeof(struct compat_io_uring_rsrc_update));
+       BUILD_BUG_ON(sizeof(struct compat_io_uring_rsrc_update) >
+                    sizeof(struct compat_io_uring_rsrc_update2));
Nit: I'd rather have the compat checks after the native ones.
Kevin
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
On 28-10-2022 12:57, Kevin Brodsky wrote:
On 17/10/2022 19:09, Tudor Cretu wrote:
Introduce compat versions of the structs exposed in the uAPI headers that might contain pointers as a member. Also, implement functions that convert the compat versions to the native versions of the struct.
Pull cqes member out from rings struct so that we are able to have a union between cqes and cqes_compat. This is done in a similar way to commit 75b28affdd6a ("io_uring: allocate the two rings together"), where sq_array was pulled out from the rings struct.
As discussed offline, better to split this out as this is logically separate (though related) and it would make it easier to see the implications of each change.
Ack
A subsequent patch is going to change the io_uring structs to enable them to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/io_uring.c | 502 +++++++++++++++++++++++++++++++++++++++++++-------  1 file changed, 434 insertions(+), 68 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index d901be8da630..2db37c68e715 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -115,6 +115,93 @@  #define IO_TCTX_REFS_CACHE_NR   (1U << 10) +struct compat_io_uring_sqe { +   __u8   opcode; +   __u8   flags; +   __u16   ioprio; +   __s32   fd; +   union { +       __u64   off; +       __u64   addr2; +   }; +   union { +       __u64   addr; +       __u64   splice_off_in; +   }; +   __u32   len; +   union { +       __kernel_rwf_t   rw_flags; +       __u32       fsync_flags; +       __u16       poll_events; +       __u32       poll32_events; +       __u32       sync_range_flags; +       __u32       msg_flags; +       __u32       timeout_flags; +       __u32       accept_flags; +       __u32       cancel_flags; +       __u32       open_flags; +       __u32       statx_flags; +       __u32       fadvise_advice; +       __u32       splice_flags; +       __u32       rename_flags; +       __u32       unlink_flags; +       __u32       hardlink_flags; +   }; +   __u64   user_data; +   union { +       __u16   buf_index; +       __u16   buf_group; +   } __packed; +   __u16   personality; +   union { +       __s32   splice_fd_in; +       __u32   file_index; +   }; +   __u64   __pad2[2]; +};
+struct compat_io_uring_cqe { +   __u64   user_data; +   __s32   res; +   __u32   flags; +};
+struct compat_io_uring_files_update { +   __u32 offset; +   __u32 resv; +   __aligned_u64 fds; +};
+struct compat_io_uring_rsrc_register { +   __u32 nr; +   __u32 resv; +   __u64 resv2; +   __aligned_u64 data; +   __aligned_u64 tags; +};
+struct compat_io_uring_rsrc_update { +   __u32 offset; +   __u32 resv; +   __aligned_u64 data; +};
+struct compat_io_uring_rsrc_update2 { +   __u32 offset; +   __u32 resv; +   __aligned_u64 data; +   __aligned_u64 tags; +   __u32 nr; +   __u32 resv2; +};
+struct compat_io_uring_getevents_arg { +   __u64 sigmask; +   __u32 sigmask_sz; +   __u32 pad; +   __u64 ts; +};
struct io_uring {      u32 head ____cacheline_aligned_in_smp;      u32 tail ____cacheline_aligned_in_smp; @@ -188,14 +275,6 @@ struct io_rings {       * ordered with any other data.       */      u32           cq_overflow; -   /* -    * Ring buffer of completion events. -    * -    * The kernel writes completion events fresh every time they are -    * produced, so the application is allowed to modify pending -    * entries. -    */ -   struct io_uring_cqe   cqes[] ____cacheline_aligned_in_smp;  };  enum io_uring_cmd_flags { @@ -216,7 +295,10 @@ struct io_mapped_ubuf {  struct io_ring_ctx;  struct io_overflow_cqe { -   struct io_uring_cqe cqe; +   union { +       struct compat_io_uring_cqe compat_cqe; +       struct io_uring_cqe cqe; +   };      struct list_head list;  }; @@ -345,6 +427,17 @@ struct io_ring_ctx {          struct percpu_ref   refs;          struct io_rings       *rings; +       /* +        * Ring buffer of completion events. +        * +        * The kernel writes completion events fresh every time they are +        * produced, so the application is allowed to modify pending +        * entries. +        */ +       union { +           struct compat_io_uring_cqe   *cqes_compat; +           struct io_uring_cqe       *cqes; +       };
Just curious, is there a particular reason to add this in that specific position in the struct?
No, just an artefact of one of my previous iterations. I moved it before sq_array now.
unsigned int       flags;          unsigned int       compat: 1;          unsigned int       drain_next: 1; @@ -371,7 +464,10 @@ struct io_ring_ctx {           * array.           */          u32           *sq_array; -       struct io_uring_sqe   *sq_sqes; +       union { +           struct compat_io_uring_sqe   *sq_sqes_compat; +           struct io_uring_sqe       *sq_sqes; +       };          unsigned       cached_sq_head;          unsigned       sq_entries;          struct list_head   defer_list; @@ -1804,10 +1900,9 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)      return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);  } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +static inline void *io_get_cqe(struct io_ring_ctx *ctx)  { -   struct io_rings *rings = ctx->rings; -   unsigned tail, mask = ctx->cq_entries - 1; +   unsigned int cq_idx, tail, mask = ctx->cq_entries - 1;      /*       * writes to the cq entry need to come after reading head; the @@ -1818,8 +1913,19 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)          return NULL;      tail = ctx->cached_cq_tail++; -   return &rings->cqes[tail & mask]; +   cq_idx = tail & mask; +   return ctx->compat ? (void *)&ctx->cqes_compat[cq_idx] : (void *)&ctx->cqes[cq_idx];
Better to wrap at 80 char if reasonably possible, like here. Be careful, it looks like your editor is configured to wrap at 100 char (looking at the big comment you added further below).
checkpatch.pl was telling me to wrap it at 100 char, but yes, I should've done it at 80. Ack
+}
+#ifdef CONFIG_COMPAT +static inline void convert_compat_io_uring_cqe(struct io_uring_cqe *cqe, +                          const struct compat_io_uring_cqe *compat_cqe) +{ +   cqe->user_data = READ_ONCE(compat_cqe->user_data); +   cqe->res = READ_ONCE(compat_cqe->res); +   cqe->flags = READ_ONCE(compat_cqe->flags); } +#endif
I don't see any obvious rationale for the position of these new conversion helpers in the file, especially this one. I wonder if it wouldn't be better to group them all somewhere towards the top, not only to avoid difficult choices but also to reduce the risk of conflicts when rebasing.
They were each before the first function that uses them, so yes, not obvious. I moved them to the top. Thanks!
static void io_eventfd_signal(struct io_ring_ctx *ctx)  { @@ -1896,7 +2002,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)      posted = false;      spin_lock(&ctx->completion_lock);      while (!list_empty(&ctx->cq_overflow_list)) { -       struct io_uring_cqe *cqe = io_get_cqe(ctx); +       void *cqe = io_get_cqe(ctx);          struct io_overflow_cqe *ocqe;          if (!cqe && !force) @@ -1904,7 +2010,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)          ocqe = list_first_entry(&ctx->cq_overflow_list,                          struct io_overflow_cqe, list);          if (cqe) -           memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); +           memcpy(cqe, ctx->compat ? (void *)&ocqe->compat_cqe : (void *)&ocqe->cqe, +                  ctx->compat ? sizeof(ocqe->compat_cqe) : sizeof(ocqe->cqe));
I think it's OK to leave the second argument unchanged: you know that the address of ->cqe and ->compat_cqe is the same (by design). You just need to choose the right size.
Ack
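
For instance (sketch of what I'll change it to; relies on ->cqe and ->compat_cqe starting at the same address, as you note):

        if (cqe)
                /* the union members share an address by design,
                 * so only the copied size needs to differ */
                memcpy(cqe, &ocqe->cqe, ctx->compat ?
                       sizeof(ocqe->compat_cqe) : sizeof(ocqe->cqe));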
else              io_account_cq_overflow(ctx); @@ -2010,9 +2117,18 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,                 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);      } -   ocqe->cqe.user_data = user_data; -   ocqe->cqe.res = res; -   ocqe->cqe.flags = cflags; +#ifdef CONFIG_COMPAT +   if (ctx->compat) { +       ocqe->compat_cqe.user_data = user_data; +       ocqe->compat_cqe.res = res; +       ocqe->compat_cqe.flags = cflags; +   } else +#endif +   { +       ocqe->cqe.user_data = user_data; +       ocqe->cqe.res = res; +       ocqe->cqe.flags = cflags; +   }
For all such cases where there is a copy path for compat and another for native, I think it would be nice to create helpers, along the lines of copy_siginfo_from_user_any(). One advantage is that you can use an early return in the helper, avoiding the awkward if { ... } else followed by #endif.
Ack
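
Something along these lines, then (a minimal sketch; the helper name is made up):

static void io_fill_overflow_cqe(struct io_ring_ctx *ctx,
                                 struct io_overflow_cqe *ocqe,
                                 u64 user_data, s32 res, u32 cflags)
{
#ifdef CONFIG_COMPAT
        if (ctx->compat) {
                ocqe->compat_cqe.user_data = user_data;
                ocqe->compat_cqe.res = res;
                ocqe->compat_cqe.flags = cflags;
                /* early return: no if/else spanning the #endif */
                return;
        }
#endif
        ocqe->cqe.user_data = user_data;
        ocqe->cqe.res = res;
        ocqe->cqe.flags = cflags;
}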
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);      return true;  } @@ -2020,7 +2136,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,  static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,                    s32 res, u32 cflags)  { -   struct io_uring_cqe *cqe; +   void *cqe;      /*       * If we can't get a cq entry, userspace overflowed the @@ -2029,9 +2145,22 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,       */      cqe = io_get_cqe(ctx);      if (likely(cqe)) { -       WRITE_ONCE(cqe->user_data, user_data); -       WRITE_ONCE(cqe->res, res); -       WRITE_ONCE(cqe->flags, cflags); +#ifdef CONFIG_COMPAT +       if (ctx->compat) { +           struct compat_io_uring_cqe *compat_cqe = (struct compat_io_uring_cqe *)cqe;
+           WRITE_ONCE(compat_cqe->user_data, user_data); +           WRITE_ONCE(compat_cqe->res, res); +           WRITE_ONCE(compat_cqe->flags, cflags); +       } else +#endif +       { +           struct io_uring_cqe *native_cqe = (struct io_uring_cqe *)cqe;
+           WRITE_ONCE(native_cqe->user_data, user_data); +           WRITE_ONCE(native_cqe->res, res); +           WRITE_ONCE(native_cqe->flags, cflags); +       }          return true;      }      return io_cqring_event_overflow(ctx, user_data, res, cflags); @@ -7811,7 +7940,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)   * used, it's important that those reads are done through READ_ONCE() to   * prevent a re-load down the line.   */ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) +static const void *io_get_sqe(struct io_ring_ctx *ctx)  {      unsigned head, mask = ctx->sq_entries - 1;      unsigned sq_idx = ctx->cached_sq_head++ & mask; @@ -7826,7 +7955,8 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)       */      head = READ_ONCE(ctx->sq_array[sq_idx]);      if (likely(head < ctx->sq_entries)) -       return &ctx->sq_sqes[head]; +       return ctx->compat ? (void *)&ctx->sq_sqes_compat[head] +                  : (void *)&ctx->sq_sqes[head];      /* drop invalid entries */      ctx->cq_extra--; @@ -7835,6 +7965,40 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)      return NULL;  } +#ifdef CONFIG_COMPAT +static void convert_compat_io_uring_sqe(struct io_uring_sqe *sqe, +                   const struct compat_io_uring_sqe *compat_sqe) +{ +/*
+ * The struct io_uring_sqe contains anonymous unions and there is no field
+ * keeping track of which union's member is active. Because in all the cases,
+ * the unions are between integral types and the types are compatible, use
+ * the largest member of each union to perform the copy. Use this
+ * compile-time check to ensure that the union's members are not truncated
+ * during the conversion.
+ */
+#define BUILD_BUG_COMPAT_SQE_UNION_ELEM(elem1, elem2) \
+   BUILD_BUG_ON(sizeof_field(struct compat_io_uring_sqe, elem1) != \
+           (offsetof(struct compat_io_uring_sqe, elem2) - \
+            offsetof(struct compat_io_uring_sqe, elem1)))
An empty line here would help. Also, standard practice when declaring a macro inside a function is to #undef it at the end of the function. I see that io_uring_init() doesn't do that, but that's somewhat acceptable as it's the last function in the file.
Ack
+   sqe->opcode = READ_ONCE(compat_sqe->opcode); +   sqe->flags = READ_ONCE(compat_sqe->flags); +   sqe->ioprio = READ_ONCE(compat_sqe->ioprio); +   sqe->fd = READ_ONCE(compat_sqe->fd); +   BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr2, addr); +   sqe->addr2 = READ_ONCE(compat_sqe->addr2); +   BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr, len); +   sqe->addr = READ_ONCE(compat_sqe->addr); +   sqe->len = READ_ONCE(compat_sqe->len); +   BUILD_BUG_COMPAT_SQE_UNION_ELEM(rw_flags, user_data); +   sqe->rw_flags = READ_ONCE(compat_sqe->rw_flags); +   sqe->user_data = READ_ONCE(compat_sqe->user_data); +   BUILD_BUG_COMPAT_SQE_UNION_ELEM(buf_index, personality); +   sqe->buf_index = READ_ONCE(compat_sqe->buf_index); +   sqe->personality = READ_ONCE(compat_sqe->personality); +   BUILD_BUG_COMPAT_SQE_UNION_ELEM(splice_fd_in, __pad2); +   sqe->splice_fd_in = READ_ONCE(compat_sqe->splice_fd_in); +} +#endif
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)      __must_hold(&ctx->uring_lock)  { @@ -7849,7 +8013,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)      io_submit_state_start(&ctx->submit_state, nr);      do { -       const struct io_uring_sqe *sqe; +       const void *sqe;          struct io_kiocb *req;          if (unlikely(!io_alloc_req_refill(ctx))) { @@ -7863,6 +8027,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)              wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);              break;          } +#ifdef CONFIG_COMPAT +       if (ctx->compat) { +           struct io_uring_sqe native_sqe;
+           convert_compat_io_uring_sqe(&native_sqe, sqe); +           sqe = &native_sqe;
That is a textbook example of use-after-free, native_sqe goes out of scope on the next line and sqe becomes a dangling pointer :) To my disappointment Clang doesn't seem to warn even about such a straightforward case, but interestingly GCC 12 does (looks like a new warning, -Wdangling-pointer).
Same issue with the conversions in __io_uring_show_fdinfo().
Oops! Thanks for spotting it!
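
I'll hoist the temporary out of the block so its lifetime matches sqe's, something like this (a sketch loosely based on the loop above, not the exact code):

        do {
                const void *sqe = io_get_sqe(ctx);
                struct io_uring_sqe native_sqe; /* now outlives the if block */

                if (unlikely(!sqe))
                        break;
#ifdef CONFIG_COMPAT
                if (ctx->compat) {
                        convert_compat_io_uring_sqe(&native_sqe, sqe);
                        sqe = &native_sqe; /* storage is still live at use */
                }
#endif
                /* ... sqe is safely dereferenced below ... */
        } while (submitted < nr);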
+       } +#endif          /* will complete beyond this point, count as submitted */          submitted++;          if (io_submit_sqe(ctx, req, sqe)) { @@ -9389,13 +9561,29 @@ static void *io_mem_alloc(size_t size)      return (void *) __get_free_pages(gfp, get_order(size));  } -static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, -               size_t *sq_offset) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, +               unsigned int cq_entries, size_t *cq_offset, size_t *sq_offset)  {      struct io_rings *rings; -   size_t off, sq_array_size; +   size_t off, cq_array_size, sq_array_size;
+Â Â Â off = sizeof(*rings);
+#ifdef CONFIG_SMP +Â Â Â off = ALIGN(off, SMP_CACHE_BYTES); +Â Â Â if (off == 0) +Â Â Â Â Â Â Â return SIZE_MAX; +#endif
+Â Â Â if (cq_offset) +Â Â Â Â Â Â Â *cq_offset = off;
+Â Â Â cq_array_size = array_size(ctx->compat ? sizeof(struct compat_io_uring_cqe) +Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â : sizeof(struct io_uring_cqe), cq_entries); +Â Â Â if (cq_array_size == SIZE_MAX) +Â Â Â Â Â Â Â return SIZE_MAX; -Â Â Â off = struct_size(rings, cqes, cq_entries); +Â Â Â off = size_add(off, cq_array_size); Â Â Â Â Â if (off == SIZE_MAX) Â Â Â Â Â Â Â Â Â return SIZE_MAX; @@ -9412,7 +9600,8 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, Â Â Â Â Â if (sq_array_size == SIZE_MAX) Â Â Â Â Â Â Â Â Â return SIZE_MAX; -Â Â Â if (check_add_overflow(off, sq_array_size, &off)) +Â Â Â off = size_add(off, sq_array_size); +Â Â Â if (off == SIZE_MAX)
Could you elaborate on why we need this change?
It's not needed; I just wanted to align it with the rest of the checks in the function, and I find it more readable. I can change it back to check_add_overflow(), along with the check at "size_add(off, cq_array_size)", if you prefer.
return SIZE_MAX;      return off; @@ -9972,7 +10161,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)      WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));      io_mem_free(ctx->rings); -   io_mem_free(ctx->sq_sqes); +   io_mem_free(ctx->compat ? (void *)ctx->sq_sqes_compat : (void *)ctx->sq_sqes);
I find this sort of diff slightly frustrating as sq_sqes and sq_sqes_compat are both pointers, simply to a different type. If I got this correctly, you are making use of the type in two cases, while it is irrelevant in three (like here). However it is technically UB to refer to the inactive member of the union, so just getting rid of this diff wouldn't be recommended.
Fortunately there is an alternative: instead of having a union of pointers, make it a pointer to a union, e.g.:
union {     struct io_uring_cqe native;     struct compat_io_uring_cqe compat;  } *sq_sqes;
This way sq_sqes can still be viewed as a void * without change, and the cases where the type matters have to be changed anyway.
Might as well do the same with cqes/cqes_compat to be consistent.
I think this change would be misleading. sq_sqes is not a pointer to a single element which is either native or compat, but rather points to an array of {compat_}io_uring_sqe structs, so it's not really the same thing. The compat member of the union would never be used. In the compat case, I would need to cast sq_sqes to an array of struct compat_io_uring_sqe and then get the element by index. Is this what you intended to suggest?
percpu_ref_exit(&ctx->refs);      free_uid(ctx->user); @@ -10518,6 +10707,21 @@ static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,      return -EBUSY;  } +#ifdef CONFIG_COMPAT +static int get_compat_io_uring_rsrc_update(struct io_uring_rsrc_update *up, +                          const void __user *user_up) +{ +   struct compat_io_uring_rsrc_update compat_up;
+   if (unlikely(copy_from_user(&compat_up, user_up, sizeof(compat_up)))) +       return -EFAULT; +   up->offset = compat_up.offset; +   up->resv = compat_up.resv; +   up->data = compat_up.data; +   return 0; +} +#endif
/*   * Register a ring fd to avoid fdget/fdput for each io_uring_enter()   * invocation. User passes in an array of struct io_uring_rsrc_update @@ -10547,10 +10751,18 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,      for (i = 0; i < nr_args; i++) {          int start, end; -       if (copy_from_user(&reg, &arg[i], sizeof(reg))) { -           ret = -EFAULT; -           break; -       } +#ifdef CONFIG_COMPAT +       if (ctx->compat) { +           if (get_compat_io_uring_rsrc_update(&reg, &arg[i])) { +               ret = -EFAULT; +               break; +           } +       } else +#endif +           if (copy_from_user(&reg, &arg[i], sizeof(reg))) { +               ret = -EFAULT; +               break; +           }          if (reg.resv) {              ret = -EINVAL; @@ -10599,10 +10811,19 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,          return 0;      for (i = 0; i < nr_args; i++) { -       if (copy_from_user(&reg, &arg[i], sizeof(reg))) { -           ret = -EFAULT; -           break; -       } +#ifdef CONFIG_COMPAT +       if (ctx->compat) { +           if (get_compat_io_uring_rsrc_update(&reg, &arg[i])) { +               ret = -EFAULT; +               break; +           } +       } else +#endif +           if (copy_from_user(&reg, &arg[i], sizeof(reg))) { +               ret = -EFAULT; +               break; +           }
if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {          ret = -EINVAL;          break; @@ -10632,7 +10853,7 @@ static void *io_uring_validate_mmap_request(struct file *file,          ptr = ctx->rings;          break;      case IORING_OFF_SQES: -       ptr = ctx->sq_sqes; +       ptr = ctx->compat ? (void *)ctx->sq_sqes_compat : (void *)ctx->sq_sqes;          break;      default:          return ERR_PTR(-EINVAL); @@ -10706,7 +10927,24 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)      return 0;  } -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, +#ifdef CONFIG_COMPAT +static int get_compat_io_uring_getevents_arg(struct io_uring_getevents_arg *arg, +                            const void __user *user_arg) +{ +   struct compat_io_uring_getevents_arg compat_arg;
+   if (unlikely(copy_from_user(&compat_arg, user_arg, sizeof(compat_arg)))) +       return -EFAULT; +   arg->sigmask = compat_arg.sigmask; +   arg->sigmask_sz = compat_arg.sigmask_sz; +   arg->pad = compat_arg.pad; +   arg->ts = compat_arg.ts; +   return 0; +} +#endif
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags, const void __user *argp, +             size_t *argsz,  #ifdef CONFIG_CHERI_PURECAP_UABI                struct __kernel_timespec * __capability *ts,                const sigset_t * __capability *sig) @@ -10731,10 +10969,21 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz       * EXT_ARG is set - ensure we agree on the size of it and copy in our       * timespec and sigset_t pointers if good.       */ -   if (*argsz != sizeof(arg)) -       return -EINVAL; -   if (copy_from_user(&arg, argp, sizeof(arg))) -       return -EFAULT; +#ifdef CONFIG_COMPAT +   if (ctx->compat) { +       if (*argsz != sizeof(struct compat_io_uring_getevents_arg)) +           return -EINVAL; +       if (get_compat_io_uring_getevents_arg(&arg, argp)) +           return -EFAULT; +   } else +#endif +   { +       if (*argsz != sizeof(arg)) +           return -EINVAL; +       if (copy_from_user(&arg, argp, sizeof(arg))) +           return -EFAULT; +   }
if (arg.pad)          return -EINVAL;      *sig = u64_to_user_ptr(arg.sigmask); @@ -10827,7 +11076,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,          const sigset_t __user *sig;          struct __kernel_timespec __user *ts; -       ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); +       ret = io_get_ext_arg(ctx, flags, argp, &argsz, &ts, &sig);          if (unlikely(ret))              goto out; @@ -10926,7 +11175,16 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,          if (sq_idx > sq_mask)              continue; -       sqe = &ctx->sq_sqes[sq_idx]; +#ifdef CONFIG_COMPAT +       if (ctx->compat) { +           struct io_uring_sqe native_sqe; +           struct compat_io_uring_sqe *compat_sqe = &ctx->sq_sqes_compat[sq_idx];
+           convert_compat_io_uring_sqe(&native_sqe, compat_sqe); +           sqe = &native_sqe; +       } else +#endif +           sqe = &ctx->sq_sqes[sq_idx];          seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",             sq_idx, sqe->opcode, sqe->fd, sqe->flags,             sqe->user_data); @@ -10935,7 +11193,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,      cq_entries = min(cq_tail - cq_head, ctx->cq_entries);      for (i = 0; i < cq_entries; i++) {          unsigned int entry = i + cq_head; -       struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; +       unsigned int cq_idx = entry & cq_mask; +       struct io_uring_cqe *cqe; +#ifdef CONFIG_COMPAT +       if (ctx->compat) { +           struct io_uring_cqe native_cqe; +           struct compat_io_uring_cqe *compat_cqe = &ctx->cqes_compat[cq_idx];
+           convert_compat_io_uring_cqe(&native_cqe, compat_cqe); +           cqe = &native_cqe; +       } else +#endif +           cqe = &ctx->cqes[cq_idx];          seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",             entry & cq_mask, cqe->user_data, cqe->res, @@ -10998,7 +11267,17 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,      seq_puts(m, "CqOverflowList:\n");      list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { -       struct io_uring_cqe *cqe = &ocqe->cqe; +       struct io_uring_cqe *cqe; +#ifdef CONFIG_COMPAT +       if (ctx->compat) { +           struct io_uring_cqe native_cqe; +           struct compat_io_uring_cqe *compat_cqe = &ocqe->compat_cqe;
+           convert_compat_io_uring_cqe(&native_cqe, compat_cqe); +           cqe = &native_cqe; +       } else +#endif +           cqe = &ocqe->cqe;          seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",             cqe->user_data, cqe->res, cqe->flags); @@ -11036,13 +11315,14 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,                       struct io_uring_params *p)  {      struct io_rings *rings; -   size_t size, sq_array_offset; +   size_t size, cqes_offset, sq_array_offset; +   void *cqes, *sqes;      /* make sure these are sane, as we already accounted them */      ctx->sq_entries = p->sq_entries;      ctx->cq_entries = p->cq_entries; -   size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); +   size = rings_size(ctx, p->sq_entries, p->cq_entries, &cqes_offset, &sq_array_offset);      if (size == SIZE_MAX)          return -EOVERFLOW; @@ -11051,25 +11331,40 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,          return -ENOMEM;      ctx->rings = rings; +   cqes = (char *)rings + cqes_offset; +#ifdef CONFIG_COMPAT +   if (ctx->compat) +       ctx->cqes_compat = (struct compat_io_uring_cqe *)cqes; +   else +#endif +       ctx->cqes = (struct io_uring_cqe *)cqes;
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);      rings->sq_ring_mask = p->sq_entries - 1;      rings->cq_ring_mask = p->cq_entries - 1;      rings->sq_ring_entries = p->sq_entries;      rings->cq_ring_entries = p->cq_entries; -   size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); +   size = array_size(ctx->compat ? sizeof(struct compat_io_uring_sqe) +                     : sizeof(struct io_uring_sqe), p->sq_entries);      if (size == SIZE_MAX) {          io_mem_free(ctx->rings);          ctx->rings = NULL;          return -EOVERFLOW;      } -   ctx->sq_sqes = io_mem_alloc(size); -   if (!ctx->sq_sqes) { +   sqes = io_mem_alloc(size); +   if (!sqes) {          io_mem_free(ctx->rings);          ctx->rings = NULL;          return -ENOMEM;      } +#ifdef CONFIG_COMPAT +   if (ctx->compat) +       ctx->sq_sqes_compat = (struct compat_io_uring_sqe *)sqes; +   else +#endif +       ctx->sq_sqes = (struct io_uring_sqe *)sqes;      return 0;  } @@ -11210,7 +11505,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,      p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);      p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);      p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); -   p->cq_off.cqes = offsetof(struct io_rings, cqes); +   p->cq_off.cqes = (char *)ctx->cqes - (char *)ctx->rings;      p->cq_off.flags = offsetof(struct io_rings, cq_flags);      p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | @@ -11453,39 +11748,103 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,      if (!nr_args)          return -EINVAL;      memset(&up, 0, sizeof(up)); -   if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) -       return -EFAULT;
+#ifdef CONFIG_COMPAT +   if (ctx->compat) { +       if (get_compat_io_uring_rsrc_update((struct io_uring_rsrc_update *)&up, arg))
This is rather dubious from a strict aliasing perspective. Mind you, it's really the ABI that's scary here (struct io_uring_rsrc_update is supposed to represent the bottom part of struct io_uring_rsrc_update2, which is apparently used internally to avoid duplication). When doing a copy_from_user(), there's no aliasing concern as copy_from_user() doesn't care about types, but get_compat_io_uring_rsrc_update() does access up as a struct io_uring_rsrc_update and that's not really OK.
The best is probably to have an (admittedly somewhat strange) helper that takes a struct io_uring_rsrc_update2 * instead (without touching the extra fields). This is not too bad though as there should be two versions of get_compat_io_uring_rsrc_update() anyway, one for this function and one for the two others above, see my comments in patch 4 as to why that is.
Ack, good suggestion, more clear this way.
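
i.e. something like this (a sketch only; the helper name is made up):

#ifdef CONFIG_COMPAT
/* Fills only the bottom fields of a struct io_uring_rsrc_update2, so
 * nothing is ever accessed through a plain struct io_uring_rsrc_update
 * lvalue; the caller memsets up2 first, so the extra fields stay zero. */
static int get_compat_io_uring_rsrc_update_as_update2(struct io_uring_rsrc_update2 *up2,
                                                      const void __user *user_up)
{
        struct compat_io_uring_rsrc_update compat_up;

        if (unlikely(copy_from_user(&compat_up, user_up, sizeof(compat_up))))
                return -EFAULT;
        up2->offset = compat_up.offset;
        up2->resv = compat_up.resv;
        up2->data = compat_up.data;
        /* tags, nr, resv2 deliberately left untouched */
        return 0;
}
#endif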
+           return -EFAULT; +   } else +#endif +       if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) +           return -EFAULT;
if (up.resv || up.resv2)          return -EINVAL;      return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);  } +#ifdef CONFIG_COMPAT +static int get_compat_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2, +                           const void __user *user_up2) +{ +   struct compat_io_uring_rsrc_update2 compat_up2;
+   if (unlikely(copy_from_user(&compat_up2, user_up2, sizeof(compat_up2)))) +       return -EFAULT; +   up2->offset = compat_up2.offset; +   up2->resv = compat_up2.resv; +   up2->data = compat_up2.data; +   up2->tags = compat_up2.tags; +   up2->nr = compat_up2.nr; +   up2->resv2 = compat_up2.resv2; +   return 0; +} +#endif
static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,                     unsigned size, unsigned type)  {      struct io_uring_rsrc_update2 up; -   if (size != sizeof(up)) -       return -EINVAL; -   if (copy_from_user(&up, arg, sizeof(up))) -       return -EFAULT; +#ifdef CONFIG_COMPAT +   if (ctx->compat) { +       if (size != sizeof(struct compat_io_uring_rsrc_update2)) +           return -EINVAL; +       if (get_compat_io_uring_rsrc_update2(&up, arg)) +           return -EFAULT; +   } else +#endif +   { +       if (size != sizeof(up)) +           return -EINVAL; +       if (copy_from_user(&up, arg, sizeof(up))) +           return -EFAULT; +   }
if (!up.nr || up.resv || up.resv2)          return -EINVAL;      return __io_register_rsrc_update(ctx, type, &up, up.nr);  }
+#ifdef CONFIG_COMPAT +static int get_compat_io_uring_rsrc_register(struct io_uring_rsrc_register *rr, +                            const void __user *user_rr) +{ +   struct compat_io_uring_rsrc_register compat_rr;
+   if (unlikely(copy_from_user(&compat_rr, user_rr, sizeof(compat_rr)))) +       return -EFAULT; +   rr->nr = compat_rr.nr; +   rr->resv = compat_rr.resv; +   rr->resv2 = compat_rr.resv2; +   rr->data = compat_rr.data; +   rr->tags = compat_rr.tags; +   return 0; +} +#endif
static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,                     unsigned int size, unsigned int type)  {      struct io_uring_rsrc_register rr; -   /* keep it extendible */ -   if (size != sizeof(rr)) -       return -EINVAL; +#ifdef CONFIG_COMPAT +   if (ctx->compat) { +       if (size != sizeof(struct compat_io_uring_rsrc_register)) +           return -EINVAL; +       if (get_compat_io_uring_rsrc_register(&rr, arg)) +           return -EFAULT; +   } else +#endif +   { +       /* keep it extendible */ +       if (size != sizeof(rr)) +           return -EINVAL; +       if (copy_from_user(&rr, arg, size)) +           return -EFAULT; +   } -   memset(&rr, 0, sizeof(rr)); -   if (copy_from_user(&rr, arg, size)) -       return -EFAULT;      if (!rr.nr || rr.resv || rr.resv2)          return -EINVAL; @@ -11799,6 +12158,13 @@ static int __init io_uring_init(void)      BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \  } while (0) +   BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
The condition is reversed :)
Actually, this __builtin_types_compatible_p returns 0 😅, so it doesn't do what I expected. Copying all the layout checks for the struct compat_io_uring_sqe doesn't feel right, so I am unsure what to do here. Any suggestion?
+   BUILD_BUG_ON(sizeof(struct compat_io_uring_files_update) != +            sizeof(struct compat_io_uring_rsrc_update)); +   BUILD_BUG_ON(sizeof(struct compat_io_uring_rsrc_update) > +            sizeof(struct compat_io_uring_rsrc_update2));
Nit: I'd rather have the compat checks after the native ones.
Ack
Kevin
Many thanks for the in-depth review and suggestions!
Thanks, Tudor
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \      __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)      BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
On 02/11/2022 19:25, Tudor Cretu wrote:
@@ -9389,13 +9561,29 @@ static void *io_mem_alloc(size_t size)      return (void *) __get_free_pages(gfp, get_order(size));  } -static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, -               size_t *sq_offset) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, +               unsigned int cq_entries, size_t *cq_offset, size_t *sq_offset)  {      struct io_rings *rings; -   size_t off, sq_array_size; +   size_t off, cq_array_size, sq_array_size;
+   off = sizeof(*rings);
+#ifdef CONFIG_SMP +   off = ALIGN(off, SMP_CACHE_BYTES); +   if (off == 0) +       return SIZE_MAX; +#endif
+   if (cq_offset) +       *cq_offset = off;
+   cq_array_size = array_size(ctx->compat ? sizeof(struct compat_io_uring_cqe) +                          : sizeof(struct io_uring_cqe), cq_entries); +   if (cq_array_size == SIZE_MAX) +       return SIZE_MAX; -   off = struct_size(rings, cqes, cq_entries); +   off = size_add(off, cq_array_size);      if (off == SIZE_MAX)          return SIZE_MAX; @@ -9412,7 +9600,8 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,      if (sq_array_size == SIZE_MAX)          return SIZE_MAX; -   if (check_add_overflow(off, sq_array_size, &off)) +   off = size_add(off, sq_array_size); +   if (off == SIZE_MAX)
Could you elaborate on why we need this change?
It's not needed; I just wanted to align it with the rest of the checks in the function, and I find it more readable. I can change it back to check_add_overflow(), along with the check at "size_add(off, cq_array_size)", if you prefer.
check_add_overflow() is already used in many places in this file, and size_add() is not, so yes I think it's better to use check_add_overflow(). Not that I disagree that size_add() is clearer, but consistency first :)
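
i.e. keeping both additions in the overflow-checked style already used throughout the file (sketch):

        if (check_add_overflow(off, cq_array_size, &off))
                return SIZE_MAX;
        /* ... sq_array_size computed as before ... */
        if (check_add_overflow(off, sq_array_size, &off))
                return SIZE_MAX;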
return SIZE_MAX;      return off; @@ -9972,7 +10161,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)      WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));      io_mem_free(ctx->rings); -   io_mem_free(ctx->sq_sqes); +   io_mem_free(ctx->compat ? (void *)ctx->sq_sqes_compat : (void *)ctx->sq_sqes);
I find this sort of diff slightly frustrating as sq_sqes and sq_sqes_compat are both pointers, simply to a different type. If I got this correctly, you are making use of the type in two cases, while it is irrelevant in three (like here). However it is technically UB to refer to the inactive member of the union, so just getting rid of this diff wouldn't be recommended.
Fortunately there is an alternative: instead of having a union of pointers, make it a pointer to a union, e.g.:
union {     struct io_uring_cqe native;     struct compat_io_uring_cqe compat;   } *sq_sqes;
This way sq_sqes can still be viewed as a void * without change, and the cases where the type matters have to be changed anyway.
Might as well do the same with cqes/cqes_compat to be consistent.
I think this change would be misleading. sq_sqes is not a pointer to a single element which is either native or compat, but rather points to an array of {compat_}io_uring_sqe structs, so it's not really the same thing. The compat member of the union would never be used. In the compat case, I would need to cast sq_sqes to an array of struct compat_io_uring_sqe and then get the element by index. Is this what you intended to suggest?
You are absolutely right, I somehow missed this "detail"... I don't think it is possible to have a different approach to the union then.
That said I have realised that I was also wrong on the fact that using the inactive member is UB: it is in C++ but not in C, where it is explicitly supported since C99 (this is known as "type punning", see [1]). Therefore we can have the best of both worlds: use ctx->sq_sqes unconditionally if the actual type doesn't matter, like here or when initialising the member, and otherwise use the right member.
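
A user-space toy example of what [1] describes (nothing io_uring-specific, just to illustrate that reading the inactive member reinterprets the bytes):

#include <stdio.h>
#include <stdint.h>

union pun {
        float f;
        uint32_t bits;
};

int main(void)
{
        union pun p;

        p.f = 1.0f;
        /* Legal type punning in C99 and later (UB in C++): prints the
         * IEEE-754 encoding of 1.0f, i.e. 3f800000. */
        printf("%08x\n", p.bits);
        return 0;
}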
percpu_ref_exit(&ctx->refs);      free_uid(ctx->user);
[...]
@@ -11799,6 +12158,13 @@ static int __init io_uring_init(void)      BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \  } while (0) +   BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
The condition is reversed 😄
Actually, this __builtin_types_compatible_p returns 0 😅, so it doesn't do what I expected. Copying all the layout checks for the struct compat_io_uring_sqe doesn't feel right, so I am unsure what to do here. Any suggestion?
Right, that makes sense in fact, as two distinct structs are incompatible regardless of their layout (you don't want to inadvertently allow assigning a struct to another just because the number and types of their members happen to match).
I think you might as well copy all the checks. [2] did that for siginfo_t and I think being explicit makes sense here, it doesn't really hurt to bloat this function a bit.
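
Concretely, that would mean mirroring the native checks in io_uring_init() for the compat struct, e.g. (a sketch; offsets follow the struct layout quoted above):

#define BUILD_BUG_COMPAT_SQE_ELEM(eoffset, etype, ename) \
        __BUILD_BUG_VERIFY_ELEMENT(struct compat_io_uring_sqe, eoffset, etype, ename)

        BUILD_BUG_ON(sizeof(struct compat_io_uring_sqe) != 64);
        BUILD_BUG_COMPAT_SQE_ELEM(0,  __u8,  opcode);
        BUILD_BUG_COMPAT_SQE_ELEM(1,  __u8,  flags);
        BUILD_BUG_COMPAT_SQE_ELEM(2,  __u16, ioprio);
        BUILD_BUG_COMPAT_SQE_ELEM(4,  __s32, fd);
        BUILD_BUG_COMPAT_SQE_ELEM(8,  __u64, off);
        BUILD_BUG_COMPAT_SQE_ELEM(16, __u64, addr);
        BUILD_BUG_COMPAT_SQE_ELEM(24, __u32, len);
        BUILD_BUG_COMPAT_SQE_ELEM(32, __u64, user_data);
        /* ...and so on for the remaining members */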
+   BUILD_BUG_ON(sizeof(struct compat_io_uring_files_update) != +            sizeof(struct compat_io_uring_rsrc_update)); +   BUILD_BUG_ON(sizeof(struct compat_io_uring_rsrc_update) > +            sizeof(struct compat_io_uring_rsrc_update2));
Nit: I'd rather have the compat checks after the native ones.
Ack
Kevin
Many thanks for the in-depth review and suggestions!
You are very welcome :)
Kevin
[1] https://en.cppreference.com/w/c/language/union [2] https://lkml.kernel.org/r/20210429190734.624918-3-elver@google.com
Some members of the io_uring uAPI structs may contain user pointers. On some architectures (for example CHERI) a user pointer is a 129-bit capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others.
In addition, special copy routines need to be used when copying user pointers from/to userspace. Use these for the io_uring structs containing user pointers.
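
For instance, fetching a struct iovec whose iov_base may carry a capability now looks like this (illustrative fragment mirroring the diff below; a plain copy_from_user() would not preserve the capability tag of the embedded pointer, the _with_ptr variant does):

        struct iovec iov;
        struct iovec __user *uiov = (struct iovec __user *)req->rw.addr;

        /* whole-struct copy that keeps the user pointer intact */
        if (copy_from_user_with_ptr(&iov, uiov, sizeof(iov)))
                return -EFAULT;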
Signed-off-by: Tudor Cretu tudor.cretu@arm.com --- fs/io_uring.c | 268 +++++++++++++++++++++------------- include/uapi/linux/io_uring.h | 32 ++-- 2 files changed, 179 insertions(+), 121 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 2db37c68e715..9216617c1edf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -351,7 +351,7 @@ struct io_buffer_list {
struct io_buffer { struct list_head list; - __u64 addr; + __kernel_uintptr_t addr; __u32 len; __u16 bid; __u16 bgid; @@ -652,7 +652,7 @@ struct io_sync {
struct io_cancel { struct file *file; - u64 addr; + __kernel_uintptr_t addr; };
struct io_timeout { @@ -668,7 +668,7 @@ struct io_timeout {
struct io_timeout_rem { struct file *file; - u64 addr; + __kernel_uintptr_t addr;
/* timeout update */ struct timespec64 ts; @@ -679,7 +679,7 @@ struct io_timeout_rem { struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; - u64 addr; + __kernel_uintptr_t addr; u32 len; u32 flags; }; @@ -752,7 +752,7 @@ struct io_splice {
struct io_provide_buf { struct file *file; - __u64 addr; + __kernel_uintptr_t addr; __u32 len; __u32 bgid; __u16 nbufs; @@ -814,7 +814,7 @@ struct io_hardlink {
struct io_msg { struct file *file; - u64 user_data; + __kernel_uintptr_t user_data; u32 len; };
@@ -1001,7 +1001,7 @@ struct io_kiocb { u16 buf_index; unsigned int flags;
- u64 user_data; + __kernel_uintptr_t user_data; u32 result; /* fd initially, then cflags for completion */ union { @@ -1921,7 +1921,7 @@ static inline void *io_get_cqe(struct io_ring_ctx *ctx) static inline void convert_compat_io_uring_cqe(struct io_uring_cqe *cqe, const struct compat_io_uring_cqe *compat_cqe) { - cqe->user_data = READ_ONCE(compat_cqe->user_data); + cqe->user_data = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_cqe->user_data)); cqe->res = READ_ONCE(compat_cqe->res); cqe->flags = READ_ONCE(compat_cqe->flags); } @@ -2096,7 +2096,7 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } }
-static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, +static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data, s32 res, u32 cflags) { struct io_overflow_cqe *ocqe; @@ -2133,7 +2133,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; }
-static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, +static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data, s32 res, u32 cflags) { void *cqe; @@ -2178,7 +2178,7 @@ static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) __io_fill_cqe_req(req, res, cflags); }
-static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, +static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data, s32 res, u32 cflags) { ctx->cq_extra++; @@ -3509,7 +3509,7 @@ static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, kbuf = io_buffer_select(req, len, bgid, issue_flags); if (IS_ERR(kbuf)) return ERR_USER_PTR(PTR_ERR(kbuf)); - return u64_to_user_ptr(kbuf->addr); + return (void __user *)kbuf->addr; }
#ifdef CONFIG_COMPAT @@ -3521,7 +3521,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, void __user *buf; ssize_t len;
- uiov = u64_to_user_ptr(req->rw.addr); + uiov = (struct compat_iovec __user *)req->rw.addr; if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(clen, &uiov->iov_len)) @@ -3542,11 +3542,11 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); + struct iovec __user *uiov = (struct iovec __user *)req->rw.addr; void __user *buf; ssize_t len;
- if (copy_from_user(iov, uiov, sizeof(*uiov))) + if (copy_from_user_with_ptr(iov, uiov, sizeof(*uiov))) return -EFAULT;
len = iov[0].iov_len; @@ -3566,7 +3566,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, if (req->flags & REQ_F_BUFFER_SELECTED) { struct io_buffer *kbuf = req->kbuf;
- iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov[0].iov_base = (void __user *)kbuf->addr; iov[0].iov_len = kbuf->len; return 0; } @@ -3603,7 +3603,7 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) return ERR_PTR(-EINVAL);
- buf = u64_to_user_ptr(req->rw.addr); + buf = (void __user *)req->rw.addr; sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { @@ -3684,7 +3684,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (!iov_iter_is_bvec(iter)) { iovec = iov_iter_iovec(iter); } else { - iovec.iov_base = u64_to_user_ptr(req->rw.addr); + iovec.iov_base = (void __user *)req->rw.addr; iovec.iov_len = req->rw.len; }
@@ -4185,8 +4185,8 @@ static int io_renameat_prep(struct io_kiocb *req, return -EBADF;
ren->old_dfd = READ_ONCE(sqe->fd); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + oldf = (const char __user *)READ_ONCE(sqe->addr); + newf = (const char __user *)READ_ONCE(sqe->addr2); ren->new_dfd = READ_ONCE(sqe->len); ren->flags = READ_ONCE(sqe->rename_flags);
@@ -4242,7 +4242,7 @@ static int io_unlinkat_prep(struct io_kiocb *req, if (un->flags & ~AT_REMOVEDIR) return -EINVAL;
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + fname = (const char __user *)READ_ONCE(sqe->addr); un->filename = getname(fname); if (IS_ERR(un->filename)) return PTR_ERR(un->filename); @@ -4288,7 +4288,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, mkd->dfd = READ_ONCE(sqe->fd); mkd->mode = READ_ONCE(sqe->len);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + fname = (const char __user *)READ_ONCE(sqe->addr); mkd->filename = getname(fname); if (IS_ERR(mkd->filename)) return PTR_ERR(mkd->filename); @@ -4329,8 +4329,8 @@ static int io_symlinkat_prep(struct io_kiocb *req, return -EBADF;
sl->new_dfd = READ_ONCE(sqe->fd); - oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + oldpath = (const char __user *)READ_ONCE(sqe->addr); + newpath = (const char __user *)READ_ONCE(sqe->addr2);
sl->oldpath = getname(oldpath); if (IS_ERR(sl->oldpath)) @@ -4378,8 +4378,8 @@ static int io_linkat_prep(struct io_kiocb *req,
lnk->old_dfd = READ_ONCE(sqe->fd); lnk->new_dfd = READ_ONCE(sqe->len); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + oldf = (const char __user *)READ_ONCE(sqe->addr); + newf = (const char __user *)READ_ONCE(sqe->addr2); lnk->flags = READ_ONCE(sqe->hardlink_flags);
lnk->oldpath = getname(oldf); @@ -4702,7 +4702,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe req->open.how.flags |= O_LARGEFILE;
req->open.dfd = READ_ONCE(sqe->fd); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + fname = (const char __user *)READ_ONCE(sqe->addr); req->open.filename = getname(fname); if (IS_ERR(req->open.filename)) { ret = PTR_ERR(req->open.filename); @@ -4734,7 +4734,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret;
- how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + how = (struct open_how __user *)READ_ONCE(sqe->addr2); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) return -EINVAL; @@ -4911,7 +4911,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, return -EOVERFLOW;
size = (unsigned long)p->len * p->nbufs; - if (!access_ok(u64_to_user_ptr(p->addr), size)) + if (!access_ok(p->addr, size)) return -EFAULT;
p->bgid = READ_ONCE(sqe->buf_group); @@ -5039,8 +5039,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, if (ep_op_has_event(req->epoll.op)) { struct epoll_event __user *ev;
- ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + ev = (struct epoll_event __user *)READ_ONCE(sqe->addr); + if (copy_from_user_with_ptr(&req->epoll.event, ev, sizeof(*ev))) return -EFAULT; }
@@ -5155,8 +5155,8 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	req->statx.dfd = READ_ONCE(sqe->fd);
	req->statx.mask = READ_ONCE(sqe->len);
-	path = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	path = (const char __user *)READ_ONCE(sqe->addr);
+	req->statx.buffer = (struct statx __user *)READ_ONCE(sqe->addr2);
	req->statx.flags = READ_ONCE(sqe->statx_flags);

	req->statx.filename = getname_flags(path,
@@ -5343,7 +5343,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	if (unlikely(sqe->addr2 || sqe->file_index))
		return -EINVAL;

-	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sr->umsg = (struct user_msghdr __user *)READ_ONCE(sqe->addr);
	sr->len = READ_ONCE(sqe->len);
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
@@ -5458,7 +5458,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (iov_len > 1)
			return -EINVAL;
-		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
+		if (copy_from_user_with_ptr(iomsg->fast_iov, uiov, sizeof(*uiov)))
			return -EFAULT;
		sr->len = iomsg->fast_iov[0].iov_len;
		iomsg->free_iov = NULL;
@@ -5556,7 +5556,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	if (unlikely(sqe->addr2 || sqe->file_index))
		return -EINVAL;

-	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sr->umsg = (struct user_msghdr __user *)READ_ONCE(sqe->addr);
	sr->len = READ_ONCE(sqe->len);
	sr->bgid = READ_ONCE(sqe->buf_group);
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
@@ -5605,7 +5605,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
		kbuf = io_recv_buffer_select(req, issue_flags);
		if (IS_ERR(kbuf))
			return PTR_ERR(kbuf);
-		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+		kmsg->fast_iov[0].iov_base = (void __user *)kbuf->addr;
		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
				req->sr_msg.len);
@@ -5666,7 +5666,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
		kbuf = io_recv_buffer_select(req, issue_flags);
		if (IS_ERR(kbuf))
			return PTR_ERR(kbuf);
-		buf = u64_to_user_ptr(kbuf->addr);
+		buf = (void __user *)kbuf->addr;
	}

	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
@@ -5722,8 +5722,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	if (sqe->ioprio || sqe->len || sqe->buf_index)
		return -EINVAL;

-	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	accept->addr = (struct sockaddr __user *)READ_ONCE(sqe->addr);
+	accept->addr_len = (int __user *)READ_ONCE(sqe->addr2);
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);

@@ -5791,8 +5791,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	    sqe->splice_fd_in)
		return -EINVAL;

-	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	conn->addr_len = READ_ONCE(sqe->addr2);
+	conn->addr = (struct sockaddr __user *)READ_ONCE(sqe->addr);
+	conn->addr_len = READ_ONCE(sqe->addr2);
	return 0;
}

@@ -6564,7 +6564,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
}

static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
-					   __u64 user_data)
+					   __kernel_uintptr_t user_data)
	__must_hold(&ctx->timeout_lock)
{
	struct io_timeout_data *io;
@@ -6586,7 +6586,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
	return req;
}

-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data)
	__must_hold(&ctx->completion_lock)
	__must_hold(&ctx->timeout_lock)
{
@@ -6614,7 +6614,7 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
	}
}

-static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+static int io_linked_timeout_update(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data,
				    struct timespec64 *ts, enum hrtimer_mode mode)
	__must_hold(&ctx->timeout_lock)
{
@@ -6639,7 +6639,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
	return 0;
}

-static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+static int io_timeout_update(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data,
			     struct timespec64 *ts, enum hrtimer_mode mode)
	__must_hold(&ctx->timeout_lock)
{
@@ -6680,7 +6680,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
		tr->ltimeout = true;
		if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
			return -EINVAL;
-		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
+		if (get_timespec64(&tr->ts, (const struct __kernel_timespec __user *)sqe->addr2))
			return -EFAULT;
		if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
			return -EINVAL;
@@ -6766,7 +6766,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
	data->req = req;
	data->flags = flags;

-	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
+	if (get_timespec64(&data->ts, (const struct __kernel_timespec __user *)sqe->addr))
		return -EFAULT;

	if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
@@ -6841,7 +6841,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)

struct io_cancel_data {
	struct io_ring_ctx *ctx;
-	u64 user_data;
+	__kernel_uintptr_t user_data;
};

static bool io_cancel_cb(struct io_wq_work *work, void *data)
@@ -6852,7 +6852,7 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data)
	return req->ctx == cd->ctx && req->user_data == cd->user_data;
}

-static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
+static int io_async_cancel_one(struct io_uring_task *tctx, __kernel_uintptr_t user_data,
			       struct io_ring_ctx *ctx)
{
	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
@@ -7984,13 +7984,13 @@ static void convert_compat_io_uring_sqe(struct io_uring_sqe *sqe,
	sqe->ioprio = READ_ONCE(compat_sqe->ioprio);
	sqe->fd = READ_ONCE(compat_sqe->fd);
	BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr2, addr);
-	sqe->addr2 = READ_ONCE(compat_sqe->addr2);
+	sqe->addr2 = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_sqe->addr2));
	BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr, len);
-	sqe->addr = READ_ONCE(compat_sqe->addr);
+	sqe->addr = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_sqe->addr));
	sqe->len = READ_ONCE(compat_sqe->len);
	BUILD_BUG_COMPAT_SQE_UNION_ELEM(rw_flags, user_data);
	sqe->rw_flags = READ_ONCE(compat_sqe->rw_flags);
-	sqe->user_data = READ_ONCE(compat_sqe->user_data);
+	sqe->user_data = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_sqe->user_data));
	BUILD_BUG_COMPAT_SQE_UNION_ELEM(buf_index, personality);
	sqe->buf_index = READ_ONCE(compat_sqe->buf_index);
	sqe->personality = READ_ONCE(compat_sqe->personality);
@@ -9237,8 +9237,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
-	u64 __user *tags = u64_to_user_ptr(up->tags);
-	__s32 __user *fds = u64_to_user_ptr(up->data);
+	u64 __user *tags = (u64 __user *)up->tags;
+	__s32 __user *fds = (__s32 __user *)up->data;
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
@@ -9674,7 +9674,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
	}
#endif
	src = (struct iovec __user *) arg;
-	if (copy_from_user(dst, &src[index], sizeof(*dst)))
+	if (copy_from_user_with_ptr(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}
@@ -9935,9 +9935,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
-	u64 __user *tags = u64_to_user_ptr(up->tags);
+	u64 __user *tags = (u64 __user *)up->tags;
	struct iovec iov;
-	struct iovec __user *iovs = u64_to_user_ptr(up->data);
+	struct iovec __user *iovs = (struct iovec __user *)up->data;
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
@@ -10717,7 +10717,20 @@ static int get_compat_io_uring_rsrc_update(struct io_uring_rsrc_update *up,
		return -EFAULT;
	up->offset = compat_up.offset;
	up->resv = compat_up.resv;
-	up->data = compat_up.data;
+	up->data = (__kernel_uintptr_t)compat_ptr(compat_up.data);
+	return 0;
+}
+
+static int set_compat_io_uring_rsrc_update(void __user *user_up,
+					   const struct io_uring_rsrc_update *up)
+{
+	struct compat_io_uring_rsrc_update compat_up;
+
+	compat_up.offset = up->offset;
+	compat_up.resv = up->resv;
+	compat_up.data = user_ptr_addr((void __user *)up->data);
+	if (unlikely(copy_to_user(user_up, &compat_up, sizeof(compat_up))))
+		return -EFAULT;
	return 0;
}
#endif
@@ -10759,7 +10772,7 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
		}
	} else
#endif
-	if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+	if (copy_from_user_with_ptr(&reg, &arg[i], sizeof(reg))) {
		ret = -EFAULT;
		break;
	}
@@ -10786,7 +10799,14 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
			break;

		reg.offset = ret;
-		if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
+#ifdef CONFIG_COMPAT
+		if (ctx->compat)
+			ret = set_compat_io_uring_rsrc_update(&arg[i], &reg);
+		else
+#endif
+			ret = copy_to_user_with_ptr(&arg[i], &reg, sizeof(reg));
+
+		if (ret) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;
			ret = -EFAULT;
@@ -10819,7 +10839,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
		}
	} else
#endif
-	if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+	if (copy_from_user_with_ptr(&reg, &arg[i], sizeof(reg))) {
		ret = -EFAULT;
		break;
	}
@@ -10879,6 +10899,10 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
		return PTR_ERR(ptr);

	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+
+	vma->vm_flags |= VM_READ_CAPS | VM_WRITE_CAPS;
+	vma_set_page_prot(vma);
+
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

@@ -10935,10 +10959,10 @@ static int get_compat_io_uring_getevents_arg(struct io_uring_getevents_arg *arg,
	if (unlikely(copy_from_user(&compat_arg, user_arg, sizeof(compat_arg))))
		return -EFAULT;
-	arg->sigmask = compat_arg.sigmask;
+	arg->sigmask = (__kernel_uintptr_t)compat_ptr(compat_arg.sigmask);
	arg->sigmask_sz = compat_arg.sigmask_sz;
	arg->pad = compat_arg.pad;
-	arg->ts = compat_arg.ts;
+	arg->ts = (__kernel_uintptr_t)compat_ptr(compat_arg.ts);
	return 0;
}
#endif
@@ -10980,15 +11004,15 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags, const void
	{
		if (*argsz != sizeof(arg))
			return -EINVAL;
-		if (copy_from_user(&arg, argp, sizeof(arg)))
+		if (copy_from_user_with_ptr(&arg, argp, sizeof(arg)))
			return -EFAULT;
	}

	if (arg.pad)
		return -EINVAL;
-	*sig = u64_to_user_ptr(arg.sigmask);
+	*sig = (const sigset_t __user *)arg.sigmask;
	*argsz = arg.sigmask_sz;
-	*ts = u64_to_user_ptr(arg.ts);
+	*ts = (struct __kernel_timespec __user *)arg.ts;
	return 0;
}

@@ -11755,7 +11779,7 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			return -EFAULT;
	} else
#endif
-	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
+	if (copy_from_user_with_ptr(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;

	if (up.resv || up.resv2)
@@ -11773,8 +11797,8 @@ static int get_compat_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2,
		return -EFAULT;
	up2->offset = compat_up2.offset;
	up2->resv = compat_up2.resv;
-	up2->data = compat_up2.data;
-	up2->tags = compat_up2.tags;
+	up2->data = (__kernel_uintptr_t)compat_ptr(compat_up2.data);
+	up2->tags = (__kernel_uintptr_t)compat_ptr(compat_up2.tags);
	up2->nr = compat_up2.nr;
	up2->resv2 = compat_up2.resv2;
	return 0;
@@ -11797,7 +11821,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
	{
		if (size != sizeof(up))
			return -EINVAL;
-		if (copy_from_user(&up, arg, sizeof(up)))
+		if (copy_from_user_with_ptr(&up, arg, sizeof(up)))
			return -EFAULT;
	}

@@ -11818,8 +11842,8 @@ static int get_compat_io_uring_rsrc_register(struct io_uring_rsrc_register *rr,
	rr->nr = compat_rr.nr;
	rr->resv = compat_rr.resv;
	rr->resv2 = compat_rr.resv2;
-	rr->data = compat_rr.data;
-	rr->tags = compat_rr.tags;
+	rr->data = (__kernel_uintptr_t)compat_ptr(compat_rr.data);
+	rr->tags = (__kernel_uintptr_t)compat_ptr(compat_rr.tags);
	return 0;
}
#endif
@@ -11841,7 +11865,7 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
		/* keep it extendible */
		if (size != sizeof(rr))
			return -EINVAL;
-		if (copy_from_user(&rr, arg, size))
+		if (copy_from_user_with_ptr(&rr, arg, size))
			return -EFAULT;
	}

@@ -11850,11 +11874,11 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
	switch (type) {
	case IORING_RSRC_FILE:
-		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
-					     rr.nr, u64_to_user_ptr(rr.tags));
+		return io_sqe_files_register(ctx, (void __user *)rr.data,
+					     rr.nr, (u64 __user *)rr.tags);
	case IORING_RSRC_BUFFER:
-		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
-					       rr.nr, u64_to_user_ptr(rr.tags));
+		return io_sqe_buffers_register(ctx, (void __user *)rr.data,
+					       rr.nr, (u64 __user *)rr.tags);
	}
	return -EINVAL;
}
@@ -12158,46 +12182,80 @@ static int __init io_uring_init(void)
		BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
} while (0)

-	BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
+#define BUILD_BUG_COMPAT_SQE_ELEM(eoffset, etype, ename) \
+	__BUILD_BUG_VERIFY_ELEMENT(struct compat_io_uring_sqe, eoffset, etype, ename)
+	BUILD_BUG_ON(sizeof(struct compat_io_uring_sqe) != 64);
+	BUILD_BUG_COMPAT_SQE_ELEM(0, __u8, opcode);
+	BUILD_BUG_COMPAT_SQE_ELEM(1, __u8, flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(2, __u16, ioprio);
+	BUILD_BUG_COMPAT_SQE_ELEM(4, __s32, fd);
+	BUILD_BUG_COMPAT_SQE_ELEM(8, __u64, off);
+	BUILD_BUG_COMPAT_SQE_ELEM(8, __u64, addr2);
+	BUILD_BUG_COMPAT_SQE_ELEM(16, __u64, addr);
+	BUILD_BUG_COMPAT_SQE_ELEM(16, __u64, splice_off_in);
+	BUILD_BUG_COMPAT_SQE_ELEM(24, __u32, len);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ int, rw_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ __u32, rw_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, fsync_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ __u16, poll_events);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, poll32_events);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, sync_range_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, msg_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, timeout_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, accept_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, cancel_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, open_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, statx_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, fadvise_advice);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, splice_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(32, __u64, user_data);
+	BUILD_BUG_COMPAT_SQE_ELEM(40, __u16, buf_index);
+	BUILD_BUG_COMPAT_SQE_ELEM(40, __u16, buf_group);
+	BUILD_BUG_COMPAT_SQE_ELEM(42, __u16, personality);
+	BUILD_BUG_COMPAT_SQE_ELEM(44, __s32, splice_fd_in);
+	BUILD_BUG_COMPAT_SQE_ELEM(44, __u32, file_index);

	BUILD_BUG_ON(sizeof(struct compat_io_uring_files_update) !=
		     sizeof(struct compat_io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct compat_io_uring_rsrc_update) >
		     sizeof(struct compat_io_uring_rsrc_update2));

+#ifdef __CHERI_PURE_CAPABILITY__
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
-	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
+	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 112);
	BUILD_BUG_SQE_ELEM(0, __u8, opcode);
	BUILD_BUG_SQE_ELEM(1, __u8, flags);
	BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
	BUILD_BUG_SQE_ELEM(4, __s32, fd);
	BUILD_BUG_SQE_ELEM(8, __u64, off);
-	BUILD_BUG_SQE_ELEM(8, __u64, addr2);
-	BUILD_BUG_SQE_ELEM(16, __u64, addr);
-	BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
-	BUILD_BUG_SQE_ELEM(24, __u32, len);
-	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
-	BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
-	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
-	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
-	BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
-	BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
-	BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
-	BUILD_BUG_SQE_ELEM(32, __u64, user_data);
-	BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
-	BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
-	BUILD_BUG_SQE_ELEM(42, __u16, personality);
-	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
-	BUILD_BUG_SQE_ELEM(44, __u32, file_index);
+	BUILD_BUG_SQE_ELEM(16, __uintcap_t, addr2);
+	BUILD_BUG_SQE_ELEM(32, __uintcap_t, addr);
+	BUILD_BUG_SQE_ELEM(32, __u64, splice_off_in);
+	BUILD_BUG_SQE_ELEM(48, __u32, len);
+	BUILD_BUG_SQE_ELEM(52, __kernel_rwf_t, rw_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, fsync_flags);
+	BUILD_BUG_SQE_ELEM(52, __u16, poll_events);
+	BUILD_BUG_SQE_ELEM(52, __u32, poll32_events);
+	BUILD_BUG_SQE_ELEM(52, __u32, sync_range_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, msg_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, timeout_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, accept_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, cancel_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, open_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, statx_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, fadvise_advice);
+	BUILD_BUG_SQE_ELEM(52, __u32, splice_flags);
+	BUILD_BUG_SQE_ELEM(64, __uintcap_t, user_data);
+	BUILD_BUG_SQE_ELEM(80, __u16, buf_index);
+	BUILD_BUG_SQE_ELEM(80, __u16, buf_group);
+	BUILD_BUG_SQE_ELEM(82, __u16, personality);
+	BUILD_BUG_SQE_ELEM(84, __s32, splice_fd_in);
+	BUILD_BUG_SQE_ELEM(88, __u32, file_index);
+#else
+	BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
+#endif

	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1845cf7c80ba..22fb128badc9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -20,12 +20,12 @@ struct io_uring_sqe {
	__u16	ioprio;		/* ioprio for the request */
	__s32	fd;		/* file descriptor to do IO on */
	union {
-		__u64	off;	/* offset into file */
-		__u64	addr2;
+		__u64	off;	/* offset into file */
+		__kernel_uintptr_t	addr2;
	};
	union {
-		__u64	addr;	/* pointer to buffer or iovecs */
-		__u64	splice_off_in;
+		__kernel_uintptr_t	addr;	/* pointer to buffer or iovecs */
+		__u64	splice_off_in;
	};
	__u32	len;		/* buffer size or number of iovecs */
	union {
@@ -46,7 +46,7 @@ struct io_uring_sqe {
		__u32		unlink_flags;
		__u32		hardlink_flags;
	};
-	__u64	user_data;	/* data to be passed back at completion time */
+	__kernel_uintptr_t	user_data;	/* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
	union {
		/* index into fixed buffers, if used */
@@ -191,9 +191,9 @@ enum {
 * IO completion data structure (Completion Queue Entry)
 */
struct io_uring_cqe {
-	__u64	user_data;	/* sqe->data submission passed back */
-	__s32	res;		/* result code for this event */
-	__u32	flags;
+	__kernel_uintptr_t	user_data;	/* sqe->data submission passed back */
+	__s32	res;		/* result code for this event */
+	__u32	flags;
};

/*
@@ -347,28 +347,28 @@ enum {
struct io_uring_files_update {
	__u32 offset;
	__u32 resv;
-	__aligned_u64 /* __s32 * */ fds;
+	__kernel_aligned_uintptr_t /* __s32 * */ fds;
};

struct io_uring_rsrc_register {
	__u32 nr;
	__u32 resv;
	__u64 resv2;
-	__aligned_u64 data;
-	__aligned_u64 tags;
+	__kernel_aligned_uintptr_t data;
+	__kernel_aligned_uintptr_t tags;
};

struct io_uring_rsrc_update {
	__u32 offset;
	__u32 resv;
-	__aligned_u64 data;
+	__kernel_aligned_uintptr_t data;
};

struct io_uring_rsrc_update2 {
	__u32 offset;
	__u32 resv;
-	__aligned_u64 data;
-	__aligned_u64 tags;
+	__kernel_aligned_uintptr_t data;
+	__kernel_aligned_uintptr_t tags;
	__u32 nr;
	__u32 resv2;
};
@@ -424,10 +424,10 @@ enum {
};

struct io_uring_getevents_arg {
-	__u64	sigmask;
+	__kernel_uintptr_t	sigmask;
	__u32	sigmask_sz;
	__u32	pad;
-	__u64	ts;
+	__kernel_uintptr_t	ts;
};
#endif
On 17/10/2022 19:09, Tudor Cretu wrote:
Some members of the io_uring uAPI structs may contain user pointers. On some architectures (for example CHERI) a user pointer is a 129-bit
CHERI is not really an architecture per se, and in general it's better to refer to PCuABI as this is beyond a particular architecture.
capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others.
In addition, special copy routines need to be used when copying user pointers from/to userspace. Use these for the io_uring structs containing user pointers.
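To illustrate that second point (an illustrative sketch only, not part of the patch; the struct and function names here are made up): copying a struct that embeds user pointers with plain copy_from_user() copies raw bytes only, so on CHERI the capability tags of the embedded pointers are lost and the kernel is left holding invalid capabilities. The _with_ptr variants preserve them:

struct example_arg {
	__u32		flags;
	void __user	*buf;	/* embedded user pointer */
};

static int example_get_arg(struct example_arg *dst, const void __user *src)
{
	/* tag-preserving copy: dst->buf remains a valid capability */
	if (copy_from_user_with_ptr(dst, src, sizeof(*dst)))
		return -EFAULT;
	return 0;
}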
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
 fs/io_uring.c                 | 268 +++++++++++++++++++++-------
 include/uapi/linux/io_uring.h |  32 ++--
 2 files changed, 179 insertions(+), 121 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2db37c68e715..9216617c1edf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -351,7 +351,7 @@ struct io_buffer_list {
 struct io_buffer {
 	struct list_head list;
-	__u64 addr;
+	__kernel_uintptr_t addr;
 	__u32 len;
 	__u16 bid;
 	__u16 bgid;
@@ -652,7 +652,7 @@ struct io_sync {
 struct io_cancel {
 	struct file			*file;
-	u64				addr;
+	__kernel_uintptr_t		addr;
 };

 struct io_timeout {
@@ -668,7 +668,7 @@ struct io_timeout {
 struct io_timeout_rem {
 	struct file			*file;
-	u64				addr;
+	__kernel_uintptr_t		addr;

 	/* timeout update */
 	struct timespec64		ts;
@@ -679,7 +679,7 @@ struct io_timeout_rem {
 struct io_rw {
 	/* NOTE: kiocb has the file as the first member, so don't do it here */
 	struct kiocb			kiocb;
-	u64				addr;
+	__kernel_uintptr_t		addr;
 	u32				len;
 	u32				flags;
 };
@@ -752,7 +752,7 @@ struct io_splice {
 struct io_provide_buf {
 	struct file			*file;
-	__u64				addr;
+	__kernel_uintptr_t		addr;
 	__u32				len;
 	__u32				bgid;
 	__u16				nbufs;
@@ -814,7 +814,7 @@ struct io_hardlink {
 struct io_msg {
 	struct file			*file;
-	u64 user_data;
+	__kernel_uintptr_t user_data;
 	u32 len;
In that specific case I'd say might as well align these two members like in all the other structs, you're already touching one line so touching one more is not a big deal.
 };
@@ -1001,7 +1001,7 @@ struct io_kiocb {
 	u16				buf_index;
 	unsigned int			flags;

-	u64				user_data;
+	__kernel_uintptr_t		user_data;
 	u32				result;
 	/* fd initially, then cflags for completion */
 	union {
@@ -1921,7 +1921,7 @@ static inline void *io_get_cqe(struct io_ring_ctx *ctx)
 static inline void convert_compat_io_uring_cqe(struct io_uring_cqe *cqe,
 					       const struct compat_io_uring_cqe *compat_cqe)
 {
-	cqe->user_data = READ_ONCE(compat_cqe->user_data);
+	cqe->user_data = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_cqe->user_data));
This is entirely user-defined data so we shouldn't use compat_ptr(), we just need to store the 64-bit value and return it later as-is.
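That is, roughly (sketch of the suggested change):

	/* user_data is opaque to the kernel: keep the 64-bit value as-is */
	cqe->user_data = READ_ONCE(compat_cqe->user_data);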
 	cqe->res = READ_ONCE(compat_cqe->res);
 	cqe->flags = READ_ONCE(compat_cqe->flags);
 }
@@ -2096,7 +2096,7 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 	}
 }

-static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
+static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data,
 				     s32 res, u32 cflags)
 {
 	struct io_overflow_cqe *ocqe;
@@ -2133,7 +2133,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	return true;
 }

-static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
+static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data,
 				 s32 res, u32 cflags)
 {
 	void *cqe;
@@ -2178,7 +2178,7 @@ static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
 	__io_fill_cqe_req(req, res, cflags);
 }

-static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
+static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data,
 				     s32 res, u32 cflags)
 {
 	ctx->cq_extra++;
@@ -3509,7 +3509,7 @@ static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
 	kbuf = io_buffer_select(req, len, bgid, issue_flags);
 	if (IS_ERR(kbuf))
 		return ERR_USER_PTR(PTR_ERR(kbuf));
-	return u64_to_user_ptr(kbuf->addr);
+	return (void __user *)kbuf->addr;
 }

 #ifdef CONFIG_COMPAT
@@ -3521,7 +3521,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 	void __user *buf;
 	ssize_t len;

-	uiov = u64_to_user_ptr(req->rw.addr);
+	uiov = (struct compat_iovec __user *)req->rw.addr;
 	if (!access_ok(uiov, sizeof(*uiov)))
 		return -EFAULT;
 	if (__get_user(clen, &uiov->iov_len))
@@ -3542,11 +3542,11 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 				      unsigned int issue_flags)
 {
-	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
+	struct iovec __user *uiov = (struct iovec __user *)req->rw.addr;
 	void __user *buf;
 	ssize_t len;

-	if (copy_from_user(iov, uiov, sizeof(*uiov)))
+	if (copy_from_user_with_ptr(iov, uiov, sizeof(*uiov)))
I'm rather confused even by the original code: do we even use iov[0].iov_base? In other words, don't we only use the iov_len provided by the user, meaning that we don't need to preserve pointers (and that the original code copies iov_base for no reason)?
 		return -EFAULT;

 	len = iov[0].iov_len;
@@ -3566,7 +3566,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 	if (req->flags & REQ_F_BUFFER_SELECTED) {
 		struct io_buffer *kbuf = req->kbuf;

-		iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+		iov[0].iov_base = (void __user *)kbuf->addr;
 		iov[0].iov_len = kbuf->len;
 		return 0;
 	}
@@ -3603,7 +3603,7 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
 	if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
 		return ERR_PTR(-EINVAL);
-	buf = u64_to_user_ptr(req->rw.addr);
+	buf = (void __user *)req->rw.addr;
 	sqe_len = req->rw.len;

 	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
@@ -3684,7 +3684,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
 	if (!iov_iter_is_bvec(iter)) {
 		iovec = iov_iter_iovec(iter);
 	} else {
-		iovec.iov_base = u64_to_user_ptr(req->rw.addr);
+		iovec.iov_base = (void __user *)req->rw.addr;
 		iovec.iov_len = req->rw.len;
 	}
@@ -4185,8 +4185,8 @@ static int io_renameat_prep(struct io_kiocb *req,
 		return -EBADF;

 	ren->old_dfd = READ_ONCE(sqe->fd);
-	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	oldf = (const char __user *)READ_ONCE(sqe->addr);
+	newf = (const char __user *)READ_ONCE(sqe->addr2);
 	ren->new_dfd = READ_ONCE(sqe->len);
 	ren->flags = READ_ONCE(sqe->rename_flags);
@@ -4242,7 +4242,7 @@ static int io_unlinkat_prep(struct io_kiocb *req,
 	if (un->flags & ~AT_REMOVEDIR)
 		return -EINVAL;

-	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	fname = (const char __user *)READ_ONCE(sqe->addr);
 	un->filename = getname(fname);
 	if (IS_ERR(un->filename))
 		return PTR_ERR(un->filename);
@@ -4288,7 +4288,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
 	mkd->dfd = READ_ONCE(sqe->fd);
 	mkd->mode = READ_ONCE(sqe->len);

-	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	fname = (const char __user *)READ_ONCE(sqe->addr);
 	mkd->filename = getname(fname);
 	if (IS_ERR(mkd->filename))
 		return PTR_ERR(mkd->filename);
@@ -4329,8 +4329,8 @@ static int io_symlinkat_prep(struct io_kiocb *req,
 		return -EBADF;

 	sl->new_dfd = READ_ONCE(sqe->fd);
-	oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	oldpath = (const char __user *)READ_ONCE(sqe->addr);
+	newpath = (const char __user *)READ_ONCE(sqe->addr2);

 	sl->oldpath = getname(oldpath);
 	if (IS_ERR(sl->oldpath))
@@ -4378,8 +4378,8 @@ static int io_linkat_prep(struct io_kiocb *req,
 	lnk->old_dfd = READ_ONCE(sqe->fd);
 	lnk->new_dfd = READ_ONCE(sqe->len);
-	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	oldf = (const char __user *)READ_ONCE(sqe->addr);
+	newf = (const char __user *)READ_ONCE(sqe->addr2);
 	lnk->flags = READ_ONCE(sqe->hardlink_flags);

 	lnk->oldpath = getname(oldf);
@@ -4702,7 +4702,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		req->open.how.flags |= O_LARGEFILE;

 	req->open.dfd = READ_ONCE(sqe->fd);
-	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	fname = (const char __user *)READ_ONCE(sqe->addr);
 	req->open.filename = getname(fname);
 	if (IS_ERR(req->open.filename)) {
 		ret = PTR_ERR(req->open.filename);
@@ -4734,7 +4734,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	size_t len;
 	int ret;

-	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	how = (struct open_how __user *)READ_ONCE(sqe->addr2);
 	len = READ_ONCE(sqe->len);
 	if (len < OPEN_HOW_SIZE_VER0)
 		return -EINVAL;
@@ -4911,7 +4911,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
 		return -EOVERFLOW;

 	size = (unsigned long)p->len * p->nbufs;
-	if (!access_ok(u64_to_user_ptr(p->addr), size))
+	if (!access_ok(p->addr, size))
 		return -EFAULT;

 	p->bgid = READ_ONCE(sqe->buf_group);
@@ -5039,8 +5039,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
 	if (ep_op_has_event(req->epoll.op)) {
 		struct epoll_event __user *ev;

-		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
-		if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
+		ev = (struct epoll_event __user *)READ_ONCE(sqe->addr);
+		if (copy_from_user_with_ptr(&req->epoll.event, ev, sizeof(*ev)))
 			return -EFAULT;
 	}
@@ -5155,8 +5155,8 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	req->statx.dfd = READ_ONCE(sqe->fd);
 	req->statx.mask = READ_ONCE(sqe->len);
-	path = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	path = (const char __user *)READ_ONCE(sqe->addr);
+	req->statx.buffer = (struct statx __user *)READ_ONCE(sqe->addr2);
 	req->statx.flags = READ_ONCE(sqe->statx_flags);

 	req->statx.filename = getname_flags(path,
@@ -5343,7 +5343,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(sqe->addr2 || sqe->file_index))
 		return -EINVAL;

-	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sr->umsg = (struct user_msghdr __user *)READ_ONCE(sqe->addr);
 	sr->len = READ_ONCE(sqe->len);
 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 	if (sr->msg_flags & MSG_DONTWAIT)
@@ -5458,7 +5458,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		if (iov_len > 1)
 			return -EINVAL;
-		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
+		if (copy_from_user_with_ptr(iomsg->fast_iov, uiov, sizeof(*uiov)))
 			return -EFAULT;
 		sr->len = iomsg->fast_iov[0].iov_len;
 		iomsg->free_iov = NULL;
@@ -5556,7 +5556,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(sqe->addr2 || sqe->file_index))
 		return -EINVAL;

-	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sr->umsg = (struct user_msghdr __user *)READ_ONCE(sqe->addr);
 	sr->len = READ_ONCE(sqe->len);
 	sr->bgid = READ_ONCE(sqe->buf_group);
 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
@@ -5605,7 +5605,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 		kbuf = io_recv_buffer_select(req, issue_flags);
 		if (IS_ERR(kbuf))
 			return PTR_ERR(kbuf);
-		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+		kmsg->fast_iov[0].iov_base = (void __user *)kbuf->addr;
 		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
 		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
 				req->sr_msg.len);
@@ -5666,7 +5666,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 		kbuf = io_recv_buffer_select(req, issue_flags);
 		if (IS_ERR(kbuf))
 			return PTR_ERR(kbuf);
-		buf = u64_to_user_ptr(kbuf->addr);
+		buf = (void __user *)kbuf->addr;
 	}

 	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
@@ -5722,8 +5722,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (sqe->ioprio || sqe->len || sqe->buf_index)
 		return -EINVAL;

-	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	accept->addr = (struct sockaddr __user *)READ_ONCE(sqe->addr);
+	accept->addr_len = (int __user *)READ_ONCE(sqe->addr2);
 	accept->flags = READ_ONCE(sqe->accept_flags);
 	accept->nofile = rlimit(RLIMIT_NOFILE);

@@ -5791,8 +5791,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	    sqe->splice_fd_in)
 		return -EINVAL;

-	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	conn->addr_len = READ_ONCE(sqe->addr2);
+	conn->addr = (struct sockaddr __user *)READ_ONCE(sqe->addr);
+	conn->addr_len = READ_ONCE(sqe->addr2);
 	return 0;
 }
@@ -6564,7 +6564,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 }

 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
-					   __u64 user_data)
+					   __kernel_uintptr_t user_data)
 	__must_hold(&ctx->timeout_lock)
 {
 	struct io_timeout_data *io;
@@ -6586,7 +6586,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 	return req;
 }

-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data)
 	__must_hold(&ctx->completion_lock)
 	__must_hold(&ctx->timeout_lock)
 {
@@ -6614,7 +6614,7 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
 	}
 }

-static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+static int io_linked_timeout_update(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data,
 				    struct timespec64 *ts, enum hrtimer_mode mode)
 	__must_hold(&ctx->timeout_lock)
 {
@@ -6639,7 +6639,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 	return 0;
 }

-static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+static int io_timeout_update(struct io_ring_ctx *ctx, __kernel_uintptr_t user_data,
 			     struct timespec64 *ts, enum hrtimer_mode mode)
 	__must_hold(&ctx->timeout_lock)
 {
@@ -6680,7 +6680,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
 		tr->ltimeout = true;
 		if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
 			return -EINVAL;
-		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
+		if (get_timespec64(&tr->ts, (const struct __kernel_timespec __user *)sqe->addr2))
 			return -EFAULT;
 		if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
 			return -EINVAL;
@@ -6766,7 +6766,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	data->req = req;
 	data->flags = flags;

-	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
+	if (get_timespec64(&data->ts, (const struct __kernel_timespec __user *)sqe->addr))
 		return -EFAULT;

 	if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
@@ -6841,7 +6841,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)

 struct io_cancel_data {
 	struct io_ring_ctx *ctx;
-	u64 user_data;
+	__kernel_uintptr_t user_data;
 };

 static bool io_cancel_cb(struct io_wq_work *work, void *data)
@@ -6852,7 +6852,7 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data)
 	return req->ctx == cd->ctx && req->user_data == cd->user_data;
 }

-static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
+static int io_async_cancel_one(struct io_uring_task *tctx, __kernel_uintptr_t user_data,
 			       struct io_ring_ctx *ctx)
 {
 	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
@@ -7984,13 +7984,13 @@ static void convert_compat_io_uring_sqe(struct io_uring_sqe *sqe,
 	sqe->ioprio = READ_ONCE(compat_sqe->ioprio);
 	sqe->fd = READ_ONCE(compat_sqe->fd);
 	BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr2, addr);
-	sqe->addr2 = READ_ONCE(compat_sqe->addr2);
+	sqe->addr2 = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_sqe->addr2));
 	BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr, len);
-	sqe->addr = READ_ONCE(compat_sqe->addr);
+	sqe->addr = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_sqe->addr));
 	sqe->len = READ_ONCE(compat_sqe->len);
 	BUILD_BUG_COMPAT_SQE_UNION_ELEM(rw_flags, user_data);
 	sqe->rw_flags = READ_ONCE(compat_sqe->rw_flags);
-	sqe->user_data = READ_ONCE(compat_sqe->user_data);
+	sqe->user_data = (__kernel_uintptr_t)compat_ptr(READ_ONCE(compat_sqe->user_data));
 	BUILD_BUG_COMPAT_SQE_UNION_ELEM(buf_index, personality);
 	sqe->buf_index = READ_ONCE(compat_sqe->buf_index);
 	sqe->personality = READ_ONCE(compat_sqe->personality);
@@ -9237,8 +9237,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_rsrc_update2 *up,
 				 unsigned nr_args)
 {
-	u64 __user *tags = u64_to_user_ptr(up->tags);
-	__s32 __user *fds = u64_to_user_ptr(up->data);
+	u64 __user *tags = (u64 __user *)up->tags;
+	__s32 __user *fds = (__s32 __user *)up->data;
 	struct io_rsrc_data *data = ctx->file_data;
 	struct io_fixed_file *file_slot;
 	struct file *file;
@@ -9674,7 +9674,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
I've noticed there is an invalid use of u64_to_user_ptr() in this function, could you take care of replacing it with compat_ptr() while you're at it? You could potentially do this in patch 3, not sure it really requires a separate patch.
Well done, with this change that'll be all the u64_to_user_ptr() gone from this file, a good chunk of all uses in the kernel in fact, down 40!
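For reference, that compat branch could then look roughly like this (untested sketch, following the shape the function has upstream):

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs = (struct compat_iovec __user *)arg;
		struct compat_iovec ciov;

		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		/* compat_ptr(), not u64_to_user_ptr(), for a compat pointer */
		dst->iov_base = compat_ptr(ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif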
 	}
 #endif
 	src = (struct iovec __user *) arg;
-	if (copy_from_user(dst, &src[index], sizeof(*dst)))
+	if (copy_from_user_with_ptr(dst, &src[index], sizeof(*dst)))
 		return -EFAULT;
 	return 0;
 }
@@ -9935,9 +9935,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 				   struct io_uring_rsrc_update2 *up,
 				   unsigned int nr_args)
 {
-	u64 __user *tags = u64_to_user_ptr(up->tags);
+	u64 __user *tags = (u64 __user *)up->tags;
 	struct iovec iov;
-	struct iovec __user *iovs = u64_to_user_ptr(up->data);
+	struct iovec __user *iovs = (struct iovec __user *)up->data;
 	struct page *last_hpage = NULL;
 	bool needs_switch = false;
 	__u32 done;
@@ -10717,7 +10717,20 @@ static int get_compat_io_uring_rsrc_update(struct io_uring_rsrc_update *up,
 		return -EFAULT;
 	up->offset = compat_up.offset;
 	up->resv = compat_up.resv;
-	up->data = compat_up.data;
+	up->data = (__kernel_uintptr_t)compat_ptr(compat_up.data);
+	return 0;
+}
+
+static int set_compat_io_uring_rsrc_update(void __user *user_up,
+					   const struct io_uring_rsrc_update *up)
+{
+	struct compat_io_uring_rsrc_update compat_up;
+
+	compat_up.offset = up->offset;
+	compat_up.resv = up->resv;
+	compat_up.data = user_ptr_addr((void __user *)up->data);
+	if (unlikely(copy_to_user(user_up, &compat_up, sizeof(compat_up))))
+		return -EFAULT;
 	return 0;
 }
 #endif
@@ -10759,7 +10772,7 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
Upon closer inspection, this function really does not use struct io_uring_rsrc_update in the way one might expect. I'm not even sure why it uses this struct, as it expects an fd to be stored in ->data (not a pointer!), an index in ->offset, and nothing in ->resv... In any case, this means we don't need any pointer-preserving operation on the native side here, and on the compat side we shouldn't create a pointer either. When setting ->data in the compat struct, a direct assignment from up->data would also be just fine (only the bottom 32 bits are expected to be set, so letting the compiler change the integer representation itself is appropriate).
The same applies to io_ringfd_unregister(), where in fact ->data is always supposed to be zero!
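In code, the suggestion boils down to something like (sketch):

	/* get_compat_io_uring_rsrc_update(): ->data carries an fd, not a
	 * pointer, so no compat_ptr() and no capability needed */
	up->data = compat_up.data;

	/* set_compat_io_uring_rsrc_update(): likewise, a plain integer
	 * assignment instead of user_ptr_addr() */
	compat_up.data = up->data;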
 		}
 	} else
 #endif
-	if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+	if (copy_from_user_with_ptr(&reg, &arg[i], sizeof(reg))) {
 		ret = -EFAULT;
 		break;
 	}
@@ -10786,7 +10799,14 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
 			break;

 		reg.offset = ret;
-		if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
+#ifdef CONFIG_COMPAT
+		if (ctx->compat)
+			ret = set_compat_io_uring_rsrc_update(&arg[i], &reg);
+		else
+#endif
+			ret = copy_to_user_with_ptr(&arg[i], &reg, sizeof(reg));
+
Not sure why this diff is here and not in patch 3?
+		if (ret) {
 			fput(tctx->registered_rings[reg.offset]);
 			tctx->registered_rings[reg.offset] = NULL;
 			ret = -EFAULT;
@@ -10819,7 +10839,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
 		}
 	} else
 #endif
-	if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+	if (copy_from_user_with_ptr(&reg, &arg[i], sizeof(reg))) {
 		ret = -EFAULT;
 		break;
 	}
@@ -10879,6 +10899,10 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 		return PTR_ERR(ptr);

 	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+
+	vma->vm_flags |= VM_READ_CAPS | VM_WRITE_CAPS;
+	vma_set_page_prot(vma);
These VM flags are arch-specific, so an arch hook needs to be introduced here (in a separate patch) to set them.
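Something along these lines, say (sketch; the hook name is hypothetical and the generic stub would live in a shared header):

/* generic stub, overridden by architectures that support capabilities */
#ifndef arch_io_uring_set_vm_flags
static inline void arch_io_uring_set_vm_flags(struct vm_area_struct *vma) { }
#endif

/* ...while the arm64 definition would do: */
static inline void arch_io_uring_set_vm_flags(struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_READ_CAPS | VM_WRITE_CAPS;
	vma_set_page_prot(vma);
}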
+
 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 }

@@ -10935,10 +10959,10 @@ static int get_compat_io_uring_getevents_arg(struct io_uring_getevents_arg *arg,
 	if (unlikely(copy_from_user(&compat_arg, user_arg, sizeof(compat_arg))))
 		return -EFAULT;
-	arg->sigmask = compat_arg.sigmask;
+	arg->sigmask = (__kernel_uintptr_t)compat_ptr(compat_arg.sigmask);
 	arg->sigmask_sz = compat_arg.sigmask_sz;
 	arg->pad = compat_arg.pad;
-	arg->ts = compat_arg.ts;
+	arg->ts = (__kernel_uintptr_t)compat_ptr(compat_arg.ts);
 	return 0;
 }
 #endif
@@ -10980,15 +11004,15 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags, const void
 	{
 		if (*argsz != sizeof(arg))
 			return -EINVAL;
-		if (copy_from_user(&arg, argp, sizeof(arg)))
+		if (copy_from_user_with_ptr(&arg, argp, sizeof(arg)))
 			return -EFAULT;
 	}

 	if (arg.pad)
 		return -EINVAL;
-	*sig = u64_to_user_ptr(arg.sigmask);
+	*sig = (const sigset_t __user *)arg.sigmask;
 	*argsz = arg.sigmask_sz;
-	*ts = u64_to_user_ptr(arg.ts);
+	*ts = (struct __kernel_timespec __user *)arg.ts;
 	return 0;
 }
@@ -11755,7 +11779,7 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
 			return -EFAULT;
 	} else
 #endif
-	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
+	if (copy_from_user_with_ptr(&up, arg, sizeof(struct io_uring_rsrc_update)))
 		return -EFAULT;
 	if (up.resv || up.resv2)
@@ -11773,8 +11797,8 @@ static int get_compat_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2,
 		return -EFAULT;
 	up2->offset = compat_up2.offset;
 	up2->resv = compat_up2.resv;
-	up2->data = compat_up2.data;
-	up2->tags = compat_up2.tags;
+	up2->data = (__kernel_uintptr_t)compat_ptr(compat_up2.data);
+	up2->tags = (__kernel_uintptr_t)compat_ptr(compat_up2.tags);
 	up2->nr = compat_up2.nr;
 	up2->resv2 = compat_up2.resv2;
 	return 0;
@@ -11797,7 +11821,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
 	{
 		if (size != sizeof(up))
 			return -EINVAL;
-		if (copy_from_user(&up, arg, sizeof(up)))
+		if (copy_from_user_with_ptr(&up, arg, sizeof(up)))
 			return -EFAULT;
 	}
@@ -11818,8 +11842,8 @@ static int get_compat_io_uring_rsrc_register(struct io_uring_rsrc_register *rr,
 	rr->nr = compat_rr.nr;
 	rr->resv = compat_rr.resv;
 	rr->resv2 = compat_rr.resv2;
-	rr->data = compat_rr.data;
-	rr->tags = compat_rr.tags;
+	rr->data = (__kernel_uintptr_t)compat_ptr(compat_rr.data);
+	rr->tags = (__kernel_uintptr_t)compat_ptr(compat_rr.tags);
 	return 0;
 }
 #endif
@@ -11841,7 +11865,7 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 		/* keep it extendible */
 		if (size != sizeof(rr))
 			return -EINVAL;
-		if (copy_from_user(&rr, arg, size))
+		if (copy_from_user_with_ptr(&rr, arg, size))
 			return -EFAULT;
 	}
@@ -11850,11 +11874,11 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 	switch (type) {
 	case IORING_RSRC_FILE:
-		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
-					     rr.nr, u64_to_user_ptr(rr.tags));
+		return io_sqe_files_register(ctx, (void __user *)rr.data,
+					     rr.nr, (u64 __user *)rr.tags);
 	case IORING_RSRC_BUFFER:
-		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
-					       rr.nr, u64_to_user_ptr(rr.tags));
+		return io_sqe_buffers_register(ctx, (void __user *)rr.data,
+					       rr.nr, (u64 __user *)rr.tags);
 	}
 	return -EINVAL;
 }
@@ -12158,46 +12182,80 @@ static int __init io_uring_init(void)
 		BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
 } while (0)

-	BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
+#define BUILD_BUG_COMPAT_SQE_ELEM(eoffset, etype, ename) \
+	__BUILD_BUG_VERIFY_ELEMENT(struct compat_io_uring_sqe, eoffset, etype, ename)
+	BUILD_BUG_ON(sizeof(struct compat_io_uring_sqe) != 64);
+	BUILD_BUG_COMPAT_SQE_ELEM(0, __u8, opcode);
+	BUILD_BUG_COMPAT_SQE_ELEM(1, __u8, flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(2, __u16, ioprio);
+	BUILD_BUG_COMPAT_SQE_ELEM(4, __s32, fd);
+	BUILD_BUG_COMPAT_SQE_ELEM(8, __u64, off);
+	BUILD_BUG_COMPAT_SQE_ELEM(8, __u64, addr2);
+	BUILD_BUG_COMPAT_SQE_ELEM(16, __u64, addr);
+	BUILD_BUG_COMPAT_SQE_ELEM(16, __u64, splice_off_in);
+	BUILD_BUG_COMPAT_SQE_ELEM(24, __u32, len);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
Don't think you need to preserve these extra spaces :)
+	BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ int, rw_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ __u32, rw_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, fsync_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ __u16, poll_events);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, poll32_events);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, sync_range_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, msg_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, timeout_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, accept_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, cancel_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, open_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, statx_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, fadvise_advice);
+	BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, splice_flags);
+	BUILD_BUG_COMPAT_SQE_ELEM(32, __u64, user_data);
+	BUILD_BUG_COMPAT_SQE_ELEM(40, __u16, buf_index);
+	BUILD_BUG_COMPAT_SQE_ELEM(40, __u16, buf_group);
+	BUILD_BUG_COMPAT_SQE_ELEM(42, __u16, personality);
+	BUILD_BUG_COMPAT_SQE_ELEM(44, __s32, splice_fd_in);
+	BUILD_BUG_COMPAT_SQE_ELEM(44, __u32, file_index);

 	BUILD_BUG_ON(sizeof(struct compat_io_uring_files_update) !=
		     sizeof(struct compat_io_uring_rsrc_update));
 	BUILD_BUG_ON(sizeof(struct compat_io_uring_rsrc_update) >
		     sizeof(struct compat_io_uring_rsrc_update2));

+#ifdef __CHERI_PURE_CAPABILITY__
You might have been spending too much time in userspace lately, you must mean CONFIG_CHERI_PURECAP_UABI :) Which explains why you didn't notice the inverted condition in the #else part, as indeed the structs are different in PCuABI!
 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
 	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
-	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
+	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 112);
Indeed not an optimal size, but it's hard to make the right decision at this point. It does make me think though that the __pad2 at the end of the struct is dubious as it stands: adding 16 bytes only makes things worse in purecap. OTOH making it 32 bytes would make the whole struct 128 bytes, potentially better. That said, without padding we get a size of 96 bytes, that is 1.5 cache lines, which is really not too bad, and I doubt that 32 bytes of padding would be worth it.
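For the record, the arithmetic behind those numbers (offsets taken from the BUILD_BUG_SQE_ELEM list below, assuming 16-byte capability alignment):

	/* file_index ends at 88 + 4 = 92, rounded up to 96 for alignment */
	/* 96 +  0 bytes of __pad2 ->  96 bytes (1.5 x 64B cache lines)   */
	/* 96 + 16 bytes of __pad2 -> 112 bytes (current layout)          */
	/* 96 + 32 bytes of __pad2 -> 128 bytes (2 cache lines)           */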
 	BUILD_BUG_SQE_ELEM(0, __u8, opcode);
 	BUILD_BUG_SQE_ELEM(1, __u8, flags);
 	BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
 	BUILD_BUG_SQE_ELEM(4, __s32, fd);
 	BUILD_BUG_SQE_ELEM(8, __u64, off);
-	BUILD_BUG_SQE_ELEM(8, __u64, addr2);
-	BUILD_BUG_SQE_ELEM(16, __u64, addr);
-	BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
-	BUILD_BUG_SQE_ELEM(24, __u32, len);
-	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
-	BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
-	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
-	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
-	BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
-	BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
-	BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
-	BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
-	BUILD_BUG_SQE_ELEM(32, __u64, user_data);
-	BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
-	BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
-	BUILD_BUG_SQE_ELEM(42, __u16, personality);
-	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
-	BUILD_BUG_SQE_ELEM(44, __u32, file_index);
+	BUILD_BUG_SQE_ELEM(16, __uintcap_t, addr2);
+	BUILD_BUG_SQE_ELEM(32, __uintcap_t, addr);
+	BUILD_BUG_SQE_ELEM(32, __u64, splice_off_in);
+	BUILD_BUG_SQE_ELEM(48, __u32, len);
+	BUILD_BUG_SQE_ELEM(52, __kernel_rwf_t, rw_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, fsync_flags);
+	BUILD_BUG_SQE_ELEM(52, __u16, poll_events);
+	BUILD_BUG_SQE_ELEM(52, __u32, poll32_events);
+	BUILD_BUG_SQE_ELEM(52, __u32, sync_range_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, msg_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, timeout_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, accept_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, cancel_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, open_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, statx_flags);
+	BUILD_BUG_SQE_ELEM(52, __u32, fadvise_advice);
+	BUILD_BUG_SQE_ELEM(52, __u32, splice_flags);
+	BUILD_BUG_SQE_ELEM(64, __uintcap_t, user_data);
+	BUILD_BUG_SQE_ELEM(80, __u16, buf_index);
+	BUILD_BUG_SQE_ELEM(80, __u16, buf_group);
+	BUILD_BUG_SQE_ELEM(82, __u16, personality);
+	BUILD_BUG_SQE_ELEM(84, __s32, splice_fd_in);
+	BUILD_BUG_SQE_ELEM(88, __u32, file_index);
+#else
+	BUILD_BUG_ON(__builtin_types_compatible_p(struct io_uring_sqe, struct compat_io_uring_sqe));
+#endif

 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1845cf7c80ba..22fb128badc9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -20,12 +20,12 @@ struct io_uring_sqe {
 	__u16	ioprio;		/* ioprio for the request */
 	__s32	fd;		/* file descriptor to do IO on */
 	union {
-		__u64	off;	/* offset into file */
-		__u64	addr2;
+		__u64	off;	/* offset into file */
+		__kernel_uintptr_t	addr2;
 	};
 	union {
-		__u64	addr;	/* pointer to buffer or iovecs */
-		__u64	splice_off_in;
+		__kernel_uintptr_t	addr;	/* pointer to buffer or iovecs */
+		__u64	splice_off_in;
 	};
 	__u32	len;		/* buffer size or number of iovecs */
 	union {
@@ -46,7 +46,7 @@ struct io_uring_sqe {
 		__u32		unlink_flags;
 		__u32		hardlink_flags;
 	};
-	__u64	user_data;	/* data to be passed back at completion time */
+	__kernel_uintptr_t	user_data;	/* data to be passed back at completion time */
Just reflecting some offline discussions for the benefit of everyone, it is of course impossible to tell from kernel code but user_data does need to be expanded to a __kernel_uintptr_t as it is standard practice to store a pointer in user_data. Probably a good idea to add something on that in the commit message.
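For instance, the canonical userspace pattern is a pointer round-trip through user_data; in purecap that pointer has to come back as a valid capability (illustrative liburing sketch, error handling omitted):

#include <liburing.h>
#include <stdio.h>
#include <stdlib.h>

struct request_state {
	int id;			/* per-request context */
};

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct request_state *state = malloc(sizeof(*state));

	state->id = 42;
	io_uring_queue_init(8, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_sqe_set_data(sqe, state);	/* pointer stored in user_data */
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	/* the pointer comes back via cqe->user_data and is dereferenced */
	struct request_state *done = io_uring_cqe_get_data(cqe);
	printf("completed request %d\n", done->id);
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	free(state);
	return 0;
}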
 	/* pack this to avoid bogus arm OABI complaints */
 	union {
 		/* index into fixed buffers, if used */
@@ -191,9 +191,9 @@ enum {
  * IO completion data structure (Completion Queue Entry)
  */
 struct io_uring_cqe {
-	__u64	user_data;	/* sqe->data submission passed back */
-	__s32	res;		/* result code for this event */
-	__u32	flags;
+	__kernel_uintptr_t	user_data;	/* sqe->data submission passed back */
+	__s32	res;		/* result code for this event */
+	__u32	flags;
 };

 /*
@@ -347,28 +347,28 @@ enum {
 struct io_uring_files_update {
 	__u32 offset;
 	__u32 resv;
-	__aligned_u64 /* __s32 * */ fds;
+	__kernel_aligned_uintptr_t /* __s32 * */ fds;
 };

 struct io_uring_rsrc_register {
 	__u32 nr;
 	__u32 resv;
 	__u64 resv2;
-	__aligned_u64 data;
-	__aligned_u64 tags;
+	__kernel_aligned_uintptr_t data;
+	__kernel_aligned_uintptr_t tags;
 };

 struct io_uring_rsrc_update {
 	__u32 offset;
 	__u32 resv;
-	__aligned_u64 data;
+	__kernel_aligned_uintptr_t data;
 };

 struct io_uring_rsrc_update2 {
 	__u32 offset;
 	__u32 resv;
-	__aligned_u64 data;
-	__aligned_u64 tags;
+	__kernel_aligned_uintptr_t data;
+	__kernel_aligned_uintptr_t tags;
 	__u32 nr;
 	__u32 resv2;
 };
@@ -424,10 +424,10 @@ enum {
 };

 struct io_uring_getevents_arg {
-	__u64	sigmask;
+	__kernel_uintptr_t	sigmask;
 	__u32	sigmask_sz;
 	__u32	pad;
-	__u64	ts;
+	__kernel_uintptr_t	ts;
 };
#endif
A final note reflecting on offline discussions, I don't think it is worth trying to split this patch into native and compat parts. Changing the fields to hold user pointers immediately requires the use of compat_ptr() when converting from compat, so both sides are tightly coupled.
Kevin