On 28-03-2023 09:48, Kevin Brodsky wrote:
On 16/03/2023 14:40, Tudor Cretu wrote:
Introduce compat versions of the structs exposed in the uAPI headers that might contain pointers as a member. Also, implement functions that convert the compat versions to the native versions of the struct.
A subsequent patch is going to change the io_uring structs to enable them to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
include/linux/io_uring_types.h | 135 ++++++++++++++++++- io_uring/cancel.c | 26 +++- io_uring/epoll.c | 2 +- io_uring/fdinfo.c | 80 ++++++----- io_uring/io_uring.c | 234 +++++++++++++++++++++++---------- io_uring/io_uring.h | 111 +++++++++++++--- io_uring/kbuf.c | 96 ++++++++++++-- io_uring/kbuf.h | 6 +- io_uring/net.c | 5 +- io_uring/rsrc.c | 108 +++++++++++++-- io_uring/tctx.c | 56 +++++++- io_uring/uring_cmd.h | 5 + 12 files changed, 704 insertions(+), 160 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 440179029a8f0..f0eb34ad8b709 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -7,6 +7,127 @@ #include <linux/llist.h> #include <uapi/linux/io_uring.h> +struct compat_io_uring_sqe {
- __u8 opcode;
- __u8 flags;
- __u16 ioprio;
- __s32 fd;
- union {
__u64 off;
__u64 addr2;
struct {
__u32 cmd_op;
__u32 __pad1;
};
- };
- union {
__u64 addr;
__u64 splice_off_in;
- };
- __u32 len;
- /* This member is actually a union in the native struct */
- __kernel_rwf_t rw_flags;
- __u64 user_data;
- union {
__u16 buf_index;
__u16 buf_group;
- } __packed;
- __u16 personality;
- union {
__s32 splice_fd_in;
__u32 file_index;
struct {
__u16 addr_len;
__u16 __pad3[1];
};
- };
- union {
struct {
__u64 addr3;
__u64 __pad2[1];
};
__u8 cmd[0];
- };
+};
+struct compat_io_uring_cqe {
- __u64 user_data;
- __s32 res;
- __u32 flags;
- __u64 big_cqe[];
+};
+struct compat_io_uring_files_update {
- __u32 offset;
- __u32 resv;
- __aligned_u64 fds;
+};
+struct compat_io_uring_rsrc_register {
- __u32 nr;
- __u32 flags;
- __u64 resv2;
- __aligned_u64 data;
- __aligned_u64 tags;
+};
+struct compat_io_uring_rsrc_update {
- __u32 offset;
- __u32 resv;
- __aligned_u64 data;
+};
+struct compat_io_uring_rsrc_update2 {
- __u32 offset;
- __u32 resv;
- __aligned_u64 data;
- __aligned_u64 tags;
- __u32 nr;
- __u32 resv2;
+};
+struct compat_io_uring_buf {
- __u64 addr;
- __u32 len;
- __u16 bid;
- __u16 resv;
+};
+struct compat_io_uring_buf_ring {
- union {
struct {
__u64 resv1;
__u32 resv2;
__u16 resv3;
__u16 tail;
};
struct compat_io_uring_buf bufs[0];
- };
+};
+struct compat_io_uring_buf_reg {
- __u64 ring_addr;
- __u32 ring_entries;
- __u16 bgid;
- __u16 pad;
- __u64 resv[3];
+};
+struct compat_io_uring_getevents_arg {
- __u64 sigmask;
- __u32 sigmask_sz;
- __u32 pad;
- __u64 ts;
+};
+struct compat_io_uring_sync_cancel_reg {
- __u64 addr;
- __s32 fd;
- __u32 flags;
- struct __kernel_timespec timeout;
- __u64 pad[4];
+};
Just thinking about this now, might make more sense to have these structs at the end of the file, rather than before the (common) native ones? Or maybe it would be even better to move them to a new file, say <linux/io_uring_compat.h>, that can be included from this file. That would make conflicts less likely and avoid adding a lot of structs to io_uring_types.h
Indeed much better, done!
[...]
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index bc8c9d764bc13..c724e6c544809 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -88,45 +88,64 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, sq_entries = min(sq_tail - sq_head, ctx->sq_entries); for (i = 0; i < sq_entries; i++) { unsigned int entry = i + sq_head;
struct io_uring_sqe *sqe;
unsigned int sq_idx;
unsigned int sq_idx, sq_off;
sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); if (sq_idx > sq_mask) continue;
sqe = &ctx->sq_sqes[sq_idx << sq_shift];
seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
"addr:0x%llx, rw_flags:0x%x, buf_index:%d "
"user_data:%llu",
sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
sqe->flags, (unsigned long long) sqe->off,
(unsigned long long) sqe->addr, sqe->rw_flags,
sqe->buf_index, sqe->user_data);
if (sq_shift) {
u64 *sqeb = (void *) (sqe + 1);
int size = sizeof(struct io_uring_sqe) / sizeof(u64);
int j;
for (j = 0; j < size; j++) {
seq_printf(m, ", e%d:0x%llx", j,
(unsigned long long) *sqeb);
sqeb++;
}
}
sq_off = sq_idx << sq_shift;
+#define print_sqe(sqe) \
It's unusual to define macros in the middle of a function, I can see why it could make sense here but I think it's still more readable to define them normally, before the function. The number of arguments remains reasonable if you do it this way (AFAICT just need to add m, sq_idx, sq_shift).
Also minor thing, some of the \ are not aligned.
Sure!
do { \
seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " \
"addr:0x%llx, rw_flags:0x%x, buf_index:%d " \
"user_data:%llu", \
sq_idx, io_uring_get_opcode((sqe)->opcode), (sqe)->fd, \
(sqe)->flags, (unsigned long long) (sqe)->off, \
(unsigned long long) (sqe)->addr, (sqe)->rw_flags, \
(sqe)->buf_index, (sqe)->user_data); \
if (sq_shift) { \
u64 *sqeb = (void *) ((sqe) + 1); \
int size = sizeof(*(sqe)) / sizeof(u64); \
int j; \
\
for (j = 0; j < size; j++) { \
seq_printf(m, ", e%d:0x%llx", j, \
(unsigned long long) *sqeb); \
sqeb++; \
} \
} \
} while (0)
if (is_compat64_io_ring_ctx(ctx))
print_sqe(&ctx->sq_sqes_compat[sq_off]);
else
print_sqe(&ctx->sq_sqes[sq_off]);
+#undef print_sqe
- seq_printf(m, "\n"); } seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head;
struct io_uring_cqe *cqe = &ctx->cqes[(entry & cq_mask) << cq_shift];
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags);
if (cq_shift)
seq_printf(m, ", extra1:%llu, extra2:%llu\n",
cqe->big_cqe[0], cqe->big_cqe[1]);
unsigned int cq_off = (entry & cq_mask) << cq_shift;
+#define print_cqe(cqe) \
do { \
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", \
entry & cq_mask, (cqe)->user_data, (cqe)->res, \
(cqe)->flags); \
if (cq_shift) \
seq_printf(m, ", extra1:%llu, extra2:%llu\n", \
(cqe)->big_cqe[0], (cqe)->big_cqe[1]); \
} while (0)
if (is_compat64_io_ring_ctx(ctx))
print_cqe((struct compat_io_uring_cqe *)&ctx->cqes_compat[cq_off]);
I don't think you need the casts.
Yes, I've removed them from print_sqe() and I forgot them here...
else
print_cqe((struct io_uring_cqe *)&ctx->cqes[cq_off]);
+#undef print_cqe
- seq_printf(m, "\n"); }
@@ -191,8 +210,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct io_uring_cqe *cqe = &ocqe->cqe; seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
cqe->user_data, cqe->res, cqe->flags);
(cqe)->user_data, (cqe)->res, (cqe)->flags);
Looks like a leftover, I guess you don't need to change this any more as struct io_overflow_cqe is now always native?
That's right, thank you!
} spin_unlock(&ctx->completion_lock); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 707229ae04dc8..3f0e005481f3f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -152,6 +152,35 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; +static int get_compat64_io_uring_getevents_arg(struct io_uring_getevents_arg *arg,
const void __user *user_arg)
+{
- struct compat_io_uring_getevents_arg compat_arg;
- if (copy_from_user(&compat_arg, user_arg, sizeof(compat_arg)))
return -EFAULT;
- arg->sigmask = compat_arg.sigmask;
- arg->sigmask_sz = compat_arg.sigmask_sz;
- arg->pad = compat_arg.pad;
- arg->ts = compat_arg.ts;
- return 0;
+}
+static int copy_io_uring_getevents_arg_from_user(struct io_ring_ctx *ctx,
struct io_uring_getevents_arg *arg,
const void __user *argp,
size_t size)
+{
- if (is_compat64_io_ring_ctx(ctx)) {
if (size != sizeof(struct compat_io_uring_getevents_arg))
return -EINVAL;
return get_compat64_io_uring_getevents_arg(arg, argp);
- }
- if (size != sizeof(*arg))
return -EINVAL;
- return copy_from_user(arg, argp, sizeof(*arg));
+}
- struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX)
@@ -604,14 +633,10 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx) static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool all_flushed;
- size_t cqe_size = sizeof(struct io_uring_cqe);
if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false;
- if (ctx->flags & IORING_SETUP_CQE32)
cqe_size <<= 1;
- io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
@@ -621,9 +646,18 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list);
if (cqe)
memcpy(cqe, &ocqe->cqe, cqe_size);
else
if (cqe) {
u64 extra1 = 0;
u64 extra2 = 0;
if (ctx->flags & IORING_SETUP_CQE32) {
extra1 = ocqe->cqe.big_cqe[0];
extra2 = ocqe->cqe.big_cqe[1];
}
__io_fill_cqe(ctx, cqe, ocqe->cqe.user_data, ocqe->cqe.res,
ocqe->cqe.flags, extra1, extra2);
} else io_account_cq_overflow(ctx);
list_del(&ocqe->list); @@ -745,6 +779,10 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) { unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); unsigned int free, queued, len;
- size_t cqe_size = ctx->compat ?
It would be better to use is_compat64_io_ring_ctx() everywhere, now that we have a helper. This should leave the code entirely unmodified unless CONFIG_COMPAT64 is selected.
Indeed, I missed quite a few of them...
sizeof(struct compat_io_uring_cqe) :
sizeof(struct io_uring_cqe);
- struct io_uring_cqe *cqe;
/* * Posting into the CQ when there are pending overflowed CQEs may break @@ -767,14 +805,15 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) len <<= 1; }
- ctx->cqe_cached = &ctx->cqes[off];
- ctx->cqe_sentinel = ctx->cqe_cached + len;
- cqe = ctx->compat ? (struct io_uring_cqe *)&ctx->cqes_compat[off] : &ctx->cqes[off];
- ctx->cqe_cached = cqe;
- ctx->cqe_sentinel = ctx->cqe_cached + len * cqe_size;
The cqe_cached / cqe_sentinel changes look fine, but I wonder if we wouldn't be better off changing things a little more in that case. Manipulating these as void * pointers is a little awkward and can be error-prone, for instance on this line I thought "why not use cqe instead of ctx->cqe_cached", but that is definitely not equivalent as they are pointers to different types.
An alternative would be to make cqe_cached / cqe_sentinel indices into cqes / cqes_compat. This way we don't need to compute cqe_size to update them, and the logic remains otherwise unaffected. It could be a separate patch to make it easier to understand. That's just an idea though, leaving it as-is is also fine by me.
I've thought about that as well. Now it seems it's the obvious choice when you put it like that. Added the changes in a new patch.
ctx->cached_cq_tail++;
- ctx->cqe_cached++;
- ctx->cqe_cached += cqe_size; if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
- return &ctx->cqes[off];
ctx->cqe_cached += cqe_size;
- return cqe; }
[...]
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 50bc3af449534..b44ad558137be 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,6 +5,7 @@ #include <linux/lockdep.h> #include <linux/io_uring_types.h> #include "io-wq.h" +#include "uring_cmd.h" #include "slist.h" #include "filetable.h" @@ -93,16 +94,69 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) void io_cq_unlock_post(struct io_ring_ctx *ctx); +static inline bool is_compat64_io_ring_ctx(struct io_ring_ctx *ctx)
I think it makes sense to follow the convention of having the full struct name in conversion helpers, but I'm less convinced here. It could be better to use the io_ prefix like most functions in this header, and pass struct io_ring_ctx without reflecting it in the function name. Maybe simply io_in_compat64(), reminiscent of in_compat_syscall()? That's also shorter, handy as we're using it a lot.
Indeed, done!
+{
- return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
+}
[...]
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 41e192de9e8a7..c65b99fb9264f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -23,6 +23,89 @@ struct io_rsrc_update { u32 offset; }; +static int get_compat64_io_uring_rsrc_update(struct io_uring_rsrc_update2 *up2,
const void __user *user_up)
+{
- struct compat_io_uring_rsrc_update compat_up;
- if (copy_from_user(&compat_up, user_up, sizeof(compat_up)))
return -EFAULT;
- up2->offset = compat_up.offset;
- up2->resv = compat_up.resv;
- up2->data = compat_up.data;
- return 0;
+}
+static int get_compat64_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2,
const void __user *user_up2)
+{
- struct compat_io_uring_rsrc_update2 compat_up2;
- if (copy_from_user(&compat_up2, user_up2, sizeof(compat_up2)))
return -EFAULT;
- up2->offset = compat_up2.offset;
- up2->resv = compat_up2.resv;
- up2->data = compat_up2.data;
- up2->tags = compat_up2.tags;
- up2->nr = compat_up2.nr;
- up2->resv2 = compat_up2.resv2;
- return 0;
+}
+static int get_compat64_io_uring_rsrc_register(struct io_uring_rsrc_register *rr,
const void __user *user_rr)
+{
- struct compat_io_uring_rsrc_register compat_rr;
- if (copy_from_user(&compat_rr, user_rr, sizeof(compat_rr)))
return -EFAULT;
- rr->nr = compat_rr.nr;
- rr->flags = compat_rr.flags;
- rr->resv2 = compat_rr.resv2;
- rr->data = compat_rr.data;
- rr->tags = compat_rr.tags;
- return 0;
+}
+static int copy_io_uring_rsrc_update_from_user(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up2,
const void __user *arg)
+{
- if (is_compat64_io_ring_ctx(ctx))
return get_compat64_io_uring_rsrc_update(up2, arg);
- return copy_from_user(up2, arg, sizeof(struct io_uring_rsrc_update));
I've just realised that this can be problematic. Since io_register_files_update() returns whatever this function returns if not 0, it effectively returns what copy_from_user() returns (and eventually that value will be returned by the io_uring_register handler). However, copy_from_user() doesn't always return 0 or an error. It can also return a positive value, the number of bytes left to copy, in case a fault happened before reaching the requested size. We would therefore end up returning that positive value to userspace, instead of the intended -EFAULT.
Ah, good catch!
In most other contexts where a copy_io_uring_* helper is called, this doesn't matter as -EFAULT will be forced as return value at some point, but we should handle this consistently to avoid accidentally returning the value from copy_from_user() unmodified. Either all the copy_io_uring_* helpers should return -EFAULT explicitly, or all their (direct) callers should return -EFAULT explicitly. The former might be a little safer.
Alright, great suggestion! Thanks!
Kevin
Many thanks for the detailed review!
Tudor