On 22-06-2023 08:41, Kevin Brodsky wrote:
On 15/06/2023 17:54, Tudor Cretu wrote:
Some userspace software, including libaio, relies on struct aio_ring's layout, even though the uAPI headers don't expose it. This is typically done to skip the io_getevents syscall's overhead by reading directly from the io_events array in the aio_ring, which is shared with userspace.
It's still not clear to me why direct access to the ring makes a difference, as we're not touching struct aio_ring itself and struct io_event is already exposed through syscalls.
Indeed it doesn't, it just slipped into v2. Sorry about that!
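(For the record, the trick the commit message refers to looks roughly like this -- an illustrative sketch only, not from the patch, assuming the classic ring layout and eliding the memory barriers that real implementations such as libaio need:)

    /* io_ctx is the aio_context_t returned by io_setup(): the address
     * of the shared ring mapping. */
    struct aio_ring *ring = (struct aio_ring *)io_ctx;
    unsigned head = ring->head;

    while (head != ring->tail) {
            struct io_event ev = ring->io_events[head];

            head = (head + 1) % ring->nr;
            ring->head = head;      /* mark the event as consumed */
            /* ... handle ev ... */
    }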
As the io_event struct might contain pointers as members, introduce a compat version of the struct. Also, implement functions that convert the compat version to the native version of the struct.
A subsequent patch is going to change the io_event struct to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 fs/aio.c | 123 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 102 insertions(+), 21 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 4fdab03bc0f2..31a9b1c818e0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -65,9 +65,8 @@ struct aio_ring {
 	unsigned	incompat_features;
 	unsigned	header_length;	/* size of aio_ring */

-	struct io_event		io_events[];
-}; /* 128 bytes + ring size */
+}; /* 32 bytes + ring size */

 /*
  * Plugging is meant to work with larger batches of IOs. If we don't
@@ -123,6 +122,11 @@ struct kioctx {
 	unsigned long	mmap_base;
 	unsigned long	mmap_size;

+	union {
+		struct compat_io_event	*io_events_compat;
+		struct io_event		*io_events;
+	};
I'm not convinced this is worth it. io_events_compat is used in just two places; casting ctx->ring->io_events to struct compat_io_event * there would be less overhead than adding that member to struct kioctx.
Makes sense, I agree.
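Something like this at the two use sites, then (sketch, assuming we keep only the native io_events pointer in struct kioctx):

    struct compat_io_event *compat_events =
            (struct compat_io_event *)ctx->io_events;

    compat_events[ring_idx].data = native_event->data;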
+
 	struct page		**ring_pages;
 	long			nr_pages;

@@ -165,6 +169,9 @@ struct kioctx {
 	struct file		*aio_ring_file;

 	unsigned		id;
+#ifdef CONFIG_COMPAT64
+	bool			compat;
+#endif
 };

 /*
@@ -219,6 +226,13 @@ struct aio_kiocb {
 	struct eventfd_ctx	*ki_eventfd;
 };

+struct compat_io_event {
+	__u64		data;
+	__u64		obj;
+	__s64		res;
+	__s64		res2;
+};
+
 struct compat_iocb {
 	__u64	aio_data;
 #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
@@ -281,6 +295,56 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;

+static inline bool aio_in_compat64(struct kioctx *ctx)
+{
+	return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
+}
+
+static inline size_t io_event_size(struct kioctx *ctx)
+{
+	return aio_in_compat64(ctx) ? sizeof(struct compat_io_event)
+				    : sizeof(struct io_event);
+}
+
+static u32 __user *aio_key_uptr(struct kioctx *ctx,
+				struct iocb __user *user_iocb)
+{
+	return aio_in_compat64(ctx) ?
+		&((struct compat_iocb __user *)user_iocb)->aio_key :
+		&user_iocb->aio_key;
+}
+
+static int copy_io_events_to_user(struct kioctx *ctx,
+				  struct io_event __user *event_array,
+				  long offset,
+				  long ring_head,
+				  int nr)
Maybe long to match avail in aio_read_events_ring()? Otherwise much clearer now!
Indeed! Thank you for the suggestion!
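i.e. (sketch of the updated prototype, with nr becoming long):

    static int copy_io_events_to_user(struct kioctx *ctx,
                                      struct io_event __user *event_array,
                                      long offset,
                                      long ring_head,
                                      long nr);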
+{
+	if (aio_in_compat64(ctx))
+		return copy_to_user((struct compat_io_event __user *)event_array + offset,
+				    &ctx->io_events_compat[ring_head],
+				    io_event_size(ctx) * nr);
+	return copy_to_user(event_array + offset,
+			    &ctx->io_events[ring_head],
+			    io_event_size(ctx) * nr);
+}
+
+static void copy_io_event_to_ring(struct kioctx *ctx,
+				  long ring_tail,
Nit: I think ring_idx or similar would make a bit more sense, as it doesn't matter here whether it's the tail or not; we're writing to whatever index is passed.
Also, why long? Ring indices seem to be represented as unsigned everywhere else.
Done!
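So something like this (sketch of the agreed prototype):

    static void copy_io_event_to_ring(struct kioctx *ctx,
                                      unsigned int ring_idx,
                                      struct io_event *native_event);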
+				  struct io_event *native_event)
+{
+	if (aio_in_compat64(ctx)) {
+		struct compat_io_event *compat_ring_event = &ctx->io_events_compat[ring_tail];
+
+		compat_ring_event->data = native_event->data;
+		compat_ring_event->obj = native_event->obj;
+		compat_ring_event->res = native_event->res;
+		compat_ring_event->res2 = native_event->res2;
+		return;
+	}
+	ctx->io_events[ring_tail] = *native_event;
+}
+
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct file *file;
@@ -352,6 +416,9 @@ static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;

+	if (ctx->ring)
+		vunmap(ctx->ring);
+
That one escaped from patch 2 I think :)
It didn't reach very far :). Thanks for spotting that!
 	/* Disconnect the kiotx from the ring file.  This prevents future
 	 * accesses to the kioctx from page migration.
 	 */
@@ -519,7 +586,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 	nr_events += 2;	/* 1 is required, 2 for good luck */

 	size = sizeof(struct aio_ring);
-	size += sizeof(struct io_event) * nr_events;
+	size += io_event_size(ctx) * nr_events;

 	nr_pages = PFN_UP(size);
 	if (nr_pages < 0)
@@ -533,7 +600,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 	ctx->aio_ring_file = file;
 	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
-			/ sizeof(struct io_event);
+			/ io_event_size(ctx);

 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
@@ -595,6 +662,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 		return -ENOMEM;
 	}

+	ctx->io_events = ctx->ring->io_events;
+
 	ctx->ring->nr = nr_events;	/* user copy */
 	ctx->ring->id = ~0U;
 	ctx->ring->head = ctx->ring->tail = 0;
@@ -766,8 +835,16 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	nr_events = max(nr_events, num_possible_cpus() * 4);
 	nr_events *= 2;

+	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+#ifdef CONFIG_COMPAT64
+	ctx->compat = in_compat_syscall();
+#endif
+
 	/* Prevent overflows */
-	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
+	if (nr_events > (0x10000000U / io_event_size(ctx))) {
 		pr_debug("ENOMEM: nr_events too high\n");
 		return ERR_PTR(-EINVAL);
I think we would leak ctx in this case, since it is now allocated before that check.
Indeed, well spotted.
We could make it simpler by having ioctx_alloc() take an extra compat argument. That would avoid having to allocate ctx first, and it would also remove the need to use in_compat_syscall(), since we already have a compat handler.
Sounds good!
I agree with your reply on v1, it is not essential to use the right size here. That said, effectively dividing by 2 the max nr_events in compat64 is still a user-visible change we should try to avoid.
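Putting those together, the shape could be something like this (rough sketch only, hypothetical for v3; the rest of the function elided):

    static struct kioctx *ioctx_alloc(unsigned nr_events, bool compat)
    {
            size_t ev_size = (IS_ENABLED(CONFIG_COMPAT64) && compat)
                                    ? sizeof(struct compat_io_event)
                                    : sizeof(struct io_event);
            struct kioctx *ctx;

            ...

            /* Prevent overflows: the check now runs before the
             * allocation, so there is nothing to leak. */
            if (nr_events > (0x10000000U / ev_size)) {
                    pr_debug("ENOMEM: nr_events too high\n");
                    return ERR_PTR(-EINVAL);
            }

            ...

            ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
            if (!ctx)
                    return ERR_PTR(-ENOMEM);
    #ifdef CONFIG_COMPAT64
            ctx->compat = compat;
    #endif
            ...
    }

with the native io_setup() handler passing false and the compat handler passing true, so in_compat_syscall() is no longer needed.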
 	}

@@ -775,10 +852,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
 		return ERR_PTR(-EAGAIN);

-	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
-
 	ctx->max_reqs = max_reqs;

 	spin_lock_init(&ctx->ctx_lock);
@@ -1138,7 +1211,7 @@ static void aio_complete(struct aio_kiocb *iocb)
 	tail = ctx->tail;

-	ctx->ring->io_events[tail] = iocb->ki_res;
+	copy_io_event_to_ring(ctx, tail, &iocb->ki_res);

 	pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
 		 (void __user *)(unsigned long)iocb->ki_res.obj,
@@ -1196,7 +1269,8 @@ static inline void iocb_put(struct aio_kiocb *iocb)
  *	events fetched
  */
 static long aio_read_events_ring(struct kioctx *ctx,
-				 struct io_event __user *event, long nr)
+				 struct io_event __user *event, long idx,
+				 long nr)
Nit: maybe event_idx (for a while I was confused as I thought idx was a ring index).
Done
 {
 	unsigned int head, tail;
 	long ret = 0;
@@ -1237,9 +1311,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 		avail = (head <= tail ? tail : ctx->nr_events) - head;
 		avail = min(avail, nr - ret);

-		copy_ret = copy_to_user(event + ret,
-					&ctx->ring->io_events[head],
-					sizeof(struct io_event) * avail);
+		copy_ret = copy_io_events_to_user(ctx, event, idx + ret, head, avail);

 		if (unlikely(copy_ret)) {
 			ret = -EFAULT;
@@ -1263,7 +1335,10 @@ static long aio_read_events_ring(struct kioctx *ctx,
 static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
 			    struct io_event __user *event, long *i)
 {
-	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
+	long ret = aio_read_events_ring(ctx,
+					event,
+					*i,
+					nr - *i);
It should all fit on one line again.
Done
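i.e.:

    long ret = aio_read_events_ring(ctx, event, *i, nr - *i);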
Thanks, Tudor
Kevin
 	if (ret > 0)
 		*i += ret;

@@ -1942,7 +2017,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 		req->ki_eventfd = eventfd;
 	}

-	if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
+	if (unlikely(put_user(KIOCB_KEY, aio_key_uptr(ctx, user_iocb)))) {
 		pr_debug("EFAULT: aio_key\n");
 		return -EFAULT;
 	}
@@ -2164,15 +2239,20 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	u32 key;
 	u64 obj = user_ptr_addr(iocb);

-	if (unlikely(get_user(key, &iocb->aio_key)))
-		return -EFAULT;
-	if (unlikely(key != KIOCB_KEY))
-		return -EINVAL;
-
 	ctx = lookup_ioctx(ctx_id);
 	if (unlikely(!ctx))
 		return -EINVAL;

+	if (unlikely(get_user(key, aio_key_uptr(ctx, iocb)))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	if (unlikely(key != KIOCB_KEY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	spin_lock_irq(&ctx->ctx_lock);
 	/* TODO: use a hash or array, this sucks. */
 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
@@ -2193,6 +2273,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		ret = -EINPROGRESS;
 	}

+out:
 	percpu_ref_put(&ctx->users);
 	return ret;