On 04/04/2023 15:53, Tudor Cretu wrote:
> Some userspace software, including libaio, uses the layouts of struct aio_ring and struct io_event, even if the uAPI headers don't expose them. This is typically done to skip the overhead of the io_getevents syscall by reading directly from the io_event array in the aio_ring, which is shared with userspace.
I don't really understand this paragraph. io_event is the main struct exposed by <uapi/linux/aio_abi.h>, which is required if only because that's what io_getevents() manipulates. struct aio_ring does not really contain pointers (the flexible array at the end is just syntactic sugar), so on the whole I don't think we need to invoke direct access to the ring to justify this change.
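For reference, the struct as it stands in fs/aio.c is nothing but fixed-size integer fields, plus the flexible array of events at the end (quoting mainline, with most comments trimmed):

struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;
	unsigned	tail;

	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */

	struct io_event	io_events[];
};

The only part whose layout differs between ABIs is the io_events array itself, and that is already covered by making the event size depend on the context, as this patch does.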
> As the io_event struct might contain pointers as members, introduce a compat version of the struct. Also, implement functions that convert the compat version to the native version of the struct.
>
> A subsequent patch is going to change the io_event struct to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
> Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
> ---
>  fs/aio.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 78 insertions(+), 16 deletions(-)
> diff --git a/fs/aio.c b/fs/aio.c
> index c835deda5cdcc..e6e45498ccd27 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -53,6 +53,13 @@
>  #define AIO_RING_MAGIC			0xa10a10a1
>  #define AIO_RING_COMPAT_FEATURES	1
>  #define AIO_RING_INCOMPAT_FEATURES	0
> +
> +struct compat_io_event {
> +	__u64		data;
> +	__u64		obj;
> +	__s64		res;
> +	__s64		res2;
> +};
Maybe adding it before struct compat_iocb would make more sense?
>
>  struct aio_ring {
>  	unsigned	id;	/* kernel internal index number */
>  	unsigned	nr;	/* number of io_events */
> @@ -164,6 +171,7 @@ struct kioctx {
>  	struct file		*aio_ring_file;
>
>  	unsigned		id;
> +	bool			compat;
Given that we only use it in compat64, and in fact only in is_compat64_aio_ctx() (besides initialising it), we could easily #ifdef it, so that it is a no-op unless CONFIG_COMPAT64 is actually selected.
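Something like this (just a sketch, not tested):

#ifdef CONFIG_COMPAT64
	bool			compat;
#endif

static inline bool is_compat64_aio_ctx(struct kioctx *ctx)
{
#ifdef CONFIG_COMPAT64
	return ctx->compat;
#else
	return false;
#endif
}

The assignment in aio_setup_ring() would then need the same #ifdef guard (or a small setter helper next to this one), since the member no longer exists in the !CONFIG_COMPAT64 case.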
>  };
>
>  /*
> @@ -280,6 +288,50 @@ static struct vfsmount *aio_mnt;
>  static const struct file_operations	aio_ring_fops;
>  static const struct address_space_operations aio_ctx_aops;
>
> +static inline bool is_compat64_aio_ctx(struct kioctx *ctx)
Maybe we could call it aio_in_compat64(), mirroring io_in_compat64() for io_uring.
> +{
> +	return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
> +}
> +
> +static inline size_t io_event_size(struct kioctx *ctx)
> +{
> +	return is_compat64_aio_ctx(ctx) ? sizeof(struct compat_io_event)
> +					: sizeof(struct io_event);
> +}
> +
> +static struct io_event __user *get_ith_user_io_event(struct kioctx *ctx,
> +						     struct io_event __user *event_array,
> +						     int i)
I think it would be nicer to avoid this sort of function, as it feels rather wonky when combined with the typing. The way it's used together with aio_read_events_ring() is a bit confusing: you effectively compute the address of the i-th event in the array, but return it as a struct io_event *, even though it may really point to a compat struct instead. Further inside aio_read_events_ring(), a certain number of events then get written through that pointer, effectively ignoring its type, since the size is computed based on the mode.

Since we fortunately only need to do this in one place, I would suggest the following alternative: have aio_read_events_ring() take the original, unmodified user-provided pointer and the index to write at separately. Then replace the copy_to_user() with a function that copies a certain number of events at a given offset in the destination. This way, the fact that the struct may have a different layout is hidden away in just one function.
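Something along these lines (untested, and the name is only a suggestion):

/*
 * Copy nr events from the vmap'ed ring pages to the user-provided event
 * array, starting at index offset, using whichever event layout the
 * context was set up with.
 */
static unsigned long copy_io_events_to_user(struct kioctx *ctx,
					    struct io_event __user *event_array,
					    long offset,
					    const void *ring_events,
					    long nr)
{
	size_t ev_size = io_event_size(ctx);

	return copy_to_user((char __user *)event_array + offset * ev_size,
			    ring_events, nr * ev_size);
}

aio_read_events_ring() would then take event and *i unmodified, and get_ith_user_io_event() would no longer be needed at all.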
> +{
> +	if (is_compat64_aio_ctx(ctx))
> +		return (struct io_event __user *)&((struct compat_io_event __user *)event_array)[i];
> +	return &event_array[i];
> +}
> +
> +static u32 __user *get_user_key_ptr(struct kioctx *ctx,
> +				    struct iocb __user *user_iocb)
To be more similar to io_event_size(), maybe aio_key_uptr()?
> +{
> +	return is_compat64_aio_ctx(ctx) ?
> +		&((struct compat_iocb __user *)user_iocb)->aio_key :
> +		&user_iocb->aio_key;
> +}
> +
> +static void copy_io_event_to_ring(struct kioctx *ctx,
> +				  struct io_event *native_event,
> +				  struct io_event *ring_event)
I'd swap the arguments as the destination is normally before the source.
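That is:

static void copy_io_event_to_ring(struct kioctx *ctx,
				  struct io_event *ring_event,
				  struct io_event *native_event)

(with the call site in aio_complete() updated to match).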
> +{
> +	if (is_compat64_aio_ctx(ctx)) {
> +		struct compat_io_event *compat_ring_event = (struct compat_io_event *)ring_event;
> +
> +		compat_ring_event->data = native_event->data;
> +		compat_ring_event->obj = native_event->obj;
> +		compat_ring_event->res = native_event->res;
> +		compat_ring_event->res2 = native_event->res2;
> +		return;
> +	}
> +	*ring_event = *native_event;
> +}
> +
>  static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
>  {
>  	struct file *file;
> @@ -519,7 +571,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  	nr_events += 2;	/* 1 is required, 2 for good luck */
>
>  	size = sizeof(struct aio_ring);
> -	size += sizeof(struct io_event) * nr_events;
> +	size += io_event_size(ctx) * nr_events;
>
>  	nr_pages = PFN_UP(size);
>  	if (nr_pages < 0)
> @@ -533,7 +585,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  	ctx->aio_ring_file = file;
>
>  	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
> -			/ sizeof(struct io_event);
> +			/ io_event_size(ctx);
There's another instance of sizeof(struct io_event) in ioctx_alloc(), which should probably be changed too.
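For clarity, I mean the overflow check near the top of ioctx_alloc() (quoting from memory, so double-check the exact line):

	/* Prevent overflows */
	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
		pr_debug("ENOMEM: nr_events too high\n");
		return ERR_PTR(-EINVAL);
	}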
>
>  	ctx->ring_pages = ctx->internal_pages;
>  	if (nr_pages > AIO_RING_PAGES) {
> @@ -588,6 +640,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>
>  	ctx->user_id = ctx->mmap_base;
>  	ctx->nr_events = nr_events; /* trusted copy */
> +	ctx->compat = in_compat_syscall();
>
>  	ring = kmap_atomic(ctx->ring_pages[0]);
>  	ring->nr = nr_events;	/* user copy */
> @@ -603,7 +656,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  	return 0;
>  }
>
> -static unsigned int get_event_pages_idx(unsigned int event_idx,
> +static unsigned int get_event_pages_idx(struct kioctx *ctx,
> +					unsigned int event_idx,
>  					unsigned int nr_events,
>  					size_t *event_offset,
>  					unsigned int *nr_pages)
> @@ -612,12 +666,12 @@ static unsigned int get_event_pages_idx(unsigned int event_idx,
>  	size_t off;
>
>  	off = sizeof(struct aio_ring);
> -	off += sizeof(struct io_event) * event_idx;
> +	off += io_event_size(ctx) * event_idx;
>
>  	page_idx = off / PAGE_SIZE;
>  	*event_offset = offset_in_page(off);
>
> -	off += sizeof(struct io_event) * nr_events - 1;
> +	off += io_event_size(ctx) * nr_events - 1;
>
>  	*nr_pages = off / PAGE_SIZE + 1 - page_idx;
>
>  	return page_idx;
>  }
> @@ -1164,7 +1218,7 @@ static void aio_complete(struct aio_kiocb *iocb)
>
>  	tail = ctx->tail;
> -	ctx_page_idx = get_event_pages_idx(tail, 1, &event_offset, &nr_pages);
> +	ctx_page_idx = get_event_pages_idx(ctx, tail, 1, &event_offset, &nr_pages);
>  	ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
>  	if (!unlikely(ring_pages)) {
>  		pr_warn("Couldn't map aio ring event pages\n");
> @@ -1173,7 +1227,7 @@ static void aio_complete(struct aio_kiocb *iocb)
>  	}
>
>  	event = ring_pages + event_offset;
> -	*event = iocb->ki_res;
> +	copy_io_event_to_ring(ctx, &iocb->ki_res, event);
>
>  	vunmap(ring_pages);
>  	for (unsigned int page_idx = ctx_page_idx; page_idx < ctx_page_idx + nr_pages; page_idx++)
> @@ -1284,7 +1338,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
>  			break;
>
>  		avail = (head <= tail ? tail : ctx->nr_events) - head;
>  		avail = min(avail, nr - ret);
> -		ctx_page_idx = get_event_pages_idx(head, avail, &event_offset, &nr_pages);
> +		ctx_page_idx = get_event_pages_idx(ctx, head, avail, &event_offset, &nr_pages);
>  		ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
>  		if (!ring_pages) {
>  			ret = -ENOMEM;
> @@ -1293,7 +1347,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
>
>  		copy_ret = copy_to_user(event, ring_pages + event_offset,
> -					sizeof(struct io_event) * avail);
> +					io_event_size(ctx) * avail);
>
>  		vunmap(ring_pages);
> @@ -1322,7 +1376,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
>
>  static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
>  			    struct io_event __user *event, long *i)
>  {
> -	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
> +	long ret = aio_read_events_ring(ctx,
> +					get_ith_user_io_event(ctx, event, *i),
> +					nr - *i);
>
>  	if (ret > 0)
>  		*i += ret;
> @@ -2001,7 +2057,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
Just noticed that the compat handler for io_submit takes nr as int, while the native handler takes it as long. It is very unlikely to make any difference in practice, but it could be worth changing it to compat_long_t for consistency (in a separate patch).
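Something like this, assuming the compat handler still looks like mainline's:

-COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
-		       int, nr, compat_uptr_t __user *, iocbpp)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+		       compat_long_t, nr, compat_uptr_t __user *, iocbpp)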
Kevin
>  		req->ki_eventfd = eventfd;
>  	}
>
> -	if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
> +	if (unlikely(put_user(KIOCB_KEY, get_user_key_ptr(ctx, user_iocb)))) {
>  		pr_debug("EFAULT: aio_key\n");
>  		return -EFAULT;
>  	}
> @@ -2223,15 +2279,20 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
>  	u32 key;
>  	u64 obj = user_ptr_addr(iocb);
>
> -	if (unlikely(get_user(key, &iocb->aio_key)))
> -		return -EFAULT;
> -	if (unlikely(key != KIOCB_KEY))
> -		return -EINVAL;
> -
>  	ctx = lookup_ioctx(ctx_id);
>  	if (unlikely(!ctx))
>  		return -EINVAL;
>
> +	if (unlikely(get_user(key, get_user_key_ptr(ctx, iocb)))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +	if (unlikely(key != KIOCB_KEY)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
>  	spin_lock_irq(&ctx->ctx_lock);
>  	/* TODO: use a hash or array, this sucks. */
>  	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
> @@ -2252,6 +2313,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
>  		ret = -EINPROGRESS;
>  	}
>
> +out:
>  	percpu_ref_put(&ctx->users);
>  	return ret;