On 15/06/2023 17:54, Tudor Cretu wrote:
> Some userspace software, including libaio, use struct aio_ring's layout, even if the uAPI headers don't expose it. This is typically done to skip the io_getevents syscall's overhead by reading directly from the io_events array in the aio_ring, which is shared with the userspace.
It's still not clear to me why direct access to the ring makes a difference, as we're not touching struct aio_ring itself and struct io_event is already exposed through syscalls.
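To make sure we are talking about the same thing, this is roughly the userspace fast path the commit message refers to, as I understand it (a sketch modelled on the kind of ring polling libaio and friends do, not anyone's actual code; aio_ring_user and ring_peek are made-up names, and memory barriers and the compat_features check are left out):

#include <linux/aio_abi.h>	/* struct io_event, aio_context_t */

/* Mirror of the kernel's struct aio_ring layout that such software assumes;
 * the aio_context_t returned by io_setup() is the address of this mapping. */
struct aio_ring_user {
	unsigned id, nr, head, tail;
	unsigned magic, compat_features, incompat_features, header_length;
	struct io_event io_events[];
};

/* Consume one completed event without entering the kernel, if available. */
static int ring_peek(aio_context_t ctx_id, struct io_event *ev)
{
	struct aio_ring_user *ring = (struct aio_ring_user *)ctx_id;
	unsigned head = ring->head;

	if (head == ring->tail)
		return 0;			/* ring empty, fall back to io_getevents() */

	*ev = ring->io_events[head];		/* read the event straight out of the ring */
	ring->head = (head + 1) % ring->nr;	/* tell the kernel it has been consumed */
	return 1;
}

That loop only touches the ring header fields and struct io_event entries, both of which are already part of the ABI, hence my question.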
> As the io_event struct might contain pointers as members, introduce a compat version of the struct. Also, implement functions that convert the compat version to the native version of the struct.
>
> A subsequent patch is going to change the io_event struct to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
>
> Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
> ---
>  fs/aio.c | 123 +++++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 102 insertions(+), 21 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 4fdab03bc0f2..31a9b1c818e0 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -65,9 +65,8 @@ struct aio_ring {
>  	unsigned	incompat_features;
>  	unsigned	header_length;	/* size of aio_ring */
> 
>  	struct io_event		io_events[];
> -}; /* 128 bytes + ring size */
> +}; /* 32 bytes + ring size */
> 
>  /*
>   * Plugging is meant to work with larger batches of IOs. If we don't
> @@ -123,6 +122,11 @@ struct kioctx {
>  	unsigned long	mmap_base;
>  	unsigned long	mmap_size;
> 
> +	union {
> +		struct compat_io_event	*io_events_compat;
> +		struct io_event		*io_events;
> +	};
I'm not convinced this is worth it. io_events_compat is used in just two places; casting ctx->ring->io_events to struct compat_io_event * there would be less overhead than adding that member to struct kioctx.
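Something like this (untested sketch, keeping the rest of your function as-is) is what I have in mind for copy_io_event_to_ring(), with the matching cast in copy_io_events_to_user():

static void copy_io_event_to_ring(struct kioctx *ctx,
				  long ring_tail,
				  struct io_event *native_event)
{
	if (aio_in_compat64(ctx)) {
		/* Reuse the ring storage, just view it as compat events */
		struct compat_io_event *compat_ring_event =
			(struct compat_io_event *)ctx->ring->io_events + ring_tail;

		compat_ring_event->data = native_event->data;
		compat_ring_event->obj = native_event->obj;
		compat_ring_event->res = native_event->res;
		compat_ring_event->res2 = native_event->res2;
		return;
	}

	ctx->ring->io_events[ring_tail] = *native_event;
}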
> +
>  	struct page		**ring_pages;
>  	long			nr_pages;
> @@ -165,6 +169,9 @@ struct kioctx {
>  	struct file		*aio_ring_file;
> 
>  	unsigned		id;
> +#ifdef CONFIG_COMPAT64
> +	bool			compat;
> +#endif
>  };
> 
>  /*
> @@ -219,6 +226,13 @@ struct aio_kiocb {
>  	struct eventfd_ctx	*ki_eventfd;
>  };
> 
> +struct compat_io_event {
> +	__u64		data;
> +	__u64		obj;
> +	__s64		res;
> +	__s64		res2;
> +};
> +
>  struct compat_iocb {
>  	__u64	aio_data;
>  #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
> @@ -281,6 +295,56 @@ static struct vfsmount *aio_mnt;
>  static const struct file_operations aio_ring_fops;
>  static const struct address_space_operations aio_ctx_aops;
> 
> +static inline bool aio_in_compat64(struct kioctx *ctx)
> +{
> +	return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
> +}
> +
> +static inline size_t io_event_size(struct kioctx *ctx)
> +{
> +	return aio_in_compat64(ctx) ? sizeof(struct compat_io_event)
> +				    : sizeof(struct io_event);
> +}
> +
> +static u32 __user *aio_key_uptr(struct kioctx *ctx,
> +				struct iocb __user *user_iocb)
> +{
> +	return aio_in_compat64(ctx) ?
> +		&((struct compat_iocb __user *)user_iocb)->aio_key :
> +		&user_iocb->aio_key;
> +}
> +
> +static int copy_io_events_to_user(struct kioctx *ctx,
> +				  struct io_event __user *event_array,
> +				  long offset,
> +				  long ring_head,
> +				  int nr)
Maybe long to match avail in aio_read_events_ring()? Otherwise much clearer now!
> +{
> +	if (aio_in_compat64(ctx))
> +		return copy_to_user((struct compat_io_event __user *)event_array + offset,
> +				    &ctx->io_events_compat[ring_head],
> +				    io_event_size(ctx) * nr);
> +	return copy_to_user(event_array + offset,
> +			    &ctx->io_events[ring_head],
> +			    io_event_size(ctx) * nr);
> +}
> +
> +static void copy_io_event_to_ring(struct kioctx *ctx,
> +				  long ring_tail,
Nit: I think ring_idx or similar would make a bit more sense, as it doesn't matter here whether the index is the tail or not; we're writing to whatever index is passed.
Also, why long? Ring indices seem to be represented as unsigned everywhere else.
> +				  struct io_event *native_event)
> +{
> +	if (aio_in_compat64(ctx)) {
> +		struct compat_io_event *compat_ring_event = &ctx->io_events_compat[ring_tail];
> +
> +		compat_ring_event->data = native_event->data;
> +		compat_ring_event->obj = native_event->obj;
> +		compat_ring_event->res = native_event->res;
> +		compat_ring_event->res2 = native_event->res2;
> +		return;
> +	}
> +
> +	ctx->io_events[ring_tail] = *native_event;
> +}
> +
>  static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
>  {
>  	struct file *file;
> @@ -352,6 +416,9 @@ static void aio_free_ring(struct kioctx *ctx)
>  {
>  	int i;
> 
> +	if (ctx->ring)
> +		vunmap(ctx->ring);
> +
That one escaped from patch 2 I think :)
>  	/* Disconnect the kiotx from the ring file.  This prevents future
>  	 * accesses to the kioctx from page migration.
>  	 */
> @@ -519,7 +586,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  	nr_events += 2;	/* 1 is required, 2 for good luck */
> 
>  	size = sizeof(struct aio_ring);
> -	size += sizeof(struct io_event) * nr_events;
> +	size += io_event_size(ctx) * nr_events;
> 
>  	nr_pages = PFN_UP(size);
>  	if (nr_pages < 0)
> @@ -533,7 +600,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  	ctx->aio_ring_file = file;
>  	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
> -			/ sizeof(struct io_event);
> +			/ io_event_size(ctx);
> 
>  	ctx->ring_pages = ctx->internal_pages;
>  	if (nr_pages > AIO_RING_PAGES) {
> @@ -595,6 +662,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  		return -ENOMEM;
>  	}
> 
> +	ctx->io_events = ctx->ring->io_events;
> +
>  	ctx->ring->nr = nr_events;	/* user copy */
>  	ctx->ring->id = ~0U;
>  	ctx->ring->head = ctx->ring->tail = 0;
> @@ -766,8 +835,16 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
>  	nr_events = max(nr_events, num_possible_cpus() * 4);
>  	nr_events *= 2;
> 
> +	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
> +	if (!ctx)
> +		return ERR_PTR(-ENOMEM);
> +
> +#ifdef CONFIG_COMPAT64
> +	ctx->compat = in_compat_syscall();
> +#endif
> +
>  	/* Prevent overflows */
> -	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
> +	if (nr_events > (0x10000000U / io_event_size(ctx))) {
>  		pr_debug("ENOMEM: nr_events too high\n");
>  		return ERR_PTR(-EINVAL);
I think we would leak ctx in this case, since it is now allocated before that check.
We could make it simpler by having ioctx_alloc() take an extra compat argument. That would avoid having to allocate ctx first, and it would also remove the need to use in_compat_syscall(), since we already have a compat handler.
I agree with your reply on v1 that it is not essential to use the right size here. That said, effectively dividing the max nr_events by 2 in compat64 is still a user-visible change we should try to avoid.
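Roughly what I have in mind, as an untested sketch (the ev_size local is just for illustration, and the elided parts stay as they are today):

static struct kioctx *ioctx_alloc(unsigned nr_events, bool compat)
{
	size_t ev_size = compat ? sizeof(struct compat_io_event)
				: sizeof(struct io_event);
	/* ... */

	/* Prevent overflows */
	if (nr_events > (0x10000000U / ev_size)) {
		pr_debug("ENOMEM: nr_events too high\n");
		return ERR_PTR(-EINVAL);
	}

	/* ... */

	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

#ifdef CONFIG_COMPAT64
	ctx->compat = compat;
#endif
	/* ... */
}

The native io_setup handler would pass false and the compat64 handler true, so in_compat_syscall() is not needed and ctx is only allocated once the checks have passed.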
>  	}
> 
> @@ -775,10 +852,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
>  	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
>  		return ERR_PTR(-EAGAIN);
> 
> -	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
> -	if (!ctx)
> -		return ERR_PTR(-ENOMEM);
> -
>  	ctx->max_reqs = max_reqs;
> 
>  	spin_lock_init(&ctx->ctx_lock);
> @@ -1138,7 +1211,7 @@ static void aio_complete(struct aio_kiocb *iocb)
> 
>  	tail = ctx->tail;
> -	ctx->ring->io_events[tail] = iocb->ki_res;
> +	copy_io_event_to_ring(ctx, tail, &iocb->ki_res);
> 
>  	pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
>  		 (void __user *)(unsigned long)iocb->ki_res.obj,
> @@ -1196,7 +1269,8 @@ static inline void iocb_put(struct aio_kiocb *iocb)
>   *	events fetched
>   */
>  static long aio_read_events_ring(struct kioctx *ctx,
> -				 struct io_event __user *event, long nr)
> +				 struct io_event __user *event, long idx,
Nit: maybe event_idx (for a while I was confused as I thought idx was a ring index).
> +				 long nr)
>  {
>  	unsigned int head, tail;
>  	long ret = 0;
> @@ -1237,9 +1311,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
>  		avail = (head <= tail ? tail : ctx->nr_events) - head;
>  		avail = min(avail, nr - ret);
> 
> -		copy_ret = copy_to_user(event + ret,
> -					&ctx->ring->io_events[head],
> -					sizeof(struct io_event) * avail);
> +		copy_ret = copy_io_events_to_user(ctx, event, idx + ret, head, avail);
> 
>  		if (unlikely(copy_ret)) {
>  			ret = -EFAULT;
> @@ -1263,7 +1335,10 @@ static long aio_read_events_ring(struct kioctx *ctx,
>  static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
>  			    struct io_event __user *event, long *i)
>  {
> -	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
> +	long ret = aio_read_events_ring(ctx,
> +					event,
> +					*i,
> +					nr - *i);
It should all fit on one line again.
Kevin
> 
>  	if (ret > 0)
>  		*i += ret;
> @@ -1942,7 +2017,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
>  		req->ki_eventfd = eventfd;
>  	}
> 
> -	if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
> +	if (unlikely(put_user(KIOCB_KEY, aio_key_uptr(ctx, user_iocb)))) {
>  		pr_debug("EFAULT: aio_key\n");
>  		return -EFAULT;
>  	}
> @@ -2164,15 +2239,20 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
>  	u32 key;
>  	u64 obj = user_ptr_addr(iocb);
> 
> -	if (unlikely(get_user(key, &iocb->aio_key)))
> -		return -EFAULT;
> -	if (unlikely(key != KIOCB_KEY))
> -		return -EINVAL;
> -
>  	ctx = lookup_ioctx(ctx_id);
>  	if (unlikely(!ctx))
>  		return -EINVAL;
> 
> +	if (unlikely(get_user(key, aio_key_uptr(ctx, iocb)))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (unlikely(key != KIOCB_KEY)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
>  	spin_lock_irq(&ctx->ctx_lock);
>  	/* TODO: use a hash or array, this sucks. */
>  	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
> @@ -2193,6 +2273,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
>  		ret = -EINPROGRESS;
>  	}
> 
> +out:
>  	percpu_ref_put(&ctx->users);
>  	return ret;