Some userspace software, including libaio, relies on the layout of struct aio_ring even though the uAPI headers don't expose it. This is typically done to avoid the overhead of the io_getevents() syscall by reading completed events directly from the io_events array of the aio_ring, which is shared with userspace.
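For illustration, reading the ring from userspace looks roughly like the sketch below. This is not libaio's actual code: the struct layout and magic value mirror the kernel's private struct aio_ring definition and are assumptions of the sketch, and a production implementation also needs a fallback to the syscall whenever the ring cannot be used.

/*
 * Minimal userspace sketch: the aio_context_t returned by io_setup() is the
 * address of the ring mapping shared with the kernel, so the header and
 * io_events[] can be read in place. Layout and magic value below mirror the
 * kernel's private struct aio_ring and are assumptions of this sketch.
 */
#include <linux/aio_abi.h>
#include <stdatomic.h>

#define USER_AIO_RING_MAGIC	0xa10a10a1	/* kernel-internal value, assumed */

struct user_aio_ring {
	unsigned	id;
	unsigned	nr;		/* number of io_events slots */
	unsigned	head;
	unsigned	tail;
	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */
	struct io_event	io_events[];
};

/* Pop one completed event without entering the kernel; 0 means ring empty. */
static int user_pop_event(aio_context_t ctx_id, struct io_event *ev)
{
	struct user_aio_ring *ring = (struct user_aio_ring *)ctx_id;
	unsigned int head = ring->head;

	if (ring->magic != USER_AIO_RING_MAGIC)
		return -1;			/* fall back to io_getevents() */
	if (head == ring->tail)
		return 0;			/* nothing completed yet */

	atomic_thread_fence(memory_order_acquire);	/* pair with the kernel's write barrier */
	*ev = ring->io_events[head];		/* relies on the shared struct layout */
	ring->head = (head + 1) % ring->nr;
	return 1;
}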
As the io_event struct might contain pointers as members, introduce a compat version of the struct. Also, implement helpers that convert between the compat and native versions of the struct.
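The conversion itself is a plain field-by-field copy, as in the simplified standalone sketch below; the helper name here is illustrative, the actual helper added by this patch is copy_io_event_to_ring() in the diff.

#include <linux/aio_abi.h>	/* struct io_event, __u64/__s64 */

/*
 * Illustrative only: the kind of field-by-field conversion this patch adds
 * (see copy_io_event_to_ring() below); the names here are not from the patch.
 */
struct compat_io_event {
	__u64	data;
	__u64	obj;
	__s64	res;
	__s64	res2;
};

static void io_event_to_compat(struct compat_io_event *compat,
			       const struct io_event *native)
{
	/* Each 64-bit value is preserved; only the struct layout may differ. */
	compat->data = native->data;
	compat->obj  = native->obj;
	compat->res  = native->res;
	compat->res2 = native->res2;
}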
A subsequent patch is going to change the io_event struct to support new architectures. On those architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 fs/aio.c | 123 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 102 insertions(+), 21 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 4fdab03bc0f2..31a9b1c818e0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -65,9 +65,8 @@ struct aio_ring {
 	unsigned	incompat_features;
 	unsigned	header_length;	/* size of aio_ring */


-	struct io_event		io_events[];
-}; /* 128 bytes + ring size */
+}; /* 32 bytes + ring size */
 /*
  * Plugging is meant to work with larger batches of IOs. If we don't
@@ -123,6 +122,11 @@ struct kioctx {
 	unsigned long		mmap_base;
 	unsigned long		mmap_size;

+	union {
+		struct compat_io_event	*io_events_compat;
+		struct io_event		*io_events;
+	};
+
 	struct page		**ring_pages;
 	long			nr_pages;

@@ -165,6 +169,9 @@ struct kioctx {
 	struct file		*aio_ring_file;

 	unsigned		id;
+#ifdef CONFIG_COMPAT64
+	bool			compat;
+#endif
 };

 /*
  * First field must be the file pointer in all the
@@ -219,6 +226,13 @@ struct aio_kiocb {
 	struct eventfd_ctx	*ki_eventfd;
 };

+struct compat_io_event {
+	__u64		data;
+	__u64		obj;
+	__s64		res;
+	__s64		res2;
+};
+
 struct compat_iocb {
 	__u64	aio_data;
 #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
@@ -281,6 +295,56 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;

+static inline bool aio_in_compat64(struct kioctx *ctx)
+{
+	return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
+}
+
+static inline size_t io_event_size(struct kioctx *ctx)
+{
+	return aio_in_compat64(ctx) ? sizeof(struct compat_io_event)
+				    : sizeof(struct io_event);
+}
+
+static u32 __user *aio_key_uptr(struct kioctx *ctx,
+				struct iocb __user *user_iocb)
+{
+	return aio_in_compat64(ctx) ?
+		&((struct compat_iocb __user *)user_iocb)->aio_key :
+		&user_iocb->aio_key;
+}
+
+static int copy_io_events_to_user(struct kioctx *ctx,
+				  struct io_event __user *event_array,
+				  long offset,
+				  long ring_head,
+				  int nr)
+{
+	if (aio_in_compat64(ctx))
+		return copy_to_user((struct compat_io_event __user *)event_array + offset,
+				    &ctx->io_events_compat[ring_head],
+				    io_event_size(ctx) * nr);
+	return copy_to_user(event_array + offset,
+			    &ctx->io_events[ring_head],
+			    io_event_size(ctx) * nr);
+}
+
+static void copy_io_event_to_ring(struct kioctx *ctx,
+				  long ring_tail,
+				  struct io_event *native_event)
+{
+	if (aio_in_compat64(ctx)) {
+		struct compat_io_event *compat_ring_event = &ctx->io_events_compat[ring_tail];
+
+		compat_ring_event->data = native_event->data;
+		compat_ring_event->obj = native_event->obj;
+		compat_ring_event->res = native_event->res;
+		compat_ring_event->res2 = native_event->res2;
+		return;
+	}
+	ctx->io_events[ring_tail] = *native_event;
+}
+
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct file *file;
@@ -352,6 +416,9 @@ static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;

+	if (ctx->ring)
+		vunmap(ctx->ring);
+
 	/* Disconnect the kiotx from the ring file.  This prevents future
 	 * accesses to the kioctx from page migration.
 	 */
@@ -519,7 +586,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 	nr_events += 2;	/* 1 is required, 2 for good luck */

 	size = sizeof(struct aio_ring);
-	size += sizeof(struct io_event) * nr_events;
+	size += io_event_size(ctx) * nr_events;

 	nr_pages = PFN_UP(size);
 	if (nr_pages < 0)
@@ -533,7 +600,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)

 	ctx->aio_ring_file = file;
 	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
-			/ sizeof(struct io_event);
+			/ io_event_size(ctx);

 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
@@ -595,6 +662,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 		return -ENOMEM;
 	}

+	ctx->io_events = ctx->ring->io_events;
+
 	ctx->ring->nr = nr_events;	/* user copy */
 	ctx->ring->id = ~0U;
 	ctx->ring->head = ctx->ring->tail = 0;
@@ -766,8 +835,16 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	nr_events = max(nr_events, num_possible_cpus() * 4);
 	nr_events *= 2;

+	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+#ifdef CONFIG_COMPAT64
+	ctx->compat = in_compat_syscall();
+#endif
+
 	/* Prevent overflows */
-	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
+	if (nr_events > (0x10000000U / io_event_size(ctx))) {
 		pr_debug("ENOMEM: nr_events too high\n");
 		return ERR_PTR(-EINVAL);
 	}
@@ -775,10 +852,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
 		return ERR_PTR(-EAGAIN);

-	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
-
 	ctx->max_reqs = max_reqs;

 	spin_lock_init(&ctx->ctx_lock);
@@ -1138,7 +1211,7 @@ static void aio_complete(struct aio_kiocb *iocb)
 	tail = ctx->tail;

-	ctx->ring->io_events[tail] = iocb->ki_res;
+	copy_io_event_to_ring(ctx, tail, &iocb->ki_res);

 	pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n",
 		 ctx, tail, iocb, (void __user *)(unsigned long)iocb->ki_res.obj,
@@ -1196,7 +1269,8 @@ static inline void iocb_put(struct aio_kiocb *iocb)
  *	events fetched
  */
 static long aio_read_events_ring(struct kioctx *ctx,
-				 struct io_event __user *event, long nr)
+				 struct io_event __user *event, long idx,
+				 long nr)
 {
 	unsigned int head, tail;
 	long ret = 0;
@@ -1237,9 +1311,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 		avail = (head <= tail ? tail : ctx->nr_events) - head;
 		avail = min(avail, nr - ret);

-		copy_ret = copy_to_user(event + ret,
-					&ctx->ring->io_events[head],
-					sizeof(struct io_event) * avail);
+		copy_ret = copy_io_events_to_user(ctx, event, idx + ret, head, avail);

 		if (unlikely(copy_ret)) {
 			ret = -EFAULT;
@@ -1263,7 +1335,10 @@ static long aio_read_events_ring(struct kioctx *ctx,
 static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
 			    struct io_event __user *event, long *i)
 {
-	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
+	long ret = aio_read_events_ring(ctx,
+					event,
+					*i,
+					nr - *i);

 	if (ret > 0)
 		*i += ret;
@@ -1942,7 +2017,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 		req->ki_eventfd = eventfd;
 	}

-	if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
+	if (unlikely(put_user(KIOCB_KEY, aio_key_uptr(ctx, user_iocb)))) {
 		pr_debug("EFAULT: aio_key\n");
 		return -EFAULT;
 	}
@@ -2164,15 +2239,20 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	u32 key;
 	u64 obj = user_ptr_addr(iocb);

-	if (unlikely(get_user(key, &iocb->aio_key)))
-		return -EFAULT;
-	if (unlikely(key != KIOCB_KEY))
-		return -EINVAL;
-
 	ctx = lookup_ioctx(ctx_id);
 	if (unlikely(!ctx))
 		return -EINVAL;

+	if (unlikely(get_user(key, aio_key_uptr(ctx, iocb)))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (unlikely(key != KIOCB_KEY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	spin_lock_irq(&ctx->ctx_lock);
 	/* TODO: use a hash or array, this sucks. */
 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
@@ -2193,6 +2273,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		ret = -EINPROGRESS;
 	}

+out:
 	percpu_ref_put(&ctx->users);

 	return ret;