On 04/04/2023 15:53, Tudor Cretu wrote:
> Some userspace software, including libaio, uses the layouts of struct aio_ring and struct io_event, even if the uAPI headers don't expose them. This is typically done to skip the overhead of the io_getevents syscall by reading directly from the io_event array in the aio_ring, which is shared with userspace.
I don't really understand this paragraph. io_event is the main struct exposed by <uapi/linux/aio_abi.h>, which is required if only because that's what io_getevents() manipulates. struct aio_ring does not really contain pointers (the flexible array at the end is just syntactic sugar), so on the whole I don't think we need to invoke direct access to the ring to justify this change.
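For reference, the struct as it stands in fs/aio.c is nothing but fixed-size integer fields, plus the flexible array of events at the end (quoting mainline, with most comments trimmed):

struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;
	unsigned	tail;

	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */

	struct io_event	io_events[];
};

The only part whose layout differs between ABIs is the io_events array itself, and that is already covered by making the event size depend on the context, as this patch does.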
> As the io_event struct might contain pointers as members, introduce a compat version of the struct. Also, implement functions that convert the compat version to the native version of the struct.
>
> A subsequent patch is going to change the io_event struct to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
> Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
> ---
>  fs/aio.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 78 insertions(+), 16 deletions(-)
> diff --git a/fs/aio.c b/fs/aio.c
> index c835deda5cdcc..e6e45498ccd27 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -53,6 +53,13 @@
>  #define AIO_RING_MAGIC			0xa10a10a1
>  #define AIO_RING_COMPAT_FEATURES	1
>  #define AIO_RING_INCOMPAT_FEATURES	0
> +
> +struct compat_io_event {
> +	__u64		data;
> +	__u64		obj;
> +	__s64		res;
> +	__s64		res2;
> +};
Maybe adding it before struct compat_iocb would make more sense?
>
>  struct aio_ring {
>  	unsigned	id;	/* kernel internal index number */
>  	unsigned	nr;	/* number of io_events */
> @@ -164,6 +171,7 @@ struct kioctx {
>  	struct file		*aio_ring_file;
>
>  	unsigned		id;
> +	bool			compat;
Given that we only use it in compat64, and in fact only in is_compat64_aio_ctx() (besides initialising it), we could easily #ifdef it, so that it is a no-op unless CONFIG_COMPAT64 is actually selected.
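Something like this (just a sketch, not tested):

#ifdef CONFIG_COMPAT64
	bool			compat;
#endif

static inline bool is_compat64_aio_ctx(struct kioctx *ctx)
{
#ifdef CONFIG_COMPAT64
	return ctx->compat;
#else
	return false;
#endif
}

The assignment in aio_setup_ring() would then need the same #ifdef guard (or a small setter helper next to this one), since the member no longer exists in the !CONFIG_COMPAT64 case.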
>  };
>
>  /*
> @@ -280,6 +288,50 @@ static struct vfsmount *aio_mnt;
>  static const struct file_operations	aio_ring_fops;
>  static const struct address_space_operations aio_ctx_aops;
>
> +static inline bool is_compat64_aio_ctx(struct kioctx *ctx)
Maybe we could call it aio_in_compat64(), mirroring io_in_compat64() for io_uring.
> +{
> +	return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
> +}
> +
> +static inline size_t io_event_size(struct kioctx *ctx)
> +{
> +	return is_compat64_aio_ctx(ctx) ? sizeof(struct compat_io_event)
> +					: sizeof(struct io_event);
> +}
> +
> +static struct io_event __user *get_ith_user_io_event(struct kioctx *ctx,
> +						     struct io_event __user *event_array,
> +						     int i)
I think it would be nicer to avoid this sort of function, as it feels rather wonky when combined with the typing. The way it's used together with aio_read_events_ring() is a bit confusing: you effectively compute the address of the i-th event in the array, but return it as a struct io_event *, even though it may really point to a compat struct instead. Further inside aio_read_events_ring(), a certain number of events then get written through that pointer, effectively ignoring its type, since the size is computed based on the mode.

Since we fortunately only need to do this in one place, I would suggest the following alternative: have aio_read_events_ring() take the original, unmodified user-provided pointer and the index to write at separately. Then replace the copy_to_user() with a function that copies a certain number of events at a given offset in the destination. This way, the fact that the struct may have a different layout is hidden away in just one function.
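Something along these lines (untested, and the name is only a suggestion):

/*
 * Copy nr events from the vmap'ed ring pages to the user-provided event
 * array, starting at index offset, using whichever event layout the
 * context was set up with.
 */
static unsigned long copy_io_events_to_user(struct kioctx *ctx,
					    struct io_event __user *event_array,
					    long offset,
					    const void *ring_events,
					    long nr)
{
	size_t ev_size = io_event_size(ctx);

	return copy_to_user((char __user *)event_array + offset * ev_size,
			    ring_events, nr * ev_size);
}

aio_read_events_ring() would then take event and *i unmodified, and get_ith_user_io_event() would no longer be needed at all.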
> +{
> +	if (is_compat64_aio_ctx(ctx))
> +		return (struct io_event __user *)&((struct compat_io_event __user *)event_array)[i];
> +	return &event_array[i];
> +}
> +
> +static u32 __user *get_user_key_ptr(struct kioctx *ctx,
> +				    struct iocb __user *user_iocb)
To be more similar to io_event_size(), maybe aio_key_uptr()?
> +{
> +	return is_compat64_aio_ctx(ctx) ?
> +		&((struct compat_iocb __user *)user_iocb)->aio_key :
> +		&user_iocb->aio_key;
> +}
> +
> +static void copy_io_event_to_ring(struct kioctx *ctx,
> +				  struct io_event *native_event,
> +				  struct io_event *ring_event)
I'd swap the arguments as the destination is normally before the source.
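That is:

static void copy_io_event_to_ring(struct kioctx *ctx,
				  struct io_event *ring_event,
				  struct io_event *native_event)

(with the call site in aio_complete() updated to match).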
> +{
> +	if (is_compat64_aio_ctx(ctx)) {
> +		struct compat_io_event *compat_ring_event = (struct compat_io_event *)ring_event;
> +
> +		compat_ring_event->data = native_event->data;
> +		compat_ring_event->obj = native_event->obj;
> +		compat_ring_event->res = native_event->res;
> +		compat_ring_event->res2 = native_event->res2;
> +		return;
> +	}
> +	*ring_event = *native_event;
> +}
> +
>  static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
>  {
>  	struct file *file;
> @@ -519,7 +571,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  	nr_events += 2;	/* 1 is required, 2 for good luck */
>
>  	size = sizeof(struct aio_ring);
> -	size += sizeof(struct io_event) * nr_events;
> +	size += io_event_size(ctx) * nr_events;
>
>  	nr_pages = PFN_UP(size);
>  	if (nr_pages < 0)
> @@ -533,7 +585,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  	ctx->aio_ring_file = file;
>
>  	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
> -			/ sizeof(struct io_event);
> +			/ io_event_size(ctx);
There's another instance of sizeof(struct io_event) in ioctx_alloc(), which should probably be changed too.
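For clarity, I mean the overflow check near the top of ioctx_alloc() (quoting from memory, so double-check the exact line):

	/* Prevent overflows */
	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
		pr_debug("ENOMEM: nr_events too high\n");
		return ERR_PTR(-EINVAL);
	}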
>
>  	ctx->ring_pages = ctx->internal_pages;
>  	if (nr_pages > AIO_RING_PAGES) {
> @@ -588,6 +640,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>
>  	ctx->user_id = ctx->mmap_base;
>  	ctx->nr_events = nr_events; /* trusted copy */
> +	ctx->compat = in_compat_syscall();
>
>  	ring = kmap_atomic(ctx->ring_pages[0]);
>  	ring->nr = nr_events;	/* user copy */
> @@ -603,7 +656,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  	return 0;
>  }
>
> -static unsigned int get_event_pages_idx(unsigned int event_idx,
> +static unsigned int get_event_pages_idx(struct kioctx *ctx,
> +					unsigned int event_idx,
>  					unsigned int nr_events,
>  					size_t *event_offset,
>  					unsigned int *nr_pages)
> @@ -612,12 +666,12 @@ static unsigned int get_event_pages_idx(unsigned int event_idx,
>  	size_t off;
>
>  	off = sizeof(struct aio_ring);
> -	off += sizeof(struct io_event) * event_idx;
> +	off += io_event_size(ctx) * event_idx;
>
>  	page_idx = off / PAGE_SIZE;
>  	*event_offset = offset_in_page(off);
>
> -	off += sizeof(struct io_event) * nr_events - 1;
> +	off += io_event_size(ctx) * nr_events - 1;
>
>  	*nr_pages = off / PAGE_SIZE + 1 - page_idx;
>
>  	return page_idx;
>  }
> @@ -1164,7 +1218,7 @@ static void aio_complete(struct aio_kiocb *iocb)
>
>  	tail = ctx->tail;
> -	ctx_page_idx = get_event_pages_idx(tail, 1, &event_offset, &nr_pages);
> +	ctx_page_idx = get_event_pages_idx(ctx, tail, 1, &event_offset, &nr_pages);
>  	ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
>  	if (!unlikely(ring_pages)) {
>  		pr_warn("Couldn't map aio ring event pages\n");
> @@ -1173,7 +1227,7 @@ static void aio_complete(struct aio_kiocb *iocb)
>  	}
>
>  	event = ring_pages + event_offset;
> -	*event = iocb->ki_res;
> +	copy_io_event_to_ring(ctx, &iocb->ki_res, event);
>
>  	vunmap(ring_pages);
>  	for (unsigned int page_idx = ctx_page_idx; page_idx < ctx_page_idx + nr_pages; page_idx++)
> @@ -1284,7 +1338,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
>  			break;
>
>  		avail = (head <= tail ? tail : ctx->nr_events) - head;
>  		avail = min(avail, nr - ret);
> -		ctx_page_idx = get_event_pages_idx(head, avail, &event_offset, &nr_pages);
> +		ctx_page_idx = get_event_pages_idx(ctx, head, avail, &event_offset, &nr_pages);
>  		ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
>  		if (!ring_pages) {
>  			ret = -ENOMEM;
> @@ -1293,7 +1347,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
>
>  		copy_ret = copy_to_user(event, ring_pages + event_offset,
> -					sizeof(struct io_event) * avail);
> +					io_event_size(ctx) * avail);
>
>  		vunmap(ring_pages);
> @@ -1322,7 +1376,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
>
>  static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
>  			    struct io_event __user *event, long *i)
>  {
> -	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
> +	long ret = aio_read_events_ring(ctx,
> +					get_ith_user_io_event(ctx, event, *i),
> +					nr - *i);
>
>  	if (ret > 0)
>  		*i += ret;
> @@ -2001,7 +2057,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
Just noticed that the compat handler for io_submit takes nr as int, while the native handler takes it as long. It is very unlikely to make any difference in practice, but it could be worth changing it to compat_long_t for consistency (in a separate patch).
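Something like this, assuming the compat handler still looks like mainline's:

-COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
-		       int, nr, compat_uptr_t __user *, iocbpp)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+		       compat_long_t, nr, compat_uptr_t __user *, iocbpp)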
Kevin
>  		req->ki_eventfd = eventfd;
>  	}
>
> -	if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
> +	if (unlikely(put_user(KIOCB_KEY, get_user_key_ptr(ctx, user_iocb)))) {
>  		pr_debug("EFAULT: aio_key\n");
>  		return -EFAULT;
>  	}
> @@ -2223,15 +2279,20 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
>  	u32 key;
>  	u64 obj = user_ptr_addr(iocb);
>
> -	if (unlikely(get_user(key, &iocb->aio_key)))
> -		return -EFAULT;
> -	if (unlikely(key != KIOCB_KEY))
> -		return -EINVAL;
> -
>  	ctx = lookup_ioctx(ctx_id);
>  	if (unlikely(!ctx))
>  		return -EINVAL;
>
> +	if (unlikely(get_user(key, get_user_key_ptr(ctx, iocb)))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +	if (unlikely(key != KIOCB_KEY)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
>  	spin_lock_irq(&ctx->ctx_lock);
>  	/* TODO: use a hash or array, this sucks. */
>  	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
> @@ -2252,6 +2313,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
>  		ret = -EINPROGRESS;
>  	}
>
> +out:
>  	percpu_ref_put(&ctx->users);
>  	return ret;