The aio ring mechanism relied on two underlying assumptions:

1. The size of the aio_ring header is equal to the size of an io_event,
   i.e. sizeof(aio_ring) == sizeof(io_event)
2. There is no leftover space at the end of a page when populating it
   with io_events, i.e. PAGE_SIZE is a multiple of sizeof(io_event)
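For illustration, a minimal sketch of the byte-offset arithmetic that holds even when neither assumption is true (not part of the patch; idx, page_idx and event_offset are illustrative names):

	/*
	 * Event 'idx' is located purely by byte offset from the start of the
	 * ring, so nothing is assumed about the header size or about leftover
	 * space at the end of a page.
	 */
	size_t off = sizeof(struct aio_ring) + idx * sizeof(struct io_event);
	unsigned int page_idx = off / PAGE_SIZE;  /* ring page holding the event's first byte */
	size_t event_offset = off % PAGE_SIZE;    /* offset of the event within that page */

This is the same arithmetic performed by get_event_pages_idx() introduced below.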
This change makes the aio_ring mechanism more robust so that it no longer relies on the above assumptions. It also improves the copying of io_events from the shared ring to userspace by batching them together into at most two copy_to_user calls.
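Why two calls are enough (a sketch of the reasoning, not patch code; first and second are illustrative names): the ring is circular, so the pending events between head and tail form at most two contiguous runs.

	long first, second;

	/* events from head up to tail, or up to the end of the ring */
	first = (head <= tail ? tail : ctx->nr_events) - head;
	/* a second run exists only when the pending events wrap around */
	second = (head <= tail) ? 0 : tail;

Each contiguous run can then be handed to copy_to_user in a single call.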
Mapping pages independently using kmap doesn't create a virtually contiguous address space. As some io_event structs could now span page boundaries, all the pages needed to access an event's data must be mapped into one virtually contiguous range. For this reason, kmap is replaced with vmap.
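As a rough sketch of the resulting access pattern (not the patch itself; locking and surrounding context are omitted, and va is an illustrative name):

	struct io_event ev;
	void *va;

	/* one contiguous kernel mapping over the pages backing the event */
	va = vmap(ctx->ring_pages + page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
	if (!va)
		return -ENOMEM;
	/* the event can be read even if it straddles a page boundary */
	memcpy(&ev, va + event_offset, sizeof(ev));
	vunmap(va);

vmap() builds a new kernel virtual mapping over the given pages and must be paired with vunmap() to release it.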
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 fs/aio.c | 80 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 28 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 079074f47c2e7..c835deda5cdcc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -603,9 +603,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 	return 0;
 }
 
-#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
-#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
-#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
+static unsigned int get_event_pages_idx(unsigned int event_idx,
+					unsigned int nr_events,
+					size_t *event_offset,
+					unsigned int *nr_pages)
+{
+	unsigned int page_idx;
+	size_t off;
+
+	off = sizeof(struct aio_ring);
+	off += sizeof(struct io_event) * event_idx;
+
+	page_idx = off / PAGE_SIZE;
+	*event_offset = offset_in_page(off);
+
+	off += sizeof(struct io_event) * nr_events - 1;
+	*nr_pages = off / PAGE_SIZE + 1 - page_idx;
+	return page_idx;
+}
 
 void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 {
@@ -1134,8 +1149,10 @@ static void aio_complete(struct aio_kiocb *iocb)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
-	struct io_event	*ev_page, *event;
-	unsigned tail, pos, head;
+	struct io_event	*event;
+	void *ring_pages;
+	size_t event_offset;
+	unsigned int tail, head, ctx_page_idx, nr_pages;
 	unsigned long	flags;
 
 	/*
@@ -1146,18 +1163,21 @@ static void aio_complete(struct aio_kiocb *iocb)
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 
 	tail = ctx->tail;
-	pos = tail + AIO_EVENTS_OFFSET;
 
-	if (++tail >= ctx->nr_events)
-		tail = 0;
-
-	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
-	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+	ctx_page_idx = get_event_pages_idx(tail, 1, &event_offset, &nr_pages);
+	ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (unlikely(!ring_pages)) {
+		pr_warn("Couldn't map aio ring event pages\n");
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+		return;
+	}
+	event = ring_pages + event_offset;
 
 	*event = iocb->ki_res;
 
-	kunmap_atomic(ev_page);
-	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+	vunmap(ring_pages);
+	for (unsigned int page_idx = ctx_page_idx; page_idx < ctx_page_idx + nr_pages; page_idx++)
+		flush_dcache_page(ctx->ring_pages[page_idx]);
 
 	pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
 		 (void __user *)(unsigned long)iocb->ki_res.obj,
@@ -1168,6 +1188,8 @@ static void aio_complete(struct aio_kiocb *iocb)
 	 */
 	smp_wmb();	/* make event visible before updating tail */
 
+	if (++tail >= ctx->nr_events)
+		tail = 0;
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
@@ -1219,9 +1241,8 @@ static long aio_read_events_ring(struct kioctx *ctx,
 				 struct io_event __user *event, long nr)
 {
 	struct aio_ring *ring;
-	unsigned head, tail, pos;
+	unsigned int head, tail;
 	long ret = 0;
-	int copy_ret;
 
 	/*
 	 * The mutex can block and wake us up and that will cause
@@ -1253,25 +1274,28 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail %= ctx->nr_events;
 
 	while (ret < nr) {
+		unsigned int ctx_page_idx, nr_pages;
+		void *ring_pages;
+		size_t event_offset;
 		long avail;
-		struct io_event *ev;
-		struct page *page;
+		int copy_ret;
 
-		avail = (head <= tail ? tail : ctx->nr_events) - head;
 		if (head == tail)
 			break;
-
-		pos = head + AIO_EVENTS_OFFSET;
-		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
-		pos %= AIO_EVENTS_PER_PAGE;
-
+		avail = (head <= tail ? tail : ctx->nr_events) - head;
 		avail = min(avail, nr - ret);
-		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
+		ctx_page_idx = get_event_pages_idx(head, avail, &event_offset, &nr_pages);
+		ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
+		if (!ring_pages) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		copy_ret = copy_to_user(event + ret,
+					ring_pages + event_offset,
+					sizeof(struct io_event) * avail);
 
-		ev = kmap(page);
-		copy_ret = copy_to_user(event + ret, ev + pos,
-					sizeof(*ev) * avail);
-		kunmap(page);
+		vunmap(ring_pages);
 
 		if (unlikely(copy_ret)) {
 			ret = -EFAULT;