On 23/06/2023 15:33, Tudor Cretu wrote:
The aio_ring mechanism relies on two assumptions:
- The size of the header of an aio_ring is equal to the size of an io_event, i.e. sizeof(aio_ring) equals sizeof(io_event)
- There is no leftover space at the end of a page when populating it with io_events, i.e. PAGE_SIZE is a multiple of sizeof(io_event)
These assumptions are not met in PCuABI. This change removes these assumptions, making the aio_ring mechanism more flexible and robust.
Mapping pages independently using kmap doesn't create a contiguous virtual space. As some io_event structs could span across different pages, all the pages necessary to access the events' data need to be mapped to a contiguous virtual address space. For this reason, the ring pages are vmapped at the context initialisation step, and the mapping is cached in the context.
In addition, this change improves the performance of the aio_ring mechanism. Previously, io_events were copied from the shared ring to userspace using multiple copy_to_user calls. This change batches the copying of io_events into a maximum of two copy_to_user calls.
The changes are similar to the approach taken in [0].
[0] https://lore.kernel.org/all/1349764760-21093-4-git-send-email-koverstreet@go...
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
fs/aio.c | 98 +++++++++++++++++++------------------------------------- 1 file changed, 33 insertions(+), 65 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index 1f235c446e89..c5e850c4a4d7 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -119,6 +119,7 @@ struct kioctx { /* Size of ringbuffer, in units of struct io_event */ unsigned nr_events;
- struct aio_ring *ring; unsigned long mmap_base; unsigned long mmap_size;
@@ -351,6 +352,9 @@ static void aio_free_ring(struct kioctx *ctx) { int i;
- if (ctx->ring)
vunmap(ctx->ring);
- /* Disconnect the kiotx from the ring file. This prevents future
*/
- accesses to the kioctx from page migration.
@@ -508,7 +512,6 @@ static const struct address_space_operations aio_ctx_aops = { static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) {
- struct aio_ring *ring; struct mm_struct *mm = current->mm; unsigned long size, unused; int nr_pages;
@@ -589,24 +592,23 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */
- ring = kmap_atomic(ctx->ring_pages[0]);
- ring->nr = nr_events; /* user copy */
- ring->id = ~0U;
- ring->head = ring->tail = 0;
- ring->magic = AIO_RING_MAGIC;
- ring->compat_features = AIO_RING_COMPAT_FEATURES;
- ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
- ring->header_length = sizeof(struct aio_ring);
- kunmap_atomic(ring);
- flush_dcache_page(ctx->ring_pages[0]);
Upon further consideration, I'm not entirely sure removing those flush_dcache_page() calls is safe. They were added by [1], which came after [0] was written, so the latter doesn't inform us on what to do with them. io_uring doesn't seem to need them, but it doesn't use an actual file or vmap(), so the situation might be different.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id...
[...]
@@ -1253,25 +1233,16 @@ static long aio_read_events_ring(struct kioctx *ctx, tail %= ctx->nr_events; while (ret < nr) {
long avail;
struct io_event *ev;
struct page *page;
long avail = (head <= tail ? tail : ctx->nr_events) - head;
avail = (head <= tail ? tail : ctx->nr_events) - head;
I think we can avoid this diff too.
Kevin
if (head == tail) break;
pos = head + AIO_EVENTS_OFFSET;
page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
pos %= AIO_EVENTS_PER_PAGE;
- avail = min(avail, nr - ret);
avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
ev = kmap(page);
copy_ret = copy_to_user(event + ret, ev + pos,
sizeof(*ev) * avail);
kunmap(page);
copy_ret = copy_to_user(event + ret,
&ctx->ring->io_events[head],
sizeof(struct io_event) * avail);
if (unlikely(copy_ret)) { ret = -EFAULT; @@ -1283,10 +1254,7 @@ static long aio_read_events_ring(struct kioctx *ctx, head %= ctx->nr_events; }
- ring = kmap_atomic(ctx->ring_pages[0]);
- ring->head = head;
- kunmap_atomic(ring);
- flush_dcache_page(ctx->ring_pages[0]);
- ctx->ring->head = head;
pr_debug("%li h%u t%u\n", ret, head, tail); out: