This series enables purecap applications to use the aio_ring shared memory region directly, bypassing the overhead of the io_getevents syscall. This functionality is also used by libaio.
With these patches, all io_* LTP tests pass in both Purecap and plain AArch64 modes. Note that the LTP tests only exercise the basic functionality of the aio subsystem; a significant portion of it is not covered by LTP.
For more comprehensive testing, libaio has been updated to the new uAPI and ported. All libaio tests pass in both Purecap and plain AArch64 modes.
Gitlab issue: https://git.morello-project.org/morello/kernel/linux/-/issues/49
Review branch: https://git.morello-project.org/tudcre01/linux/-/commits/morello/aio_v1
Tudor Cretu (4):
  aio: Fix the relationship between ctx pages and io_events array
  aio: Implement compat handling for the io_event struct
  aio: Allow capability tag access on the shared memory
  aio: Use user pointer type in the io_event struct and aio_context_t
 fs/aio.c                     | 197 ++++++++++++++++++++++++++---------
 include/uapi/linux/aio_abi.h |  12 +--
 2 files changed, 153 insertions(+), 56 deletions(-)
There were two underlying assumptions:
1. The size of the aio_ring header is equal to the size of an io_event, i.e. sizeof(struct aio_ring) == sizeof(struct io_event)
2. There is no leftover space at the end of a page when populating it with io_events, i.e. PAGE_SIZE is a multiple of sizeof(struct io_event)
This change makes the aio_ring mechanism more robust so that it no longer relies on the above assumptions. It also improves the copying of io_events from the shared ring to userspace by batching them into at most two copy_to_user calls.
Mapping pages independently using kmap does not create a virtually contiguous space. As some io_event structs may span two pages, all the pages needed to access an event's data must be mapped into a virtually contiguous region. For this reason, kmap is replaced with vmap.
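As an illustration of why the second assumption no longer holds (the sizes below are only indicative, matching the 48-byte io_event discussed later in the thread): with PAGE_SIZE = 4096, a 32-byte struct aio_ring header and a 48-byte struct io_event, the event at index 84 starts at offset 32 + 84 * 48 = 4064 and ends at 4064 + 48 = 4112, i.e. it straddles the boundary between the first and the second ring page.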
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 fs/aio.c | 80 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 28 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 079074f47c2e7..c835deda5cdcc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -603,9 +603,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 	return 0;
 }
 
-#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
-#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
-#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
+static unsigned int get_event_pages_idx(unsigned int event_idx,
+					unsigned int nr_events,
+					size_t *event_offset,
+					unsigned int *nr_pages)
+{
+	unsigned int page_idx;
+	size_t off;
+
+	off = sizeof(struct aio_ring);
+	off += sizeof(struct io_event) * event_idx;
+
+	page_idx = off / PAGE_SIZE;
+	*event_offset = offset_in_page(off);
+
+	off += sizeof(struct io_event) * nr_events - 1;
+	*nr_pages = off / PAGE_SIZE + 1 - page_idx;
+	return page_idx;
+}
 
 void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 {
@@ -1134,8 +1149,10 @@ static void aio_complete(struct aio_kiocb *iocb)
 {
 	struct kioctx *ctx = iocb->ki_ctx;
 	struct aio_ring *ring;
-	struct io_event *ev_page, *event;
-	unsigned tail, pos, head;
+	struct io_event *event;
+	void *ring_pages;
+	size_t event_offset;
+	unsigned int tail, head, ctx_page_idx, nr_pages;
 	unsigned long flags;
 
 	/*
@@ -1146,18 +1163,21 @@ static void aio_complete(struct aio_kiocb *iocb)
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 
 	tail = ctx->tail;
-	pos = tail + AIO_EVENTS_OFFSET;
 
-	if (++tail >= ctx->nr_events)
-		tail = 0;
-
-	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
-	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+	ctx_page_idx = get_event_pages_idx(tail, 1, &event_offset, &nr_pages);
+	ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!unlikely(ring_pages)) {
+		pr_warn("Couldn't map aio ring event pages\n");
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+		return;
+	}
+	event = ring_pages + event_offset;
 
 	*event = iocb->ki_res;
 
-	kunmap_atomic(ev_page);
-	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+	vunmap(ring_pages);
+	for (unsigned int page_idx = ctx_page_idx; page_idx < ctx_page_idx + nr_pages; page_idx++)
+		flush_dcache_page(ctx->ring_pages[page_idx]);
 
 	pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
 		 (void __user *)(unsigned long)iocb->ki_res.obj,
@@ -1168,6 +1188,8 @@ static void aio_complete(struct aio_kiocb *iocb)
 	 */
 	smp_wmb();	/* make event visible before updating tail */
 
+	if (++tail >= ctx->nr_events)
+		tail = 0;
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
@@ -1219,9 +1241,8 @@ static long aio_read_events_ring(struct kioctx *ctx,
 				 struct io_event __user *event, long nr)
 {
 	struct aio_ring *ring;
-	unsigned head, tail, pos;
+	unsigned int head, tail;
 	long ret = 0;
-	int copy_ret;
 	/*
 	 * The mutex can block and wake us up and that will cause
@@ -1253,25 +1274,28 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail %= ctx->nr_events;
 
 	while (ret < nr) {
+		unsigned int ctx_page_idx, nr_pages;
+		void *ring_pages;
+		size_t event_offset;
 		long avail;
-		struct io_event *ev;
-		struct page *page;
+		int copy_ret;
 
-		avail = (head <= tail ? tail : ctx->nr_events) - head;
 		if (head == tail)
 			break;
-
-		pos = head + AIO_EVENTS_OFFSET;
-		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
-		pos %= AIO_EVENTS_PER_PAGE;
-
+		avail = (head <= tail ? tail : ctx->nr_events) - head;
 		avail = min(avail, nr - ret);
-		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
+		ctx_page_idx = get_event_pages_idx(head, avail, &event_offset, &nr_pages);
+		ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
+		if (!ring_pages) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		copy_ret = copy_to_user(event,
+					ring_pages + event_offset,
+					sizeof(struct io_event) * avail);
 
-		ev = kmap(page);
-		copy_ret = copy_to_user(event + ret, ev + pos,
-					sizeof(*ev) * avail);
-		kunmap(page);
+		vunmap(ring_pages);
 
 		if (unlikely(copy_ret)) {
 			ret = -EFAULT;
In commit title: I'm a bit hesitant to call this a fix, as it doesn't actually fix anything if the size of io_event remains the same. Maybe just talk directly about using vmap? The commit message itself clarifies the point of doing that.
On 04/04/2023 15:53, Tudor Cretu wrote:
There were two underlying assumptions:
- Size of the header of aio_ring is equal to the size of an io_event sizeof(aio_ring) == sizeof(io_event)
- There is no leftover space at the end of a page when populating it with io_events, i.e. PAGE_SIZE is a multiple of sizeof(io_event)
This change aims to make the aio_ring mechanism more robust such that it doesn't rely anymore on the above assumptions. It also improves the copying of the io_events from the shared ring to the userspace, by batching them together in a maximum of two copy_to_user calls.
Mapping pages independently using kmap doesn't create a virtually contiguous space. As some io_event structs could span across different pages, all the pages necessary to access the events' data need to be mapped to a virtually contiguous space. For this reason, kmap is replaced with vmap.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/aio.c | 80 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 28 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index 079074f47c2e7..c835deda5cdcc 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -603,9 +603,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) return 0; } -#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) -#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) -#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) +static unsigned int get_event_pages_idx(unsigned int event_idx,
unsigned int nr_events,
size_t *event_offset,
unsigned int *nr_pages)
+{
- unsigned int page_idx;
- size_t off;
- off = sizeof(struct aio_ring);
- off += sizeof(struct io_event) * event_idx;
- page_idx = off / PAGE_SIZE;
- *event_offset = offset_in_page(off);
- off += sizeof(struct io_event) * nr_events - 1;
- *nr_pages = off / PAGE_SIZE + 1 - page_idx;
- return page_idx;
+} void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { @@ -1134,8 +1149,10 @@ static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring;
- struct io_event *ev_page, *event;
- unsigned tail, pos, head;
- struct io_event *event;
- void *ring_pages;
- size_t event_offset;
- unsigned int tail, head, ctx_page_idx, nr_pages; unsigned long flags;
/* @@ -1146,18 +1163,21 @@ static void aio_complete(struct aio_kiocb *iocb) spin_lock_irqsave(&ctx->completion_lock, flags); tail = ctx->tail;
- pos = tail + AIO_EVENTS_OFFSET;
- if (++tail >= ctx->nr_events)
tail = 0;
- ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
- event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- ctx_page_idx = get_event_pages_idx(tail, 1, &event_offset, &nr_pages);
- ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
I've pondered on this for a while, in the end the current approach does worry me in terms of performance. The io_getevents() case is not a very big deal, you can fetch multiple pages of events at the same time this way and anyway high-performance applications will probably just read directly from the ring. OTOH there is no way around io_submit(), and here we are effectively performing a vmap() + vunmap() for every event in the array, which seems very wasteful.
Very interestingly, while I was checking whether the overhead of vmap() had been measured, I stumbled upon an aio patch [1] that also replaces kmap with vmap! A big difference is that it sets up the mapping when creating the ring and then keeps it around, allowing the ring to be accessed directly at all times (a bit like io_uring). There are some discussions on this patch, including performance-wise, but unfortunately no conclusion and it looks like the author then lost interest.
In our situation, where struct io_event ends up being 48 bytes, switching to vmap seems to be justified, if only to keep things reasonably simple. It does feel though that the approach in [1] makes more sense - we do increase TLB pressure by keeping the kernel mapping, but that should be a negligible cost compared to calling vmap/vunmap in a loop.
I don't think [1] applies as-is and it might not be a perfect solution anyway, so feel free to adapt it or just take some inspiration from it.
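For the sake of illustration, keeping the mapping around could look roughly like this (ring_mapped is a made-up field name, completely untested):

/* in aio_setup_ring(), once ctx->ring_pages[] has been populated */
ctx->ring_mapped = vmap(ctx->ring_pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (!ctx->ring_mapped)
	return -ENOMEM;

/* aio_complete() / aio_read_events_ring() then compute offsets into
 * ctx->ring_mapped directly, without any vmap()/vunmap() per access */

/* in aio_free_ring() */
if (ctx->ring_mapped)
	vunmap(ctx->ring_mapped);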
Kevin
[1] https://lore.kernel.org/all/1349764760-21093-4-git-send-email-koverstreet@go...
On 05-06-2023 09:39, Kevin Brodsky wrote:
In commit title: I'm a bit hesitant to call this a fix, as it doesn't actually fix anything if the size of io_event remains the same. Maybe just talk directly about using vmap? The commit message itself clarifies the point of doing that.
Makes sense. Done!
On 04/04/2023 15:53, Tudor Cretu wrote:
There were two underlying assumptions:
- Size of the header of aio_ring is equal to the size of an io_event sizeof(aio_ring) == sizeof(io_event)
- There is no leftover space at the end of a page when populating it with io_events, i.e. PAGE_SIZE is a multiple of sizeof(io_event)
This change aims to make the aio_ring mechanism more robust such that it doesn't rely anymore on the above assumptions. It also improves the copying of the io_events from the shared ring to the userspace, by batching them together in a maximum of two copy_to_user calls.
Mapping pages independently using kmap doesn't create a virtually contiguous space. As some io_event structs could span across different pages, all the pages necessary to access the events' data need to be mapped to a virtually contiguous space. For this reason, kmap is replaced with vmap.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/aio.c | 80 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 28 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index 079074f47c2e7..c835deda5cdcc 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -603,9 +603,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) return 0; } -#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) -#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) -#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) +static unsigned int get_event_pages_idx(unsigned int event_idx,
unsigned int nr_events,
size_t *event_offset,
unsigned int *nr_pages)
+{
- unsigned int page_idx;
- size_t off;
- off = sizeof(struct aio_ring);
- off += sizeof(struct io_event) * event_idx;
- page_idx = off / PAGE_SIZE;
- *event_offset = offset_in_page(off);
- off += sizeof(struct io_event) * nr_events - 1;
- *nr_pages = off / PAGE_SIZE + 1 - page_idx;
- return page_idx;
+} void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { @@ -1134,8 +1149,10 @@ static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring;
- struct io_event *ev_page, *event;
- unsigned tail, pos, head;
- struct io_event *event;
- void *ring_pages;
- size_t event_offset;
- unsigned int tail, head, ctx_page_idx, nr_pages; unsigned long flags;
/* @@ -1146,18 +1163,21 @@ static void aio_complete(struct aio_kiocb *iocb) spin_lock_irqsave(&ctx->completion_lock, flags); tail = ctx->tail;
- pos = tail + AIO_EVENTS_OFFSET;
- if (++tail >= ctx->nr_events)
tail = 0;
- ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
- event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- ctx_page_idx = get_event_pages_idx(tail, 1, &event_offset, &nr_pages);
- ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
I've pondered on this for a while, in the end the current approach does worry me in terms of performance. The io_getevents() case is not a very big deal, you can fetch multiple pages of events at the same time this way and anyway high-performance applications will probably just read directly from the ring. OTOH there is no way around io_submit(), and here we are effectively performing a vmap() + vunmap() for every event in the array, which seems very wasteful.
Very interestingly, while I was checking whether the overhead of vmap() had been measured, I stumbled upon an aio patch [1] that also replaces kmap with vmap! A big difference is that it sets up the mapping when creating the ring and then keeps it around, allowing the ring to be accessed directly at all times (a bit like io_uring). There are some discussions on this patch, including performance-wise, but unfortunately no conclusion and it looks like the author then lost interest.
In our situation, where struct io_event ends up being 48 bytes, switching to vmap seems to be justified, if only to keep things reasonably simple. It does feel though that the approach in [1] makes more sense - we do increase TLB pressure by keeping the kernel mapping, but that should be a negligible cost compared to calling vmap/vunmap in a loop.
I don't think [1] applies as-is and it might not be a perfect solution anyway, so feel free to adapt it or just take some inspiration from it.
I wanted the changes to be as noninvasive as possible, but I agree that the overhead is just too much and this solution is much better. [1] is a great find and it confirms that it's a viable change in approach. Many thanks!
Tudor
Kevin
[1] https://lore.kernel.org/all/1349764760-21093-4-git-send-email-koverstreet@go...
Some userspace software, including libaio, uses the layouts of struct aio_ring and struct io_event, even though the uAPI headers don't expose them. This is typically done to skip the io_getevents syscall's overhead by reading directly from the io_event array in the aio_ring, which is shared with userspace.
As the io_event struct might contain pointers as members, introduce a compat version of the struct. Also, implement functions that convert the compat version to the native version of the struct.
A subsequent patch is going to change the io_event struct to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 fs/aio.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 78 insertions(+), 16 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index c835deda5cdcc..e6e45498ccd27 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -53,6 +53,13 @@ #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 +struct compat_io_event { + __u64 data; + __u64 obj; + __s64 res; + __s64 res2; +}; + struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ @@ -164,6 +171,7 @@ struct kioctx { struct file *aio_ring_file;
unsigned id; + bool compat; };
/* @@ -280,6 +288,50 @@ static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops;
+static inline bool is_compat64_aio_ctx(struct kioctx *ctx)
+{
+	return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
+}
+
+static inline size_t io_event_size(struct kioctx *ctx)
+{
+	return is_compat64_aio_ctx(ctx) ? sizeof(struct compat_io_event)
+					: sizeof(struct io_event);
+}
+
+static struct io_event __user *get_ith_user_io_event(struct kioctx *ctx,
+						     struct io_event __user *event_array,
+						     int i)
+{
+	if (is_compat64_aio_ctx(ctx))
+		return (struct io_event __user *)&((struct compat_io_event __user *)event_array)[i];
+	return &event_array[i];
+}
+
+static u32 __user *get_user_key_ptr(struct kioctx *ctx,
+				    struct iocb __user *user_iocb)
+{
+	return is_compat64_aio_ctx(ctx) ?
+		&((struct compat_iocb __user *)user_iocb)->aio_key :
+		&user_iocb->aio_key;
+}
+
+static void copy_io_event_to_ring(struct kioctx *ctx,
+				  struct io_event *native_event,
+				  struct io_event *ring_event)
+{
+	if (is_compat64_aio_ctx(ctx)) {
+		struct compat_io_event *compat_ring_event = (struct compat_io_event *)ring_event;
+
+		compat_ring_event->data = native_event->data;
+		compat_ring_event->obj = native_event->obj;
+		compat_ring_event->res = native_event->res;
+		compat_ring_event->res2 = native_event->res2;
+		return;
+	}
+	*ring_event = *native_event;
+}
+
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct file *file;
@@ -519,7 +571,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 		nr_events += 2; /* 1 is required, 2 for good luck */
size = sizeof(struct aio_ring); - size += sizeof(struct io_event) * nr_events; + size += io_event_size(ctx) * nr_events;
nr_pages = PFN_UP(size); if (nr_pages < 0) @@ -533,7 +585,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ctx->aio_ring_file = file; nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) - / sizeof(struct io_event); + / io_event_size(ctx);
ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { @@ -588,6 +640,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ + ctx->compat = in_compat_syscall();
ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ @@ -603,7 +656,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) return 0; }
-static unsigned int get_event_pages_idx(unsigned int event_idx, +static unsigned int get_event_pages_idx(struct kioctx *ctx, + unsigned int event_idx, unsigned int nr_events, size_t *event_offset, unsigned int *nr_pages) @@ -612,12 +666,12 @@ static unsigned int get_event_pages_idx(unsigned int event_idx, size_t off;
off = sizeof(struct aio_ring); - off += sizeof(struct io_event) * event_idx; + off += io_event_size(ctx) * event_idx;
page_idx = off / PAGE_SIZE; *event_offset = offset_in_page(off);
- off += sizeof(struct io_event) * nr_events - 1; + off += io_event_size(ctx) * nr_events - 1; *nr_pages = off / PAGE_SIZE + 1 - page_idx; return page_idx; } @@ -1164,7 +1218,7 @@ static void aio_complete(struct aio_kiocb *iocb)
tail = ctx->tail;
- ctx_page_idx = get_event_pages_idx(tail, 1, &event_offset, &nr_pages); + ctx_page_idx = get_event_pages_idx(ctx, tail, 1, &event_offset, &nr_pages); ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL); if (!unlikely(ring_pages)) { pr_warn("Couldn't map aio ring event pages\n"); @@ -1173,7 +1227,7 @@ static void aio_complete(struct aio_kiocb *iocb) } event = ring_pages + event_offset;
- *event = iocb->ki_res; + copy_io_event_to_ring(ctx, &iocb->ki_res, event);
vunmap(ring_pages); for (unsigned int page_idx = ctx_page_idx; page_idx < ctx_page_idx + nr_pages; page_idx++) @@ -1284,7 +1338,7 @@ static long aio_read_events_ring(struct kioctx *ctx, break; avail = (head <= tail ? tail : ctx->nr_events) - head; avail = min(avail, nr - ret); - ctx_page_idx = get_event_pages_idx(head, avail, &event_offset, &nr_pages); + ctx_page_idx = get_event_pages_idx(ctx, head, avail, &event_offset, &nr_pages); ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL); if (!ring_pages) { ret = -ENOMEM; @@ -1293,7 +1347,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
copy_ret = copy_to_user(event, ring_pages + event_offset, - sizeof(struct io_event) * avail); + io_event_size(ctx) * avail);
vunmap(ring_pages);
@@ -1322,7 +1376,9 @@ static long aio_read_events_ring(struct kioctx *ctx, static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, long *i) { - long ret = aio_read_events_ring(ctx, event + *i, nr - *i); + long ret = aio_read_events_ring(ctx, + get_ith_user_io_event(ctx, event, *i), + nr - *i);
if (ret > 0) *i += ret; @@ -2001,7 +2057,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, req->ki_eventfd = eventfd; }
- if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) { + if (unlikely(put_user(KIOCB_KEY, get_user_key_ptr(ctx, user_iocb)))) { pr_debug("EFAULT: aio_key\n"); return -EFAULT; } @@ -2223,15 +2279,20 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, u32 key; u64 obj = user_ptr_addr(iocb);
- if (unlikely(get_user(key, &iocb->aio_key))) - return -EFAULT; - if (unlikely(key != KIOCB_KEY)) - return -EINVAL; - ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL;
+ if (unlikely(get_user(key, get_user_key_ptr(ctx, iocb)))) { + ret = -EFAULT; + goto out; + } + + if (unlikely(key != KIOCB_KEY)) { + ret = -EINVAL; + goto out; + } + spin_lock_irq(&ctx->ctx_lock); /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { @@ -2252,6 +2313,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, ret = -EINPROGRESS; }
+out: percpu_ref_put(&ctx->users);
return ret;
On 04/04/2023 15:53, Tudor Cretu wrote:
Some userspace software, including libaio, use the layouts for struct aio_ring and struct io_event, even if the uAPI headers don't expose them. This is typically done to skip the io_getevents syscall's overhead by reading directly from the io_event array in the aio_ring, which is shared with the userspace.
I don't really understand this paragraph. io_event is the main struct exposed by <uapi/linux/aio_abi.h>, which is required if only because that's what io_getevents() manipulates. struct aio_ring does not really contain pointers (the flexible array at the end is just syntactic sugar), so on the whole I don't think we need to invoke direct access to the ring to justify this change.
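As a reminder, struct aio_ring is essentially the following (as defined in fs/aio.c, comments trimmed) - no pointer members, just the flexible io_events[] array at the end:

struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;
	unsigned	tail;

	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */

	struct io_event	io_events[];
};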
As the io_event struct might contain pointers as members, introduce a compat version of the struct. Also, implement functions that convert the compat version to the native version of the struct.
A subsequent patch is going to change the io_event struct to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/aio.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 16 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index c835deda5cdcc..e6e45498ccd27 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -53,6 +53,13 @@ #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 +struct compat_io_event {
- __u64 data;
- __u64 obj;
- __s64 res;
- __s64 res2;
+};
Maybe adding it before struct compat_iocb would make more sense?
struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ @@ -164,6 +171,7 @@ struct kioctx { struct file *aio_ring_file; unsigned id;
- bool compat;
Given that we use it only in compat64, and in fact only in is_compat64_aio_ctx() (besides initialising it), we could easily #ifdef it - meaning no-op unless CONFIG_COMPAT64 is actually selected.
}; /* @@ -280,6 +288,50 @@ static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops; +static inline bool is_compat64_aio_ctx(struct kioctx *ctx)
Maybe we could call it aio_in_compat64(), mirroring io_in_compat64() for io_uring.
+{
- return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
+}
+static inline size_t io_event_size(struct kioctx *ctx) +{
- return is_compat64_aio_ctx(ctx) ? sizeof(struct compat_io_event)
: sizeof(struct io_event);
+}
+static struct io_event __user *get_ith_user_io_event(struct kioctx *ctx,
struct io_event __user *event_array,
int i)
I think it would be nicer to avoid this sort of function as it feels rather wonky when combined with the typing. The way it's used together with aio_read_events_ring() is a bit confusing because you effectively compute the address of the event in the array then return a struct io_event *, even though it may really be a compat struct instead. Further inside aio_read_events_ring() a certain number of events then get written, effectively ignoring the type as the size is computed based on the mode.
Since we fortunately only need to do this in one place, I would suggest the following alternative: have aio_read_events_ring() take the original, unmodified user-provided pointer and the index to write at separately. Then replace the copy_to_user() with a function that copies a certain number of events at a given offset in the destination. This way the fact that the struct may have a different layout is hidden away in just one function.
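Roughly something along these lines, perhaps (untested sketch, naming entirely up to you):

/*
 * Copy nr ring events (already laid out in the ctx's format) to the
 * user-provided array, starting at index 'offset' in the destination.
 * The compat64 layout is handled here and nowhere else.
 */
static unsigned long copy_io_events_to_user(struct kioctx *ctx,
					    struct io_event __user *uevents,
					    long offset,
					    const void *ring_events,
					    long nr)
{
	void __user *dst;

	if (is_compat64_aio_ctx(ctx))
		dst = (struct compat_io_event __user *)uevents + offset;
	else
		dst = uevents + offset;

	return copy_to_user(dst, ring_events, io_event_size(ctx) * nr);
}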
+{
- if (is_compat64_aio_ctx(ctx))
return (struct io_event __user *)&((struct compat_io_event __user *)event_array)[i];
- return &event_array[i];
+}
+static u32 __user *get_user_key_ptr(struct kioctx *ctx,
struct iocb __user *user_iocb)
To be more similar to io_event_size(), maybe aio_key_uptr()?
+{
- return is_compat64_aio_ctx(ctx) ?
&((struct compat_iocb __user *)user_iocb)->aio_key :
&user_iocb->aio_key;
+}
+static void copy_io_event_to_ring(struct kioctx *ctx,
struct io_event *native_event,
struct io_event *ring_event)
I'd swap the arguments as the destination is normally before the source.
+{
- if (is_compat64_aio_ctx(ctx)) {
struct compat_io_event *compat_ring_event = (struct compat_io_event *)ring_event;
compat_ring_event->data = native_event->data;
compat_ring_event->obj = native_event->obj;
compat_ring_event->res = native_event->res;
compat_ring_event->res2 = native_event->res2;
return;
- }
- *ring_event = *native_event;
+}
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { struct file *file; @@ -519,7 +571,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring);
- size += sizeof(struct io_event) * nr_events;
- size += io_event_size(ctx) * nr_events;
nr_pages = PFN_UP(size); if (nr_pages < 0) @@ -533,7 +585,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->aio_ring_file = file; nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
/ sizeof(struct io_event);
/ io_event_size(ctx);
There's another instance of sizeof(struct io_event) in ioctx_alloc(), probably should be changed too.
ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { @@ -588,6 +640,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */
- ctx->compat = in_compat_syscall();
ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ @@ -603,7 +656,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) return 0; } -static unsigned int get_event_pages_idx(unsigned int event_idx, +static unsigned int get_event_pages_idx(struct kioctx *ctx,
unsigned int event_idx, unsigned int nr_events, size_t *event_offset, unsigned int *nr_pages)
@@ -612,12 +666,12 @@ static unsigned int get_event_pages_idx(unsigned int event_idx, size_t off; off = sizeof(struct aio_ring);
- off += sizeof(struct io_event) * event_idx;
- off += io_event_size(ctx) * event_idx;
page_idx = off / PAGE_SIZE; *event_offset = offset_in_page(off);
- off += sizeof(struct io_event) * nr_events - 1;
- off += io_event_size(ctx) * nr_events - 1; *nr_pages = off / PAGE_SIZE + 1 - page_idx; return page_idx;
} @@ -1164,7 +1218,7 @@ static void aio_complete(struct aio_kiocb *iocb) tail = ctx->tail;
- ctx_page_idx = get_event_pages_idx(tail, 1, &event_offset, &nr_pages);
- ctx_page_idx = get_event_pages_idx(ctx, tail, 1, &event_offset, &nr_pages); ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL); if (!unlikely(ring_pages)) { pr_warn("Couldn't map aio ring event pages\n");
@@ -1173,7 +1227,7 @@ static void aio_complete(struct aio_kiocb *iocb) } event = ring_pages + event_offset;
- *event = iocb->ki_res;
- copy_io_event_to_ring(ctx, &iocb->ki_res, event);
vunmap(ring_pages); for (unsigned int page_idx = ctx_page_idx; page_idx < ctx_page_idx + nr_pages; page_idx++) @@ -1284,7 +1338,7 @@ static long aio_read_events_ring(struct kioctx *ctx, break; avail = (head <= tail ? tail : ctx->nr_events) - head; avail = min(avail, nr - ret);
-		ctx_page_idx = get_event_pages_idx(head, avail, &event_offset, &nr_pages);
+		ctx_page_idx = get_event_pages_idx(ctx, head, avail, &event_offset, &nr_pages);
 		ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
 		if (!ring_pages) {
 			ret = -ENOMEM;
@@ -1293,7 +1347,7 @@ static long aio_read_events_ring(struct kioctx *ctx, copy_ret = copy_to_user(event, ring_pages + event_offset,
sizeof(struct io_event) * avail);
io_event_size(ctx) * avail);
vunmap(ring_pages); @@ -1322,7 +1376,9 @@ static long aio_read_events_ring(struct kioctx *ctx, static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, long *i) {
- long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
- long ret = aio_read_events_ring(ctx,
get_ith_user_io_event(ctx, event, *i),
nr - *i);
if (ret > 0) *i += ret; @@ -2001,7 +2057,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
Just noticed that the compat handler for io_submit takes nr as int while the native handler takes it as long. Very unlikely to make any difference in practice but could be worth changing it to compat_long_t for consistency (in a separate patch).
Kevin
req->ki_eventfd = eventfd;
}
- if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
- if (unlikely(put_user(KIOCB_KEY, get_user_key_ptr(ctx, user_iocb)))) { pr_debug("EFAULT: aio_key\n"); return -EFAULT; }
@@ -2223,15 +2279,20 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, u32 key; u64 obj = user_ptr_addr(iocb);
- if (unlikely(get_user(key, &iocb->aio_key)))
return -EFAULT;
- if (unlikely(key != KIOCB_KEY))
return -EINVAL;
- ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL;
- if (unlikely(get_user(key, get_user_key_ptr(ctx, iocb)))) {
ret = -EFAULT;
goto out;
- }
- if (unlikely(key != KIOCB_KEY)) {
ret = -EINVAL;
goto out;
- }
- spin_lock_irq(&ctx->ctx_lock); /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
@@ -2252,6 +2313,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, ret = -EINPROGRESS; } +out: percpu_ref_put(&ctx->users); return ret;
On 05-06-2023 09:41, Kevin Brodsky wrote:
On 04/04/2023 15:53, Tudor Cretu wrote:
Some userspace software, including libaio, use the layouts for struct aio_ring and struct io_event, even if the uAPI headers don't expose them. This is typically done to skip the io_getevents syscall's overhead by reading directly from the io_event array in the aio_ring, which is shared with the userspace.
I don't really understand this paragraph. io_event is the main struct exposed by <uapi/linux/aio_abi.h>, which is required if only because that's what io_getevents() manipulates. struct aio_ring does not really contain pointers (the flexible array at the end is just syntactic sugar), so on the whole I don't think we need to invoke direct access to the ring to justify this change.
I don't really understand it either anymore :). I've updated it, thank you!
As the io_event struct might contain pointers as members, introduce a compat version of the struct. Also, implement functions that convert the compat version to the native version of the struct.
A subsequent patch is going to change the io_event struct to enable it to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/aio.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 16 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index c835deda5cdcc..e6e45498ccd27 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -53,6 +53,13 @@ #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 +struct compat_io_event {
- __u64 data;
- __u64 obj;
- __s64 res;
- __s64 res2;
+};
Maybe adding it before struct compat_iocb would make more sense?
- struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */
@@ -164,6 +171,7 @@ struct kioctx { struct file *aio_ring_file; unsigned id;
- bool compat;
Given that we use it only in compat64, and in fact only in is_compat64_aio_ctx() (besides initialising it), we could easily #ifdef it - meaning no-op unless CONFIG_COMPAT64 is actually selected.
Done!
}; /* @@ -280,6 +288,50 @@ static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops; +static inline bool is_compat64_aio_ctx(struct kioctx *ctx)
Maybe we could call it aio_in_compat64(), mirroring io_in_compat64() for io_uring.
Done!
+{
- return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
+}
+static inline size_t io_event_size(struct kioctx *ctx) +{
- return is_compat64_aio_ctx(ctx) ? sizeof(struct compat_io_event)
: sizeof(struct io_event);
+}
+static struct io_event __user *get_ith_user_io_event(struct kioctx *ctx,
struct io_event __user *event_array,
int i)
I think it would be nicer to avoid this sort of function as it feels rather wonky when combined with the typing. The way it's used together with aio_read_events_ring() is a bit confusing because you effectively compute the address of the event in the array then return a struct io_event *, even though it may really be a compat struct instead. Further inside aio_read_events_ring() a certain number of events then get written, effectively ignoring the type as the size is computed based on the mode.
Since we fortunately only need to do this in one place, I would suggest the following alternative: have aio_read_events_ring() take the original, unmodified user-provided pointer and the index to write at separately. Then replace the copy_to_user() with a function that copies a certain number of events at a given offset in the destination. This way the fact that the struct may have a different layout is hidden away in just one function.
Thanks for the suggestions. I also found them quite wonky and I'm happy you found some improvements! Thanks!
+{
- if (is_compat64_aio_ctx(ctx))
return (struct io_event __user *)&((struct compat_io_event __user *)event_array)[i];
- return &event_array[i];
+}
+static u32 __user *get_user_key_ptr(struct kioctx *ctx,
struct iocb __user *user_iocb)
To be more similar to io_event_size(), maybe aio_key_uptr()?
Done!
+{
- return is_compat64_aio_ctx(ctx) ?
&((struct compat_iocb __user *)user_iocb)->aio_key :
&user_iocb->aio_key;
+}
+static void copy_io_event_to_ring(struct kioctx *ctx,
struct io_event *native_event,
struct io_event *ring_event)
I'd swap the arguments as the destination is normally before the source.
Done!
+{
- if (is_compat64_aio_ctx(ctx)) {
struct compat_io_event *compat_ring_event = (struct compat_io_event *)ring_event;
compat_ring_event->data = native_event->data;
compat_ring_event->obj = native_event->obj;
compat_ring_event->res = native_event->res;
compat_ring_event->res2 = native_event->res2;
return;
- }
- *ring_event = *native_event;
+}
- static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { struct file *file;
@@ -519,7 +571,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring);
- size += sizeof(struct io_event) * nr_events;
- size += io_event_size(ctx) * nr_events;
nr_pages = PFN_UP(size); if (nr_pages < 0) @@ -533,7 +585,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->aio_ring_file = file; nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
/ sizeof(struct io_event);
/ io_event_size(ctx);
There's another instance of sizeof(struct io_event) in ioctx_alloc(), probably should be changed too.
I was aware of that, but didn't think it was worth updating, given that the check isn't strict and the max limit is much lower than it anyway. I agree though, let's change it everywhere, for consistency at least.
ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { @@ -588,6 +640,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */
- ctx->compat = in_compat_syscall();
ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ @@ -603,7 +656,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) return 0; } -static unsigned int get_event_pages_idx(unsigned int event_idx, +static unsigned int get_event_pages_idx(struct kioctx *ctx,
unsigned int event_idx, unsigned int nr_events, size_t *event_offset, unsigned int *nr_pages)
@@ -612,12 +666,12 @@ static unsigned int get_event_pages_idx(unsigned int event_idx, size_t off; off = sizeof(struct aio_ring);
- off += sizeof(struct io_event) * event_idx;
- off += io_event_size(ctx) * event_idx;
page_idx = off / PAGE_SIZE; *event_offset = offset_in_page(off);
- off += sizeof(struct io_event) * nr_events - 1;
- off += io_event_size(ctx) * nr_events - 1; *nr_pages = off / PAGE_SIZE + 1 - page_idx; return page_idx; }
@@ -1164,7 +1218,7 @@ static void aio_complete(struct aio_kiocb *iocb) tail = ctx->tail;
- ctx_page_idx = get_event_pages_idx(tail, 1, &event_offset, &nr_pages);
- ctx_page_idx = get_event_pages_idx(ctx, tail, 1, &event_offset, &nr_pages); ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL); if (!unlikely(ring_pages)) { pr_warn("Couldn't map aio ring event pages\n");
@@ -1173,7 +1227,7 @@ static void aio_complete(struct aio_kiocb *iocb) } event = ring_pages + event_offset;
- *event = iocb->ki_res;
- copy_io_event_to_ring(ctx, &iocb->ki_res, event);
vunmap(ring_pages); for (unsigned int page_idx = ctx_page_idx; page_idx < ctx_page_idx + nr_pages; page_idx++) @@ -1284,7 +1338,7 @@ static long aio_read_events_ring(struct kioctx *ctx, break; avail = (head <= tail ? tail : ctx->nr_events) - head; avail = min(avail, nr - ret);
-		ctx_page_idx = get_event_pages_idx(head, avail, &event_offset, &nr_pages);
+		ctx_page_idx = get_event_pages_idx(ctx, head, avail, &event_offset, &nr_pages);
 		ring_pages = vmap(ctx->ring_pages + ctx_page_idx, nr_pages, VM_MAP, PAGE_KERNEL);
 		if (!ring_pages) {
 			ret = -ENOMEM;
@@ -1293,7 +1347,7 @@ static long aio_read_events_ring(struct kioctx *ctx, copy_ret = copy_to_user(event, ring_pages + event_offset,
sizeof(struct io_event) * avail);
io_event_size(ctx) * avail);
vunmap(ring_pages); @@ -1322,7 +1376,9 @@ static long aio_read_events_ring(struct kioctx *ctx, static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, long *i) {
- long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
- long ret = aio_read_events_ring(ctx,
get_ith_user_io_event(ctx, event, *i),
nr - *i);
if (ret > 0) *i += ret; @@ -2001,7 +2057,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
Just noticed that the compat handler for io_submit takes nr as int while the native handler takes it as long. Very unlikely to make any difference in practice but could be worth changing it to compat_long_t for consistency (in a separate patch).
Sounds good! Done!
Kevin
req->ki_eventfd = eventfd;
}
- if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
- if (unlikely(put_user(KIOCB_KEY, get_user_key_ptr(ctx, user_iocb)))) { pr_debug("EFAULT: aio_key\n"); return -EFAULT; }
@@ -2223,15 +2279,20 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, u32 key; u64 obj = user_ptr_addr(iocb);
- if (unlikely(get_user(key, &iocb->aio_key)))
return -EFAULT;
- if (unlikely(key != KIOCB_KEY))
return -EINVAL;
- ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL;
- if (unlikely(get_user(key, get_user_key_ptr(ctx, iocb)))) {
ret = -EFAULT;
goto out;
- }
- if (unlikely(key != KIOCB_KEY)) {
ret = -EINVAL;
goto out;
- }
- spin_lock_irq(&ctx->ctx_lock); /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
@@ -2252,6 +2313,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, ret = -EINPROGRESS; } +out: percpu_ref_put(&ctx->users); return ret;
The aio_ring buffer is a shared memory region that hosts the io_event array. The io_event struct may contain user pointers, so the memory region must be allowed to store and load capability pointers.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 fs/aio.c | 10 ++++++++++
 1 file changed, 10 insertions(+)
diff --git a/fs/aio.c b/fs/aio.c
index e6e45498ccd27..a83bf7f656ca4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -466,6 +466,16 @@ static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	vma->vm_flags |= VM_DONTEXPAND;
 	vma->vm_ops = &aio_ring_vm_ops;
+	if (IS_ENABLED(CONFIG_CHERI_PURECAP_UABI)) {
+		size_t sz = vma->vm_end - vma->vm_start;
+		struct kioctx *ctx = file->f_mapping->private_data;
+		unsigned long pfn = page_to_pfn(ctx->ring_pages[0]);
+
+		vma->vm_flags |= VM_READ_CAPS | VM_WRITE_CAPS;
+		vma_set_page_prot(vma);
+		return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+	}
+
 	return 0;
 }
On 04/04/2023 15:53, Tudor Cretu wrote:
The aio_ring buffer is a shared memory region that hosts the io_event array. The io_event struct may contain user pointers, so the memory region must be allowed to store and load capability pointers.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/aio.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/fs/aio.c b/fs/aio.c index e6e45498ccd27..a83bf7f656ca4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -466,6 +466,16 @@ static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { vma->vm_flags |= VM_DONTEXPAND; vma->vm_ops = &aio_ring_vm_ops;
- if (IS_ENABLED(CONFIG_CHERI_PURECAP_UABI)) {
size_t sz = vma->vm_end - vma->vm_start;
struct kioctx *ctx = file->f_mapping->private_data;
unsigned long pfn = page_to_pfn(ctx->ring_pages[0]);
This looks suspicious: we allocated the pages individually so there is no guarantee that they are physically contiguous. Maybe we don't need to do that at all though, see below.
vma->vm_flags |= VM_READ_CAPS | VM_WRITE_CAPS;
vma_set_page_prot(vma);
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
It is not clear to me why we need to manually map these pages ahead of time to get capability access. I can see that this is how it is done in io_uring, but I think the situation is quite different there: there is no actual filesystem involved, so the .mmap handler has to do all the work itself. OTOH in AIO we have a pseudo-filesystem and actually create a file for the ring, allowing us to use the standard filemap_fault callback in our vm_operations_struct. Wouldn't it be sufficient to add our vm_flags here to get them used by filemap_fault?
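In other words, something as simple as this might be enough (untested):

static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_DONTEXPAND;
	if (IS_ENABLED(CONFIG_CHERI_PURECAP_UABI)) {
		vma->vm_flags |= VM_READ_CAPS | VM_WRITE_CAPS;
		/* possibly needed so that vm_page_prot reflects the new flags */
		vma_set_page_prot(vma);
	}
	vma->vm_ops = &aio_ring_vm_ops;
	return 0;
}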
Kevin
On 05-06-2023 09:42, Kevin Brodsky wrote:
On 04/04/2023 15:53, Tudor Cretu wrote:
The aio_ring buffer is a shared memory region that hosts the io_event array. The io_event struct may contain user pointers, so the memory region must be allowed to store and load capability pointers.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/aio.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/fs/aio.c b/fs/aio.c index e6e45498ccd27..a83bf7f656ca4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -466,6 +466,16 @@ static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { vma->vm_flags |= VM_DONTEXPAND; vma->vm_ops = &aio_ring_vm_ops;
- if (IS_ENABLED(CONFIG_CHERI_PURECAP_UABI)) {
size_t sz = vma->vm_end - vma->vm_start;
struct kioctx *ctx = file->f_mapping->private_data;
unsigned long pfn = page_to_pfn(ctx->ring_pages[0]);
This looks suspicious: we allocated the pages individually so there is no guarantee that they are physically contiguous. Maybe we don't need to do that at all though, see below.
vma->vm_flags |= VM_READ_CAPS | VM_WRITE_CAPS;
vma_set_page_prot(vma);
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
It is not clear to me why we need to manually map these pages ahead of time to get capability access. I can see that this is how it is done in io_uring, but I think the situation is quite different there: there is no actual filesystem involved, so the .mmap handler has to do all the work itself. OTOH in AIO we have a pseudo-filesystem and actually create a file for the ring, allowing us to use the standard filemap_fault callback in our vm_operations_struct. Wouldn't it be sufficient to add our vm_flags here to get them used by filemap_fault?
Indeed, well spotted! I think I ran into some issue at the time and decided to do this, but then forgot to check whether the remapping was actually needed; it isn't. Thanks, this definitely makes it neater.
Tudor
Kevin
The io_event struct may contain user pointers. In the PCuABI, a user pointer is a 129-bit capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others.
aio_context_t represents a pointer to the userspace mapping of the aio_ring buffer. Some userspace software, including libaio, uses it to access the aio_ring directly. As the __kernel_ulong_t type is not large enough to hold a user pointer in PCuABI, change aio_context_t to __kernel_uintptr_t.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
---
 fs/aio.c                     | 29 +++++++++++++++--------------
 include/uapi/linux/aio_abi.h | 12 ++++++------
 2 files changed, 21 insertions(+), 20 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index a83bf7f656ca4..3405e056a8ac5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -323,8 +323,8 @@ static void copy_io_event_to_ring(struct kioctx *ctx, if (is_compat64_aio_ctx(ctx)) { struct compat_io_event *compat_ring_event = (struct compat_io_event *)ring_event;
- compat_ring_event->data = native_event->data; - compat_ring_event->obj = native_event->obj; + compat_ring_event->data = (__u64)native_event->data; + compat_ring_event->obj = (__u64)native_event->obj; compat_ring_event->res = native_event->res; compat_ring_event->res2 = native_event->res2; return; @@ -1168,7 +1168,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) return req; }
-static struct kioctx *lookup_ioctx(unsigned long ctx_id) +static struct kioctx *lookup_ioctx(aio_context_t ctx_id) { struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; @@ -1244,8 +1244,9 @@ static void aio_complete(struct aio_kiocb *iocb) flush_dcache_page(ctx->ring_pages[page_idx]);
pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, - (void __user *)(unsigned long)iocb->ki_res.obj, - iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); + (void __user *)iocb->ki_res.obj, + (unsigned long long)iocb->ki_res.data, + iocb->ki_res.res, iocb->ki_res.res2);
/* after flagging the request as done, we * must never even look at it again @@ -1447,24 +1448,25 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) { struct kioctx *ioctx = NULL; - unsigned long ctx; + aio_context_t ctx; long ret;
- ret = get_user(ctx, ctxp); + ret = get_user_ptr(ctx, ctxp); if (unlikely(ret)) goto out;
ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n", - ctx, nr_events); + (unsigned long)ctx, nr_events); goto out; }
ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { - ret = put_user(ioctx->user_id, ctxp); + /* TODO [PCuABI] - derive proper capability */ + ret = put_user_ptr(uaddr_to_user_ptr_safe(ioctx->user_id), ctxp); if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); @@ -2072,7 +2074,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, return -EFAULT; }
- req->ki_res.obj = user_ptr_addr(user_iocb); + req->ki_res.obj = (__kernel_uintptr_t)user_iocb; req->ki_res.data = iocb->aio_data; req->ki_res.res = 0; req->ki_res.res2 = 0; @@ -2103,7 +2105,7 @@ static int get_compat_iocb(struct iocb *iocb, const struct iocb __user *user_ioc struct compat_iocb compat_iocb; if (unlikely(copy_from_user(&compat_iocb, user_iocb, sizeof(struct compat_iocb)))) return -EFAULT; - iocb->aio_data = compat_iocb.aio_data; + iocb->aio_data = (__kernel_uintptr_t)compat_iocb.aio_data; iocb->aio_key = compat_iocb.aio_key; iocb->aio_rw_flags = compat_iocb.aio_rw_flags; iocb->aio_lio_opcode = compat_iocb.aio_lio_opcode; @@ -2287,7 +2289,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; - u64 obj = user_ptr_addr(iocb);
ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) @@ -2306,7 +2307,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, spin_lock_irq(&ctx->ctx_lock); /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { - if (kiocb->ki_res.obj == obj) { + if (user_ptr_is_same((struct iocb __user *)kiocb->ki_res.obj, iocb)) { ret = kiocb->ki_cancel(&kiocb->rw); list_del_init(&kiocb->ki_list); break; @@ -2403,7 +2404,7 @@ SYSCALL_DEFINE6(io_pgetevents, if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT;
- if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) + if (usig && copy_from_user_with_ptr(&ksig, usig, sizeof(ksig))) return -EFAULT;
ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index 8050909c28876..b8804ef20b8f5 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -31,7 +31,7 @@ #include <linux/fs.h> #include <asm/byteorder.h>
-typedef __kernel_ulong_t aio_context_t;
+typedef __kernel_uintptr_t aio_context_t;
enum { IOCB_CMD_PREAD = 0, @@ -58,10 +58,10 @@ enum {
 /* read() from /dev/aio returns these structures. */
 struct io_event {
-	__u64		data;		/* the data field from the iocb */
-	__u64		obj;		/* what iocb this event came from */
-	__s64		res;		/* result code for this event */
-	__s64		res2;		/* secondary result */
+	__kernel_uintptr_t	data;	/* the data field from the iocb */
+	__kernel_uintptr_t	obj;	/* what iocb this event came from */
+	__s64			res;	/* result code for this event */
+	__s64			res2;	/* secondary result */
 };
/* @@ -72,7 +72,7 @@ struct io_event {
 struct iocb {
 	/* these are internal to the kernel/libc. */
-	__u64	aio_data;	/* data to be returned in event's data */
+	__kernel_uintptr_t	aio_data;	/* data to be returned in event's data */
#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) __u32 aio_key; /* the kernel sets aio_key to the req # */
On 04/04/2023 15:53, Tudor Cretu wrote:
The io_event struct may contain user pointers. In the PCuABI, a user pointer is a 129-bit capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others.
We should probably elaborate on what libaio does here too, as it's not obvious that struct io_event::data and even ::obj should be pointers. libaio needs both to be pointers for its callback mechanism, so we don't have much choice.
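For reference, libaio's callback mechanism looks roughly like this (paraphrased, not the exact libaio source):

typedef void (*io_callback_t)(io_context_t ctx, struct iocb *iocb,
			      long res, long res2);

/* io_set_callback() stashes a function pointer in iocb->data... */
static inline void io_set_callback(struct iocb *iocb, io_callback_t cb)
{
	iocb->data = (void *)cb;
}

/* ...and when draining events, both fields are used as pointers:
 * obj to get back to the iocb, data to invoke the callback. */
io_callback_t cb = (io_callback_t)event->data;
struct iocb *iocb = (struct iocb *)event->obj;
cb(ctx, iocb, event->res, event->res2);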
The aio_context_t represents the pointer to the userspace mapping of aio_ring buffer. In some userspace software, including libaio, it's used to directly access the aio_ring. As __kernel_ulong_t type is not large enough to fit a user pointer in the PCuABI, change the aio_context_t to __kernel_uintptr_t.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/aio.c | 29 +++++++++++++++-------------- include/uapi/linux/aio_abi.h | 12 ++++++------ 2 files changed, 21 insertions(+), 20 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index a83bf7f656ca4..3405e056a8ac5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -323,8 +323,8 @@ static void copy_io_event_to_ring(struct kioctx *ctx, if (is_compat64_aio_ctx(ctx)) { struct compat_io_event *compat_ring_event = (struct compat_io_event *)ring_event;
compat_ring_event->data = native_event->data;
compat_ring_event->obj = native_event->obj;
compat_ring_event->data = (__u64)native_event->data;
compat_ring_event->res = native_event->res; compat_ring_event->res2 = native_event->res2; return;compat_ring_event->obj = (__u64)native_event->obj;
@@ -1168,7 +1168,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) return req; } -static struct kioctx *lookup_ioctx(unsigned long ctx_id) +static struct kioctx *lookup_ioctx(aio_context_t ctx_id)
From an ABI perspective, we need to decide what to do with the way we match aio_context_t. Currently struct kioctx::user_id remains an unsigned long so we only match the address, which is not very consistent with the way we're handling this sort of situation in general. That said, doing a full capability comparison may prove challenging. It should not be too hard in the normal case, as do_mmap() will eventually return a capability. However, if the mapping is remapped, aio_ring_mremap() will adjust user_id accordingly, but if we represent it as a capability we would need to ensure that we derive exactly the same capability as returned by the mremap syscall. Not sure if this is worth the hassle, as the security benefit seems dubious, but the inconsistency is not great either.
{ struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; @@ -1244,8 +1244,9 @@ static void aio_complete(struct aio_kiocb *iocb) flush_dcache_page(ctx->ring_pages[page_idx]); pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
(void __user *)(unsigned long)iocb->ki_res.obj,
iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
(void __user *)iocb->ki_res.obj,
I suspect this line doesn't get built at all, as otherwise we'd get a warning in PCuABI here (at least with GCC), since %p is not appropriate for printing capabilities. Unfortunately I still haven't got round to looking into [1] to add a new specifier for printing user pointers generically. Printing the address should be good enough here though.
[1] https://git.morello-project.org/morello/kernel/linux/-/issues/53
(unsigned long long)iocb->ki_res.data,
iocb->ki_res.res, iocb->ki_res.res2);
/* after flagging the request as done, we * must never even look at it again @@ -1447,24 +1448,25 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) { struct kioctx *ioctx = NULL;
- unsigned long ctx;
- aio_context_t ctx; long ret;
- ret = get_user(ctx, ctxp);
- ret = get_user_ptr(ctx, ctxp); if (unlikely(ret)) goto out;
ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n",
- ctx, nr_events);
+ (unsigned long)ctx, nr_events);
Note that it will also warn if aio_context_t is changed to be a user pointer (see below). Using user_ptr_addr() would be appropriate in that case.
goto out;
} ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) {
- ret = put_user(ioctx->user_id, ctxp);
+ /* TODO [PCuABI] - derive proper capability */
+ ret = put_user_ptr(uaddr_to_user_ptr_safe(ioctx->user_id), ctxp);
if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users);
@@ -2072,7 +2074,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, return -EFAULT; }
- req->ki_res.obj = user_ptr_addr(user_iocb);
+ req->ki_res.obj = (__kernel_uintptr_t)user_iocb;
req->ki_res.data = iocb->aio_data; req->ki_res.res = 0; req->ki_res.res2 = 0;
@@ -2103,7 +2105,7 @@ static int get_compat_iocb(struct iocb *iocb, const struct iocb __user *user_ioc struct compat_iocb compat_iocb; if (unlikely(copy_from_user(&compat_iocb, user_iocb, sizeof(struct compat_iocb)))) return -EFAULT;
- iocb->aio_data = compat_iocb.aio_data;
+ iocb->aio_data = (__kernel_uintptr_t)compat_iocb.aio_data;
iocb->aio_key = compat_iocb.aio_key; iocb->aio_rw_flags = compat_iocb.aio_rw_flags; iocb->aio_lio_opcode = compat_iocb.aio_lio_opcode;
@@ -2287,7 +2289,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key;
- u64 obj = user_ptr_addr(iocb);
ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) @@ -2306,7 +2307,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, spin_lock_irq(&ctx->ctx_lock); /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
- if (kiocb->ki_res.obj == obj) {
+ if (user_ptr_is_same((struct iocb __user *)kiocb->ki_res.obj, iocb)) {
ret = kiocb->ki_cancel(&kiocb->rw); list_del_init(&kiocb->ki_list); break;
@@ -2403,7 +2404,7 @@ SYSCALL_DEFINE6(io_pgetevents, if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT;
- if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ if (usig && copy_from_user_with_ptr(&ksig, usig, sizeof(ksig)))
return -EFAULT;
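A note on why the _with_ptr variant matters here: struct __aio_sigset embeds a user pointer, and in PCuABI a plain copy_from_user() would strip its capability tag, leaving sigmask unusable as a pointer afterwards. The struct is roughly as defined in the uapi header:

struct __aio_sigset {
	const sigset_t __user	*sigmask;	/* capability in PCuABI: tag must be preserved */
	size_t			sigsetsize;
};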
ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index 8050909c28876..b8804ef20b8f5 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -31,7 +31,7 @@ #include <linux/fs.h> #include <asm/byteorder.h> -typedef __kernel_ulong_t aio_context_t; +typedef __kernel_uintptr_t aio_context_t;
__kernel_uintptr_t is only backwards-compatible when replacing __u64. In this case we would effectively expand the type to 64 bits in 32-bit, which is not intended.
I wonder if we could get away with just defining it as void __user *. It looks like a few fixups in the kernel would be needed (Clang and GCC don't seem to understand T __capability * if T is itself a capability), and more importantly it is a uapi change, but I am quite hopeful it wouldn't break much if anything. libaio doesn't use aio_context_t and instead defines its own io_context_t as a pointer. A Debian Code Search does show quite a few users of aio_context_t, but AFAICT it is only ever passed to the AIO syscalls or initialised to 0 (works if defined as a pointer too) - pretty much an opaque type.
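To illustrate the "opaque type" observation above, this is the usage pattern such code typically follows (illustrative only; raw syscall numbers are used since libc does not wrap these calls):

#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>

int example(void)
{
	aio_context_t ctx = 0;	/* only ever initialised to 0... */

	if (syscall(SYS_io_setup, 128, &ctx) < 0)
		return -1;
	/* ...and passed back verbatim to the io_* syscalls. */
	return syscall(SYS_io_destroy, ctx);
}

A pattern like this keeps working whether aio_context_t is an integer or a pointer type.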
Kevin
enum { IOCB_CMD_PREAD = 0, @@ -58,10 +58,10 @@ enum { /* read() from /dev/aio returns these structures. */ struct io_event {
- __u64 data; /* the data field from the iocb */
- __u64 obj; /* what iocb this event came from */
- __s64 res; /* result code for this event */
- __s64 res2; /* secondary result */
+ __kernel_uintptr_t data; /* the data field from the iocb */
+ __kernel_uintptr_t obj; /* what iocb this event came from */
+ __s64 res; /* result code for this event */
+ __s64 res2; /* secondary result */
}; /* @@ -72,7 +72,7 @@ struct io_event { struct iocb { /* these are internal to the kernel/libc. */
- __u64 aio_data; /* data to be returned in event's data */
+ __kernel_uintptr_t aio_data; /* data to be returned in event's data */
#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) __u32 aio_key; /* the kernel sets aio_key to the req # */
On 05-06-2023 09:43, Kevin Brodsky wrote:
On 04/04/2023 15:53, Tudor Cretu wrote:
The io_event struct may contain user pointers. In the PCuABI, a user pointer is a 129-bit capability, so the __u64 type is not big enough to hold it. Use the __kernel_uintptr_t type instead, which is big enough on the affected architectures while remaining 64-bit on others.
We should probably elaborate on what libaio does here too, as it's not obvious that struct io_event::data and even ::obj should be pointers. libaio needs both to be pointers for its callback mechanism, so we don't have much choice.
Hopefully it's clearer now! Thanks!
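For readers not familiar with libaio, a simplified sketch of the callback mechanism being referred to (not the exact libaio source): io_set_callback() stores a function pointer in the iocb's data field, and on completion the event's obj field points back at the submitted iocb, so both fields must round-trip through the kernel as full pointers.

#include <libaio.h>

/* Dispatch completions the way libaio-based code typically does. */
static void run_completed_events(io_context_t ctx,
				 struct io_event *events, long nr)
{
	long i;

	for (i = 0; i < nr; i++) {
		struct iocb *iocb = events[i].obj;		  /* submitted iocb */
		io_callback_t cb = (io_callback_t)events[i].data; /* set via io_set_callback() */

		cb(ctx, iocb, events[i].res, events[i].res2);
	}
}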
aio_context_t represents a pointer to the userspace mapping of the aio_ring buffer. Some userspace software, including libaio, uses it to access the aio_ring directly. As the __kernel_ulong_t type is not large enough to hold a user pointer in the PCuABI, change aio_context_t to __kernel_uintptr_t.
Signed-off-by: Tudor Cretu tudor.cretu@arm.com
fs/aio.c | 29 +++++++++++++++-------------- include/uapi/linux/aio_abi.h | 12 ++++++------ 2 files changed, 21 insertions(+), 20 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index a83bf7f656ca4..3405e056a8ac5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -323,8 +323,8 @@ static void copy_io_event_to_ring(struct kioctx *ctx, if (is_compat64_aio_ctx(ctx)) { struct compat_io_event *compat_ring_event = (struct compat_io_event *)ring_event;
- compat_ring_event->data = native_event->data;
- compat_ring_event->obj = native_event->obj;
+ compat_ring_event->data = (__u64)native_event->data;
+ compat_ring_event->obj = (__u64)native_event->obj;
compat_ring_event->res = native_event->res;
compat_ring_event->res2 = native_event->res2;
return;
@@ -1168,7 +1168,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) return req; } -static struct kioctx *lookup_ioctx(unsigned long ctx_id) +static struct kioctx *lookup_ioctx(aio_context_t ctx_id)
From an ABI perspective, we need to decide what to do with the way we match aio_context_t. Currently struct kioctx::user_id remains an unsigned long so we only match the address, which is not very consistent with the way we're handling this sort of situation in general. That said, doing a full capability comparison may prove challenging. It should not be too hard in the normal case, as do_mmap() will eventually return a capability. However, if the mapping is remapped, aio_ring_mremap() will adjust user_id accordingly, but if we represent it as a capability we would need to ensure that we derive exactly the same capability as returned by the mremap syscall. Not sure if this is worth the hassle, as the security benefit seems dubious, but the inconsistency is not great either.
In the compat case it's a bit tricky to do the full capability comparison. I don't mind either way; I changed to the full capability comparison for consistency. It can also prevent accessing native contexts from compat tasks, so it might be useful after all.
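Something along these lines, presumably (a sketch only, not the actual v2 code; the helper names are made up): the address-only match compares user_ptr_addr() values, while the full match uses user_ptr_is_same(), which also checks the tag, bounds and permissions.

/* Address-only match, with kioctx::user_id kept as an unsigned long: */
static bool ioctx_matches_addr(struct kioctx *ctx, aio_context_t ctx_id)
{
	return ctx->user_id == user_ptr_addr((void __user *)ctx_id);
}

/* Full capability match, assuming kioctx::user_id were changed to void __user *: */
static bool ioctx_matches_cap(struct kioctx *ctx, aio_context_t ctx_id)
{
	return user_ptr_is_same(ctx->user_id, (void __user *)ctx_id);
}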
{ struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; @@ -1244,8 +1244,9 @@ static void aio_complete(struct aio_kiocb *iocb) flush_dcache_page(ctx->ring_pages[page_idx]); pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
- (void __user *)(unsigned long)iocb->ki_res.obj,
- iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
+ (void __user *)iocb->ki_res.obj,
I suspect this line doesn't get built at all, as otherwise we'd get a warning in PCuABI here (at least with GCC): %p is not appropriate for printing capabilities. Unfortunately I still haven't got round to looking into [1] to add a new specifier for printing user pointers generically. Printing the address should be good enough here though.
[1] https://git.morello-project.org/morello/kernel/linux/-/issues/53
This is a bit strange: I changed the pr_debug to pr_warn and I can confirm that this gets built, but GCC still doesn't warn about it. In any case, I agree, let's just be consistent and print the address only.
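Concretely, printing only the address could look like this (a sketch, not the final hunk; user_ptr_addr() yields a plain address that %lx can handle):

pr_debug("%p[%u]: %p: %lx %llx %Lx %Lx\n", ctx, tail, iocb,
	 user_ptr_addr((void __user *)iocb->ki_res.obj),
	 (unsigned long long)iocb->ki_res.data,
	 iocb->ki_res.res, iocb->ki_res.res2);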
+ (unsigned long long)iocb->ki_res.data,
+ iocb->ki_res.res, iocb->ki_res.res2);
/* after flagging the request as done, we * must never even look at it again @@ -1447,24 +1448,25 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) { struct kioctx *ioctx = NULL;
- unsigned long ctx;
+ aio_context_t ctx;
long ret;
- ret = get_user(ctx, ctxp);
+ ret = get_user_ptr(ctx, ctxp);
if (unlikely(ret)) goto out;
ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n",
- ctx, nr_events);
+ (unsigned long)ctx, nr_events);
Note that it will also warn if aio_context_t is changed to be a user pointer (see below). Using user_ptr_addr() would be appropriate in that case.
Done!
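Concretely, once aio_context_t becomes a user pointer, something like (sketch):

pr_debug("EINVAL: ctx %lx nr_events %u\n",
	 user_ptr_addr(ctx), nr_events);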
goto out;
} ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) {
- ret = put_user(ioctx->user_id, ctxp);
+ /* TODO [PCuABI] - derive proper capability */
+ ret = put_user_ptr(uaddr_to_user_ptr_safe(ioctx->user_id), ctxp);
if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users);
@@ -2072,7 +2074,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, return -EFAULT; }
- req->ki_res.obj = user_ptr_addr(user_iocb);
+ req->ki_res.obj = (__kernel_uintptr_t)user_iocb;
req->ki_res.data = iocb->aio_data; req->ki_res.res = 0; req->ki_res.res2 = 0;
@@ -2103,7 +2105,7 @@ static int get_compat_iocb(struct iocb *iocb, const struct iocb __user *user_ioc struct compat_iocb compat_iocb; if (unlikely(copy_from_user(&compat_iocb, user_iocb, sizeof(struct compat_iocb)))) return -EFAULT;
- iocb->aio_data = compat_iocb.aio_data;
+ iocb->aio_data = (__kernel_uintptr_t)compat_iocb.aio_data;
iocb->aio_key = compat_iocb.aio_key; iocb->aio_rw_flags = compat_iocb.aio_rw_flags; iocb->aio_lio_opcode = compat_iocb.aio_lio_opcode;
@@ -2287,7 +2289,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key;
- u64 obj = user_ptr_addr(iocb);
ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) @@ -2306,7 +2307,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, spin_lock_irq(&ctx->ctx_lock); /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
- if (kiocb->ki_res.obj == obj) {
+ if (user_ptr_is_same((struct iocb __user *)kiocb->ki_res.obj, iocb)) {
ret = kiocb->ki_cancel(&kiocb->rw); list_del_init(&kiocb->ki_list); break;
@@ -2403,7 +2404,7 @@ SYSCALL_DEFINE6(io_pgetevents, if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT;
- if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ if (usig && copy_from_user_with_ptr(&ksig, usig, sizeof(ksig)))
return -EFAULT;
ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index 8050909c28876..b8804ef20b8f5 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -31,7 +31,7 @@ #include <linux/fs.h> #include <asm/byteorder.h> -typedef __kernel_ulong_t aio_context_t; +typedef __kernel_uintptr_t aio_context_t;
__kernel_uintptr_t is only backwards-compatible when replacing __u64. In this case we would effectively expand the type to 64 bits in 32-bit, which is not intended.
I thought we could get away with this given that io_setup has a compat handler and would return a 32-bit truncated address anyway...
I wonder if we could get away with just defining it as void __user *. It looks like a few fixups in the kernel would be needed (Clang and GCC don't seem to understand T __capability * if T is itself a capability), and more importantly it is a uapi change, but I am quite hopeful it wouldn't break much if anything. libaio doesn't use aio_context_t and instead defines its own io_context_t as a pointer. A Debian Code Search does show quite a few users of aio_context_t, but AFAICT it is only ever passed to the AIO syscalls or initialised to 0 (works if defined as a pointer too) - pretty much an opaque type.
I also thought about changing it to void __user *. It's a bit odd having compat_aio_context_t be an integer value and this be a pointer... I separated the aio_context_t change into another patch, for clarity around all the changes. Feel free to suggest as many improvements as you see fit; I'm dedicated to making this series as neat and proper as it can be. :)
Thanks, Tudor
Kevin
enum { IOCB_CMD_PREAD = 0, @@ -58,10 +58,10 @@ enum { /* read() from /dev/aio returns these structures. */ struct io_event {
- __u64 data; /* the data field from the iocb */
- __u64 obj; /* what iocb this event came from */
- __s64 res; /* result code for this event */
- __s64 res2; /* secondary result */
+ __kernel_uintptr_t data; /* the data field from the iocb */
+ __kernel_uintptr_t obj; /* what iocb this event came from */
+ __s64 res; /* result code for this event */
+ __s64 res2; /* secondary result */
};
/* @@ -72,7 +72,7 @@ struct io_event { struct iocb { /* these are internal to the kernel/libc. */
- __u64 aio_data; /* data to be returned in event's data */
+ __kernel_uintptr_t aio_data; /* data to be returned in event's data */
#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) __u32 aio_key; /* the kernel sets aio_key to the req # */
On 15/06/2023 17:58, Tudor Cretu wrote:
-typedef __kernel_ulong_t aio_context_t; +typedef __kernel_uintptr_t aio_context_t;
__kernel_uintptr_t is only backwards-compatible when replacing __u64. In this case we would effectively expand the type to 64 bits in 32-bit, which is not intended.
I thought we could get away with this given that io_setup has a compat handler and would return a 32-bit truncated address anyway...
Compat is not the issue; we already have compat_aio_context_t anyway. The issue is 32-bit architectures: making aio_context_t 64-bit on them wouldn't be backwards-compatible.
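To spell out the concern (usual type sizes assumed):

                      native 32-bit   native 64-bit   purecap (PCuABI)
  __kernel_ulong_t    4 bytes         8 bytes         8 bytes
  __u64               8 bytes         8 bytes         8 bytes
  __kernel_uintptr_t  8 bytes         8 bytes         16 bytes

So substituting __kernel_uintptr_t for __u64 changes nothing outside PCuABI, whereas substituting it for __kernel_ulong_t grows the type on 32-bit architectures.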
I wonder if we could get away with just defining it as void __user *. It looks like a few fixups in the kernel would be needed (Clang and GCC don't seem to understand T __capability * if T is itself a capability), and more importantly it is a uapi change, but I am quite hopeful it wouldn't break much if anything. libaio doesn't use aio_context_t and instead defines its own io_context_t as a pointer. A Debian Code Search does show quite a few users of aio_context_t, but AFAICT it is only ever passed to the AIO syscalls or initialised to 0 (works if defined as a pointer too) - pretty much an opaque type.
I also thought about changing it to void __user *. It's a bit odd having compat_aio_context_t be an integer value and this be a pointer... I separated the aio_context_t change into another patch, for clarity around all the changes. Feel free to suggest as many improvements as you see fit; I'm dedicated to making this series as neat and proper as it can be. :)
Very nice change indeed! For compat_aio_context_t, it's not really weird: all compat types are integers, even if they represent pointers. That said, you make a good point: since aio_context_t is now an actual user pointer, we could make compat_aio_context_t a compat_uptr_t to be consistent (this also avoids having a different definition in compat64).
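In code, the suggestion would look roughly like this (hypothetical, not part of this series as posted; the conversion wrapper name is made up, compat_ptr() is the usual helper):

/* uapi: aio_context_t as a genuine user pointer */
typedef void __user *aio_context_t;

/* compat64: keep an integer pointer representation, as for other compat pointers */
typedef compat_uptr_t compat_aio_context_t;

/* converting a compat context id back to a native user pointer */
static inline aio_context_t aio_context_from_compat(compat_aio_context_t ctx)
{
	return compat_ptr(ctx);
}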
Kevin
On 04/04/2023 15:53, Tudor Cretu wrote:
This series makes it possible for purecap apps to use the aio_ring shared memory region to bypass the io_getevents syscall's overhead. This functionality is also used in libaio.
With these patches, all io_* LTP tests pass in both Purecap and plain AArch64 modes. Note that the LTP tests only address the basic functionality of the aio system and a significant portion of the functionality is untested in LTP.
For a more comprehensive testing, libaio has been updated with the new uAPI and ported. All the tests in libaio pass accordingly, in both Purecap and plain AArch64 modes.
Gitlab issue: https://git.morello-project.org/morello/kernel/linux/-/issues/49
Review branch: https://git.morello-project.org/tudcre01/linux/-/commits/morello/aio_v1
Tudor Cretu (4): aio: Fix the relationship between ctx pages and io_events array aio: Implement compat handling for the io_event struct aio: Allow capability tag access on the shared memory aio: Use user pointer type in the io_event struct and aio_context_t
Sorry it took me so long to get to these patches! Looks good overall, though we may want a different approach in patch 1 (comments inline).
Kevin
fs/aio.c | 197 ++++++++++++++++++++++++++--------- include/uapi/linux/aio_abi.h | 12 +-- 2 files changed, 153 insertions(+), 56 deletions(-)
On 05-06-2023 09:46, Kevin Brodsky wrote:
On 04/04/2023 15:53, Tudor Cretu wrote:
This series makes it possible for purecap apps to use the aio_ring shared memory region to bypass the io_getevents syscall's overhead. This functionality is also used in libaio.
With these patches, all io_* LTP tests pass in both Purecap and plain AArch64 modes. Note that the LTP tests only address the basic functionality of the aio system and a significant portion of the functionality is untested in LTP.
For a more comprehensive testing, libaio has been updated with the new uAPI and ported. All the tests in libaio pass accordingly, in both Purecap and plain AArch64 modes.
Gitlab issue: https://git.morello-project.org/morello/kernel/linux/-/issues/49
Review branch: https://git.morello-project.org/tudcre01/linux/-/commits/morello/aio_v1
Tudor Cretu (4): aio: Fix the relationship between ctx pages and io_events array aio: Implement compat handling for the io_event struct aio: Allow capability tag access on the shared memory aio: Use user pointer type in the io_event struct and aio_context_t
Sorry it took me so long to get to these patches! Looks good overall, though we may want a different approach in patch 1 (comments inline).
Thanks a lot for the detailed review! Greatly appreciated!
Tudor
Kevin
fs/aio.c | 197 ++++++++++++++++++++++++++--------- include/uapi/linux/aio_abi.h | 12 +-- 2 files changed, 153 insertions(+), 56 deletions(-)