Hi All,
This series adds capabilities support for the clone3 syscall, along with a set of testcases in the Morello clone kselftests.
The struct clone_args layout after the proposed changes:
__CHERI_PURE_CAPABILITY__:
struct clone_args {
	__u64                      flags __attribute__((__aligned__(8)));        /*   0  8 */
	__u8                       __rsvd_v0_1[8];                               /*   8  8 */
	__kernel_aligned_uintptr_t pidfd;                                        /*  16 16 */
	__kernel_aligned_uintptr_t child_tid;                                    /*  32 16 */
	__kernel_aligned_uintptr_t parent_tid;                                   /*  48 16 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	__u64                      exit_signal __attribute__((__aligned__(8)));  /*  64  8 */
	__u8                       __rsvd_v0_2[8];                               /*  72  8 */
	__kernel_aligned_uintptr_t stack;                                        /*  80 16 */
	__u64                      stack_size __attribute__((__aligned__(8)));   /*  96  8 */
	__u8                       __rsvd_v0_3[8];                               /* 104  8 */
	__kernel_aligned_uintptr_t tls;                                          /* 112 16 */
	/* --- cacheline 2 boundary (128 bytes) --- */
	__kernel_aligned_uintptr_t set_tid;                                      /* 128 16 */
	__u64                      set_tid_size __attribute__((__aligned__(8))); /* 144  8 */
	__u64                      cgroup __attribute__((__aligned__(8)));       /* 152  8 */

	/* size: 160, cachelines: 3, members: 14 */
	/* forced alignments: 5 */
	/* last cacheline: 32 bytes */
};
otherwise:
struct clone_args {
	__u64                      flags __attribute__((__aligned__(8)));        /*  0  8 */
	__u8                       __rsvd_v0_1[0];                               /*  8  0 */
	__kernel_aligned_uintptr_t pidfd __attribute__((__aligned__(8)));        /*  8  8 */
	__kernel_aligned_uintptr_t child_tid __attribute__((__aligned__(8)));    /* 16  8 */
	__kernel_aligned_uintptr_t parent_tid __attribute__((__aligned__(8)));   /* 24  8 */
	__u64                      exit_signal __attribute__((__aligned__(8)));  /* 32  8 */
	__u8                       __rsvd_v0_2[0];                               /* 40  0 */
	__kernel_aligned_uintptr_t stack __attribute__((__aligned__(8)));        /* 40  8 */
	__u64                      stack_size __attribute__((__aligned__(8)));   /* 48  8 */
	__u8                       __rsvd_v0_3[0];                               /* 56  0 */
	__kernel_aligned_uintptr_t tls __attribute__((__aligned__(8)));          /* 56  8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	__kernel_aligned_uintptr_t set_tid __attribute__((__aligned__(8)));      /* 64  8 */
	__u64                      set_tid_size __attribute__((__aligned__(8))); /* 72  8 */
	__u64                      cgroup __attribute__((__aligned__(8)));       /* 80  8 */

	/* size: 88, cachelines: 2, members: 14 */
	/* forced alignments: 11 */
	/* last cacheline: 24 bytes */
};
Changes available at:
https://git.morello-project.org/Bea/linux/-/tree/morello/clone3

LTP changes:
https://git.morello-project.org/Bea/morello-linux-ltp/-/tree/morello/clone3

To run the clone3 tests:
./runltp -f syscalls -s clone3
Beata Michalska (3):
  uaccess: Preserve capability tags with copy_struct_from_user
  fork: clone3: Add support for architectural capabilities
  kselftests/arm64: morello: Add clone3 test-cases

 include/linux/uaccess.h                       |   2 +-
 include/uapi/linux/sched.h                    |  37 ++-
 kernel/fork.c                                 | 123 ++++++--
 tools/testing/selftests/arm64/morello/clone.c | 269 +++++++++++++++++-
 4 files changed, 391 insertions(+), 40 deletions(-)
Make copy_struct_from_user capability-aware by switching the actual copying routine to copy_from_user_with_ptr, which preserves capability tags throughout the process.
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/linux/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 88b2224e85c3..ec31478634cc 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -419,7 +419,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
 		return ret ?: -E2BIG;
 	}
 	/* Copy the interoperable parts of the struct. */
-	if (copy_from_user(dst, src, size))
+	if (copy_from_user_with_ptr(dst, src, size))
 		return -EFAULT;
 	return 0;
 }
Hi Beata,
On 18-11-2022 00:05, Beata Michalska wrote:
Make copy_struct_from_user capability-aware by switching the actual copying routine to copy_from_user_with_ptr, which preserves capability tags throughout the process.
Just a short question: we're keeping both copy_from_user and copy_from_user_with_ptr because we don't want to preserve capability tags by default and each time this is done should be intentional, right? Is there anything specific to copy_struct_from_user that would make it fine to preserve capability tags by default? I think the alternative would be to have two versions: copy_struct_from_user and copy_struct_from_user_with_ptr so that we don't preserve tags by default.
Thanks, Tudor
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/linux/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 88b2224e85c3..ec31478634cc 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -419,7 +419,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
 		return ret ?: -E2BIG;
 	}
 	/* Copy the interoperable parts of the struct. */
-	if (copy_from_user(dst, src, size))
+	if (copy_from_user_with_ptr(dst, src, size))
 		return -EFAULT;
 	return 0;
 }
On Mon, Nov 21, 2022 at 10:57:10AM +0000, Tudor Cretu wrote:
Hi Beata,
On 18-11-2022 00:05, Beata Michalska wrote:
Make copy_struct_from_user capability-aware by switching the actual copying routine to copy_from_user_with_ptr, which preserves capability tags throughout the process.
Just a short question: we're keeping both copy_from_user and copy_from_user_with_ptr because we don't want to preserve capability tags by default and each time this is done should be intentional, right? Is there anything specific to copy_struct_from_user that would make it fine to preserve capability tags by default? I think the alternative would be to have two versions: copy_struct_from_user and copy_struct_from_user_with_ptr so that we don't preserve tags by default.
The 'intentional' part here should be applied when copying to userspace, not the other way round, with the stress point being on not providing valid capabilities to user space when not intended, unless I am missing something?
--- BR B.
Thanks, Tudor
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/linux/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 88b2224e85c3..ec31478634cc 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -419,7 +419,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
 		return ret ?: -E2BIG;
 	}
 	/* Copy the interoperable parts of the struct. */
-	if (copy_from_user(dst, src, size))
+	if (copy_from_user_with_ptr(dst, src, size))
 		return -EFAULT;
 	return 0;
 }
On 21-11-2022 11:40, Beata Michalska wrote:
On Mon, Nov 21, 2022 at 10:57:10AM +0000, Tudor Cretu wrote:
Hi Beata,
On 18-11-2022 00:05, Beata Michalska wrote:
Make copy_struct_from_user capability-aware by switching the actual copying routine to copy_from_user_with_ptr, which preserves capability tags throughout the process.
Just a short question: we're keeping both copy_from_user and copy_from_user_with_ptr because we don't want to preserve capability tags by default and each time this is done should be intentional, right? Is there anything specific to copy_struct_from_user that would make it fine to preserve capability tags by default? I think the alternative would be to have two versions: copy_struct_from_user and copy_struct_from_user_with_ptr so that we don't preserve tags by default.
The 'intentional' part here should be applied when copying to userspace, not the other way round, with the stress point being on not providing valid capabilities to user space when not intended, unless I am missing something?
Alright, I guess because there isn't a copy_struct_to_user as well, we can just do your change without creating confusion. LGTM, thanks for the clarification!
Thanks, Tudor
BR B.
Thanks, Tudor
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/linux/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 88b2224e85c3..ec31478634cc 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -419,7 +419,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
 		return ret ?: -E2BIG;
 	}
 	/* Copy the interoperable parts of the struct. */
-	if (copy_from_user(dst, src, size))
+	if (copy_from_user_with_ptr(dst, src, size))
 		return -EFAULT;
 	return 0;
 }
On 21/11/2022 12:08, Tudor Cretu wrote:
On 21-11-2022 11:40, Beata Michalska wrote:
On Mon, Nov 21, 2022 at 10:57:10AM +0000, Tudor Cretu wrote:
Hi Beata,
On 18-11-2022 00:05, Beata Michalska wrote:
Make copy_struct_from_user capability-aware by switching the actual copying routine to copy_from_user_with_ptr, which preserves capability tags throughout the process.
Just a short question: we're keeping both copy_from_user and copy_from_user_with_ptr because we don't want to preserve capability tags by default and each time this is done should be intentional, right? Is there anything specific to copy_struct_from_user that would make it fine to preserve capability tags by default? I think the alternative would be to have two versions: copy_struct_from_user and copy_struct_from_user_with_ptr so that we don't preserve tags by default.
The 'intentional' part here should be applied when copying to userspace, not the other way round, with the stress point being on not providing valid capabilities to user space when not intended, unless I am missing something?
Alright, I guess because there isn't a copy_struct_to_user as well, we can just do your change without creating confusion. LGTM, thanks for the clarification!
Right, just so it's clear in my mind - if we do copy_*from*_user without _with_ptr here, then we do not preserve cap tags, which means the kernel would not be able to access or otherwise use that pointer?
I can't think of a scenario where you'd pass a pointer into the kernel but want that pointer unreadable or otherwise unusable.
The copy_*to*_user is the bit where care is needed.
Thanks, Zach
Thanks, Tudor
BR B.
Thanks, Tudor
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/linux/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 88b2224e85c3..ec31478634cc 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -419,7 +419,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
 		return ret ?: -E2BIG;
 	}
 	/* Copy the interoperable parts of the struct. */
-	if (copy_from_user(dst, src, size))
+	if (copy_from_user_with_ptr(dst, src, size))
 		return -EFAULT;
 	return 0;
 }
On 25/11/2022 13:13, Zachary Leaf wrote:
Just a short question: we're keeping both copy_from_user and copy_from_user_with_ptr because we don't want to preserve capability tags by default and each time this is done should be intentional, right? Is there anything specific to copy_struct_from_user that would make it fine to preserve capability tags by default? I think the alternative would be to have two versions: copy_struct_from_user and copy_struct_from_user_with_ptr so that we don't preserve tags by default.
The 'intentional' part here should be applied when copying to userspace, not the other way round, with the stress point being on not providing valid capabilities to user space when not intended, unless I am missing something?
Alright, I guess because there isn't a copy_struct_to_user as well, we can just do your change without creating confusion. LGTM, thanks for the clarification!
Right, just so it's clear in my mind - if we do copy_*from*_user without _with_ptr here, then we do not preserve cap tags, which means the kernel would not be able to access or otherwise use that pointer?
I can't think of a scenario where you'd pass a pointer into the kernel but want that pointer unreadable or otherwise unusable.
The copy_*to*_user is the bit where care is needed.
Yes, *copy*_to_user* are the most critical ones in terms of enforcing the capability model. However, at times controlling *copy*_from_user* is equally important, see Amit's "Handle siginfo_t user pointer tags" series for an example (__copy_siginfo_from_user() notably). Even outside of these special cases, defaulting to a non-pointer-preserving copy is a plus in terms of intentionality, strengthening our confidence that user pointers don't get propagated by accident.
I am therefore strongly in favour of keeping the explicit approach (separate _with_ptr variant) for *copy*_from_user*, and logically that should apply to all variants including copy_struct_from_user.
Kevin
On Wed, Nov 30, 2022 at 03:39:36PM +0100, Kevin Brodsky wrote:
On 25/11/2022 13:13, Zachary Leaf wrote:
Just a short question: we're keeping both copy_from_user and copy_from_user_with_ptr because we don't want to preserve capability tags by default and each time this is done should be intentional, right? Is there anything specific to copy_struct_from_user that would make it fine to preserve capability tags by default? I think the alternative would be to have two versions: copy_struct_from_user and copy_struct_from_user_with_ptr so that we don't preserve tags by default.
The 'intentional' part here should be applied when copying to userspace, not the other way round, with the stress point being on not providing valid capabilities to user space when not intended, unless I am missing something?
Alright, I guess because there isn't a copy_struct_to_user as well, we can just do your change without creating confusion. LGTM, thanks for the clarification!
Right, just so it's clear in my mind - if we do copy_*from*_user without _with_ptr here, then we do not preserve cap tags, which means the kernel would not be able to access or otherwise use that pointer?
I can't think of a scenario where you'd pass a pointer into the kernel but want that pointer unreadable or otherwise unusable.
The copy_*to*_user is the bit where care is needed.
Yes, *copy*_to_user* are the most critical ones in terms of enforcing the capability model. However, at times controlling *copy*_from_user* is equally important, see Amit's "Handle siginfo_t user pointer tags" series for an example (__copy_siginfo_from_user() notably). Even outside of these special cases, defaulting to a non-pointer-preserving copy is a plus in terms of intentionality, strengthening our confidence that user pointers don't get propagated by accident.
I do agree with that, though splitting those into two variants does not make them bulletproof against potential misuse. Still, I see your point.
I am therefore strongly in favour of keeping the explicit approach (separate _with_ptr variant) for *copy*_from_user*, and logically that should apply to all variants including copy_struct_from_user.
So in other words you would like to see an almost exact replica of copy_struct_from_user. Can do that.
--- BR B.
Kevin
On 30/11/2022 21:00, Beata Michalska wrote:
On Wed, Nov 30, 2022 at 03:39:36PM +0100, Kevin Brodsky wrote:
On 25/11/2022 13:13, Zachary Leaf wrote:
Just a short question: we're keeping both copy_from_user and copy_from_user_with_ptr because we don't want to preserve capability tags by default and each time this is done should be intentional, right? Is there anything specific to copy_struct_from_user that would make it fine to preserve capability tags by default? I think the alternative would be to have two versions: copy_struct_from_user and copy_struct_from_user_with_ptr so that we don't preserve tags by default.
The 'intentional' part here should be applied when copying to userspace, not the other way round, with the stress point being on not providing valid capabilities to user space when not intended, unless I am missing something?
Alright, I guess because there isn't a copy_struct_to_user as well, we can just do your change without creating confusion. LGTM, thanks for the clarification!
Right, just so it's clear in my mind - if we do copy_*from*_user without _with_ptr here, then we do not preserve cap tags, which means the kernel would not be able to access or otherwise use that pointer?
I can't think of a scenario where you'd pass a pointer into the kernel but want that pointer unreadable or otherwise unusable.
The copy_*to*_user is the bit where care is needed.
Yes, *copy*_to_user* are the most critical ones in terms of enforcing the capability model. However, at times controlling *copy*_from_user* is equally important, see Amit's "Handle siginfo_t user pointer tags" series for an example (__copy_siginfo_from_user() notably). Even outside of these special cases, defaulting to a non-pointer-preserving copy is a plus in terms of intentionality, strengthening our confidence that user pointers don't get propagated by accident.
I do agree with that, though splitting those into two variants does not make them bulletproof against potential misuse. Still, I see your point.
Agreed, we can't prevent logic errors, but at least a simple grep allows identifying where user pointers are copied in.
I am therefore strongly in favour of keeping the explicit approach (separate _with_ptr variant) for *copy*_from_user*, and logically that should apply to all variants including copy_struct_from_user.
So in other words you would like to see an almost exact replica of copy_struct_from_user. Can do that.
Unfortunately yes, that means duplicating copy_struct_from_user. It's not _that_ big though, so it's probably not worth using macro magic to avoid the duplication (as that ends up being a lot less readable).
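A duplicated variant would presumably look something like this - a sketch only, mirroring the body of the existing copy_struct_from_user with the copy routine swapped, not necessarily the final implementation:

static __always_inline __must_check int
copy_struct_from_user_with_ptr(void *dst, size_t ksize,
			       const void __user *src, size_t usize)
{
	size_t size = min(ksize, usize);
	size_t rest = max(ksize, usize) - size;

	/* Deal with trailing bytes exactly as copy_struct_from_user does. */
	if (usize < ksize) {
		memset(dst + size, 0, rest);
	} else if (usize > ksize) {
		int ret = check_zeroed_user(src + size, rest);

		if (ret <= 0)
			return ret ?: -E2BIG;
	}
	/* The only difference: a capability-tag-preserving copy. */
	if (copy_from_user_with_ptr(dst, src, size))
		return -EFAULT;
	return 0;
}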
Kevin
On Fri, Nov 25, 2022 at 12:13:26PM +0000, Zachary Leaf wrote:
On 21/11/2022 12:08, Tudor Cretu wrote:
On 21-11-2022 11:40, Beata Michalska wrote:
On Mon, Nov 21, 2022 at 10:57:10AM +0000, Tudor Cretu wrote:
Hi Beata,
On 18-11-2022 00:05, Beata Michalska wrote:
Make copy_struct_from_user capability-aware by switching the actual copying routine to copy_from_user_with_ptr, which preserves capability tags throughout the process.
Just a short question: we're keeping both copy_from_user and copy_from_user_with_ptr because we don't want to preserve capability tags by default and each time this is done should be intentional, right? Is there anything specific to copy_struct_from_user that would make it fine to preserve capability tags by default? I think the alternative would be to have two versions: copy_struct_from_user and copy_struct_from_user_with_ptr so that we don't preserve tags by default.
The 'intentional' part here should be applied when copying to userspace, not the other way round, with the stress point being on not providing valid capabilities to user space when not intended, unless I am missing something?
Alright, I guess because there isn't a copy_struct_to_user as well, we can just do your change without creating confusion. LGTM, thanks for the clarification!
Right, just so it's clear in my mind - if we do copy_*from*_user without _with_ptr here, then we do not preserve cap tags, which means the kernel would not be able to access or otherwise use that pointer?
Yes, regular memcpy routines are aware of neither capability registers nor the alignment requirements, so the capability tags (if any) will be lost on loads from memory. The '_with_ptr' variants use capability registers and take care of access alignment, so the tags are carried over through loads and stores. As a result, capabilities copied with those routines are fully preserved. As long as a valid capability has been provided (where one was expected), that capability can be used by the kernel to access the given memory area. If the regular memcpy variants were used instead, any memory access through such a capability would result in a capability tag fault.
I can't think of a scenario where you'd pass a pointer into the kernel but want that pointer unreadable or otherwise unusable.
Capabilities can be used to protect non-pointer types (there are software-defined bits in the permissions set that can be used for that purpose), though we are not dealing with those, at least not at this point. That said, such capabilities should not turn up where actual pointers are expected - not on purpose, that is. What led you to thinking about an invalid capability being passed as a pointer? Not sure I caught your thought here.
--- BR B.
The copy_*to*_user is the bit where care is needed.
Thanks, Zach
Thanks, Tudor
BR B.
Thanks, Tudor
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/linux/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 88b2224e85c3..ec31478634cc 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -419,7 +419,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
 		return ret ?: -E2BIG;
 	}
 	/* Copy the interoperable parts of the struct. */
-	if (copy_from_user(dst, src, size))
+	if (copy_from_user_with_ptr(dst, src, size))
 		return -EFAULT;
 	return 0;
 }
On 30/11/2022 19:31, Beata Michalska wrote:
On Fri, Nov 25, 2022 at 12:13:26PM +0000, Zachary Leaf wrote:
On 21/11/2022 12:08, Tudor Cretu wrote:
On 21-11-2022 11:40, Beata Michalska wrote:
On Mon, Nov 21, 2022 at 10:57:10AM +0000, Tudor Cretu wrote:
Hi Beata,
On 18-11-2022 00:05, Beata Michalska wrote:
Make copy_struct_from_user capability-aware by switching the actual copying routine to copy_from_user_with_ptr, which preserves capability tags throughout the process.
Just a short question: we're keeping both copy_from_user and copy_from_user_with_ptr because we don't want to preserve capability tags by default and each time this is done should be intentional, right? Is there anything specific to copy_struct_from_user that would make it fine to preserve capability tags by default? I think the alternative would be to have two versions: copy_struct_from_user and copy_struct_from_user_with_ptr so that we don't preserve tags by default.
The 'intentional' part here should be applied when copying to userspace, not the other way round, with the stress point being on not providing valid capabilities to user space when not intended, unless I am missing something?
Alright, I guess because there isn't a copy_struct_to_user as well, we can just do your change without creating confusion. LGTM, thanks for the clarification!
Right, just so it's clear in my mind - if we do copy_*from*_user without _with_ptr here, then we do not preserve cap tags, which means the kernel would not be able to access or otherwise use that pointer?
Yes, regular memcpy routines are aware of neither capability registers nor the alignment requirements, so the capability tags (if any) will be lost on loads from memory. The '_with_ptr' variants use capability registers and take care of access alignment, so the tags are carried over through loads and stores. As a result, capabilities copied with those routines are fully preserved. As long as a valid capability has been provided (where one was expected), that capability can be used by the kernel to access the given memory area. If the regular memcpy variants were used instead, any memory access through such a capability would result in a capability tag fault.
Ah yep, that makes sense - since the tag bit is stored out of band, it will be lost unless you explicitly copy it.
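(For illustration, the same effect in a purecap userspace sketch - assuming cheriintrin.h and a CHERI compiler; a byte-granule copy carries the bit pattern but never the out-of-band tag:)

#include <cheriintrin.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
	int x = 42;
	int *p = &x;	/* valid capability: tag set */
	int *q;
	unsigned char *src = (unsigned char *)&p;
	unsigned char *dst = (unsigned char *)&q;

	/* 1-byte loads/stores copy the representation but cannot carry the tag. */
	for (size_t i = 0; i < sizeof(p); i++)
		dst[i] = src[i];

	printf("tag(p)=%d tag(q)=%d\n",
	       (int)cheri_tag_get(p), (int)cheri_tag_get(q));
	/* Dereferencing q here would raise a capability tag fault. */
	return 0;
}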
I can't think of a scenario where you'd pass a pointer into the kernel but want that pointer unreadable or otherwise unusable.
Capabilities can be used to protect non-pointer types (there are software-defined bits in the permissions set that can be used for that purpose), though we are not dealing with those, at least not at this point. That said, such capabilities should not turn up where actual pointers are expected - not on purpose, that is. What led you to thinking about an invalid capability being passed as a pointer? Not sure I caught your thought here.
That's good to know as well. I just meant: if you are copying some block of memory containing pointers, is there a scenario where you would ever want to use the non-_with_ptr variant to *intentionally* lose the tag bits?
I think Kevin already answered this under the same patch - there are some special cases where you'd want to do that e.g. in the siginfo_t patches[1] some memory/data might be provided to other processes, so we should avoid copying the tags in those cases:
	if (current_pid ?
	    copy_from_user_with_ptr(to, from, sizeof(struct kernel_siginfo)) :
	    copy_from_user(to, from, sizeof(struct kernel_siginfo)))
Thanks, Zach
1: [linux-morello] [PATCH 0/6] Handle siginfo_t user pointer tags
BR B.
The copy_*to*_user is the bit where care is needed.
Thanks, Zach
Thanks, Tudor
BR B.
Thanks, Tudor
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/linux/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 88b2224e85c3..ec31478634cc 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -419,7 +419,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
 		return ret ?: -E2BIG;
 	}
 	/* Copy the interoperable parts of the struct. */
-	if (copy_from_user(dst, src, size))
+	if (copy_from_user_with_ptr(dst, src, size))
 		return -EFAULT;
 	return 0;
 }
The clone3 syscall provides a superset of the functionality of its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, some of whose members represent user pointers. Currently those are specified as being of __u64 type (alignment aside), which is problematic for architectures where that type cannot actually hold user pointers. In order to support those architectures, the relevant clone_args members are redefined with the appropriate __kernel_uintptr_t type. This, though, brings its own drawbacks. The structure gains a revised layout with explicit padding for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in a noticeable growth in the structure's size. Furthermore, for affected architectures, clone3 is no longer compat-mode agnostic, and as such the main syscall handler has to be repurposed, at the cost of marginal overhead, in favour of avoiding code duplication.
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/uapi/linux/sched.h |  36 ++++++++---
 kernel/fork.c              | 123 +++++++++++++++++++++++++++++--------
 2 files changed, 122 insertions(+), 37 deletions(-)

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..56ca73ca763c 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -87,26 +87,42 @@
  *
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and
- * must be properly 64bit aligned.
+ * must be properly aligned on at least 64bit boundary.
+ * See below on explicit padding when:
+ * ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64))
+ *
  */
+
+#define __rsvd(tag) \
+	__u8 __rsvd_##tag[sizeof(__kernel_uintptr_t) - sizeof(__u64)]
+
 struct clone_args {
 	__aligned_u64 flags;
-	__aligned_u64 pidfd;
-	__aligned_u64 child_tid;
-	__aligned_u64 parent_tid;
+	__rsvd(v0_1);
+	__kernel_aligned_uintptr_t pidfd;
+	__kernel_aligned_uintptr_t child_tid;
+	__kernel_aligned_uintptr_t parent_tid;
 	__aligned_u64 exit_signal;
-	__aligned_u64 stack;
+	__rsvd(v0_2);
+	__kernel_aligned_uintptr_t stack;
 	__aligned_u64 stack_size;
-	__aligned_u64 tls;
-	__aligned_u64 set_tid;
+	__rsvd(v0_3);
+	__kernel_aligned_uintptr_t tls;
+	/* VER0 boundary */
+	__kernel_aligned_uintptr_t set_tid;
 	__aligned_u64 set_tid_size;
+	/* VER1 boundary */
 	__aligned_u64 cgroup;
+	/* VER2 boundary */
 };
 #endif

-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
-#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
-#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 128 : 64)
+#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 152 : 80)
+#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 160 : 88)

 /*
  * Scheduling policies

diff --git a/kernel/fork.c b/kernel/fork.c
index 181771571599..72de1187a586 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2759,67 +2759,136 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, user_uintptr_t, newsp,

 #ifdef __ARCH_WANT_SYS_CLONE3

+#define __clone_args_size_ver(ver, pfx) \
+	pfx##CLONE_ARGS_SIZE_VER##ver
+
+#ifdef CONFIG_COMPAT64
+struct compat_clone_args {
+	__aligned_u64 flags;
+	__aligned_u64 pidfd;
+	__aligned_u64 child_tid;
+	__aligned_u64 parent_tid;
+	__aligned_u64 exit_signal;
+	__aligned_u64 stack;
+	__aligned_u64 stack_size;
+	__aligned_u64 tls;
+	__aligned_u64 set_tid;
+	__aligned_u64 set_tid_size;
+	__aligned_u64 cgroup;
+};
+
+#define COMPAT_CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+#define COMPAT_CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#define COMPAT_CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+
+#define clone_args_size_ver(ver) \
+	(in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \
+			     : __clone_args_size_ver(ver, ))
+
+#define clone_args_get(args, member) \
+	(in_compat_syscall() ? (args)->__compat_args.member \
+			     : (args)->__args.member)
+
+#define clone_args_as_user_ptr(args, member) \
+	(in_compat_syscall() ? compat_ptr(clone_args_get(args, member)) \
+			     : as_user_ptr(clone_args_get(args, member)))
+#else /* CONFIG_COMPAT64 */
+#define clone_args_size_ver(ver) __clone_args_size_ver(ver, )
+#define clone_args_get(args, member) ((args)->member)
+#define clone_args_as_user_ptr(args, member) \
+	as_user_ptr(clone_args_get(args, member))
+#endif /* CONFIG_COMPAT64 */
+
+static inline void clone_args_validate_static(void)
+{
+#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \
+do { \
+	BUILD_BUG_ON(offsetofend(type, tls) != pfx##_SIZE_VER0); \
+	BUILD_BUG_ON(offsetofend(type, set_tid_size) != pfx##_SIZE_VER1);\
+	BUILD_BUG_ON(offsetofend(type, cgroup) != pfx##_SIZE_VER2); \
+	BUILD_BUG_ON(sizeof(type) != pfx##_SIZE_VER2); \
+} while (0)
+
+	CLONE_ARGS_BUILD_BUG_ON(struct clone_args, CLONE_ARGS);
+#ifdef CONFIG_COMPAT64
+	CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_CLONE_ARGS);
+	BUILD_BUG_ON(sizeof(struct clone_args) < sizeof(struct compat_clone_args));
+#endif
+}
+
 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
					       struct clone_args __user *uargs,
					       size_t usize)
 {
 	int err;
+#ifdef CONFIG_COMPAT64
+	union {
+		struct clone_args __args;
+		struct compat_clone_args __compat_args;
+	} args;
+#else
 	struct clone_args args;
+#endif
 	pid_t *kset_tid = kargs->set_tid;

-	BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
-		     CLONE_ARGS_SIZE_VER0);
-	BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
-		     CLONE_ARGS_SIZE_VER1);
-	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
-		     CLONE_ARGS_SIZE_VER2);
-	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+	clone_args_validate_static();

 	if (unlikely(usize > PAGE_SIZE))
 		return -E2BIG;
-	if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
+	if (unlikely(usize < clone_args_size_ver(0)))
 		return -EINVAL;

+#ifdef CONFIG_COMPAT64
+	err = copy_struct_from_user(&args, (in_compat_syscall() ?
+					    sizeof(struct compat_clone_args) :
+					    sizeof(args)),
+				    uargs, usize);
+#else
 	err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+#endif
 	if (err)
 		return err;

-	if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
+	if (unlikely(clone_args_get(&args, set_tid_size) > MAX_PID_NS_LEVEL))
 		return -EINVAL;

-	if (unlikely(!args.set_tid && args.set_tid_size > 0))
+	if (unlikely(!clone_args_get(&args, set_tid) &&
+		     clone_args_get(&args, set_tid_size) > 0))
 		return -EINVAL;

-	if (unlikely(args.set_tid && args.set_tid_size == 0))
+	if (unlikely(clone_args_get(&args, set_tid) &&
+		     clone_args_get(&args, set_tid_size) == 0))
 		return -EINVAL;

 	/*
 	 * Verify that higher 32bits of exit_signal are unset and that
 	 * it is a valid signal
 	 */
-	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
-		     !valid_signal(args.exit_signal)))
+	if (unlikely((clone_args_get(&args, exit_signal) & ~((u64)CSIGNAL)) ||
+		     !valid_signal(clone_args_get(&args, exit_signal))))
 		return -EINVAL;

-	if ((args.flags & CLONE_INTO_CGROUP) &&
-	    (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
+	if ((clone_args_get(&args, flags) & CLONE_INTO_CGROUP) &&
+	    (clone_args_get(&args, cgroup) > INT_MAX ||
+	     usize < clone_args_size_ver(2)))
 		return -EINVAL;

 	*kargs = (struct kernel_clone_args){
-		.flags = args.flags,
-		.pidfd = u64_to_user_ptr(args.pidfd),
-		.child_tid = u64_to_user_ptr(args.child_tid),
-		.parent_tid = u64_to_user_ptr(args.parent_tid),
-		.exit_signal = args.exit_signal,
-		.stack = args.stack,
-		.stack_size = args.stack_size,
-		.tls = args.tls,
-		.set_tid_size = args.set_tid_size,
-		.cgroup = args.cgroup,
+		.flags = clone_args_get(&args, flags),
+		.pidfd = clone_args_as_user_ptr(&args, pidfd),
+		.child_tid = clone_args_as_user_ptr(&args, child_tid),
+		.parent_tid = clone_args_as_user_ptr(&args, parent_tid),
+		.exit_signal = clone_args_get(&args, exit_signal),
+		.stack = clone_args_get(&args, stack),
+		.stack_size = clone_args_get(&args, stack_size),
+		.tls = clone_args_get(&args, tls),
+		.set_tid_size = clone_args_get(&args, set_tid_size),
+		.cgroup = clone_args_get(&args, cgroup),
 	};

-	if (args.set_tid &&
-	    copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
+	if (clone_args_get(&args, set_tid) &&
+	    copy_from_user(kset_tid, clone_args_as_user_ptr(&args, set_tid),
 			   (kargs->set_tid_size * sizeof(pid_t))))
 		return -EFAULT;
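For illustration, a minimal purecap caller of the revised interface could look roughly like this (a hypothetical sketch, not part of the series; it assumes the PCuABI userspace headers where __kernel_uintptr_t is capability-sized, so the casts below carry the whole capability):

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_* flags */
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	static char stack[64 * 1024] __attribute__((aligned(16)));
	pid_t child_tid = 0;

	struct clone_args args = {
		.flags		= CLONE_CHILD_SETTID,
		/* Capability-sized members: tags survive the copy-in. */
		.child_tid	= (__kernel_uintptr_t)&child_tid,
		.exit_signal	= SIGCHLD,
		.stack		= (__kernel_uintptr_t)stack,
		.stack_size	= sizeof(stack),
	};

	long ret = syscall(__NR_clone3, &args, sizeof(args));

	if (ret == 0)
		_exit(0);		/* child */
	if (ret < 0) {
		perror("clone3");
		return 1;
	}
	waitpid((pid_t)ret, NULL, 0);	/* parent */
	return 0;
}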
On 18/11/2022 00:05, Beata Michalska wrote:
The clone3 syscall provides a superset of the functionality of its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, some of whose members represent user pointers. Currently those are specified as being of __u64 type (alignment aside), which is problematic for architectures where that type cannot actually hold user pointers. In order to support those architectures, the relevant clone_args members are redefined with the appropriate __kernel_uintptr_t type. This, though, brings its own drawbacks. The structure gains a revised layout with explicit padding for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in a noticeable growth in the structure's size. Furthermore, for affected architectures, clone3 is no longer compat-mode agnostic, and as such the main syscall handler has to be repurposed, at the cost of marginal overhead, in favour of avoiding code duplication.
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/uapi/linux/sched.h |  36 ++++++++---
 kernel/fork.c              | 123 +++++++++++++++++++++++++++++--------
 2 files changed, 122 insertions(+), 37 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..56ca73ca763c 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -87,26 +87,42 @@
  *
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and
- * must be properly 64bit aligned.
+ * must be properly aligned on at least 64bit boundary.
+ * See below on explicit padding when:
+ * ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64))
+ *
I think this would still work without the explicit padding, right? What's the reason for adding it explicitly? This is related to the issue of struct holes at https://git.morello-project.org/morello/kernel/linux/-/issues/38
Seems to me it adds some extra complexity, as Kristina mentioned in the above issue, but I might be missing something here.
  */
+
+#define __rsvd(tag) \
+	__u8 __rsvd_##tag[sizeof(__kernel_uintptr_t) - sizeof(__u64)]
+
 struct clone_args {
 	__aligned_u64 flags;
-	__aligned_u64 pidfd;
-	__aligned_u64 child_tid;
-	__aligned_u64 parent_tid;
+	__rsvd(v0_1);
+	__kernel_aligned_uintptr_t pidfd;
+	__kernel_aligned_uintptr_t child_tid;
+	__kernel_aligned_uintptr_t parent_tid;
 	__aligned_u64 exit_signal;
-	__aligned_u64 stack;
+	__rsvd(v0_2);
+	__kernel_aligned_uintptr_t stack;
 	__aligned_u64 stack_size;
-	__aligned_u64 tls;
-	__aligned_u64 set_tid;
+	__rsvd(v0_3);
+	__kernel_aligned_uintptr_t tls;
+	/* VER0 boundary */
+	__kernel_aligned_uintptr_t set_tid;
 	__aligned_u64 set_tid_size;
+	/* VER1 boundary */
 	__aligned_u64 cgroup;
+	/* VER2 boundary */
 };
 #endif

-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
-#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
-#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 128 : 64)
+#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 152 : 80)
+#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 160 : 88)
This is just hardcoding in the struct sizes, which is fine, but just as a general question: would it ever be possible for __alignof__(__kernel_uintptr_t) to be different from, or more than, 16?
Seems like what we've got here is just a proxy for: ((__alignof__(__kernel_uintptr_t) == 16) ? 160 : 88)
Thanks, Zach
 /*
  * Scheduling policies

diff --git a/kernel/fork.c b/kernel/fork.c
index 181771571599..72de1187a586 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2759,67 +2759,136 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, user_uintptr_t, newsp,

 #ifdef __ARCH_WANT_SYS_CLONE3

+#define __clone_args_size_ver(ver, pfx) \
+	pfx##CLONE_ARGS_SIZE_VER##ver
+
+#ifdef CONFIG_COMPAT64
+struct compat_clone_args {
+	__aligned_u64 flags;
+	__aligned_u64 pidfd;
+	__aligned_u64 child_tid;
+	__aligned_u64 parent_tid;
+	__aligned_u64 exit_signal;
+	__aligned_u64 stack;
+	__aligned_u64 stack_size;
+	__aligned_u64 tls;
+	__aligned_u64 set_tid;
+	__aligned_u64 set_tid_size;
+	__aligned_u64 cgroup;
+};
+
+#define COMPAT_CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+#define COMPAT_CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#define COMPAT_CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+
+#define clone_args_size_ver(ver) \
+	(in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \
+			     : __clone_args_size_ver(ver, ))
+
+#define clone_args_get(args, member) \
+	(in_compat_syscall() ? (args)->__compat_args.member \
+			     : (args)->__args.member)
+
+#define clone_args_as_user_ptr(args, member) \
+	(in_compat_syscall() ? compat_ptr(clone_args_get(args, member)) \
+			     : as_user_ptr(clone_args_get(args, member)))
+#else /* CONFIG_COMPAT64 */
+#define clone_args_size_ver(ver) __clone_args_size_ver(ver, )
+#define clone_args_get(args, member) ((args)->member)
+#define clone_args_as_user_ptr(args, member) \
+	as_user_ptr(clone_args_get(args, member))
+#endif /* CONFIG_COMPAT64 */
+
+static inline void clone_args_validate_static(void)
+{
+#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \
+do { \
+	BUILD_BUG_ON(offsetofend(type, tls) != pfx##_SIZE_VER0); \
+	BUILD_BUG_ON(offsetofend(type, set_tid_size) != pfx##_SIZE_VER1);\
+	BUILD_BUG_ON(offsetofend(type, cgroup) != pfx##_SIZE_VER2); \
+	BUILD_BUG_ON(sizeof(type) != pfx##_SIZE_VER2); \
+} while (0)
+
+	CLONE_ARGS_BUILD_BUG_ON(struct clone_args, CLONE_ARGS);
+#ifdef CONFIG_COMPAT64
+	CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_CLONE_ARGS);
+	BUILD_BUG_ON(sizeof(struct clone_args) < sizeof(struct compat_clone_args));
+#endif
+}
+
 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
					       struct clone_args __user *uargs,
					       size_t usize)
 {
 	int err;
+#ifdef CONFIG_COMPAT64
+	union {
+		struct clone_args __args;
+		struct compat_clone_args __compat_args;
+	} args;
+#else
 	struct clone_args args;
+#endif
 	pid_t *kset_tid = kargs->set_tid;

-	BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
-		     CLONE_ARGS_SIZE_VER0);
-	BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
-		     CLONE_ARGS_SIZE_VER1);
-	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
-		     CLONE_ARGS_SIZE_VER2);
-	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+	clone_args_validate_static();

 	if (unlikely(usize > PAGE_SIZE))
 		return -E2BIG;
-	if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
+	if (unlikely(usize < clone_args_size_ver(0)))
 		return -EINVAL;

+#ifdef CONFIG_COMPAT64
+	err = copy_struct_from_user(&args, (in_compat_syscall() ?
+					    sizeof(struct compat_clone_args) :
+					    sizeof(args)),
+				    uargs, usize);
+#else
 	err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+#endif
 	if (err)
 		return err;

-	if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
+	if (unlikely(clone_args_get(&args, set_tid_size) > MAX_PID_NS_LEVEL))
 		return -EINVAL;

-	if (unlikely(!args.set_tid && args.set_tid_size > 0))
+	if (unlikely(!clone_args_get(&args, set_tid) &&
+		     clone_args_get(&args, set_tid_size) > 0))
 		return -EINVAL;

-	if (unlikely(args.set_tid && args.set_tid_size == 0))
+	if (unlikely(clone_args_get(&args, set_tid) &&
+		     clone_args_get(&args, set_tid_size) == 0))
 		return -EINVAL;

 	/*
 	 * Verify that higher 32bits of exit_signal are unset and that
 	 * it is a valid signal
 	 */
-	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
-		     !valid_signal(args.exit_signal)))
+	if (unlikely((clone_args_get(&args, exit_signal) & ~((u64)CSIGNAL)) ||
+		     !valid_signal(clone_args_get(&args, exit_signal))))
 		return -EINVAL;

-	if ((args.flags & CLONE_INTO_CGROUP) &&
-	    (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
+	if ((clone_args_get(&args, flags) & CLONE_INTO_CGROUP) &&
+	    (clone_args_get(&args, cgroup) > INT_MAX ||
+	     usize < clone_args_size_ver(2)))
 		return -EINVAL;

 	*kargs = (struct kernel_clone_args){
-		.flags = args.flags,
-		.pidfd = u64_to_user_ptr(args.pidfd),
-		.child_tid = u64_to_user_ptr(args.child_tid),
-		.parent_tid = u64_to_user_ptr(args.parent_tid),
-		.exit_signal = args.exit_signal,
-		.stack = args.stack,
-		.stack_size = args.stack_size,
-		.tls = args.tls,
-		.set_tid_size = args.set_tid_size,
-		.cgroup = args.cgroup,
+		.flags = clone_args_get(&args, flags),
+		.pidfd = clone_args_as_user_ptr(&args, pidfd),
+		.child_tid = clone_args_as_user_ptr(&args, child_tid),
+		.parent_tid = clone_args_as_user_ptr(&args, parent_tid),
+		.exit_signal = clone_args_get(&args, exit_signal),
+		.stack = clone_args_get(&args, stack),
+		.stack_size = clone_args_get(&args, stack_size),
+		.tls = clone_args_get(&args, tls),
+		.set_tid_size = clone_args_get(&args, set_tid_size),
+		.cgroup = clone_args_get(&args, cgroup),
 	};

-	if (args.set_tid &&
-	    copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
+	if (clone_args_get(&args, set_tid) &&
+	    copy_from_user(kset_tid, clone_args_as_user_ptr(&args, set_tid),
 			   (kargs->set_tid_size * sizeof(pid_t))))
 		return -EFAULT;
On Fri, Nov 25, 2022 at 12:13:38PM +0000, Zachary Leaf wrote:
On 18/11/2022 00:05, Beata Michalska wrote:
The clone3 syscall provides a superset of the functionality of its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, some of whose members represent user pointers. Currently those are specified as being of __u64 type (alignment aside), which is problematic for architectures where that type cannot actually hold user pointers. In order to support those architectures, the relevant clone_args members are redefined with the appropriate __kernel_uintptr_t type. This, though, brings its own drawbacks. The structure gains a revised layout with explicit padding for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in a noticeable growth in the structure's size. Furthermore, for affected architectures, clone3 is no longer compat-mode agnostic, and as such the main syscall handler has to be repurposed, at the cost of marginal overhead, in favour of avoiding code duplication.
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/uapi/linux/sched.h |  36 ++++++++---
 kernel/fork.c              | 123 +++++++++++++++++++++++++++++--------
 2 files changed, 122 insertions(+), 37 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..56ca73ca763c 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -87,26 +87,42 @@
  *
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and
- * must be properly 64bit aligned.
+ * must be properly aligned on at least 64bit boundary.
+ * See below on explicit padding when:
+ * ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64))
+ *
I think this would still work without the explicit padding, right? What's the reason for adding it explicitly? This is related to the issue of struct holes at https://git.morello-project.org/morello/kernel/linux/-/issues/38
It is somewhat related, though it does not suffer from the risk of potentially leaking data as, in this case, we are moving memory in one direction only: from user space into kernel space. And yes, it would work without the explicit padding - the only thing happening here is making the padding more visible.
Seems to me it adds some extra complexity as Kristina mentioned in above issue, but might be missing something here.
It's mainly visualizing the padding - there is no additional complexity here. And as this struct is extendable with size versioning, it might actually be convenient when specifying the size of future extensions and verifying the struct's layout. Though if you believe it is still too 'disruptive', I can remove those.
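E.g. a hypothetical future VER3 member would slot in after the VER2 boundary, with the size macro following the same pattern (new_member and the 176/96 sizes are made up for illustration, assuming a 16- vs 8-byte __kernel_uintptr_t; no __rsvd is needed here since the VER2 boundary is already capability-aligned):

struct clone_args {
	...
	__aligned_u64 cgroup;
	/* VER2 boundary */
	__kernel_aligned_uintptr_t new_member;
	/* VER3 boundary */
};

#define CLONE_ARGS_SIZE_VER3 /* sizeof fourth published struct */ \
	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 176 : 96)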
  */
+
+#define __rsvd(tag) \
+	__u8 __rsvd_##tag[sizeof(__kernel_uintptr_t) - sizeof(__u64)]
+
 struct clone_args {
 	__aligned_u64 flags;
-	__aligned_u64 pidfd;
-	__aligned_u64 child_tid;
-	__aligned_u64 parent_tid;
+	__rsvd(v0_1);
+	__kernel_aligned_uintptr_t pidfd;
+	__kernel_aligned_uintptr_t child_tid;
+	__kernel_aligned_uintptr_t parent_tid;
 	__aligned_u64 exit_signal;
-	__aligned_u64 stack;
+	__rsvd(v0_2);
+	__kernel_aligned_uintptr_t stack;
 	__aligned_u64 stack_size;
-	__aligned_u64 tls;
-	__aligned_u64 set_tid;
+	__rsvd(v0_3);
+	__kernel_aligned_uintptr_t tls;
+	/* VER0 boundary */
+	__kernel_aligned_uintptr_t set_tid;
 	__aligned_u64 set_tid_size;
+	/* VER1 boundary */
 	__aligned_u64 cgroup;
+	/* VER2 boundary */
 };
 #endif

-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
-#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
-#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 128 : 64)
+#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 152 : 80)
+#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 160 : 88)
This is just hardcoding in the struct sizes which is fine, but just as a
That's how this struct is being versioned.
general question, would it ever be possible for __alignof__(__kernel_uintptr_t) to be different or more than 16?
If __kernel_uintptr_t is 64-bit then yes, the alignment will be different - on 8-byte boundaries. Additionally, as __kernel_uintptr_t is defined as __u64 for !PCuABI, it can be aligned on 4-byte boundaries on some 32-bit platforms, which is why this struct forces the alignment of its members to be (for now at least) 8 bytes.
Seems like what we've got here is just a proxy for: ((__alignof__(__kernel_uintptr_t) == 16) ? 160 : 88)
Yes, although I do prefer the verbosity here. Do you believe this is not readable and should be changed?
--- BR B.
Thanks, Zach
 /*
  * Scheduling policies

diff --git a/kernel/fork.c b/kernel/fork.c
index 181771571599..72de1187a586 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2759,67 +2759,136 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, user_uintptr_t, newsp,

 #ifdef __ARCH_WANT_SYS_CLONE3

+#define __clone_args_size_ver(ver, pfx) \
+	pfx##CLONE_ARGS_SIZE_VER##ver
+
+#ifdef CONFIG_COMPAT64
+struct compat_clone_args {
+	__aligned_u64 flags;
+	__aligned_u64 pidfd;
+	__aligned_u64 child_tid;
+	__aligned_u64 parent_tid;
+	__aligned_u64 exit_signal;
+	__aligned_u64 stack;
+	__aligned_u64 stack_size;
+	__aligned_u64 tls;
+	__aligned_u64 set_tid;
+	__aligned_u64 set_tid_size;
+	__aligned_u64 cgroup;
+};
+
+#define COMPAT_CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+#define COMPAT_CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#define COMPAT_CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+
+#define clone_args_size_ver(ver) \
+	(in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \
+			     : __clone_args_size_ver(ver, ))
+
+#define clone_args_get(args, member) \
+	(in_compat_syscall() ? (args)->__compat_args.member \
+			     : (args)->__args.member)
+
+#define clone_args_as_user_ptr(args, member) \
+	(in_compat_syscall() ? compat_ptr(clone_args_get(args, member)) \
+			     : as_user_ptr(clone_args_get(args, member)))
+#else /* CONFIG_COMPAT64 */
+#define clone_args_size_ver(ver) __clone_args_size_ver(ver, )
+#define clone_args_get(args, member) ((args)->member)
+#define clone_args_as_user_ptr(args, member) \
+	as_user_ptr(clone_args_get(args, member))
+#endif /* CONFIG_COMPAT64 */
+
+static inline void clone_args_validate_static(void)
+{
+#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \
+do { \
+	BUILD_BUG_ON(offsetofend(type, tls) != pfx##_SIZE_VER0); \
+	BUILD_BUG_ON(offsetofend(type, set_tid_size) != pfx##_SIZE_VER1);\
+	BUILD_BUG_ON(offsetofend(type, cgroup) != pfx##_SIZE_VER2); \
+	BUILD_BUG_ON(sizeof(type) != pfx##_SIZE_VER2); \
+} while (0)
+
+	CLONE_ARGS_BUILD_BUG_ON(struct clone_args, CLONE_ARGS);
+#ifdef CONFIG_COMPAT64
+	CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_CLONE_ARGS);
+	BUILD_BUG_ON(sizeof(struct clone_args) < sizeof(struct compat_clone_args));
+#endif
+}
+
 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
					       struct clone_args __user *uargs,
					       size_t usize)
 {
 	int err;
+#ifdef CONFIG_COMPAT64
+	union {
+		struct clone_args __args;
+		struct compat_clone_args __compat_args;
+	} args;
+#else
 	struct clone_args args;
+#endif
 	pid_t *kset_tid = kargs->set_tid;

-	BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
-		     CLONE_ARGS_SIZE_VER0);
-	BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
-		     CLONE_ARGS_SIZE_VER1);
-	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
-		     CLONE_ARGS_SIZE_VER2);
-	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+	clone_args_validate_static();

 	if (unlikely(usize > PAGE_SIZE))
 		return -E2BIG;
-	if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
+	if (unlikely(usize < clone_args_size_ver(0)))
 		return -EINVAL;

+#ifdef CONFIG_COMPAT64
+	err = copy_struct_from_user(&args, (in_compat_syscall() ?
+					    sizeof(struct compat_clone_args) :
+					    sizeof(args)),
+				    uargs, usize);
+#else
 	err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+#endif
 	if (err)
 		return err;

-	if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
+	if (unlikely(clone_args_get(&args, set_tid_size) > MAX_PID_NS_LEVEL))
 		return -EINVAL;

-	if (unlikely(!args.set_tid && args.set_tid_size > 0))
+	if (unlikely(!clone_args_get(&args, set_tid) &&
+		     clone_args_get(&args, set_tid_size) > 0))
 		return -EINVAL;

-	if (unlikely(args.set_tid && args.set_tid_size == 0))
+	if (unlikely(clone_args_get(&args, set_tid) &&
+		     clone_args_get(&args, set_tid_size) == 0))
 		return -EINVAL;

 	/*
 	 * Verify that higher 32bits of exit_signal are unset and that
 	 * it is a valid signal
 	 */
-	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
-		     !valid_signal(args.exit_signal)))
+	if (unlikely((clone_args_get(&args, exit_signal) & ~((u64)CSIGNAL)) ||
+		     !valid_signal(clone_args_get(&args, exit_signal))))
 		return -EINVAL;

-	if ((args.flags & CLONE_INTO_CGROUP) &&
-	    (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
+	if ((clone_args_get(&args, flags) & CLONE_INTO_CGROUP) &&
+	    (clone_args_get(&args, cgroup) > INT_MAX ||
+	     usize < clone_args_size_ver(2)))
 		return -EINVAL;

 	*kargs = (struct kernel_clone_args){
-		.flags = args.flags,
-		.pidfd = u64_to_user_ptr(args.pidfd),
-		.child_tid = u64_to_user_ptr(args.child_tid),
-		.parent_tid = u64_to_user_ptr(args.parent_tid),
-		.exit_signal = args.exit_signal,
-		.stack = args.stack,
-		.stack_size = args.stack_size,
-		.tls = args.tls,
-		.set_tid_size = args.set_tid_size,
-		.cgroup = args.cgroup,
+		.flags = clone_args_get(&args, flags),
+		.pidfd = clone_args_as_user_ptr(&args, pidfd),
+		.child_tid = clone_args_as_user_ptr(&args, child_tid),
+		.parent_tid = clone_args_as_user_ptr(&args, parent_tid),
+		.exit_signal = clone_args_get(&args, exit_signal),
+		.stack = clone_args_get(&args, stack),
+		.stack_size = clone_args_get(&args, stack_size),
+		.tls = clone_args_get(&args, tls),
+		.set_tid_size = clone_args_get(&args, set_tid_size),
+		.cgroup = clone_args_get(&args, cgroup),
 	};

-	if (args.set_tid &&
-	    copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
+	if (clone_args_get(&args, set_tid) &&
+	    copy_from_user(kset_tid, clone_args_as_user_ptr(&args, set_tid),
 			   (kargs->set_tid_size * sizeof(pid_t))))
 		return -EFAULT;
On 30/11/2022 21:26, Beata Michalska wrote:
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..56ca73ca763c 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -87,26 +87,42 @@
  *
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and
- * must be properly 64bit aligned.
+ * must be properly aligned on at least 64bit boundary.
+ * See below on explicit padding when:
+ * ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64))
+ *
I think this would still work without the explicit padding, right? What's the reason for adding it explicitly? This is related to the issue of struct holes at https://git.morello-project.org/morello/kernel/linux/-/issues/38
It is somewhat related, though it does not suffer from the risk of potentially leaking data as, in this case, we are moving memory in one direction only: from user space into kernel space. And yes, it would work without the explicit padding - the only thing happening here is making the padding more visible.
Seems to me it adds some extra complexity as Kristina mentioned in above issue, but might be missing something here.
It's mainly visualizing the padding - there is no additional complexity here. And as this struct is extendable with size versioning, it might actually be convenient when specifying the size of future extensions and verifying the struct's layout. Though if you believe it is still too 'disruptive', I can remove those.
As ever I would like us to be as consistent as possible, so if there is no particular reason to have explicit padding here I would leave it out, as otherwise it becomes unclear why we took different approaches for different uapi structs.
Kevin
On 30/11/2022 20:26, Beata Michalska wrote:
On Fri, Nov 25, 2022 at 12:13:38PM +0000, Zachary Leaf wrote:
On 18/11/2022 00:05, Beata Michalska wrote:
The clone3 syscall provides a superset of the functionality of its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, some of whose members represent user pointers. Currently those are specified as being of __u64 type (alignment aside), which is problematic for architectures where that type cannot actually hold user pointers. In order to support those architectures, the relevant clone_args members are redefined with the appropriate __kernel_uintptr_t type. This, though, brings its own drawbacks. The structure gains a revised layout with explicit padding for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in a noticeable growth in the structure's size. Furthermore, for affected architectures, clone3 is no longer compat-mode agnostic, and as such the main syscall handler has to be repurposed, at the cost of marginal overhead, in favour of avoiding code duplication.
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 include/uapi/linux/sched.h |  36 ++++++++---
 kernel/fork.c              | 123 +++++++++++++++++++++++++++++--------
 2 files changed, 122 insertions(+), 37 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..56ca73ca763c 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -87,26 +87,42 @@
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and
- * must be properly 64bit aligned.
+ * must be properly aligned on at least 64bit boundary.
+ * See below on explicit padding when:
+ * ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64))
I think this would still work without the explicit padding, right? What's the reason for adding it explicitly? This is related to the issue of struct holes at https://git.morello-project.org/morello/kernel/linux/-/issues/38
It is somewhat related, though it does not suffer from the risk of potentially leaking data as, in this case, we are moving memory in one direction only: from user space into kernel space. And yes, it would work without explicit padding - the only thing happening here is making the padding more visible.
Seems to me it adds some extra complexity, as Kristina mentioned in the above issue, but I might be missing something here.
It's mainly visualizing the padding - there is no additional complexity here. And as this struct is extensible with size versioning, it might actually be convenient when specifying the size of future extensions and verifying the struct's layout. Though if you believe it is still too 'disruptive' I can remove those.
  */
+#define __rsvd(tag) \
+	__u8 __rsvd_##tag[sizeof(__kernel_uintptr_t) - sizeof(__u64)]
+
 struct clone_args {
 	__aligned_u64 flags;
-	__aligned_u64 pidfd;
-	__aligned_u64 child_tid;
-	__aligned_u64 parent_tid;
+	__rsvd(v0_1);
+	__kernel_aligned_uintptr_t pidfd;
+	__kernel_aligned_uintptr_t child_tid;
+	__kernel_aligned_uintptr_t parent_tid;
 	__aligned_u64 exit_signal;
-	__aligned_u64 stack;
+	__rsvd(v0_2);
+	__kernel_aligned_uintptr_t stack;
 	__aligned_u64 stack_size;
-	__aligned_u64 tls;
-	__aligned_u64 set_tid;
+	__rsvd(v0_3);
+	__kernel_aligned_uintptr_t tls;
+	/* VER0 boundary */
+	__kernel_aligned_uintptr_t set_tid;
 	__aligned_u64 set_tid_size;
+	/* VER1 boundary */
 	__aligned_u64 cgroup;
+	/* VER2 boundary */
 };
 #endif

-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
-#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
-#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 128 : 64)
+#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 152 : 80)
+#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 160 : 88)
This is just hardcoding in the struct sizes which is fine, but just as a general question, would it ever be possible for __alignof__(__kernel_uintptr_t) to be different or more than 16?

That's how this struct is being versioned.

If __kernel_uintptr_t is 64 bits then yes, the alignment will be on 8-byte boundaries. Additionally, as __kernel_uintptr_t is defined as __u64 for !PCuABI, it can be aligned on 4-byte boundaries on some 32-bit platforms, which is why this struct forces the alignment of its members to be (for now at least) 8 bytes.
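To make the versioning-by-size point concrete, a minimal caller might look like this (an illustrative sketch only; a binary built against the first published layout keeps passing the smaller size and copy_struct_from_user() zero-fills the newer members):

	#include <linux/sched.h>	/* struct clone_args, CLONE_ARGS_SIZE_VER0 */
	#include <signal.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* fork-like clone3 call using only the VER0 portion of the struct */
	static pid_t clone3_ver0(void)
	{
		struct clone_args args = {
			.exit_signal = SIGCHLD,
		};

		return (pid_t)syscall(__NR_clone3, &args, CLONE_ARGS_SIZE_VER0);
	}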
Seems like what we've got here is just a proxy for: ((__alignof__(__kernel_uintptr_t) == 16) ? 160 : 88)
Yes, although I do prefer the verbosity here. Do you believe this is not readable and should be changed?
In hindsight I think it's fine.
My initial concern/question here was whether in future we might have __kernel_uintptr_t being 32-byte/256-bit or something like that. Then it might be better to be explicit about which values of alignof(x) relate to which struct sizes, instead of specifying it just as a relation of x > y.
In that case we've got more than two options, so what I suggested wouldn't work either and would need additional cases anyway.
Thanks, Zach
BR B.
[...]
On 01/12/2022 12:14, Zachary Leaf wrote:
-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
-#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
-#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 128 : 64)
+#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 152 : 80)
+#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
+	((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 160 : 88)
This is just hardcoding in the struct sizes which is fine, but just as a general question, would it ever be possible for __alignof__(__kernel_uintptr_t) to be different or more than 16?

That's how this struct is being versioned.

If __kernel_uintptr_t is 64 bits then yes, the alignment will be on 8-byte boundaries. Additionally, as __kernel_uintptr_t is defined as __u64 for !PCuABI, it can be aligned on 4-byte boundaries on some 32-bit platforms, which is why this struct forces the alignment of its members to be (for now at least) 8 bytes.
Seems like what we've got here is just a proxy for: ((__alignof__(__kernel_uintptr_t) == 16) ? 160 : 88)
Yes, although I do prefer the verbosity here. Do you believe this is not readable and should be changed?
In hindsight I think it's fine.
My initial concern/question here was whether in future we might have __kernel_uintptr_t being 32-byte/256-bit or something like that. Then it might be better to be explicit about which values of alignof(x) relate to which struct sizes, instead of specifying it just as a relation of x > y.
In that case we've got more than two options, so what I suggested wouldn't work either and would need additional cases anyway.
In this particular case we need to handle each size (above 64-bit) explicitly so there's no fundamental difference between either of these approaches.
I don't have a strong preference here but just one thing I wanted to note, it doesn't really make sense to look at the alignment of __kernel_uintptr_t vs __u64, as what we have in that struct is the __aligned variant of both. I think we can safely assume that __aligned means self-aligned, i.e. sizeof() == alignof(), so I would replace __alignof__ with sizeof in the tests.
Kevin
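Sketching that suggestion (not what was posted; the same test keyed off sizeof()):

	#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
		((sizeof(__kernel_uintptr_t) > sizeof(__u64)) ? 160 : 88)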
On 18/11/2022 01:05, Beata Michalska wrote:
The clone3 syscall provides a superset of the functionality compared to its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, with some of its members representing user pointers. Currently those are specified as being of __u64 type (alignment aside) which is problematic for some architectures, where that type cannot actually hold said pointers. In order to add support for the latter, the relevant clone_args struct members get redefined with the appropriate __kernel_uintptr_t type. This though, brings its own drawbacks. The structure itself gains revised signature with explicit paddings for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in noticeable growth in the structure's size. Furthermore, for affected architectures, as a consequence clone3 is no longer compat mode agnostic, and as such, requires repurposing the main syscall handler at a cost of marginal overhead though in favour of avoiding code duplication.
Might want to break that down into a couple of paragraphs to give the avid reader some space to breathe :)
[...]
+#define clone_args_size_ver(ver) \
+	(in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \
We are ending up with really a lot of calls to in_compat_syscall() in copy_clone_args_from_user(). It looks like in_compat_syscall() is completely implemented inline, so maybe the compiler can decide that it can just call it once and cache the result, but that's definitely worth checking.
+			     : __clone_args_size_ver(ver, ))
+
+#define clone_args_get(args, member) \
+	(in_compat_syscall() ? (args)->__compat_args.member \
+			     : (args)->__args.member)
+
+#define clone_args_as_user_ptr(args, member) \
+	(in_compat_syscall() ? compat_ptr(clone_args_get(args, member)) \
+			     : as_user_ptr(clone_args_get(args, member)))
+
+#else /* CONFIG_COMPAT64 */
+#define clone_args_size_ver(ver)	__clone_args_size_ver(ver, )
+#define clone_args_get(args, member)	((args)->member)
+#define clone_args_as_user_ptr(args, member) \
+	as_user_ptr(clone_args_get(args, member))
I was going to say that this does not do what you want, but, having just looked again at the implementation of as_user_ptr(), I stand corrected! In any case, this was not the intention for as_user_ptr(), it is meant to signal that an arbitrary integer is being stored into a user pointer (see Documentation/core-api/user_ptr.rst). This is not what happens here, you just want to get a void __user * from a __kernel_uintptr_t. Since this does not involve any representation change [1], a simple cast is appropriate. This is what we now have in aio_setup_rw() for instance.
To avoid any confusion I would also rename the macro, maybe clone_args_get_user_ptr()?
[1] Writing this I realise this is not strictly true: in 32-bit, __kernel_uintptr_t is 64-bit but void __user * is 32-bit. This is not a concern though as the compiler is happy to cast a 64-bit integer to a 32-bit pointer.
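In other words, something like this for the !COMPAT64 helper (a sketch; the name follows the rename suggested above):

	#define clone_args_get_user_ptr(args, member) \
		((void __user *)clone_args_get(args, member))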
+#endif /* CONFIG_COMPAT64 */
+static inline void clone_args_validate_static(void)
+{
+#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \
Nit: might as well add an extra hard tab to the indentation of the trailing \, better than just a space.
+do { \
+	BUILD_BUG_ON(offsetofend(type, tls) !=  pfx##_SIZE_VER0); \
+	BUILD_BUG_ON(offsetofend(type, set_tid_size) !=	pfx##_SIZE_VER1);\
Nit: a stray hard tab after != on that line, and two spaces after != on the line above.
+	BUILD_BUG_ON(offsetofend(type, cgroup) != pfx##_SIZE_VER2); \
+	BUILD_BUG_ON(sizeof(type) != pfx##_SIZE_VER2); \
Why not use that nice __clone_args_size_ver() you introduced?
[...]
On Wed, Nov 30, 2022 at 03:43:16PM +0100, Kevin Brodsky wrote:
On 18/11/2022 01:05, Beata Michalska wrote:
The clone3 syscall provides a superset of the functionality compared to its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, with some of its members representing user pointers. Currently those are specified as being of __u64 type (alignment aside) which is problematic for some architectures, where that type cannot actually hold said pointers. In order to add support for the latter, the relevant clone_args struct members get redefined with the appropriate __kernel_uintptr_t type. This though, brings its own drawbacks. The structure itself gains revised signature with explicit paddings for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in noticeable growth in the structure's size. Furthermore, for affected architectures, as a consequence clone3 is no longer compat mode agnostic, and as such, requires repurposing the main syscall handler at a cost of marginal overhead though in favour of avoiding code duplication.
Might want to break that down into a couple of paragraphs to give the avid reader some space to breathe :)
Noted.
[...]
+#define clone_args_size_ver(ver) \
+	(in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \
We are ending up with really a lot of calls to in_compat_syscall() in copy_clone_args_from_user(). It looks like in_compat_syscall() is completely implemented inline, so maybe the compiler can decide that it can just call it once and cache the result, but that's definitely worth checking.
I was concerned a bit about that, which is why I did mention performance impact in the commit message. I'll have another thought on how to optimize that one out.
[...]
+#else /* CONFIG_COMPAT64 */
+#define clone_args_size_ver(ver)	__clone_args_size_ver(ver, )
+#define clone_args_get(args, member)	((args)->member)
+#define clone_args_as_user_ptr(args, member) \
+	as_user_ptr(clone_args_get(args, member))
I was going to say that this does not do what you want, but, having just looked again at the implementation of as_user_ptr(), I stand corrected! In any case, this was not the intention for as_user_ptr(), it is meant to signal that an arbitrary integer is being stored into a user pointer (see Documentation/core-api/user_ptr.rst). This is not what happens here, you just want to get a void __user * from a __kernel_uintptr_t. Since this does not involve any representation change [1], a simple cast is appropriate. This is what we now have in aio_setup_rw() for instance.
To avoid any confusion I would also rename the macro, maybe clone_args_get_user_ptr()?
[1] Writing this I realise this is not strictly true: in 32-bit, __kernel_uintptr_t is 64-bit but void __user * is 32-bit. This is not a concern though as the compiler is happy to cast a 64-bit integer to a 32-bit pointer.
I actually did hesitate with that one for non-PCuABI cases but I agree, this one should be updated.
+#endif /* CONFIG_COMPAT64 */
+static inline void clone_args_validate_static(void)
+{
+#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \
Nit: might as well add an extra hard tab to the indentation of the trailing \, better than just a space.
+do { \
+	BUILD_BUG_ON(offsetofend(type, tls) !=  pfx##_SIZE_VER0); \
+	BUILD_BUG_ON(offsetofend(type, set_tid_size) !=	pfx##_SIZE_VER1);\
Nit: a stray hard tab after != on that line, and two spaces after != on the line above.
+	BUILD_BUG_ON(offsetofend(type, cgroup) != pfx##_SIZE_VER2); \
+	BUILD_BUG_ON(sizeof(type) != pfx##_SIZE_VER2); \
Why not use that nice __clone_args_size_ver() you introduced?
That's a fair point! I think those came first :) Will change that.
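Presumably ending up with something along these lines (a sketch, not the final version):

	#define CLONE_ARGS_BUILD_BUG_ON(type, pfx)				\
	do {									\
		BUILD_BUG_ON(offsetofend(type, tls) !=				\
			     __clone_args_size_ver(0, pfx));			\
		BUILD_BUG_ON(offsetofend(type, set_tid_size) !=			\
			     __clone_args_size_ver(1, pfx));			\
		BUILD_BUG_ON(offsetofend(type, cgroup) !=			\
			     __clone_args_size_ver(2, pfx));			\
		BUILD_BUG_ON(sizeof(type) != __clone_args_size_ver(2, pfx));	\
	} while (0)

	/* invoked with an empty or COMPAT_ prefix: */
	CLONE_ARGS_BUILD_BUG_ON(struct clone_args, );
	CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_);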
--- BR B.
[...]
On 30/11/2022 21:38, Beata Michalska wrote:
+#define clone_args_size_ver(ver) \
+	(in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \
We are ending up with really a lot of calls to in_compat_syscall() in copy_clone_args_from_user(). It looks like in_compat_syscall() is completely implemented inline, so maybe the compiler can decide that it can just call it once and cache the result, but that's definitely worth checking.
I was concerned a bit about that, which is why I did mention performance impact in the commit message. I'll have another thought on how to optimize that one out.
It's really down to what the compiler outputs; if everything gets optimised out I have no problem keeping it this way.
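One shape the helpers could take if the calls don't fold away (an illustrative sketch, not the posted code): sample the mode once and pass it in, so in_compat_syscall() runs exactly once per copy_clone_args_from_user() invocation.

	#define clone_args_get(compat, args, member) \
		((compat) ? (args)->__compat_args.member \
			  : (args)->__args.member)

	/* in copy_clone_args_from_user():
	 *	const bool compat = in_compat_syscall();
	 *	...
	 *	kargs->flags = clone_args_get(compat, &args, flags);
	 */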
Kevin
Add a bunch of test-cases for clone3 syscall, focusing mainly on handling struct clone_args versioning and sizing, alongside the obvious req for respecting capabilities.
Signed-off-by: Beata Michalska beata.michalska@arm.com
---
 tools/testing/selftests/arm64/morello/clone.c | 269 +++++++++++++++++-
 1 file changed, 267 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/arm64/morello/clone.c b/tools/testing/selftests/arm64/morello/clone.c
index 8217840ba504..13c84dc7ca77 100644
--- a/tools/testing/selftests/arm64/morello/clone.c
+++ b/tools/testing/selftests/arm64/morello/clone.c
@@ -1,15 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (C) 2021 Arm Limited

+#include "freestanding.h"
 #include <linux/mman.h>
 #include <linux/resource.h>
 #include <linux/wait.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/signal.h>
+#include <limits.h>
 #include <cheriintrin.h>
-#include "freestanding.h"

-#define STACK_SIZE 1024*1024
+#define STACK_SIZE 1024*1024
+#define DEFAULT_PAGE_SIZE 4096

 #define in_restricted() \
	(!(cheri_perms_get(cheri_pcc_get()) & ARM_CAP_PERMISSION_EXECUTIVE))
@@ -17,6 +20,10 @@

 #define MAGIC_NR 0x3562 /* whatever really ..... */

+#ifndef MAX_PID_NS_LEVEL
+#define MAX_PID_NS_LEVEL 32
+#endif
+
 /* Cloned thread result */
 #define CTR_SUCCESS 1
 #define CTR_FAILED -1
@@ -256,6 +263,263 @@ TEST(test_wait_rusage)
 	RUN_WITH_FIXTURE(data, CLONE_TH_RUSAGE, wait_single);
 }
+
+#define GET_ERRNO(err) (((err) + MAX_ERRNO) ^ MAX_ERRNO)
+
+#define DECLARE_DESC(arg, val) \
+	{ \
+		offsetof(struct clone_args, arg), \
+		sizeof(((struct clone_args *)0)->arg), \
+		(uintcap_t)(val) \
+	}
+
+struct clone3_fixture {
+	size_t args_size;
+	struct {
+		unsigned int offset;
+		unsigned int size;
+		uintcap_t value;
+	/* This will get extra size but that keeps things on the safe side */
+	} const args_desc[sizeof(struct clone_args)/sizeof(__u64)];
+	int n_desc;
+	int e_result;
+} const clone3_data[] = {
+	/* BEGIN_SECTION: expected failure */
+	/* size of clone_args smaller than CLONE_ARGS_SIZE_VER0 */
+	{
+		.args_size = offsetof(struct clone_args, tls),
+		.args_desc = {},
+		.n_desc = 0,
+		.e_result = -EINVAL
+	}, /* @{0} */
+	/* invalid set_tid array size */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(set_tid_size, MAX_PID_NS_LEVEL + 1),
+		},
+		.n_desc = 1,
+		.e_result = -EINVAL
+	}, /* @{1} */
+	/* Invalid combination of set_tid & set_tid_size */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(set_tid, (uintcap_t) NULL),
+			DECLARE_DESC(set_tid_size, 1)
+		},
+		.n_desc = 2,
+		.e_result = -EINVAL
+	}, /* @{2} */
+	/* Invalid exit_signal */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(exit_signal, _NSIG + 1),
+		},
+		.n_desc = 1,
+		.e_result = -EINVAL
+	}, /* @{3} */
+	/* Invalid cgroup number */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(flags, CLONE_INTO_CGROUP),
+			DECLARE_DESC(cgroup, (__u64)INT_MAX + 1),
+		},
+		.n_desc = 2,
+		.e_result = -EINVAL
+	}, /* @{4} */
+	/* Invalid size for clone_args with cgroup */
+	{
+		.args_size = offsetof(struct clone_args, cgroup),
+		.args_desc = {
+			DECLARE_DESC(flags, CLONE_INTO_CGROUP),
+			DECLARE_DESC(cgroup, 1),
+		},
+		.n_desc = 2,
+		.e_result = -EINVAL
+	}, /* @{5} */
+	/* Invalid stack & stack_size combination */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(stack, NULL),
+			DECLARE_DESC(stack_size, 4096),
+		},
+		.n_desc = 2,
+		.e_result = -EINVAL
+	}, /* @{6} */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(stack_size, 0),
+		},
+		.n_desc = 1,
+		.e_result = -EINVAL
+	}, /* @{7} */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(set_tid, &(pid_t){1}),
+			DECLARE_DESC(set_tid_size, 1),
+		},
+		.n_desc = 2,
+		.e_result = -EEXIST
+	}, /* @{8} */
+	/* END_SECTION: expected failure */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(flags, CLONE_PIDFD |
+					    CLONE_CHILD_SETTID |
+					    CLONE_CHILD_CLEARTID),
+		},
+		.n_desc = 1,
+		.e_result = 0
+	}, /* @{9} */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(flags, CLONE_PARENT_SETTID |
+					    CLONE_CHILD_SETTID),
+		},
+		.n_desc = 1,
+		.e_result = 0
+	}, /* @{10} */
+	{
+		.args_size = sizeof(struct clone_args),
+		.args_desc = {
+			DECLARE_DESC(flags, CLONE_SETTLS),
+		},
+		.n_desc = 1,
+		.e_result = 0
+	}, /* @{11} */
+};
+
+__thread int tls_data;
+
+static __attribute__((noinline)) void run_child(struct clone_args *args)
+{
+	int count = 1000;
+
+	if (args->flags & CLONE_CHILD_SETTID) {
+		pid_t current_pid = (pid_t)syscall(__NR_getpid);
+
+		ASSERT_FALSE(current_pid != *(pid_t *)args->child_tid);
+	}
+
+	if (args->flags & CLONE_SETTLS && args->tls) {
+		ptraddr_t base_addr = cheri_address_get(args->tls);
+		ptraddr_t ref_addr = cheri_address_get(&tls_data);
+		size_t length = cheri_length_get(args->tls);
+
+		ASSERT_TRUE(cheri_tag_get(args->tls));
+		ASSERT_FALSE(ref_addr < base_addr || ref_addr >= base_addr + length);
+	}
+	/*
+	 * This is grisly ....
+	 * Spin for a while to give a chance for the parent to be
+	 * scheduled in after clone and before this thread terminates
+	 * (not entirely reliable though)
+	 */
+	while (--count)
+		sleep_cycles(&(int){1000});
+
+	syscall(__NR_exit, 0);
+}
+
+static inline __attribute__((always_inline))
+void run_clone3(const struct clone3_fixture *data)
+{
+	int pidfd, parent_tid, child_tid;
+	siginfo_t *wait_si = &(siginfo_t){0};
+	int result;
+	pid_t pid;
+
+	struct clone_args args = {
+		.flags = 0,
+		.pidfd = (uintcap_t)&pidfd,
+		.parent_tid = (uintcap_t)&parent_tid,
+		.child_tid = (uintcap_t)&child_tid,
+		.exit_signal = SIGCHLD,
+		.stack = (uintcap_t) mmap_verified(NULL, STACK_SIZE,
+						   PROT_READ | PROT_WRITE,
+						   MAP_ANONYMOUS | MAP_PRIVATE,
+						   -1, 0, STACK_REQ_PERMS),
+		.stack_size = STACK_SIZE,
+	};
+
+	for (int i = 0; i < data->n_desc; ++i) {
+		unsigned char *arg;
+
+		arg = (unsigned char *)&args + data->args_desc[i].offset;
+
+		if (data->args_desc[i].size != sizeof(uintcap_t))
+			*(__u64 *)arg = (__u64)data->args_desc[i].value;
+		else
+			*(uintcap_t *)arg = data->args_desc[i].value;
+	}
+
+	/* Don't bother with the full set-up for expected failures */
+	if (data->e_result) {
+		result = syscall(__NR_clone3, &args, data->args_size);
+		ASSERT_EQ(result, data->e_result) {
+			TH_LOG("Expected: %d while %d was received",
+			       GET_ERRNO(data->e_result), GET_ERRNO(result));
+		}
+		munmap((void *)args.stack, STACK_SIZE);
+		return;
+	}
+
+	args.flags |= CLONE_VM;
+
+	if (args.flags & CLONE_SETTLS) {
+		void *addr = mmap_verified(NULL, DEFAULT_PAGE_SIZE,
+					   PROT_READ | PROT_WRITE,
+					   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0,
+					   CAP_LOAD_PERMS | CAP_STORE_PERMS);
+
+		args.tls = (uintcap_t)cheri_bounds_set(addr, DEFAULT_PAGE_SIZE);
+	}
+
+	pid = syscall(__NR_clone3, &args, data->args_size);
+
+	ASSERT_GE(pid, 0);
+
+	if (!pid)
+		run_child(&args);
+
+	if (args.flags & CLONE_PIDFD) {
+		/* Signal handling is not the test target here: valid pidfd is */
+		result = syscall(__NR_pidfd_send_signal, pidfd, SIGUSR1, NULL, 0);
+		ASSERT_NE(result, -EBADF);
+	}
+
+	waitid(P_PID, pid, wait_si, WEXITED, NULL);
+
+	/* child_tid set once the thread gets scheduled */
+	if (args.flags & CLONE_PARENT_SETTID && args.flags & CLONE_CHILD_SETTID
+	    && !(args.flags & CLONE_CHILD_CLEARTID))
+		ASSERT_EQ(*(pid_t *)args.parent_tid, *(pid_t *)args.child_tid)
+
+	if (args.flags & CLONE_CHILD_CLEARTID)
+		ASSERT_EQ(*(pid_t *)args.child_tid, 0);
+
+	munmap((void *)args.stack, STACK_SIZE);
+	if (args.flags & CLONE_SETTLS)
+		munmap((void *)args.tls, STACK_SIZE);
+}
+
+TEST(test_clone3)
+{
+	int ncount = sizeof(clone3_data)/sizeof(clone3_data[0]);
+
+	for (int i = 0; i < ncount; ++i) {
+		TH_LOG("Validating clone3 @{%d}", i);
+		run_clone3(&clone3_data[i]);
+	}
+}
+
 void run_restricted(uintcap_t entry_point)
 {
 	void *new_stack = allocate_mem_raw(STACK_SIZE);
@@ -290,5 +554,6 @@ int main(void)
 	run_restricted((uintcap_t)test_clone_tls_restricted);
 	test_wait_raw();
 	test_wait_rusage();
+	test_clone3();
 	return 0;
 }
On 18/11/2022 01:05, Beata Michalska wrote:
[...]
+#define GET_ERRNO(err) (((err) + MAX_ERRNO) ^ MAX_ERRNO)
This magic vaguely rings a bell but I'm not entirely sure why we need it here, could you elaborate?
+#define DECLARE_DESC(arg, val) \
+	{ \
+		offsetof(struct clone_args, arg), \
+		sizeof(((struct clone_args *)0)->arg), \
+		(uintcap_t)(val) \
+	}
+
+struct clone3_fixture {
+	size_t args_size;
+	struct {
+		unsigned int offset;
+		unsigned int size;
+		uintcap_t value;
This is quite a complicated way to initialise a struct per-test. Having directly struct clone_args here would make things a lot simpler (and it would take less space in fact).
I understand that what I'm suggesting is not quite the same, as instead of overriding the default values here, you would be providing the defaults and run_clone3() would need to figure out what to provide itself. AFAICT this can simply be done by checking if the field is 0 (e.g. .flags), with the notable exception of .stack. You could have a .custom_stack field or similar to tell run_clone3() not to touch .stack / .stack_size.
It's worth noting that the current approach does not really work for the stack either: in case 6, you end up leaking the whole stack as the munmap() in run_clone3() will get NULL instead of the originally allocated stack. Therefore it seems to me that something like a .custom_stack field is required anyway.
[...]
+__thread int tls_data;
Nit: could be static and local to run_child().
+static __attribute__((noinline)) void run_child(struct clone_args *args)
+{
+	int count = 1000;
+
+	if (args->flags & CLONE_CHILD_SETTID) {
+		pid_t current_pid = (pid_t)syscall(__NR_getpid);
We've got a getpid() wrapper in freestanding.h, could use it here.
+		ASSERT_FALSE(current_pid != *(pid_t *)args->child_tid);
+	}
+
+	if (args->flags & CLONE_SETTLS && args->tls) {
+		ptraddr_t base_addr = cheri_address_get(args->tls);
+		ptraddr_t ref_addr = cheri_address_get(&tls_data);
+		size_t length = cheri_length_get(args->tls);
+
+		ASSERT_TRUE(cheri_tag_get(args->tls));
+		ASSERT_FALSE(ref_addr < base_addr || ref_addr >= base_addr + length);
Nit: I think the reverse condition with ASSERT_TRUE() is a bit more readable.
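I.e. the same check, reversed:

	ASSERT_TRUE(ref_addr >= base_addr && ref_addr < base_addr + length);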
[...]
+static inline __attribute__((always_inline))
+void run_clone3(const struct clone3_fixture *data)
+{
+	int pidfd, parent_tid, child_tid;
+	siginfo_t *wait_si = &(siginfo_t){0};
Can't we just have `siginfo_t wait_si;` (no need to initialise it either as it's never read)?
[...]
+	/* Don't bother with the full set-up for expected failures */
+	if (data->e_result) {
+		result = syscall(__NR_clone3, &args, data->args_size);
+		ASSERT_EQ(result, data->e_result) {
ASSERT_* causes the whole program to exit, so you probably want EXPECT_EQ() here.
TH_LOG("Expected: %d while %d was received",
GET_ERRNO(data->e_result), GET_ERRNO(result));
}
munmap((void *)args.stack, STACK_SIZE);
return;
- }
- args.flags |= CLONE_VM;
- if (args.flags & CLONE_SETTLS) {
void *addr = mmap_verified(NULL, DEFAULT_PAGE_SIZE,
Not a big fan of calling this DEFAULT_PAGE_SIZE, as we shouldn't make any assumption on the page size. Fortunately the page size doesn't actually matter here as both mmap() and munmap() will align up the length to the page size. Maybe simply TLS_SIZE?
+					   PROT_READ | PROT_WRITE,
+					   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0,
+					   CAP_LOAD_PERMS | CAP_STORE_PERMS);
+
+		args.tls = (uintcap_t)cheri_bounds_set(addr, DEFAULT_PAGE_SIZE);
+	}
+	pid = syscall(__NR_clone3, &args, data->args_size);
+
+	ASSERT_GE(pid, 0);
+
+	if (!pid)
+		run_child(&args);
+
+	if (args.flags & CLONE_PIDFD) {
+		/* Signal handling is not the test target here: valid pidfd is */
+		result = syscall(__NR_pidfd_send_signal, pidfd, SIGUSR1, NULL, 0);
+		ASSERT_NE(result, -EBADF);
Agreed that we're not testing signal handling here, in fact for that reason I'd suggest using a signal that is ignored by default like SIGCHLD, not SIGUSR1 which causes termination by default. Alternatively, you could use sigwaitinfo() in the child, which also means you wouldn't need the busy loop.
OTOH I think it wouldn't hurt to check that pidfd_send_signal() didn't fail (in any way), as if it does that would suggest something went wrong with the test setup.
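E.g. (a sketch combining both points):

	/* SIGCHLD is ignored by default, so this only validates the pidfd */
	result = syscall(__NR_pidfd_send_signal, pidfd, SIGCHLD, NULL, 0);
	ASSERT_EQ(result, 0);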
+	}
+
+	waitid(P_PID, pid, wait_si, WEXITED, NULL);
Should check that this one doesn't fail. This also makes me think, can we now get rid of the scary wait substitute in clone_single(), since we're not bothering with it here?
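E.g. (assuming the freestanding waitid() wrapper returns the syscall result):

	result = waitid(P_PID, pid, wait_si, WEXITED, NULL);
	ASSERT_EQ(result, 0);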
+	/* child_tid set once the thread gets scheduled */
+	if (args.flags & CLONE_PARENT_SETTID && args.flags & CLONE_CHILD_SETTID
+	    && !(args.flags & CLONE_CHILD_CLEARTID))
+		ASSERT_EQ(*(pid_t *)args.parent_tid, *(pid_t *)args.child_tid)
Considering that none of the testcases override either argument, that could be simply written as ASSERT_EQ(parent_tid, child_tid).
For extra robustness, maybe also 0-init both variables and check that the value is not 0 (i.e. the kernel didn't perform either write)?
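I.e. (sketch):

	int pidfd, parent_tid = 0, child_tid = 0;

	/* ... after waitid() ... */
	ASSERT_NE(child_tid, 0);
	ASSERT_EQ(parent_tid, child_tid);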
+	if (args.flags & CLONE_CHILD_CLEARTID)
+		ASSERT_EQ(*(pid_t *)args.child_tid, 0);
+
+	munmap((void *)args.stack, STACK_SIZE);
+	if (args.flags & CLONE_SETTLS)
+		munmap((void *)args.tls, STACK_SIZE);
s/STACK_SIZE/DEFAULT_PAGE_SIZE/
Kevin
[...]
On Wed, Nov 30, 2022 at 03:49:38PM +0100, Kevin Brodsky wrote:
On 18/11/2022 01:05, Beata Michalska wrote:
[...]
+#define GET_ERRNO(err) (((err) + MAX_ERRNO) ^ MAX_ERRNO)
This magic vaguely rings a bell but I'm not entirely sure why we need it here, could you elaborate?
Getting the absolute value to report the errno. It's not strictly necessary, though I think I found that convenient at some point....
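For the record, it's a branchless absolute value for the error range: syscall error returns live in [-MAX_ERRNO, 0], so err + MAX_ERRNO fits in 12 bits and XOR with 0xfff maps it back to the magnitude. A standalone check (MAX_ERRNO assumed to be 4095, as in the kernel):

	#include <stdio.h>

	#define MAX_ERRNO	4095
	#define GET_ERRNO(err)	(((err) + MAX_ERRNO) ^ MAX_ERRNO)

	int main(void)
	{
		/* -22 (-EINVAL) + 4095 = 4073 = 0xfe9; 0xfe9 ^ 0xfff = 22 */
		printf("%d\n", GET_ERRNO(-22));	/* prints 22 */
		printf("%d\n", GET_ERRNO(0));	/* prints 0 */
		return 0;
	}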
[...]
+struct clone3_fixture {
+	size_t args_size;
+	struct {
+		unsigned int offset;
+		unsigned int size;
+		uintcap_t value;
This is quite a complicated way to initialise a struct per-test. Having directly struct clone_args here would make things a lot simpler (and it would take less space in fact).
I understand that what I'm suggesting is not quite the same, as instead of overriding the default values here, you would be providing the defaults and run_clone3() would need to figure out what to provide itself. AFAICT this can simply be done by checking if the field is 0 (e.g. .flags), with the notable exception of .stack. You could have a .custom_stack field or similar to tell run_clone3() not to touch .stack / .stack_size.
OK, will try to re-arrange things here.
It's worth noting that the current approach does not really work for the stack either: in case 6, you end up leaking the whole stack as the munmap() in run_clone3() will get NULL instead of the originally allocated stack. Therefore it seems to me that something like a .custom_stack field is required anyway.
Yeah, that one I've missed!
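So the fixture could end up looking something like this (a sketch of the suggested shape; .custom_stack is the hypothetical flag mentioned above):

	struct clone3_fixture {
		size_t args_size;
		struct clone_args args;	/* template; zero members mean defaults */
		bool custom_stack;	/* skip default stack set-up/munmap() */
		int e_result;
	};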
[...]
+__thread int tls_data;
Nit: could be static and local to run_child().
Noted.
+static __attribute__((noinline)) void run_child(struct clone_args *args)
+{
+	int count = 1000;
+	if (args->flags & CLONE_CHILD_SETTID) {
+		pid_t current_pid = (pid_t)syscall(__NR_getpid);
We've got a getpid() wrapper in freestanding.h, could use it here.
Good idea.
+		ASSERT_FALSE(current_pid != *(pid_t *)args->child_tid);
+	}
+	if (args->flags & CLONE_SETTLS && args->tls) {
+		ptraddr_t base_addr = cheri_address_get(args->tls);
+		ptraddr_t ref_addr = cheri_address_get(&tls_data);
+		size_t length = cheri_length_get(args->tls);
+		ASSERT_TRUE(cheri_tag_get(args->tls));
+		ASSERT_FALSE(ref_addr < base_addr || ref_addr >= base_addr + length);
Nit: I think the reverse condition with ASSERT_TRUE() is a bit more readable.
Noted.
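i.e. the equivalent positive form, something like:

	ASSERT_TRUE(ref_addr >= base_addr && ref_addr < base_addr + length);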
+	}
+	/*
+	 * This is grisly ....
+	 * Spin for a while to give a chance for the parent to be
+	 * scheduled in after clone and before this thread terminates
+	 * (not entirely reliable though)
+	 */
+	while (--count)
+		sleep_cycles(&(int){1000});
+	syscall(__NR_exit, 0);
+}
+static inline __attribute__((always_inline))
+void run_clone3(const struct clone3_fixture *data)
+{
+	int pidfd, parent_tid, child_tid;
+	siginfo_t *wait_si = &(siginfo_t){0};
Can't we just have `siginfo_t wait_si;` (no need to initialise it either as it's never read)?
This is simply getting the stack address for an object of type typeof(siginfo_t), but yeah, I guess I can do that the 'regular' way.
+	int result;
+	pid_t pid;
+	struct clone_args args = {
+		.flags = 0,
+		.pidfd = (uintcap_t)&pidfd,
+		.parent_tid = (uintcap_t)&parent_tid,
+		.child_tid = (uintcap_t)&child_tid,
+		.exit_signal = SIGCHLD,
+		.stack = (uintcap_t) mmap_verified(NULL, STACK_SIZE,
+						   PROT_READ | PROT_WRITE,
+						   MAP_ANONYMOUS | MAP_PRIVATE,
+						   -1, 0, STACK_REQ_PERMS),
+		.stack_size = STACK_SIZE,
+	};
+	for (int i = 0; i < data->n_desc; ++i) {
+		unsigned char *arg;
+		arg = (unsigned char *)&args + data->args_desc[i].offset;
+		if (data->args_desc[i].size != sizeof(uintcap_t))
+			*(__u64 *)arg = (__u64)data->args_desc[i].value;
+		else
+			*(uintcap_t *)arg = data->args_desc[i].value;
+	}
+	/* Don't bother with the full set-up for expected failures */
+	if (data->e_result) {
+		result = syscall(__NR_clone3, &args, data->args_size);
+		ASSERT_EQ(result, data->e_result) {
ASSERT_* causes the whole program to exit, so you probably want EXPECT_EQ() here.
That would mean all ASSERT_x should be replaced with their EXPECT_x counterparts?
TH_LOG("Expected: %d while %d was received",
GET_ERRNO(data->e_result), GET_ERRNO(result));
}
munmap((void *)args.stack, STACK_SIZE);
return;
- }
+	args.flags |= CLONE_VM;
+	if (args.flags & CLONE_SETTLS) {
+		void *addr = mmap_verified(NULL, DEFAULT_PAGE_SIZE,
Not a big fan of calling this DEFAULT_PAGE_SIZE, as we shouldn't make any assumption on the page size. Fortunately the page size doesn't actually matter here as both mmap() and munmap() will align up the length to the page size. Maybe simply TLS_SIZE?
Yeah, I did hesitate on that one for a moment, but as this one does not literally expect PAGE_SIZE, I went for it. I'll change that to TLS_SIZE instead.
+					   PROT_READ | PROT_WRITE,
+					   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0,
+					   CAP_LOAD_PERMS | CAP_STORE_PERMS);
+		args.tls = (uintcap_t)cheri_bounds_set(addr, DEFAULT_PAGE_SIZE);
+	}
+	pid = syscall(__NR_clone3, &args, data->args_size);
+	ASSERT_GE(pid, 0);
+	if (!pid)
+		run_child(&args);
- if (args.flags & CLONE_PIDFD) {
/* Signal handling is not the test target here: valid pidfd is */
result = syscall(__NR_pidfd_send_signal, pidfd, SIGUSR1, NULL, 0);
ASSERT_NE(result, -EBADF);
Agreed that we're not testing signal handling here, in fact for that reason I'd suggest using a signal that is ignored by default like SIGCHLD, not SIGUSR1 which causes termination by default. Alternatively, you could use sigwaitinfo() in the child, which also means you wouldn't need the busy loop.
True, though I was trying to avoid using signal handling here on purpose, so as not to rely on other 'functionality', in line with our initial approach to kselftests: test with minimal setup to avoid the test failing on things that are not the test subject themselves.
OTOH I think it wouldn't hurt to check that pidfd_send_signal() didn't fail (in any way), as if it does that would suggest something went wrong with the test setup.
... or something is wrong with handling signals, which relates to my comment above. I can change both if we want to relax our kselftests.
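For reference, the child-side wait Kevin mentions could look roughly like this (a sketch only, using raw syscalls in the spirit of the freestanding tests; the kernel sigset is assumed to be a plain 64-bit mask here):

	/* Block SIGUSR1, then wait for the parent's pidfd_send_signal() */
	unsigned long mask = 1UL << (SIGUSR1 - 1);

	syscall(__NR_rt_sigprocmask, SIG_BLOCK, &mask, NULL, sizeof(mask));
	syscall(__NR_rt_sigtimedwait, &mask, NULL, NULL, sizeof(mask));

which would replace the sleep_cycles() busy loop in run_child().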
+	}
+	waitid(P_PID, pid, wait_si, WEXITED, NULL);
Should check that this one doesn't fail. This also makes me think: can we now get rid of the scary wait substitute in clone_single(), since we're not bothering with it here?
We could, though at this point the wait testcases are done, which means we can safely assume wait is working as expected and that it's safe to rely on it. Again - exercising limited trust.
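Minimally that could be something like (assuming the freestanding waitid() wrapper returns the raw syscall result):

	ASSERT_EQ(waitid(P_PID, pid, wait_si, WEXITED, NULL), 0);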
+	/* child_tid set once the thread gets scheduled */
+	if (args.flags & CLONE_PARENT_SETTID && args.flags & CLONE_CHILD_SETTID
+	    && !(args.flags & CLONE_CHILD_CLEARTID))
+		ASSERT_EQ(*(pid_t *)args.parent_tid, *(pid_t *)args.child_tid);
Considering that none of the testcases override either argument, that could be simply written as ASSERT_EQ(parent_tid, child_tid).
True
For extra robustness, maybe also 0-init both variables and check that the values are no longer 0 (i.e. catch the case where the kernel didn't perform either write)?
OK
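Putting those two suggestions together, roughly (a sketch):

	/* 0-init so we can tell whether the kernel performed the writes */
	int parent_tid = 0, child_tid = 0;

	/* ... after waitid() ... */
	if (args.flags & CLONE_PARENT_SETTID && args.flags & CLONE_CHILD_SETTID
	    && !(args.flags & CLONE_CHILD_CLEARTID)) {
		ASSERT_NE(child_tid, 0);
		ASSERT_EQ(parent_tid, child_tid);
	}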
- if (args.flags & CLONE_CHILD_CLEARTID)
ASSERT_EQ(*(pid_t *)args.child_tid, 0);
- munmap((void *)args.stack, STACK_SIZE);
- if (args.flags & CLONE_SETTLS)
munmap((void *)args.tls, STACK_SIZE);
s/STACK_SIZE/DEFAULT_PAGE_SIZE/
Busted!
+}
+
+TEST(test_clone3)
+{
+	int ncount = sizeof(clone3_data)/sizeof(clone3_data[0]);
+
+	for (int i = 0; i < ncount; ++i) {
+		TH_LOG("Validating clone3 @{%d}", i);
+		run_clone3(&clone3_data[i]);
+	}
+}
+
 void run_restricted(uintcap_t entry_point)
 {
 	void *new_stack = allocate_mem_raw(STACK_SIZE);
@@ -290,5 +554,6 @@ int main(void)
 	run_restricted((uintcap_t)test_clone_tls_restricted);
 	test_wait_raw();
 	test_wait_rusage();
+	test_clone3();
 	return 0;
 }

Kevin

Thanks for the comments.

---
BR
B.
On 30/11/2022 22:27, Beata Michalska wrote:
On Wed, Nov 30, 2022 at 03:49:38PM +0100, Kevin Brodsky wrote:
On 18/11/2022 01:05, Beata Michalska wrote:
Add a bunch of test-cases for the clone3 syscall, focusing mainly on handling struct clone_args versioning and sizing, alongside the obvious requirement of respecting capabilities.
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
 tools/testing/selftests/arm64/morello/clone.c | 269 +++++++++++++++++-
 1 file changed, 267 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/arm64/morello/clone.c b/tools/testing/selftests/arm64/morello/clone.c
index 8217840ba504..13c84dc7ca77 100644
--- a/tools/testing/selftests/arm64/morello/clone.c
+++ b/tools/testing/selftests/arm64/morello/clone.c
@@ -1,15 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (C) 2021 Arm Limited
+#include "freestanding.h"
 #include <linux/mman.h>
 #include <linux/resource.h>
 #include <linux/wait.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/signal.h>
+#include <limits.h>
 #include <cheriintrin.h>
-#include "freestanding.h"
-#define STACK_SIZE 1024*1024
+#define STACK_SIZE 1024*1024
+#define DEFAULT_PAGE_SIZE 4096
 #define in_restricted() \
 	(!(cheri_perms_get(cheri_pcc_get()) & ARM_CAP_PERMISSION_EXECUTIVE))
@@ -17,6 +20,10 @@
 #define MAGIC_NR 0x3562 /* whatever really ..... */
+#ifndef MAX_PID_NS_LEVEL
+#define MAX_PID_NS_LEVEL 32
+#endif
 /* Cloned thread result */
 #define CTR_SUCCESS 1
 #define CTR_FAILED -1
@@ -256,6 +263,263 @@ TEST(test_wait_rusage)
 	RUN_WITH_FIXTURE(data, CLONE_TH_RUSAGE, wait_single);
 }
+#define GET_ERRNO(err) (((err) + MAX_ERRNO) ^ MAX_ERRNO)
This magic vaguely rings a bell but I'm not entirely sure why we need it here, could you elaborate?
Getting the absolute value to report the errno. It's not strictly necessary, though I think I found it convenient at some point...
Why not. I'm not sure it helps that much though; in fact it feels like "ASSERT_EQ(result, data->e_result)" is enough on its own, without the explicit print after.
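For the record, the macro folds a negative errno in [-MAX_ERRNO, 0] back to its absolute value: MAX_ERRNO is 4095 (0xfff), so adding it is a subtraction from all-ones, i.e. a bitwise complement, which the XOR then undoes. A worked example:

	/* err == -EINVAL == -22:
	 * (-22 + 4095) ^ 4095 == 0xfe9 ^ 0xfff == 22
	 */
	GET_ERRNO(-EINVAL);	/* yields 22 */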
[...]
+	/* Don't bother with the full set-up for expected failures */
+	if (data->e_result) {
+		result = syscall(__NR_clone3, &args, data->args_size);
+		ASSERT_EQ(result, data->e_result) {
ASSERT_* causes the whole program to exit, so you probably want EXPECT_EQ() here.
That would mean all ASSERT_x should be replaced with their EXPECT_x counterparts?
Sorry, I'm realising I didn't think this through: if you exit, the following munmap() is irrelevant, so there is no particular reason to use EXPECT here. I don't have much of an opinion on ASSERT vs EXPECT; I don't think it really matters for these tests, so no problem leaving it unchanged.
TH_LOG("Expected: %d while %d was received",
GET_ERRNO(data->e_result), GET_ERRNO(result));
}
munmap((void *)args.stack, STACK_SIZE);
return;
- }
- args.flags |= CLONE_VM;
- if (args.flags & CLONE_SETTLS) {
void *addr = mmap_verified(NULL, DEFAULT_PAGE_SIZE,
Not a big fan of calling this DEFAULT_PAGE_SIZE, as we shouldn't make any assumption on the page size. Fortunately the page size doesn't actually matter here as both mmap() and munmap() will align up the length to the page size. Maybe simply TLS_SIZE?
Yeah, I did hesitate on that one for a moment, but as this one is not literally expecting PAGE_SIZE I went for it. I'll change that to TLS_SIZE instead.
PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0,
CAP_LOAD_PERMS | CAP_STORE_PERMS);
args.tls = (uintcap_t)cheri_bounds_set(addr, DEFAULT_PAGE_SIZE);
- }
- pid = syscall(__NR_clone3, &args, data->args_size);
- ASSERT_GE(pid, 0);
- if (!pid)
run_child(&args);
- if (args.flags & CLONE_PIDFD) {
/* Signal handling is not the test target here: valid pidfd is */
result = syscall(__NR_pidfd_send_signal, pidfd, SIGUSR1, NULL, 0);
ASSERT_NE(result, -EBADF);
Agreed that we're not testing signal handling here, in fact for that reason I'd suggest using a signal that is ignored by default like SIGCHLD, not SIGUSR1 which causes termination by default. Alternatively, you could use sigwaitinfo() in the child, which also means you wouldn't need the busy loop.
True, though I was trying to avoid using signal handling here on purpose, so as not to rely on other 'functionality', in line with our initial approach to kselftests: test with minimal setup to avoid the test failing on things that are not the test subject themselves.
OTOH I think it wouldn't hurt to check that pidfd_send_signal() didn't fail (in any way), as if it does that would suggest something went wrong with the test setup.
... or something is wrong with handling signals, which relates to my comment above. I can change both if we want to relax our kselftests.
I'd go for that; assuming that fundamental subsystems like signals cannot be relied on creates a lot of overhead, so I'd only do that for the most basic tests, and I don't consider clone3 one of them.
+	}
+	waitid(P_PID, pid, wait_si, WEXITED, NULL);
Should check that this one doesn't fail. This also makes me think: can we now get rid of the scary wait substitute in clone_single(), since we're not bothering with it here?
We could, though at this point the wait testcases are done, which means we can safely assume wait is working as expected and that it's safe to rely on it. Again - exercising limited trust.
Yep, makes sense.
Kevin
Hi Beata,
On 18-11-2022 00:05, Beata Michalska wrote:
[...]
Both this series and the LTP change look great to me! I am impressed by how neat they are...
Thanks, Tudor
On Mon, Nov 21, 2022 at 12:24:38PM +0000, Tudor Cretu wrote:
Hi Beata,
Hi Tudor,
[...]
Both this series and the LTP change look great to me! I am impressed by how neat they are...
Thanks a lot for having a look at the series!
---
BR
B.
On 21/11/2022 12:54, Beata Michalska wrote:
On Mon, Nov 21, 2022 at 12:24:38PM +0000, Tudor Cretu wrote:
[...]
Both this series and the LTP change look great to me! I am impressed by how neat they are...
Yeah agreed, the different-sized versions are tricky to deal with. The getter macros for compat/non-compat structs are nice too. Plus some very extensive tests are included as well.
Nice patches.
Thanks, Zach
On 25/11/2022 13:13, Zachary Leaf wrote:
Both this series and the LTP change look great to me! I am impressed by how neat they are...
Yeah agreed, the different-sized versions are tricky to deal with. The getter macros for compat/non-compat structs are nice too. Plus some very extensive tests are included as well.
Nice patches.
Agreed, I haven't had the chance to look at the LTP tests yet, but the kselftests are certainly more extensive than I expected :) And patch 2 is probably the "smoothest" approach to do this.
Kevin