On 30/11/2022 20:26, Beata Michalska wrote:
On Fri, Nov 25, 2022 at 12:13:38PM +0000, Zachary Leaf wrote:
On 18/11/2022 00:05, Beata Michalska wrote:
The clone3 syscall provides a superset of the functionality compared to its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, with some of its members representing user pointers. Currently those are specified as being of __u64 type (alignment aside) which is problematic for some architectures, where that type cannot actually hold said pointers. In order to add support for the latter, the relevant clone_args struct members get redefined with the appropriate __kernel_uintptr_t type. This though, brings its own drawbacks. The structure itself gains revised signature with explicit paddings for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in noticeable growth in the structure's size. Furthermore, for affected architectures, as a consequence clone3 is no longer compat mode agnostic, and as such, requires repurposing the main syscall handler at a cost of marginal overhead though in favour of avoiding code duplication.
Signed-off-by: Beata Michalska beata.michalska@arm.com
include/uapi/linux/sched.h | 36 ++++++++--- kernel/fork.c | 123 +++++++++++++++++++++++++++++-------- 2 files changed, 122 insertions(+), 37 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..56ca73ca763c 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -87,26 +87,42 @@
- The structure is versioned by size and thus extensible.
- New struct members must go at the end of the struct and
- must be properly 64bit aligned.
- must be properly aligned on at least 64bit boundary.
- See below on explicit padding when:
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64))
I think this would still work without the explicit padding, right? What's the reason for adding it explicitly? This is related to the issue of struct holes at https://git.morello-project.org/morello/kernel/linux/-/issues/38
It is somewhat related, though does not suffer from the risk of potentially leaking data as, in this case, we are moving memory only in one direction: from user-space into kernel space. And yes, it would work without explicit padding - the only thing that is happening here is making the padding more visible.
Seems to me it adds some extra complexity as Kristina mentioned in above issue, but might be missing something here.
It's mainly visualizing the padding - there is no additional complexity here. And as this struct is extendable with size versioning it might actually be convenient when specifying the size of future extensions and verifying struct's layout. Though if you believe it is still too 'disruptive' I can remove those.
*/
+#define __rsvd(tag) \
- __u8 __rsvd_##tag[sizeof(__kernel_uintptr_t) - sizeof(__u64)]
struct clone_args { __aligned_u64 flags;
- __aligned_u64 pidfd;
- __aligned_u64 child_tid;
- __aligned_u64 parent_tid;
- __rsvd(v0_1);
- __kernel_aligned_uintptr_t pidfd;
- __kernel_aligned_uintptr_t child_tid;
- __kernel_aligned_uintptr_t parent_tid; __aligned_u64 exit_signal;
- __aligned_u64 stack;
- __rsvd(v0_2);
- __kernel_aligned_uintptr_t stack; __aligned_u64 stack_size;
- __aligned_u64 tls;
- __aligned_u64 set_tid;
- __rsvd(v0_3);
- __kernel_aligned_uintptr_t tls;
- /* VER0 boundary */
- __kernel_aligned_uintptr_t set_tid; __aligned_u64 set_tid_size;
- /* VER1 boundary */ __aligned_u64 cgroup;
- /* VER2 boundary */
}; #endif -#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ -#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ -#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ +#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 128 : 64)
+#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 152 : 80)
+#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 160 : 88)
This is just hardcoding in the struct sizes which is fine, but just as a
That's how this struct is being versioned.
general question, would it ever be possible for __alignof__(__kernel_uintptr_t) to be different or more than 16?
If __kernel_uintptr_t is 64bits then yes, the alignment will be on 8-byte boundaries. Additionally, as __kernel_uintptr_t is defined as __u64 for !PCuABI it can be aligned on 4-byte boundaries for some 32-bit platforms, which is why this struct is forcing the alignment on it's members to be (now at least) 8 bytes.
Seems like what we've got here is just a proxy for: ((__alignof__(__kernel_uintptr_t) == 16) ? 160 : 88)
Yes, although I do prefer the verbosity here. Do you believe this is not readable and should be changed ?
In hindsight I think it's fine.
My initial concern/question here was if ever in future we might have have __kernel_uintptr_t being 32B/256-bit or something like that. Then it might be better to be explicit about what values of alignof(x) relate to what struct sizes instead of specifying it just as a relation of x > y.
In that case we've got more than 2 options and what I suggested wouldn't work either anyway and would need additional cases.
Thanks, Zach
BR B.
Thanks, Zach
/*
- Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c index 181771571599..72de1187a586 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2759,67 +2759,136 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, user_uintptr_t, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 +#define __clone_args_size_ver(ver, pfx) \
- pfx##CLONE_ARGS_SIZE_VER##ver
+#ifdef CONFIG_COMPAT64 +struct compat_clone_args {
- __aligned_u64 flags;
- __aligned_u64 pidfd;
- __aligned_u64 child_tid;
- __aligned_u64 parent_tid;
- __aligned_u64 exit_signal;
- __aligned_u64 stack;
- __aligned_u64 stack_size;
- __aligned_u64 tls;
- __aligned_u64 set_tid;
- __aligned_u64 set_tid_size;
- __aligned_u64 cgroup;
+};
+#define COMPAT_CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define clone_args_size_ver(ver) \
- (in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \
: __clone_args_size_ver(ver, ))
+#define clone_args_get(args, member) \
- (in_compat_syscall() ? (args)->__compat_args.member \
: (args)->__args.member)
+#define clone_args_as_user_ptr(args, member) \
- (in_compat_syscall() ? compat_ptr(clone_args_get(args, member)) \
: as_user_ptr(clone_args_get(args, member)))
+#else /* CONFIG_COMPAT64 */ +#define clone_args_size_ver(ver __clone_args_size_ver(ver, ) +#define clone_args_get(args, member) ((args)->member) +#define clone_args_as_user_ptr(args, member) \
- as_user_ptr(clone_args_get(args, member))
+#endif /* CONFIG_COMPAT64 */
+static inline void clone_args_validate_static(void) +{ +#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \ +do { \
- BUILD_BUG_ON(offsetofend(type, tls) != pfx##_SIZE_VER0); \
- BUILD_BUG_ON(offsetofend(type, set_tid_size) != pfx##_SIZE_VER1);\
- BUILD_BUG_ON(offsetofend(type, cgroup) != pfx##_SIZE_VER2); \
- BUILD_BUG_ON(sizeof(type) != pfx##_SIZE_VER2); \
+} while (0)
- CLONE_ARGS_BUILD_BUG_ON(struct clone_args, CLONE_ARGS);
+#ifdef CONFIG_COMPAT64
- CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_CLONE_ARGS);
- BUILD_BUG_ON(sizeof(struct clone_args) < sizeof(struct compat_clone_args));
+#endif +}
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; +#ifdef CONFIG_COMPAT64
- union {
struct clone_args __args;
struct compat_clone_args __compat_args;
- } args;
+#else struct clone_args args; +#endif pid_t *kset_tid = kargs->set_tid;
- BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
CLONE_ARGS_SIZE_VER0);
- BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
CLONE_ARGS_SIZE_VER1);
- BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
CLONE_ARGS_SIZE_VER2);
- BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
- clone_args_validate_static();
if (unlikely(usize > PAGE_SIZE)) return -E2BIG;
- if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
- if (unlikely(usize < clone_args_size_ver(0))) return -EINVAL;
+#ifdef CONFIG_COMPAT64
- err = copy_struct_from_user(&args, (in_compat_syscall() ?
sizeof(struct compat_clone_args) :
sizeof(args)),> + uargs, usize);
+#else err = copy_struct_from_user(&args, sizeof(args), uargs, usize); +#endif
- if (err) return err;
- if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
- if (unlikely(clone_args_get(&args, set_tid_size) > MAX_PID_NS_LEVEL)) return -EINVAL;
- if (unlikely(!args.set_tid && args.set_tid_size > 0))
- if (unlikely(!clone_args_get(&args, set_tid) &&
return -EINVAL;clone_args_get(&args, set_tid_size) > 0))
- if (unlikely(args.set_tid && args.set_tid_size == 0))
- if (unlikely(clone_args_get(&args, set_tid) &&
return -EINVAL;clone_args_get(&args, set_tid_size) == 0))
/* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */
- if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
!valid_signal(args.exit_signal)))
- if (unlikely((clone_args_get(&args, exit_signal) & ~((u64)CSIGNAL)) ||
return -EINVAL;!valid_signal(clone_args_get(&args, exit_signal))))
- if ((args.flags & CLONE_INTO_CGROUP) &&
(args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
- if ((clone_args_get(&args, flags) & CLONE_INTO_CGROUP) &&
(clone_args_get(&args, cgroup) > INT_MAX ||
return -EINVAL;usize < clone_args_size_ver(2)))
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
.child_tid = u64_to_user_ptr(args.child_tid),
.parent_tid = u64_to_user_ptr(args.parent_tid),
.exit_signal = args.exit_signal,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
.cgroup = args.cgroup,
.flags = clone_args_get(&args, flags),
.pidfd = clone_args_as_user_ptr(&args, pidfd),
.child_tid = clone_args_as_user_ptr(&args, child_tid),
.parent_tid = clone_args_as_user_ptr(&args, parent_tid),
.exit_signal = clone_args_get(&args, exit_signal),
.stack = clone_args_get(&args, stack),
.stack_size = clone_args_get(&args, stack_size),
.tls = clone_args_get(&args, tls),
.set_tid_size = clone_args_get(&args, set_tid_size),
};.cgroup = clone_args_get(&args, cgroup),
- if (args.set_tid &&
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
- if (clone_args_get(&args, set_tid) &&
return -EFAULT;copy_from_user(kset_tid, clone_args_as_user_ptr(&args, set_tid), (kargs->set_tid_size * sizeof(pid_t))))
linux-morello mailing list -- linux-morello@op-lists.linaro.org To unsubscribe send an email to linux-morello-leave@op-lists.linaro.org