On 18/11/2022 00:05, Beata Michalska wrote:
The clone3 syscall provides a superset of the functionality compared to its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, with some of its members representing user pointers. Currently those are specified as being of __u64 type (alignment aside) which is problematic for some architectures, where that type cannot actually hold said pointers. In order to add support for the latter, the relevant clone_args struct members get redefined with the appropriate __kernel_uintptr_t type. This though, brings its own drawbacks. The structure itself gains revised signature with explicit paddings for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in noticeable growth in the structure's size. Furthermore, for affected architectures, as a consequence clone3 is no longer compat mode agnostic, and as such, requires repurposing the main syscall handler at a cost of marginal overhead though in favour of avoiding code duplication.
Signed-off-by: Beata Michalska beata.michalska@arm.com
include/uapi/linux/sched.h | 36 ++++++++--- kernel/fork.c | 123 +++++++++++++++++++++++++++++-------- 2 files changed, 122 insertions(+), 37 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..56ca73ca763c 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -87,26 +87,42 @@
- The structure is versioned by size and thus extensible.
- New struct members must go at the end of the struct and
- must be properly 64bit aligned.
- must be properly aligned on at least 64bit boundary.
- See below on explicit padding when:
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64))
I think this would still work without the explicit padding, right? What's the reason for adding it explicitly? This is related to the issue of struct holes at https://git.morello-project.org/morello/kernel/linux/-/issues/38
Seems to me it adds some extra complexity as Kristina mentioned in above issue, but might be missing something here.
*/
+#define __rsvd(tag) \
- __u8 __rsvd_##tag[sizeof(__kernel_uintptr_t) - sizeof(__u64)]
struct clone_args { __aligned_u64 flags;
- __aligned_u64 pidfd;
- __aligned_u64 child_tid;
- __aligned_u64 parent_tid;
- __rsvd(v0_1);
- __kernel_aligned_uintptr_t pidfd;
- __kernel_aligned_uintptr_t child_tid;
- __kernel_aligned_uintptr_t parent_tid; __aligned_u64 exit_signal;
- __aligned_u64 stack;
- __rsvd(v0_2);
- __kernel_aligned_uintptr_t stack; __aligned_u64 stack_size;
- __aligned_u64 tls;
- __aligned_u64 set_tid;
- __rsvd(v0_3);
- __kernel_aligned_uintptr_t tls;
- /* VER0 boundary */
- __kernel_aligned_uintptr_t set_tid; __aligned_u64 set_tid_size;
- /* VER1 boundary */ __aligned_u64 cgroup;
- /* VER2 boundary */
}; #endif -#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ -#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ -#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ +#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 128 : 64)
+#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 152 : 80)
+#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 160 : 88)
This is just hardcoding in the struct sizes which is fine, but just as a general question, would it ever be possible for __alignof__(__kernel_uintptr_t) to be different or more than 16?
Seems like what we've got here is just a proxy for: ((__alignof__(__kernel_uintptr_t) == 16) ? 160 : 88)
Thanks, Zach
/*
- Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c index 181771571599..72de1187a586 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2759,67 +2759,136 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, user_uintptr_t, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 +#define __clone_args_size_ver(ver, pfx) \
- pfx##CLONE_ARGS_SIZE_VER##ver
+#ifdef CONFIG_COMPAT64 +struct compat_clone_args {
- __aligned_u64 flags;
- __aligned_u64 pidfd;
- __aligned_u64 child_tid;
- __aligned_u64 parent_tid;
- __aligned_u64 exit_signal;
- __aligned_u64 stack;
- __aligned_u64 stack_size;
- __aligned_u64 tls;
- __aligned_u64 set_tid;
- __aligned_u64 set_tid_size;
- __aligned_u64 cgroup;
+};
+#define COMPAT_CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define clone_args_size_ver(ver) \
- (in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \
: __clone_args_size_ver(ver, ))
+#define clone_args_get(args, member) \
- (in_compat_syscall() ? (args)->__compat_args.member \
: (args)->__args.member)
+#define clone_args_as_user_ptr(args, member) \
- (in_compat_syscall() ? compat_ptr(clone_args_get(args, member)) \
: as_user_ptr(clone_args_get(args, member)))
+#else /* CONFIG_COMPAT64 */ +#define clone_args_size_ver(ver __clone_args_size_ver(ver, ) +#define clone_args_get(args, member) ((args)->member) +#define clone_args_as_user_ptr(args, member) \
- as_user_ptr(clone_args_get(args, member))
+#endif /* CONFIG_COMPAT64 */
+static inline void clone_args_validate_static(void) +{ +#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \ +do { \
- BUILD_BUG_ON(offsetofend(type, tls) != pfx##_SIZE_VER0); \
- BUILD_BUG_ON(offsetofend(type, set_tid_size) != pfx##_SIZE_VER1);\
- BUILD_BUG_ON(offsetofend(type, cgroup) != pfx##_SIZE_VER2); \
- BUILD_BUG_ON(sizeof(type) != pfx##_SIZE_VER2); \
+} while (0)
- CLONE_ARGS_BUILD_BUG_ON(struct clone_args, CLONE_ARGS);
+#ifdef CONFIG_COMPAT64
- CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_CLONE_ARGS);
- BUILD_BUG_ON(sizeof(struct clone_args) < sizeof(struct compat_clone_args));
+#endif +}
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; +#ifdef CONFIG_COMPAT64
- union {
struct clone_args __args;
struct compat_clone_args __compat_args;
- } args;
+#else struct clone_args args; +#endif pid_t *kset_tid = kargs->set_tid;
- BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
CLONE_ARGS_SIZE_VER0);
- BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
CLONE_ARGS_SIZE_VER1);
- BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
CLONE_ARGS_SIZE_VER2);
- BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
- clone_args_validate_static();
if (unlikely(usize > PAGE_SIZE)) return -E2BIG;
- if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
- if (unlikely(usize < clone_args_size_ver(0))) return -EINVAL;
+#ifdef CONFIG_COMPAT64
- err = copy_struct_from_user(&args, (in_compat_syscall() ?
sizeof(struct compat_clone_args) :
sizeof(args)),> + uargs, usize);
+#else err = copy_struct_from_user(&args, sizeof(args), uargs, usize); +#endif
- if (err) return err;
- if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
- if (unlikely(clone_args_get(&args, set_tid_size) > MAX_PID_NS_LEVEL)) return -EINVAL;
- if (unlikely(!args.set_tid && args.set_tid_size > 0))
- if (unlikely(!clone_args_get(&args, set_tid) &&
return -EINVAL;clone_args_get(&args, set_tid_size) > 0))
- if (unlikely(args.set_tid && args.set_tid_size == 0))
- if (unlikely(clone_args_get(&args, set_tid) &&
return -EINVAL;clone_args_get(&args, set_tid_size) == 0))
/* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */
- if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
!valid_signal(args.exit_signal)))
- if (unlikely((clone_args_get(&args, exit_signal) & ~((u64)CSIGNAL)) ||
return -EINVAL;!valid_signal(clone_args_get(&args, exit_signal))))
- if ((args.flags & CLONE_INTO_CGROUP) &&
(args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
- if ((clone_args_get(&args, flags) & CLONE_INTO_CGROUP) &&
(clone_args_get(&args, cgroup) > INT_MAX ||
return -EINVAL;usize < clone_args_size_ver(2)))
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
.child_tid = u64_to_user_ptr(args.child_tid),
.parent_tid = u64_to_user_ptr(args.parent_tid),
.exit_signal = args.exit_signal,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
.cgroup = args.cgroup,
.flags = clone_args_get(&args, flags),
.pidfd = clone_args_as_user_ptr(&args, pidfd),
.child_tid = clone_args_as_user_ptr(&args, child_tid),
.parent_tid = clone_args_as_user_ptr(&args, parent_tid),
.exit_signal = clone_args_get(&args, exit_signal),
.stack = clone_args_get(&args, stack),
.stack_size = clone_args_get(&args, stack_size),
.tls = clone_args_get(&args, tls),
.set_tid_size = clone_args_get(&args, set_tid_size),
};.cgroup = clone_args_get(&args, cgroup),
- if (args.set_tid &&
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
- if (clone_args_get(&args, set_tid) &&
return -EFAULT;copy_from_user(kset_tid, clone_args_as_user_ptr(&args, set_tid), (kargs->set_tid_size * sizeof(pid_t))))