The clone3 syscall provides a superset of the functionality compared to its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, with some of its members representing user pointers. Currently those are specified as being of __u64 type (alignment aside) which is problematic for some architectures, where that type cannot actually hold said pointers. In order to add support for the latter, the relevant clone_args struct members get redefined with the appropriate __kernel_uintptr_t type. This though, brings its own drawbacks. The structure itself gains revised signature with explicit paddings for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in noticeable growth in the structure's size. Furthermore, for affected architectures, as a consequence clone3 is no longer compat mode agnostic, and as such, requires repurposing the main syscall handler at a cost of marginal overhead though in favour of avoiding code duplication.
Signed-off-by: Beata Michalska beata.michalska@arm.com --- include/uapi/linux/sched.h | 36 ++++++++--- kernel/fork.c | 123 +++++++++++++++++++++++++++++-------- 2 files changed, 122 insertions(+), 37 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..56ca73ca763c 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -87,26 +87,42 @@ * * The structure is versioned by size and thus extensible. * New struct members must go at the end of the struct and - * must be properly 64bit aligned. + * must be properly aligned on at least 64bit boundary. + * See below on explicit padding when: + * ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) + * */ + +#define __rsvd(tag) \ + __u8 __rsvd_##tag[sizeof(__kernel_uintptr_t) - sizeof(__u64)] + struct clone_args { __aligned_u64 flags; - __aligned_u64 pidfd; - __aligned_u64 child_tid; - __aligned_u64 parent_tid; + __rsvd(v0_1); + __kernel_aligned_uintptr_t pidfd; + __kernel_aligned_uintptr_t child_tid; + __kernel_aligned_uintptr_t parent_tid; __aligned_u64 exit_signal; - __aligned_u64 stack; + __rsvd(v0_2); + __kernel_aligned_uintptr_t stack; __aligned_u64 stack_size; - __aligned_u64 tls; - __aligned_u64 set_tid; + __rsvd(v0_3); + __kernel_aligned_uintptr_t tls; + /* VER0 boundary */ + __kernel_aligned_uintptr_t set_tid; __aligned_u64 set_tid_size; + /* VER1 boundary */ __aligned_u64 cgroup; + /* VER2 boundary */ }; #endif
-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ -#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ -#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ +#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \ + ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 128 : 64) +#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \ + ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 152 : 80) +#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \ + ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 160 : 88)
/* * Scheduling policies diff --git a/kernel/fork.c b/kernel/fork.c index 181771571599..72de1187a586 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2759,67 +2759,136 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, user_uintptr_t, newsp,
#ifdef __ARCH_WANT_SYS_CLONE3
+#define __clone_args_size_ver(ver, pfx) \ + pfx##CLONE_ARGS_SIZE_VER##ver + +#ifdef CONFIG_COMPAT64 +struct compat_clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; + __aligned_u64 cgroup; +}; + +#define COMPAT_CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ + +#define clone_args_size_ver(ver) \ + (in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \ + : __clone_args_size_ver(ver, )) + +#define clone_args_get(args, member) \ + (in_compat_syscall() ? (args)->__compat_args.member \ + : (args)->__args.member) + +#define clone_args_as_user_ptr(args, member) \ + (in_compat_syscall() ? compat_ptr(clone_args_get(args, member)) \ + : as_user_ptr(clone_args_get(args, member))) +#else /* CONFIG_COMPAT64 */ +#define clone_args_size_ver(ver __clone_args_size_ver(ver, ) +#define clone_args_get(args, member) ((args)->member) +#define clone_args_as_user_ptr(args, member) \ + as_user_ptr(clone_args_get(args, member)) +#endif /* CONFIG_COMPAT64 */ + +static inline void clone_args_validate_static(void) +{ +#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \ +do { \ + BUILD_BUG_ON(offsetofend(type, tls) != pfx##_SIZE_VER0); \ + BUILD_BUG_ON(offsetofend(type, set_tid_size) != pfx##_SIZE_VER1);\ + BUILD_BUG_ON(offsetofend(type, cgroup) != pfx##_SIZE_VER2); \ + BUILD_BUG_ON(sizeof(type) != pfx##_SIZE_VER2); \ +} while (0) + + CLONE_ARGS_BUILD_BUG_ON(struct clone_args, CLONE_ARGS); +#ifdef CONFIG_COMPAT64 + CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_CLONE_ARGS); + BUILD_BUG_ON(sizeof(struct clone_args) < sizeof(struct compat_clone_args)); +#endif +} + noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; +#ifdef CONFIG_COMPAT64 + union { + struct clone_args __args; + struct compat_clone_args __compat_args; + } args; +#else struct clone_args args; +#endif pid_t *kset_tid = kargs->set_tid;
- BUILD_BUG_ON(offsetofend(struct clone_args, tls) != - CLONE_ARGS_SIZE_VER0); - BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != - CLONE_ARGS_SIZE_VER1); - BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != - CLONE_ARGS_SIZE_VER2); - BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); + clone_args_validate_static();
if (unlikely(usize > PAGE_SIZE)) return -E2BIG; - if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) + if (unlikely(usize < clone_args_size_ver(0))) return -EINVAL;
+#ifdef CONFIG_COMPAT64 + err = copy_struct_from_user(&args, (in_compat_syscall() ? + sizeof(struct compat_clone_args) : + sizeof(args)), + uargs, usize); +#else err = copy_struct_from_user(&args, sizeof(args), uargs, usize); +#endif + if (err) return err;
- if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) + if (unlikely(clone_args_get(&args, set_tid_size) > MAX_PID_NS_LEVEL)) return -EINVAL;
- if (unlikely(!args.set_tid && args.set_tid_size > 0)) + if (unlikely(!clone_args_get(&args, set_tid) && + clone_args_get(&args, set_tid_size) > 0)) return -EINVAL;
- if (unlikely(args.set_tid && args.set_tid_size == 0)) + if (unlikely(clone_args_get(&args, set_tid) && + clone_args_get(&args, set_tid_size) == 0)) return -EINVAL;
/* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */ - if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || - !valid_signal(args.exit_signal))) + if (unlikely((clone_args_get(&args, exit_signal) & ~((u64)CSIGNAL)) || + !valid_signal(clone_args_get(&args, exit_signal)))) return -EINVAL;
- if ((args.flags & CLONE_INTO_CGROUP) && - (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) + if ((clone_args_get(&args, flags) & CLONE_INTO_CGROUP) && + (clone_args_get(&args, cgroup) > INT_MAX || + usize < clone_args_size_ver(2))) return -EINVAL;
*kargs = (struct kernel_clone_args){ - .flags = args.flags, - .pidfd = u64_to_user_ptr(args.pidfd), - .child_tid = u64_to_user_ptr(args.child_tid), - .parent_tid = u64_to_user_ptr(args.parent_tid), - .exit_signal = args.exit_signal, - .stack = args.stack, - .stack_size = args.stack_size, - .tls = args.tls, - .set_tid_size = args.set_tid_size, - .cgroup = args.cgroup, + .flags = clone_args_get(&args, flags), + .pidfd = clone_args_as_user_ptr(&args, pidfd), + .child_tid = clone_args_as_user_ptr(&args, child_tid), + .parent_tid = clone_args_as_user_ptr(&args, parent_tid), + .exit_signal = clone_args_get(&args, exit_signal), + .stack = clone_args_get(&args, stack), + .stack_size = clone_args_get(&args, stack_size), + .tls = clone_args_get(&args, tls), + .set_tid_size = clone_args_get(&args, set_tid_size), + .cgroup = clone_args_get(&args, cgroup), };
- if (args.set_tid && - copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), + if (clone_args_get(&args, set_tid) && + copy_from_user(kset_tid, clone_args_as_user_ptr(&args, set_tid), (kargs->set_tid_size * sizeof(pid_t)))) return -EFAULT;