On Wed, Nov 30, 2022 at 03:43:16PM +0100, Kevin Brodsky wrote:
On 18/11/2022 01:05, Beata Michalska wrote:
The clone3 syscall provides a superset of the functionality compared to its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure, with some of its members representing user pointers. Currently those are specified as being of __u64 type (alignment aside) which is problematic for some architectures, where that type cannot actually hold said pointers. In order to add support for the latter, the relevant clone_args struct members get redefined with the appropriate __kernel_uintptr_t type. This though, brings its own drawbacks. The structure itself gains revised signature with explicit paddings for cases where sizeof(__kernel_uintptr_t) > sizeof(__u64), which results in noticeable growth in the structure's size. Furthermore, for affected architectures, as a consequence clone3 is no longer compat mode agnostic, and as such, requires repurposing the main syscall handler at a cost of marginal overhead though in favour of avoiding code duplication.
Might want to break that down into a couple of paragraphs to give the avid reader some space to breathe :)
Noted.
Signed-off-by: Beata Michalska beata.michalska@arm.com
include/uapi/linux/sched.h | 36 ++++++++--- kernel/fork.c | 123 +++++++++++++++++++++++++++++-------- 2 files changed, 122 insertions(+), 37 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..56ca73ca763c 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -87,26 +87,42 @@
- The structure is versioned by size and thus extensible.
- New struct members must go at the end of the struct and
- must be properly 64bit aligned.
- must be properly aligned on at least 64bit boundary.
- See below on explicit padding when:
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64))
*/
+#define __rsvd(tag) \
- __u8 __rsvd_##tag[sizeof(__kernel_uintptr_t) - sizeof(__u64)]
struct clone_args { __aligned_u64 flags;
- __aligned_u64 pidfd;
- __aligned_u64 child_tid;
- __aligned_u64 parent_tid;
- __rsvd(v0_1);
- __kernel_aligned_uintptr_t pidfd;
- __kernel_aligned_uintptr_t child_tid;
- __kernel_aligned_uintptr_t parent_tid; __aligned_u64 exit_signal;
- __aligned_u64 stack;
- __rsvd(v0_2);
- __kernel_aligned_uintptr_t stack; __aligned_u64 stack_size;
- __aligned_u64 tls;
- __aligned_u64 set_tid;
- __rsvd(v0_3);
- __kernel_aligned_uintptr_t tls;
- /* VER0 boundary */
- __kernel_aligned_uintptr_t set_tid; __aligned_u64 set_tid_size;
- /* VER1 boundary */ __aligned_u64 cgroup;
- /* VER2 boundary */
}; #endif -#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ -#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ -#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ +#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 128 : 64)
+#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 152 : 80)
+#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
- ((__alignof__(__kernel_uintptr_t) > __alignof__(__u64)) ? 160 : 88)
/*
- Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c index 181771571599..72de1187a586 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2759,67 +2759,136 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, user_uintptr_t, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 +#define __clone_args_size_ver(ver, pfx) \
- pfx##CLONE_ARGS_SIZE_VER##ver
+#ifdef CONFIG_COMPAT64 +struct compat_clone_args {
- __aligned_u64 flags;
- __aligned_u64 pidfd;
- __aligned_u64 child_tid;
- __aligned_u64 parent_tid;
- __aligned_u64 exit_signal;
- __aligned_u64 stack;
- __aligned_u64 stack_size;
- __aligned_u64 tls;
- __aligned_u64 set_tid;
- __aligned_u64 set_tid_size;
- __aligned_u64 cgroup;
+};
+#define COMPAT_CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define clone_args_size_ver(ver) \
- (in_compat_syscall() ? __clone_args_size_ver(ver, COMPAT_) \
We are ending up with really a lot of calls to in_compat_syscall() in copy_clone_args_from_user(). It looks like in_compat_syscall() is completely implemented inline, so maybe the compiler can decide that it can just call it once and cache the result, but that's definitely worth checking.
I was concerned a bit about that, which is why I did mention performance impact in the commit message. I'll have another thought on how to optimize that one out.
: __clone_args_size_ver(ver, ))
+#define clone_args_get(args, member) \
- (in_compat_syscall() ? (args)->__compat_args.member \
: (args)->__args.member)
+#define clone_args_as_user_ptr(args, member) \
- (in_compat_syscall() ? compat_ptr(clone_args_get(args, member)) \
: as_user_ptr(clone_args_get(args, member)))
+#else /* CONFIG_COMPAT64 */ +#define clone_args_size_ver(ver __clone_args_size_ver(ver, ) +#define clone_args_get(args, member) ((args)->member) +#define clone_args_as_user_ptr(args, member) \
- as_user_ptr(clone_args_get(args, member))
I was going to say that this does not do what you want, but, having just looked again at the implementation of as_user_ptr(), I stand corrected! In any case, this was not the intention for as_user_ptr(), it is meant to signal that an arbitrary integer is being stored into a user pointer (see Documentation/core-api/user_ptr.rst). This is not what happens here, you just want to get a void __user * from a __kernel_uintptr_t. Since this does not involve any representation change [1], a simple cast is appropriate. This is what we now have in aio_setup_rw() for instance.
To avoid any confusion I would also rename the macro, maybe clone_args_get_user_ptr()?
[1] Writing this I realise this is not strictly true: in 32-bit, __kernel_uintptr_t is 64-bit but void __user * is 32-bit. This is not a concern though as the compiler is happy to cast a 64-bit integer to a 32-bit pointer.
I actually did hesitate with that one for non-PCuABI cases but I agree, this one should be updated.
+#endif /* CONFIG_COMPAT64 */
+static inline void clone_args_validate_static(void) +{ +#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \
Nit: might as well add an extra hard tab to the indentation of , better than just a space.
+do { \
- BUILD_BUG_ON(offsetofend(type, tls) != pfx##_SIZE_VER0); \
- BUILD_BUG_ON(offsetofend(type, set_tid_size) != pfx##_SIZE_VER1);\
Nit: a stray hard tab after != on that line, and two spaces after != on the line above.
- BUILD_BUG_ON(offsetofend(type, cgroup) != pfx##_SIZE_VER2); \
- BUILD_BUG_ON(sizeof(type) != pfx##_SIZE_VER2); \
Why not use that nice __clone_args_size_ver() you introduced?
That's a fair point! I think those came first :) Will change that.
--- BR B.
+} while (0)
- CLONE_ARGS_BUILD_BUG_ON(struct clone_args, CLONE_ARGS);
+#ifdef CONFIG_COMPAT64
- CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_CLONE_ARGS);
- BUILD_BUG_ON(sizeof(struct clone_args) < sizeof(struct compat_clone_args));
+#endif +}
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; +#ifdef CONFIG_COMPAT64
- union {
struct clone_args __args;
struct compat_clone_args __compat_args;
- } args;
+#else struct clone_args args; +#endif pid_t *kset_tid = kargs->set_tid;
- BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
CLONE_ARGS_SIZE_VER0);
- BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
CLONE_ARGS_SIZE_VER1);
- BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
CLONE_ARGS_SIZE_VER2);
- BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
- clone_args_validate_static();
if (unlikely(usize > PAGE_SIZE)) return -E2BIG;
- if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
- if (unlikely(usize < clone_args_size_ver(0))) return -EINVAL;
+#ifdef CONFIG_COMPAT64
- err = copy_struct_from_user(&args, (in_compat_syscall() ?
sizeof(struct compat_clone_args) :
sizeof(args)),
uargs, usize);
+#else err = copy_struct_from_user(&args, sizeof(args), uargs, usize); +#endif
- if (err) return err;
- if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
- if (unlikely(clone_args_get(&args, set_tid_size) > MAX_PID_NS_LEVEL)) return -EINVAL;
- if (unlikely(!args.set_tid && args.set_tid_size > 0))
- if (unlikely(!clone_args_get(&args, set_tid) &&
return -EINVAL;clone_args_get(&args, set_tid_size) > 0))
- if (unlikely(args.set_tid && args.set_tid_size == 0))
- if (unlikely(clone_args_get(&args, set_tid) &&
return -EINVAL;clone_args_get(&args, set_tid_size) == 0))
/* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */
- if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
!valid_signal(args.exit_signal)))
- if (unlikely((clone_args_get(&args, exit_signal) & ~((u64)CSIGNAL)) ||
return -EINVAL;!valid_signal(clone_args_get(&args, exit_signal))))
- if ((args.flags & CLONE_INTO_CGROUP) &&
(args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
- if ((clone_args_get(&args, flags) & CLONE_INTO_CGROUP) &&
(clone_args_get(&args, cgroup) > INT_MAX ||
return -EINVAL;usize < clone_args_size_ver(2)))
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
.child_tid = u64_to_user_ptr(args.child_tid),
.parent_tid = u64_to_user_ptr(args.parent_tid),
.exit_signal = args.exit_signal,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
.cgroup = args.cgroup,
.flags = clone_args_get(&args, flags),
.pidfd = clone_args_as_user_ptr(&args, pidfd),
.child_tid = clone_args_as_user_ptr(&args, child_tid),
.parent_tid = clone_args_as_user_ptr(&args, parent_tid),
.exit_signal = clone_args_get(&args, exit_signal),
.stack = clone_args_get(&args, stack),
.stack_size = clone_args_get(&args, stack_size),
.tls = clone_args_get(&args, tls),
.set_tid_size = clone_args_get(&args, set_tid_size),
};.cgroup = clone_args_get(&args, cgroup),
- if (args.set_tid &&
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
- if (clone_args_get(&args, set_tid) &&
return -EFAULT;copy_from_user(kset_tid, clone_args_as_user_ptr(&args, set_tid), (kargs->set_tid_size * sizeof(pid_t))))