On Mon, Dec 12, 2022 at 03:57:40PM +0100, Kevin Brodsky wrote:
On 08/12/2022 11:26, Beata Michalska wrote:
The clone3 syscall provides a superset of the functionality compared to its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure.
As it stands now, members of said structure that do represent user pointers, are specified as being of __u64 type (alignment aside). This is problematic for some architectures, where that type cannot actually hold said pointers.
In order to add support for the latter, the relevant clone_args struct members get redefined with the appropriate __kernel_uintptr_t type.
This though, brings its own drawbacks. The structure itself gains revised signature (with implicit paddings) which results in noticeable growth in its size. Furthermore, for affected architectures, as a consequence, clone3 is no longer compat mode agnostic, and as such, requires repurposing the main syscall handler at a cost of marginal overhead though in favour of avoiding code duplication.
Signed-off-by: Beata Michalska beata.michalska@arm.com
include/uapi/linux/sched.h | 30 ++++++--- kernel/fork.c | 135 +++++++++++++++++++++++++++++-------- 2 files changed, 127 insertions(+), 38 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..e3ca56b530d2 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -87,26 +87,36 @@
- The structure is versioned by size and thus extensible.
- New struct members must go at the end of the struct and
- must be properly 64bit aligned.
- must be properly aligned on at least 64bit boundary.
- Mind implicit padding when:
- ((sizeof(__kernel_uintptr_t) > sizeof(__u64))
*/
struct clone_args { __aligned_u64 flags;
- __aligned_u64 pidfd;
- __aligned_u64 child_tid;
- __aligned_u64 parent_tid;
- __kernel_aligned_uintptr_t pidfd;
- __kernel_aligned_uintptr_t child_tid;
- __kernel_aligned_uintptr_t parent_tid; __aligned_u64 exit_signal;
- __aligned_u64 stack;
- __kernel_aligned_uintptr_t stack; __aligned_u64 stack_size;
- __aligned_u64 tls;
- __aligned_u64 set_tid;
- __kernel_aligned_uintptr_t tls;
- /* VER0 boundary */
- __kernel_aligned_uintptr_t set_tid; __aligned_u64 set_tid_size;
- /* VER1 boundary */ __aligned_u64 cgroup;
- /* VER2 boundary */
}; #endif -#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ -#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ -#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ +#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \
- ((sizeof(__kernel_uintptr_t) > sizeof(__u64)) ? 128 : 64)
+#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \
- ((sizeof(__kernel_uintptr_t) > sizeof(__u64)) ? 152 : 80)
+#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \
- ((sizeof(__kernel_uintptr_t) > sizeof(__u64)) ? 160 : 88)
/*
- Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c index 181771571599..64151e60c9bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2759,67 +2759,146 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, user_uintptr_t, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 +#define __clone_args_size_ver(ver, pfx) \
- pfx##CLONE_ARGS_SIZE_VER##ver
+#ifdef CONFIG_COMPAT64 +struct compat_clone_args {
- __aligned_u64 flags;
- __aligned_u64 pidfd;
- __aligned_u64 child_tid;
- __aligned_u64 parent_tid;
- __aligned_u64 exit_signal;
- __aligned_u64 stack;
- __aligned_u64 stack_size;
- __aligned_u64 tls;
- __aligned_u64 set_tid;
- __aligned_u64 set_tid_size;
- __aligned_u64 cgroup;
+};
+#define COMPAT_CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define clone_args_size_ver(ver, args) \
I'd swap the arguments to be consistent with the other macros.
Will do.
- ((args)->compat_mode ? __clone_args_size_ver(ver, COMPAT_) \
: __clone_args_size_ver(ver, ))
+#define clone_args_get(args, member) \
- ((args)->compat_mode ? (args)->__compat_args.member \
: (args)->__args.member)
+#define clone_args_get_user_ptr(args, member) \
- ((args)->compat_mode ? \
compat_ptr(clone_args_get(args, member)) : \
(void __user *)(user_uitnptr_t)(clone_args_get(args, member)))
s/user_uitnptr_t/user_uintptr_t/
So for a moment there I was wondering how that did slip through until I have noticed I've been messing with the config to test all possible scenarios and somewhat left it at plain-aarch64 setup. So my bad and thanks for catching this one (and the one below)
+#else /* CONFIG_COMPAT64 */ +#define clone_args_size_ver(ver, args) __clone_args_size_ver(ver, ) +#define clone_args_get(args, member) ((args)->member) +#define clone_args_get_user_ptr(args, member) \
- (void __user *)(user_uintptr_t)(clone_args_get(args, member))
+#endif /* CONFIG_COMPAT64 */
+static inline void clone_args_validate_static(void) +{ +#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \ +do { \
- BUILD_BUG_ON(offsetofend(type, tls) != \
__clone_args_size_ver(0, pfx)); \
- BUILD_BUG_ON(offsetofend(type, set_tid_size) != \
__clone_args_size_ver(1, pfx)); \
- BUILD_BUG_ON(offsetofend(type, cgroup) != \
__clone_args_size_ver(2, pfx)); \
- BUILD_BUG_ON(sizeof(type) != __clone_args_size_ver(2, pfx)); \
+} while (0)
- CLONE_ARGS_BUILD_BUG_ON(struct clone_args, );
+#ifdef CONFIG_COMPAT64
- CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_);
- BUILD_BUG_ON(sizeof(struct clone_args) < sizeof(struct compat_clone_args));
+#endif +}
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; +#ifdef CONFIG_COMPAT64
- struct {
union {
struct clone_args __args;
struct compat_clone_args __compat_args;
};
bool compat_mode;
- } args = {
.compat_mode = in_compat_syscall(),
- };
+#else struct clone_args args; +#endif pid_t *kset_tid = kargs->set_tid;
- BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
CLONE_ARGS_SIZE_VER0);
- BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
CLONE_ARGS_SIZE_VER1);
- BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
CLONE_ARGS_SIZE_VER2);
- BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
- clone_args_validate_static();
if (unlikely(usize > PAGE_SIZE)) return -E2BIG;
- if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
- if (unlikely(usize < clone_args_size_ver(0, &args))) return -EINVAL;
- err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+#ifdef CONFIG_COMPAT64
- err = args.compat_mode ?
Nit: I think a normal if/else would be more readable, considering how long each call is.
copy_struct_from_user(&args,sizeof(struct compat_clone_args),
Nit: missing space before "sizeof".
uargs, usize) :
copy_struct_from_user_with_ptr(&args, sizeof(args), uargs, usize);
This is the wrong size with the new definition of args. I'd recommend passing &__args and sizeof(__args) to make sure we don't end up overwriting something we shouldn't, same above.
To be fixed. Thanks!
--- BR B.
Kevin
+#else
- err = copy_struct_from_user_with_ptr(&args, sizeof(args), uargs, usize);
+#endif
- if (err) return err;
- if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
- if (unlikely(clone_args_get(&args, set_tid_size) > MAX_PID_NS_LEVEL)) return -EINVAL;
- if (unlikely(!args.set_tid && args.set_tid_size > 0))
- if (unlikely(!clone_args_get(&args, set_tid) &&
return -EINVAL;clone_args_get(&args, set_tid_size) > 0))
- if (unlikely(args.set_tid && args.set_tid_size == 0))
- if (unlikely(clone_args_get(&args, set_tid) &&
return -EINVAL;clone_args_get(&args, set_tid_size) == 0))
/* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */
- if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
!valid_signal(args.exit_signal)))
- if (unlikely((clone_args_get(&args, exit_signal) & ~((u64)CSIGNAL)) ||
return -EINVAL;!valid_signal(clone_args_get(&args, exit_signal))))
- if ((args.flags & CLONE_INTO_CGROUP) &&
(args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
- if ((clone_args_get(&args, flags) & CLONE_INTO_CGROUP) &&
(clone_args_get(&args, cgroup) > INT_MAX ||
return -EINVAL;usize < clone_args_size_ver(2, &args)))
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
.child_tid = u64_to_user_ptr(args.child_tid),
.parent_tid = u64_to_user_ptr(args.parent_tid),
.exit_signal = args.exit_signal,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
.cgroup = args.cgroup,
.flags = clone_args_get(&args, flags),
.pidfd = clone_args_get_user_ptr(&args, pidfd),
.child_tid = clone_args_get_user_ptr(&args, child_tid),
.parent_tid = clone_args_get_user_ptr(&args, parent_tid),
.exit_signal = clone_args_get(&args, exit_signal),
.stack = clone_args_get(&args, stack),
.stack_size = clone_args_get(&args, stack_size),
.tls = clone_args_get(&args, tls),
.set_tid_size = clone_args_get(&args, set_tid_size),
};.cgroup = clone_args_get(&args, cgroup),
- if (args.set_tid &&
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
- if (clone_args_get(&args, set_tid) &&
return -EFAULT;copy_from_user(kset_tid, clone_args_get_user_ptr(&args, set_tid), (kargs->set_tid_size * sizeof(pid_t))))