[linux-morello] Re: [PATCH 3/3] kselftests/arm64: morello: Add clone3 test-cases

30 Nov 2022

On 18/11/2022 01:05, Beata Michalska wrote:
...
Add a bunch of test-cases for clone3 syscall, focusing mainly
on handling struct clone_args versioning and sizing, alongside
the obvious req for respecting capabilities.
Signed-off-by: Beata Michalska beata.michalska@arm.com
tools/testing/selftests/arm64/morello/clone.c | 269 +++++++++++++++++-
 1 file changed, 267 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/arm64/morello/clone.c b/tools/testing/selftests/arm64/morello/clone.c
index 8217840ba504..13c84dc7ca77 100644
--- a/tools/testing/selftests/arm64/morello/clone.c
+++ b/tools/testing/selftests/arm64/morello/clone.c
@@ -1,15 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (C) 2021  Arm Limited
+#include "freestanding.h"
 #include <linux/mman.h>
 #include <linux/resource.h>
 #include <linux/wait.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/signal.h>
+#include <limits.h>
 #include <cheriintrin.h>
-#include "freestanding.h"
 
-#define STACK_SIZE	1024*1024
+#define STACK_SIZE 1024*1024
+#define DEFAULT_PAGE_SIZE 4096
 
 #define in_restricted()	\
   (!(cheri_perms_get(cheri_pcc_get()) & ARM_CAP_PERMISSION_EXECUTIVE))
@@ -17,6 +20,10 @@
 
 #define MAGIC_NR	0x3562 /* whatever really ..... */
 
+#ifndef MAX_PID_NS_LEVEL
+#define MAX_PID_NS_LEVEL 32
+#endif



/* Cloned thread result */
 #define CTR_SUCCESS  	 1
 #define CTR_FAILED	-1
@@ -256,6 +263,263 @@ TEST(test_wait_rusage)
   RUN_WITH_FIXTURE(data, CLONE_TH_RUSAGE, wait_single);
 }



+#define GET_ERRNO(err)	(((err) + MAX_ERRNO) ^ MAX_ERRNO)
This magic vaguely rings a bell but I'm not entirely sure why we need it
here, could you elaborate?
...



+#define DECLARE_DESC(arg, val)				\

{						\
offsetof(struct clone_args, arg),	\


sizeof(((struct clone_args *)0)->arg),	\


(uintcap_t)(val)			\


}


+struct clone3_fixture {

size_t args_size;
struct {
unsigned int offset;


unsigned int size;


uintcap_t    value;



This is quite a complicated way to initialise a struct per-test. Having
directly struct clone_args here would make things a lot simpler (and it
would take less space in fact).
I understand that what I'm suggesting is not quite the same, as instead
of overriding the default values here, you would be providing the
defaults and run_clone3() would need to figure out what to provide
itself. AFAICT this can simply be done by checking if the field is 0
(e.g. .flags), with the notable exception of .stack. You could have a
.custom_stack field or similar to tell run_clone3() not to touch .stack
/ .stack_size.
It's worth noting that the current approach does not really work for the
stack either: in case 6, you end up leaking the whole stack as the
munmap() in run_clone3() will get NULL instead of the originally
allocated stack. Therefore it seems to me that something like a
.custom_stack field is required anyway.
...

/* This will get extra size but that keeps things on the safe side */
} const args_desc[sizeof(struct clone_args)/sizeof(__u64)];
int n_desc;
int e_result;

+} const clone3_data[] = {

/* BEGIN_SECTION: expected failure */
/* size of clone_args smaller than CLONE_ARGS_SIZE_VER0 */
{
.args_size = offsetof(struct clone_args, tls),


.args_desc = {},


.n_desc = 0,


.e_result = -EINVAL


}, /* @{0} */
/* invalid set_tid array size */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(set_tid_size, MAX_PID_NS_LEVEL + 1),


	     },


.n_desc = 1,


.e_result = -EINVAL


}, /* @{1} */
/* Invalid combination of set_tid & set_tid_size */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(set_tid, (uintcap_t) NULL),


		DECLARE_DESC(set_tid_size, 1)


	     },


.n_desc = 2,


.e_result = -EINVAL


}, /* @{2} */
/* Invalid exit_signal */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(exit_signal, _NSIG + 1),


	     },


.n_desc = 1,


.e_result = -EINVAL


}, /* @{3} */
/* Invalid cgroup number */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(flags, CLONE_INTO_CGROUP),


		DECLARE_DESC(cgroup, (__u64)INT_MAX + 1),


	     },


.n_desc = 2,


.e_result = -EINVAL


}, /* @{4} */
/* Invalid size for clone_args with cgroup */
{
.args_size = offsetof(struct clone_args, cgroup),


.args_desc = {


		DECLARE_DESC(flags, CLONE_INTO_CGROUP),


		DECLARE_DESC(cgroup, 1),


	     },


.n_desc = 2,


.e_result = -EINVAL


}, /* @{5} */
/* Invalid stack & stack_size combination */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(stack, NULL),


		DECLARE_DESC(stack_size, 4096),


	     },


.n_desc = 2,


.e_result = -EINVAL


}, /* @{6} */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(stack_size, 0),


	     },


.n_desc = 1,


.e_result = -EINVAL


}, /* @{7} */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(set_tid, &(pid_t){1}),


		DECLARE_DESC(set_tid_size, 1),


	     },


.n_desc = 2,


.e_result = -EEXIST


}, /* @{8} */

/* END_SECTION: expected failure */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(flags, CLONE_PIDFD |


			     CLONE_CHILD_SETTID |


			     CLONE_CHILD_CLEARTID),


	     },


.n_desc = 1,


.e_result = 0


}, /* @{9} */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(flags, CLONE_PARENT_SETTID |


			     CLONE_CHILD_SETTID),


	     },


.n_desc = 1,


.e_result = 0


}, /* @{10} */
{
.args_size = sizeof(struct clone_args),


.args_desc = {


		DECLARE_DESC(flags, CLONE_SETTLS),


	     },


.n_desc = 1,


.e_result = 0


}, /* @{11} */

+};



+__thread int tls_data;
Nit: could be static and local to run_child().
...



+static __attribute__((noinline)) void run_child(struct clone_args *args)
+{

int count = 1000;

if (args->flags & CLONE_CHILD_SETTID) {
pid_t current_pid = (pid_t)syscall(__NR_getpid);



We've got a getpid() wrapper in freestanding.h, could use it here.
...


ASSERT_FALSE(current_pid != *(pid_t *)args->child_tid);


}

if (args->flags & CLONE_SETTLS && args->tls) {
ptraddr_t base_addr = cheri_address_get(args->tls);


ptraddr_t ref_addr = cheri_address_get(&tls_data);


size_t length = cheri_length_get(args->tls);



ASSERT_TRUE(cheri_tag_get(args->tls));


ASSERT_FALSE(ref_addr < base_addr || ref_addr >= base_addr + length);



Nit: I think the reverse condition with ASSERT_TRUE() is a bit more
readable.
...

}
/*
* This is grisly ....


* Spin for a while to give a chance for the parent to be


* scheduled in after clone and before this thread terminates


* (not entirely reliable though)


*/


while (--count)
sleep_cycles(&(int){1000});


syscall(__NR_exit, 0);

+}



+static inline __attribute__((always_inline))
+void run_clone3(const struct clone3_fixture *data)
+{

int pidfd, parent_tid, child_tid;
siginfo_t *wait_si = &(siginfo_t){0};

Can't we just have `siginfo_t wait_si;` (no need to initialise it either
as it's never read)?
...

int result;
pid_t pid;

struct clone_args args = {
.flags = 0,


.pidfd = (uintcap_t)&pidfd,


.parent_tid = (uintcap_t)&parent_tid,


.child_tid = (uintcap_t)&child_tid,


.exit_signal = SIGCHLD,


.stack = (uintcap_t) mmap_verified(NULL, STACK_SIZE,


				   PROT_READ | PROT_WRITE,


				   MAP_ANONYMOUS | MAP_PRIVATE,


				   -1, 0, STACK_REQ_PERMS),


.stack_size = STACK_SIZE,


};

for (int i = 0; i < data->n_desc; ++i) {
unsigned char *arg;



arg = (unsigned char *)&args + data->args_desc[i].offset;



if (data->args_desc[i].size != sizeof(uintcap_t))


	*(__u64 *)arg = (__u64)data->args_desc[i].value;


else


	*(uintcap_t *)arg = data->args_desc[i].value;


}

/* Don't bother with the full set-up for expected failures */
if (data->e_result) {
result = syscall(__NR_clone3, &args, data->args_size);


ASSERT_EQ(result, data->e_result) {



ASSERT_* causes the whole program to exit, so you probably want
EXPECT_EQ() here.
...

	TH_LOG("Expected: %d while %d was received",


	       GET_ERRNO(data->e_result), GET_ERRNO(result));


}


munmap((void *)args.stack, STACK_SIZE);


return;


}

args.flags |= CLONE_VM;

if (args.flags & CLONE_SETTLS) {
void *addr = mmap_verified(NULL, DEFAULT_PAGE_SIZE,



Not a big fan of calling this DEFAULT_PAGE_SIZE, as we shouldn't make
any assumption on the page size. Fortunately the page size doesn't
actually matter here as both mmap() and munmap() will align up the
length to the page size. Maybe simply TLS_SIZE?
...

			   PROT_READ | PROT_WRITE,


			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0,


			   CAP_LOAD_PERMS | CAP_STORE_PERMS);



args.tls = (uintcap_t)cheri_bounds_set(addr, DEFAULT_PAGE_SIZE);


}

pid = syscall(__NR_clone3, &args, data->args_size);

ASSERT_GE(pid, 0);
if (!pid)
run_child(&args);



if (args.flags & CLONE_PIDFD) {
/* Signal handling is not the test target here: valid pidfd is */


result = syscall(__NR_pidfd_send_signal, pidfd, SIGUSR1, NULL, 0);


ASSERT_NE(result, -EBADF);



Agreed that we're not testing signal handling here, in fact for that
reason I'd suggest using a signal that is ignored by default like
SIGCHLD, not SIGUSR1 which causes termination by default. Alternatively,
you could use sigwaitinfo() in the child, which also means you wouldn't
need the busy loop.
OTOH I think it wouldn't hurt to check that pidfd_send_signal() didn't
fail (in any way), as if it does that would suggest something went wrong
with the test setup.
...

}

waitid(P_PID, pid, wait_si, WEXITED, NULL);

Should check that this one doesn't fail. This also makes me think, can
we now get rid of the scary wait substitute in clone_single(), since
we're not bothering with it here?
...


/* child_tid set once the thread gets scheduled */
if (args.flags & CLONE_PARENT_SETTID && args.flags & CLONE_CHILD_SETTID
   && !(args.flags & CLONE_CHILD_CLEARTID))


ASSERT_EQ(*(pid_t *)args.parent_tid, *(pid_t *)args.child_tid)



Considering that none of the testcases override either argument, that
could be simply written as ASSERT_EQ(parent_tid, child_tid).
For extra robustness, maybe also 0-init both variables and check that
the value is not 0 (i.e. the kernel didn't perform either write)?
...


if (args.flags & CLONE_CHILD_CLEARTID)
ASSERT_EQ(*(pid_t *)args.child_tid, 0);



munmap((void *)args.stack, STACK_SIZE);
if (args.flags & CLONE_SETTLS)
munmap((void *)args.tls, STACK_SIZE);



s/STACK_SIZE/DEFAULT_PAGE_SIZE/
Kevin
...
+}



+TEST(test_clone3)
+{

int ncount = sizeof(clone3_data)/sizeof(clone3_data[0]);

for (int i = 0; i < ncount; ++i) {
TH_LOG("Validating clone3 @{%d}", i);


run_clone3(&clone3_data[i]);


}

+}



void run_restricted(uintcap_t entry_point)
 {
   void *new_stack = allocate_mem_raw(STACK_SIZE);
@@ -290,5 +554,6 @@ int main(void)
   run_restricted((uintcap_t)test_clone_tls_restricted);
   test_wait_raw();
   test_wait_rusage();

test_clone3();
return 0;

}

    

2025

2024

2023

2022

[linux-morello] Re: [PATCH 3/3] kselftests/arm64: morello: Add clone3 test-cases

Signed-off-by: Beata Michalska beata.michalska@arm.com