Hi All,
This series are modifications in kselftests required to compile with alpha version of Morello Gnu toochain recently released [1].
The whole series can also be found here [2].
Changes in v4 as reviewed by Kevin: * Patch 5 modified for indentation. * Patch 9 modified for cleaning up extra header. * Patch 10 modified to shrink the extra code.
Thanks, Amit Daniel
[1]: https://developer.arm.com/downloads/-/arm-gnu-toolchain-for-morello-download... [2]: git@git.morello-project.org:amitdaniel/linux.git gcc_kselftests_support_v4
Amit Daniel Kachhap (10): kselftests/arm64: morello: Fix inline assembly syntax kselftests/arm64: morello: Define uintcap_t for GCC kselftests/arm64: morello: Fix the -Wcast-function-type warnings kselftests/arm64: morello: Fix the fallthrough warning kselftests/arm64: morello: Avoid limits.h for Gcc kselftests/arm64: morello: clone: Initialize the local variable kselftests/arm64: morello: clone: Guard if clause to avoid warning kselftests/arm64: morello: Process the dynamic relocations kselftests/arm64: morello: Add optimized routine for memcpy kselftests/arm64: morello: Enable Gcc toolchain support
.../testing/selftests/arm64/morello/Makefile | 19 +- .../selftests/arm64/morello/bootstrap.c | 7 +- tools/testing/selftests/arm64/morello/clone.c | 6 +- .../selftests/arm64/morello/freestanding.h | 7 + .../arm64/morello/freestanding_init_globals.c | 133 ++++- .../arm64/morello/freestanding_start.S | 5 +- .../selftests/arm64/morello/morello_memcpy.S | 530 ++++++++++++++++++ .../testing/selftests/arm64/morello/signal.c | 4 +- 8 files changed, 689 insertions(+), 22 deletions(-) create mode 100644 tools/testing/selftests/arm64/morello/morello_memcpy.S
Inline assembly modifier(%w0) as shown below is not recognized in GCC and generates error,
asm volatile("adr %w0, #0" : "=r"(cap));
/tmp/cc49utqN.s: Assembler messages: /tmp/cc49utqN.s:1110: Error: operand mismatch -- `adr w0,#0' /tmp/cc49utqN.s:1110: Info: did you mean this? /tmp/cc49utqN.s:1110: Info: adr x0, #0x0 /tmp/cc49utqN.s:1088: Error: undefined symbol c30 used as an immediate value
Fix the above by using the syntax as below which is recognized in both Clang and GCC.
asm volatile("adr %0, #0" : "=C"(cap));
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- tools/testing/selftests/arm64/morello/bootstrap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/arm64/morello/bootstrap.c b/tools/testing/selftests/arm64/morello/bootstrap.c index 6a91ba39dd83..75c636688c5b 100644 --- a/tools/testing/selftests/arm64/morello/bootstrap.c +++ b/tools/testing/selftests/arm64/morello/bootstrap.c @@ -138,12 +138,12 @@ TEST(test_c64) { uintcap_t cap;
- asm volatile("mov %w0, c30" : "=r"(cap)); + asm volatile("mov %0, c30" : "=C"(cap)); /* LSB is set on LR if we came from C64 */ ASSERT_TRUE(cap & 0x1) TH_LOG("we did not come from C64");
/* will write to Cn register in C64 and Xn in A64, clearing the tag */ - asm volatile("adr %w0, #0" : "=r"(cap)); + asm volatile("adr %0, #0" : "=C"(cap)); ASSERT_TRUE(cheri_tag_get(cap)) TH_LOG("not running in C64"); }
Clang toolchain defines uintcap_t in stdint.h present in the system include path obtained with -print-file-name=include option.
In case of GCC toolchain uintcap_t, it is not present in the system include path but in the libc include path. As the kselftests does not use libc at the moment so define uintcap_t.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- tools/testing/selftests/arm64/morello/freestanding.h | 3 +++ .../selftests/arm64/morello/freestanding_init_globals.c | 1 + 2 files changed, 4 insertions(+)
diff --git a/tools/testing/selftests/arm64/morello/freestanding.h b/tools/testing/selftests/arm64/morello/freestanding.h index ede290f3ca1b..3575146ef732 100644 --- a/tools/testing/selftests/arm64/morello/freestanding.h +++ b/tools/testing/selftests/arm64/morello/freestanding.h @@ -21,6 +21,9 @@ typedef __kernel_timer_t timer_t; typedef __kernel_clockid_t clockid_t; typedef __kernel_uid_t uid_t; typedef __kernel_mode_t mode_t; +#ifndef __clang__ +typedef __uintcap_t uintcap_t; +#endif
#define EXIT_SUCCESS 0
diff --git a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c index 61e37d042789..6139de724f24 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c +++ b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c @@ -5,6 +5,7 @@ #include <stdbool.h> #include <stddef.h> #include <stdint.h> +#include "freestanding.h"
struct cap_reloc { size_t capability_location;
Gcc toolchain generates the following warning in signal.c
signal.c:98:26: warning: cast between incompatible function types from ‘void (*)(int, siginfo_t *, void *)’ {aka ‘void (*) (int, struct siginfo *, void *)’} to ‘void (*)(int)’ [-Wcast-function-type] sa->sa_handler = (sighandler_t)sigusr1_handler;
Fix the above by using (void *) as the intermediate typecast.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- tools/testing/selftests/arm64/morello/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/arm64/morello/signal.c b/tools/testing/selftests/arm64/morello/signal.c index 50b1d623bacb..3dcea434275c 100644 --- a/tools/testing/selftests/arm64/morello/signal.c +++ b/tools/testing/selftests/arm64/morello/signal.c @@ -95,7 +95,7 @@ static void sigusr1_handler(int n, siginfo_t *si, static void setup_sigusr1_handler(struct sigaction *sa, int mask_how) { ASSERT_EQ(sigemptyset(&sa->sa_mask), 0); - sa->sa_handler = (sighandler_t)sigusr1_handler; + sa->sa_handler = (sighandler_t)(void *)sigusr1_handler; sa->sa_flags = SA_SIGINFO; ASSERT_EQ(sigaction(SIGUSR1, sa, NULL), 0); ASSERT_EQ(sigaddset(&sa->sa_mask, SIGUSR1), 0); @@ -229,7 +229,7 @@ TEST(test_signal_basic) }; sigaltstack(&ss, 0); sigemptyset(&sa.sa_mask); - sa.sa_handler = (sighandler_t)basic_handler; + sa.sa_handler = (sighandler_t)(void *)basic_handler; sa.sa_flags = SA_SIGINFO | SA_ONSTACK; sigaction(SIGALRM, &sa, NULL); sigaction(SIGILL, &sa, NULL);
The morello Gcc toolchain generates below warning which is fixed by adding "fallthrough" comment before the next case.
bootstrap.c:87:20: warning: this statement may fall through [-Wimplicit-fallthrough=] 87 | if ((void *)auxv->a_val == NULL)
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- tools/testing/selftests/arm64/morello/bootstrap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/arm64/morello/bootstrap.c b/tools/testing/selftests/arm64/morello/bootstrap.c index 75c636688c5b..d594fcb3fade 100644 --- a/tools/testing/selftests/arm64/morello/bootstrap.c +++ b/tools/testing/selftests/arm64/morello/bootstrap.c @@ -83,9 +83,10 @@ int verify_auxval(struct morello_auxv *auxv) case AT_CHERI_INTERP_RW_CAP: case AT_CHERI_INTERP_RX_CAP: case AT_BASE: - /* Fall through if not null, abi allows it */ + /* ABI allows this entry to be null */ if ((void *)auxv->a_val == NULL) break; + /* fallthrough */ case AT_ENTRY: case AT_EXECFN: case AT_PLATFORM:
Unlike Clang, Gcc toolchain does not keep limits.h in the system include path obtained with -print-file-name=include option.
As the kselftests does not use libc at the moment so avoid using limits.h. But this causes clone.c to complain about missing INT_MAX so define INT_MAX if not defined.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- tools/testing/selftests/arm64/morello/clone.c | 1 - tools/testing/selftests/arm64/morello/freestanding.h | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/arm64/morello/clone.c b/tools/testing/selftests/arm64/morello/clone.c index a405b0fd0e4a..53e9fb236e55 100644 --- a/tools/testing/selftests/arm64/morello/clone.c +++ b/tools/testing/selftests/arm64/morello/clone.c @@ -9,7 +9,6 @@ #include <linux/errno.h> #include <linux/signal.h> #include <linux/types.h> -#include <limits.h> #include <cheriintrin.h> #include "signal_common.h"
diff --git a/tools/testing/selftests/arm64/morello/freestanding.h b/tools/testing/selftests/arm64/morello/freestanding.h index 3575146ef732..19611b315de9 100644 --- a/tools/testing/selftests/arm64/morello/freestanding.h +++ b/tools/testing/selftests/arm64/morello/freestanding.h @@ -25,6 +25,10 @@ typedef __kernel_mode_t mode_t; typedef __uintcap_t uintcap_t; #endif
+#ifndef INT_MAX +#define INT_MAX __INT_MAX__ +#endif + #define EXIT_SUCCESS 0
struct __test_meta {
Gcc toolchain generates the below warning in clone.c,
freestanding.h:44:9: warning: ‘tls’ may be used uninitialized in this function [-Wmaybe-uninitialized] 44 | __syscall(sys_no, __cap(arg1), __cap(arg2), __cap(arg3), __cap(arg4), __cap(arg5), __cap(arg6)) | ^~~~~~~~~ clone.c:392:15: note: ‘tls’ was declared here 392 | void *tls;
Fix the above warning by initializing the pointer.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- tools/testing/selftests/arm64/morello/clone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/arm64/morello/clone.c b/tools/testing/selftests/arm64/morello/clone.c index 53e9fb236e55..ca06adbf0dd9 100644 --- a/tools/testing/selftests/arm64/morello/clone.c +++ b/tools/testing/selftests/arm64/morello/clone.c @@ -385,7 +385,7 @@ void run_clone3(struct clone3_fixture *data) siginfo_t wait_si; int result; pid_t pid; - void *tls; + void *tls = NULL;
args->pidfd = (uintcap_t)&pidfd; args->parent_tid = (uintcap_t)&parent_tid;
Gcc toolchain generates the warning as shown below. Fix them by adding necessary guarding.
clone.c: In function ‘run_clone3’: freestanding.h:82:9: warning: macro expands to multiple statements [-Wmultistatement-macros] 82 | do { \ | ^~ freestanding.h:97:31: note: in expansion of macro ‘__EXPECT’ 97 | #define ASSERT(exp, seen, op) __EXPECT(exp, seen, op, 1) | ^~~~~~~~ freestanding.h:100:30: note: in expansion of macro ‘ASSERT’ 100 | #define ASSERT_EQ(exp, seen) ASSERT(exp, seen, ==) | ^~~~~~ clone.c:484:17: note: in expansion of macro ‘ASSERT_EQ’ 484 | ASSERT_EQ(child_tid, 0); | ^~~~~~~~~ clone.c:483:9: note: some parts of macro expansion are not guarded by this ‘if’ clause 483 | if (args->flags & CLONE_CHILD_CLEARTID) | ^~
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- tools/testing/selftests/arm64/morello/clone.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/arm64/morello/clone.c b/tools/testing/selftests/arm64/morello/clone.c index ca06adbf0dd9..69189287144a 100644 --- a/tools/testing/selftests/arm64/morello/clone.c +++ b/tools/testing/selftests/arm64/morello/clone.c @@ -456,8 +456,9 @@ void run_clone3(struct clone3_fixture *data) ASSERT_NE(parent_tid, 0); }
- if (args->flags & CLONE_CHILD_CLEARTID) + if (args->flags & CLONE_CHILD_CLEARTID) { ASSERT_EQ(child_tid, 0); + }
munmap((void *)args->stack, STACK_SIZE); if (args->flags & CLONE_SETTLS)
Gcc emits only dynamic relocations and not cap relocations like Clang toolchain. Hence the existing kselftests compiled in static and standalone mode need to process the dynamic relocations and fill the entries to relevant capability details.
The code changes are similar to the start up code of the recently released Arm GNU Toolchain for Morello.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- .../arm64/morello/freestanding_init_globals.c | 132 +++++++++++++++++- .../arm64/morello/freestanding_start.S | 5 +- 2 files changed, 130 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c index 6139de724f24..0522282fec1d 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c +++ b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c @@ -5,8 +5,40 @@ #include <stdbool.h> #include <stddef.h> #include <stdint.h> +#include <linux/auxvec.h> +#include <linux/elf.h> + #include "freestanding.h"
+#define DRELOC_FRAG_EXEC 4 +#define DRELOC_FRAG_RWDATA 2 +#define DRELOC_FRAG_RODATA 1 + +#define CHERI_PERM_MASK_BASE ( \ + CHERI_PERM_GLOBAL) + +#define CHERI_PERM_MASK_R ( \ + CHERI_PERM_MASK_BASE | \ + CHERI_PERM_LOAD | \ + CHERI_PERM_LOAD_CAP | \ + ARM_CAP_PERMISSION_MUTABLE_LOAD) + +#define CHERI_PERM_MASK_RW ( \ + CHERI_PERM_MASK_R | \ + CHERI_PERM_STORE | \ + CHERI_PERM_STORE_CAP | \ + CHERI_PERM_STORE_LOCAL_CAP) + +#define CHERI_PERM_MASK_RX ( \ + CHERI_PERM_MASK_R | \ + CHERI_PERM_EXECUTE | \ + ARM_CAP_PERMISSION_EXECUTIVE | \ + CHERI_PERM_SYSTEM_REGS) + +#ifndef R_MORELLO_RELATIVE +#define R_MORELLO_RELATIVE 59395 +#endif + struct cap_reloc { size_t capability_location; size_t base; @@ -23,7 +55,7 @@ struct cap_reloc { * because capability relocations must have already been processed in order to * refer to such symbols. */ -void __morello_init_globals(void) +void __morello_process_cap_relocs(void) { const struct cap_reloc *start_cap_relocs, *end_cap_relocs, *reloc; uintcap_t root_cap; @@ -33,11 +65,17 @@ void __morello_init_globals(void) * not be indirected through the GOT, as this would create a capability * relocation. We need assembly to refer to those directly. */ - asm("adrp %0, __start___cap_relocs\n\t" + asm(".weak __start___cap_relocs\n\t" + ".hidden __start___cap_relocs\n\t" + "adrp %0, __start___cap_relocs\n\t" "add %0, %0, #:lo12:__start___cap_relocs\n\t" - "adrp %1, __stop___cap_relocs\n\t" - "add %1, %1, #:lo12:__stop___cap_relocs" - : "=C"(start_cap_relocs), "=C"(end_cap_relocs)); + : "=C"(start_cap_relocs)); + + asm(".weak __stop___cap_relocs\n\t" + ".hidden __stop___cap_relocs\n\t" + "adrp %0, __stop___cap_relocs\n\t" + "add %0, %0, #:lo12:__stop___cap_relocs" + : "=C"(end_cap_relocs));
root_cap = (uintcap_t)cheri_ddc_get();
@@ -63,3 +101,87 @@ void __morello_init_globals(void) *target = cap; } } + +static void get_caps(uintptr_t *cap_rx, uintptr_t *cap_rw, const uintptr_t *auxv) +{ + for (;;) { + switch ((unsigned long)auxv[0]) { + case AT_NULL: + return; + case AT_CHERI_EXEC_RX_CAP: + *cap_rx = auxv[1]; + *cap_rx = cheri_perms_and(*cap_rx, CHERI_PERM_MASK_RX); + break; + case AT_CHERI_EXEC_RW_CAP: + *cap_rw = auxv[1]; + *cap_rw = cheri_perms_and(*cap_rw, CHERI_PERM_MASK_RW); + break; + } + auxv += 2; + } +} + +static inline uintptr_t morello_relative(uint64_t base, uintptr_t cap_rx, uintptr_t cap_rw, + Elf64_Rela *reloc, void *reloc_addr) +{ + uint64_t __attribute__((may_alias)) *u64_reloc_addr = reloc_addr; + + /* + * Fragment identified by r_offset has the following information: + * | 64-bit: address | 56-bits: length | 8-bits: permissions | + */ + unsigned long loc = u64_reloc_addr[0]; + unsigned long len = u64_reloc_addr[1] & ((1UL << 56) - 1); + unsigned long perm = u64_reloc_addr[1] >> 56; + uintptr_t value; + + if (perm == DRELOC_FRAG_RWDATA) + value = cheri_address_set(cap_rw, base + loc); + else + value = cheri_address_set(cap_rx, base + loc); + + value = cheri_bounds_set_exact(value, len); + value = value + reloc->r_addend; + + if (perm == DRELOC_FRAG_RODATA) + value = cheri_perms_and(value, CHERI_PERM_MASK_R); + + /* Seal executable capabilities with MORELLO_RB. */ + if (perm == DRELOC_FRAG_EXEC) + value = cheri_sentry_create(value); + + return value; +} + +void __morello_process_dynamic_relocs(int argc __attribute__((unused)), + char **argv __attribute__((unused)), + char **envp __attribute__((unused)), + void *auxv) +{ + Elf64_Rela *rela_dyn_start, *rela_dyn_end, *reloc; + uintptr_t cap_rx = 0; + uintptr_t cap_rw = 0; + + asm(".weak __rela_dyn_start\n\t" + ".hidden __rela_dyn_start\n\t" + "adrp %0, __rela_dyn_start\n\t" + "add %0, %0, #:lo12:__rela_dyn_start\n\t" + : "=C"(rela_dyn_start)); + + asm(".weak __rela_dyn_end\n\t" + ".hidden __rela_dyn_end\n\t" + "adrp %0, __rela_dyn_end\n\t" + "add %0, %0, #:lo12:__rela_dyn_end" + : "=C"(rela_dyn_end)); + + get_caps(&cap_rx, &cap_rw, auxv); + for (reloc = rela_dyn_start; reloc < rela_dyn_end; ++reloc) { + uintptr_t *reloc_addr, value; + + if (reloc->r_info != R_MORELLO_RELATIVE) + continue; + reloc_addr = (uintptr_t *)cheri_address_set(cap_rw, reloc->r_offset); + value = morello_relative(0, cap_rx, cap_rw, reloc, reloc_addr); + *reloc_addr = value; + } +} diff --git a/tools/testing/selftests/arm64/morello/freestanding_start.S b/tools/testing/selftests/arm64/morello/freestanding_start.S index 66dfe2bab676..27e81df006c2 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_start.S +++ b/tools/testing/selftests/arm64/morello/freestanding_start.S @@ -25,7 +25,7 @@ * besides the .space we specify: * - a space for a capability pointing to it (in the .got section) * - a capability relocation (in __cap_relocs section) to make the capability - * valid (see __morello_init_globals() for more details) + * valid (see __morello_process_{cap,dynamic}_relocs() for more details) * We use * adrp c0, :got:name * ldr c0, [c0, :got_lo12:name] @@ -72,7 +72,8 @@ FUNCTION_START(_start) mov c22, c2 mov c23, c3
- bl __morello_init_globals + bl __morello_process_cap_relocs + bl __morello_process_dynamic_relocs
/* save the initial kernel stack to a global variable */ got_get_ptr stack_from_kernel, c0
On 16/02/2023 07:07, Amit Daniel Kachhap wrote:
Gcc emits only dynamic relocations and not cap relocations like Clang toolchain. Hence the existing kselftests compiled in static and standalone mode need to process the dynamic relocations and fill the entries to relevant capability details.
The code changes are similar to the start up code of the recently released Arm GNU Toolchain for Morello.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com
.../arm64/morello/freestanding_init_globals.c | 132 +++++++++++++++++- .../arm64/morello/freestanding_start.S | 5 +- 2 files changed, 130 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c index 6139de724f24..0522282fec1d 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c +++ b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c @@ -5,8 +5,40 @@ #include <stdbool.h> #include <stddef.h> #include <stdint.h> +#include <linux/auxvec.h> +#include <linux/elf.h>
#include "freestanding.h" +#define DRELOC_FRAG_EXEC 4 +#define DRELOC_FRAG_RWDATA 2 +#define DRELOC_FRAG_RODATA 1
+#define CHERI_PERM_MASK_BASE ( \
- CHERI_PERM_GLOBAL)
+#define CHERI_PERM_MASK_R ( \
- CHERI_PERM_MASK_BASE | \
- CHERI_PERM_LOAD | \
- CHERI_PERM_LOAD_CAP | \
- ARM_CAP_PERMISSION_MUTABLE_LOAD)
+#define CHERI_PERM_MASK_RW ( \
- CHERI_PERM_MASK_R | \
- CHERI_PERM_STORE | \
- CHERI_PERM_STORE_CAP | \
- CHERI_PERM_STORE_LOCAL_CAP)
+#define CHERI_PERM_MASK_RX ( \
- CHERI_PERM_MASK_R | \
- CHERI_PERM_EXECUTE | \
- ARM_CAP_PERMISSION_EXECUTIVE | \
- CHERI_PERM_SYSTEM_REGS)
+#ifndef R_MORELLO_RELATIVE +#define R_MORELLO_RELATIVE 59395 +#endif
struct cap_reloc { size_t capability_location; size_t base; @@ -23,7 +55,7 @@ struct cap_reloc {
- because capability relocations must have already been processed in order to
- refer to such symbols.
*/ -void __morello_init_globals(void) +void __morello_process_cap_relocs(void) { const struct cap_reloc *start_cap_relocs, *end_cap_relocs, *reloc; uintcap_t root_cap; @@ -33,11 +65,17 @@ void __morello_init_globals(void) * not be indirected through the GOT, as this would create a capability * relocation. We need assembly to refer to those directly. */
- asm("adrp %0, __start___cap_relocs\n\t"
- asm(".weak __start___cap_relocs\n\t"
".hidden __start___cap_relocs\n\t"
"adrp %0, __start___cap_relocs\n\t" "add %0, %0, #:lo12:__start___cap_relocs\n\t"
"adrp %1, __stop___cap_relocs\n\t"
"add %1, %1, #:lo12:__stop___cap_relocs"
: "=C"(start_cap_relocs), "=C"(end_cap_relocs));
: "=C"(start_cap_relocs));
- asm(".weak __stop___cap_relocs\n\t"
".hidden __stop___cap_relocs\n\t"
"adrp %0, __stop___cap_relocs\n\t"
"add %0, %0, #:lo12:__stop___cap_relocs"
: "=C"(end_cap_relocs));
root_cap = (uintcap_t)cheri_ddc_get(); @@ -63,3 +101,87 @@ void __morello_init_globals(void) *target = cap; } }
+static void get_caps(uintptr_t *cap_rx, uintptr_t *cap_rw, const uintptr_t *auxv) +{
- for (;;) {
switch ((unsigned long)auxv[0]) {
case AT_NULL:
return;
case AT_CHERI_EXEC_RX_CAP:
*cap_rx = auxv[1];
*cap_rx = cheri_perms_and(*cap_rx, CHERI_PERM_MASK_RX);
break;
case AT_CHERI_EXEC_RW_CAP:
*cap_rw = auxv[1];
*cap_rw = cheri_perms_and(*cap_rw, CHERI_PERM_MASK_RW);
break;
}
auxv += 2;
- }
+}
+static inline uintptr_t morello_relative(uint64_t base, uintptr_t cap_rx, uintptr_t cap_rw,
Elf64_Rela *reloc, void *reloc_addr)
+{
- uint64_t __attribute__((may_alias)) *u64_reloc_addr = reloc_addr;
- /*
* Fragment identified by r_offset has the following information:
* | 64-bit: address | 56-bits: length | 8-bits: permissions |
*/
- unsigned long loc = u64_reloc_addr[0];
- unsigned long len = u64_reloc_addr[1] & ((1UL << 56) - 1);
- unsigned long perm = u64_reloc_addr[1] >> 56;
- uintptr_t value;
- if (perm == DRELOC_FRAG_RWDATA)
value = cheri_address_set(cap_rw, base + loc);
- else
value = cheri_address_set(cap_rx, base + loc);
- value = cheri_bounds_set_exact(value, len);
- value = value + reloc->r_addend;
- if (perm == DRELOC_FRAG_RODATA)
value = cheri_perms_and(value, CHERI_PERM_MASK_R);
- /* Seal executable capabilities with MORELLO_RB. */
- if (perm == DRELOC_FRAG_EXEC)
value = cheri_sentry_create(value);
- return value;
+}
+void __morello_process_dynamic_relocs(int argc __attribute__((unused)),
char **argv __attribute__((unused)),
char **envp __attribute__((unused)),
void *auxv)
+{
- Elf64_Rela *rela_dyn_start, *rela_dyn_end, *reloc;
- uintptr_t cap_rx = 0;
- uintptr_t cap_rw = 0;
- asm(".weak __rela_dyn_start\n\t"
".hidden __rela_dyn_start\n\t"
"adrp %0, __rela_dyn_start\n\t"
"add %0, %0, #:lo12:__rela_dyn_start\n\t"
: "=C"(rela_dyn_start));
- asm(".weak __rela_dyn_end\n\t"
".hidden __rela_dyn_end\n\t"
"adrp %0, __rela_dyn_end\n\t"
"add %0, %0, #:lo12:__rela_dyn_end"
: "=C"(rela_dyn_end));
- get_caps(&cap_rx, &cap_rw, auxv);
- for (reloc = rela_dyn_start; reloc < rela_dyn_end; ++reloc) {
uintptr_t *reloc_addr, value;
if (reloc->r_info != R_MORELLO_RELATIVE)
continue;
reloc_addr = (uintptr_t *)cheri_address_set(cap_rw, reloc->r_offset);
value = morello_relative(0, cap_rx, cap_rw, reloc, reloc_addr);
*reloc_addr = value;
- }
+} diff --git a/tools/testing/selftests/arm64/morello/freestanding_start.S b/tools/testing/selftests/arm64/morello/freestanding_start.S index 66dfe2bab676..27e81df006c2 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_start.S +++ b/tools/testing/selftests/arm64/morello/freestanding_start.S @@ -25,7 +25,7 @@
- besides the .space we specify:
- a space for a capability pointing to it (in the .got section)
- a capability relocation (in __cap_relocs section) to make the capability
- valid (see __morello_init_globals() for more details)
- valid (see __morello_process_{cap,dynamic}_relocs() for more details)
- We use
- adrp c0, :got:name
- ldr c0, [c0, :got_lo12:name]
@@ -72,7 +72,8 @@ FUNCTION_START(_start) mov c22, c2 mov c23, c3
- bl __morello_init_globals
- bl __morello_process_cap_relocs
- bl __morello_process_dynamic_relocs
I have just realised something: __morello_process_dynamic_relocs() takes 4 arguments (ignoring all but the 4th), i.e. it relies on c3 to be set to its startup value. However since we are calling __morello_process_cap_relocs() first, isn't that working by pure accident (i.e. __morello_process_cap_relocs() doesn't corrupt c3 by chance)?
We could just add a mov c0, c22 here and remove the unused arguments of __morello_process_dynamic_relocs(), that's arguably easier to understand anyway.
Kevin
/* save the initial kernel stack to a global variable */ got_get_ptr stack_from_kernel, c0
On 2/20/23 13:56, Kevin Brodsky wrote:
On 16/02/2023 07:07, Amit Daniel Kachhap wrote:
Gcc emits only dynamic relocations and not cap relocations like Clang toolchain. Hence the existing kselftests compiled in static and standalone mode need to process the dynamic relocations and fill the entries to relevant capability details.
The code changes are similar to the start up code of the recently released Arm GNU Toolchain for Morello.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com
.../arm64/morello/freestanding_init_globals.c | 132 +++++++++++++++++- .../arm64/morello/freestanding_start.S | 5 +- 2 files changed, 130 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c index 6139de724f24..0522282fec1d 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c +++ b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c @@ -5,8 +5,40 @@ #include <stdbool.h> #include <stddef.h> #include <stdint.h> +#include <linux/auxvec.h> +#include <linux/elf.h>
- #include "freestanding.h"
+#define DRELOC_FRAG_EXEC 4 +#define DRELOC_FRAG_RWDATA 2 +#define DRELOC_FRAG_RODATA 1
+#define CHERI_PERM_MASK_BASE ( \
- CHERI_PERM_GLOBAL)
+#define CHERI_PERM_MASK_R ( \
- CHERI_PERM_MASK_BASE | \
- CHERI_PERM_LOAD | \
- CHERI_PERM_LOAD_CAP | \
- ARM_CAP_PERMISSION_MUTABLE_LOAD)
+#define CHERI_PERM_MASK_RW ( \
- CHERI_PERM_MASK_R | \
- CHERI_PERM_STORE | \
- CHERI_PERM_STORE_CAP | \
- CHERI_PERM_STORE_LOCAL_CAP)
+#define CHERI_PERM_MASK_RX ( \
- CHERI_PERM_MASK_R | \
- CHERI_PERM_EXECUTE | \
- ARM_CAP_PERMISSION_EXECUTIVE | \
- CHERI_PERM_SYSTEM_REGS)
+#ifndef R_MORELLO_RELATIVE +#define R_MORELLO_RELATIVE 59395 +#endif
- struct cap_reloc { size_t capability_location; size_t base;
@@ -23,7 +55,7 @@ struct cap_reloc {
- because capability relocations must have already been processed in order to
- refer to such symbols.
*/ -void __morello_init_globals(void) +void __morello_process_cap_relocs(void) { const struct cap_reloc *start_cap_relocs, *end_cap_relocs, *reloc; uintcap_t root_cap; @@ -33,11 +65,17 @@ void __morello_init_globals(void) * not be indirected through the GOT, as this would create a capability * relocation. We need assembly to refer to those directly. */
- asm("adrp %0, __start___cap_relocs\n\t"
- asm(".weak __start___cap_relocs\n\t"
".hidden __start___cap_relocs\n\t"
"adrp %0, __start___cap_relocs\n\t" "add %0, %0, #:lo12:__start___cap_relocs\n\t"
"adrp %1, __stop___cap_relocs\n\t"
"add %1, %1, #:lo12:__stop___cap_relocs"
: "=C"(start_cap_relocs), "=C"(end_cap_relocs));
: "=C"(start_cap_relocs));
- asm(".weak __stop___cap_relocs\n\t"
".hidden __stop___cap_relocs\n\t"
"adrp %0, __stop___cap_relocs\n\t"
"add %0, %0, #:lo12:__stop___cap_relocs"
: "=C"(end_cap_relocs));
root_cap = (uintcap_t)cheri_ddc_get(); @@ -63,3 +101,87 @@ void __morello_init_globals(void) *target = cap; } }
+static void get_caps(uintptr_t *cap_rx, uintptr_t *cap_rw, const uintptr_t *auxv) +{
- for (;;) {
switch ((unsigned long)auxv[0]) {
case AT_NULL:
return;
case AT_CHERI_EXEC_RX_CAP:
*cap_rx = auxv[1];
*cap_rx = cheri_perms_and(*cap_rx, CHERI_PERM_MASK_RX);
break;
case AT_CHERI_EXEC_RW_CAP:
*cap_rw = auxv[1];
*cap_rw = cheri_perms_and(*cap_rw, CHERI_PERM_MASK_RW);
break;
}
auxv += 2;
- }
+}
+static inline uintptr_t morello_relative(uint64_t base, uintptr_t cap_rx, uintptr_t cap_rw,
Elf64_Rela *reloc, void *reloc_addr)
+{
- uint64_t __attribute__((may_alias)) *u64_reloc_addr = reloc_addr;
- /*
* Fragment identified by r_offset has the following information:
* | 64-bit: address | 56-bits: length | 8-bits: permissions |
*/
- unsigned long loc = u64_reloc_addr[0];
- unsigned long len = u64_reloc_addr[1] & ((1UL << 56) - 1);
- unsigned long perm = u64_reloc_addr[1] >> 56;
- uintptr_t value;
- if (perm == DRELOC_FRAG_RWDATA)
value = cheri_address_set(cap_rw, base + loc);
- else
value = cheri_address_set(cap_rx, base + loc);
- value = cheri_bounds_set_exact(value, len);
- value = value + reloc->r_addend;
- if (perm == DRELOC_FRAG_RODATA)
value = cheri_perms_and(value, CHERI_PERM_MASK_R);
- /* Seal executable capabilities with MORELLO_RB. */
- if (perm == DRELOC_FRAG_EXEC)
value = cheri_sentry_create(value);
- return value;
+}
+void __morello_process_dynamic_relocs(int argc __attribute__((unused)),
char **argv __attribute__((unused)),
char **envp __attribute__((unused)),
void *auxv)
+{
- Elf64_Rela *rela_dyn_start, *rela_dyn_end, *reloc;
- uintptr_t cap_rx = 0;
- uintptr_t cap_rw = 0;
- asm(".weak __rela_dyn_start\n\t"
".hidden __rela_dyn_start\n\t"
"adrp %0, __rela_dyn_start\n\t"
"add %0, %0, #:lo12:__rela_dyn_start\n\t"
: "=C"(rela_dyn_start));
- asm(".weak __rela_dyn_end\n\t"
".hidden __rela_dyn_end\n\t"
"adrp %0, __rela_dyn_end\n\t"
"add %0, %0, #:lo12:__rela_dyn_end"
: "=C"(rela_dyn_end));
- get_caps(&cap_rx, &cap_rw, auxv);
- for (reloc = rela_dyn_start; reloc < rela_dyn_end; ++reloc) {
uintptr_t *reloc_addr, value;
if (reloc->r_info != R_MORELLO_RELATIVE)
continue;
reloc_addr = (uintptr_t *)cheri_address_set(cap_rw, reloc->r_offset);
value = morello_relative(0, cap_rx, cap_rw, reloc, reloc_addr);
*reloc_addr = value;
- }
+} diff --git a/tools/testing/selftests/arm64/morello/freestanding_start.S b/tools/testing/selftests/arm64/morello/freestanding_start.S index 66dfe2bab676..27e81df006c2 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_start.S +++ b/tools/testing/selftests/arm64/morello/freestanding_start.S @@ -25,7 +25,7 @@
- besides the .space we specify:
- a space for a capability pointing to it (in the .got section)
- a capability relocation (in __cap_relocs section) to make the capability
- valid (see __morello_init_globals() for more details)
- valid (see __morello_process_{cap,dynamic}_relocs() for more details)
- We use
- adrp c0, :got:name
- ldr c0, [c0, :got_lo12:name]
@@ -72,7 +72,8 @@ FUNCTION_START(_start) mov c22, c2 mov c23, c3
- bl __morello_init_globals
- bl __morello_process_cap_relocs
- bl __morello_process_dynamic_relocs
I have just realised something: __morello_process_dynamic_relocs() takes 4 arguments (ignoring all but the 4th), i.e. it relies on c3 to be set to its startup value. However since we are calling __morello_process_cap_relocs() first, isn't that working by pure accident (i.e. __morello_process_cap_relocs() doesn't corrupt c3 by chance)?
We could just add a mov c0, c22 here and remove the unused arguments of __morello_process_dynamic_relocs(), that's arguably easier to understand anyway.
Clearly this looks buggy. I will resend this patch again.
Amit
Kevin
/* save the initial kernel stack to a global variable */
got_get_ptr stack_from_kernel, c0
Gcc emits only dynamic relocations and not cap relocations like Clang toolchain. Hence the existing kselftests compiled in static and standalone mode need to process the dynamic relocations and fill the entries to relevant capability details.
The code changes are similar to the start up code of the recently released Arm GNU Toolchain for Morello.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- .../arm64/morello/freestanding_init_globals.c | 129 +++++++++++++++++- .../arm64/morello/freestanding_start.S | 10 +- 2 files changed, 131 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c index 6139de724f24..ddb1635a8762 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c +++ b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c @@ -5,8 +5,40 @@ #include <stdbool.h> #include <stddef.h> #include <stdint.h> +#include <linux/auxvec.h> +#include <linux/elf.h> + #include "freestanding.h"
+#define DRELOC_FRAG_EXEC 4 +#define DRELOC_FRAG_RWDATA 2 +#define DRELOC_FRAG_RODATA 1 + +#define CHERI_PERM_MASK_BASE ( \ + CHERI_PERM_GLOBAL) + +#define CHERI_PERM_MASK_R ( \ + CHERI_PERM_MASK_BASE | \ + CHERI_PERM_LOAD | \ + CHERI_PERM_LOAD_CAP | \ + ARM_CAP_PERMISSION_MUTABLE_LOAD) + +#define CHERI_PERM_MASK_RW ( \ + CHERI_PERM_MASK_R | \ + CHERI_PERM_STORE | \ + CHERI_PERM_STORE_CAP | \ + CHERI_PERM_STORE_LOCAL_CAP) + +#define CHERI_PERM_MASK_RX ( \ + CHERI_PERM_MASK_R | \ + CHERI_PERM_EXECUTE | \ + ARM_CAP_PERMISSION_EXECUTIVE | \ + CHERI_PERM_SYSTEM_REGS) + +#ifndef R_MORELLO_RELATIVE +#define R_MORELLO_RELATIVE 59395 +#endif + struct cap_reloc { size_t capability_location; size_t base; @@ -23,7 +55,7 @@ struct cap_reloc { * because capability relocations must have already been processed in order to * refer to such symbols. */ -void __morello_init_globals(void) +void __morello_process_cap_relocs(void) { const struct cap_reloc *start_cap_relocs, *end_cap_relocs, *reloc; uintcap_t root_cap; @@ -33,11 +65,17 @@ void __morello_init_globals(void) * not be indirected through the GOT, as this would create a capability * relocation. We need assembly to refer to those directly. */ - asm("adrp %0, __start___cap_relocs\n\t" + asm(".weak __start___cap_relocs\n\t" + ".hidden __start___cap_relocs\n\t" + "adrp %0, __start___cap_relocs\n\t" "add %0, %0, #:lo12:__start___cap_relocs\n\t" - "adrp %1, __stop___cap_relocs\n\t" - "add %1, %1, #:lo12:__stop___cap_relocs" - : "=C"(start_cap_relocs), "=C"(end_cap_relocs)); + : "=C"(start_cap_relocs)); + + asm(".weak __stop___cap_relocs\n\t" + ".hidden __stop___cap_relocs\n\t" + "adrp %0, __stop___cap_relocs\n\t" + "add %0, %0, #:lo12:__stop___cap_relocs" + : "=C"(end_cap_relocs));
root_cap = (uintcap_t)cheri_ddc_get();
@@ -63,3 +101,84 @@ void __morello_init_globals(void) *target = cap; } } + +static void get_caps(uintptr_t *cap_rx, uintptr_t *cap_rw, const uintptr_t *auxv) +{ + for (;;) { + switch ((unsigned long)auxv[0]) { + case AT_NULL: + return; + case AT_CHERI_EXEC_RX_CAP: + *cap_rx = auxv[1]; + *cap_rx = cheri_perms_and(*cap_rx, CHERI_PERM_MASK_RX); + break; + case AT_CHERI_EXEC_RW_CAP: + *cap_rw = auxv[1]; + *cap_rw = cheri_perms_and(*cap_rw, CHERI_PERM_MASK_RW); + break; + } + auxv += 2; + } +} + +static inline uintptr_t morello_relative(uint64_t base, uintptr_t cap_rx, uintptr_t cap_rw, + Elf64_Rela *reloc, void *reloc_addr) +{ + uint64_t __attribute__((may_alias)) *u64_reloc_addr = reloc_addr; + + /* + * Fragment identified by r_offset has the following information: + * | 64-bit: address | 56-bits: length | 8-bits: permissions | + */ + unsigned long loc = u64_reloc_addr[0]; + unsigned long len = u64_reloc_addr[1] & ((1UL << 56) - 1); + unsigned long perm = u64_reloc_addr[1] >> 56; + uintptr_t value; + + if (perm == DRELOC_FRAG_RWDATA) + value = cheri_address_set(cap_rw, base + loc); + else + value = cheri_address_set(cap_rx, base + loc); + + value = cheri_bounds_set_exact(value, len); + value = value + reloc->r_addend; + + if (perm == DRELOC_FRAG_RODATA) + value = cheri_perms_and(value, CHERI_PERM_MASK_R); + + /* Seal executable capabilities with MORELLO_RB. */ + if (perm == DRELOC_FRAG_EXEC) + value = cheri_sentry_create(value); + + return value; +} + +void __morello_process_dynamic_relocs(void *auxv) +{ + Elf64_Rela *rela_dyn_start, *rela_dyn_end, *reloc; + uintptr_t cap_rx = 0; + uintptr_t cap_rw = 0; + + asm(".weak __rela_dyn_start\n\t" + ".hidden __rela_dyn_start\n\t" + "adrp %0, __rela_dyn_start\n\t" + "add %0, %0, #:lo12:__rela_dyn_start\n\t" + : "=C"(rela_dyn_start)); + + asm(".weak __rela_dyn_end\n\t" + ".hidden __rela_dyn_end\n\t" + "adrp %0, __rela_dyn_end\n\t" + "add %0, %0, #:lo12:__rela_dyn_end" + : "=C"(rela_dyn_end)); + + get_caps(&cap_rx, &cap_rw, auxv); + for (reloc = rela_dyn_start; reloc < rela_dyn_end; ++reloc) { + uintptr_t *reloc_addr, value; + + if (reloc->r_info != R_MORELLO_RELATIVE) + continue; + reloc_addr = (uintptr_t *)cheri_address_set(cap_rw, reloc->r_offset); + value = morello_relative(0, cap_rx, cap_rw, reloc, reloc_addr); + *reloc_addr = value; + } +} diff --git a/tools/testing/selftests/arm64/morello/freestanding_start.S b/tools/testing/selftests/arm64/morello/freestanding_start.S index 66dfe2bab676..92260fdc384d 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_start.S +++ b/tools/testing/selftests/arm64/morello/freestanding_start.S @@ -25,7 +25,7 @@ * besides the .space we specify: * - a space for a capability pointing to it (in the .got section) * - a capability relocation (in __cap_relocs section) to make the capability - * valid (see __morello_init_globals() for more details) + * valid (see __morello_process_{cap,dynamic}_relocs() for more details) * We use * adrp c0, :got:name * ldr c0, [c0, :got_lo12:name] @@ -72,7 +72,11 @@ FUNCTION_START(_start) mov c22, c2 mov c23, c3
- bl __morello_init_globals + /* call void __morello_process_cap_relocs(void) */ + bl __morello_process_cap_relocs + /* call void __morello_process_dynamic_relocs(char *auxv) */ + mov c0, c23 + bl __morello_process_dynamic_relocs
/* save the initial kernel stack to a global variable */ got_get_ptr stack_from_kernel, c0 @@ -90,7 +94,7 @@ FUNCTION_START(_start) mov csp, c0 #endif
- /* call int main(int argc, char **argv, char **envp, char **auxv) */ + /* call int main(int argc, char **argv, char **envp, char *auxv) */ mov x0, x20 mov c1, c21 mov c2, c22
Gcc toolchain complains about the missing memcpy function with -nostdlib linker option so a memcpy/memmove implementation is added.
This commit is similar to previous commit "arm64: morello: Use the Morello optimized routine for memcpy" which adds an optimized routine for memcpy in the kernel.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- .../testing/selftests/arm64/morello/Makefile | 3 +- .../selftests/arm64/morello/morello_memcpy.S | 530 ++++++++++++++++++ 2 files changed, 532 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/morello/morello_memcpy.S
diff --git a/tools/testing/selftests/arm64/morello/Makefile b/tools/testing/selftests/arm64/morello/Makefile index 21906770f216..72f34426c7c0 100644 --- a/tools/testing/selftests/arm64/morello/Makefile +++ b/tools/testing/selftests/arm64/morello/Makefile @@ -33,7 +33,8 @@ $(OUTPUT)/%.o:%.S $(DEPS) $(OUTPUT)/%.o:%.c $(DEPS) $(CC) $< -o $@ $(CFLAGS) -c
-$(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/freestanding_start.o $(OUTPUT)/freestanding_init_globals.o $(OUTPUT)/freestanding.o +$(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/freestanding_start.o $(OUTPUT)/freestanding_init_globals.o \ + $(OUTPUT)/freestanding.o $(OUTPUT)/morello_memcpy.o $(CC) $^ -o $@ $(LDFLAGS)
$(OUTPUT)/signal: $(OUTPUT)/signal_common.o diff --git a/tools/testing/selftests/arm64/morello/morello_memcpy.S b/tools/testing/selftests/arm64/morello/morello_memcpy.S new file mode 100644 index 000000000000..d04d7e693031 --- /dev/null +++ b/tools/testing/selftests/arm64/morello/morello_memcpy.S @@ -0,0 +1,530 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2019-2023, Arm Limited. + * + * morello_memcpy - copy memory area + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/1c9fdae82a43049e/str... + */ + +#define FUNCTION_START(name) \ + .global name; \ + .align 4; \ + .type name STT_FUNC; \ + name: + +#define FUNCTION_END(name) \ + .size name, .-name + +#define FUNCTION_ALIAS(name) \ + .global name; \ + .type name STT_FUNC; \ + name: + +#define L(label) .L ## label + +#define xdstin x0 +#define xsrc x1 +#define count x2 +#define xdst x3 +#define xsrcend x4 +#define xdstend x5 +#define auoff x14 +#define cap_count x15 +#define tmp1 x16 +#define tmp2 x17 + +#if defined(__CHERI_PURE_CAPABILITY__) +#define dstin c0 +#define src c1 +#define dst c3 +#define srcend c4 +#define dstend c5 +#define tmp1_ptr c16 +#else +#define dstin x0 +#define src x1 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1_ptr x16 +#endif + +#define A_l x6 +#define B_l x7 +#define C_l x8 +#define D_l x9 +#define E_l x10 +#define F_l x11 +#define G_l x12 +#define H_l x13 + +#define A_cap c6 +#define B_cap c7 +#define C_cap c8 +#define D_cap c9 +#define E_cap c10 +#define F_cap c11 +#define G_cap c12 +#define H_cap c13 + +/* This algorithm has not been benchmarked. It's derived + from the base aarch64 one with small changes to account + for copying tags. + 1. We're copying less than 16 bytes, so no capabilities. + Use the traditional code path for these. + 2. src mod 16 != dst mode 16. We're not copying capabilities, + so again use the traditional memcpy. + 3. We're copying more than 8 capabilities plus the head and tail. + a. No overlap, use forward copy + b. Overlap, use backward copy + 4. We're copying 0..8 capabilities + a. No capabilities to copy. This means we are copying 16..30 bytes. + Use the existing code path to do this from the original algorithm. + b. Copying 1..2 capabilities plus the head and tail + Use a branchless sequence. + c. Copying 3..4 capabilities plus the head and tail + Use a branchless sequence. + d. Copying 5..8 capabilities plus the head and tail + Use a branchless sequence. + */ +FUNCTION_ALIAS(memmove) +FUNCTION_START(memcpy) + add srcend, src, count + add dstend, dstin, count + + /* Copies of less than 16 bytes don't use capabilities. */ + cmp count, 16 + b.lo L(copy16) + + /* If src mod 16 != dst mod 16 we're not transferring tags. */ + and tmp1, xsrc, 15 + and tmp2, xdstin, 15 + cmp tmp1, tmp2 + b.ne L(memcpy_nocap) + + /* Get the number of capabilities that we need to store. */ + neg tmp2, tmp1 + add tmp2, tmp2, 16 + and auoff, tmp2, 15 + + sub cap_count, count, auoff + lsr cap_count, cap_count, 4 + + cmp cap_count, 8 + b.hi L(copy_long_cap) + cmp cap_count, 2 + b.hi L(copy32_128_cap) + + /* Copy 0..2 capabilities using a branchless sequence. */ + cbz cap_count, L(copy32) + ldp E_l, F_l, [src] + ldp C_l, D_l, [srcend, -16] + add src, src, auoff /* align up src to 16 bytes */ +#if defined(__CHERI_PURE_CAPABILITY__) + alignd srcend, srcend, 4 +#else + bic srcend, srcend, 15 +#endif + ldr A_cap, [src] + ldr B_cap, [srcend, -16] + stp E_l, F_l, [dstin] + stp C_l, D_l, [dstend, -16] + add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */ +#if defined (__CHERI_PURE_CAPABILITY__) + alignd dstend, dstend, 4 +#else + bic dstend, dstend, 15 +#endif + str A_cap, [tmp1_ptr] + str B_cap, [dstend, -16] + ret + + .p2align 4 +L(copy32_128_cap): + cmp cap_count, 4 + b.hi L(copy128_cap) + /* Copy 3..4 capabilities using a branchless sequence. */ + ldp E_l, F_l, [src] + ldp G_l, H_l, [srcend, -16] + add src, src, auoff /* align up src to 16 bytes */ +#if defined (__CHERI_PURE_CAPABILITY__) + alignd srcend, srcend, 4 +#else + bic srcend, srcend, 15 +#endif + ldp A_cap, B_cap, [src] + ldp C_cap, D_cap, [srcend, -32] + stp E_l, F_l, [dstin] + stp G_l, H_l, [dstend, -16] + add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */ +#if defined (__CHERI_PURE_CAPABILITY__) + alignd dstend, dstend, 4 +#else + bic dstend, dstend, 15 +#endif + stp A_cap, B_cap, [tmp1_ptr] + stp C_cap, D_cap, [dstend, -32] + ret + + .p2align 4 +L(copy128_cap): + /* Copy 5..8 capabilities using a branchless sequence. */ + ldp count, tmp2, [src] + ldp tmp1, cap_count, [srcend, -16] + add src, src, auoff /* align up src to 16 bytes */ +#if defined (__CHERI_PURE_CAPABILITY__) + alignd srcend, srcend, 4 +#else + bic srcend, srcend, 15 +#endif + ldp A_cap, B_cap, [src] + ldp C_cap, D_cap, [src, 32] + ldp E_cap, F_cap, [srcend, -32] + ldp G_cap, H_cap, [srcend, -64] + stp count, tmp2, [dstin] + stp tmp1, cap_count, [dstend, -16] + add tmp1_ptr, dstin, auoff /* align up src to 16 bytes */ +#if defined (__CHERI_PURE_CAPABILITY__) + alignd dstend, dstend, 4 +#else + bic dstend, dstend, 15 +#endif + stp A_cap, B_cap, [tmp1_ptr] + stp C_cap, D_cap, [tmp1_ptr, 32] + stp E_cap, F_cap, [dstend, -32] + stp G_cap, H_cap, [dstend, -64] + ret + +L(copy_long_cap): + /* Use backwards copy if there is an overlap. */ + sub tmp1, xdstin, xsrc + cmp tmp1, count + b.lo L(copy_long_backwards_cap) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldp C_l, D_l, [src] + ldr E_cap, [src, auoff] + and tmp1, xsrc, 15 +#if defined(__CHERI_PURE_CAPABILITY__) + alignd src, src, 4 + neg tmp2, tmp1 + add dst, dstin, tmp2 +#else + bic src, src, 15 + sub dst, dstin, tmp1 +#endif + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_cap, B_cap, [src, 16] + stp C_l, D_l, [dstin] + str E_cap, [dstin, auoff] + ldp C_cap, D_cap, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end_cap) +L(loop64_cap): + stp A_cap, B_cap, [dst, 16] + ldp A_cap, B_cap, [src, 80] + stp C_cap, D_cap, [dst, 48] + ldp C_cap, D_cap, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64_cap) + + /* Write the last iteration and copy the last 16-byte aligned 64 byte block + from the end and the tail. */ +L(copy64_from_end_cap): + ldp G_l, H_l, [srcend, -16] +#if defined(__CHERI_PURE_CAPABILITY__) + alignd srcend, srcend, 4 + alignd tmp1_ptr, dstend, 4 +#else + bic srcend, srcend, 15 + bic tmp1_ptr, dstend, 15 +#endif + ldp E_cap, F_cap, [srcend, -64] + stp A_cap, B_cap, [dst, 16] + ldp A_cap, B_cap, [srcend, -32] + stp C_cap, D_cap, [dst, 48] + stp E_cap, F_cap, [tmp1_ptr, -64] + stp G_l, H_l, [dstend, -16] + stp A_cap, B_cap, [tmp1_ptr, -32] + ret + +L(copy_long_backwards_cap): + cbz tmp1, L(copy0) + ldp E_l, F_l, [srcend, -16] + and tmp1, xsrcend, 15 +#if defined(__CHERI_PURE_CAPABILITY__) + alignd srcend, srcend, 4 + neg tmp2, tmp1 + add count, count, tmp2 +#else + bic srcend, srcend, 15 + sub count, count, tmp1 +#endif + ldp A_cap, B_cap, [srcend, -32] + stp E_l, F_l, [dstend, -16] + ldp C_cap, D_cap, [srcend, -64] +#if defined(__CHERI_PURE_CAPABILITY__) + add dstend, dstend, tmp2 /* tmp1 was negated above to tmp2. */ +#else + sub dstend, dstend, tmp1 +#endif + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards_cap): + str B_cap, [dstend, -16] + str A_cap, [dstend, -32] + ldp A_cap, B_cap, [srcend, -96] + str D_cap, [dstend, -48] + str C_cap, [dstend, -64]! + ldp C_cap, D_cap, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards_cap) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start_cap): + ldp G_l, H_l, [src] + add src, src, auoff /* align up src to 16 bytes */ + add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */ + ldp E_cap, F_cap, [src, 32] + stp A_cap, B_cap, [dstend, -32] + ldp A_cap, B_cap, [src] + stp C_cap, D_cap, [dstend, -64] + stp E_cap, F_cap, [tmp1_ptr, 32] + stp G_l, H_l, [dstin] + stp A_cap, B_cap, [tmp1_ptr] + ret + +L(memcpy_nocap): + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + +#undef A_l +#undef B_l +#undef C_l +#undef D_l +#undef E_l +#undef F_l +#undef G_l +#undef H_l +#undef tmp1 +#undef tmp1_ptr +#undef tmp2 + +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h xdst +#define H_l xsrc +#define H_h xsrcend +#define tmp1 E_l +#define tmp2 F_l + +L(copy32): + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, xdstin, xsrc + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, xdstin, 15 +#if defined(__CHERI_PURE_CAPABILITY__) + alignd dst, dstin, 4 + neg tmp2, tmp1 + add src, src, tmp2 +#else + bic dst, dstin, 15 + sub src, src, tmp1 +#endif + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) + +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, xdstend, 15 +#if defined(__CHERI_PURE_CAPABILITY__) + neg tmp2, tmp1 + add srcend, srcend, tmp2 +#else + sub srcend, srcend, tmp1 +#endif + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! +#if defined(__CHERI_PURE_CAPABILITY__) + add dstend, dstend, tmp2 +#else + sub dstend, dstend, tmp1 +#endif + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret + +FUNCTION_END(memcpy) +FUNCTION_END(memmove)
This commit allows morello kselftest to be compiled by Gcc toolchain. While at it, remove clang compiler flags -integrated-as, --target, -g, -Wall as they are already set in kselftest and arm64 root makefile.
Note: This commit requires CC or LLVM to be set for Clang toolchain which was not the case earlier.
Signed-off-by: Amit Daniel Kachhap amit.kachhap@arm.com --- tools/testing/selftests/arm64/morello/Makefile | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/tools/testing/selftests/arm64/morello/Makefile b/tools/testing/selftests/arm64/morello/Makefile index 72f34426c7c0..ecfa6fc5e989 100644 --- a/tools/testing/selftests/arm64/morello/Makefile +++ b/tools/testing/selftests/arm64/morello/Makefile @@ -1,16 +1,20 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (C) 2021 Arm Limited
-# lib.mk sets CC. This switch triggers it to clang +# Set various Clang flags if LLVM is defined or CC is defined as clang +ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),) +# Explicitly define LLVM to select CLANG_FLAGS appropriately in lib.mk LLVM := 1 +endif +ifneq ($(LLVM),) +CLANG_LDFLAGS = -fuse-ld=lld +endif
-CLANG_FLAGS = --target=aarch64-linux-gnu CFLAGS_PURECAP = -march=morello+c64 -mabi=purecap -CFLAGS_COMMON = -g -ffreestanding -Wall -Wextra -MMD +CFLAGS_COMMON = -ffreestanding -Wextra -MMD CFLAGS_COMMON += -nostdinc -isystem $(shell $(CC) -print-file-name=include 2>/dev/null) -CFLAGS_CLANG = $(CLANG_FLAGS) -integrated-as -CFLAGS += $(CFLAGS_CLANG) $(CFLAGS_PURECAP) $(CFLAGS_COMMON) -LDFLAGS := -fuse-ld=lld $(CLANG_FLAGS) -nostdlib -static +CFLAGS += $(CLANG_FLAGS) $(CFLAGS_PURECAP) $(CFLAGS_COMMON) +LDFLAGS += $(CLANG_LDFLAGS) $(CLANG_FLAGS) -nostdlib -static
SRCS := $(wildcard *.c) $(wildcard *.S) PROGS := bootstrap clone exit mmap read_write sched signal
On 16/02/2023 07:07, Amit Daniel Kachhap wrote:
Hi All,
This series are modifications in kselftests required to compile with alpha version of Morello Gnu toochain recently released [1].
The whole series can also be found here [2].
Changes in v4 as reviewed by Kevin:
- Patch 5 modified for indentation.
- Patch 9 modified for cleaning up extra header.
- Patch 10 modified to shrink the extra code.
Thanks, Amit Daniel
[2]: git@git.morello-project.org:amitdaniel/linux.git gcc_kselftests_support_v4
Amit Daniel Kachhap (10): kselftests/arm64: morello: Fix inline assembly syntax kselftests/arm64: morello: Define uintcap_t for GCC kselftests/arm64: morello: Fix the -Wcast-function-type warnings kselftests/arm64: morello: Fix the fallthrough warning kselftests/arm64: morello: Avoid limits.h for Gcc kselftests/arm64: morello: clone: Initialize the local variable kselftests/arm64: morello: clone: Guard if clause to avoid warning kselftests/arm64: morello: Process the dynamic relocations kselftests/arm64: morello: Add optimized routine for memcpy kselftests/arm64: morello: Enable Gcc toolchain support
Applied on next, with the "Gcc fixes for clone test" series inserted before the last patch as it seemed more logical. Thanks!
Kevin
.../testing/selftests/arm64/morello/Makefile | 19 +- .../selftests/arm64/morello/bootstrap.c | 7 +- tools/testing/selftests/arm64/morello/clone.c | 6 +- .../selftests/arm64/morello/freestanding.h | 7 + .../arm64/morello/freestanding_init_globals.c | 133 ++++- .../arm64/morello/freestanding_start.S | 5 +- .../selftests/arm64/morello/morello_memcpy.S | 530 ++++++++++++++++++ .../testing/selftests/arm64/morello/signal.c | 4 +- 8 files changed, 689 insertions(+), 22 deletions(-) create mode 100644 tools/testing/selftests/arm64/morello/morello_memcpy.S
linux-morello@op-lists.linaro.org