On 02/05/2023 12:08, Teo Couprie Diaz wrote:
argv and envp strings can be large enough for their bounds to not be exactly representable. During allocation on the stack, align the strings that need an adjusted base and pad if needed for alignment or exact bounds. Handle this eventual padding in fs/binfmt_elf by detecting unexpected 0s.
Add a a helper function to put already created capabilities on the stack during elf setup.
Signed-off-by: Teo Couprie Diaz teo.coupriediaz@arm.com
fs/binfmt_elf.c | 110 +++++++++++++++++++++++++++++++++++++++++++- fs/exec.c | 66 ++++++++++++++++++++++++++ include/linux/elf.h | 4 ++ 3 files changed, 178 insertions(+), 2 deletions(-)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1d82465cb9e9..e5f9c8622c32 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -386,11 +386,74 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, p = mm->arg_end = mm->arg_start; while (argc-- > 0) { size_t len;
if (elf_stack_put_user_ptr(p, sp++))
+#if defined(CONFIG_CHERI_PURECAP_UABI) && (ELF_COMPAT == 0)
uintcap_t for_stack;
char __user *access = uaddr_to_user_ptr_safe(p);
char checked_char;
int adjusted_string = 0;
We can use bool nowadays. The surrounding code won't as it's mostly ancient but in that case it's better not to get too much inspiration from it!
General comment: the variable names are a little confusing. As a rule, a variable name should first and foremost indicate what the variable represents, and then potentially how it's used (if disambiguation is needed). If we consider for_stack for instance, what it is (will be) is a pointer to the string, so we could call it str_ptr for instance; the fact that it will be stored on the stack is not essential to understand what it represents. Also, in the kernel more specifically, short local variable names are encouraged (there's a classic rant on that in the coding style [1]). For instance here I would probably just call checked_char "c", as its function is to store some arbitrary character and it doesn't have a very specific meaning otherwise.
These are just loose guidelines and naming is always a matter of preference to some extent, but broadly following them helps the reader a lot to get an idea of what you're about to do.
[1] https://docs.kernel.org/process/coding-style.html#naming
if (copy_from_user(&checked_char, access, sizeof(char)))
For such single-element reads, it's easier to just use get_user() instead.
return -EFAULT;
/*
* If right after the end of the previous argument we have a zero,
* that means the string start has been adjusted to allow exact bounds
* for the controlling capability.
* Find the real start of the string and go from there.
* It should already be properly aligned for exact bounds.
*/
if (checked_char == '\0') {
size_t pad_len = 0;
adjusted_string = 1;
while (pad_len < MAX_ARG_STRLEN && checked_char == '\0') {
access++;
pad_len++;
if (copy_from_user(&checked_char, access, sizeof(char)))
return -EFAULT;
}
p += pad_len;
}
len = strnlen_user(uaddr_to_user_ptr_safe(p), MAX_ARG_STRLEN);
We can use access here instead of deriving another capability (all these uses of uaddr_to_user_ptr_safe() will eventually go anyway).
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
if (!adjusted_string &&
copy_from_user(&checked_char, access+len, sizeof(char)))
return -EFAULT;
/*
* Check if the next arg string starts right after the current one.
* If not, the len was not exactly representable and there is padding.
*/
if (!adjusted_string && checked_char == '\0')
adjusted_string = 1;
/*
* If the string has been adjusted for exact bounds representation,
* its start should already be properly aligned. Get the representable
* length by using the same length that was used during allocation:
* the length of the original string.
* This takes into account the padding due to length change after
* the string, but not that for alignment. Thus we might not end up
* at the start of the next arg. If not, it will take the slow path
* above.
*/
if (adjusted_string)
len = __builtin_cheri_round_representable_length(len);
for_stack = cheri_build_user_cap(p, len,
(CHERI_PERM_GLOBAL | CHERI_PERM_STORE | CHERI_PERM_LOAD));
if (elf_stack_put_user_cap(for_stack, sp++)) return -EFAULT;
+#else len = strnlen_user(uaddr_to_user_ptr_safe(p), MAX_ARG_STRLEN); if (!len || len > MAX_ARG_STRLEN) return -EINVAL;
if (elf_stack_put_user_ptr(p, sp++))
return -EFAULT;
+#endif p += len; } if (elf_stack_put_user(0, sp++)) @@ -404,11 +467,54 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, mm->env_end = mm->env_start = p; while (envc-- > 0) { size_t len;
if (elf_stack_put_user_ptr(p, sp++))
+#if defined(CONFIG_CHERI_PURECAP_UABI) && (ELF_COMPAT == 0)
uintcap_t for_stack;
char __user *access = uaddr_to_user_ptr_safe(p);
char checked_char;
int adjusted_string = 0;
if (copy_from_user(&checked_char, access, sizeof(char)))
return -EFAULT;
if (checked_char == '\0') {
size_t pad_len = 0;
adjusted_string = 1;
while (pad_len < MAX_ARG_STRLEN && checked_char == '\0') {
access++;
pad_len++;
if (copy_from_user(&checked_char, access, sizeof(char)))
return -EFAULT;
}
p += pad_len;
}
len = strnlen_user(uaddr_to_user_ptr_safe(p), MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
if (!adjusted_string &&
copy_from_user(&checked_char, access+len, sizeof(char)))
return -EFAULT;
if (!adjusted_string && checked_char == '\0')
adjusted_string = 1;
if (adjusted_string)
len = __builtin_cheri_round_representable_length(len);
for_stack = cheri_build_user_cap(p, len,
(CHERI_PERM_GLOBAL | CHERI_PERM_STORE | CHERI_PERM_LOAD));
if (elf_stack_put_user_cap(for_stack, sp++)) return -EFAULT;
+#else
As this is exactly the same as for argv, we should really introduce a helper function, which would also be a nice way to hide the #ifdef'ing.
len = strnlen_user(uaddr_to_user_ptr_safe(p), MAX_ARG_STRLEN); if (!len || len > MAX_ARG_STRLEN) return -EINVAL;
if (elf_stack_put_user_ptr(p, sp++))
return -EFAULT;
+#endif p += len; } if (elf_stack_put_user(0, sp++)) diff --git a/fs/exec.c b/fs/exec.c index f5782fd96fcb..cf69e7798083 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -530,15 +530,48 @@ static int copy_strings(int argc, struct user_arg_ptr argv, const char __user *str; int len; unsigned long pos; +#if defined(CONFIG_CHERI_PURECAP_UABI)
All the changes need to be PCuABI-only but also native-only, so similarly to binfmt_elf.c we need to check if we are in compat or not, as copy_strings() is used in both cases. Looks like argv.is_compat can be used for that purpose (if CONFIG_COMPAT is defined).
size_t arg_len, str_len, pad_len = 0;
size_t repr_align;
To avoid having to think about representation changes, better use the same type as len (changing the type of either len or these new variables).
+#endif ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (USER_PTR_IS_ERR(str)) goto out; +#if defined(CONFIG_CHERI_PURECAP_UABI)
str_len = strnlen_user(str, MAX_ARG_STRLEN);
if (!str_len)
goto out;
arg_len = __builtin_cheri_round_representable_length(str_len);
<cheriintrin.h> provides a wrapper, cheri_representable_length(). Include <linux/cheri.h> to pull it in.
if (arg_len < str_len) {
Considering that str_len is at most MAX_ARG_STRLEN (32 pages), such a situation would clearly be a bug in the RRLEN instruction itself, which is not something we normally try to handle. I think it is safe to assume that arg_len is the same as str_len if it is not greater.
pr_warn_once("exec %s: Argument overflowed CHERI representable length, not changing it.",
bprm->filename);
/* This should be bigger than MAX_ARG_STRLEN anyway, fail later. */
arg_len = str_len;
}
if (arg_len > str_len) {
repr_align = ~__builtin_cheri_representable_alignment_mask(str_len) + 1;
len = bprm->p - ALIGN_DOWN(bprm->p - arg_len, repr_align);
As discussed offline, there is an interesting issue with the fact that bprm->p is yet to be relocated by setup_arg_pages() at this point. This means that the alignment calculations we are making are not based on the final addresses. Although the maximum string size is 32 pages, and the shift is a multiple of the page size by definition, there is no absolute guarantee that the address range for each string remain representable after relocation (for Morello specifically, it looks like this may cause problems for 64K pages).
To be on the safe side, I think the best is to ensure that the shift is a multiple of MAX_ARG_STRLEN in setup_arg_pages(). It's worth noting that we will eventually move all these strings out of the stack [1], but the problem remains the same if we keep randomising the address of this separate mapping.
[1] https://git.morello-project.org/morello/kernel/linux/-/issues/22
/*
* We want the capability base to be aligned. As we work from the
* last to the first argument, we can place the start of the string
* to be aligned for representability.
* All padding then goes after it, both for the representable
* length and for the alignment, to fill the space we left before
* the previous argument we put on the stack.
*/
pad_len = len - str_len;
} else {
len = str_len;
}
+#else len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; +#endif ret = -E2BIG; if (!valid_arg_len(bprm, len)) @@ -546,7 +579,11 @@ static int copy_strings(int argc, struct user_arg_ptr argv, /* We're going to work our way backwards. */ pos = bprm->p; +#if defined(CONFIG_CHERI_PURECAP_UABI)
str += str_len;
+#else str += len; +#endif bprm->p -= len; #ifdef CONFIG_MMU if (bprm->p < bprm->argmin) @@ -555,6 +592,9 @@ static int copy_strings(int argc, struct user_arg_ptr argv, while (len > 0) { int offset, bytes_to_copy; +#if defined(CONFIG_CHERI_PURECAP_UABI)
int doing_padding = 0;
+#endif if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; @@ -566,13 +606,30 @@ static int copy_strings(int argc, struct user_arg_ptr argv, if (offset == 0) offset = PAGE_SIZE; +#if defined(CONFIG_CHERI_PURECAP_UABI) bytes_to_copy = offset;
/* Don't complicate things: only put padding, then only the arg. */
if (pad_len > 0 && bytes_to_copy > pad_len)
bytes_to_copy = pad_len;
if (pad_len == 0 && str_len > 0 && bytes_to_copy > str_len)
bytes_to_copy = str_len;
+#endif if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; +#if defined(CONFIG_CHERI_PURECAP_UABI)
if (pad_len > 0) {
doing_padding = 1;
pad_len -= bytes_to_copy;
} else if (str_len > 0) {
str_len -= bytes_to_copy;
str -= bytes_to_copy;
}
+#else str -= bytes_to_copy; +#endif len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { @@ -594,10 +651,19 @@ static int copy_strings(int argc, struct user_arg_ptr argv, kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } +#if defined(CONFIG_CHERI_PURECAP_UABI)
if (!doing_padding) {
if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
ret = -EFAULT;
goto out;
}
}
+#else
Maybe I'm missing something obvious, but do we really need that many changes in the loop body? Since we do not actually have to write any padding (the pages are already zeroed), I was expecting we'd just skip the padding after the string before entering the loop, and then adjusting bprm->p as needed after the loop to account for the padding before the string.
Considering the need for an #ifdef (+ compat check) for every block, regrouping changes helps a lot with readability.
if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; }
+#endif } } ret = 0; diff --git a/include/linux/elf.h b/include/linux/elf.h index 039ad1867045..7bb2b39944bd 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -68,6 +68,10 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_uaddr_to_user_ptr(addr) uaddr_to_user_ptr_safe(addr) #define elf_copy_to_user_stack(to, from, len) copy_to_user_with_ptr(to, from, len) #ifdef CONFIG_CHERI_PURECAP_UABI +#define elf_stack_put_user_cap(cap, ptr) \
morello_put_user_cap(cap, \
(void * __capability * __capability)ptr)
Just use put_user_ptr() directly, and make for_stack a void __user *. The code we're adding is PCuABI-specific so there is no need to add this sort of abstraction.
Overall that patch is a very good start, well done! Some refactoring is desirable but the foundations look solid.
Kevin
#define elf_stack_put_user_ptr(val, ptr) \ put_user_ptr(elf_uaddr_to_user_ptr(val), \ (void * __capability * __capability)ptr)