#!/bin/bash

# Generates arch/x86/syscall.c and arch/x86/syscall.h next to this script:
# the classic BPF program that intercepts system calls in x86 userspace, and
# the BST_NR_* constants describing which system calls are intercepted.
#
# Filter layout (instruction indices are what the jump offsets refer to):
#   prelude      architecture check, load of the syscall number, x32 fixup
#   x86_64 table one (JEQ, RET USER_NOTIF) pair per syscall, then RET ALLOW
#   i386 table   load of the syscall number, pairs as above, then RET ALLOW

set -euo pipefail

# From asm/unistd_64.h
declare -A x86_64_syscalls=(
	["mknod"]="133"
	["mknodat"]="259"
)

# From asm/unistd_32.h
declare -A i386_syscalls=(
	["mknod"]="14"
	["mknodat"]="297"
)

# Statement templates. @I386_JUMP@ and @NR@ are substituted textually when
# the statements are emitted; nothing here is ever passed to eval.
prelude=(
	# Check that we're running on x86_64 or i386
	'BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch)))'
	'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_I386, @I386_JUMP@, 0)'
	'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0)'
	'BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)'

	# The x32 ABI (not to be confused with the i386 ABI!) uses the
	# same system call numbers as x86_64, but set bit 30. Clear it so we share
	# the same table.
	'BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr)))'
	'BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1)'
	'BPF_STMT(BPF_ALU | BPF_SUB | BPF_K, X32_SYSCALL_BIT)'
)

syscall_jump=(
	'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, @NR@, 0, 1)'
	'BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF)'
)

# Index of the first instruction of the i386 table: everything in the
# prelude, the x86_64 pairs, and the x86_64 table's closing RET ALLOW sit
# before it.
i386_offset=$(( ${#prelude[@]} + ${#syscall_jump[@]} * ${#x86_64_syscalls[@]} + 1 ))

# The AUDIT_ARCH_I386 test is instruction 1 and BPF jumps are relative to the
# next instruction, hence the -2.
i386_jump=$(( i386_offset - 2 ))

# Classic BPF stores conditional jump offsets in 8 bits; refuse to generate
# a filter that would silently wrap instead of jumping to the i386 table.
if (( i386_jump > 255 )); then
	echo "gen-syscall.bash: x86_64 table too large: jump offset $i386_jump does not fit in 8 bits" >&2
	exit 1
fi

# Associative arrays iterate in an unspecified order; sort the names so the
# generated files are reproducible from one bash version to the next.
mapfile -t x86_64_names < <(printf '%s\n' "${!x86_64_syscalls[@]}" | LC_ALL=C sort)
mapfile -t i386_names < <(printf '%s\n' "${!i386_syscalls[@]}" | LC_ALL=C sort)

# Prints one filter statement as an indented, comma-terminated initializer.
emit() {
	printf '\t%s,\n' "$1"
}

# Prints the (JEQ, RET USER_NOTIF) pair for each syscall number given as an
# argument, followed by the RET ALLOW that ends the table.
emit_syscall_table() {
	local nr stmt
	for nr in "$@"; do
		for stmt in "${syscall_jump[@]}"; do
			emit "${stmt//@NR@/$nr}"
		done
	done
	emit 'BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)'
}

# Prints the largest of the numbers given as arguments (0 if none is larger).
max_of() {
	local max=0 value
	for value in "$@"; do
		if (( value > max )); then
			max=$value
		fi
	done
	printf '%s\n' "$max"
}

gen_source() {
	# NOTE(review): the include list below was reconstructed from the
	# symbols the filter uses; confirm it against the committed syscall.c.
	cat <<'EOF'
/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */

#include <stddef.h>
#include <linux/audit.h>
#include <linux/bpf_common.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

/* For the x32 ABI, all system call numbers have bit 30 set */
#define X32_SYSCALL_BIT 0x40000000

const struct sock_filter syscall_filter[] = {
EOF

	local stmt name
	local -a numbers

	for stmt in "${prelude[@]}"; do
		emit "${stmt//@I386_JUMP@/$i386_jump}"
	done

	numbers=()
	for name in "${x86_64_names[@]}"; do
		numbers+=("${x86_64_syscalls[$name]}")
	done
	emit_syscall_table "${numbers[@]}"

	# Entry point of the i386 table (instruction $i386_offset).
	emit 'BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr)))'

	numbers=()
	for name in "${i386_names[@]}"; do
		numbers+=("${i386_syscalls[$name]}")
	done
	emit_syscall_table "${numbers[@]}"

	cat <<'EOF'
};

const size_t syscall_filter_length = sizeof (syscall_filter) / sizeof (struct sock_filter);

/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
EOF
}

gen_header() {
	cat <<'EOF'
/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */

extern const struct sock_filter syscall_filter[];
extern const size_t syscall_filter_length;

EOF

	local name
	for name in "${x86_64_names[@]}"; do
		echo "#define BST_NR_${name} ${x86_64_syscalls[$name]}"
	done

	for name in "${i386_names[@]}"; do
		echo "#define BST_NR_${name}_32 ${i386_syscalls[$name]}"
	done

	local max max32
	max=$(max_of "${x86_64_syscalls[@]}")
	max32=$(max_of "${i386_syscalls[@]}")

	cat <<EOF

#define BST_SECCOMP_32 1

#define BST_NR_MAX $max
#define BST_NR_MAX32 $max32

/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
EOF
}

# Write the outputs next to this script so the result does not depend on the
# directory the script happens to be invoked from.
here=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)

gen_source > "$here/syscall.c"
gen_header > "$here/syscall.h"
BPF_K, 297, 0, 1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), +}; + +const size_t syscall_filter_length = sizeof (syscall_filter) / sizeof (struct sock_filter); + +/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */ diff --git a/arch/x86/syscall.h b/arch/x86/syscall.h new file mode 100644 index 0000000..8ed4914 --- /dev/null +++ b/arch/x86/syscall.h @@ -0,0 +1,16 @@ +/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */ + +extern const struct sock_filter syscall_filter[]; +extern const size_t syscall_filter_length; + +#define BST_NR_mknod 133 +#define BST_NR_mknodat 259 +#define BST_NR_mknod_32 14 +#define BST_NR_mknodat_32 297 + +#define BST_SECCOMP_32 1 + +#define BST_NR_MAX 259 +#define BST_NR_MAX32 297 + +/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */ diff --git a/arch/x86_64 b/arch/x86_64 new file mode 120000 index 0000000..f4bad79 --- /dev/null +++ b/arch/x86_64 @@ -0,0 +1 @@ +x86 \ No newline at end of file diff --git a/capable.h b/capable.h index 92e62e5..7a3d598 100644 --- a/capable.h +++ b/capable.h @@ -20,6 +20,7 @@ # define BST_CAP_SETUID ((uint64_t) 1 << CAP_SETUID) # define BST_CAP_SETGID ((uint64_t) 1 << CAP_SETGID) # define BST_CAP_SYS_CHROOT ((uint64_t) 1 << CAP_SYS_CHROOT) +# define BST_CAP_MKNOD ((uint64_t) 1 << CAP_MKNOD) extern int deny_new_capabilities; diff --git a/config.h.in b/config.h.in index 14292b9..c596351 100644 --- a/config.h.in +++ b/config.h.in @@ -12,8 +12,13 @@ # define LIBEXECDIR "@libexecdir@" # define VERSION "@version@" +#mesondefine ARCH +#mesondefine ARCH_X86 +#mesondefine ARCH_X86_64 + +#mesondefine HAVE_SECCOMP_UNOTIFY +#mesondefine HAVE_SYSTEMD #mesondefine HAVE_SYS_mount_setattr #mesondefine HAVE_close_range -#mesondefine HAVE_SYSTEMD #endif /* !CONFIG_H_ */ diff --git a/enter.c b/enter.c index 2c8357e..64e4106 100644 --- a/enter.c +++ b/enter.c @@ -27,8 +27,10 @@ #include "bst_limits.h" 
#include "capable.h" #include "compat.h" +#include "config.h" #include "enter.h" #include "errutil.h" +#include "fd.h" #include "mount.h" #include "net.h" #include "ns.h" @@ -40,6 +42,10 @@ #include "util.h" #include "fd.h" +#ifdef HAVE_SECCOMP_UNOTIFY +# include "sec.h" +#endif + static inline size_t append_argv(char **argv, size_t argc, char *arg) { if (argc >= ARG_MAX) { @@ -456,6 +462,14 @@ int enter(struct entry_settings *opts) } ns_enter_postfork(namespaces, ns_len); +#ifdef HAVE_SECCOMP_UNOTIFY + int seccomp_fd = sec_seccomp_install_filter(); + if (seccomp_fd != -1) { + send_fd(outer_helper.fd, seccomp_fd); + close(seccomp_fd); + } +#endif + outer_helper_close(&outer_helper); int rtnl = init_rtnetlink_socket(); diff --git a/meson.build b/meson.build index ae645e5..51984f8 100644 --- a/meson.build +++ b/meson.build @@ -51,18 +51,26 @@ if get_option('optimization') != '0' language: ['c']) endif +arch = host_machine.cpu_family() + config = configuration_data() config.set('package', meson.project_name()) config.set('bindir', bindir) config.set('libexecdir', libexecdir) config.set('version', version) +config.set('ARCH', arch) +config.set('ARCH_@0@'.format(arch.to_upper()), 1) + config.set('HAVE_SYS_mount_setattr', cc.has_header_symbol('syscall.h', 'SYS_mount_setattr')) config.set('HAVE_close_range', cc.has_function('close_range')) libdbus = dependency('dbus-1', required: false) config.set('HAVE_SYSTEMD', libdbus.found()) +has_seccomp_unotify = cc.has_header_symbol('linux/seccomp.h', 'SECCOMP_FILTER_FLAG_NEW_LISTENER') +config.set('HAVE_SECCOMP_UNOTIFY', has_seccomp_unotify) + configure_file(input: 'config.h.in', output: 'config.h', configuration: config) bst_init_sources = [ @@ -113,6 +121,14 @@ if libdbus.found() bst_sources += ['cgroup_systemd.c'] endif +if has_seccomp_unotify + bst_sources += [ + 'arch/@0@/syscall.c'.format(arch), + 'proc.c', + 'sec.c', + ] +endif + executable('bst', bst_sources, install: true, dependencies: [libdbus]) if not 
get_option('no-setcap-or-suid') @@ -125,6 +141,7 @@ if not get_option('no-setcap-or-suid') 'cap_sys_admin', 'cap_sys_chroot', 'cap_sys_ptrace', + 'cap_mknod', ], 'bst-unpersist': [ 'cap_sys_admin', diff --git a/outer.c b/outer.c index 5dc402a..860f95e 100644 --- a/outer.c +++ b/outer.c @@ -23,6 +23,7 @@ #include "capable.h" #include "cgroup.h" #include "compat.h" +#include "config.h" #include "enter.h" #include "fd.h" #include "outer.h" @@ -31,6 +32,10 @@ #include "userns.h" #include "util.h" +#ifdef HAVE_SECCOMP_UNOTIFY +# include "sec.h" +#endif + enum { /* This should be enough for defining our mappings. If we assign 340 mappings, and since each line would contain at most @@ -404,7 +409,13 @@ void outer_helper_spawn(struct outer_helper *helper) ssize_t count = write(fd, &ok, sizeof (ok)); assert((ssize_t)(sizeof (ok)) == count); +#ifdef HAVE_SECCOMP_UNOTIFY + int seccomp_fd = recv_fd(fd); + sec_seccomp_supervisor(seccomp_fd); + __builtin_unreachable(); +#else _exit(0); +#endif } void outer_helper_sendpid(const struct outer_helper *helper, pid_t pid) diff --git a/proc.c b/proc.c new file mode 100644 index 0000000..af7f922 --- /dev/null +++ b/proc.c @@ -0,0 +1,31 @@ +/* Copyright © 2022 Arista Networks, Inc. All rights reserved. + * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#include +#include +#include + +#include "proc.h" + +int proc_read_status(int procfd, struct proc_status *out) +{ + memset(out, 0, sizeof (*out)); + + int statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC); + if (statusfd == -1) { + return -1; + } + + FILE *f = fdopen(statusfd, "r"); + + char line[4096]; + while (fgets(line, sizeof (line) - 1, f)) { + sscanf(line, "Umask:\t%o\n", &out->umask); + } + + fclose(f); + return 0; +} diff --git a/proc.h b/proc.h new file mode 100644 index 0000000..c204e6b --- /dev/null +++ b/proc.h @@ -0,0 +1,16 @@ +/* Copyright © 2022 Arista Networks, Inc. All rights reserved. 
+ * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#ifndef PROC_H_ +# define PROC_H_ + +struct proc_status { + mode_t umask; +}; + +int proc_read_status(int procfd, struct proc_status *out); + +#endif /* !PROC_H_ */ diff --git a/sec.c b/sec.c new file mode 100644 index 0000000..2d394ca --- /dev/null +++ b/sec.c @@ -0,0 +1,477 @@ +/* Copyright © 2024 Arista Networks, Inc. All rights reserved. + * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arch.h" +#include "capable.h" +#include "proc.h" +#include "sec.h" +#include "util.h" + +typedef int syscall_handler_func(int, int, struct seccomp_notif *); + +enum { + SYSCALL_HANDLED, + SYSCALL_CONTINUE, +}; + +static int self_mnt_nsfd(void) { + + static int fd = -1; + + if (fd == -1) { + fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC); + if (fd == -1) { + err(1, "open /proc/self/ns/mnt"); + } + } + + return fd; +} + +static int check_seccomp_cookie(int seccomp_fd, __u64 *id) +{ + return ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_ID_VALID, id); +} + +static int resolve_dirfd(int procfd, int dirfd) +{ + int realdirfd = -1; + if (dirfd == AT_FDCWD) { + make_capable(BST_CAP_SYS_PTRACE | BST_CAP_DAC_OVERRIDE); + realdirfd = openat(procfd, "cwd", O_PATH | O_CLOEXEC); + reset_capabilities(); + } else { + char fdpath[PATH_MAX+1]; + if ((size_t) snprintf(fdpath, PATH_MAX, "fd/%d", dirfd) >= sizeof (fdpath)) { + warnx("fd/%d takes more than PATH_MAX bytes.", dirfd); + return -EINVAL; + } + + make_capable(BST_CAP_SYS_PTRACE | BST_CAP_DAC_OVERRIDE); + realdirfd = openat(procfd, fdpath, O_PATH | O_CLOEXEC); + reset_capabilities(); + } + if (realdirfd == -1) { + warn("open"); + return -EINVAL; + } + return realdirfd; +} + 
+struct arg_buf { + uintptr_t addr; + size_t size; + void *buf; +}; + +typedef int runproc_func(int procfd, void *cookie); + +static int run_in_process_context(int seccomp_fd, int procfd, + struct seccomp_notif *req, + struct arg_buf *in, + struct arg_buf *out, + void *cookie, + runproc_func *fn) +{ + int rc = 0; + + make_capable(BST_CAP_SYS_PTRACE | BST_CAP_DAC_OVERRIDE); + + int selfmnt = self_mnt_nsfd(); + int memfd = openat(procfd, "mem", O_RDWR | O_CLOEXEC); + int mntns = openat(procfd, "ns/mnt", O_RDONLY | O_CLOEXEC); + + reset_capabilities(); + + if (memfd == -1) { + warn("open /proc//mem"); + rc = -EINVAL; + goto error_close; + } + + if (mntns == -1) { + warn("open /proc//ns/mnt"); + rc = -EINVAL; + goto error_close; + } + + for (struct arg_buf *a = in; a && a->addr; a++) { + size_t total = 0; + while (total < a->size) { + ssize_t nread = pread(memfd, a->buf, a->size, a->addr); + if (nread == -1) { + warn("pread %lx:%zu", a->addr, a->size); + rc = -EFAULT; + goto error_close; + } + if (nread == 0) { + break; + } + total += nread; + } + a->size = total; + } + + /* Check again that the process is alive and blocked on the syscall. This + handles cases where the syscall got interrupted by a signal handler + and the program state changed before we read the pathname or other + information from proc. 
*/ + + if (check_seccomp_cookie(seccomp_fd, &req->id) == -1) { + rc = -errno; + goto error_close; + } + + make_capable(BST_CAP_SYS_ADMIN | BST_CAP_SYS_CHROOT); + int rc2 = setns(mntns, CLONE_NEWNS); + reset_capabilities(); + + if (rc2 == -1) { + warn("setns"); + rc = -EOPNOTSUPP; + goto error; + } + + if ((rc = fn(procfd, cookie)) == -1) { + goto error; + } + + for (struct arg_buf *a = out; a && a->addr; a++) { + while (a->size > 0) { + ssize_t nwrite = pwrite(memfd, a->buf, a->size, a->addr); + if (nwrite == -1) { + warn("pwrite %lx:%zu", a->addr, a->size); + rc = -EFAULT; + goto error; + } + a->size -= nwrite; + a->addr += nwrite; + } + } + +error: + make_capable(BST_CAP_SYS_ADMIN | BST_CAP_SYS_CHROOT); + rc2 = setns(selfmnt, CLONE_NEWNS); + reset_capabilities(); + + if (rc2 == -1) { + err(1, "setns"); + } + +error_close: + close(mntns); + close(memfd); + return rc; +} + +struct mknodat_args { + int dirfd; + mode_t mode; + dev_t dev; + char pathname[PATH_MAX]; +}; + +static int sec__mknodat_callback(int procfd, void *cookie) +{ + struct mknodat_args *args = cookie; + + struct proc_status status; + if (proc_read_status(procfd, &status) == -1) { + warn("proc_read_status /proc//status"); + return -EINVAL; + } + + mode_t old_umask = umask(status.umask); + + make_capable(BST_CAP_MKNOD); + + int rc = 0; + if (mknodat(args->dirfd, args->pathname, args->mode, args->dev) == -1) { + rc = -errno; + } + + reset_capabilities(); + + if (old_umask != (mode_t) -1) { + umask(old_umask); + } + + return rc; +} + +static int sec__mknodat_impl(int seccomp_fd, int procfd, + struct seccomp_notif *req, + int dirfd, + uintptr_t pathnameaddr, + mode_t mode, + dev_t dev) +{ + if ((mode & S_IFCHR) == 0 || (mode & S_IFBLK) == 0) { + /* Fallthrough for non-privileged operations -- the caller already + has the rights to do this themselves. */ + return SYSCALL_CONTINUE; + } + + /* Is this one of the safe devices? 
*/ + + struct devtype { + mode_t type; + dev_t dev; + }; + + const struct devtype safe_devices[] = { + { .type = S_IFCHR, .dev = makedev(0, 0) }, // whiteout device + { .type = S_IFCHR, .dev = makedev(1, 3) }, // null device + { .type = S_IFCHR, .dev = makedev(1, 5) }, // zero device + { .type = S_IFCHR, .dev = makedev(1, 7) }, // full device + { .type = S_IFCHR, .dev = makedev(1, 8) }, // random device + { .type = S_IFCHR, .dev = makedev(1, 9) }, // urandom device + { .type = S_IFCHR, .dev = makedev(5, 0) }, // tty device + }; + + for (size_t i = 0; i < lengthof(safe_devices); i++) { + if ((mode & S_IFMT) == safe_devices[i].type && dev == safe_devices[i].dev) { + goto safe; + } + } + return SYSCALL_CONTINUE; + +safe: {} + /* The device is safe to create -- perform shenanigans */ + + int realdirfd = resolve_dirfd(procfd, dirfd); + if (realdirfd < 0) { + return realdirfd; + } + + struct mknodat_args args = { + .dirfd = realdirfd, + .dev = dev, + .mode = mode, + }; + + struct arg_buf in[] = { + { + .addr = pathnameaddr, + .buf = &args.pathname[0], + .size = PATH_MAX-1, + }, + { + .addr = 0, + }, + }; + + int rc = run_in_process_context(seccomp_fd, procfd, req, in, NULL, &args, sec__mknodat_callback); + + close(realdirfd); + return rc; +} + +static int sec__mknod(int seccomp_fd, int procfd, struct seccomp_notif *req) +{ + uintptr_t pathnameaddr = req->data.args[0]; + mode_t mode = req->data.args[1]; + dev_t dev = req->data.args[2]; + + return sec__mknodat_impl(seccomp_fd, procfd, req, AT_FDCWD, pathnameaddr, mode, dev); +} + +static int sec__mknodat(int seccomp_fd, int procfd, struct seccomp_notif *req) +{ + int dirfd = req->data.args[0]; + uintptr_t pathnameaddr = req->data.args[1]; + mode_t mode = req->data.args[2]; + dev_t dev = req->data.args[3]; + + return sec__mknodat_impl(seccomp_fd, procfd, req, dirfd, pathnameaddr, mode, dev); +} + +static int seccomp(unsigned int op, unsigned int flags, void *args) +{ + return syscall(__NR_seccomp, op, flags, args); +} + 
+int sec_seccomp_install_filter(void) +{ + struct sock_fprog prog = { + .len = syscall_filter_length, + .filter = (struct sock_filter *)syscall_filter, + }; + + int fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); + if (fd == -1) { + if (errno == EBUSY) { + // We're likely running bst in bst; ignore the error, and return + // a useless file descriptor to pass to the seccomp supervisor + return epoll_create1(EPOLL_CLOEXEC); + } + err(1, "seccomp SECCOMP_SET_MODE_FILTER"); + } + return fd; +} + +static void sec_seccomp_dispatch_syscall(int seccomp_fd, + struct seccomp_notif *req, + struct seccomp_notif_resp *resp) +{ + static syscall_handler_func *const syscall_table[BST_NR_MAX+1] = { +#ifdef BST_NR_mknod + [BST_NR_mknod] = sec__mknod, +#endif + [BST_NR_mknodat] = sec__mknodat, + }; + +#ifdef BST_SECCOMP_32 + syscall_handler_func *syscall_table_32[BST_NR_MAX32+1] = { +#ifdef BST_NR_mknod_32 + [BST_NR_mknod_32] = sec__mknod, +#endif + [BST_NR_mknodat_32] = sec__mknodat, + }; +#endif + + resp->id = req->id; + + syscall_handler_func *const *table = syscall_table; + size_t nr_syscall = lengthof(syscall_table); +#ifdef ARCH_X86_64 +#ifdef BST_SECCOMP_32 + if (req->data.arch == AUDIT_ARCH_I386) { + table = syscall_table_32; + nr_syscall = lengthof(syscall_table_32); + } +#endif + if (req->data.arch == AUDIT_ARCH_X86_64) { + /* x32 system calls are the same as x86_64, except they have bit 30 + * set; we're not making any difference here, so reset it */ + req->data.nr &= ~0x40000000; + } +#endif + + if (req->data.nr <= 0 || (size_t) req->data.nr >= nr_syscall) { + goto fallthrough; + } + syscall_handler_func *fn = table[(size_t) req->data.nr]; + if (!fn) { + goto fallthrough; + } + + char procpath[PATH_MAX+1]; + if ((size_t) snprintf(procpath, PATH_MAX, "/proc/%d", req->pid) >= sizeof (procpath)) { + errx(1, "/proc/%d takes more than PATH_MAX bytes.", req->pid); + } + + int procfd = open(procpath, O_PATH | O_DIRECTORY | O_CLOEXEC); + if (procfd 
== -1) { + if (errno == ENOENT) { + goto fallthrough; + } + err(1, "open"); + } + + int rc = fn(seccomp_fd, procfd, req); + close(procfd); + + if (rc < 0) { + resp->error = rc; + } else if (rc == SYSCALL_CONTINUE) { + goto fallthrough; + } + +send: + if (ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_SEND, resp) == -1) { + // ENOENT is normal -- this means the syscall got interrupted by a + // signal. + if (errno != ENOENT) { + warn("ioctl SECCOMP_IOCTL_NOTIF_SEND"); + } + } + return; + +fallthrough: + resp->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE; + goto send; +} + +noreturn void sec_seccomp_supervisor(int seccomp_fd) +{ + /* Run the seccomp supervisor. This supervisor is a privileged helper + that runs safe syscalls on behalf of the unprivileged child in a + user namespace. + + Use-cases include: + * Allowing mknod on devices deemed "safe", like /dev/null, or the + overlayfs whiteout file. + * Allow devtmpfs mount with our custom bst_devtmpfs logic. + + For now, this is intended to be a blocking loop -- if we need other + long-running agents down the line we might need to consider using + an epoll loop or forking these into other processes. */ + + struct seccomp_notif_sizes sizes; + + if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == -1) + err(1, "seccomp SECCOMP_GET_NOTIF_SIZES"); + + struct seccomp_notif *req = malloc(sizes.seccomp_notif); + if (req == NULL) + err(1, "malloc"); + + /* When allocating the response buffer, we must allow for the fact + that the user-space binary may have been built with user-space + headers where 'struct seccomp_notif_resp' is bigger than the + response buffer expected by the (older) kernel. Therefore, we + allocate a buffer that is the maximum of the two sizes. This + ensures that if the supervisor places bytes into the response + structure that are past the response size that the kernel expects, + then the supervisor is not touching an invalid memory location. 
*/ + + size_t resp_size = sizes.seccomp_notif_resp; + if (sizeof (struct seccomp_notif_resp) > resp_size) + resp_size = sizeof (struct seccomp_notif_resp); + + struct seccomp_notif_resp *resp = malloc(resp_size); + if (resp == NULL) + err(1, "malloc"); + + for (;;) { + memset(req, 0, sizes.seccomp_notif); + memset(resp, 0, resp_size); + + if (ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_RECV, req) == -1) { + switch (errno) { + case EINTR: + continue; + case ENOTTY: + /* seccomp running in seccomp, which is not supported/needed */ + _exit(0); + } + err(1, "ioctl SECCOMP_IOCTL_NOTIF_RECV"); + } + + sec_seccomp_dispatch_syscall(seccomp_fd, req, resp); + } +} + diff --git a/sec.h b/sec.h new file mode 100644 index 0000000..1da2ce3 --- /dev/null +++ b/sec.h @@ -0,0 +1,15 @@ +/* Copyright © 2022 Arista Networks, Inc. All rights reserved. + * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#ifndef SEC_H_ +# define SEC_H_ + +# include + +int sec_seccomp_install_filter(void); +noreturn void sec_seccomp_supervisor(int); + +#endif /* !SEC_H_ */