diff --git a/arch/x86/gen-syscall.bash b/arch/x86/gen-syscall.bash
index 3502ed9..05a7e5c 100755
--- a/arch/x86/gen-syscall.bash
+++ b/arch/x86/gen-syscall.bash
@@ -13,6 +13,12 @@ declare -A x86_64_syscalls=(
 declare -A i386_syscalls=(
 	["mknod"]="14"
 	["mknodat"]="297"
+
+	["stat64"]="195"
+	["lstat64"]="196"
+	["fstat64"]="197"
+	["fstatat64"]="300"
+	["statx"]="383"
 )
 
 prelude=(
diff --git a/arch/x86/syscall.c b/arch/x86/syscall.c
index fc426c9..32d1dbb 100644
--- a/arch/x86/syscall.c
+++ b/arch/x86/syscall.c
@@ -23,8 +23,18 @@ const struct sock_filter syscall_filter[] = {
 	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
 	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 196, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 195, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 197, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
 	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 14, 0, 1),
 	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 383, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 300, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
 	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 297, 0, 1),
 	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
 	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
diff --git a/arch/x86/syscall.h b/arch/x86/syscall.h
index 8ed4914..250c60e 100644
--- a/arch/x86/syscall.h
+++ b/arch/x86/syscall.h
@@ -5,12 +5,17 @@ extern const size_t syscall_filter_length;
 #define BST_NR_mknod 133
 #define BST_NR_mknodat 259
 
+#define BST_NR_lstat64_32 196
+#define BST_NR_stat64_32 195
+#define BST_NR_fstat64_32 197
 #define BST_NR_mknod_32 14
+#define BST_NR_statx_32 383
+#define BST_NR_fstatat64_32 300
 #define BST_NR_mknodat_32 297
 
 #define BST_SECCOMP_32 1
 
 #define BST_NR_MAX 259
-#define BST_NR_MAX32 297
+#define BST_NR_MAX32 383
 
 /* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
diff --git a/main.c b/main.c
index f1fc7ab..35973db 100644
--- a/main.c
+++ b/main.c
@@ -29,6 +29,7 @@
 #include "util.h"
 #include "path.h"
 #include "util.h"
+#include "sec.h"
 
 enum {
 	OPTION_VERSION = 128,
@@ -63,6 +64,10 @@ enum {
 	OPTION_CLOSE_FD,
 	OPTION_CGROUP_DRIVER,
 
+	/* Opt-in feature flags */
+	OPTION_FIX_STAT_32BIT_OVERFLOW,
+
+	/* Opt-out feature flags */
 	OPTION_NO_FAKE_DEVTMPFS,
 	OPTION_NO_DERANDOMIZE,
 	OPTION_NO_PROC_REMOUNT,
@@ -316,6 +321,9 @@ int main(int argc, char *argv[], char *envp[])
 		{ "close-fd", optional_argument, NULL, OPTION_CLOSE_FD },
 		{ "cgroup-driver", required_argument, NULL, OPTION_CGROUP_DRIVER },
 
+		/* Opt-in feature flags */
+		{ "fix-stat-32bit-overflow", no_argument, NULL, OPTION_FIX_STAT_32BIT_OVERFLOW },
+
 		/* Opt-out feature flags */
 		{ "no-copy-hard-rlimits", no_argument, NULL, OPTION_NO_COPY_HARD_RLIMITS },
 		{ "no-fake-devtmpfs", no_argument, NULL, OPTION_NO_FAKE_DEVTMPFS },
@@ -781,6 +789,12 @@ int main(int argc, char *argv[], char *envp[])
 				break;
 			}
 
+			case OPTION_FIX_STAT_32BIT_OVERFLOW:
+			{
+				sec_seccomp_fix_stat_32bit = 1;
+				break;
+			}
+
 			case 'r':
 				opts.root = optarg;
 				break;
diff --git a/man/bst.1.scd b/man/bst.1.scd
index 6195402..38541ad 100644
--- a/man/bst.1.scd
+++ b/man/bst.1.scd
@@ -390,6 +390,23 @@ spacetime process.
 	be useful to pass out-of-band data to the setup program without leaking file
 	descriptors to the spacetime process.
 
+\--fix-stat-32bit-overflow
+	Hijack calls to the stat64 family of system calls and return quantities
+	within 32-bit boundaries.
+
+	On most i686 distributions, glibc implements stat() for programs compiled
+	without -D_FILE_OFFSET_BITS=64 by calling the corresponding stat64 system
+	call, and if any of the 64-bit quantities in the statbuf are larger than
+	2^32-1, the glibc wrapper pretends the file does not exist by returning
+	ENOENT.
+
+	This flag mitigates the issue by pulling the rug out from under glibc and
+	rewriting the quantities to stay within bounds. For timestamps, a fixed
+	date within range is used. For inode numbers, the value is rewritten in a
+	way that aims to keep the (device, inode) pair unique.
+
+	This flag has no effect on programs running with a 64-bit personality.
+
 \--no-copy-hard-rlimits
 	Do not copy hard limit values to soft limits for all resources mentioned
 	above.
diff --git a/sec.c b/sec.c
index 2d394ca..100838f 100644
--- a/sec.c
+++ b/sec.c
@@ -29,6 +29,8 @@
 #include "sec.h"
 #include "util.h"
 
+int sec_seccomp_fix_stat_32bit = 0;
+
 typedef int syscall_handler_func(int, int, struct seccomp_notif *);
 
 enum {
@@ -308,6 +310,284 @@ static int sec__mknodat(int seccomp_fd, int procfd, struct seccomp_notif *req)
 	return sec__mknodat_impl(seccomp_fd, procfd, req, dirfd, pathnameaddr, mode, dev);
 }
 
+/* Arguments and result buffer of an intercepted statx(2) call. */
+struct statx_args {
+	int dirfd;
+	char pathname[PATH_MAX];
+	int flags;
+	unsigned int mask;
+	struct statx statxbuf;
+};
+
+/* Fold a 32-bit device number component into 16 bits.
+   The multiplication is deliberately done on uint32_t: two uint16_t operands
+   would both be promoted to int, and 65535 * 62533 overflows a 32-bit int,
+   which is undefined behavior. */
+static inline uint16_t fold_dev16(uint32_t value)
+{
+	const uint32_t prime16 = 62533;
+	uint32_t folded = (value & 0xffff) * prime16;
+
+	folded ^= value >> 16;
+	return (uint16_t) folded;
+}
+
+/* Run statx(2) and rewrite the result so that it fits the 32-bit stat
+   structures: timestamps are replaced by a fixed date, and oversized inode
+   and device numbers are folded.  Returns 0 on success, -errno on failure. */
+static int do_statx(int dirfd, char *pathname, int flags, unsigned int mask, struct statx *statxbuf)
+{
+	/* We always mock timestamps, so no need to query them. */
+	mask &= ~(STATX_ATIME | STATX_BTIME | STATX_MTIME | STATX_CTIME);
+
+	if (statx(dirfd, pathname, flags, mask, statxbuf) == -1) {
+		return -errno;
+	}
+
+	/* Normalize the timestamps to a fixed 32-bit date. */
+	struct statx_timestamp well_known_date = {
+		.tv_sec = 946728000, /* 2000-01-01 12:00:00 +0000 UTC */
+	};
+
+	statxbuf->stx_atime = well_known_date;
+	statxbuf->stx_btime = well_known_date;
+	statxbuf->stx_mtime = well_known_date;
+	statxbuf->stx_ctime = well_known_date;
+
+	/* Normalize the inode so that it fits in 32-bit space.
+	   There's no good way to solve this perfectly, but a reasonable compromise
+	   that keeps the (dev, ino) pair unique is to move the upper 32-bits into
+	   st_dev. On the 32-bit stat struct however, st_dev is also 32-bit wide,
+	   which means we have to split the upper and lower 16 bits of the upper
+	   32-bits of stx_ino into the minor and major numbers of st_dev
+	   respectively.
+	 */
+	const uint32_t prime32 = 3432918353;
+
+	if (statxbuf->stx_ino > UINT32_MAX) {
+		uint32_t major, minor;
+		minor = (uint32_t)statxbuf->stx_dev_minor * prime32;
+		minor ^= ((statxbuf->stx_ino >> 48) & 0xffff);
+		statxbuf->stx_dev_minor = minor;
+		major = (uint32_t)statxbuf->stx_dev_major * prime32;
+		major ^= ((statxbuf->stx_ino >> 32) & 0xffff);
+		statxbuf->stx_dev_major = major;
+		statxbuf->stx_ino &= 0xffffffff;
+	}
+	if (statxbuf->stx_dev_major > UINT16_MAX) {
+		statxbuf->stx_dev_major = fold_dev16(statxbuf->stx_dev_major);
+	}
+	if (statxbuf->stx_dev_minor > UINT16_MAX) {
+		statxbuf->stx_dev_minor = fold_dev16(statxbuf->stx_dev_minor);
+	}
+	return 0;
+}
+
+/* Callback adapter: performs the statx described by the statx_args cookie. */
+static int sec__statx_callback(int procfd, void *cookie)
+{
+	struct statx_args *args = cookie;
+	return do_statx(args->dirfd, args->pathname, args->flags, args->mask, &args->statxbuf);
+}
+
+/* Handler for the 32-bit statx system call. */
+static int sec__statx(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{
+	int dirfd = req->data.args[0];
+	uintptr_t pathnameaddr = req->data.args[1];
+	int flags = req->data.args[2];
+	unsigned int mask = req->data.args[3];
+	uintptr_t statxbufaddr = req->data.args[4];
+
+	int realdirfd = resolve_dirfd(procfd, dirfd);
+	if (realdirfd < 0) {
+		return realdirfd;
+	}
+
+	struct statx_args args = {
+		.dirfd = realdirfd,
+		.flags = flags,
+		.mask = mask,
+	};
+
+	struct arg_buf in[] = {
+		{
+			.addr = pathnameaddr,
+			.buf = &args.pathname[0],
+			.size = PATH_MAX-1,
+		},
+		{
+			.addr = 0,
+		},
+	};
+
+	struct arg_buf out[] = {
+		{
+			.addr = statxbufaddr,
+			.buf = (char *)&args.statxbuf,
+			.size = sizeof (struct statx),
+		},
+		{
+			.addr = 0,
+		},
+	};
+
+	int rc = run_in_process_context(seccomp_fd, procfd, req, in, out, &args, sec__statx_callback);
+
+	close(realdirfd);
+	return rc;
+}
+
+/* Stat buffer written back to the caller by the stat64 family of handlers.
+   NOTE(review): the field layout matches the x86_64 struct stat; confirm that
+   it is also what 32-bit callers of stat64 expect. */
+struct sec__stat64 {
+	uint64_t dev;
+	uint64_t ino;
+	uint64_t nlink;
+
+	uint32_t mode;
+	uint32_t uid;
+	uint32_t gid;
+	uint32_t __pad0;
+	uint64_t rdev;
+	int64_t size;
+	int64_t blksize;
+	int64_t blocks;
+
+	uint64_t atime;
+	uint64_t atime_nsec;
+	uint64_t mtime;
+	uint64_t mtime_nsec;
+	uint64_t ctime;
+	uint64_t ctime_nsec;
+	int64_t __unused[3];
+};
+
+/* Arguments and result buffer of an intercepted stat64-family call. */
+struct fstatat64_args {
+	int dirfd;
+	char pathname[PATH_MAX];
+	int flags;
+	unsigned int mask;
+	struct sec__stat64 statbuf;
+};
+
+/* Encode a (major, minor) pair as a 64-bit device number, using the same bit
+   layout regardless of the width of the host's dev_t. */
+static inline uint64_t makedev64(uint32_t major, uint32_t minor)
+{
+	/* We can't use makedev() nor dev_t since they are bit-dependent */
+	uint64_t dev;
+	dev = (((uint64_t) (major & 0x00000fffu)) << 8);
+	dev |= (((uint64_t) (major & 0xfffff000u)) << 32);
+	dev |= (((uint64_t) (minor & 0x000000ffu)) << 0);
+	dev |= (((uint64_t) (minor & 0xffffff00u)) << 12);
+	return dev;
+}
+
+/* Callback adapter: stats the file described by the fstatat64_args cookie
+   and converts the sanitized statx result into a struct sec__stat64. */
+static int sec__fstatat64_callback(int procfd, void *cookie)
+{
+	struct fstatat64_args *args = cookie;
+	struct statx statxbuf;
+
+	int rc = do_statx(args->dirfd, args->pathname, args->flags, STATX_BASIC_STATS, &statxbuf);
+	if (rc < 0) {
+		return rc;
+	}
+
+	args->statbuf.dev = makedev64(statxbuf.stx_dev_major, statxbuf.stx_dev_minor);
+	args->statbuf.ino = statxbuf.stx_ino;
+	args->statbuf.nlink = statxbuf.stx_nlink;
+	args->statbuf.mode = statxbuf.stx_mode;
+	args->statbuf.uid = statxbuf.stx_uid;
+	args->statbuf.gid = statxbuf.stx_gid;
+	args->statbuf.rdev = makedev64(statxbuf.stx_rdev_major, statxbuf.stx_rdev_minor);
+	args->statbuf.size = statxbuf.stx_size;
+	args->statbuf.blksize = statxbuf.stx_blksize;
+	args->statbuf.blocks = statxbuf.stx_blocks;
+	args->statbuf.atime = statxbuf.stx_atime.tv_sec;
+	args->statbuf.atime_nsec = statxbuf.stx_atime.tv_nsec;
+	args->statbuf.mtime = statxbuf.stx_mtime.tv_sec;
+	args->statbuf.mtime_nsec = statxbuf.stx_mtime.tv_nsec;
+	args->statbuf.ctime = statxbuf.stx_ctime.tv_sec;
+	args->statbuf.ctime_nsec = statxbuf.stx_ctime.tv_nsec;
+
+	return 0;
+}
+
+/* Shared implementation of the stat64 family of handlers. */
+static int sec__fstatat64_impl(int seccomp_fd, int procfd,
+		struct seccomp_notif *req,
+		int dirfd,
+		uintptr_t pathnameaddr,
+		uintptr_t statbufaddr,
+		int flags)
+{
+	int realdirfd = resolve_dirfd(procfd, dirfd);
+	if (realdirfd < 0) {
+		return realdirfd;
+	}
+
+	struct fstatat64_args args = {
+		.dirfd = realdirfd,
+		.flags = flags,
+	};
+
+	struct arg_buf in[] = {
+		{
+			.addr = pathnameaddr,
+			.buf = &args.pathname[0],
+			.size = PATH_MAX-1,
+		},
+		{
+			.addr = 0,
+		},
+	};
+
+	struct arg_buf out[] = {
+		{
+			.addr = statbufaddr,
+			.buf = (char *)&args.statbuf,
+			.size = sizeof (struct sec__stat64),
+		},
+		{
+			.addr = 0,
+		},
+	};
+
+	int rc = run_in_process_context(seccomp_fd, procfd, req, in, out, &args, sec__fstatat64_callback);
+
+	close(realdirfd);
+	return rc;
+}
+
+/* Handler for stat64(pathname, statbuf). */
+static int sec__stat64(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{
+	return sec__fstatat64_impl(seccomp_fd, procfd, req, AT_FDCWD, req->data.args[0], req->data.args[1], 0);
+}
+
+/* Handler for lstat64(pathname, statbuf): like stat64, without following symlinks. */
+static int sec__lstat64(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{
+	return sec__fstatat64_impl(seccomp_fd, procfd, req, AT_FDCWD, req->data.args[0], req->data.args[1], AT_SYMLINK_NOFOLLOW);
+}
+
+/* Handler for fstat64(fd, statbuf): stats the descriptor itself via AT_EMPTY_PATH. */
+static int sec__fstat64(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{
+	return sec__fstatat64_impl(seccomp_fd, procfd, req, req->data.args[0], 0, req->data.args[1], AT_EMPTY_PATH);
+}
+
+/* Handler for fstatat64(dirfd, pathname, statbuf, flags). */
+static int sec__fstatat64(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{
+	return sec__fstatat64_impl(seccomp_fd, procfd, req, req->data.args[0], req->data.args[1], req->data.args[2], req->data.args[3]);
+}
+
 static int seccomp(unsigned int op, unsigned int flags, void *args)
 {
 	return syscall(__NR_seccomp, op, flags, args);
@@ -350,6 +630,25 @@ static void sec_seccomp_dispatch_syscall(int seccomp_fd,
 #endif
 		[BST_NR_mknodat_32] = sec__mknodat,
 	};
+
+	/* Opt-in: the stat family is only rewritten when the flag was requested. */
+	if (sec_seccomp_fix_stat_32bit) {
+#ifdef BST_NR_stat64_32
+		syscall_table_32[BST_NR_stat64_32] = sec__stat64;
+#endif
+#ifdef BST_NR_lstat64_32
+		syscall_table_32[BST_NR_lstat64_32] = sec__lstat64;
+#endif
+#ifdef BST_NR_fstat64_32
+		syscall_table_32[BST_NR_fstat64_32] = sec__fstat64;
+#endif
+#ifdef BST_NR_fstatat64_32
+		syscall_table_32[BST_NR_fstatat64_32] = sec__fstatat64;
+#endif
+#ifdef BST_NR_statx_32
+		syscall_table_32[BST_NR_statx_32] = sec__statx;
+#endif
+	}
 #endif
 
 	resp->id = req->id;
diff --git a/sec.h b/sec.h
index 1da2ce3..08b3bc9 100644
--- a/sec.h
+++ b/sec.h
@@ -12,4 +12,6 @@
 int sec_seccomp_install_filter(void);
 noreturn void sec_seccomp_supervisor(int);
 
+extern int sec_seccomp_fix_stat_32bit;
+
 #endif /* !SEC_H_ */