Skip to content

Commit

Permalink
seccomp: add syscall emulation for safe syscalls, like mknod of /dev/…
Browse files Browse the repository at this point in the history
…null devices.
  • Loading branch information
Snaipe committed Nov 4, 2024
1 parent 2461d3d commit 67dc346
Show file tree
Hide file tree
Showing 14 changed files with 791 additions and 1 deletion.
21 changes: 21 additions & 0 deletions arch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/* Copyright © 2024 Arista Networks, Inc. All rights reserved.
*
* Use of this source code is governed by the MIT license that can be found
* in the LICENSE file.
*/

#ifndef ARCH_H_
# define ARCH_H_

# include "config.h"

# define ARCH_STR_(x) #x
# define ARCH_STR(x) ARCH_STR_(x)

/* *INDENT-OFF* - formatters try to add spaces here */
# define ARCH_HEADER_BASE arch/ARCH
/* *INDENT-ON* */

# include ARCH_STR(ARCH_HEADER_BASE/syscall.h)

#endif /* !ARCH_H_ */
130 changes: 130 additions & 0 deletions arch/x86/gen-syscall.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/bin/bash

# This script generates the classic BPF program to intercept system calls
# in x86 userspace.

# From asm/unistd_64.h
declare -A x86_64_syscalls=(
["mknod"]="133"
["mknodat"]="259"
)

# From asm/unistd_32.h
declare -A i386_syscalls=(
["mknod"]="14"
["mknodat"]="297"
)

prelude=(
# Check that we're running on x86_64 or i386
'BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch)))'
'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_I386, $(($i386_offset-2)), 0)'
'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0)'
'BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)'

# The x32 ABI (not to be confused with the i386 ABI!) uses the
# same system call numbers as x86_64, but set bit 30. Clear it so we share
# the same table.
'BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr)))'
'BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1)'
'BPF_STMT(BPF_ALU | BPF_SUB | BPF_K, X32_SYSCALL_BIT)'
)

syscall_jump=(
'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, $nr, 0, 1)'
'BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF)'
)

i386_offset=$((${#prelude[@]} + ${#syscall_jump[@]}*${#x86_64_syscalls[@]} + 1))

# NOTE: indentation is done with tabs. Do not use spaces, do not remove tabs,
# lest you break all HEREDOCs.

gen_source() {
cat <<-EOF
/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
#include <stddef.h>
#include <linux/audit.h>
#include <linux/bpf_common.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
/* For the x32 ABI, all system call numbers have bit 30 set */
#define X32_SYSCALL_BIT 0x40000000
const struct sock_filter syscall_filter[] = {
EOF

for stmt in "${prelude[@]}"; do
eval "echo $'\t'\"$stmt\","
done

for syscall in "${!x86_64_syscalls[@]}"; do
nr=${x86_64_syscalls[$syscall]}
for stmt in "${syscall_jump[@]}"; do
eval "echo $'\t'\"$stmt\","
done
done

echo $'\t''BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),'
echo $'\t''BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),'

for syscall in "${!i386_syscalls[@]}"; do
nr=${i386_syscalls[$syscall]}
for stmt in "${syscall_jump[@]}"; do
eval "echo $'\t'\"$stmt\","
done
done

echo $'\t''BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),'

cat <<-EOF
};
const size_t syscall_filter_length = sizeof (syscall_filter) / sizeof (struct sock_filter);
/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
EOF
}

gen_header() {
cat <<-EOF
/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
extern const struct sock_filter syscall_filter[];
extern const size_t syscall_filter_length;
EOF

for syscall in "${!x86_64_syscalls[@]}"; do
echo "#define BST_NR_${syscall} ${x86_64_syscalls[$syscall]}"
done

for syscall in "${!i386_syscalls[@]}"; do
echo "#define BST_NR_${syscall}_32 ${i386_syscalls[$syscall]}"
done

max=0
for syscall in "${!x86_64_syscalls[@]}"; do
(( ${x86_64_syscalls[$syscall]} > max )) && max=${x86_64_syscalls[$syscall]}
done

max32=0
for syscall in "${!i386_syscalls[@]}"; do
(( ${i386_syscalls[$syscall]} > max32 )) && max32=${i386_syscalls[$syscall]}
done

cat <<-EOF
#define BST_SECCOMP_32 1
#define BST_NR_MAX $max
#define BST_NR_MAX32 $max32
/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
EOF
}

gen_source > arch/x86/syscall.c
gen_header > arch/x86/syscall.h
35 changes: 35 additions & 0 deletions arch/x86/syscall.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */

#include <stddef.h>
#include <linux/audit.h>
#include <linux/bpf_common.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

/* For the x32 ABI, all system call numbers have bit 30 set */
#define X32_SYSCALL_BIT 0x40000000

const struct sock_filter syscall_filter[] = {
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_I386, 10, 0),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1),
BPF_STMT(BPF_ALU | BPF_SUB | BPF_K, X32_SYSCALL_BIT),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 133, 0, 1),
BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 259, 0, 1),
BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 14, 0, 1),
BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 297, 0, 1),
BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
};

const size_t syscall_filter_length = sizeof (syscall_filter) / sizeof (struct sock_filter);

/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
16 changes: 16 additions & 0 deletions arch/x86/syscall.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */

extern const struct sock_filter syscall_filter[];
extern const size_t syscall_filter_length;

#define BST_NR_mknod 133
#define BST_NR_mknodat 259
#define BST_NR_mknod_32 14
#define BST_NR_mknodat_32 297

#define BST_SECCOMP_32 1

#define BST_NR_MAX 259
#define BST_NR_MAX32 297

/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
1 change: 1 addition & 0 deletions arch/x86_64
1 change: 1 addition & 0 deletions capable.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# define BST_CAP_SETUID ((uint64_t) 1 << CAP_SETUID)
# define BST_CAP_SETGID ((uint64_t) 1 << CAP_SETGID)
# define BST_CAP_SYS_CHROOT ((uint64_t) 1 << CAP_SYS_CHROOT)
# define BST_CAP_MKNOD ((uint64_t) 1 << CAP_MKNOD)

extern int deny_new_capabilities;

Expand Down
7 changes: 6 additions & 1 deletion config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
# define LIBEXECDIR "@libexecdir@"
# define VERSION "@version@"

#mesondefine ARCH
#mesondefine ARCH_X86
#mesondefine ARCH_X86_64

#mesondefine HAVE_SECCOMP_UNOTIFY
#mesondefine HAVE_SYSTEMD
#mesondefine HAVE_SYS_mount_setattr
#mesondefine HAVE_close_range
#mesondefine HAVE_SYSTEMD

#endif /* !CONFIG_H_ */
14 changes: 14 additions & 0 deletions enter.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
#include "bst_limits.h"
#include "capable.h"
#include "compat.h"
#include "config.h"
#include "enter.h"
#include "errutil.h"
#include "fd.h"
#include "mount.h"
#include "net.h"
#include "ns.h"
Expand All @@ -40,6 +42,10 @@
#include "util.h"
#include "fd.h"

#ifdef HAVE_SECCOMP_UNOTIFY
# include "sec.h"
#endif

static inline size_t append_argv(char **argv, size_t argc, char *arg)
{
if (argc >= ARG_MAX) {
Expand Down Expand Up @@ -456,6 +462,14 @@ int enter(struct entry_settings *opts)
}
ns_enter_postfork(namespaces, ns_len);

#ifdef HAVE_SECCOMP_UNOTIFY
int seccomp_fd = sec_seccomp_install_filter();
if (seccomp_fd != -1) {
send_fd(outer_helper.fd, seccomp_fd);
close(seccomp_fd);
}
#endif

outer_helper_close(&outer_helper);

int rtnl = init_rtnetlink_socket();
Expand Down
17 changes: 17 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,26 @@ if get_option('optimization') != '0'
language: ['c'])
endif

arch = host_machine.cpu_family()

config = configuration_data()
config.set('package', meson.project_name())
config.set('bindir', bindir)
config.set('libexecdir', libexecdir)
config.set('version', version)

config.set('ARCH', arch)
config.set('ARCH_@0@'.format(arch.to_upper()), 1)

config.set('HAVE_SYS_mount_setattr', cc.has_header_symbol('syscall.h', 'SYS_mount_setattr'))
config.set('HAVE_close_range', cc.has_function('close_range'))

libdbus = dependency('dbus-1', required: false)
config.set('HAVE_SYSTEMD', libdbus.found())

has_seccomp_unotify = cc.has_header_symbol('linux/seccomp.h', 'SECCOMP_FILTER_FLAG_NEW_LISTENER')
config.set('HAVE_SECCOMP_UNOTIFY', has_seccomp_unotify)

configure_file(input: 'config.h.in', output: 'config.h', configuration: config)

bst_init_sources = [
Expand Down Expand Up @@ -113,6 +121,14 @@ if libdbus.found()
bst_sources += ['cgroup_systemd.c']
endif

if has_seccomp_unotify
bst_sources += [
'arch/@0@/syscall.c'.format(arch),
'proc.c',
'sec.c',
]
endif

executable('bst', bst_sources, install: true, dependencies: [libdbus])

if not get_option('no-setcap-or-suid')
Expand All @@ -125,6 +141,7 @@ if not get_option('no-setcap-or-suid')
'cap_sys_admin',
'cap_sys_chroot',
'cap_sys_ptrace',
'cap_mknod',
],
'bst-unpersist': [
'cap_sys_admin',
Expand Down
11 changes: 11 additions & 0 deletions outer.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "capable.h"
#include "cgroup.h"
#include "compat.h"
#include "config.h"
#include "enter.h"
#include "fd.h"
#include "outer.h"
Expand All @@ -31,6 +32,10 @@
#include "userns.h"
#include "util.h"

#ifdef HAVE_SECCOMP_UNOTIFY
# include "sec.h"
#endif

enum {
/* This should be enough for defining our mappings. If we assign
340 mappings, and since each line would contain at most
Expand Down Expand Up @@ -404,7 +409,13 @@ void outer_helper_spawn(struct outer_helper *helper)
ssize_t count = write(fd, &ok, sizeof (ok));
assert((ssize_t)(sizeof (ok)) == count);

#ifdef HAVE_SECCOMP_UNOTIFY
int seccomp_fd = recv_fd(fd);
sec_seccomp_supervisor(seccomp_fd);
__builtin_unreachable();
#else
_exit(0);
#endif
}

void outer_helper_sendpid(const struct outer_helper *helper, pid_t pid)
Expand Down
31 changes: 31 additions & 0 deletions proc.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
*
* Use of this source code is governed by the MIT license that can be found
* in the LICENSE file.
*/

#include <fcntl.h>
#include <stdio.h>
#include <string.h>

#include "proc.h"

int proc_read_status(int procfd, struct proc_status *out)
{
memset(out, 0, sizeof (*out));

int statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC);
if (statusfd == -1) {
return -1;
}

FILE *f = fdopen(statusfd, "r");

char line[4096];
while (fgets(line, sizeof (line) - 1, f)) {
sscanf(line, "Umask:\t%o\n", &out->umask);
}

fclose(f);
return 0;
}
Loading

0 comments on commit 67dc346

Please sign in to comment.