[lxc-devel] [lxd/master] [RFC]: seccomp: enable unprivileged bpf through syscall interception
brauner on Github
lxc-bot at linuxcontainers.org
Thu Aug 6 15:57:36 UTC 2020
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 364 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200806/00c6c414/attachment-0001.bin>
-------------- next part --------------
From 94bcac3febf6790aa9a292ee1bb91d12579e73e1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Thu, 6 Aug 2020 11:16:19 +0200
Subject: [PATCH] [RFC]: seccomp: enable unprivileged bpf through syscall
interception
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
lxd/daemon.go | 8 +
lxd/include/lxd_seccomp.h | 23 ++
lxd/include/syscall_numbers.h | 35 +++
lxd/instance/drivers/driver_lxc.go | 11 +
lxd/instance/instance_interface.go | 1 +
lxd/main_checkfeature.go | 128 ++++++++++-
lxd/seccomp/seccomp.go | 336 ++++++++++++++++++++++++-----
lxd/sys/os.go | 1 +
shared/instance.go | 30 +--
9 files changed, 502 insertions(+), 71 deletions(-)
diff --git a/lxd/daemon.go b/lxd/daemon.go
index 6ce21a1c19..2d62378164 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -631,6 +631,7 @@ func (d *Daemon) init() error {
"pidfd",
"seccomp_allow_deny_syntax",
"devpts_fd",
+ "seccomp_proxy_send_notify_fd",
}
for _, extension := range lxcExtensions {
d.os.LXCFeatures[extension] = liblxc.HasApiExtension(extension)
@@ -675,6 +676,13 @@ func (d *Daemon) init() error {
logger.Infof(" - seccomp listener continue syscalls: no")
}
+ if canUseSeccompListenerAddfd() && d.os.LXCFeatures["seccomp_proxy_send_notify_fd"] {
+ d.os.SeccompListenerAddfd = true
+ logger.Infof(" - seccomp listener add file descriptors: yes")
+ } else {
+ logger.Infof(" - seccomp listener add file descriptors: no")
+ }
+
if d.os.LXCFeatures["devpts_fd"] && canUseNativeTerminals() {
d.os.NativeTerminals = true
logger.Infof(" - safe native terminal allocation : yes")
diff --git a/lxd/include/lxd_seccomp.h b/lxd/include/lxd_seccomp.h
index 242347e3e1..976947e4bc 100644
--- a/lxd/include/lxd_seccomp.h
+++ b/lxd/include/lxd_seccomp.h
@@ -65,4 +65,27 @@ struct seccomp_notif_sizes {
struct seccomp_notif_resp)
#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64)
#endif
+
+#ifndef SECCOMP_IOCTL_NOTIF_ADDFD
+#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, struct seccomp_notif_addfd)
+
+/* valid flags for seccomp_notif_addfd */
+#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
+
+/**
+ * struct seccomp_notif_addfd
+ * @id: The ID of the seccomp notification
+ * @flags: SECCOMP_ADDFD_FLAG_*
+ * @srcfd: The local fd number
+ * @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
+ * @newfd_flags: The O_* flags the remote FD should have applied
+ */
+struct seccomp_notif_addfd {
+ __u64 id;
+ __u32 flags;
+ __u32 srcfd;
+ __u32 newfd;
+ __u32 newfd_flags;
+};
+#endif
#endif /* LXD_SECCOMP_H */
diff --git a/lxd/include/syscall_numbers.h b/lxd/include/syscall_numbers.h
index f953a26911..269b8c795b 100644
--- a/lxd/include/syscall_numbers.h
+++ b/lxd/include/syscall_numbers.h
@@ -74,4 +74,39 @@
#endif
#endif
+#ifndef __NR_bpf /* fallback bpf(2) syscall numbers for old libc headers */
+ #if defined __i386__
+ #define __NR_bpf 357
+ #elif defined __x86_64__
+ #define __NR_bpf 321
+ #elif defined __arm__
+ #define __NR_bpf 386
+ #elif defined __aarch64__
+ #define __NR_bpf 280 /* arm64 uses the asm-generic table; 386 is the 32-bit ARM number */
+ #elif defined __s390__
+ #define __NR_bpf 351
+ #elif defined __powerpc__
+ #define __NR_bpf 361
+ #elif defined __riscv
+ #define __NR_bpf 280
+ #elif defined __sparc__
+ #define __NR_bpf 349
+ #elif defined __ia64__
+ #define __NR_bpf (317 + 1024)
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_bpf 4355
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_bpf 6319
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_bpf 5315
+ #endif
+ #else
+ #define __NR_bpf -1 /* unknown architecture: sentinel value, see warning below */
+ #warning "__NR_bpf not defined for your architecture"
+ #endif
+#endif
+
#endif /* __LXD_SYSCALL_NUMBERS_H */
diff --git a/lxd/instance/drivers/driver_lxc.go b/lxd/instance/drivers/driver_lxc.go
index 6b3de9a40e..aa1f0f52e0 100644
--- a/lxd/instance/drivers/driver_lxc.go
+++ b/lxd/instance/drivers/driver_lxc.go
@@ -6665,6 +6665,17 @@ func (c *lxc) DevptsFd() (*os.File, error) {
return c.c.DevptsFd()
}
+// SeccompNotifyFd returns seccomp notify fd of the container.
+func (c *lxc) SeccompNotifyFd() (*os.File, error) {
+ // Load the go-lxc struct
+ err := c.initLXC(false)
+ if err != nil {
+ return nil, err
+ }
+
+ return c.c.SeccompNotifyFd()
+}
+
// LocalConfig returns local config.
func (c *lxc) LocalConfig() map[string]string {
return c.localConfig
diff --git a/lxd/instance/instance_interface.go b/lxd/instance/instance_interface.go
index e03db25bfb..c8dbf1cd5d 100644
--- a/lxd/instance/instance_interface.go
+++ b/lxd/instance/instance_interface.go
@@ -148,6 +148,7 @@ type Container interface {
ConsoleLog(opts liblxc.ConsoleLogOptions) (string, error)
InsertSeccompUnixDevice(prefix string, m deviceConfig.Device, pid int) error
DevptsFd() (*os.File, error)
+ SeccompNotifyFd() (*os.File, error)
}
// CriuMigrationArgs arguments for CRIU migration.
diff --git a/lxd/main_checkfeature.go b/lxd/main_checkfeature.go
index 01995462b9..77a7f53738 100644
--- a/lxd/main_checkfeature.go
+++ b/lxd/main_checkfeature.go
@@ -293,6 +293,127 @@ static void is_user_notification_continue_aware(void)
seccomp_notify_aware = 2;
}
+__noreturn static void __do_user_notification_addfd(void)
+{
+ __do_close int listener = -EBADF;
+ pid_t pid;
+ int ret;
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+ struct seccomp_notif_addfd addfd = {};
+ struct pollfd pollfd;
+
+ listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ if (listener < 0)
+ _exit(EXIT_FAILURE);
+
+ pid = fork();
+ if (pid < 0)
+ _exit(EXIT_FAILURE);
+
+ if (pid == 0) {
+ int dup_fd, pipe_fds[2];
+ pid_t self;
+
+ // Don't bother cleaning up. On child exit all of those
+ // will be closed anyway.
+ ret = pipe(pipe_fds);
+ if (ret < 0)
+ _exit(EXIT_FAILURE);
+
+ // O_CLOEXEC doesn't matter as we're in the child and we're
+ // not going to exec.
+ dup_fd = dup(pipe_fds[0]);
+ if (dup_fd < 0)
+ _exit(EXIT_FAILURE);
+
+ self = getpid();
+
+ ret = filecmp(self, self, pipe_fds[0], dup_fd);
+ if (ret)
+ _exit(EXIT_FAILURE);
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ pollfd.fd = listener;
+ pollfd.events = POLLIN | POLLOUT;
+
+ ret = poll(&pollfd, 1, 5000);
+ if (ret <= 0)
+ goto cleanup_sigkill;
+
+ if (!(pollfd.revents & POLLIN))
+ goto cleanup_sigkill;
+
+ ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
+ if (ret)
+ goto cleanup_sigkill;
+
+ pollfd.fd = listener;
+ pollfd.events = POLLIN | POLLOUT;
+
+ ret = poll(&pollfd, 1, 5000);
+ if (ret <= 0)
+ goto cleanup_sigkill;
+
+ if (!(pollfd.revents & POLLOUT))
+ goto cleanup_sigkill;
+
+ if (req.data.nr != __NR_dup)
+ goto cleanup_sigkill;
+
+ addfd.srcfd = 3;
+ addfd.id = req.id;
+ addfd.flags = 0;
+
+ // Inject the fd into the task.
+ ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+ if (ret < 0)
+ goto cleanup_sigkill;
+ close(ret);
+
+ resp.id = req.id;
+ resp.flags |= SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+ ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+ resp.error = -EPERM;
+ resp.flags = 0;
+ if (ret) {
+ ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+ goto cleanup_sigkill;
+ }
+
+cleanup_wait:
+ ret = wait_for_pid(pid);
+ if (ret)
+ _exit(EXIT_FAILURE);
+ _exit(EXIT_SUCCESS);
+
+cleanup_sigkill:
+ kill(pid, SIGKILL);
+ goto cleanup_wait;
+}
+
+static void is_user_notification_addfd_aware(void)
+{
+ int ret;
+ pid_t pid;
+
+ pid = fork();
+ if (pid < 0)
+ return;
+
+ if (pid == 0) {
+ __do_user_notification_addfd();
+ // Should not be reached.
+ _exit(EXIT_FAILURE);
+ }
+
+ ret = wait_for_pid(pid);
+ if (!ret)
+ seccomp_notify_aware = 3;
+}
+
static void is_seccomp_notify_aware(void)
{
__u32 action[] = { SECCOMP_RET_USER_NOTIF };
@@ -300,6 +421,8 @@ static void is_seccomp_notify_aware(void)
if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action[0]) == 0) {
seccomp_notify_aware = 1;
is_user_notification_continue_aware();
+ if (seccomp_notify_aware == 2)
+ is_user_notification_addfd_aware();
}
}
@@ -403,9 +526,12 @@ func canUseSeccompListener() bool {
}
func canUseSeccompListenerContinue() bool {
- return bool(C.seccomp_notify_aware == 2)
+ return bool(C.seccomp_notify_aware >= 2)
}
+func canUseSeccompListenerAddfd() bool {
+ return bool(C.seccomp_notify_aware == 3)
+}
func canUsePidFds() bool {
return bool(C.pidfd_aware)
}
diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go
index 52e88049a1..d1d6e40dfe 100644
--- a/lxd/seccomp/seccomp.go
+++ b/lxd/seccomp/seccomp.go
@@ -15,6 +15,8 @@ import (
"strings"
"unsafe"
+ "github.com/pkg/errors"
+
"golang.org/x/sys/unix"
liblxc "gopkg.in/lxc/go-lxc.v2"
@@ -41,6 +43,7 @@ import (
#include <elf.h>
#include <errno.h>
#include <fcntl.h>
+#include <linux/bpf.h>
#include <linux/seccomp.h>
#include <linux/types.h>
#include <linux/kdev_t.h>
@@ -58,6 +61,8 @@ import (
#include <unistd.h>
#include "../include/lxd_seccomp.h"
+#include "../include/memory_utils.h"
+#include "../include/process_utils.h"
struct seccomp_notif_sizes expected_sizes;
@@ -123,69 +128,71 @@ struct lxd_seccomp_data_arch {
int nr_mknodat;
int nr_setxattr;
int nr_mount;
+ int nr_bpf;
};
#define LXD_SECCOMP_NOTIFY_MKNOD 0
#define LXD_SECCOMP_NOTIFY_MKNODAT 1
#define LXD_SECCOMP_NOTIFY_SETXATTR 2
#define LXD_SECCOMP_NOTIFY_MOUNT 3
+#define LXD_SECCOMP_NOTIFY_BPF 4
// ordered by likelihood of usage...
static const struct lxd_seccomp_data_arch seccomp_notify_syscall_table[] = {
- { -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT },
+ { -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT, LXD_SECCOMP_NOTIFY_BPF },
#ifdef AUDIT_ARCH_X86_64
- { AUDIT_ARCH_X86_64, 133, 259, 188, 165 },
+ { AUDIT_ARCH_X86_64, 133, 259, 188, 165, 321 },
#endif
#ifdef AUDIT_ARCH_I386
- { AUDIT_ARCH_I386, 14, 297, 226, 21 },
+ { AUDIT_ARCH_I386, 14, 297, 226, 21, 357 },
#endif
#ifdef AUDIT_ARCH_AARCH64
- { AUDIT_ARCH_AARCH64, -1, 33, 5, 21 },
+ { AUDIT_ARCH_AARCH64, -1, 33, 5, 21, 280 }, // arm64 uses the asm-generic bpf number, not ARM's 386
#endif
#ifdef AUDIT_ARCH_ARM
- { AUDIT_ARCH_ARM, 14, 324, 226, 21 },
+ { AUDIT_ARCH_ARM, 14, 324, 226, 21, 386 },
#endif
#ifdef AUDIT_ARCH_ARMEB
- { AUDIT_ARCH_ARMEB, 14, 324, 226, 21 },
+ { AUDIT_ARCH_ARMEB, 14, 324, 226, 21, 386 },
#endif
#ifdef AUDIT_ARCH_S390
- { AUDIT_ARCH_S390, 14, 290, 224, 21 },
+ { AUDIT_ARCH_S390, 14, 290, 224, 21, 351 }, // same bpf number as the S390X row
#endif
#ifdef AUDIT_ARCH_S390X
- { AUDIT_ARCH_S390X, 14, 290, 224, 21 },
+ { AUDIT_ARCH_S390X, 14, 290, 224, 21, 351 },
#endif
#ifdef AUDIT_ARCH_PPC
- { AUDIT_ARCH_PPC, 14, 288, 209, 21 },
+ { AUDIT_ARCH_PPC, 14, 288, 209, 21, 361 },
#endif
#ifdef AUDIT_ARCH_PPC64
- { AUDIT_ARCH_PPC64, 14, 288, 209, 21 },
+ { AUDIT_ARCH_PPC64, 14, 288, 209, 21, 361 },
#endif
#ifdef AUDIT_ARCH_PPC64LE
- { AUDIT_ARCH_PPC64LE, 14, 288, 209, 21 },
+ { AUDIT_ARCH_PPC64LE, 14, 288, 209, 21, 361 },
#endif
#ifdef AUDIT_ARCH_SPARC
- { AUDIT_ARCH_SPARC, 14, 286, 169, 167 },
+ { AUDIT_ARCH_SPARC, 14, 286, 169, 167, 349 },
#endif
#ifdef AUDIT_ARCH_SPARC64
- { AUDIT_ARCH_SPARC64, 14, 286, 169, 167 },
+ { AUDIT_ARCH_SPARC64, 14, 286, 169, 167, 349 },
#endif
#ifdef AUDIT_ARCH_MIPS
- { AUDIT_ARCH_MIPS, 14, 290, 224, 21 },
+ { AUDIT_ARCH_MIPS, 14, 290, 224, 21, -1 },
#endif
#ifdef AUDIT_ARCH_MIPSEL
- { AUDIT_ARCH_MIPSEL, 14, 290, 224, 21 },
+ { AUDIT_ARCH_MIPSEL, 14, 290, 224, 21, -1 },
#endif
#ifdef AUDIT_ARCH_MIPS64
- { AUDIT_ARCH_MIPS64, 131, 249, 180, 160 },
+ { AUDIT_ARCH_MIPS64, 131, 249, 180, 160, -1 },
#endif
#ifdef AUDIT_ARCH_MIPS64N32
- { AUDIT_ARCH_MIPS64N32, 131, 253, 180, 160 },
+ { AUDIT_ARCH_MIPS64N32, 131, 253, 180, 160, -1 },
#endif
#ifdef AUDIT_ARCH_MIPSEL64
- { AUDIT_ARCH_MIPSEL64, 131, 249, 180, 160 },
+ { AUDIT_ARCH_MIPSEL64, 131, 249, 180, 160, -1 },
#endif
#ifdef AUDIT_ARCH_MIPSEL64N32
- { AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160 },
+ { AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160, -1 },
#endif
};
@@ -217,6 +224,9 @@ static int seccomp_notify_get_syscall(struct seccomp_notif *req,
if (entry->nr_mount == req->data.nr)
return LXD_SECCOMP_NOTIFY_MOUNT;
+ if (entry->nr_bpf == req->data.nr)
+ return LXD_SECCOMP_NOTIFY_BPF;
+
break;
}
@@ -249,6 +259,109 @@ static void prepare_seccomp_iovec(struct iovec *iov,
iov[3].iov_len = SECCOMP_COOKIE_SIZE;
}
+static inline int pidfd_getfd(int pidfd, int fd, int flags)
+{
+ return syscall(__NR_pidfd_getfd, pidfd, fd, flags);
+}
+
+static int handleBpfSyscall(int notify_fd, int mem_fd,
+ struct seccomp_notify_proxy_msg *msg,
+ struct seccomp_notif *req,
+ struct seccomp_notif_resp *resp,
+ char *buf, size_t *buf_size)
+{
+ __do_close int pidfd = -EBADF, bpf_target_fd = -EBADF,
+ bpf_attach_fd = -EBADF, bpf_prog_fd = -EBADF;
+ union bpf_attr attr = {};
+ unsigned int attr_len = sizeof(attr);
+ struct seccomp_notif_addfd addfd = {};
+ int ret;
+ int cmd;
+
+ if (attr_len < req->data.args[2]) /* caller's attr must fit into our union bpf_attr */
+ return -1;
+ attr_len = req->data.args[2];
+
+ switch (req->data.args[0]) { /* only the commands listed here are emulated */
+ case BPF_PROG_LOAD:
+ cmd = BPF_PROG_LOAD;
+ break;
+ case BPF_PROG_ATTACH:
+ cmd = BPF_PROG_ATTACH;
+ break;
+ case BPF_PROG_DETACH:
+ cmd = BPF_PROG_DETACH;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ret = pread(mem_fd, &attr, attr_len, (off_t)req->data.args[1]); /* copy the caller's attr via mem_fd */
+ if (ret < 0)
+ return -1;
+
+ switch (attr.prog_type) {
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ pidfd = pidfd_open(req->pid, 0);
+ if (pidfd < 0)
+ return -errno;
+
+ switch (cmd) {
+ case BPF_PROG_LOAD:
+ bpf_prog_fd = syscall(__NR_bpf, cmd, &attr, attr_len);
+ if (bpf_prog_fd < 0) /* test the fd bpf() returned; ret still holds the pread() result */
+ return -errno;
+
+ addfd.srcfd = bpf_prog_fd;
+ addfd.id = req->id;
+ addfd.flags = 0;
+
+ // Inject the fd into the task.
+ ret = ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+ if (ret < 0)
+ return -errno;
+
+ // Tell the caller what fd it got.
+ // Let me tell you, coding this is absurdly exciting. :D
+ resp->val = ret;
+ ret = 0;
+ break;
+ case BPF_PROG_ATTACH:
+ bpf_target_fd = pidfd_getfd(pidfd, attr.target_fd, 0);
+ if (bpf_target_fd < 0)
+ return -errno;
+
+ bpf_attach_fd = pidfd_getfd(pidfd, attr.attach_bpf_fd, 0);
+ if (bpf_attach_fd < 0)
+ return -errno;
+
+ attr.target_fd = bpf_target_fd;
+ attr.attach_bpf_fd = bpf_attach_fd;
+ ret = syscall(__NR_bpf, cmd, &attr, attr_len);
+ break;
+ case BPF_PROG_DETACH:
+ bpf_target_fd = pidfd_getfd(pidfd, attr.target_fd, 0);
+ if (bpf_target_fd < 0)
+ return -errno; /* same error reporting as BPF_PROG_ATTACH */
+
+ bpf_attach_fd = pidfd_getfd(pidfd, attr.attach_bpf_fd, 0);
+ if (bpf_attach_fd < 0)
+ return -errno; /* same error reporting as BPF_PROG_ATTACH */
+
+ attr.target_fd = bpf_target_fd;
+ attr.attach_bpf_fd = bpf_attach_fd;
+ ret = syscall(__NR_bpf, cmd, &attr, attr_len);
+ break;
+ }
+
+ return ret;
+}
+
#ifndef MS_LAZYTIME
#define MS_LAZYTIME (1<<25)
#endif
@@ -259,6 +372,7 @@ const lxdSeccompNotifyMknod = C.LXD_SECCOMP_NOTIFY_MKNOD
const lxdSeccompNotifyMknodat = C.LXD_SECCOMP_NOTIFY_MKNODAT
const lxdSeccompNotifySetxattr = C.LXD_SECCOMP_NOTIFY_SETXATTR
const lxdSeccompNotifyMount = C.LXD_SECCOMP_NOTIFY_MOUNT
+const lxdSeccompNotifyBpf = C.LXD_SECCOMP_NOTIFY_BPF
const seccompHeader = `2
`
@@ -327,6 +441,14 @@ move_mount errno 38
const seccompNotifyMount = `mount notify [3,0,SCMP_CMP_MASKED_EQ,18446744070422410016]
`
+// 5 == BPF_PROG_LOAD
+// 8 == BPF_PROG_ATTACH
+// 9 == BPF_PROG_DETACH
+const seccompNotifyBpf = `bpf notify [0,5,SCMP_CMP_EQ]
+bpf notify [0,8,SCMP_CMP_EQ]
+bpf notify [0,9,SCMP_CMP_EQ]
+`
+
const compatBlockingPolicy = `[%s]
compat_sys_rt_sigaction errno 38
stub_x32_rt_sigreturn errno 38
@@ -412,6 +534,7 @@ func InstanceNeedsPolicy(c Instance) bool {
"security.syscalls.intercept.mknod",
"security.syscalls.intercept.setxattr",
"security.syscalls.intercept.mount",
+ "security.syscalls.intercept.bpf",
}
for _, k := range keys {
@@ -446,20 +569,22 @@ func InstanceNeedsIntercept(s *state.State, c Instance) (bool, error) {
config := c.ExpandedConfig()
- var keys = map[string]func(state *state.State) bool{
+ var keys = map[string]func(state *state.State) error{
"security.syscalls.intercept.mknod": lxcSupportSeccompNotify,
"security.syscalls.intercept.setxattr": lxcSupportSeccompNotify,
"security.syscalls.intercept.mount": lxcSupportSeccompNotifyContinue,
+ "security.syscalls.intercept.bpf": lxcSupportSeccompNotifyAddfd,
}
needed := false
- for key, isSupported := range keys {
+ for key, check := range keys {
if !shared.IsTrue(config[key]) {
continue
}
- if !isSupported(s) {
- return needed, fmt.Errorf("System doesn't support syscall interception")
+ err := check(s)
+ if err != nil {
+ return needed, err
}
needed = true
@@ -546,6 +671,11 @@ func seccompGetPolicyContent(s *state.State, c Instance) (string, error) {
// multiple syscalls.
policy += seccompBlockNewMountAPI
}
+
+ if shared.IsTrue(config["security.syscalls.intercept.bpf"]) &&
+ shared.IsTrue(config["security.syscalls.intercept.bpf.prog.type.device"]) {
+ policy += seccompNotifyBpf
+ }
}
if allowlist != "" {
@@ -617,14 +747,15 @@ type Server struct {
// Iovec defines an iovec to move data between kernel and userspace.
type Iovec struct {
- ucred *unix.Ucred
- memFd int
- procFd int
- msg *C.struct_seccomp_notify_proxy_msg
- req *C.struct_seccomp_notif
- resp *C.struct_seccomp_notif_resp
- cookie *C.char
- iov *C.struct_iovec
+ ucred *unix.Ucred
+ memFd int
+ procFd int
+ notifyFd int
+ msg *C.struct_seccomp_notify_proxy_msg
+ req *C.struct_seccomp_notif
+ resp *C.struct_seccomp_notif_resp
+ cookie *C.char
+ iov *C.struct_iovec
}
// NewSeccompIovec creates a new seccomp iovec.
@@ -652,14 +783,15 @@ func NewSeccompIovec(ucred *unix.Ucred) *Iovec {
C.prepare_seccomp_iovec(iov, msg, req, resp, cookie)
return &Iovec{
- memFd: -1,
- procFd: -1,
- msg: msg,
- req: req,
- resp: resp,
- cookie: cookie,
- iov: iov,
- ucred: ucred,
+ memFd: -1,
+ procFd: -1,
+ notifyFd: -1,
+ msg: msg,
+ req: req,
+ resp: resp,
+ cookie: cookie,
+ iov: iov,
+ ucred: ucred,
}
}
@@ -671,6 +803,9 @@ func (siov *Iovec) PutSeccompIovec() {
if siov.procFd >= 0 {
unix.Close(siov.procFd)
}
+ if siov.notifyFd >= 0 {
+ unix.Close(siov.notifyFd)
+ }
C.free(unsafe.Pointer(siov.msg))
C.free(unsafe.Pointer(siov.req))
C.free(unsafe.Pointer(siov.resp))
@@ -678,20 +813,30 @@ func (siov *Iovec) PutSeccompIovec() {
C.free(unsafe.Pointer(siov.iov))
}
-// ReceiveSeccompIovec receives a seccomp iovec.
-func (siov *Iovec) ReceiveSeccompIovec(fd int) (uint64, error) {
+// ReceiveSeccompIovecV1 receives a v1 seccomp iovec.
+func (siov *Iovec) ReceiveSeccompIovecV1(fd int) (uint64, error) {
bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 2, unsafe.Pointer(siov.iov), 4)
if err != nil || err == io.EOF {
return 0, err
}
- if len(fds) == 2 {
- siov.procFd = int(fds[0])
- siov.memFd = int(fds[1])
- } else {
- siov.memFd = int(fds[0])
+ siov.procFd = int(fds[0])
+ siov.memFd = int(fds[1])
+
+ return bytes, nil
+}
+
+// ReceiveSeccompIovecV2 receives a v2 seccomp iovec.
+func (siov *Iovec) ReceiveSeccompIovecV2(fd int) (uint64, error) {
+ bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 3, unsafe.Pointer(siov.iov), 4)
+ if err != nil || err == io.EOF {
+ return 0, err
}
+ siov.procFd = int(fds[0])
+ siov.memFd = int(fds[1])
+ siov.notifyFd = int(fds[2])
+
return bytes, nil
}
@@ -810,8 +955,15 @@ func NewSeccompServer(s *state.State, path string, findPID func(pid int32, state
}
for {
+ var bytes uint64
+ var err error
+
siov := NewSeccompIovec(ucred)
- bytes, err := siov.ReceiveSeccompIovec(int(unixFile.Fd()))
+ if lxcSupportSeccompV2(server.s) {
+ bytes, err = siov.ReceiveSeccompIovecV2(int(unixFile.Fd()))
+ } else {
+ bytes, err = siov.ReceiveSeccompIovecV1(int(unixFile.Fd()))
+ }
if err != nil {
logger.Debugf("Disconnected from seccomp socket after failed receive: pid=%v, err=%s", ucred.Pid, err)
c.Close()
@@ -1601,6 +1753,45 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
return 0
}
+// BpfArgs arguments for bpf.
+type BpfArgs struct {
+ pid int
+}
+
+// HandleBpfSyscall handles bpf syscalls.
+func (s *Server) HandleBpfSyscall(c Instance, siov *Iovec) int {
+ ctx := log.Ctx{"container": c.Name(),
+ "project": c.Project(),
+ "syscall_number": siov.req.data.nr,
+ "audit_architecture": siov.req.data.arch,
+ "seccomp_notify_id": siov.req.id,
+ "seccomp_notify_flags": siov.req.flags,
+ }
+
+ defer logger.Debug("Handling bpf syscall", ctx)
+
+ args := BpfArgs{
+ pid: int(siov.req.pid),
+ }
+
+ pidFdNr, pidFd := inheritPidFd(args.pid, s.s)
+ if pidFdNr >= 0 {
+ defer pidFd.Close()
+ }
+
+ cBpfAttrBuf := [4096]C.char{}
+ cBpfAttrSize := C.size_t(len(cBpfAttrBuf))
+
+ ret := C.handleBpfSyscall(C.int(siov.notifyFd), C.int(siov.memFd), siov.msg, siov.req, siov.resp, &cBpfAttrBuf[0], &cBpfAttrSize)
+ if ret < 0 { // the C helper failed: respond with the continue flag instead of an error
+ ctx["syscall_continue"] = "true"
+ C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
+ return 0
+ }
+
+ return 0
+}
+
func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
switch int(C.seccomp_notify_get_syscall(siov.req, siov.resp)) {
case lxdSeccompNotifyMknod:
@@ -1611,6 +1802,8 @@ func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
return s.HandleSetxattrSyscall(c, siov)
case lxdSeccompNotifyMount:
return s.HandleMountSyscall(c, siov)
+ case lxdSeccompNotifyBpf:
+ return s.HandleBpfSyscall(c, siov)
}
return int(-C.EINVAL)
@@ -1649,39 +1842,70 @@ func (s *Server) Stop() error {
return s.l.Close()
}
-func lxcSupportSeccompNotifyContinue(state *state.State) bool {
- if !lxcSupportSeccompNotify(state) {
+func lxcSupportSeccompV2(state *state.State) bool {
+ err := lxcSupportSeccompNotify(state)
+ if err != nil {
return false
}
- if !state.OS.SeccompListenerContinue {
+ if !state.OS.LXCFeatures["seccomp_proxy_send_notify_fd"] {
return false
}
return true
}
-func lxcSupportSeccompNotify(state *state.State) bool {
+func lxcSupportSeccompNotifyContinue(state *state.State) error {
+ err := lxcSupportSeccompNotify(state)
+ if err != nil {
+ return err
+ }
+
+ if !state.OS.SeccompListenerContinue {
+ return fmt.Errorf("Seccomp notify doesn't support continuing syscalls")
+ }
+
+ return nil
+}
+
+func lxcSupportSeccompNotifyAddfd(state *state.State) error {
+ err := lxcSupportSeccompNotify(state)
+ if err != nil {
+ return err
+ }
+
+ if !state.OS.SeccompListenerContinue {
+ return fmt.Errorf("Seccomp notify doesn't support continuing syscalls")
+ }
+
+ if !state.OS.SeccompListenerAddfd {
+ return fmt.Errorf("Seccomp notify doesn't support adding file descriptors")
+ }
+
+ return nil
+}
+
+func lxcSupportSeccompNotify(state *state.State) error {
if !state.OS.SeccompListener {
- return false
+ return fmt.Errorf("Seccomp notify not supported")
}
if !state.OS.LXCFeatures["seccomp_notify"] {
- return false
+ return fmt.Errorf("LXC doesn't support seccomp notify")
}
c, err := liblxc.NewContainer("test-seccomp", state.OS.LxcPath)
if err != nil {
- return false
+ return fmt.Errorf("Failed to load seccomp notify test container")
}
err = c.SetConfigItem("lxc.seccomp.notify.proxy", fmt.Sprintf("unix:%s", shared.VarPath("seccomp.socket")))
if err != nil {
- return false
+ return errors.Wrap(err, "LXC doesn't support notify proxy")
}
c.Release()
- return true
+ return nil
}
// MountSyscallFilter creates a mount syscall filter from the config.
diff --git a/lxd/sys/os.go b/lxd/sys/os.go
index d20b6aef85..b7faa7d4e3 100644
--- a/lxd/sys/os.go
+++ b/lxd/sys/os.go
@@ -67,6 +67,7 @@ type OS struct {
NetnsGetifaddrs bool
PidFds bool
SeccompListener bool
+ SeccompListenerAddfd bool
SeccompListenerContinue bool
Shiftfs bool
UeventInjection bool
diff --git a/shared/instance.go b/shared/instance.go
index 42eba631c1..b8072a53b3 100644
--- a/shared/instance.go
+++ b/shared/instance.go
@@ -199,20 +199,22 @@ var KnownInstanceConfigKeys = map[string]func(value string) error{
"security.secureboot": validate.Optional(validate.IsBool),
- "security.syscalls.allow": validate.IsAny,
- "security.syscalls.blacklist_default": validate.Optional(validate.IsBool),
- "security.syscalls.blacklist_compat": validate.Optional(validate.IsBool),
- "security.syscalls.blacklist": validate.IsAny,
- "security.syscalls.deny_default": validate.Optional(validate.IsBool),
- "security.syscalls.deny_compat": validate.Optional(validate.IsBool),
- "security.syscalls.deny": validate.IsAny,
- "security.syscalls.intercept.mknod": validate.Optional(validate.IsBool),
- "security.syscalls.intercept.mount": validate.Optional(validate.IsBool),
- "security.syscalls.intercept.mount.allowed": validate.IsAny,
- "security.syscalls.intercept.mount.fuse": validate.IsAny,
- "security.syscalls.intercept.mount.shift": validate.Optional(validate.IsBool),
- "security.syscalls.intercept.setxattr": validate.Optional(validate.IsBool),
- "security.syscalls.whitelist": validate.IsAny,
+ "security.syscalls.allow": validate.IsAny,
+ "security.syscalls.blacklist_default": validate.Optional(validate.IsBool),
+ "security.syscalls.blacklist_compat": validate.Optional(validate.IsBool),
+ "security.syscalls.blacklist": validate.IsAny,
+ "security.syscalls.deny_default": validate.Optional(validate.IsBool),
+ "security.syscalls.deny_compat": validate.Optional(validate.IsBool),
+ "security.syscalls.deny": validate.IsAny,
+ "security.syscalls.intercept.bpf": validate.Optional(validate.IsBool),
+ "security.syscalls.intercept.bpf.prog.type.device": validate.Optional(validate.IsBool),
+ "security.syscalls.intercept.mknod": validate.Optional(validate.IsBool),
+ "security.syscalls.intercept.mount": validate.Optional(validate.IsBool),
+ "security.syscalls.intercept.mount.allowed": validate.IsAny,
+ "security.syscalls.intercept.mount.fuse": validate.IsAny,
+ "security.syscalls.intercept.mount.shift": validate.Optional(validate.IsBool),
+ "security.syscalls.intercept.setxattr": validate.Optional(validate.IsBool),
+ "security.syscalls.whitelist": validate.IsAny,
"snapshots.schedule": func(value string) error {
if value == "" {
More information about the lxc-devel
mailing list