[lxc-devel] [lxd/master] [RFC]: seccomp: enable unprivileged bpf through syscall interception

brauner on Github lxc-bot at linuxcontainers.org
Thu Aug 6 15:57:36 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 364 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200806/00c6c414/attachment-0001.bin>
-------------- next part --------------
From 94bcac3febf6790aa9a292ee1bb91d12579e73e1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Thu, 6 Aug 2020 11:16:19 +0200
Subject: [PATCH] [RFC]: seccomp: enable unprivileged bpf through syscall
 interception

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/daemon.go                      |   8 +
 lxd/include/lxd_seccomp.h          |  23 ++
 lxd/include/syscall_numbers.h      |  35 +++
 lxd/instance/drivers/driver_lxc.go |  11 +
 lxd/instance/instance_interface.go |   1 +
 lxd/main_checkfeature.go           | 128 ++++++++++-
 lxd/seccomp/seccomp.go             | 336 ++++++++++++++++++++++++-----
 lxd/sys/os.go                      |   1 +
 shared/instance.go                 |  30 +--
 9 files changed, 502 insertions(+), 71 deletions(-)

diff --git a/lxd/daemon.go b/lxd/daemon.go
index 6ce21a1c19..2d62378164 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -631,6 +631,7 @@ func (d *Daemon) init() error {
 		"pidfd",
 		"seccomp_allow_deny_syntax",
 		"devpts_fd",
+		"seccomp_proxy_send_notify_fd",
 	}
 	for _, extension := range lxcExtensions {
 		d.os.LXCFeatures[extension] = liblxc.HasApiExtension(extension)
@@ -675,6 +676,13 @@ func (d *Daemon) init() error {
 		logger.Infof(" - seccomp listener continue syscalls: no")
 	}
 
+	if canUseSeccompListenerAddfd() && d.os.LXCFeatures["seccomp_proxy_send_notify_fd"] {
+		d.os.SeccompListenerAddfd = true
+		logger.Infof(" - seccomp listener add file descriptors: yes")
+	} else {
+		logger.Infof(" - seccomp listener add file descriptors: no")
+	}
+
 	if d.os.LXCFeatures["devpts_fd"] && canUseNativeTerminals() {
 		d.os.NativeTerminals = true
 		logger.Infof(" - safe native terminal allocation : yes")
diff --git a/lxd/include/lxd_seccomp.h b/lxd/include/lxd_seccomp.h
index 242347e3e1..976947e4bc 100644
--- a/lxd/include/lxd_seccomp.h
+++ b/lxd/include/lxd_seccomp.h
@@ -65,4 +65,27 @@ struct seccomp_notif_sizes {
 						struct seccomp_notif_resp)
 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
 #endif
+
+#ifndef SECCOMP_IOCTL_NOTIF_ADDFD
+#define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3, struct seccomp_notif_addfd)
+
+/* valid flags for seccomp_notif_addfd */
+#define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
+
+/**
+ * struct seccomp_notif_addfd
+ * @id: The ID of the seccomp notification
+ * @flags: SECCOMP_ADDFD_FLAG_*
+ * @srcfd: The local fd number
+ * @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
+ * @newfd_flags: The O_* flags the remote FD should have applied
+ */
+struct seccomp_notif_addfd {
+	__u64 id;
+	__u32 flags;
+	__u32 srcfd;
+	__u32 newfd;
+	__u32 newfd_flags;
+};
+#endif
 #endif /* LXD_SECCOMP_H */
diff --git a/lxd/include/syscall_numbers.h b/lxd/include/syscall_numbers.h
index f953a26911..269b8c795b 100644
--- a/lxd/include/syscall_numbers.h
+++ b/lxd/include/syscall_numbers.h
@@ -74,4 +74,39 @@
 	#endif
 #endif
 
+#ifndef __NR_bpf	/* fallback bpf(2) syscall numbers for old libc headers */
+	#if defined __i386__
+		#define __NR_bpf 357
+	#elif defined __x86_64__
+		#define __NR_bpf 321
+	#elif defined __arm__
+		#define __NR_bpf 386
+	#elif defined __aarch64__
+		#define __NR_bpf 280	/* arm64 uses the asm-generic syscall table */
+	#elif defined __s390__
+		#define __NR_bpf 351
+	#elif defined __powerpc__
+		#define __NR_bpf 361
+	#elif defined __riscv
+		#define __NR_bpf 280
+	#elif defined __sparc__
+		#define __NR_bpf 349
+	#elif defined __ia64__
+		#define __NR_bpf (317 + 1024)
+	#elif defined _MIPS_SIM
+		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
+			#define __NR_bpf 4355
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
+			#define __NR_bpf 6319
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
+			#define __NR_bpf 5315
+		#endif
+	#else
+		#define __NR_bpf -1	/* unknown architecture: invalid syscall number */
+		#warning "__NR_bpf not defined for your architecture"
+	#endif
+#endif
+
 #endif /* __LXD_SYSCALL_NUMBERS_H */
diff --git a/lxd/instance/drivers/driver_lxc.go b/lxd/instance/drivers/driver_lxc.go
index 6b3de9a40e..aa1f0f52e0 100644
--- a/lxd/instance/drivers/driver_lxc.go
+++ b/lxd/instance/drivers/driver_lxc.go
@@ -6665,6 +6665,17 @@ func (c *lxc) DevptsFd() (*os.File, error) {
 	return c.c.DevptsFd()
 }
 
+// SeccompNotifyFd returns seccomp notify fd of the container.
+func (c *lxc) SeccompNotifyFd() (*os.File, error) {
+	// Load the go-lxc struct
+	err := c.initLXC(false)
+	if err != nil {
+		return nil, err
+	}
+
+	return c.c.SeccompNotifyFd()
+}
+
 // LocalConfig returns local config.
 func (c *lxc) LocalConfig() map[string]string {
 	return c.localConfig
diff --git a/lxd/instance/instance_interface.go b/lxd/instance/instance_interface.go
index e03db25bfb..c8dbf1cd5d 100644
--- a/lxd/instance/instance_interface.go
+++ b/lxd/instance/instance_interface.go
@@ -148,6 +148,7 @@ type Container interface {
 	ConsoleLog(opts liblxc.ConsoleLogOptions) (string, error)
 	InsertSeccompUnixDevice(prefix string, m deviceConfig.Device, pid int) error
 	DevptsFd() (*os.File, error)
+	SeccompNotifyFd() (*os.File, error)
 }
 
 // CriuMigrationArgs arguments for CRIU migration.
diff --git a/lxd/main_checkfeature.go b/lxd/main_checkfeature.go
index 01995462b9..77a7f53738 100644
--- a/lxd/main_checkfeature.go
+++ b/lxd/main_checkfeature.go
@@ -293,6 +293,127 @@ static void is_user_notification_continue_aware(void)
 		seccomp_notify_aware = 2;
 }
 
+__noreturn static void __do_user_notification_addfd(void)
+{
+	__do_close int listener = -EBADF;
+	pid_t pid;
+	int ret;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	struct seccomp_notif_addfd addfd = {};
+	struct pollfd pollfd;
+
+	listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	if (listener < 0)
+		_exit(EXIT_FAILURE);
+
+	pid = fork();
+	if (pid < 0)
+		_exit(EXIT_FAILURE);
+
+	if (pid == 0) {
+		int dup_fd, pipe_fds[2];
+		pid_t self;
+
+		// Don't bother cleaning up. On child exit all of those
+		// will be closed anyway.
+		ret = pipe(pipe_fds);
+		if (ret < 0)
+			_exit(EXIT_FAILURE);
+
+		// O_CLOEXEC doesn't matter as we're in the child and we're
+		// not going to exec.
+		dup_fd = dup(pipe_fds[0]);
+		if (dup_fd < 0)
+			_exit(EXIT_FAILURE);
+
+		self = getpid();
+
+		ret = filecmp(self, self, pipe_fds[0], dup_fd);
+		if (ret)
+			_exit(EXIT_FAILURE);
+
+		_exit(EXIT_SUCCESS);
+	}
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	ret = poll(&pollfd, 1, 5000);
+	if (ret <= 0)
+		goto cleanup_sigkill;
+
+	if (!(pollfd.revents & POLLIN))
+		goto cleanup_sigkill;
+
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
+	if (ret)
+		goto cleanup_sigkill;
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	ret = poll(&pollfd, 1, 5000);
+	if (ret <= 0)
+		goto cleanup_sigkill;
+
+	if (!(pollfd.revents & POLLOUT))
+		goto cleanup_sigkill;
+
+	if (req.data.nr != __NR_dup)
+		goto cleanup_sigkill;
+
+	addfd.srcfd	= 3;
+	addfd.id 	= req.id;
+	addfd.flags 	= 0;
+
+	// Inject the fd into the task.
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+	if (ret < 0)
+		goto cleanup_sigkill;
+	close(ret);
+
+	resp.id = req.id;
+	resp.flags |= SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+	resp.error = -EPERM;
+	resp.flags = 0;
+	if (ret) {
+		ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+		goto cleanup_sigkill;
+	}
+
+cleanup_wait:
+	ret = wait_for_pid(pid);
+	if (ret)
+		_exit(EXIT_FAILURE);
+	_exit(EXIT_SUCCESS);
+
+cleanup_sigkill:
+	kill(pid, SIGKILL);
+	goto cleanup_wait;
+}
+
+static void is_user_notification_addfd_aware(void)
+{
+	int ret;
+	pid_t pid;
+
+	pid = fork();
+	if (pid < 0)
+		return;
+
+	if (pid == 0) {
+		__do_user_notification_addfd();
+		// Should not be reached.
+		_exit(EXIT_FAILURE);
+	}
+
+	ret = wait_for_pid(pid);
+	if (!ret)
+		seccomp_notify_aware = 3;
+}
+
 static void is_seccomp_notify_aware(void)
 {
 	__u32 action[] = { SECCOMP_RET_USER_NOTIF };
@@ -300,6 +421,8 @@ static void is_seccomp_notify_aware(void)
 	if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action[0]) == 0) {
 		seccomp_notify_aware = 1;
 		is_user_notification_continue_aware();
+		if (seccomp_notify_aware == 2)
+			is_user_notification_addfd_aware();
 	}
 
 }
@@ -403,9 +526,12 @@ func canUseSeccompListener() bool {
 }
 
 func canUseSeccompListenerContinue() bool {
-	return bool(C.seccomp_notify_aware == 2)
+	return bool(C.seccomp_notify_aware >= 2)
 }
 
+func canUseSeccompListenerAddfd() bool {
+	return bool(C.seccomp_notify_aware == 3)
+}
 func canUsePidFds() bool {
 	return bool(C.pidfd_aware)
 }
diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go
index 52e88049a1..d1d6e40dfe 100644
--- a/lxd/seccomp/seccomp.go
+++ b/lxd/seccomp/seccomp.go
@@ -15,6 +15,8 @@ import (
 	"strings"
 	"unsafe"
 
+	"github.com/pkg/errors"
+
 	"golang.org/x/sys/unix"
 	liblxc "gopkg.in/lxc/go-lxc.v2"
 
@@ -41,6 +43,7 @@ import (
 #include <elf.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/bpf.h>
 #include <linux/seccomp.h>
 #include <linux/types.h>
 #include <linux/kdev_t.h>
@@ -58,6 +61,8 @@ import (
 #include <unistd.h>
 
 #include "../include/lxd_seccomp.h"
+#include "../include/memory_utils.h"
+#include "../include/process_utils.h"
 
 struct seccomp_notif_sizes expected_sizes;
 
@@ -123,69 +128,71 @@ struct lxd_seccomp_data_arch {
 	int nr_mknodat;
 	int nr_setxattr;
 	int nr_mount;
+	int nr_bpf;
 };
 
 #define LXD_SECCOMP_NOTIFY_MKNOD    0
 #define LXD_SECCOMP_NOTIFY_MKNODAT  1
 #define LXD_SECCOMP_NOTIFY_SETXATTR 2
 #define LXD_SECCOMP_NOTIFY_MOUNT 3
+#define LXD_SECCOMP_NOTIFY_BPF 4
 
 // ordered by likelihood of usage...
 static const struct lxd_seccomp_data_arch seccomp_notify_syscall_table[] = {
-	{ -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT },
+	{ -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT, LXD_SECCOMP_NOTIFY_BPF },
 #ifdef AUDIT_ARCH_X86_64
-	{ AUDIT_ARCH_X86_64,      133, 259, 188, 165 },
+	{ AUDIT_ARCH_X86_64,      133, 259, 188, 165, 321 },
 #endif
 #ifdef AUDIT_ARCH_I386
-	{ AUDIT_ARCH_I386,         14, 297, 226,  21 },
+	{ AUDIT_ARCH_I386,         14, 297, 226,  21, 357 },
 #endif
 #ifdef AUDIT_ARCH_AARCH64
-	{ AUDIT_ARCH_AARCH64,      -1,  33,   5,  21 },
+	{ AUDIT_ARCH_AARCH64,      -1,  33,   5,  21, 280 },
 #endif
 #ifdef AUDIT_ARCH_ARM
-	{ AUDIT_ARCH_ARM,          14, 324, 226,  21 },
+	{ AUDIT_ARCH_ARM,          14, 324, 226,  21, 386 },
 #endif
 #ifdef AUDIT_ARCH_ARMEB
-	{ AUDIT_ARCH_ARMEB,        14, 324, 226,  21 },
+	{ AUDIT_ARCH_ARMEB,        14, 324, 226,  21, 386 },
 #endif
 #ifdef AUDIT_ARCH_S390
-	{ AUDIT_ARCH_S390,         14, 290, 224,  21 },
+	{ AUDIT_ARCH_S390,         14, 290, 224,  21, 351 },
 #endif
 #ifdef AUDIT_ARCH_S390X
-	{ AUDIT_ARCH_S390X,        14, 290, 224,  21 },
+	{ AUDIT_ARCH_S390X,        14, 290, 224,  21, 351 },
 #endif
 #ifdef AUDIT_ARCH_PPC
-	{ AUDIT_ARCH_PPC,          14, 288, 209,  21 },
+	{ AUDIT_ARCH_PPC,          14, 288, 209,  21, 361 },
 #endif
 #ifdef AUDIT_ARCH_PPC64
-	{ AUDIT_ARCH_PPC64,        14, 288, 209,  21 },
+	{ AUDIT_ARCH_PPC64,        14, 288, 209,  21, 361 },
 #endif
 #ifdef AUDIT_ARCH_PPC64LE
-	{ AUDIT_ARCH_PPC64LE,      14, 288, 209,  21 },
+	{ AUDIT_ARCH_PPC64LE,      14, 288, 209,  21, 361 },
 #endif
 #ifdef AUDIT_ARCH_SPARC
-	{ AUDIT_ARCH_SPARC,        14, 286, 169, 167 },
+	{ AUDIT_ARCH_SPARC,        14, 286, 169, 167, 349 },
 #endif
 #ifdef AUDIT_ARCH_SPARC64
-	{ AUDIT_ARCH_SPARC64,      14, 286, 169, 167 },
+	{ AUDIT_ARCH_SPARC64,      14, 286, 169, 167, 349 },
 #endif
 #ifdef AUDIT_ARCH_MIPS
-	{ AUDIT_ARCH_MIPS,         14, 290, 224,  21 },
+	{ AUDIT_ARCH_MIPS,         14, 290, 224,  21,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL
-	{ AUDIT_ARCH_MIPSEL,       14, 290, 224,  21 },
+	{ AUDIT_ARCH_MIPSEL,       14, 290, 224,  21,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPS64
-	{ AUDIT_ARCH_MIPS64,      131, 249, 180, 160 },
+	{ AUDIT_ARCH_MIPS64,      131, 249, 180, 160,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPS64N32
-	{ AUDIT_ARCH_MIPS64N32,   131, 253, 180, 160 },
+	{ AUDIT_ARCH_MIPS64N32,   131, 253, 180, 160,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL64
-	{ AUDIT_ARCH_MIPSEL64,    131, 249, 180, 160 },
+	{ AUDIT_ARCH_MIPSEL64,    131, 249, 180, 160,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL64N32
-	{ AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160 },
+	{ AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160,  -1 },
 #endif
 };
 
@@ -217,6 +224,9 @@ static int seccomp_notify_get_syscall(struct seccomp_notif *req,
 		if (entry->nr_mount == req->data.nr)
 			return LXD_SECCOMP_NOTIFY_MOUNT;
 
+		if (entry->nr_bpf == req->data.nr)
+			return LXD_SECCOMP_NOTIFY_BPF;
+
 		break;
 	}
 
@@ -249,6 +259,109 @@ static void prepare_seccomp_iovec(struct iovec *iov,
 	iov[3].iov_len = SECCOMP_COOKIE_SIZE;
 }
 
+static inline int pidfd_getfd(int pidfd, int fd, int flags)
+{
+	return syscall(__NR_pidfd_getfd, pidfd, fd, flags);
+}
+
+// Emulate bpf(BPF_PROG_{LOAD,ATTACH,DETACH}) for cgroup device programs on
+// behalf of the notifying task. Returns < 0 on failure; for BPF_PROG_LOAD the
+// fd number installed in the task is stored in resp->val and 0 is returned.
+static int handleBpfSyscall(int notify_fd, int mem_fd,
+			    struct seccomp_notify_proxy_msg *msg,
+			    struct seccomp_notif *req,
+			    struct seccomp_notif_resp *resp,
+			    char *buf, size_t *buf_size)
+{
+	__do_close int pidfd = -EBADF, bpf_target_fd = -EBADF,
+		       bpf_attach_fd = -EBADF, bpf_prog_fd = -EBADF;
+	union bpf_attr attr = {};
+	unsigned int attr_len = sizeof(attr);
+	struct seccomp_notif_addfd addfd = {};
+	int ret, cmd;
+
+	if (attr_len < req->data.args[2])
+		return -1;
+	attr_len = req->data.args[2];
+
+	switch (req->data.args[0]) {
+	case BPF_PROG_LOAD:
+	case BPF_PROG_ATTACH:
+	case BPF_PROG_DETACH:
+		cmd = req->data.args[0];
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = pread(mem_fd, &attr, attr_len, (off_t)req->data.args[1]);
+	if (ret < 0)
+		return -1;
+
+	// prog_type aliases target_fd for attach/detach, so check attach_type there.
+	if (cmd == BPF_PROG_LOAD) {
+		if (attr.prog_type != BPF_PROG_TYPE_CGROUP_DEVICE)
+			return -EINVAL;
+	} else if (attr.attach_type != BPF_CGROUP_DEVICE) {
+		return -EINVAL;
+	}
+
+	pidfd = pidfd_open(req->pid, 0);
+	if (pidfd < 0)
+		return -errno;
+
+	switch (cmd) {
+	case BPF_PROG_LOAD:
+		// NOTE(review): attr.insns/license/log_buf still point into the
+		// caller's address space; copy them via mem_fd before loading.
+		bpf_prog_fd = syscall(__NR_bpf, cmd, &attr, attr_len);
+		if (bpf_prog_fd < 0)
+			return -errno;
+
+		addfd.srcfd 	= bpf_prog_fd;
+		addfd.id 	= req->id;
+		addfd.flags 	= 0;
+
+		// Inject the fd into the task.
+		ret = ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+		if (ret < 0)
+			return -errno;
+
+		// Tell the caller what fd it got.
+		resp->val = ret;
+		ret = 0;
+		break;
+	case BPF_PROG_ATTACH:
+		bpf_target_fd = pidfd_getfd(pidfd, attr.target_fd, 0);
+		if (bpf_target_fd < 0)
+			return -errno;
+
+		bpf_attach_fd = pidfd_getfd(pidfd, attr.attach_bpf_fd, 0);
+		if (bpf_attach_fd < 0)
+			return -errno;
+
+		attr.target_fd = bpf_target_fd;
+		attr.attach_bpf_fd = bpf_attach_fd;
+		ret = syscall(__NR_bpf, cmd, &attr, attr_len);
+		break;
+	case BPF_PROG_DETACH:
+		bpf_target_fd = pidfd_getfd(pidfd, attr.target_fd, 0);
+		if (bpf_target_fd < 0)
+			return -errno;
+
+		bpf_attach_fd = pidfd_getfd(pidfd, attr.attach_bpf_fd, 0);
+		if (bpf_attach_fd < 0)
+			return -errno;
+
+		attr.target_fd = bpf_target_fd;
+		attr.attach_bpf_fd = bpf_attach_fd;
+		ret = syscall(__NR_bpf, cmd, &attr, attr_len);
+		break;
+	}
+
+	return ret;
+}
+
 #ifndef MS_LAZYTIME
 #define MS_LAZYTIME (1<<25)
 #endif
@@ -259,6 +372,7 @@ const lxdSeccompNotifyMknod = C.LXD_SECCOMP_NOTIFY_MKNOD
 const lxdSeccompNotifyMknodat = C.LXD_SECCOMP_NOTIFY_MKNODAT
 const lxdSeccompNotifySetxattr = C.LXD_SECCOMP_NOTIFY_SETXATTR
 const lxdSeccompNotifyMount = C.LXD_SECCOMP_NOTIFY_MOUNT
+const lxdSeccompNotifyBpf = C.LXD_SECCOMP_NOTIFY_BPF
 
 const seccompHeader = `2
 `
@@ -327,6 +441,14 @@ move_mount errno 38
 const seccompNotifyMount = `mount notify [3,0,SCMP_CMP_MASKED_EQ,18446744070422410016]
 `
 
+// 5 == BPF_PROG_LOAD
+// 8 == BPF_PROG_ATTACH
+// 9 == BPF_PROG_DETACH
+const seccompNotifyBpf = `bpf notify [0,5,SCMP_CMP_EQ]
+bpf notify [0,8,SCMP_CMP_EQ]
+bpf notify [0,9,SCMP_CMP_EQ]
+`
+
 const compatBlockingPolicy = `[%s]
 compat_sys_rt_sigaction errno 38
 stub_x32_rt_sigreturn errno 38
@@ -412,6 +534,7 @@ func InstanceNeedsPolicy(c Instance) bool {
 		"security.syscalls.intercept.mknod",
 		"security.syscalls.intercept.setxattr",
 		"security.syscalls.intercept.mount",
+		"security.syscalls.intercept.bpf",
 	}
 
 	for _, k := range keys {
@@ -446,20 +569,22 @@ func InstanceNeedsIntercept(s *state.State, c Instance) (bool, error) {
 
 	config := c.ExpandedConfig()
 
-	var keys = map[string]func(state *state.State) bool{
+	var keys = map[string]func(state *state.State) error{
 		"security.syscalls.intercept.mknod":    lxcSupportSeccompNotify,
 		"security.syscalls.intercept.setxattr": lxcSupportSeccompNotify,
 		"security.syscalls.intercept.mount":    lxcSupportSeccompNotifyContinue,
+		"security.syscalls.intercept.bpf":      lxcSupportSeccompNotifyAddfd,
 	}
 
 	needed := false
-	for key, isSupported := range keys {
+	for key, check := range keys {
 		if !shared.IsTrue(config[key]) {
 			continue
 		}
 
-		if !isSupported(s) {
-			return needed, fmt.Errorf("System doesn't support syscall interception")
+		err := check(s)
+		if err != nil {
+			return needed, err
 		}
 
 		needed = true
@@ -546,6 +671,11 @@ func seccompGetPolicyContent(s *state.State, c Instance) (string, error) {
 			// multiple syscalls.
 			policy += seccompBlockNewMountAPI
 		}
+
+		if shared.IsTrue(config["security.syscalls.intercept.bpf"]) &&
+			shared.IsTrue(config["security.syscalls.intercept.bpf.prog.type.device"]) {
+			policy += seccompNotifyBpf
+		}
 	}
 
 	if allowlist != "" {
@@ -617,14 +747,15 @@ type Server struct {
 
 // Iovec defines an iovec to move data between kernel and userspace.
 type Iovec struct {
-	ucred  *unix.Ucred
-	memFd  int
-	procFd int
-	msg    *C.struct_seccomp_notify_proxy_msg
-	req    *C.struct_seccomp_notif
-	resp   *C.struct_seccomp_notif_resp
-	cookie *C.char
-	iov    *C.struct_iovec
+	ucred    *unix.Ucred
+	memFd    int
+	procFd   int
+	notifyFd int
+	msg      *C.struct_seccomp_notify_proxy_msg
+	req      *C.struct_seccomp_notif
+	resp     *C.struct_seccomp_notif_resp
+	cookie   *C.char
+	iov      *C.struct_iovec
 }
 
 // NewSeccompIovec creates a new seccomp iovec.
@@ -652,14 +783,15 @@ func NewSeccompIovec(ucred *unix.Ucred) *Iovec {
 	C.prepare_seccomp_iovec(iov, msg, req, resp, cookie)
 
 	return &Iovec{
-		memFd:  -1,
-		procFd: -1,
-		msg:    msg,
-		req:    req,
-		resp:   resp,
-		cookie: cookie,
-		iov:    iov,
-		ucred:  ucred,
+		memFd:    -1,
+		procFd:   -1,
+		notifyFd: -1,
+		msg:      msg,
+		req:      req,
+		resp:     resp,
+		cookie:   cookie,
+		iov:      iov,
+		ucred:    ucred,
 	}
 }
 
@@ -671,6 +803,9 @@ func (siov *Iovec) PutSeccompIovec() {
 	if siov.procFd >= 0 {
 		unix.Close(siov.procFd)
 	}
+	if siov.notifyFd >= 0 {
+		unix.Close(siov.notifyFd)
+	}
 	C.free(unsafe.Pointer(siov.msg))
 	C.free(unsafe.Pointer(siov.req))
 	C.free(unsafe.Pointer(siov.resp))
@@ -678,20 +813,30 @@ func (siov *Iovec) PutSeccompIovec() {
 	C.free(unsafe.Pointer(siov.iov))
 }
 
-// ReceiveSeccompIovec receives a seccomp iovec.
-func (siov *Iovec) ReceiveSeccompIovec(fd int) (uint64, error) {
+// ReceiveSeccompIovecV1 receives a v1 seccomp iovec.
+func (siov *Iovec) ReceiveSeccompIovecV1(fd int) (uint64, error) {
 	bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 2, unsafe.Pointer(siov.iov), 4)
 	if err != nil || err == io.EOF {
 		return 0, err
 	}
 
-	if len(fds) == 2 {
-		siov.procFd = int(fds[0])
-		siov.memFd = int(fds[1])
-	} else {
-		siov.memFd = int(fds[0])
+	siov.procFd = int(fds[0])
+	siov.memFd = int(fds[1])
+
+	return bytes, nil
+}
+
+// ReceiveSeccompIovecV2 receives a v2 seccomp iovec.
+func (siov *Iovec) ReceiveSeccompIovecV2(fd int) (uint64, error) {
+	bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 3, unsafe.Pointer(siov.iov), 4)
+	if err != nil || err == io.EOF {
+		return 0, err
 	}
 
+	siov.procFd = int(fds[0])
+	siov.memFd = int(fds[1])
+	siov.notifyFd = int(fds[2])
+
 	return bytes, nil
 }
 
@@ -810,8 +955,15 @@ func NewSeccompServer(s *state.State, path string, findPID func(pid int32, state
 				}
 
 				for {
+					var bytes uint64
+					var err error
+
 					siov := NewSeccompIovec(ucred)
-					bytes, err := siov.ReceiveSeccompIovec(int(unixFile.Fd()))
+					if lxcSupportSeccompV2(server.s) {
+						bytes, err = siov.ReceiveSeccompIovecV2(int(unixFile.Fd()))
+					} else {
+						bytes, err = siov.ReceiveSeccompIovecV1(int(unixFile.Fd()))
+					}
 					if err != nil {
 						logger.Debugf("Disconnected from seccomp socket after failed receive: pid=%v, err=%s", ucred.Pid, err)
 						c.Close()
@@ -1601,6 +1753,45 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
 	return 0
 }
 
+// BpfArgs arguments for bpf.
+type BpfArgs struct {
+	pid int
+}
+
+// HandleBpfSyscall handles bpf syscalls.
+func (s *Server) HandleBpfSyscall(c Instance, siov *Iovec) int {
+	ctx := log.Ctx{"container": c.Name(),
+		"project":              c.Project(),
+		"syscall_number":       siov.req.data.nr,
+		"audit_architecture":   siov.req.data.arch,
+		"seccomp_notify_id":    siov.req.id,
+		"seccomp_notify_flags": siov.req.flags,
+	}
+
+	defer logger.Debug("Handling bpf syscall", ctx)
+
+	args := BpfArgs{
+		pid: int(siov.req.pid),
+	}
+
+	pidFdNr, pidFd := inheritPidFd(args.pid, s.s)
+	if pidFdNr >= 0 {
+		defer pidFd.Close()
+	}
+
+	cBpfAttrBuf := [4096]C.char{}
+	cBpfAttrSize := C.size_t(len(cBpfAttrBuf))
+
+	ret := C.handleBpfSyscall(C.int(siov.notifyFd), C.int(siov.memFd), siov.msg, siov.req, siov.resp, &cBpfAttrBuf[0], &cBpfAttrSize)
+	if ret < 0 {
+		ctx["syscall_continue"] = "true"
+		C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
+		return 0
+	}
+
+	return 0
+}
+
 func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
 	switch int(C.seccomp_notify_get_syscall(siov.req, siov.resp)) {
 	case lxdSeccompNotifyMknod:
@@ -1611,6 +1802,8 @@ func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
 		return s.HandleSetxattrSyscall(c, siov)
 	case lxdSeccompNotifyMount:
 		return s.HandleMountSyscall(c, siov)
+	case lxdSeccompNotifyBpf:
+		return s.HandleBpfSyscall(c, siov)
 	}
 
 	return int(-C.EINVAL)
@@ -1649,39 +1842,70 @@ func (s *Server) Stop() error {
 	return s.l.Close()
 }
 
-func lxcSupportSeccompNotifyContinue(state *state.State) bool {
-	if !lxcSupportSeccompNotify(state) {
+func lxcSupportSeccompV2(state *state.State) bool {
+	err := lxcSupportSeccompNotify(state)
+	if err != nil {
 		return false
 	}
 
-	if !state.OS.SeccompListenerContinue {
+	if !state.OS.LXCFeatures["seccomp_proxy_send_notify_fd"] {
 		return false
 	}
 
 	return true
 }
 
-func lxcSupportSeccompNotify(state *state.State) bool {
+func lxcSupportSeccompNotifyContinue(state *state.State) error {
+	err := lxcSupportSeccompNotify(state)
+	if err != nil {
+		return err
+	}
+
+	if !state.OS.SeccompListenerContinue {
+		return fmt.Errorf("Seccomp notify doesn't support continuing syscalls")
+	}
+
+	return nil
+}
+
+func lxcSupportSeccompNotifyAddfd(state *state.State) error {
+	err := lxcSupportSeccompNotify(state)
+	if err != nil {
+		return err
+	}
+
+	if !state.OS.SeccompListenerContinue {
+		return fmt.Errorf("Seccomp notify doesn't support continuing syscalls")
+	}
+
+	if !state.OS.SeccompListenerAddfd {
+		return fmt.Errorf("Seccomp notify doesn't support adding file descriptors")
+	}
+
+	return nil
+}
+
+func lxcSupportSeccompNotify(state *state.State) error {
 	if !state.OS.SeccompListener {
-		return false
+		return fmt.Errorf("Seccomp notify not supported")
 	}
 
 	if !state.OS.LXCFeatures["seccomp_notify"] {
-		return false
+		return fmt.Errorf("LXC doesn't support seccomp notify")
 	}
 
 	c, err := liblxc.NewContainer("test-seccomp", state.OS.LxcPath)
 	if err != nil {
-		return false
+		return errors.Wrap(err, "Failed to load seccomp notify test container")
 	}
 
 	err = c.SetConfigItem("lxc.seccomp.notify.proxy", fmt.Sprintf("unix:%s", shared.VarPath("seccomp.socket")))
 	if err != nil {
-		return false
+		return errors.Wrap(err, "LXC doesn't support notify proxy")
 	}
 
 	c.Release()
-	return true
+	return nil
 }
 
 // MountSyscallFilter creates a mount syscall filter from the config.
diff --git a/lxd/sys/os.go b/lxd/sys/os.go
index d20b6aef85..b7faa7d4e3 100644
--- a/lxd/sys/os.go
+++ b/lxd/sys/os.go
@@ -67,6 +67,7 @@ type OS struct {
 	NetnsGetifaddrs         bool
 	PidFds                  bool
 	SeccompListener         bool
+	SeccompListenerAddfd    bool
 	SeccompListenerContinue bool
 	Shiftfs                 bool
 	UeventInjection         bool
diff --git a/shared/instance.go b/shared/instance.go
index 42eba631c1..b8072a53b3 100644
--- a/shared/instance.go
+++ b/shared/instance.go
@@ -199,20 +199,22 @@ var KnownInstanceConfigKeys = map[string]func(value string) error{
 
 	"security.secureboot": validate.Optional(validate.IsBool),
 
-	"security.syscalls.allow":                   validate.IsAny,
-	"security.syscalls.blacklist_default":       validate.Optional(validate.IsBool),
-	"security.syscalls.blacklist_compat":        validate.Optional(validate.IsBool),
-	"security.syscalls.blacklist":               validate.IsAny,
-	"security.syscalls.deny_default":            validate.Optional(validate.IsBool),
-	"security.syscalls.deny_compat":             validate.Optional(validate.IsBool),
-	"security.syscalls.deny":                    validate.IsAny,
-	"security.syscalls.intercept.mknod":         validate.Optional(validate.IsBool),
-	"security.syscalls.intercept.mount":         validate.Optional(validate.IsBool),
-	"security.syscalls.intercept.mount.allowed": validate.IsAny,
-	"security.syscalls.intercept.mount.fuse":    validate.IsAny,
-	"security.syscalls.intercept.mount.shift":   validate.Optional(validate.IsBool),
-	"security.syscalls.intercept.setxattr":      validate.Optional(validate.IsBool),
-	"security.syscalls.whitelist":               validate.IsAny,
+	"security.syscalls.allow":                          validate.IsAny,
+	"security.syscalls.blacklist_default":              validate.Optional(validate.IsBool),
+	"security.syscalls.blacklist_compat":               validate.Optional(validate.IsBool),
+	"security.syscalls.blacklist":                      validate.IsAny,
+	"security.syscalls.deny_default":                   validate.Optional(validate.IsBool),
+	"security.syscalls.deny_compat":                    validate.Optional(validate.IsBool),
+	"security.syscalls.deny":                           validate.IsAny,
+	"security.syscalls.intercept.bpf":                  validate.Optional(validate.IsBool),
+	"security.syscalls.intercept.bpf.prog.type.device": validate.Optional(validate.IsBool),
+	"security.syscalls.intercept.mknod":                validate.Optional(validate.IsBool),
+	"security.syscalls.intercept.mount":                validate.Optional(validate.IsBool),
+	"security.syscalls.intercept.mount.allowed":        validate.IsAny,
+	"security.syscalls.intercept.mount.fuse":           validate.IsAny,
+	"security.syscalls.intercept.mount.shift":          validate.Optional(validate.IsBool),
+	"security.syscalls.intercept.setxattr":             validate.Optional(validate.IsBool),
+	"security.syscalls.whitelist":                      validate.IsAny,
 
 	"snapshots.schedule": func(value string) error {
 		if value == "" {


More information about the lxc-devel mailing list