[lxc-devel] [lxd/master] seccomp: enable unprivileged (device) bpf through syscall interception

brauner on Github lxc-bot at linuxcontainers.org
Fri Aug 7 11:17:38 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1160 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200807/f4b03c86/attachment-0001.bin>
-------------- next part --------------
From 89274121d95f176cda870ca25f563da5c8acffa1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Thu, 6 Aug 2020 11:16:19 +0200
Subject: [PATCH 1/5] seccomp: enable unprivileged (device) bpf through syscall
 interception

Things aren't normal on Linux anymore and we have users that want to run bpf
in unprivileged containers. This is of course completely opt-in and not to be
used lightly.

This patch makes it possible for LXD to manage the bpf() syscalls of an
unprivileged task in a container.

For a start, this enables support for BPF_PROG_TYPE_CGROUP_DEVICE program types
and BPF_CGROUP_DEVICE attach and detach types. This serves as a POC that it is
possible to manage bpf programs for a task running in an unprivileged
container.

I expect this to be extended in the future should users have a need for it.
There are several ways we could extend this:
- supporting more program types
- supporting maps of pre-compiled or pre-written bpf programs instead of
  loading the container's own
.
.
.

This requires an update to our production guide too, since bpf checks for
memlock limit and even root can't exceed it. So in order to make this work one
needs to set:

root	soft	memlock	unlimited	unset	maximum locked-in-memory address space (KB)
root	hard	memlock	unlimited	unset	maximum locked-in-memory address space (KB)

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/daemon.go                      |   8 +
 lxd/include/lxd_seccomp.h          |  23 ++
 lxd/include/syscall_numbers.h      |  59 ++++-
 lxd/instance/drivers/driver_lxc.go |  11 +
 lxd/instance/instance_interface.go |   1 +
 lxd/main_checkfeature.go           | 128 +++++++++-
 lxd/seccomp/seccomp.go             | 381 ++++++++++++++++++++++++-----
 lxd/sys/os.go                      |   1 +
 shared/instance.go                 |   2 +
 9 files changed, 555 insertions(+), 59 deletions(-)

diff --git a/lxd/daemon.go b/lxd/daemon.go
index 6ce21a1c19..2d62378164 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -631,6 +631,7 @@ func (d *Daemon) init() error {
 		"pidfd",
 		"seccomp_allow_deny_syntax",
 		"devpts_fd",
+		"seccomp_proxy_send_notify_fd",
 	}
 	for _, extension := range lxcExtensions {
 		d.os.LXCFeatures[extension] = liblxc.HasApiExtension(extension)
@@ -675,6 +676,13 @@ func (d *Daemon) init() error {
 		logger.Infof(" - seccomp listener continue syscalls: no")
 	}
 
+	if canUseSeccompListenerAddfd() && d.os.LXCFeatures["seccomp_proxy_send_notify_fd"] {
+		d.os.SeccompListenerAddfd = true
+		logger.Infof(" - seccomp listener add file descriptors: yes")
+	} else {
+		logger.Infof(" - seccomp listener add file descriptors: no")
+	}
+
 	if d.os.LXCFeatures["devpts_fd"] && canUseNativeTerminals() {
 		d.os.NativeTerminals = true
 		logger.Infof(" - safe native terminal allocation : yes")
diff --git a/lxd/include/lxd_seccomp.h b/lxd/include/lxd_seccomp.h
index 242347e3e1..976947e4bc 100644
--- a/lxd/include/lxd_seccomp.h
+++ b/lxd/include/lxd_seccomp.h
@@ -65,4 +65,27 @@ struct seccomp_notif_sizes {
 						struct seccomp_notif_resp)
 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
 #endif
+
+#ifndef SECCOMP_IOCTL_NOTIF_ADDFD
+#define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3, struct seccomp_notif_addfd)
+
+/* valid flags for seccomp_notif_addfd */
+#define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
+
+/**
+ * struct seccomp_notif_addfd
+ * @id: The ID of the seccomp notification
+ * @flags: SECCOMP_ADDFD_FLAG_*
+ * @srcfd: The local fd number
+ * @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
+ * @newfd_flags: The O_* flags the remote FD should have applied
+ */
+struct seccomp_notif_addfd {
+	__u64 id;
+	__u32 flags;
+	__u32 srcfd;
+	__u32 newfd;
+	__u32 newfd_flags;
+};
+#endif
 #endif /* LXD_SECCOMP_H */
diff --git a/lxd/include/syscall_numbers.h b/lxd/include/syscall_numbers.h
index f953a26911..ee6593ff5b 100644
--- a/lxd/include/syscall_numbers.h
+++ b/lxd/include/syscall_numbers.h
@@ -28,9 +28,29 @@
 			#define __NR_pidfd_open 5434
 		#endif
 	#elif defined __ia64__
-		#define __NR_clone3 (424 + 1024)
+		#define __NR_pidfd_open (434 + 1024)
 	#else
-		#define __NR_pidfd_open 424
+		#define __NR_pidfd_open 434
+	#endif
+#endif
+
+#ifndef __NR_pidfd_getfd
+	#if defined __alpha__
+		#define __NR_pidfd_getfd 548
+	#elif defined _MIPS_SIM
+		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
+			#define __NR_pidfd_getfd 4438
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
+			#define __NR_pidfd_getfd 6438
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
+			#define __NR_pidfd_getfd 5438
+		#endif
+	#elif defined __ia64__
+		#define __NR_pidfd_getfd (438 + 1024)
+	#else
+		#define __NR_pidfd_getfd 438
 	#endif
 #endif
 
@@ -74,4 +94,39 @@
 	#endif
 #endif
 
+#ifndef __NR_bpf
+	#if defined __i386__
+		#define __NR_bpf 357
+	#elif defined __x86_64__
+		#define __NR_bpf 321
+	#elif defined __arm__
+		#define __NR_bpf 386
+	#elif defined __aarch64__
+		#define __NR_bpf 280	/* asm-generic numbering, as for riscv */
+	#elif defined __s390__
+		#define __NR_bpf 351
+	#elif defined __powerpc__
+		#define __NR_bpf 361
+	#elif defined __riscv
+		#define __NR_bpf 280
+	#elif defined __sparc__
+		#define __NR_bpf 349
+	#elif defined __ia64__
+		#define __NR_bpf (317 + 1024)
+	#elif defined _MIPS_SIM
+		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
+			#define __NR_bpf 4355
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
+			#define __NR_bpf 6319
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
+			#define __NR_bpf 5315
+		#endif
+	#else
+		#define __NR_bpf -1	/* sentinel: no bpf syscall number known */
+		#warning "__NR_bpf not defined for your architecture"
+	#endif
+#endif
+
 #endif /* __LXD_SYSCALL_NUMBERS_H */
diff --git a/lxd/instance/drivers/driver_lxc.go b/lxd/instance/drivers/driver_lxc.go
index 85b796ac91..bb99d82fb5 100644
--- a/lxd/instance/drivers/driver_lxc.go
+++ b/lxd/instance/drivers/driver_lxc.go
@@ -6669,6 +6669,17 @@ func (c *lxc) DevptsFd() (*os.File, error) {
 	return c.c.DevptsFd()
 }
 
+// SeccompNotifyFd returns seccomp notify fd of the container.
+func (c *lxc) SeccompNotifyFd() (*os.File, error) {
+	// Load the go-lxc struct
+	err := c.initLXC(false)
+	if err != nil {
+		return nil, err
+	}
+
+	return c.c.SeccompNotifyFd()
+}
+
 // LocalConfig returns local config.
 func (c *lxc) LocalConfig() map[string]string {
 	return c.localConfig
diff --git a/lxd/instance/instance_interface.go b/lxd/instance/instance_interface.go
index e03db25bfb..c8dbf1cd5d 100644
--- a/lxd/instance/instance_interface.go
+++ b/lxd/instance/instance_interface.go
@@ -148,6 +148,7 @@ type Container interface {
 	ConsoleLog(opts liblxc.ConsoleLogOptions) (string, error)
 	InsertSeccompUnixDevice(prefix string, m deviceConfig.Device, pid int) error
 	DevptsFd() (*os.File, error)
+	SeccompNotifyFd() (*os.File, error)
 }
 
 // CriuMigrationArgs arguments for CRIU migration.
diff --git a/lxd/main_checkfeature.go b/lxd/main_checkfeature.go
index 01995462b9..77a7f53738 100644
--- a/lxd/main_checkfeature.go
+++ b/lxd/main_checkfeature.go
@@ -293,6 +293,127 @@ static void is_user_notification_continue_aware(void)
 		seccomp_notify_aware = 2;
 }
 
+// Probe for SECCOMP_IOCTL_NOTIF_ADDFD: traps dup() in a forked task and injects
+// an fd into it. Exits EXIT_SUCCESS only if every step and the task succeed.
+__noreturn static void __do_user_notification_addfd(void)
+{
+	__do_close int listener = -EBADF;
+	pid_t pid;
+	int ret;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	struct seccomp_notif_addfd addfd = {};
+	struct pollfd pollfd;
+
+	listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	if (listener < 0)
+		_exit(EXIT_FAILURE);
+
+	pid = fork();
+	if (pid < 0)
+		_exit(EXIT_FAILURE);
+
+	if (pid == 0) {
+		int dup_fd, pipe_fds[2];
+		pid_t self;
+
+		// Don't bother cleaning up. On child exit all of those
+		// will be closed anyway.
+		ret = pipe(pipe_fds);
+		if (ret < 0)
+			_exit(EXIT_FAILURE);
+
+		// O_CLOEXEC doesn't matter as we're in the child and we're
+		// not going to exec.
+		dup_fd = dup(pipe_fds[0]);
+		if (dup_fd < 0)
+			_exit(EXIT_FAILURE);
+
+		self = getpid();
+
+		ret = filecmp(self, self, pipe_fds[0], dup_fd);
+		if (ret)
+			_exit(EXIT_FAILURE);
+
+		_exit(EXIT_SUCCESS);
+	}
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	ret = poll(&pollfd, 1, 5000);
+	if (ret <= 0)
+		goto cleanup_sigkill;
+
+	if (!(pollfd.revents & POLLIN))
+		goto cleanup_sigkill;
+
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
+	if (ret)
+		goto cleanup_sigkill;
+
+	// pollfd.fd and pollfd.events are still set; poll() only rewrites revents.
+	ret = poll(&pollfd, 1, 5000);
+	if (ret <= 0)
+		goto cleanup_sigkill;
+
+	if (!(pollfd.revents & POLLOUT))
+		goto cleanup_sigkill;
+
+	if (req.data.nr != __NR_dup)
+		goto cleanup_sigkill;
+
+	// Inject an fd that is certain to be open in this process: the listener.
+	addfd.srcfd	= listener;
+	addfd.id	= req.id;
+	addfd.flags	= 0;
+
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+	if (ret < 0)
+		goto cleanup_sigkill;
+	// ret is an fd number in the task's fd table; it is not ours to close.
+
+	resp.id = req.id;
+	resp.flags |= SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+	resp.error = -EPERM;
+	resp.flags = 0;
+	if (ret) {
+		ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+		goto cleanup_sigkill;
+	}
+
+cleanup_wait:
+	ret = wait_for_pid(pid);
+	if (ret)
+		_exit(EXIT_FAILURE);
+	_exit(EXIT_SUCCESS);
+
+cleanup_sigkill:
+	kill(pid, SIGKILL);
+	goto cleanup_wait;
+}
+
+static void is_user_notification_addfd_aware(void)
+{
+	pid_t child;
+
+	// Run the probe in a throwaway process; it reports via its exit status.
+	child = fork();
+	if (child < 0)
+		return;
+
+	if (child == 0) {
+		__do_user_notification_addfd();
+		// The probe never returns; fail hard if it somehow does.
+		_exit(EXIT_FAILURE);
+	}
+
+	// A zero result from wait_for_pid() records addfd support (level 3).
+	if (wait_for_pid(child) == 0)
+		seccomp_notify_aware = 3;
+}
+
 static void is_seccomp_notify_aware(void)
 {
 	__u32 action[] = { SECCOMP_RET_USER_NOTIF };
@@ -300,6 +421,8 @@ static void is_seccomp_notify_aware(void)
 	if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action[0]) == 0) {
 		seccomp_notify_aware = 1;
 		is_user_notification_continue_aware();
+		if (seccomp_notify_aware == 2)
+			is_user_notification_addfd_aware();
 	}
 
 }
@@ -403,9 +526,12 @@ func canUseSeccompListener() bool {
 }
 
 func canUseSeccompListenerContinue() bool {
-	return bool(C.seccomp_notify_aware == 2)
+	return bool(C.seccomp_notify_aware >= 2)
 }
 
+func canUseSeccompListenerAddfd() bool {
+	return bool(C.seccomp_notify_aware == 3)
+}
 func canUsePidFds() bool {
 	return bool(C.pidfd_aware)
 }
diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go
index 52e88049a1..ba2a942862 100644
--- a/lxd/seccomp/seccomp.go
+++ b/lxd/seccomp/seccomp.go
@@ -11,10 +11,13 @@ import (
 	"os"
 	"path"
 	"regexp"
+	"runtime"
 	"strconv"
 	"strings"
 	"unsafe"
 
+	"github.com/pkg/errors"
+
 	"golang.org/x/sys/unix"
 	liblxc "gopkg.in/lxc/go-lxc.v2"
 
@@ -41,6 +44,8 @@ import (
 #include <elf.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 #include <linux/seccomp.h>
 #include <linux/types.h>
 #include <linux/kdev_t.h>
@@ -55,9 +60,12 @@ import (
 #include <sys/syscall.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
+#include <sys/uio.h>
 #include <unistd.h>
 
 #include "../include/lxd_seccomp.h"
+#include "../include/memory_utils.h"
+#include "../include/process_utils.h"
 
 struct seccomp_notif_sizes expected_sizes;
 
@@ -123,69 +131,71 @@ struct lxd_seccomp_data_arch {
 	int nr_mknodat;
 	int nr_setxattr;
 	int nr_mount;
+	int nr_bpf;
 };
 
 #define LXD_SECCOMP_NOTIFY_MKNOD    0
 #define LXD_SECCOMP_NOTIFY_MKNODAT  1
 #define LXD_SECCOMP_NOTIFY_SETXATTR 2
 #define LXD_SECCOMP_NOTIFY_MOUNT 3
+#define LXD_SECCOMP_NOTIFY_BPF 4
 
 // ordered by likelihood of usage...
 static const struct lxd_seccomp_data_arch seccomp_notify_syscall_table[] = {
-	{ -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT },
+	{ -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT, LXD_SECCOMP_NOTIFY_BPF },
 #ifdef AUDIT_ARCH_X86_64
-	{ AUDIT_ARCH_X86_64,      133, 259, 188, 165 },
+	{ AUDIT_ARCH_X86_64,      133, 259, 188, 165, 321 },
 #endif
 #ifdef AUDIT_ARCH_I386
-	{ AUDIT_ARCH_I386,         14, 297, 226,  21 },
+	{ AUDIT_ARCH_I386,         14, 297, 226,  21, 357 },
 #endif
 #ifdef AUDIT_ARCH_AARCH64
-	{ AUDIT_ARCH_AARCH64,      -1,  33,   5,  21 },
+	{ AUDIT_ARCH_AARCH64,      -1,  33,   5,  21, 280 }, // bpf: asm-generic number, not the 32-bit ARM one
 #endif
 #ifdef AUDIT_ARCH_ARM
-	{ AUDIT_ARCH_ARM,          14, 324, 226,  21 },
+	{ AUDIT_ARCH_ARM,          14, 324, 226,  21, 386 },
 #endif
 #ifdef AUDIT_ARCH_ARMEB
-	{ AUDIT_ARCH_ARMEB,        14, 324, 226,  21 },
+	{ AUDIT_ARCH_ARMEB,        14, 324, 226,  21, 386 },
 #endif
 #ifdef AUDIT_ARCH_S390
-	{ AUDIT_ARCH_S390,         14, 290, 224,  21 },
+	{ AUDIT_ARCH_S390,         14, 290, 224,  21, 351 }, // bpf: same number as the s390x row
 #endif
 #ifdef AUDIT_ARCH_S390X
-	{ AUDIT_ARCH_S390X,        14, 290, 224,  21 },
+	{ AUDIT_ARCH_S390X,        14, 290, 224,  21, 351 },
 #endif
 #ifdef AUDIT_ARCH_PPC
-	{ AUDIT_ARCH_PPC,          14, 288, 209,  21 },
+	{ AUDIT_ARCH_PPC,          14, 288, 209,  21, 361 },
 #endif
 #ifdef AUDIT_ARCH_PPC64
-	{ AUDIT_ARCH_PPC64,        14, 288, 209,  21 },
+	{ AUDIT_ARCH_PPC64,        14, 288, 209,  21, 361 },
 #endif
 #ifdef AUDIT_ARCH_PPC64LE
-	{ AUDIT_ARCH_PPC64LE,      14, 288, 209,  21 },
+	{ AUDIT_ARCH_PPC64LE,      14, 288, 209,  21, 361 },
 #endif
 #ifdef AUDIT_ARCH_SPARC
-	{ AUDIT_ARCH_SPARC,        14, 286, 169, 167 },
+	{ AUDIT_ARCH_SPARC,        14, 286, 169, 167, 349 },
 #endif
 #ifdef AUDIT_ARCH_SPARC64
-	{ AUDIT_ARCH_SPARC64,      14, 286, 169, 167 },
+	{ AUDIT_ARCH_SPARC64,      14, 286, 169, 167, 349 },
 #endif
 #ifdef AUDIT_ARCH_MIPS
-	{ AUDIT_ARCH_MIPS,         14, 290, 224,  21 },
+	{ AUDIT_ARCH_MIPS,         14, 290, 224,  21,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL
-	{ AUDIT_ARCH_MIPSEL,       14, 290, 224,  21 },
+	{ AUDIT_ARCH_MIPSEL,       14, 290, 224,  21,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPS64
-	{ AUDIT_ARCH_MIPS64,      131, 249, 180, 160 },
+	{ AUDIT_ARCH_MIPS64,      131, 249, 180, 160,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPS64N32
-	{ AUDIT_ARCH_MIPS64N32,   131, 253, 180, 160 },
+	{ AUDIT_ARCH_MIPS64N32,   131, 253, 180, 160,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL64
-	{ AUDIT_ARCH_MIPSEL64,    131, 249, 180, 160 },
+	{ AUDIT_ARCH_MIPSEL64,    131, 249, 180, 160,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL64N32
-	{ AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160 },
+	{ AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160,  -1 },
 #endif
 };
 
@@ -217,6 +227,9 @@ static int seccomp_notify_get_syscall(struct seccomp_notif *req,
 		if (entry->nr_mount == req->data.nr)
 			return LXD_SECCOMP_NOTIFY_MOUNT;
 
+		if (entry->nr_bpf == req->data.nr)
+			return LXD_SECCOMP_NOTIFY_BPF;
+
 		break;
 	}
 
@@ -249,6 +262,156 @@ static void prepare_seccomp_iovec(struct iovec *iov,
 	iov[3].iov_len = SECCOMP_COOKIE_SIZE;
 }
 
+static inline int pidfd_getfd(int pidfd, int fd, int flags)
+{
+	return syscall(__NR_pidfd_getfd, pidfd, fd, flags);
+}
+
+#define ptr_to_u64(p) ((__aligned_u64)((uintptr_t)(p)))
+
+static inline int bpf(int cmd, union bpf_attr *attr, size_t size)
+{
+	return syscall(__NR_bpf, cmd, attr, size);
+}
+
+// Emulate an intercepted bpf() call: BPF_PROG_LOAD of CGROUP_DEVICE programs
+// and BPF_PROG_ATTACH/BPF_PROG_DETACH for BPF_CGROUP_DEVICE; other requests
+// fail with -EINVAL. bpf_cmd, bpf_prog_type and bpf_attach_type report what
+// was requested (-EINVAL until known). Returns 0 on success or a negative
+// errno value; for BPF_PROG_LOAD the injected fd number is put in resp->val.
+static int handleBpfSyscall(int notify_fd, int mem_fd,
+			    struct seccomp_notify_proxy_msg *msg,
+			    struct seccomp_notif *req,
+			    struct seccomp_notif_resp *resp,
+			    int *bpf_cmd, int *bpf_prog_type, int *bpf_attach_type)
+{
+	__do_close int pidfd = -EBADF, bpf_target_fd = -EBADF,
+		       bpf_attach_fd = -EBADF, bpf_prog_fd = -EBADF;
+	union bpf_attr attr = {};
+	unsigned int attr_len = sizeof(attr);
+	struct seccomp_notif_addfd addfd = {};
+	int ret;
+	int cmd;
+
+	*bpf_cmd = -EINVAL;
+	*bpf_prog_type = -EINVAL;
+	*bpf_attach_type = -EINVAL;
+
+	if (attr_len < req->data.args[2])
+		return -EFBIG;
+	attr_len = req->data.args[2];
+
+	*bpf_cmd = req->data.args[0];
+	switch (req->data.args[0]) {
+	case BPF_PROG_LOAD:
+		cmd = BPF_PROG_LOAD;
+		break;
+	case BPF_PROG_ATTACH:
+		cmd = BPF_PROG_ATTACH;
+		break;
+	case BPF_PROG_DETACH:
+		cmd = BPF_PROG_DETACH;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = pread(mem_fd, &attr, attr_len, req->data.args[1]);
+	if (ret < 0)
+		return -errno;
+
+	*bpf_prog_type = attr.prog_type;
+	switch (cmd) {
+	case BPF_PROG_LOAD:
+		if (attr.prog_type != BPF_PROG_TYPE_CGROUP_DEVICE)
+			return -EINVAL;
+		break;
+	case BPF_PROG_ATTACH:
+		__fallthrough;
+	case BPF_PROG_DETACH:
+		*bpf_attach_type = attr.attach_type;
+		if (attr.attach_type != BPF_CGROUP_DEVICE)
+			return -EINVAL;
+		break;
+	}
+
+	pidfd = pidfd_open(req->pid, 0);
+	if (pidfd < 0)
+		return -errno;
+
+	switch (cmd) {
+	case BPF_PROG_LOAD:
+		{
+			__do_free char *log_buf = NULL;
+			__do_free struct bpf_insn *insn = NULL;
+			size_t insn_size;
+
+			// insn_cnt comes from the task: bound it before allocating.
+			// NOTE(review): 1000000 mirrors the kernel insn limit - confirm.
+			if (attr.insn_cnt == 0 || attr.insn_cnt > 1000000)
+				return -E2BIG;
+			insn_size = sizeof(struct bpf_insn) * attr.insn_cnt;
+
+			insn = malloc(insn_size);
+			if (!insn)
+				return -ENOMEM;
+			memset(insn, 0, insn_size);
+
+			ret = pread(mem_fd, insn, insn_size, attr.insns);
+			if (ret < 0)
+				return -errno;
+
+			if (attr.log_size > 0 && attr.log_size <= (UINT_MAX / 2)) {
+				log_buf = malloc(attr.log_size);
+				if (!log_buf)
+					return -ENOMEM;
+			}
+
+			attr.insns	= ptr_to_u64(insn);
+			attr.license	= ptr_to_u64("GPL");
+			attr.log_buf	= ptr_to_u64(log_buf);
+
+			bpf_prog_fd = bpf(cmd, &attr, sizeof(attr));
+			if (bpf_prog_fd < 0)
+				return -errno;
+
+			addfd.srcfd 	= bpf_prog_fd;
+			addfd.id	= req->id;
+			addfd.flags 	= 0;
+
+			// Inject the fd into the task.
+			ret = ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+			if (ret < 0)
+				return -errno;
+
+			// Tell the caller what fd it got.
+			resp->val = ret;
+			ret = 0;
+		}
+		break;
+	case BPF_PROG_ATTACH:
+		__fallthrough;
+	case BPF_PROG_DETACH:
+		// The fds named in attr belong to the task; fetch local duplicates.
+		bpf_target_fd = pidfd_getfd(pidfd, attr.target_fd, 0);
+		if (bpf_target_fd < 0)
+			return -errno;
+
+		bpf_attach_fd = pidfd_getfd(pidfd, attr.attach_bpf_fd, 0);
+		if (bpf_attach_fd < 0)
+			return -errno;
+
+		attr.target_fd		= bpf_target_fd;
+		attr.attach_bpf_fd	= bpf_attach_fd;
+		ret = bpf(cmd, &attr, attr_len);
+		if (ret < 0)
+			return -errno;
+		break;
+	}
+
+	return ret;
+}
+
 #ifndef MS_LAZYTIME
 #define MS_LAZYTIME (1<<25)
 #endif
@@ -259,6 +422,7 @@ const lxdSeccompNotifyMknod = C.LXD_SECCOMP_NOTIFY_MKNOD
 const lxdSeccompNotifyMknodat = C.LXD_SECCOMP_NOTIFY_MKNODAT
 const lxdSeccompNotifySetxattr = C.LXD_SECCOMP_NOTIFY_SETXATTR
 const lxdSeccompNotifyMount = C.LXD_SECCOMP_NOTIFY_MOUNT
+const lxdSeccompNotifyBpf = C.LXD_SECCOMP_NOTIFY_BPF
 
 const seccompHeader = `2
 `
@@ -327,6 +491,14 @@ move_mount errno 38
 const seccompNotifyMount = `mount notify [3,0,SCMP_CMP_MASKED_EQ,18446744070422410016]
 `
 
+// 5 == BPF_PROG_LOAD
+// 8 == BPF_PROG_ATTACH
+// 9 == BPF_PROG_DETACH
+const seccompNotifyBpf = `bpf notify [0,5,SCMP_CMP_EQ]
+bpf notify [0,8,SCMP_CMP_EQ]
+bpf notify [0,9,SCMP_CMP_EQ]
+`
+
 const compatBlockingPolicy = `[%s]
 compat_sys_rt_sigaction errno 38
 stub_x32_rt_sigreturn errno 38
@@ -412,6 +584,7 @@ func InstanceNeedsPolicy(c Instance) bool {
 		"security.syscalls.intercept.mknod",
 		"security.syscalls.intercept.setxattr",
 		"security.syscalls.intercept.mount",
+		"security.syscalls.intercept.bpf",
 	}
 
 	for _, k := range keys {
@@ -446,20 +619,22 @@ func InstanceNeedsIntercept(s *state.State, c Instance) (bool, error) {
 
 	config := c.ExpandedConfig()
 
-	var keys = map[string]func(state *state.State) bool{
+	var keys = map[string]func(state *state.State) error{
 		"security.syscalls.intercept.mknod":    lxcSupportSeccompNotify,
 		"security.syscalls.intercept.setxattr": lxcSupportSeccompNotify,
 		"security.syscalls.intercept.mount":    lxcSupportSeccompNotifyContinue,
+		"security.syscalls.intercept.bpf":      lxcSupportSeccompNotifyAddfd,
 	}
 
 	needed := false
-	for key, isSupported := range keys {
+	for key, check := range keys {
 		if !shared.IsTrue(config[key]) {
 			continue
 		}
 
-		if !isSupported(s) {
-			return needed, fmt.Errorf("System doesn't support syscall interception")
+		err := check(s)
+		if err != nil {
+			return needed, err
 		}
 
 		needed = true
@@ -546,6 +721,11 @@ func seccompGetPolicyContent(s *state.State, c Instance) (string, error) {
 			// multiple syscalls.
 			policy += seccompBlockNewMountAPI
 		}
+
+		if shared.IsTrue(config["security.syscalls.intercept.bpf"]) &&
+			shared.IsTrue(config["security.syscalls.intercept.bpf.devices"]) {
+			policy += seccompNotifyBpf
+		}
 	}
 
 	if allowlist != "" {
@@ -617,14 +797,15 @@ type Server struct {
 
 // Iovec defines an iovec to move data between kernel and userspace.
 type Iovec struct {
-	ucred  *unix.Ucred
-	memFd  int
-	procFd int
-	msg    *C.struct_seccomp_notify_proxy_msg
-	req    *C.struct_seccomp_notif
-	resp   *C.struct_seccomp_notif_resp
-	cookie *C.char
-	iov    *C.struct_iovec
+	ucred    *unix.Ucred
+	memFd    int
+	procFd   int
+	notifyFd int
+	msg      *C.struct_seccomp_notify_proxy_msg
+	req      *C.struct_seccomp_notif
+	resp     *C.struct_seccomp_notif_resp
+	cookie   *C.char
+	iov      *C.struct_iovec
 }
 
 // NewSeccompIovec creates a new seccomp iovec.
@@ -652,14 +833,15 @@ func NewSeccompIovec(ucred *unix.Ucred) *Iovec {
 	C.prepare_seccomp_iovec(iov, msg, req, resp, cookie)
 
 	return &Iovec{
-		memFd:  -1,
-		procFd: -1,
-		msg:    msg,
-		req:    req,
-		resp:   resp,
-		cookie: cookie,
-		iov:    iov,
-		ucred:  ucred,
+		memFd:    -1,
+		procFd:   -1,
+		notifyFd: -1,
+		msg:      msg,
+		req:      req,
+		resp:     resp,
+		cookie:   cookie,
+		iov:      iov,
+		ucred:    ucred,
 	}
 }
 
@@ -671,6 +853,9 @@ func (siov *Iovec) PutSeccompIovec() {
 	if siov.procFd >= 0 {
 		unix.Close(siov.procFd)
 	}
+	if siov.notifyFd >= 0 {
+		unix.Close(siov.notifyFd)
+	}
 	C.free(unsafe.Pointer(siov.msg))
 	C.free(unsafe.Pointer(siov.req))
 	C.free(unsafe.Pointer(siov.resp))
@@ -678,20 +863,30 @@ func (siov *Iovec) PutSeccompIovec() {
 	C.free(unsafe.Pointer(siov.iov))
 }
 
-// ReceiveSeccompIovec receives a seccomp iovec.
-func (siov *Iovec) ReceiveSeccompIovec(fd int) (uint64, error) {
+// ReceiveSeccompIovecV1 receives a v1 seccomp iovec.
+func (siov *Iovec) ReceiveSeccompIovecV1(fd int) (uint64, error) {
 	bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 2, unsafe.Pointer(siov.iov), 4)
 	if err != nil || err == io.EOF {
 		return 0, err
 	}
 
-	if len(fds) == 2 {
-		siov.procFd = int(fds[0])
-		siov.memFd = int(fds[1])
-	} else {
-		siov.memFd = int(fds[0])
+	siov.procFd = int(fds[0])
+	siov.memFd = int(fds[1])
+
+	return bytes, nil
+}
+
+// ReceiveSeccompIovecV2 receives a v2 seccomp iovec.
+func (siov *Iovec) ReceiveSeccompIovecV2(fd int) (uint64, error) {
+	bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 3, unsafe.Pointer(siov.iov), 4)
+	if err != nil || err == io.EOF {
+		return 0, err
 	}
 
+	siov.procFd = int(fds[0])
+	siov.memFd = int(fds[1])
+	siov.notifyFd = int(fds[2])
+
 	return bytes, nil
 }
 
@@ -810,8 +1005,15 @@ func NewSeccompServer(s *state.State, path string, findPID func(pid int32, state
 				}
 
 				for {
+					var bytes uint64
+					var err error
+
 					siov := NewSeccompIovec(ucred)
-					bytes, err := siov.ReceiveSeccompIovec(int(unixFile.Fd()))
+					if lxcSupportSeccompV2(server.s) {
+						bytes, err = siov.ReceiveSeccompIovecV2(int(unixFile.Fd()))
+					} else {
+						bytes, err = siov.ReceiveSeccompIovecV1(int(unixFile.Fd()))
+					}
 					if err != nil {
 						logger.Debugf("Disconnected from seccomp socket after failed receive: pid=%v, err=%s", ucred.Pid, err)
 						c.Close()
@@ -1601,6 +1803,40 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
 	return 0
 }
 
+// HandleBpfSyscall handles intercepted bpf syscalls; if the C handler fails, the syscall is continued in the kernel.
+func (s *Server) HandleBpfSyscall(c Instance, siov *Iovec) int {
+	ctx := log.Ctx{"container": c.Name(),
+		"project":               c.Project(),
+		"syscall_number":        siov.req.data.nr,
+		"audit_architecture":    siov.req.data.arch,
+		"seccomp_notify_id":     siov.req.id,
+		"seccomp_notify_flags":  siov.req.flags,
+		"seccomp_notify_pid":    siov.req.pid,
+		"seccomp_notify_fd":     siov.notifyFd,
+		"seccomp_notify_mem_fd": siov.memFd,
+	}
+
+	defer logger.Debug("Handling bpf syscall", ctx)
+	var bpfCmd, bpfProgType, bpfAttachType C.int
+
+	// Locking to a thread shouldn't be necessary but it still makes me
+	// queasy that Go could just wander off to somewhere.
+	runtime.LockOSThread()
+	ret := C.handleBpfSyscall(C.int(siov.notifyFd), C.int(siov.memFd), siov.msg, siov.req, siov.resp, &bpfCmd, &bpfProgType, &bpfAttachType)
+	runtime.UnlockOSThread()
+	if ret < 0 {
+		ctx["syscall_continue"] = "true"
+		ctx["syscall_handler_error"] = fmt.Sprintf("%s - Failed to handle bpf syscall", unix.Errno(-ret))
+		ctx["bpf_cmd"] = fmt.Sprintf("%d", bpfCmd)
+		ctx["bpf_prog_type"] = fmt.Sprintf("%d", bpfProgType)
+		ctx["bpf_attach_type"] = fmt.Sprintf("%d", bpfAttachType)
+		C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
+		return 0
+	}
+
+	return 0
+}
+
 func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
 	switch int(C.seccomp_notify_get_syscall(siov.req, siov.resp)) {
 	case lxdSeccompNotifyMknod:
@@ -1611,6 +1847,8 @@ func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
 		return s.HandleSetxattrSyscall(c, siov)
 	case lxdSeccompNotifyMount:
 		return s.HandleMountSyscall(c, siov)
+	case lxdSeccompNotifyBpf:
+		return s.HandleBpfSyscall(c, siov)
 	}
 
 	return int(-C.EINVAL)
@@ -1649,39 +1887,70 @@ func (s *Server) Stop() error {
 	return s.l.Close()
 }
 
-func lxcSupportSeccompNotifyContinue(state *state.State) bool {
-	if !lxcSupportSeccompNotify(state) {
+func lxcSupportSeccompV2(state *state.State) bool {
+	err := lxcSupportSeccompNotify(state)
+	if err != nil {
 		return false
 	}
 
-	if !state.OS.SeccompListenerContinue {
+	if !state.OS.LXCFeatures["seccomp_proxy_send_notify_fd"] {
 		return false
 	}
 
 	return true
 }
 
-func lxcSupportSeccompNotify(state *state.State) bool {
+func lxcSupportSeccompNotifyContinue(state *state.State) error {
+	err := lxcSupportSeccompNotify(state)
+	if err != nil {
+		return err
+	}
+
+	if !state.OS.SeccompListenerContinue {
+		return fmt.Errorf("Seccomp notify doesn't support continuing syscalls")
+	}
+
+	return nil
+}
+
+func lxcSupportSeccompNotifyAddfd(state *state.State) error {
+	err := lxcSupportSeccompNotify(state)
+	if err != nil {
+		return err
+	}
+
+	if !state.OS.SeccompListenerContinue {
+		return fmt.Errorf("Seccomp notify doesn't support continuing syscalls")
+	}
+
+	if !state.OS.SeccompListenerAddfd {
+		return fmt.Errorf("Seccomp notify doesn't support adding file descriptors")
+	}
+
+	return nil
+}
+
+func lxcSupportSeccompNotify(state *state.State) error {
 	if !state.OS.SeccompListener {
-		return false
+		return fmt.Errorf("Seccomp notify not supported")
 	}
 
 	if !state.OS.LXCFeatures["seccomp_notify"] {
-		return false
+		return fmt.Errorf("LXC doesn't support seccomp notify")
 	}
 
 	c, err := liblxc.NewContainer("test-seccomp", state.OS.LxcPath)
 	if err != nil {
-		return false
+		return fmt.Errorf("Failed to load seccomp notify test container")
 	}
 
 	err = c.SetConfigItem("lxc.seccomp.notify.proxy", fmt.Sprintf("unix:%s", shared.VarPath("seccomp.socket")))
 	if err != nil {
-		return false
+		return errors.Wrap(err, "LXC doesn't support notify proxy")
 	}
 
 	c.Release()
-	return true
+	return nil
 }
 
 // MountSyscallFilter creates a mount syscall filter from the config.
diff --git a/lxd/sys/os.go b/lxd/sys/os.go
index d20b6aef85..b7faa7d4e3 100644
--- a/lxd/sys/os.go
+++ b/lxd/sys/os.go
@@ -67,6 +67,7 @@ type OS struct {
 	NetnsGetifaddrs         bool
 	PidFds                  bool
 	SeccompListener         bool
+	SeccompListenerAddfd    bool
 	SeccompListenerContinue bool
 	Shiftfs                 bool
 	UeventInjection         bool
diff --git a/shared/instance.go b/shared/instance.go
index 42eba631c1..06541b4539 100644
--- a/shared/instance.go
+++ b/shared/instance.go
@@ -206,6 +206,8 @@ var KnownInstanceConfigKeys = map[string]func(value string) error{
 	"security.syscalls.deny_default":            validate.Optional(validate.IsBool),
 	"security.syscalls.deny_compat":             validate.Optional(validate.IsBool),
 	"security.syscalls.deny":                    validate.IsAny,
+	"security.syscalls.intercept.bpf":           validate.Optional(validate.IsBool),
+	"security.syscalls.intercept.bpf.devices":   validate.Optional(validate.IsBool),
 	"security.syscalls.intercept.mknod":         validate.Optional(validate.IsBool),
 	"security.syscalls.intercept.mount":         validate.Optional(validate.IsBool),
 	"security.syscalls.intercept.mount.allowed": validate.IsAny,

From 8d790378345ce7c09f19ceeb9f7c03c3260e7fb4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 7 Aug 2020 13:04:12 +0200
Subject: [PATCH 2/5] doc: add security.syscalls.intercept.bpf and
 security.syscalls.intercept.bpf.devices

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/instances.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/instances.md b/doc/instances.md
index af9cff14b0..4884e2c8cd 100644
--- a/doc/instances.md
+++ b/doc/instances.md
@@ -85,6 +85,8 @@ security.syscalls.allow                     | string    | -                 | no
 security.syscalls.deny                      | string    | -                 | no            | container                 | A '\n' separated list of syscalls to deny
 security.syscalls.deny\_compat              | boolean   | false             | no            | container                 | On x86\_64 this enables blocking of compat\_\* syscalls, it is a no-op on other arches
 security.syscalls.deny\_default             | boolean   | true              | no            | container                 | Enables the default syscall deny
+security.syscalls.intercept.bpf             | boolean   | false             | no            | container                 | Handles the `bpf` system call
+security.syscalls.intercept.bpf.devices     | boolean   | false             | no            | container                 | Allows `bpf` programs for the devices cgroup in the unified hierarchy to be loaded.
 security.syscalls.intercept.mknod           | boolean   | false             | no            | container                 | Handles the `mknod` and `mknodat` system calls (allows creation of a limited subset of char/block devices)
 security.syscalls.intercept.mount           | boolean   | false             | no            | container                 | Handles the `mount` system call
 security.syscalls.intercept.mount.allowed   | string    | -                 | yes           | container                 | Specify a comma-separated list of filesystems that are safe to mount for processes inside the instance

From b3919c1e0fa98a41543b41c3a11860a69dc183c7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 7 Aug 2020 13:01:11 +0200
Subject: [PATCH 3/5] api: add container_syscall_intercept_bpf_devices
 extension

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/api-extensions.md | 3 +++
 shared/version/api.go | 1 +
 2 files changed, 4 insertions(+)

diff --git a/doc/api-extensions.md b/doc/api-extensions.md
index 1f39de3831..faca0ce2d4 100644
--- a/doc/api-extensions.md
+++ b/doc/api-extensions.md
@@ -1137,3 +1137,6 @@ specify which parent interface should be used for creating NIC device interfaces
 
 Also adds `network` configuration key support for `sriov` NICs to allow them to specify the associated network of
 the same type that they should use as the basis for the NIC device.
+
+## container\_syscall\_intercept\_bpf\_devices
+This adds support for intercepting the bpf syscall in containers. Specifically, it allows managing device cgroup bpf programs.
diff --git a/shared/version/api.go b/shared/version/api.go
index bbb6b9cf3a..bb5a40e2a9 100644
--- a/shared/version/api.go
+++ b/shared/version/api.go
@@ -222,6 +222,7 @@ var APIExtensions = []string{
 	"projects_limits_disk",
 	"network_type_macvlan",
 	"network_type_sriov",
+	"container_syscall_intercept_bpf_devices",
 }
 
 // APIExtensionsCount returns the number of available API extensions.

From 82f8feec7b9870894b2d0a16ba3ac9d0b386bed2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 7 Aug 2020 13:09:51 +0200
Subject: [PATCH 4/5] lxd-client: add security.syscalls.intercept.bpf and
 security.syscalls.intercept.bpf.devices to completion

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 scripts/bash/lxd-client | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/bash/lxd-client b/scripts/bash/lxd-client
index 37996d89b7..9c94e8b874 100644
--- a/scripts/bash/lxd-client
+++ b/scripts/bash/lxd-client
@@ -102,6 +102,7 @@ _have lxc && {
       security.syscalls.allow \
       security.syscalls.deny \
       security.syscalls.deny_compat security.syscalls.deny_default \
+      security.syscalls.intercept.bpf security.syscalls.intercept.bpf.devices \
       security.syscalls.intercept.mknod security.syscalls.intercept.mount \
       security.syscalls.intercept.mount.allowed \
       security.syscalls.intercept.mount.fuse \

From bb2f092ba03f275d543c5add665a5a4533a0e756 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 7 Aug 2020 13:15:08 +0200
Subject: [PATCH 5/5] production-setup: mention bpf-specific memlock settings

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/production-setup.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/production-setup.md b/doc/production-setup.md
index 624800868b..f0e439fd68 100644
--- a/doc/production-setup.md
+++ b/doc/production-setup.md
@@ -30,6 +30,8 @@ root    | soft  | nofile  | 1048576   | unset     | maximum number of open files
 root    | hard  | nofile  | 1048576   | unset     | maximum number of open files
 \*      | soft  | memlock | unlimited | unset     | maximum locked-in-memory address space (KB)
 \*      | hard  | memlock | unlimited | unset     | maximum locked-in-memory address space (KB)
+root    | soft  | memlock | unlimited | unset     | maximum locked-in-memory address space (KB) (Only needed with `bpf` syscall supervision)
+root    | hard  | memlock | unlimited | unset     | maximum locked-in-memory address space (KB) (Only needed with `bpf` syscall supervision)
 
 
 ### /etc/sysctl.conf


More information about the lxc-devel mailing list