[lxc-devel] [lxd/master] seccomp: implement syscall continuation

brauner on Github lxc-bot at linuxcontainers.org
Thu Oct 17 12:39:41 UTC 2019


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 364 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20191017/7ccf0c81/attachment.bin>
-------------- next part --------------
From 9237febe65eb507249a76d79cd3f729ceaf21b66 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Thu, 17 Oct 2019 11:59:04 +0200
Subject: [PATCH 1/2] seccomp: test for syscall continuation support

With kernel 5.5 (or kernels with backports) we will have the ability to
continue syscalls from the seccomp notifier. This adds the
infrastructure to test for support.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/api_1.0.go           |  11 +--
 lxd/daemon.go            |   7 ++
 lxd/main_checkfeature.go | 164 +++++++++++++++++++++++++++++++++++++--
 lxd/sys/os.go            |  11 +--
 4 files changed, 178 insertions(+), 15 deletions(-)

diff --git a/lxd/api_1.0.go b/lxd/api_1.0.go
index cd7182ba9c..454be206f4 100644
--- a/lxd/api_1.0.go
+++ b/lxd/api_1.0.go
@@ -205,11 +205,12 @@ func api10Get(d *Daemon, r *http.Request) response.Response {
 	}
 
 	env.KernelFeatures = map[string]string{
-		"netnsid_getifaddrs": fmt.Sprintf("%v", d.os.NetnsGetifaddrs),
-		"uevent_injection":   fmt.Sprintf("%v", d.os.UeventInjection),
-		"unpriv_fscaps":      fmt.Sprintf("%v", d.os.VFS3Fscaps),
-		"seccomp_listener":   fmt.Sprintf("%v", d.os.SeccompListener),
-		"shiftfs":            fmt.Sprintf("%v", d.os.Shiftfs),
+		"netnsid_getifaddrs":                fmt.Sprintf("%v", d.os.NetnsGetifaddrs),
+		"uevent_injection":                  fmt.Sprintf("%v", d.os.UeventInjection),
+		"unpriv_fscaps":                     fmt.Sprintf("%v", d.os.VFS3Fscaps),
+		"seccomp_listener":                  fmt.Sprintf("%v", d.os.SeccompListener),
+		"seccomp_listener_continue_syscall": fmt.Sprintf("%v", d.os.SeccompListenerContinue),
+		"shiftfs":                           fmt.Sprintf("%v", d.os.Shiftfs),
 	}
 
 	if d.os.LXCFeatures != nil {
diff --git a/lxd/daemon.go b/lxd/daemon.go
index 552cf0cd61..2717207af9 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -609,6 +609,13 @@ func (d *Daemon) init() error {
 		logger.Infof(" - seccomp listener: no")
 	}
 
+	d.os.SeccompListenerContinue = CanUseSeccompListenerContinue()
+	if d.os.SeccompListenerContinue {
+		logger.Infof(" - seccomp listener continue syscalls: yes")
+	} else {
+		logger.Infof(" - seccomp listener continue syscalls: no")
+	}
+
 	/*
 	 * During daemon startup we're the only thread that touches VFS3Fscaps
 	 * so we don't need to bother with atomic.StoreInt32() when touching
diff --git a/lxd/main_checkfeature.go b/lxd/main_checkfeature.go
index 9bfff90093..583f5a2215 100644
--- a/lxd/main_checkfeature.go
+++ b/lxd/main_checkfeature.go
@@ -8,7 +8,9 @@ import (
 #define _GNU_SOURCE
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/kcmp.h>
 #include <linux/types.h>
+#include <poll.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -25,11 +27,12 @@ import (
 #include <sys/ptrace.h>
 
 #include "../shared/netutils/netns_getifaddrs.c"
+#include "include/compiler.h"
 #include "include/memory_utils.h"
 
 bool netnsid_aware = false;
 bool uevent_aware = false;
-bool seccomp_notify_aware = false;
+int seccomp_notify_aware = 0;
 char errbuf[4096];
 
 extern int can_inject_uevent(const char *uevent, size_t len);
@@ -138,12 +141,158 @@ void is_uevent_aware()
 #define SECCOMP_GET_ACTION_AVAIL 2
 #endif
 
-void is_seccomp_notify_aware(void)
+#ifndef SECCOMP_RET_ALLOW
+#define SECCOMP_RET_ALLOW 0x7fff0000U
+#endif
+
+#ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
+#define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
+#endif
+
+static void is_seccomp_notify_aware(void)
 {
 	__u32 action[] = { SECCOMP_RET_USER_NOTIF };
-	seccomp_notify_aware = (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL,
-					0, &action[0]) == 0);
+	if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action[0]) == 0)
+		seccomp_notify_aware = 1;
+
+}
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+static int user_trap_syscall(int nr, unsigned int flags)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+	};
+
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	return syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, flags, &prog);
+}
+
+// The ifdef can be safely ignored. We don't work on a kernel that old.
+static int filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
+{
+#ifdef __NR_kcmp
+	return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+__noreturn static void __do_user_notification_continue(void)
+{
+	pid_t pid;
+	int ret;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	struct pollfd pollfd;
+
+	listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	if (listener < 0)
+		exit(1);
+
+	pid = fork();
+	if (pid < 0)
+		exit(1);
+
+	if (pid == 0) {
+		int dup_fd, pipe_fds[2];
+		pid_t self;
+
+		// Don't bother cleaning up. On child exit all of those
+		// will be closed anyway.
+		ret = pipe(pipe_fds);
+		if (ret < 0)
+			exit(1);
+
+		// O_CLOEXEC doesn't matter as we're in the child and we're
+		// not going to exec.
+		dup_fd = dup(pipe_fds[0]);
+		if (dup_fd < 0)
+			exit(1);
+
+		self = getpid();
+
+		ret = filecmp(self, self, pipe_fds[0], dup_fd);
+		if (ret)
+			exit(2);
+
+		exit(0);
+	}
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
 
+	ret = poll(&pollfd, 1, 5000);
+	if (ret <= 0)
+		goto cleanup_sigkill;
+
+	if (!(pollfd.revents & POLLIN))
+		goto cleanup_sigkill;
+
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
+	if (ret)
+		goto cleanup_sigkill;
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	ret = poll(&pollfd, 1, 5000);
+	if (ret <= 0)
+		goto cleanup_sigkill;
+
+	if (!(pollfd.revents & POLLOUT))
+		goto cleanup_sigkill;
+
+	if (req.data.nr != __NR_dup)
+		goto cleanup_sigkill;
+
+	resp.id = req.id;
+	resp.flags |= SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+	resp.error = -EPERM;
+	resp.flags = 0;
+	if (ret) {
+		ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+		goto cleanup_sigkill;
+	}
+
+cleanup_wait:
+	ret = waitpid(pid, &status, 0);
+	if ((ret != pid) || !WIFEXITED(status) || WEXITSTATUS(status))
+		exit(1);
+	exit(0);
+
+cleanup_sigkill:
+	kill(pid, SIGKILL);
+	goto cleanup_wait;
+}
+
+static void is_user_notification_continue_aware(void)
+{
+	int ret, status;
+	pid_t pid;
+
+	pid = fork();
+	if (pid < 0)
+		return;
+
+	if (pid == 0)
+		__do_user_notification_continue();
+
+	ret = waitpid(pid, &status, 0);
+	if ((ret == pid) && WIFEXITED(status) && !WEXITSTATUS(status))
+		seccomp_notify_aware = 2;
 }
 
 void checkfeature()
@@ -153,6 +302,7 @@ void checkfeature()
 	is_netnsid_aware(&hostnetns_fd, &newnetns_fd);
 	is_uevent_aware();
 	is_seccomp_notify_aware();
+	is_user_notification_continue_aware();
 
 	if (setns(hostnetns_fd, CLONE_NEWNET) < 0)
 		(void)sprintf(errbuf, "%s", "Failed to attach to host network namespace");
@@ -180,5 +330,9 @@ func CanUseUeventInjection() bool {
 }
 
 func CanUseSeccompListener() bool {
-	return bool(C.seccomp_notify_aware)
+	return bool(C.seccomp_notify_aware > 0)
+}
+
+func CanUseSeccompListenerContinue() bool {
+	return bool(C.seccomp_notify_aware == 2)
 }
diff --git a/lxd/sys/os.go b/lxd/sys/os.go
index c6abcc0c4d..08ddba1df7 100644
--- a/lxd/sys/os.go
+++ b/lxd/sys/os.go
@@ -67,11 +67,12 @@ type OS struct {
 	CGroupSwapAccounting        bool
 
 	// Kernel features
-	NetnsGetifaddrs bool
-	SeccompListener bool
-	Shiftfs         bool
-	UeventInjection bool
-	VFS3Fscaps      bool
+	NetnsGetifaddrs         bool
+	SeccompListener         bool
+	SeccompListenerContinue bool
+	Shiftfs                 bool
+	UeventInjection         bool
+	VFS3Fscaps              bool
 
 	// LXC features
 	LXCFeatures map[string]bool

From 0d171b6ffd126556b2efa4966ba27b0ce392ba8a Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Thu, 17 Oct 2019 14:35:40 +0200
Subject: [PATCH 2/2] seccomp: implement syscall continuation for mknod(),
 mknodat(), and setxattr()

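This will continue syscalls for mknod(), mknodat(), and setxattr() on
the paths where LXD does not emulate the syscall itself (e.g. the device
is not allowed, the mknodat() call is not relative to AT_FDCWD, or
reading the caller's memory failed), provided the kernel supports
syscall continuation. This is safe to do because we're more privileged
than the container.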

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/seccomp/seccomp.go | 95 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 83 insertions(+), 12 deletions(-)

diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go
index 492b311f17..f7822b0b70 100644
--- a/lxd/seccomp/seccomp.go
+++ b/lxd/seccomp/seccomp.go
@@ -249,9 +249,10 @@ static int seccomp_notify_get_syscall(struct seccomp_notif *req,
 }
 
 static void seccomp_notify_update_response(struct seccomp_notif_resp *resp,
-					   int new_neg_errno)
+					   int new_neg_errno, uint32_t flags)
 {
 	resp->error = new_neg_errno;
+	resp->flags |= flags;
 }
 
 static void prepare_seccomp_iovec(struct iovec *iov,
@@ -654,8 +655,8 @@ func (siov *Iovec) IsValidSeccompIovec(size uint64) bool {
 }
 
 // SendSeccompIovec sends seccomp iovec.
-func (siov *Iovec) SendSeccompIovec(fd int, errno int) error {
-	C.seccomp_notify_update_response(siov.resp, C.int(errno))
+func (siov *Iovec) SendSeccompIovec(fd int, errno int, flags uint32) error {
+	C.seccomp_notify_update_response(siov.resp, C.int(errno), C.uint32_t(flags))
 
 	msghdr := C.struct_msghdr{}
 	msghdr.msg_iov = siov.iov
@@ -926,17 +927,29 @@ func (s *Server) HandleMknodSyscall(c Instance, siov *Iovec) int {
 			"seccomp_notify_flags": siov.req.flags,
 		})
 
-	siov.resp.error = C.device_allowed(C.dev_t(siov.req.data.args[2]), C.mode_t(siov.req.data.args[1]))
-	if siov.resp.error != 0 {
+	if C.device_allowed(C.dev_t(siov.req.data.args[2]), C.mode_t(siov.req.data.args[1])) < 0 {
 		logger.Debugf("Device not allowed")
-		return int(siov.resp.error)
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing mknod syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		} else {
+			return int(siov.resp.error)
+		}
 	}
 
 	cPathBuf := [unix.PathMax]C.char{}
 	_, err := C.pread(C.int(siov.memFd), unsafe.Pointer(&cPathBuf[0]), C.size_t(unix.PathMax), C.off_t(siov.req.data.args[0]))
 	if err != nil {
 		logger.Errorf("Failed to read memory for mknod syscall: %s", err)
-		return int(-C.EPERM)
+		logger.Debugf("Device not allowed")
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing mknod syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		} else {
+			return int(-C.EPERM)
+		}
 	}
 
 	args := MknodArgs{
@@ -964,20 +977,38 @@ func (s *Server) HandleMknodatSyscall(c Instance, siov *Iovec) int {
 	// built on 64bit userspace correctly.
 	if int32(siov.req.data.args[0]) != int32(C.AT_FDCWD) {
 		logger.Debugf("Non AT_FDCWD mknodat calls are not allowed")
-		return int(-C.EINVAL)
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing mknodat syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		} else {
+			return int(-C.EINVAL)
+		}
 	}
 
 	siov.resp.error = C.device_allowed(C.dev_t(siov.req.data.args[3]), C.mode_t(siov.req.data.args[2]))
 	if siov.resp.error != 0 {
 		logger.Debugf("Device not allowed")
-		return int(siov.resp.error)
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing mknodat syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		} else {
+			return int(siov.resp.error)
+		}
 	}
 
 	cPathBuf := [unix.PathMax]C.char{}
 	_, err := C.pread(C.int(siov.memFd), unsafe.Pointer(&cPathBuf[0]), C.size_t(unix.PathMax), C.off_t(siov.req.data.args[1]))
 	if err != nil {
 		logger.Errorf("Failed to read memory for mknodat syscall: %s", err)
-		return int(-C.EPERM)
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing mknodat syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		} else {
+			return int(-C.EPERM)
+		}
 	}
 
 	args := MknodArgs{
@@ -1020,11 +1051,23 @@ func (s *Server) HandleSetxattrSyscall(c Instance, siov *Iovec) int {
 	args.pid = int(siov.req.pid)
 	uid, gid, fsuid, fsgid, err := TaskIDs(args.pid)
 	if err != nil {
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing setxattr syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		}
+
 		return int(-C.EPERM)
 	}
 
 	idmapset, err := c.CurrentIdmap()
 	if err != nil {
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing setxattr syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		}
+
 		return int(-C.EINVAL)
 	}
 
@@ -1035,6 +1078,12 @@ func (s *Server) HandleSetxattrSyscall(c Instance, siov *Iovec) int {
 	cBuf := [unix.PathMax]C.char{}
 	_, err = C.pread(C.int(siov.memFd), unsafe.Pointer(&cBuf[0]), C.size_t(unix.PathMax), C.off_t(siov.req.data.args[0]))
 	if err != nil {
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing setxattr syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		}
+
 		logger.Errorf("Failed to read memory for setxattr syscall: %s", err)
 		return int(-C.EPERM)
 	}
@@ -1043,6 +1092,12 @@ func (s *Server) HandleSetxattrSyscall(c Instance, siov *Iovec) int {
 	// const char *name
 	_, err = C.pread(C.int(siov.memFd), unsafe.Pointer(&cBuf[0]), C.size_t(unix.PathMax), C.off_t(siov.req.data.args[1]))
 	if err != nil {
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing setxattr syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		}
+
 		logger.Errorf("Failed to read memory for setxattr syscall: %s", err)
 		return int(-C.EPERM)
 	}
@@ -1057,6 +1112,12 @@ func (s *Server) HandleSetxattrSyscall(c Instance, siov *Iovec) int {
 	buf := make([]byte, args.size)
 	_, err = C.pread(C.int(siov.memFd), unsafe.Pointer(&buf[0]), C.size_t(args.size), C.off_t(siov.req.data.args[2]))
 	if err != nil {
+		if s.s.OS.SeccompListenerContinue {
+			logger.Debugf("Continuing setxattr syscall")
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+			return 0
+		}
+
 		logger.Errorf("Failed to read memory for setxattr syscall: %s", err)
 		return int(-C.EPERM)
 	}
@@ -1065,6 +1126,10 @@ func (s *Server) HandleSetxattrSyscall(c Instance, siov *Iovec) int {
 	whiteout := 0
 	if string(args.name) == "trusted.overlay.opaque" && string(args.value) == "y" {
 		whiteout = 1
+	} else if s.s.OS.SeccompListenerContinue {
+		logger.Debugf("Continuing setxattr syscall")
+		C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(SeccompUserNotifFlagContinue))
+		return 0
 	}
 
 	_, stderr, err := shared.RunCommandSplit(nil, util.GetExecPath(),
@@ -1106,19 +1171,25 @@ func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
 	return int(-C.EINVAL)
 }
 
+const SeccompUserNotifFlagContinue uint32 = 0x00000001
+
 func (s *Server) handler(fd int, siov *Iovec, findPID func(pid int32, state *state.State) (Instance, error)) error {
 	defer siov.PutSeccompIovec()
 
 	c, err := findPID(int32(siov.msg.monitor_pid), s.s)
 	if err != nil {
-		siov.SendSeccompIovec(fd, int(-C.EPERM))
+		if s.s.OS.SeccompListenerContinue {
+			siov.SendSeccompIovec(fd, 0, SeccompUserNotifFlagContinue)
+		} else {
+			siov.SendSeccompIovec(fd, int(-C.EPERM), 0)
+		}
 		logger.Errorf("Failed to find container for monitor %d", siov.msg.monitor_pid)
 		return err
 	}
 
 	errno := s.handleSyscall(c, siov)
 
-	err = siov.SendSeccompIovec(fd, errno)
+	err = siov.SendSeccompIovec(fd, errno, 0)
 	if err != nil {
 		return err
 	}


More information about the lxc-devel mailing list