[lxc-devel] [lxd/master] [WIP]: seccomp: implement mount syscall interception
    brauner on Github 
    lxc-bot at linuxcontainers.org
       
    Mon Oct 21 14:17:29 UTC 2019
    
    
  
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1402 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20191021/8fea6c04/attachment.bin>
-------------- next part --------------
From af3cff057c5b56fbdd6725fdad222c8adabdec9e Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 18 Oct 2019 20:51:00 +0200
Subject: [PATCH] [WIP]: seccomp: implement mount syscall interception
This makes it possible to intercept and subsequently emulate the mount syscall.
To implement an efficient seccomp filter, we filter based on the flags
argument of the mount syscall. We don't want to filter any of the
following flag combinations since they do not cause the creation of a
new superblock:
MS_REMOUNT
MS_BIND
MS_MOVE
MS_UNBINDABLE
MS_PRIVATE
MS_SLAVE
MS_SHARED
MS_KERNMOUNT
MS_I_VERSION
So define the following mask of allowed flags:
long unsigned int mask = MS_MGC_VAL | MS_RDONLY | MS_NOSUID | MS_NODEV |
                         MS_NOEXEC | MS_SYNCHRONOUS | MS_MANDLOCK |
                         MS_DIRSYNC | MS_NOATIME | MS_NODIRATIME | MS_REC |
                         MS_VERBOSE | MS_SILENT | MS_POSIXACL | MS_RELATIME |
                         MS_STRICTATIME | MS_LAZYTIME;
Now we invert the mask:
inverse_mask = ~mask;
Seccomp will now only intercept a mount call if its flags contain no bits
outside the allowed mask, i.e. (flags & inverse_mask) == 0. In other words, we
only intercept combinations where a new superblock is created.
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/container.go        |   1 +
 lxd/container_lxc.go    |   6 +-
 lxd/main_forksyscall.go |  31 +++++-
 lxd/seccomp/seccomp.go  | 219 ++++++++++++++++++++++++++++++++++++----
 shared/container.go     |   1 +
 5 files changed, 232 insertions(+), 26 deletions(-)
diff --git a/lxd/container.go b/lxd/container.go
index f4cfe3fe44..27819190bc 100644
--- a/lxd/container.go
+++ b/lxd/container.go
@@ -245,6 +245,7 @@ type container interface {
 	OnStop(target string) error
 
 	InsertSeccompUnixDevice(prefix string, m deviceConfig.Device, pid int) error
+	InsertMountLXD(source, target, fstype string, flags int, mntnsPID int, shiftfs bool) error
 
 	CurrentIdmap() (*idmap.IdmapSet, error)
 	DiskIdmap() (*idmap.IdmapSet, error)
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index f3282cdde6..5c470b1028 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -6122,7 +6122,7 @@ func (c *containerLXC) StorageStop() (bool, error) {
 }
 
 // Mount handling
-func (c *containerLXC) insertMountLXD(source, target, fstype string, flags int, mntnsPID int, shiftfs bool) error {
+func (c *containerLXC) InsertMountLXD(source, target, fstype string, flags int, mntnsPID int, shiftfs bool) error {
 	pid := mntnsPID
 	if pid <= 0 {
 		// Get the init PID
@@ -6204,7 +6204,7 @@ func (c *containerLXC) insertMount(source, target, fstype string, flags int, shi
 		return c.insertMountLXC(source, target, fstype, flags)
 	}
 
-	return c.insertMountLXD(source, target, fstype, flags, -1, shiftfs)
+	return c.InsertMountLXD(source, target, fstype, flags, -1, shiftfs)
 }
 
 func (c *containerLXC) removeMount(mount string) error {
@@ -6291,7 +6291,7 @@ func (c *containerLXC) InsertSeccompUnixDevice(prefix string, m deviceConfig.Dev
 
 	// Bind-mount it into the container
 	defer os.Remove(devPath)
-	return c.insertMountLXD(devPath, tgtPath, "none", unix.MS_BIND, pid, false)
+	return c.InsertMountLXD(devPath, tgtPath, "none", unix.MS_BIND, pid, false)
 }
 
 func (c *containerLXC) removeUnixDevices() error {
diff --git a/lxd/main_forksyscall.go b/lxd/main_forksyscall.go
index 188b0e2c3e..c89a6b4698 100644
--- a/lxd/main_forksyscall.go
+++ b/lxd/main_forksyscall.go
@@ -95,7 +95,7 @@ static bool acquire_basic_creds(pid_t pid)
 
 // Expects command line to be in the form:
 // <PID> <root-uid> <root-gid> <path> <mode> <dev>
-static void forkmknod()
+static void mknod_emulate()
 {
 	__do_close_prot_errno int target_dir_fd = -EBADF;
 	char *cur = NULL, *target = NULL, *target_dir = NULL, *target_host = NULL;
@@ -235,7 +235,7 @@ static bool change_creds(int ns_fd, cap_t caps, uid_t nsuid, gid_t nsgid, uid_t
 	return true;
 }
 
-static void forksetxattr()
+static void setxattr_emulate()
 {
 	__do_close_prot_errno int ns_fd = -EBADF, target_fd = -EBADF;
 	int flags = 0;
@@ -316,6 +316,27 @@ static void forksetxattr()
 	}
 }
 
+static void mount_emulate()
+{
+	pid_t pid = -1;
+	char *source = NULL, *target = NULL, *fstype = NULL;
+	unsigned long flags = 0;
+	const void *data;
+
+	pid = atoi(advance_arg(true));
+	source = advance_arg(true);
+	target = advance_arg(true);
+	fstype = advance_arg(true);
+	flags = atoi(advance_arg(true));
+	data = advance_arg(false);
+
+	if (!acquire_basic_creds(pid))
+		_exit(EXIT_FAILURE);
+
+	if (mount(source, target, fstype, flags, data) < 0)
+		_exit(EXIT_FAILURE);
+}
+
 void forksyscall()
 {
 	char *syscall = NULL;
@@ -332,9 +353,11 @@ void forksyscall()
 		_exit(EXIT_SUCCESS);
 
 	if (strcmp(syscall, "mknod") == 0)
-		forkmknod();
+		mknod_emulate();
 	else if (strcmp(syscall, "setxattr") == 0)
-		forksetxattr();
+		setxattr_emulate();
+	else if (strcmp(syscall, "mount") == 0)
+		mount_emulate();
 	else
 		_exit(EXIT_FAILURE);
 
diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go
index 1610276161..dd1deb9bfd 100644
--- a/lxd/seccomp/seccomp.go
+++ b/lxd/seccomp/seccomp.go
@@ -124,68 +124,70 @@ struct lxd_seccomp_data_arch {
 	int nr_mknod;
 	int nr_mknodat;
 	int nr_setxattr;
+	int nr_mount;
 };
 
 #define LXD_SECCOMP_NOTIFY_MKNOD    0
 #define LXD_SECCOMP_NOTIFY_MKNODAT  1
 #define LXD_SECCOMP_NOTIFY_SETXATTR 2
+#define LXD_SECCOMP_NOTIFY_MOUNT 3
 
 // ordered by likelihood of usage...
 static const struct lxd_seccomp_data_arch seccomp_notify_syscall_table[] = {
-	{ -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR },
+	{ -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT },
 #ifdef AUDIT_ARCH_X86_64
-	{ AUDIT_ARCH_X86_64,      133, 259, 188 },
+	{ AUDIT_ARCH_X86_64,      133, 259, 188, 165 },
 #endif
 #ifdef AUDIT_ARCH_I386
-	{ AUDIT_ARCH_I386,         14, 297, 226 },
+	{ AUDIT_ARCH_I386,         14, 297, 226,  21 },
 #endif
 #ifdef AUDIT_ARCH_AARCH64
-	{ AUDIT_ARCH_AARCH64,      -1,  33,   5 },
+	{ AUDIT_ARCH_AARCH64,      -1,  33,   5,  21 },
 #endif
 #ifdef AUDIT_ARCH_ARM
-	{ AUDIT_ARCH_ARM,          14, 324, 226 },
+	{ AUDIT_ARCH_ARM,          14, 324, 226,  21 },
 #endif
 #ifdef AUDIT_ARCH_ARMEB
-	{ AUDIT_ARCH_ARMEB,        14, 324, 226 },
+	{ AUDIT_ARCH_ARMEB,        14, 324, 226,  21 },
 #endif
 #ifdef AUDIT_ARCH_S390
-	{ AUDIT_ARCH_S390,         14, 290, 224 },
+	{ AUDIT_ARCH_S390,         14, 290, 224,  21 },
 #endif
 #ifdef AUDIT_ARCH_S390X
-	{ AUDIT_ARCH_S390X,        14, 290, 224 },
+	{ AUDIT_ARCH_S390X,        14, 290, 224,  21 },
 #endif
 #ifdef AUDIT_ARCH_PPC
-	{ AUDIT_ARCH_PPC,          14, 288, 209 },
+	{ AUDIT_ARCH_PPC,          14, 288, 209,  21 },
 #endif
 #ifdef AUDIT_ARCH_PPC64
-	{ AUDIT_ARCH_PPC64,        14, 288, 209 },
+	{ AUDIT_ARCH_PPC64,        14, 288, 209,  21 },
 #endif
 #ifdef AUDIT_ARCH_PPC64LE
-	{ AUDIT_ARCH_PPC64LE,      14, 288, 209 },
+	{ AUDIT_ARCH_PPC64LE,      14, 288, 209,  21 },
 #endif
 #ifdef AUDIT_ARCH_SPARC
-	{ AUDIT_ARCH_SPARC,        14, 286, 169 },
+	{ AUDIT_ARCH_SPARC,        14, 286, 169, 167 },
 #endif
 #ifdef AUDIT_ARCH_SPARC64
-	{ AUDIT_ARCH_SPARC64,      14, 286, 169 },
+	{ AUDIT_ARCH_SPARC64,      14, 286, 169, 167 },
 #endif
 #ifdef AUDIT_ARCH_MIPS
-	{ AUDIT_ARCH_MIPS,         14, 290, 224 },
+	{ AUDIT_ARCH_MIPS,         14, 290, 224,  21 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL
-	{ AUDIT_ARCH_MIPSEL,       14, 290, 224 },
+	{ AUDIT_ARCH_MIPSEL,       14, 290, 224,  21 },
 #endif
 #ifdef AUDIT_ARCH_MIPS64
-	{ AUDIT_ARCH_MIPS64,      131, 249, 180 },
+	{ AUDIT_ARCH_MIPS64,      131, 249, 180, 160 },
 #endif
 #ifdef AUDIT_ARCH_MIPS64N32
-	{ AUDIT_ARCH_MIPS64N32,   131, 253, 180 },
+	{ AUDIT_ARCH_MIPS64N32,   131, 253, 180, 160 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL64
-	{ AUDIT_ARCH_MIPSEL64,    131, 249, 180 },
+	{ AUDIT_ARCH_MIPSEL64,    131, 249, 180, 160 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL64N32
-	{ AUDIT_ARCH_MIPSEL64N32, 131, 253, 180 },
+	{ AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160 },
 #endif
 };
 
@@ -214,6 +216,9 @@ static int seccomp_notify_get_syscall(struct seccomp_notif *req,
 		if (entry->nr_setxattr == req->data.nr)
 			return LXD_SECCOMP_NOTIFY_SETXATTR;
 
+		if (entry->nr_mount == req->data.nr)
+			return LXD_SECCOMP_NOTIFY_MOUNT;
+
 		break;
 	}
 
@@ -251,6 +256,7 @@ import "C"
 const lxdSeccompNotifyMknod = C.LXD_SECCOMP_NOTIFY_MKNOD
 const lxdSeccompNotifyMknodat = C.LXD_SECCOMP_NOTIFY_MKNODAT
 const lxdSeccompNotifySetxattr = C.LXD_SECCOMP_NOTIFY_SETXATTR
+const lxdSeccompNotifyMount = C.LXD_SECCOMP_NOTIFY_MOUNT
 
 const seccompHeader = `2
 `
@@ -272,6 +278,38 @@ mknodat notify [2,24576,SCMP_CMP_MASKED_EQ,61440]
 const seccompNotifySetxattr = `setxattr notify [3,1,SCMP_CMP_EQ]
 `
 
+// We don't want to filter any of the following flag combinations since they do
+// not cause the creation of a new superblock:
+//
+// MS_REMOUNT
+// MS_BIND
+// MS_MOVE
+// MS_UNBINDABLE
+// MS_PRIVATE
+// MS_SLAVE
+// MS_SHARED
+// MS_KERNMOUNT
+// MS_I_VERSION
+//
+// So define the following mask of allowed flags:
+//
+// long unsigned int mask = MS_MGC_VAL | MS_RDONLY | MS_NOSUID | MS_NODEV |
+//                          MS_NOEXEC | MS_SYNCHRONOUS | MS_MANDLOCK |
+//                          MS_DIRSYNC | MS_NOATIME | MS_NODIRATIME | MS_REC |
+//                          MS_VERBOSE | MS_SILENT | MS_POSIXACL | MS_RELATIME |
+//                          MS_STRICTATIME | MS_LAZYTIME;
+//
+// Now we invert the mask:
+//
+// inverse_mask = ~mask;
+//
+// Seccomp will now only intercept a mount call if its flags contain no bits
+// outside the allowed mask, i.e. (flags & inverse_mask) == 0. In other words,
+// we only intercept combinations where a new superblock is created.
+
+const seccompNotifyMount = `mount notify [3,0,SCMP_CMP_MASKED_EQ,18446744070422410016]
+`
+
 const compatBlockingPolicy = `[%s]
 compat_sys_rt_sigaction errno 38
 stub_x32_rt_sigreturn errno 38
@@ -320,6 +358,7 @@ type Instance interface {
 	RootfsPath() string
 	CurrentIdmap() (*idmap.IdmapSet, error)
 	InsertSeccompUnixDevice(prefix string, m deviceConfig.Device, pid int) error
+	InsertMountLXD(source, target, fstype string, flags int, mntnsPID int, shiftfs bool) error
 }
 
 var seccompPath = shared.VarPath("security", "seccomp")
@@ -352,6 +391,7 @@ func InstanceNeedsPolicy(c Instance) bool {
 		"security.syscalls.blacklist_compat",
 		"security.syscalls.intercept.mknod",
 		"security.syscalls.intercept.setxattr",
+		"security.syscalls.intercept.mount",
 	}
 
 	for _, k := range keys {
@@ -392,6 +432,7 @@ func InstanceNeedsIntercept(c Instance) (bool, error) {
 	keys := []string{
 		"security.syscalls.intercept.mknod",
 		"security.syscalls.intercept.setxattr",
+		"security.syscalls.intercept.mount",
 	}
 
 	needed := false
@@ -449,6 +490,10 @@ func seccompGetPolicyContent(c Instance) (string, error) {
 		if shared.IsTrue(config["security.syscalls.intercept.setxattr"]) {
 			policy += seccompNotifySetxattr
 		}
+
+		if shared.IsTrue(config["security.syscalls.intercept.mount"]) {
+			policy += seccompNotifyMount
+		}
 	}
 
 	if whitelist != "" {
@@ -1132,6 +1177,140 @@ func (s *Server) HandleSetxattrSyscall(c Instance, siov *Iovec) int {
 	return 0
 }
 
+// MountArgs arguments for mount.
+type MountArgs struct {
+	source string
+	target string
+	fstype string
+	flags  int
+	data   string
+	pid    int
+}
+
+// HandleMountSyscall handles mount syscalls.
+func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
+	ctx := log.Ctx{"container": c.Name(),
+		"project":              c.Project(),
+		"syscall_number":       siov.req.data.nr,
+		"audit_architecture":   siov.req.data.arch,
+		"seccomp_notify_id":    siov.req.id,
+		"seccomp_notify_flags": siov.req.flags,
+	}
+
+	defer logger.Debug("Handling mount syscall", ctx)
+
+	args := MountArgs{
+		pid: int(siov.req.pid),
+	}
+
+	// const char *source
+	args.source = ""
+	if siov.req.data.args[0] != 0 {
+		cBuf := [unix.PathMax]C.char{}
+		_, err := C.pread(C.int(siov.memFd), unsafe.Pointer(&cBuf[0]), C.size_t(unix.PathMax), C.off_t(siov.req.data.args[0]))
+		if err != nil {
+			ctx["err"] = fmt.Sprintf("Failed to read memory for first argument of mount syscall: %s", err)
+			if s.s.OS.SeccompListenerContinue {
+				ctx["syscall_continue"] = "true"
+				C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
+				return 0
+			}
+
+			return int(-C.EPERM)
+		}
+		args.source = C.GoString(&cBuf[0])
+	}
+
+	// const char *target
+	args.target = ""
+	if siov.req.data.args[1] != 0 {
+		cBuf := [unix.PathMax]C.char{}
+		_, err := C.pread(C.int(siov.memFd), unsafe.Pointer(&cBuf[0]), C.size_t(unix.PathMax), C.off_t(siov.req.data.args[1]))
+		if err != nil {
+			ctx["err"] = fmt.Sprintf("Failed to read memory for second argument of mount syscall: %s", err)
+			if s.s.OS.SeccompListenerContinue {
+				ctx["syscall_continue"] = "true"
+				C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
+				return 0
+			}
+
+			return int(-C.EPERM)
+		}
+		args.target = C.GoString(&cBuf[0])
+	}
+
+	// const char *filesystemtype
+	args.fstype = ""
+	if siov.req.data.args[2] != 0 {
+		cBuf := [unix.PathMax]C.char{}
+		_, err := C.pread(C.int(siov.memFd), unsafe.Pointer(&cBuf[0]), C.size_t(unix.PathMax), C.off_t(siov.req.data.args[2]))
+		if err != nil {
+			ctx["err"] = fmt.Sprintf("Failed to read memory for third argument of mount syscall: %s", err)
+			if s.s.OS.SeccompListenerContinue {
+				ctx["syscall_continue"] = "true"
+				C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
+				return 0
+			}
+
+			return int(-C.EPERM)
+		}
+		args.fstype = C.GoString(&cBuf[0])
+	}
+
+	if args.fstype != "debugfs" {
+		ctx["err"] = fmt.Sprintf("Invalid filesystem type '%s'", args.fstype)
+		if s.s.OS.SeccompListenerContinue {
+			ctx["syscall_continue"] = "true"
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
+			return 0
+		}
+
+		return int(-C.EPERM)
+	}
+
+	// unsigned long mountflags
+	args.flags = int(siov.req.data.args[3])
+
+	// const void *data
+	args.data = ""
+	if siov.req.data.args[4] != 0 {
+		cBuf := [unix.PathMax]C.char{}
+		_, err := C.pread(C.int(siov.memFd), unsafe.Pointer(&cBuf[0]), C.size_t(unix.PathMax), C.off_t(siov.req.data.args[4]))
+		if err != nil {
+			ctx["err"] = fmt.Sprintf("Failed to read memory for fifth argument of mount syscall: %s", err)
+			if s.s.OS.SeccompListenerContinue {
+				ctx["syscall_continue"] = "true"
+				C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
+				return 0
+			}
+
+			return int(-C.EPERM)
+		}
+		args.data = C.GoString(&cBuf[0])
+	}
+
+	_, _, err := shared.RunCommandSplit(nil, util.GetExecPath(),
+		"forksyscall",
+		"mount",
+		fmt.Sprintf("%d", args.pid),
+		fmt.Sprintf("%s", args.source),
+		fmt.Sprintf("%s", args.target),
+		fmt.Sprintf("%s", args.fstype),
+		fmt.Sprintf("%d", args.flags),
+		fmt.Sprintf("%s", args.data))
+	if err != nil {
+		if s.s.OS.SeccompListenerContinue {
+			ctx["syscall_continue"] = "true"
+			C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
+			return 0
+		}
+
+		return int(-C.EPERM)
+	}
+
+	return 0
+}
+
 func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
 	switch int(C.seccomp_notify_get_syscall(siov.req, siov.resp)) {
 	case lxdSeccompNotifyMknod:
@@ -1140,6 +1319,8 @@ func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
 		return s.HandleMknodatSyscall(c, siov)
 	case lxdSeccompNotifySetxattr:
 		return s.HandleSetxattrSyscall(c, siov)
+	case lxdSeccompNotifyMount:
+		return s.HandleMountSyscall(c, siov)
 	}
 
 	return int(-C.EINVAL)
diff --git a/shared/container.go b/shared/container.go
index 4d78aa6118..1ee16c9a3c 100644
--- a/shared/container.go
+++ b/shared/container.go
@@ -298,6 +298,7 @@ var KnownContainerConfigKeys = map[string]func(value string) error{
 	"security.syscalls.blacklist_compat":   IsBool,
 	"security.syscalls.blacklist":          IsAny,
 	"security.syscalls.intercept.mknod":    IsBool,
+	"security.syscalls.intercept.mount":    IsBool,
 	"security.syscalls.intercept.setxattr": IsBool,
 	"security.syscalls.whitelist":          IsAny,
 
    
    
More information about the lxc-devel
mailing list